gl/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009
   3    Free Software Foundation, Inc.
   4    This file is part of the GNU C Library.
   5    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License along
  18    with this program; if not, write to the Free Software Foundation,
  19    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           size_t length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  27 #ifdef RE_ENABLE_I18N
  28 static void free_charset (re_charset_t *cset);
  29 #endif /* RE_ENABLE_I18N */
  30 static void free_workarea_compile (regex_t *preg);
  31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  32 #ifdef RE_ENABLE_I18N
  33 static void optimize_utf8 (re_dfa_t *dfa);
  34 #endif
  35 static reg_errcode_t analyze (regex_t *preg);
  36 static reg_errcode_t preorder (bin_tree_t *root,
  37                                reg_errcode_t (fn (void *, bin_tree_t *)),
  38                                void *extra);
  39 static reg_errcode_t postorder (bin_tree_t *root,
  40                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                 void *extra);
  42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  45                                  bin_tree_t *node);
  46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  49 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  50 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  51                                    unsigned int constraint);
  52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  54                                          Idx node, bool root);
  55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  56 static Idx fetch_number (re_string_t *input, re_token_t *token,
  57                          reg_syntax_t syntax);
  58 static int peek_token (re_token_t *token, re_string_t *input,
  59                         reg_syntax_t syntax) internal_function;
  60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  61                           reg_syntax_t syntax, reg_errcode_t *err);
  62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  63                                   re_token_t *token, reg_syntax_t syntax,
  64                                   Idx nest, reg_errcode_t *err);
  65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  66                                  re_token_t *token, reg_syntax_t syntax,
  67                                  Idx nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  69                                      re_token_t *token, reg_syntax_t syntax,
  70                                      Idx nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  72                                   re_token_t *token, reg_syntax_t syntax,
  73                                   Idx nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  75                                  re_dfa_t *dfa, re_token_t *token,
  76                                  reg_syntax_t syntax, reg_errcode_t *err);
  77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  78                                       re_token_t *token, reg_syntax_t syntax,
  79                                       reg_errcode_t *err);
  80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  81                                             re_string_t *regexp,
  82                                             re_token_t *token, int token_len,
  83                                             re_dfa_t *dfa,
  84                                             reg_syntax_t syntax,
  85                                             bool accept_hyphen);
  86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  87                                           re_string_t *regexp,
  88                                           re_token_t *token);
  89 #ifdef RE_ENABLE_I18N
  90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  91                                         re_charset_t *mbcset,
  92                                         Idx *equiv_class_alloc,
  93                                         const unsigned char *name);
  94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  95                                       bitset_t sbcset,
  96                                       re_charset_t *mbcset,
  97                                       Idx *char_class_alloc,
  98                                       const unsigned char *class_name,
  99                                       reg_syntax_t syntax);
 100 #else  /* not RE_ENABLE_I18N */
 101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 102                                         const unsigned char *name);
 103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 104                                       bitset_t sbcset,
 105                                       const unsigned char *class_name,
 106                                       reg_syntax_t syntax);
 107 #endif /* not RE_ENABLE_I18N */
 108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 109                                        RE_TRANSLATE_TYPE trans,
 110                                        const unsigned char *class_name,
 111                                        const unsigned char *extra,
 112                                        bool non_match, reg_errcode_t *err);
 113 static bin_tree_t *create_tree (re_dfa_t *dfa,
 114                                 bin_tree_t *left, bin_tree_t *right,
 115                                 re_token_type_t type);
 116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 117                                       bin_tree_t *left, bin_tree_t *right,
 118                                       const re_token_t *token);
 119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 120 static void free_token (re_token_t *node);
 121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 123 \f
 124 /* This table gives an error message for each of the error codes listed
 125    in regex.h.  Obviously the order here has to be same as there.
 126    POSIX doesn't require that we do anything for REG_NOERROR,
 127    but why not be nice?  */
 128
 129 static const char __re_error_msgid[] =
 130   {
 131 #define REG_NOERROR_IDX 0
 132     gettext_noop ("Success")    /* REG_NOERROR */
 133     "\0"
 134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 135     gettext_noop ("No match")   /* REG_NOMATCH */
 136     "\0"
 137 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 138     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 139     "\0"
 140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 141     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 142     "\0"
 143 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 144     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 145     "\0"
 146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 147     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 148     "\0"
 149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 150     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 151     "\0"
 152 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 153     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 154     "\0"
 155 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 156     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 157     "\0"
 158 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 159     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 160     "\0"
 161 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 162     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 163     "\0"
 164 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 165     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 166     "\0"
 167 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 168     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 169     "\0"
 170 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 171     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 172     "\0"
 173 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 174     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 175     "\0"
 176 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 177     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 178     "\0"
 179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 180     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 181   };
 182
 183 static const size_t __re_error_msgid_idx[] =
 184   {
 185     REG_NOERROR_IDX,
 186     REG_NOMATCH_IDX,
 187     REG_BADPAT_IDX,
 188     REG_ECOLLATE_IDX,
 189     REG_ECTYPE_IDX,
 190     REG_EESCAPE_IDX,
 191     REG_ESUBREG_IDX,
 192     REG_EBRACK_IDX,
 193     REG_EPAREN_IDX,
 194     REG_EBRACE_IDX,
 195     REG_BADBR_IDX,
 196     REG_ERANGE_IDX,
 197     REG_ESPACE_IDX,
 198     REG_BADRPT_IDX,
 199     REG_EEND_IDX,
 200     REG_ESIZE_IDX,
 201     REG_ERPAREN_IDX
 202   };
 203 \f
 204 /* Entry points for GNU code.  */
 205
 206 /* re_compile_pattern is the GNU regular expression compiler: it
 207    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 208    Returns 0 if the pattern was valid, otherwise an error string.
 209
 210    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 211    are set in BUFP on entry.  */
 212
 213 #ifdef _LIBC
 214 const char *
 215 re_compile_pattern (pattern, length, bufp)
 216     const char *pattern;
 217     size_t length;
 218     struct re_pattern_buffer *bufp;
 219 #else /* size_t might promote */
 220 const char *
 221 re_compile_pattern (const char *pattern, size_t length,
 222                     struct re_pattern_buffer *bufp)
 223 #endif
 224 {
 225   reg_errcode_t ret;
 226
 227   /* And GNU code determines whether or not to get register information
 228      by passing null for the REGS argument to re_match, etc., not by
 229      setting no_sub, unless RE_NO_SUB is set.  */
 230   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 231
 232   /* Match anchors at newline.  */
 233   bufp->newline_anchor = 1;
 234
 235   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 236
 237   if (!ret)
 238     return NULL;
 239   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 240 }
 241 #ifdef _LIBC
 242 weak_alias (__re_compile_pattern, re_compile_pattern)
 243 #endif
 244
 245 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 246    also be assigned to arbitrarily: each pattern buffer stores its own
 247    syntax, so it can be changed between regex compilations.  */
 248 /* This has no initializer because initialized variables in Emacs
 249    become read-only after dumping.  */
 250 reg_syntax_t re_syntax_options;
 251
 252
 253 /* Specify the precise syntax of regexps for compilation.  This provides
 254    for compatibility for various utilities which historically have
 255    different, incompatible syntaxes.
 256
 257    The argument SYNTAX is a bit mask comprised of the various bits
 258    defined in regex.h.  We return the old syntax.  */
 259
 260 reg_syntax_t
 261 re_set_syntax (syntax)
 262     reg_syntax_t syntax;
 263 {
 264   reg_syntax_t ret = re_syntax_options;
 265
 266   re_syntax_options = syntax;
 267   return ret;
 268 }
 269 #ifdef _LIBC
 270 weak_alias (__re_set_syntax, re_set_syntax)
 271 #endif
 272
 273 int
 274 re_compile_fastmap (bufp)
 275     struct re_pattern_buffer *bufp;
 276 {
 277   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 278   char *fastmap = bufp->fastmap;
 279
 280   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 281   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 282   if (dfa->init_state != dfa->init_state_word)
 283     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 284   if (dfa->init_state != dfa->init_state_nl)
 285     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 286   if (dfa->init_state != dfa->init_state_begbuf)
 287     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 288   bufp->fastmap_accurate = 1;
 289   return 0;
 290 }
 291 #ifdef _LIBC
 292 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 293 #endif
 294
 295 static inline void
 296 __attribute ((always_inline))
 297 re_set_fastmap (char *fastmap, bool icase, int ch)
 298 {
 299   fastmap[ch] = 1;
 300   if (icase)
 301     fastmap[tolower (ch)] = 1;
 302 }
 303
 304 /* Helper function for re_compile_fastmap.
 305    Compile fastmap for the initial_state INIT_STATE.  */
 306
 307 static void
 308 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 309                          char *fastmap)
 310 {
 311   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 312   Idx node_cnt;
 313   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 314   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 315     {
 316       Idx node = init_state->nodes.elems[node_cnt];
 317       re_token_type_t type = dfa->nodes[node].type;
 318
 319       if (type == CHARACTER)
 320         {
 321           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 322 #ifdef RE_ENABLE_I18N
 323           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 324             {
 325               unsigned char buf[MB_LEN_MAX];
 326               unsigned char *p;
 327               wchar_t wc;
 328               mbstate_t state;
 329
 330               p = buf;
 331               *p++ = dfa->nodes[node].opr.c;
 332               while (++node < dfa->nodes_len
 333                      && dfa->nodes[node].type == CHARACTER
 334                      && dfa->nodes[node].mb_partial)
 335                 *p++ = dfa->nodes[node].opr.c;
 336               memset (&state, '\0', sizeof (state));
 337               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 338                              &state) == p - buf
 339                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 340                       != (size_t) -1))
 341                 re_set_fastmap (fastmap, false, buf[0]);
 342             }
 343 #endif
 344         }
 345       else if (type == SIMPLE_BRACKET)
 346         {
 347           int i, ch;
 348           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 349             {
 350               int j;
 351               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 352               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 353                 if (w & ((bitset_word_t) 1 << j))
 354                   re_set_fastmap (fastmap, icase, ch);
 355             }
 356         }
 357 #ifdef RE_ENABLE_I18N
 358       else if (type == COMPLEX_BRACKET)
 359         {
 360           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 361           Idx i;
 362
 363 # ifdef _LIBC
 364           /* See if we have to try all bytes which start multiple collation
 365              elements.
 366              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 367                   collation element, and don't catch 'b' since 'b' is
 368                   the only collation element which starts from 'b' (and
 369                   it is caught by SIMPLE_BRACKET).  */
 370               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 371                   && (cset->ncoll_syms || cset->nranges))
 372                 {
 373                   const int32_t *table = (const int32_t *)
 374                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 375                   for (i = 0; i < SBC_MAX; ++i)
 376                     if (table[i] < 0)
 377                       re_set_fastmap (fastmap, icase, i);
 378                 }
 379 # endif /* _LIBC */
 380
 381           /* See if we have to start the match at all multibyte characters,
 382              i.e. where we would not find an invalid sequence.  This only
 383              applies to multibyte character sets; for single byte character
 384              sets, the SIMPLE_BRACKET again suffices.  */
 385           if (dfa->mb_cur_max > 1
 386               && (cset->nchar_classes || cset->non_match
 387 # ifdef _LIBC
 388                   || cset->nequiv_classes
 389 # endif /* _LIBC */
 390                  ))
 391             {
 392               unsigned char c = 0;
 393               do
 394                 {
 395                   mbstate_t mbs;
 396                   memset (&mbs, 0, sizeof (mbs));
 397                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 398                     re_set_fastmap (fastmap, false, (int) c);
 399                 }
 400               while (++c != 0);
 401             }
 402
 403           else
 404             {
 405               /* ... Else catch all bytes which can start the mbchars.  */
 406               for (i = 0; i < cset->nmbchars; ++i)
 407                 {
 408                   char buf[256];
 409                   mbstate_t state;
 410                   memset (&state, '\0', sizeof (state));
 411                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 412                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 413                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 414                     {
 415                       if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 416                           != (size_t) -1)
 417                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 418                     }
 419                 }
 420             }
 421         }
 422 #endif /* RE_ENABLE_I18N */
 423       else if (type == OP_PERIOD
 424 #ifdef RE_ENABLE_I18N
 425                || type == OP_UTF8_PERIOD
 426 #endif /* RE_ENABLE_I18N */
 427                || type == END_OF_RE)
 428         {
 429           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 430           if (type == END_OF_RE)
 431             bufp->can_be_null = 1;
 432           return;
 433         }
 434     }
 435 }
 436 \f
 437 /* Entry point for POSIX code.  */
 438 /* regcomp takes a regular expression as a string and compiles it.
 439
 440    PREG is a regex_t *.  We do not expect any fields to be initialized,
 441    since POSIX says we shouldn't.  Thus, we set
 442
 443      `buffer' to the compiled pattern;
 444      `used' to the length of the compiled pattern;
 445      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 446        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 447        RE_SYNTAX_POSIX_BASIC;
 448      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 449      `fastmap' to an allocated space for the fastmap;
 450      `fastmap_accurate' to zero;
 451      `re_nsub' to the number of subexpressions in PATTERN.
 452
 453    PATTERN is the address of the pattern string.
 454
 455    CFLAGS is a series of bits which affect compilation.
 456
 457      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 458      use POSIX basic syntax.
 459
 460      If REG_NEWLINE is set, then . and [^...] don't match newline.
 461      Also, regexec will try a match beginning after every newline.
 462
 463      If REG_ICASE is set, then we considers upper- and lowercase
 464      versions of letters to be equivalent when matching.
 465
 466      If REG_NOSUB is set, then when PREG is passed to regexec, that
 467      routine will report only success or failure, and nothing about the
 468      registers.
 469
 470    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 471    the return codes and their meanings.)  */
 472
 473 int
 474 regcomp (preg, pattern, cflags)
 475     regex_t *_Restrict_ preg;
 476     const char *_Restrict_ pattern;
 477     int cflags;
 478 {
 479   reg_errcode_t ret;
 480   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 481                          : RE_SYNTAX_POSIX_BASIC);
 482
 483   preg->buffer = NULL;
 484   preg->allocated = 0;
 485   preg->used = 0;
 486
 487   /* Try to allocate space for the fastmap.  */
 488   preg->fastmap = re_malloc (char, SBC_MAX);
 489   if (BE (preg->fastmap == NULL, 0))
 490     return REG_ESPACE;
 491
 492   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 493
 494   /* If REG_NEWLINE is set, newlines are treated differently.  */
 495   if (cflags & REG_NEWLINE)
 496     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 497       syntax &= ~RE_DOT_NEWLINE;
 498       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 499       /* It also changes the matching behavior.  */
 500       preg->newline_anchor = 1;
 501     }
 502   else
 503     preg->newline_anchor = 0;
 504   preg->no_sub = !!(cflags & REG_NOSUB);
 505   preg->translate = NULL;
 506
 507   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 508
 509   /* POSIX doesn't distinguish between an unmatched open-group and an
 510      unmatched close-group: both are REG_EPAREN.  */
 511   if (ret == REG_ERPAREN)
 512     ret = REG_EPAREN;
 513
 514   /* We have already checked preg->fastmap != NULL.  */
 515   if (BE (ret == REG_NOERROR, 1))
 516     /* Compute the fastmap now, since regexec cannot modify the pattern
 517        buffer.  This function never fails in this implementation.  */
 518     (void) re_compile_fastmap (preg);
 519   else
 520     {
 521       /* Some error occurred while compiling the expression.  */
 522       re_free (preg->fastmap);
 523       preg->fastmap = NULL;
 524     }
 525
 526   return (int) ret;
 527 }
 528 #ifdef _LIBC
 529 weak_alias (__regcomp, regcomp)
 530 #endif
 531
 532 /* Returns a message corresponding to an error code, ERRCODE, returned
 533    from either regcomp or regexec.   We don't use PREG here.  */
 534
 535 #ifdef _LIBC
 536 size_t
 537 regerror (errcode, preg, errbuf, errbuf_size)
 538     int errcode;
 539     const regex_t *_Restrict_ preg;
 540     char *_Restrict_ errbuf;
 541     size_t errbuf_size;
 542 #else /* size_t might promote */
 543 size_t
 544 regerror (int errcode, const regex_t *_Restrict_ preg,
 545           char *_Restrict_ errbuf, size_t errbuf_size)
 546 #endif
 547 {
 548   const char *msg;
 549   size_t msg_size;
 550
 551   if (BE (errcode < 0
 552           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 553                                / sizeof (__re_error_msgid_idx[0])), 0))
 554     /* Only error codes returned by the rest of the code should be passed
 555        to this routine.  If we are given anything else, or if other regex
 556        code generates an invalid error code, then the program has a bug.
 557        Dump core so we can fix it.  */
 558     abort ();
 559
 560   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 561
 562   msg_size = strlen (msg) + 1; /* Includes the null.  */
 563
 564   if (BE (errbuf_size != 0, 1))
 565     {
 566       size_t cpy_size = msg_size;
 567       if (BE (msg_size > errbuf_size, 0))
 568         {
 569           cpy_size = errbuf_size - 1;
 570           errbuf[cpy_size] = '\0';
 571         }
 572       memcpy (errbuf, msg, cpy_size);
 573     }
 574
 575   return msg_size;
 576 }
 577 #ifdef _LIBC
 578 weak_alias (__regerror, regerror)
 579 #endif
 580
 581
 582 #ifdef RE_ENABLE_I18N
 583 /* This static array is used for the map to single-byte characters when
 584    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 585    it the same all the time.  UTF-8 is the preferred encoding so this is
 586    a worthwhile optimization.  */
 587 static const bitset_t utf8_sb_map =
 588 {
 589   /* Set the first 128 bits.  */
 590 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
 591 #  error "bitset_word_t is narrower than 32 bits"
 592 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 593   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 594 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 595   BITSET_WORD_MAX, BITSET_WORD_MAX,
 596 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 597   BITSET_WORD_MAX,
 598 # endif
 599   (BITSET_WORD_MAX
 600    >> (SBC_MAX % BITSET_WORD_BITS == 0
 601        ? 0
 602        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 603 };
 604 #endif
 605
 606
 607 static void
 608 free_dfa_content (re_dfa_t *dfa)
 609 {
 610   Idx i, j;
 611
 612   if (dfa->nodes)
 613     for (i = 0; i < dfa->nodes_len; ++i)
 614       free_token (dfa->nodes + i);
 615   re_free (dfa->nexts);
 616   for (i = 0; i < dfa->nodes_len; ++i)
 617     {
 618       if (dfa->eclosures != NULL)
 619         re_node_set_free (dfa->eclosures + i);
 620       if (dfa->inveclosures != NULL)
 621         re_node_set_free (dfa->inveclosures + i);
 622       if (dfa->edests != NULL)
 623         re_node_set_free (dfa->edests + i);
 624     }
 625   re_free (dfa->edests);
 626   re_free (dfa->eclosures);
 627   re_free (dfa->inveclosures);
 628   re_free (dfa->nodes);
 629
 630   if (dfa->state_table)
 631     for (i = 0; i <= dfa->state_hash_mask; ++i)
 632       {
 633         struct re_state_table_entry *entry = dfa->state_table + i;
 634         for (j = 0; j < entry->num; ++j)
 635           {
 636             re_dfastate_t *state = entry->array[j];
 637             free_state (state);
 638           }
 639         re_free (entry->array);
 640       }
 641   re_free (dfa->state_table);
 642 #ifdef RE_ENABLE_I18N
 643   if (dfa->sb_char != utf8_sb_map)
 644     re_free (dfa->sb_char);
 645 #endif
 646   re_free (dfa->subexp_map);
 647 #ifdef DEBUG
 648   re_free (dfa->re_str);
 649 #endif
 650
 651   re_free (dfa);
 652 }
 653
 654
 655 /* Free dynamically allocated space used by PREG.  */
 656
 657 void
 658 regfree (preg)
 659     regex_t *preg;
 660 {
 661   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 662   if (BE (dfa != NULL, 1))
 663     free_dfa_content (dfa);
 664   preg->buffer = NULL;
 665   preg->allocated = 0;
 666
 667   re_free (preg->fastmap);
 668   preg->fastmap = NULL;
 669
 670   re_free (preg->translate);
 671   preg->translate = NULL;
 672 }
 673 #ifdef _LIBC
 674 weak_alias (__regfree, regfree)
 675 #endif
 676 \f
 677 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 678    them unless specifically requested.  */
 679
 680 #if defined _REGEX_RE_COMP || defined _LIBC
 681
 682 /* BSD has one and only one pattern buffer.  */
 683 static struct re_pattern_buffer re_comp_buf;
 684
 685 char *
 686 # ifdef _LIBC
 687 /* Make these definitions weak in libc, so POSIX programs can redefine
 688    these names if they don't use our functions, and still use
 689    regcomp/regexec above without link errors.  */
 690 weak_function
 691 # endif
 692 re_comp (s)
 693      const char *s;
 694 {
 695   reg_errcode_t ret;
 696   char *fastmap;
 697
 698   if (!s)
 699     {
 700       if (!re_comp_buf.buffer)
 701         return gettext ("No previous regular expression");
 702       return 0;
 703     }
 704
 705   if (re_comp_buf.buffer)
 706     {
 707       fastmap = re_comp_buf.fastmap;
 708       re_comp_buf.fastmap = NULL;
 709       __regfree (&re_comp_buf);
 710       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 711       re_comp_buf.fastmap = fastmap;
 712     }
 713
 714   if (re_comp_buf.fastmap == NULL)
 715     {
 716       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 717       if (re_comp_buf.fastmap == NULL)
 718         return (char *) gettext (__re_error_msgid
 719                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 720     }
 721
 722   /* Since `re_exec' always passes NULL for the `regs' argument, we
 723      don't need to initialize the pattern buffer fields which affect it.  */
 724
 725   /* Match anchors at newlines.  */
 726   re_comp_buf.newline_anchor = 1;
 727
 728   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 729
 730   if (!ret)
 731     return NULL;
 732
 733   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 734   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 735 }
 736
 737 #ifdef _LIBC
 738 libc_freeres_fn (free_mem)
 739 {
 740   __regfree (&re_comp_buf);
 741 }
 742 #endif
 743
 744 #endif /* _REGEX_RE_COMP */
 745 \f
 746 /* Internal entry point.
 747    Compile the regular expression PATTERN, whose length is LENGTH.
 748    SYNTAX indicate regular expression's syntax.  */
 749
 750 static reg_errcode_t
 751 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 752                      reg_syntax_t syntax)
 753 {
 754   reg_errcode_t err = REG_NOERROR;
 755   re_dfa_t *dfa;
 756   re_string_t regexp;
 757
 758   /* Initialize the pattern buffer.  */
 759   preg->fastmap_accurate = 0;
 760   preg->syntax = syntax;
 761   preg->not_bol = preg->not_eol = 0;
 762   preg->used = 0;
 763   preg->re_nsub = 0;
 764   preg->can_be_null = 0;
 765   preg->regs_allocated = REGS_UNALLOCATED;
 766
 767   /* Initialize the dfa.  */
 768   dfa = (re_dfa_t *) preg->buffer;
 769   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 770     {
 771       /* If zero allocated, but buffer is non-null, try to realloc
 772          enough space.  This loses if buffer's address is bogus, but
 773          that is the user's responsibility.  If ->buffer is NULL this
 774          is a simple allocation.  */
 775       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 776       if (dfa == NULL)
 777         return REG_ESPACE;
 778       preg->allocated = sizeof (re_dfa_t);
 779       preg->buffer = (unsigned char *) dfa;
 780     }
 781   preg->used = sizeof (re_dfa_t);
 782
 783   err = init_dfa (dfa, length);
 784   if (BE (err != REG_NOERROR, 0))
 785     {
 786       free_dfa_content (dfa);
 787       preg->buffer = NULL;
 788       preg->allocated = 0;
 789       return err;
 790     }
 791 #ifdef DEBUG
 792   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 793   dfa->re_str = re_malloc (char, length + 1);
 794   strncpy (dfa->re_str, pattern, length + 1);
 795 #endif
 796
 797   __libc_lock_init (dfa->lock);
 798
 799   err = re_string_construct (&regexp, pattern, length, preg->translate,
 800                              (syntax & RE_ICASE) != 0, dfa);
 801   if (BE (err != REG_NOERROR, 0))
 802     {
 803     re_compile_internal_free_return:
 804       free_workarea_compile (preg);
 805       re_string_destruct (&regexp);
 806       free_dfa_content (dfa);
 807       preg->buffer = NULL;
 808       preg->allocated = 0;
 809       return err;
 810     }
 811
 812   /* Parse the regular expression, and build a structure tree.  */
 813   preg->re_nsub = 0;
 814   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 815   if (BE (dfa->str_tree == NULL, 0))
 816     goto re_compile_internal_free_return;
 817
 818   /* Analyze the tree and create the nfa.  */
 819   err = analyze (preg);
 820   if (BE (err != REG_NOERROR, 0))
 821     goto re_compile_internal_free_return;
 822
 823 #ifdef RE_ENABLE_I18N
 824   /* If possible, do searching in single byte encoding to speed things up.  */
 825   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 826     optimize_utf8 (dfa);
 827 #endif
 828
 829   /* Then create the initial state of the dfa.  */
 830   err = create_initial_state (dfa);
 831
 832   /* Release work areas.  */
 833   free_workarea_compile (preg);
 834   re_string_destruct (&regexp);
 835
 836   if (BE (err != REG_NOERROR, 0))
 837     {
 838       free_dfa_content (dfa);
 839       preg->buffer = NULL;
 840       preg->allocated = 0;
 841     }
 842
 843   return err;
 844 }
 845
 846 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 847    as the initial length of some arrays.  */
 848
 849 static reg_errcode_t
 850 init_dfa (re_dfa_t *dfa, size_t pat_len)
 851 {
 852   __re_size_t table_size;
 853 #ifdef RE_ENABLE_I18N
 854   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 855 #else
 856   size_t max_i18n_object_size = 0;
 857 #endif
 858   size_t max_object_size =
 859     MAX (sizeof (struct re_state_table_entry),
 860          MAX (sizeof (re_token_t),
 861               MAX (sizeof (re_node_set),
 862                    MAX (sizeof (regmatch_t),
 863                         max_i18n_object_size))));
 864
 865   memset (dfa, '\0', sizeof (re_dfa_t));
 866
 867   /* Force allocation of str_tree_storage the first time.  */
 868   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 869
 870   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 871      calculation below, and for similar doubling calculations
 872      elsewhere.  And it's <= rather than <, because some of the
 873      doubling calculations add 1 afterwards.  */
 874   if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
 875     return REG_ESPACE;
 876
 877   dfa->nodes_alloc = pat_len + 1;
 878   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 879
 880   /*  table_size = 2 ^ ceil(log pat_len) */
 881   for (table_size = 1; ; table_size <<= 1)
 882     if (table_size > pat_len)
 883       break;
 884
 885   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 886   dfa->state_hash_mask = table_size - 1;
 887
 888   dfa->mb_cur_max = MB_CUR_MAX;
 889 #ifdef _LIBC
 890   if (dfa->mb_cur_max == 6
 891       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 892     dfa->is_utf8 = 1;
 893   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 894                        != 0);
 895 #else
 896   if (strcmp (locale_charset (), "UTF-8") == 0)
 897     dfa->is_utf8 = 1;
 898
 899   /* We check exhaustively in the loop below if this charset is a
 900      superset of ASCII.  */
 901   dfa->map_notascii = 0;
 902 #endif
 903
 904 #ifdef RE_ENABLE_I18N
 905   if (dfa->mb_cur_max > 1)
 906     {
 907       if (dfa->is_utf8)
 908         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 909       else
 910         {
 911           int i, j, ch;
 912
 913           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 914           if (BE (dfa->sb_char == NULL, 0))
 915             return REG_ESPACE;
 916
 917           /* Set the bits corresponding to single byte chars.  */
 918           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 919             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 920               {
 921                 wint_t wch = __btowc (ch);
 922                 if (wch != WEOF)
 923                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 924 # ifndef _LIBC
 925                 if (isascii (ch) && wch != ch)
 926                   dfa->map_notascii = 1;
 927 # endif
 928               }
 929         }
 930     }
 931 #endif
 932
 933   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 934     return REG_ESPACE;
 935   return REG_NOERROR;
 936 }
 937
 938 /* Initialize WORD_CHAR table, which indicate which character is
 939    "word".  In this case "word" means that it is the word construction
 940    character used by some operators like "\<", "\>", etc.  */
 941
 942 static void
 943 internal_function
 944 init_word_char (re_dfa_t *dfa)
 945 {
 946   int i, j, ch;
 947   dfa->word_ops_used = 1;
 948   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 949     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 950       if (isalnum (ch) || ch == '_')
 951         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 952 }
 953
 954 /* Free the work area which are only used while compiling.  */
 955
 956 static void
 957 free_workarea_compile (regex_t *preg)
 958 {
 959   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 960   bin_tree_storage_t *storage, *next;
 961   for (storage = dfa->str_tree_storage; storage; storage = next)
 962     {
 963       next = storage->next;
 964       re_free (storage);
 965     }
 966   dfa->str_tree_storage = NULL;
 967   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 968   dfa->str_tree = NULL;
 969   re_free (dfa->org_indices);
 970   dfa->org_indices = NULL;
 971 }
 972
 973 /* Create initial states for all contexts.  */
 974
 975 static reg_errcode_t
 976 create_initial_state (re_dfa_t *dfa)
 977 {
 978   Idx first, i;
 979   reg_errcode_t err;
 980   re_node_set init_nodes;
 981
 982   /* Initial states have the epsilon closure of the node which is
 983      the first node of the regular expression.  */
 984   first = dfa->str_tree->first->node_idx;
 985   dfa->init_node = first;
 986   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 987   if (BE (err != REG_NOERROR, 0))
 988     return err;
 989
 990   /* The back-references which are in initial states can epsilon transit,
 991      since in this case all of the subexpressions can be null.
 992      Then we add epsilon closures of the nodes which are the next nodes of
 993      the back-references.  */
 994   if (dfa->nbackref > 0)
 995     for (i = 0; i < init_nodes.nelem; ++i)
 996       {
 997         Idx node_idx = init_nodes.elems[i];
 998         re_token_type_t type = dfa->nodes[node_idx].type;
 999
1000         Idx clexp_idx;
1001         if (type != OP_BACK_REF)
1002           continue;
1003         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1004           {
1005             re_token_t *clexp_node;
1006             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1007             if (clexp_node->type == OP_CLOSE_SUBEXP
1008                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1009               break;
1010           }
1011         if (clexp_idx == init_nodes.nelem)
1012           continue;
1013
1014         if (type == OP_BACK_REF)
1015           {
1016             Idx dest_idx = dfa->edests[node_idx].elems[0];
1017             if (!re_node_set_contains (&init_nodes, dest_idx))
1018               {
1019                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1020                 i = 0;
1021               }
1022           }
1023       }
1024
1025   /* It must be the first time to invoke acquire_state.  */
1026   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1027   /* We don't check ERR here, since the initial state must not be NULL.  */
1028   if (BE (dfa->init_state == NULL, 0))
1029     return err;
1030   if (dfa->init_state->has_constraint)
1031     {
1032       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1033                                                        CONTEXT_WORD);
1034       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1035                                                      CONTEXT_NEWLINE);
1036       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1037                                                          &init_nodes,
1038                                                          CONTEXT_NEWLINE
1039                                                          | CONTEXT_BEGBUF);
1040       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1041               || dfa->init_state_begbuf == NULL, 0))
1042         return err;
1043     }
1044   else
1045     dfa->init_state_word = dfa->init_state_nl
1046       = dfa->init_state_begbuf = dfa->init_state;
1047
1048   re_node_set_free (&init_nodes);
1049   return REG_NOERROR;
1050 }
1051 \f
1052 #ifdef RE_ENABLE_I18N
1053 /* If it is possible to do searching in single byte encoding instead of UTF-8
1054    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1055    DFA nodes where needed.  */
1056
1057 static void
1058 optimize_utf8 (re_dfa_t *dfa)
1059 {
1060   Idx node;
1061   int i;
1062   bool mb_chars = false;
1063   bool has_period = false;
1064
1065   for (node = 0; node < dfa->nodes_len; ++node)
1066     switch (dfa->nodes[node].type)
1067       {
1068       case CHARACTER:
1069         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1070           mb_chars = true;
1071         break;
1072       case ANCHOR:
1073         switch (dfa->nodes[node].opr.ctx_type)
1074           {
1075           case LINE_FIRST:
1076           case LINE_LAST:
1077           case BUF_FIRST:
1078           case BUF_LAST:
1079             break;
1080           default:
1081             /* Word anchors etc. cannot be handled.  It's okay to test
1082                opr.ctx_type since constraints (for all DFA nodes) are
1083                created by ORing one or more opr.ctx_type values.  */
1084             return;
1085           }
1086         break;
1087       case OP_PERIOD:
1088         has_period = true;
1089         break;
1090       case OP_BACK_REF:
1091       case OP_ALT:
1092       case END_OF_RE:
1093       case OP_DUP_ASTERISK:
1094       case OP_OPEN_SUBEXP:
1095       case OP_CLOSE_SUBEXP:
1096         break;
1097       case COMPLEX_BRACKET:
1098         return;
1099       case SIMPLE_BRACKET:
1100         /* Just double check.  */
1101         {
1102           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1103                         ? 0
1104                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1105           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1106             {
1107               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1108                 return;
1109               rshift = 0;
1110             }
1111         }
1112         break;
1113       default:
1114         abort ();
1115       }
1116
1117   if (mb_chars || has_period)
1118     for (node = 0; node < dfa->nodes_len; ++node)
1119       {
1120         if (dfa->nodes[node].type == CHARACTER
1121             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1122           dfa->nodes[node].mb_partial = 0;
1123         else if (dfa->nodes[node].type == OP_PERIOD)
1124           dfa->nodes[node].type = OP_UTF8_PERIOD;
1125       }
1126
1127   /* The search can be in single byte locale.  */
1128   dfa->mb_cur_max = 1;
1129   dfa->is_utf8 = 0;
1130   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1131 }
1132 #endif
1133 \f
1134 /* Analyze the structure tree, and calculate "first", "next", "edest",
1135    "eclosure", and "inveclosure".  */
1136
1137 static reg_errcode_t
1138 analyze (regex_t *preg)
1139 {
1140   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1141   reg_errcode_t ret;
1142
1143   /* Allocate arrays.  */
1144   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1145   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1146   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1147   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1148   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1149           || dfa->eclosures == NULL, 0))
1150     return REG_ESPACE;
1151
1152   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1153   if (dfa->subexp_map != NULL)
1154     {
1155       Idx i;
1156       for (i = 0; i < preg->re_nsub; i++)
1157         dfa->subexp_map[i] = i;
1158       preorder (dfa->str_tree, optimize_subexps, dfa);
1159       for (i = 0; i < preg->re_nsub; i++)
1160         if (dfa->subexp_map[i] != i)
1161           break;
1162       if (i == preg->re_nsub)
1163         {
1164           free (dfa->subexp_map);
1165           dfa->subexp_map = NULL;
1166         }
1167     }
1168
1169   ret = postorder (dfa->str_tree, lower_subexps, preg);
1170   if (BE (ret != REG_NOERROR, 0))
1171     return ret;
1172   ret = postorder (dfa->str_tree, calc_first, dfa);
1173   if (BE (ret != REG_NOERROR, 0))
1174     return ret;
1175   preorder (dfa->str_tree, calc_next, dfa);
1176   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1177   if (BE (ret != REG_NOERROR, 0))
1178     return ret;
1179   ret = calc_eclosure (dfa);
1180   if (BE (ret != REG_NOERROR, 0))
1181     return ret;
1182
1183   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1184      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1185   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1186       || dfa->nbackref)
1187     {
1188       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1189       if (BE (dfa->inveclosures == NULL, 0))
1190         return REG_ESPACE;
1191       ret = calc_inveclosure (dfa);
1192     }
1193
1194   return ret;
1195 }
1196
1197 /* Our parse trees are very unbalanced, so we cannot use a stack to
1198    implement parse tree visits.  Instead, we use parent pointers and
1199    some hairy code in these two functions.  */
1200 static reg_errcode_t
1201 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1202            void *extra)
1203 {
1204   bin_tree_t *node, *prev;
1205
1206   for (node = root; ; )
1207     {
1208       /* Descend down the tree, preferably to the left (or to the right
1209          if that's the only child).  */
1210       while (node->left || node->right)
1211         if (node->left)
1212           node = node->left;
1213         else
1214           node = node->right;
1215
1216       do
1217         {
1218           reg_errcode_t err = fn (extra, node);
1219           if (BE (err != REG_NOERROR, 0))
1220             return err;
1221           if (node->parent == NULL)
1222             return REG_NOERROR;
1223           prev = node;
1224           node = node->parent;
1225         }
1226       /* Go up while we have a node that is reached from the right.  */
1227       while (node->right == prev || node->right == NULL);
1228       node = node->right;
1229     }
1230 }
1231
1232 static reg_errcode_t
1233 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1234           void *extra)
1235 {
1236   bin_tree_t *node;
1237
1238   for (node = root; ; )
1239     {
1240       reg_errcode_t err = fn (extra, node);
1241       if (BE (err != REG_NOERROR, 0))
1242         return err;
1243
1244       /* Go to the left node, or up and to the right.  */
1245       if (node->left)
1246         node = node->left;
1247       else
1248         {
1249           bin_tree_t *prev = NULL;
1250           while (node->right == prev || node->right == NULL)
1251             {
1252               prev = node;
1253               node = node->parent;
1254               if (!node)
1255                 return REG_NOERROR;
1256             }
1257           node = node->right;
1258         }
1259     }
1260 }
1261
1262 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1263    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1264    backreferences as well.  Requires a preorder visit.  */
1265 static reg_errcode_t
1266 optimize_subexps (void *extra, bin_tree_t *node)
1267 {
1268   re_dfa_t *dfa = (re_dfa_t *) extra;
1269
1270   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1271     {
1272       int idx = node->token.opr.idx;
1273       node->token.opr.idx = dfa->subexp_map[idx];
1274       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1275     }
1276
1277   else if (node->token.type == SUBEXP
1278            && node->left && node->left->token.type == SUBEXP)
1279     {
1280       Idx other_idx = node->left->token.opr.idx;
1281
1282       node->left = node->left->left;
1283       if (node->left)
1284         node->left->parent = node;
1285
1286       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1287       if (other_idx < BITSET_WORD_BITS)
1288         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1289     }
1290
1291   return REG_NOERROR;
1292 }
1293
1294 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1295    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1296 static reg_errcode_t
1297 lower_subexps (void *extra, bin_tree_t *node)
1298 {
1299   regex_t *preg = (regex_t *) extra;
1300   reg_errcode_t err = REG_NOERROR;
1301
1302   if (node->left && node->left->token.type == SUBEXP)
1303     {
1304       node->left = lower_subexp (&err, preg, node->left);
1305       if (node->left)
1306         node->left->parent = node;
1307     }
1308   if (node->right && node->right->token.type == SUBEXP)
1309     {
1310       node->right = lower_subexp (&err, preg, node->right);
1311       if (node->right)
1312         node->right->parent = node;
1313     }
1314
1315   return err;
1316 }
1317
1318 static bin_tree_t *
1319 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1320 {
1321   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1322   bin_tree_t *body = node->left;
1323   bin_tree_t *op, *cls, *tree1, *tree;
1324
1325   if (preg->no_sub
1326       /* We do not optimize empty subexpressions, because otherwise we may
1327          have bad CONCAT nodes with NULL children.  This is obviously not
1328          very common, so we do not lose much.  An example that triggers
1329          this case is the sed "script" /\(\)/x.  */
1330       && node->left != NULL
1331       && (node->token.opr.idx >= BITSET_WORD_BITS
1332           || !(dfa->used_bkref_map
1333                & ((bitset_word_t) 1 << node->token.opr.idx))))
1334     return node->left;
1335
1336   /* Convert the SUBEXP node to the concatenation of an
1337      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1338   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1339   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1340   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1341   tree = create_tree (dfa, op, tree1, CONCAT);
1342   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1343     {
1344       *err = REG_ESPACE;
1345       return NULL;
1346     }
1347
1348   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1349   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1350   return tree;
1351 }
1352
1353 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1354    nodes.  Requires a postorder visit.  */
1355 static reg_errcode_t
1356 calc_first (void *extra, bin_tree_t *node)
1357 {
1358   re_dfa_t *dfa = (re_dfa_t *) extra;
1359   if (node->token.type == CONCAT)
1360     {
1361       node->first = node->left->first;
1362       node->node_idx = node->left->node_idx;
1363     }
1364   else
1365     {
1366       node->first = node;
1367       node->node_idx = re_dfa_add_node (dfa, node->token);
1368       if (BE (node->node_idx == REG_MISSING, 0))
1369         return REG_ESPACE;
1370       if (node->token.type == ANCHOR)
1371         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1372     }
1373   return REG_NOERROR;
1374 }
1375
1376 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1377 static reg_errcode_t
1378 calc_next (void *extra, bin_tree_t *node)
1379 {
1380   switch (node->token.type)
1381     {
1382     case OP_DUP_ASTERISK:
1383       node->left->next = node;
1384       break;
1385     case CONCAT:
1386       node->left->next = node->right->first;
1387       node->right->next = node->next;
1388       break;
1389     default:
1390       if (node->left)
1391         node->left->next = node->next;
1392       if (node->right)
1393         node->right->next = node->next;
1394       break;
1395     }
1396   return REG_NOERROR;
1397 }
1398
1399 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1400 static reg_errcode_t
1401 link_nfa_nodes (void *extra, bin_tree_t *node)
1402 {
1403   re_dfa_t *dfa = (re_dfa_t *) extra;
1404   Idx idx = node->node_idx;
1405   reg_errcode_t err = REG_NOERROR;
1406
1407   switch (node->token.type)
1408     {
1409     case CONCAT:
1410       break;
1411
1412     case END_OF_RE:
1413       assert (node->next == NULL);
1414       break;
1415
1416     case OP_DUP_ASTERISK:
1417     case OP_ALT:
1418       {
1419         Idx left, right;
1420         dfa->has_plural_match = 1;
1421         if (node->left != NULL)
1422           left = node->left->first->node_idx;
1423         else
1424           left = node->next->node_idx;
1425         if (node->right != NULL)
1426           right = node->right->first->node_idx;
1427         else
1428           right = node->next->node_idx;
1429         assert (REG_VALID_INDEX (left));
1430         assert (REG_VALID_INDEX (right));
1431         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1432       }
1433       break;
1434
1435     case ANCHOR:
1436     case OP_OPEN_SUBEXP:
1437     case OP_CLOSE_SUBEXP:
1438       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1439       break;
1440
1441     case OP_BACK_REF:
1442       dfa->nexts[idx] = node->next->node_idx;
1443       if (node->token.type == OP_BACK_REF)
1444         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1445       break;
1446
1447     default:
1448       assert (!IS_EPSILON_NODE (node->token.type));
1449       dfa->nexts[idx] = node->next->node_idx;
1450       break;
1451     }
1452
1453   return err;
1454 }
1455
1456 /* Duplicate the epsilon closure of the node ROOT_NODE.
1457    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1458    to their own constraint.  */
1459
1460 static reg_errcode_t
1461 internal_function
1462 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1463                         Idx root_node, unsigned int init_constraint)
1464 {
1465   Idx org_node, clone_node;
1466   bool ok;
1467   unsigned int constraint = init_constraint;
1468   for (org_node = top_org_node, clone_node = top_clone_node;;)
1469     {
1470       Idx org_dest, clone_dest;
1471       if (dfa->nodes[org_node].type == OP_BACK_REF)
1472         {
1473           /* If the back reference epsilon-transit, its destination must
1474              also have the constraint.  Then duplicate the epsilon closure
1475              of the destination of the back reference, and store it in
1476              edests of the back reference.  */
1477           org_dest = dfa->nexts[org_node];
1478           re_node_set_empty (dfa->edests + clone_node);
1479           clone_dest = duplicate_node (dfa, org_dest, constraint);
1480           if (BE (clone_dest == REG_MISSING, 0))
1481             return REG_ESPACE;
1482           dfa->nexts[clone_node] = dfa->nexts[org_node];
1483           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1484           if (BE (! ok, 0))
1485             return REG_ESPACE;
1486         }
1487       else if (dfa->edests[org_node].nelem == 0)
1488         {
1489           /* In case of the node can't epsilon-transit, don't duplicate the
1490              destination and store the original destination as the
1491              destination of the node.  */
1492           dfa->nexts[clone_node] = dfa->nexts[org_node];
1493           break;
1494         }
1495       else if (dfa->edests[org_node].nelem == 1)
1496         {
1497           /* In case of the node can epsilon-transit, and it has only one
1498              destination.  */
1499           org_dest = dfa->edests[org_node].elems[0];
1500           re_node_set_empty (dfa->edests + clone_node);
1501           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1502           /* If the node is root_node itself, it means the epsilon closure
1503              has a loop.  Then tie it to the destination of the root_node.  */
1504           if (org_node == root_node && clone_node != org_node)
1505             {
1506               ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1507               if (BE (! ok, 0))
1508                 return REG_ESPACE;
1509               break;
1510             }
1511           /* In case the node has another constraint, append it.  */
1512           constraint |= dfa->nodes[org_node].constraint;
1513           clone_dest = duplicate_node (dfa, org_dest, constraint);
1514           if (BE (clone_dest == REG_MISSING, 0))
1515             return REG_ESPACE;
1516           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1517           if (BE (! ok, 0))
1518             return REG_ESPACE;
1519         }
1520       else /* dfa->edests[org_node].nelem == 2 */
1521         {
1522           /* In case of the node can epsilon-transit, and it has two
1523              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1524           org_dest = dfa->edests[org_node].elems[0];
1525           re_node_set_empty (dfa->edests + clone_node);
1526           /* Search for a duplicated node which satisfies the constraint.  */
1527           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1528           if (clone_dest == REG_MISSING)
1529             {
1530               /* There is no such duplicated node, create a new one.  */
1531               reg_errcode_t err;
1532               clone_dest = duplicate_node (dfa, org_dest, constraint);
1533               if (BE (clone_dest == REG_MISSING, 0))
1534                 return REG_ESPACE;
1535               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1536               if (BE (! ok, 0))
1537                 return REG_ESPACE;
1538               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1539                                             root_node, constraint);
1540               if (BE (err != REG_NOERROR, 0))
1541                 return err;
1542             }
1543           else
1544             {
1545               /* There is a duplicated node which satisfy the constraint,
1546                  use it to avoid infinite loop.  */
1547               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1548               if (BE (! ok, 0))
1549                 return REG_ESPACE;
1550             }
1551
1552           org_dest = dfa->edests[org_node].elems[1];
1553           clone_dest = duplicate_node (dfa, org_dest, constraint);
1554           if (BE (clone_dest == REG_MISSING, 0))
1555             return REG_ESPACE;
1556           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1557           if (BE (! ok, 0))
1558             return REG_ESPACE;
1559         }
1560       org_node = org_dest;
1561       clone_node = clone_dest;
1562     }
1563   return REG_NOERROR;
1564 }
1565
1566 /* Search for a node which is duplicated from the node ORG_NODE, and
1567    satisfies the constraint CONSTRAINT.  */
1568
1569 static Idx
1570 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1571                         unsigned int constraint)
1572 {
1573   Idx idx;
1574   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1575     {
1576       if (org_node == dfa->org_indices[idx]
1577           && constraint == dfa->nodes[idx].constraint)
1578         return idx; /* Found.  */
1579     }
1580   return REG_MISSING; /* Not found.  */
1581 }
1582
1583 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1584    Return the index of the new node, or REG_MISSING if insufficient storage is
1585    available.  */
1586
1587 static Idx
1588 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1589 {
1590   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1591   if (BE (dup_idx != REG_MISSING, 1))
1592     {
1593       dfa->nodes[dup_idx].constraint = constraint;
1594       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1595       dfa->nodes[dup_idx].duplicated = 1;
1596
1597       /* Store the index of the original node.  */
1598       dfa->org_indices[dup_idx] = org_idx;
1599     }
1600   return dup_idx;
1601 }
1602
1603 static reg_errcode_t
1604 calc_inveclosure (re_dfa_t *dfa)
1605 {
1606   Idx src, idx;
1607   bool ok;
1608   for (idx = 0; idx < dfa->nodes_len; ++idx)
1609     re_node_set_init_empty (dfa->inveclosures + idx);
1610
1611   for (src = 0; src < dfa->nodes_len; ++src)
1612     {
1613       Idx *elems = dfa->eclosures[src].elems;
1614       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1615         {
1616           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1617           if (BE (! ok, 0))
1618             return REG_ESPACE;
1619         }
1620     }
1621
1622   return REG_NOERROR;
1623 }
1624
1625 /* Calculate "eclosure" for all the node in DFA.  */
1626
1627 static reg_errcode_t
1628 calc_eclosure (re_dfa_t *dfa)
1629 {
1630   Idx node_idx;
1631   bool incomplete;
1632 #ifdef DEBUG
1633   assert (dfa->nodes_len > 0);
1634 #endif
1635   incomplete = false;
1636   /* For each nodes, calculate epsilon closure.  */
1637   for (node_idx = 0; ; ++node_idx)
1638     {
1639       reg_errcode_t err;
1640       re_node_set eclosure_elem;
1641       if (node_idx == dfa->nodes_len)
1642         {
1643           if (!incomplete)
1644             break;
1645           incomplete = false;
1646           node_idx = 0;
1647         }
1648
1649 #ifdef DEBUG
1650       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1651 #endif
1652
1653       /* If we have already calculated, skip it.  */
1654       if (dfa->eclosures[node_idx].nelem != 0)
1655         continue;
1656       /* Calculate epsilon closure of `node_idx'.  */
1657       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1658       if (BE (err != REG_NOERROR, 0))
1659         return err;
1660
1661       if (dfa->eclosures[node_idx].nelem == 0)
1662         {
1663           incomplete = true;
1664           re_node_set_free (&eclosure_elem);
1665         }
1666     }
1667   return REG_NOERROR;
1668 }
1669
1670 /* Calculate epsilon closure of NODE.  */
1671
1672 static reg_errcode_t
1673 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1674 {
1675   reg_errcode_t err;
1676   Idx i;
1677   bool incomplete;
1678   bool ok;
1679   re_node_set eclosure;
1680   incomplete = false;
1681   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1682   if (BE (err != REG_NOERROR, 0))
1683     return err;
1684
1685   /* This indicates that we are calculating this node now.
1686      We reference this value to avoid infinite loop.  */
1687   dfa->eclosures[node].nelem = REG_MISSING;
1688
1689   /* If the current node has constraints, duplicate all nodes
1690      since they must inherit the constraints.  */
1691   if (dfa->nodes[node].constraint
1692       && dfa->edests[node].nelem
1693       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1694     {
1695       err = duplicate_node_closure (dfa, node, node, node,
1696                                     dfa->nodes[node].constraint);
1697       if (BE (err != REG_NOERROR, 0))
1698         return err;
1699     }
1700
1701   /* Expand each epsilon destination nodes.  */
1702   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1703     for (i = 0; i < dfa->edests[node].nelem; ++i)
1704       {
1705         re_node_set eclosure_elem;
1706         Idx edest = dfa->edests[node].elems[i];
1707         /* If calculating the epsilon closure of `edest' is in progress,
1708            return intermediate result.  */
1709         if (dfa->eclosures[edest].nelem == REG_MISSING)
1710           {
1711             incomplete = true;
1712             continue;
1713           }
1714         /* If we haven't calculated the epsilon closure of `edest' yet,
1715            calculate now. Otherwise use calculated epsilon closure.  */
1716         if (dfa->eclosures[edest].nelem == 0)
1717           {
1718             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1719             if (BE (err != REG_NOERROR, 0))
1720               return err;
1721           }
1722         else
1723           eclosure_elem = dfa->eclosures[edest];
1724         /* Merge the epsilon closure of `edest'.  */
1725         re_node_set_merge (&eclosure, &eclosure_elem);
1726         /* If the epsilon closure of `edest' is incomplete,
1727            the epsilon closure of this node is also incomplete.  */
1728         if (dfa->eclosures[edest].nelem == 0)
1729           {
1730             incomplete = true;
1731             re_node_set_free (&eclosure_elem);
1732           }
1733       }
1734
1735   /* Epsilon closures include itself.  */
1736   ok = re_node_set_insert (&eclosure, node);
1737   if (BE (! ok, 0))
1738     return REG_ESPACE;
1739   if (incomplete && !root)
1740     dfa->eclosures[node].nelem = 0;
1741   else
1742     dfa->eclosures[node] = eclosure;
1743   *new_set = eclosure;
1744   return REG_NOERROR;
1745 }
1746 \f
1747 /* Functions for token which are used in the parser.  */
1748
1749 /* Fetch a token from INPUT.
1750    We must not use this function inside bracket expressions.  */
1751
1752 static void
1753 internal_function
1754 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1755 {
1756   re_string_skip_bytes (input, peek_token (result, input, syntax));
1757 }
1758
1759 /* Peek a token from INPUT, and return the length of the token.
1760    We must not use this function inside bracket expressions.  */
1761
1762 static int
1763 internal_function
1764 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1765 {
1766   unsigned char c;
1767
1768   if (re_string_eoi (input))
1769     {
1770       token->type = END_OF_RE;
1771       return 0;
1772     }
1773
1774   c = re_string_peek_byte (input, 0);
1775   token->opr.c = c;
1776
1777   token->word_char = 0;
1778 #ifdef RE_ENABLE_I18N
1779   token->mb_partial = 0;
1780   if (input->mb_cur_max > 1 &&
1781       !re_string_first_byte (input, re_string_cur_idx (input)))
1782     {
1783       token->type = CHARACTER;
1784       token->mb_partial = 1;
1785       return 1;
1786     }
1787 #endif
1788   if (c == '\\')
1789     {
1790       unsigned char c2;
1791       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1792         {
1793           token->type = BACK_SLASH;
1794           return 1;
1795         }
1796
1797       c2 = re_string_peek_byte_case (input, 1);
1798       token->opr.c = c2;
1799       token->type = CHARACTER;
1800 #ifdef RE_ENABLE_I18N
1801       if (input->mb_cur_max > 1)
1802         {
1803           wint_t wc = re_string_wchar_at (input,
1804                                           re_string_cur_idx (input) + 1);
1805           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1806         }
1807       else
1808 #endif
1809         token->word_char = IS_WORD_CHAR (c2) != 0;
1810
1811       switch (c2)
1812         {
1813         case '|':
1814           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1815             token->type = OP_ALT;
1816           break;
1817         case '1': case '2': case '3': case '4': case '5':
1818         case '6': case '7': case '8': case '9':
1819           if (!(syntax & RE_NO_BK_REFS))
1820             {
1821               token->type = OP_BACK_REF;
1822               token->opr.idx = c2 - '1';
1823             }
1824           break;
1825         case '<':
1826           if (!(syntax & RE_NO_GNU_OPS))
1827             {
1828               token->type = ANCHOR;
1829               token->opr.ctx_type = WORD_FIRST;
1830             }
1831           break;
1832         case '>':
1833           if (!(syntax & RE_NO_GNU_OPS))
1834             {
1835               token->type = ANCHOR;
1836               token->opr.ctx_type = WORD_LAST;
1837             }
1838           break;
1839         case 'b':
1840           if (!(syntax & RE_NO_GNU_OPS))
1841             {
1842               token->type = ANCHOR;
1843               token->opr.ctx_type = WORD_DELIM;
1844             }
1845           break;
1846         case 'B':
1847           if (!(syntax & RE_NO_GNU_OPS))
1848             {
1849               token->type = ANCHOR;
1850               token->opr.ctx_type = NOT_WORD_DELIM;
1851             }
1852           break;
1853         case 'w':
1854           if (!(syntax & RE_NO_GNU_OPS))
1855             token->type = OP_WORD;
1856           break;
1857         case 'W':
1858           if (!(syntax & RE_NO_GNU_OPS))
1859             token->type = OP_NOTWORD;
1860           break;
1861         case 's':
1862           if (!(syntax & RE_NO_GNU_OPS))
1863             token->type = OP_SPACE;
1864           break;
1865         case 'S':
1866           if (!(syntax & RE_NO_GNU_OPS))
1867             token->type = OP_NOTSPACE;
1868           break;
1869         case '`':
1870           if (!(syntax & RE_NO_GNU_OPS))
1871             {
1872               token->type = ANCHOR;
1873               token->opr.ctx_type = BUF_FIRST;
1874             }
1875           break;
1876         case '\'':
1877           if (!(syntax & RE_NO_GNU_OPS))
1878             {
1879               token->type = ANCHOR;
1880               token->opr.ctx_type = BUF_LAST;
1881             }
1882           break;
1883         case '(':
1884           if (!(syntax & RE_NO_BK_PARENS))
1885             token->type = OP_OPEN_SUBEXP;
1886           break;
1887         case ')':
1888           if (!(syntax & RE_NO_BK_PARENS))
1889             token->type = OP_CLOSE_SUBEXP;
1890           break;
1891         case '+':
1892           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1893             token->type = OP_DUP_PLUS;
1894           break;
1895         case '?':
1896           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1897             token->type = OP_DUP_QUESTION;
1898           break;
1899         case '{':
1900           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1901             token->type = OP_OPEN_DUP_NUM;
1902           break;
1903         case '}':
1904           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1905             token->type = OP_CLOSE_DUP_NUM;
1906           break;
1907         default:
1908           break;
1909         }
1910       return 2;
1911     }
1912
1913   token->type = CHARACTER;
1914 #ifdef RE_ENABLE_I18N
1915   if (input->mb_cur_max > 1)
1916     {
1917       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1918       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1919     }
1920   else
1921 #endif
1922     token->word_char = IS_WORD_CHAR (token->opr.c);
1923
1924   switch (c)
1925     {
1926     case '\n':
1927       if (syntax & RE_NEWLINE_ALT)
1928         token->type = OP_ALT;
1929       break;
1930     case '|':
1931       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1932         token->type = OP_ALT;
1933       break;
1934     case '*':
1935       token->type = OP_DUP_ASTERISK;
1936       break;
1937     case '+':
1938       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1939         token->type = OP_DUP_PLUS;
1940       break;
1941     case '?':
1942       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1943         token->type = OP_DUP_QUESTION;
1944       break;
1945     case '{':
1946       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1947         token->type = OP_OPEN_DUP_NUM;
1948       break;
1949     case '}':
1950       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1951         token->type = OP_CLOSE_DUP_NUM;
1952       break;
1953     case '(':
1954       if (syntax & RE_NO_BK_PARENS)
1955         token->type = OP_OPEN_SUBEXP;
1956       break;
1957     case ')':
1958       if (syntax & RE_NO_BK_PARENS)
1959         token->type = OP_CLOSE_SUBEXP;
1960       break;
1961     case '[':
1962       token->type = OP_OPEN_BRACKET;
1963       break;
1964     case '.':
1965       token->type = OP_PERIOD;
1966       break;
1967     case '^':
1968       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1969           re_string_cur_idx (input) != 0)
1970         {
1971           char prev = re_string_peek_byte (input, -1);
1972           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1973             break;
1974         }
1975       token->type = ANCHOR;
1976       token->opr.ctx_type = LINE_FIRST;
1977       break;
1978     case '$':
1979       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1980           re_string_cur_idx (input) + 1 != re_string_length (input))
1981         {
1982           re_token_t next;
1983           re_string_skip_bytes (input, 1);
1984           peek_token (&next, input, syntax);
1985           re_string_skip_bytes (input, -1);
1986           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1987             break;
1988         }
1989       token->type = ANCHOR;
1990       token->opr.ctx_type = LINE_LAST;
1991       break;
1992     default:
1993       break;
1994     }
1995   return 1;
1996 }
1997
1998 /* Peek a token from INPUT, and return the length of the token.
1999    We must not use this function out of bracket expressions.  */
2000
2001 static int
2002 internal_function
2003 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2004 {
2005   unsigned char c;
2006   if (re_string_eoi (input))
2007     {
2008       token->type = END_OF_RE;
2009       return 0;
2010     }
2011   c = re_string_peek_byte (input, 0);
2012   token->opr.c = c;
2013
2014 #ifdef RE_ENABLE_I18N
2015   if (input->mb_cur_max > 1 &&
2016       !re_string_first_byte (input, re_string_cur_idx (input)))
2017     {
2018       token->type = CHARACTER;
2019       return 1;
2020     }
2021 #endif /* RE_ENABLE_I18N */
2022
2023   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2024       && re_string_cur_idx (input) + 1 < re_string_length (input))
2025     {
2026       /* In this case, '\' escape a character.  */
2027       unsigned char c2;
2028       re_string_skip_bytes (input, 1);
2029       c2 = re_string_peek_byte (input, 0);
2030       token->opr.c = c2;
2031       token->type = CHARACTER;
2032       return 1;
2033     }
2034   if (c == '[') /* '[' is a special char in a bracket exps.  */
2035     {
2036       unsigned char c2;
2037       int token_len;
2038       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2039         c2 = re_string_peek_byte (input, 1);
2040       else
2041         c2 = 0;
2042       token->opr.c = c2;
2043       token_len = 2;
2044       switch (c2)
2045         {
2046         case '.':
2047           token->type = OP_OPEN_COLL_ELEM;
2048           break;
2049         case '=':
2050           token->type = OP_OPEN_EQUIV_CLASS;
2051           break;
2052         case ':':
2053           if (syntax & RE_CHAR_CLASSES)
2054             {
2055               token->type = OP_OPEN_CHAR_CLASS;
2056               break;
2057             }
2058           /* else fall through.  */
2059         default:
2060           token->type = CHARACTER;
2061           token->opr.c = c;
2062           token_len = 1;
2063           break;
2064         }
2065       return token_len;
2066     }
2067   switch (c)
2068     {
2069     case '-':
2070       token->type = OP_CHARSET_RANGE;
2071       break;
2072     case ']':
2073       token->type = OP_CLOSE_BRACKET;
2074       break;
2075     case '^':
2076       token->type = OP_NON_MATCH_LIST;
2077       break;
2078     default:
2079       token->type = CHARACTER;
2080     }
2081   return 1;
2082 }
2083 \f
2084 /* Functions for parser.  */
2085
2086 /* Entry point of the parser.
2087    Parse the regular expression REGEXP and return the structure tree.
2088    If an error is occured, ERR is set by error code, and return NULL.
2089    This function build the following tree, from regular expression <reg_exp>:
2090            CAT
2091            / \
2092           /   \
2093    <reg_exp>  EOR
2094
2095    CAT means concatenation.
2096    EOR means end of regular expression.  */
2097
2098 static bin_tree_t *
2099 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2100        reg_errcode_t *err)
2101 {
2102   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2103   bin_tree_t *tree, *eor, *root;
2104   re_token_t current_token;
2105   dfa->syntax = syntax;
2106   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2107   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2108   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2109     return NULL;
2110   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2111   if (tree != NULL)
2112     root = create_tree (dfa, tree, eor, CONCAT);
2113   else
2114     root = eor;
2115   if (BE (eor == NULL || root == NULL, 0))
2116     {
2117       *err = REG_ESPACE;
2118       return NULL;
2119     }
2120   return root;
2121 }
2122
2123 /* This function build the following tree, from regular expression
2124    <branch1>|<branch2>:
2125            ALT
2126            / \
2127           /   \
2128    <branch1> <branch2>
2129
2130    ALT means alternative, which represents the operator `|'.  */
2131
2132 static bin_tree_t *
2133 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2134                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2135 {
2136   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2137   bin_tree_t *tree, *branch = NULL;
2138   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2139   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2140     return NULL;
2141
2142   while (token->type == OP_ALT)
2143     {
2144       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2145       if (token->type != OP_ALT && token->type != END_OF_RE
2146           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2147         {
2148           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2149           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2150             return NULL;
2151         }
2152       else
2153         branch = NULL;
2154       tree = create_tree (dfa, tree, branch, OP_ALT);
2155       if (BE (tree == NULL, 0))
2156         {
2157           *err = REG_ESPACE;
2158           return NULL;
2159         }
2160     }
2161   return tree;
2162 }
2163
2164 /* This function build the following tree, from regular expression
2165    <exp1><exp2>:
2166         CAT
2167         / \
2168        /   \
2169    <exp1> <exp2>
2170
2171    CAT means concatenation.  */
2172
2173 static bin_tree_t *
2174 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2175               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2176 {
2177   bin_tree_t *tree, *expr;
2178   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2179   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2180   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2181     return NULL;
2182
2183   while (token->type != OP_ALT && token->type != END_OF_RE
2184          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2185     {
2186       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2187       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2188         {
2189           return NULL;
2190         }
2191       if (tree != NULL && expr != NULL)
2192         {
2193           tree = create_tree (dfa, tree, expr, CONCAT);
2194           if (tree == NULL)
2195             {
2196               *err = REG_ESPACE;
2197               return NULL;
2198             }
2199         }
2200       else if (tree == NULL)
2201         tree = expr;
2202       /* Otherwise expr == NULL, we don't need to create new tree.  */
2203     }
2204   return tree;
2205 }
2206
2207 /* This function build the following tree, from regular expression a*:
2208          *
2209          |
2210          a
2211 */
2212
2213 static bin_tree_t *
2214 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2215                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2216 {
2217   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2218   bin_tree_t *tree;
2219   switch (token->type)
2220     {
2221     case CHARACTER:
2222       tree = create_token_tree (dfa, NULL, NULL, token);
2223       if (BE (tree == NULL, 0))
2224         {
2225           *err = REG_ESPACE;
2226           return NULL;
2227         }
2228 #ifdef RE_ENABLE_I18N
2229       if (dfa->mb_cur_max > 1)
2230         {
2231           while (!re_string_eoi (regexp)
2232                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2233             {
2234               bin_tree_t *mbc_remain;
2235               fetch_token (token, regexp, syntax);
2236               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2237               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2238               if (BE (mbc_remain == NULL || tree == NULL, 0))
2239                 {
2240                   *err = REG_ESPACE;
2241                   return NULL;
2242                 }
2243             }
2244         }
2245 #endif
2246       break;
2247     case OP_OPEN_SUBEXP:
2248       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2249       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2250         return NULL;
2251       break;
2252     case OP_OPEN_BRACKET:
2253       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2254       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2255         return NULL;
2256       break;
2257     case OP_BACK_REF:
2258       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2259         {
2260           *err = REG_ESUBREG;
2261           return NULL;
2262         }
2263       dfa->used_bkref_map |= 1 << token->opr.idx;
2264       tree = create_token_tree (dfa, NULL, NULL, token);
2265       if (BE (tree == NULL, 0))
2266         {
2267           *err = REG_ESPACE;
2268           return NULL;
2269         }
2270       ++dfa->nbackref;
2271       dfa->has_mb_node = 1;
2272       break;
2273     case OP_OPEN_DUP_NUM:
2274       if (syntax & RE_CONTEXT_INVALID_DUP)
2275         {
2276           *err = REG_BADRPT;
2277           return NULL;
2278         }
2279       /* FALLTHROUGH */
2280     case OP_DUP_ASTERISK:
2281     case OP_DUP_PLUS:
2282     case OP_DUP_QUESTION:
2283       if (syntax & RE_CONTEXT_INVALID_OPS)
2284         {
2285           *err = REG_BADRPT;
2286           return NULL;
2287         }
2288       else if (syntax & RE_CONTEXT_INDEP_OPS)
2289         {
2290           fetch_token (token, regexp, syntax);
2291           return parse_expression (regexp, preg, token, syntax, nest, err);
2292         }
2293       /* else fall through  */
2294     case OP_CLOSE_SUBEXP:
2295       if ((token->type == OP_CLOSE_SUBEXP) &&
2296           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2297         {
2298           *err = REG_ERPAREN;
2299           return NULL;
2300         }
2301       /* else fall through  */
2302     case OP_CLOSE_DUP_NUM:
2303       /* We treat it as a normal character.  */
2304
2305       /* Then we can these characters as normal characters.  */
2306       token->type = CHARACTER;
2307       /* mb_partial and word_char bits should be initialized already
2308          by peek_token.  */
2309       tree = create_token_tree (dfa, NULL, NULL, token);
2310       if (BE (tree == NULL, 0))
2311         {
2312           *err = REG_ESPACE;
2313           return NULL;
2314         }
2315       break;
2316     case ANCHOR:
2317       if ((token->opr.ctx_type
2318            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2319           && dfa->word_ops_used == 0)
2320         init_word_char (dfa);
2321       if (token->opr.ctx_type == WORD_DELIM
2322           || token->opr.ctx_type == NOT_WORD_DELIM)
2323         {
2324           bin_tree_t *tree_first, *tree_last;
2325           if (token->opr.ctx_type == WORD_DELIM)
2326             {
2327               token->opr.ctx_type = WORD_FIRST;
2328               tree_first = create_token_tree (dfa, NULL, NULL, token);
2329               token->opr.ctx_type = WORD_LAST;
2330             }
2331           else
2332             {
2333               token->opr.ctx_type = INSIDE_WORD;
2334               tree_first = create_token_tree (dfa, NULL, NULL, token);
2335               token->opr.ctx_type = INSIDE_NOTWORD;
2336             }
2337           tree_last = create_token_tree (dfa, NULL, NULL, token);
2338           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2339           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2340             {
2341               *err = REG_ESPACE;
2342               return NULL;
2343             }
2344         }
2345       else
2346         {
2347           tree = create_token_tree (dfa, NULL, NULL, token);
2348           if (BE (tree == NULL, 0))
2349             {
2350               *err = REG_ESPACE;
2351               return NULL;
2352             }
2353         }
2354       /* We must return here, since ANCHORs can't be followed
2355          by repetition operators.
2356          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2357              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2358       fetch_token (token, regexp, syntax);
2359       return tree;
2360     case OP_PERIOD:
2361       tree = create_token_tree (dfa, NULL, NULL, token);
2362       if (BE (tree == NULL, 0))
2363         {
2364           *err = REG_ESPACE;
2365           return NULL;
2366         }
2367       if (dfa->mb_cur_max > 1)
2368         dfa->has_mb_node = 1;
2369       break;
2370     case OP_WORD:
2371     case OP_NOTWORD:
2372       tree = build_charclass_op (dfa, regexp->trans,
2373                                  (const unsigned char *) "alnum",
2374                                  (const unsigned char *) "_",
2375                                  token->type == OP_NOTWORD, err);
2376       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2377         return NULL;
2378       break;
2379     case OP_SPACE:
2380     case OP_NOTSPACE:
2381       tree = build_charclass_op (dfa, regexp->trans,
2382                                  (const unsigned char *) "space",
2383                                  (const unsigned char *) "",
2384                                  token->type == OP_NOTSPACE, err);
2385       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2386         return NULL;
2387       break;
2388     case OP_ALT:
2389     case END_OF_RE:
2390       return NULL;
2391     case BACK_SLASH:
2392       *err = REG_EESCAPE;
2393       return NULL;
2394     default:
2395       /* Must not happen?  */
2396 #ifdef DEBUG
2397       assert (0);
2398 #endif
2399       return NULL;
2400     }
2401   fetch_token (token, regexp, syntax);
2402
2403   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2404          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2405     {
2406       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2407       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2408         return NULL;
2409       /* In BRE consecutive duplications are not allowed.  */
2410       if ((syntax & RE_CONTEXT_INVALID_DUP)
2411           && (token->type == OP_DUP_ASTERISK
2412               || token->type == OP_OPEN_DUP_NUM))
2413         {
2414           *err = REG_BADRPT;
2415           return NULL;
2416         }
2417     }
2418
2419   return tree;
2420 }
2421
2422 /* This function build the following tree, from regular expression
2423    (<reg_exp>):
2424          SUBEXP
2425             |
2426         <reg_exp>
2427 */
2428
2429 static bin_tree_t *
2430 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2431                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2432 {
2433   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2434   bin_tree_t *tree;
2435   size_t cur_nsub;
2436   cur_nsub = preg->re_nsub++;
2437
2438   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2439
2440   /* The subexpression may be a null string.  */
2441   if (token->type == OP_CLOSE_SUBEXP)
2442     tree = NULL;
2443   else
2444     {
2445       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2446       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2447         *err = REG_EPAREN;
2448       if (BE (*err != REG_NOERROR, 0))
2449         return NULL;
2450     }
2451
2452   if (cur_nsub <= '9' - '1')
2453     dfa->completed_bkref_map |= 1 << cur_nsub;
2454
2455   tree = create_tree (dfa, tree, NULL, SUBEXP);
2456   if (BE (tree == NULL, 0))
2457     {
2458       *err = REG_ESPACE;
2459       return NULL;
2460     }
2461   tree->token.opr.idx = cur_nsub;
2462   return tree;
2463 }
2464
2465 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2466
2467 static bin_tree_t *
2468 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2469               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2470 {
2471   bin_tree_t *tree = NULL, *old_tree = NULL;
2472   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2473   re_token_t start_token = *token;
2474
2475   if (token->type == OP_OPEN_DUP_NUM)
2476     {
2477       end = 0;
2478       start = fetch_number (regexp, token, syntax);
2479       if (start == REG_MISSING)
2480         {
2481           if (token->type == CHARACTER && token->opr.c == ',')
2482             start = 0; /* We treat "{,m}" as "{0,m}".  */
2483           else
2484             {
2485               *err = REG_BADBR; /* <re>{} is invalid.  */
2486               return NULL;
2487             }
2488         }
2489       if (BE (start != REG_ERROR, 1))
2490         {
2491           /* We treat "{n}" as "{n,n}".  */
2492           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2493                  : ((token->type == CHARACTER && token->opr.c == ',')
2494                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2495         }
2496       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2497         {
2498           /* Invalid sequence.  */
2499           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2500             {
2501               if (token->type == END_OF_RE)
2502                 *err = REG_EBRACE;
2503               else
2504                 *err = REG_BADBR;
2505
2506               return NULL;
2507             }
2508
2509           /* If the syntax bit is set, rollback.  */
2510           re_string_set_index (regexp, start_idx);
2511           *token = start_token;
2512           token->type = CHARACTER;
2513           /* mb_partial and word_char bits should be already initialized by
2514              peek_token.  */
2515           return elem;
2516         }
2517
2518       if (BE (end != REG_MISSING && start > end, 0))
2519         {
2520           /* First number greater than second.  */
2521           *err = REG_BADBR;
2522           return NULL;
2523         }
2524     }
2525   else
2526     {
2527       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2528       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2529     }
2530
2531   fetch_token (token, regexp, syntax);
2532
2533   if (BE (elem == NULL, 0))
2534     return NULL;
2535   if (BE (start == 0 && end == 0, 0))
2536     {
2537       postorder (elem, free_tree, NULL);
2538       return NULL;
2539     }
2540
2541   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2542   if (BE (start > 0, 0))
2543     {
2544       tree = elem;
2545       for (i = 2; i <= start; ++i)
2546         {
2547           elem = duplicate_tree (elem, dfa);
2548           tree = create_tree (dfa, tree, elem, CONCAT);
2549           if (BE (elem == NULL || tree == NULL, 0))
2550             goto parse_dup_op_espace;
2551         }
2552
2553       if (start == end)
2554         return tree;
2555
2556       /* Duplicate ELEM before it is marked optional.  */
2557       elem = duplicate_tree (elem, dfa);
2558       old_tree = tree;
2559     }
2560   else
2561     old_tree = NULL;
2562
2563   if (elem->token.type == SUBEXP)
2564     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2565
2566   tree = create_tree (dfa, elem, NULL,
2567                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2568   if (BE (tree == NULL, 0))
2569     goto parse_dup_op_espace;
2570
2571   /* This loop is actually executed only when end != REG_MISSING,
2572      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2573      already created the start+1-th copy.  */
2574   if ((Idx) -1 < 0 || end != REG_MISSING)
2575     for (i = start + 2; i <= end; ++i)
2576       {
2577         elem = duplicate_tree (elem, dfa);
2578         tree = create_tree (dfa, tree, elem, CONCAT);
2579         if (BE (elem == NULL || tree == NULL, 0))
2580           goto parse_dup_op_espace;
2581
2582         tree = create_tree (dfa, tree, NULL, OP_ALT);
2583         if (BE (tree == NULL, 0))
2584           goto parse_dup_op_espace;
2585       }
2586
2587   if (old_tree)
2588     tree = create_tree (dfa, old_tree, tree, CONCAT);
2589
2590   return tree;
2591
2592  parse_dup_op_espace:
2593   *err = REG_ESPACE;
2594   return NULL;
2595 }
2596
2597 /* Size of the names for collating symbol/equivalence_class/character_class.
2598    I'm not sure, but maybe enough.  */
2599 #define BRACKET_NAME_BUF_SIZE 32
2600
2601 #ifndef _LIBC
2602   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2603      Build the range expression which starts from START_ELEM, and ends
2604      at END_ELEM.  The result are written to MBCSET and SBCSET.
2605      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2606      mbcset->range_ends, is a pointer argument sinse we may
2607      update it.  */
2608
2609 static reg_errcode_t
2610 internal_function
2611 # ifdef RE_ENABLE_I18N
2612 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2613                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2614 # else /* not RE_ENABLE_I18N */
2615 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2616                  bracket_elem_t *end_elem)
2617 # endif /* not RE_ENABLE_I18N */
2618 {
2619   unsigned int start_ch, end_ch;
2620   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2621   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2622           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2623           0))
2624     return REG_ERANGE;
2625
2626   /* We can handle no multi character collating elements without libc
2627      support.  */
2628   if (BE ((start_elem->type == COLL_SYM
2629            && strlen ((char *) start_elem->opr.name) > 1)
2630           || (end_elem->type == COLL_SYM
2631               && strlen ((char *) end_elem->opr.name) > 1), 0))
2632     return REG_ECOLLATE;
2633
2634 # ifdef RE_ENABLE_I18N
2635   {
2636     wchar_t wc;
2637     wint_t start_wc;
2638     wint_t end_wc;
2639     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2640
2641     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2642                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2643                    : 0));
2644     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2645               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2646                  : 0));
2647     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2648                 ? __btowc (start_ch) : start_elem->opr.wch);
2649     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2650               ? __btowc (end_ch) : end_elem->opr.wch);
2651     if (start_wc == WEOF || end_wc == WEOF)
2652       return REG_ECOLLATE;
2653     cmp_buf[0] = start_wc;
2654     cmp_buf[4] = end_wc;
2655     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2656       return REG_ERANGE;
2657
2658     /* Got valid collation sequence values, add them as a new entry.
2659        However, for !_LIBC we have no collation elements: if the
2660        character set is single byte, the single byte character set
2661        that we build below suffices.  parse_bracket_exp passes
2662        no MBCSET if dfa->mb_cur_max == 1.  */
2663     if (mbcset)
2664       {
2665         /* Check the space of the arrays.  */
2666         if (BE (*range_alloc == mbcset->nranges, 0))
2667           {
2668             /* There is not enough space, need realloc.  */
2669             wchar_t *new_array_start, *new_array_end;
2670             Idx new_nranges;
2671
2672             /* +1 in case of mbcset->nranges is 0.  */
2673             new_nranges = 2 * mbcset->nranges + 1;
2674             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2675                are NULL if *range_alloc == 0.  */
2676             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2677                                           new_nranges);
2678             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2679                                         new_nranges);
2680
2681             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2682               return REG_ESPACE;
2683
2684             mbcset->range_starts = new_array_start;
2685             mbcset->range_ends = new_array_end;
2686             *range_alloc = new_nranges;
2687           }
2688
2689         mbcset->range_starts[mbcset->nranges] = start_wc;
2690         mbcset->range_ends[mbcset->nranges++] = end_wc;
2691       }
2692
2693     /* Build the table for single byte characters.  */
2694     for (wc = 0; wc < SBC_MAX; ++wc)
2695       {
2696         cmp_buf[2] = wc;
2697         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2698             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2699           bitset_set (sbcset, wc);
2700       }
2701   }
2702 # else /* not RE_ENABLE_I18N */
2703   {
2704     unsigned int ch;
2705     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2706                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2707                    : 0));
2708     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2709               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2710                  : 0));
2711     if (start_ch > end_ch)
2712       return REG_ERANGE;
2713     /* Build the table for single byte characters.  */
2714     for (ch = 0; ch < SBC_MAX; ++ch)
2715       if (start_ch <= ch  && ch <= end_ch)
2716         bitset_set (sbcset, ch);
2717   }
2718 # endif /* not RE_ENABLE_I18N */
2719   return REG_NOERROR;
2720 }
2721 #endif /* not _LIBC */
2722
2723 #ifndef _LIBC
2724 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2725    Build the collating element which is represented by NAME.
2726    The result are written to MBCSET and SBCSET.
2727    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2728    pointer argument since we may update it.  */
2729
2730 static reg_errcode_t
2731 internal_function
2732 build_collating_symbol (bitset_t sbcset,
2733 # ifdef RE_ENABLE_I18N
2734                         re_charset_t *mbcset, Idx *coll_sym_alloc,
2735 # endif
2736                         const unsigned char *name)
2737 {
2738   size_t name_len = strlen ((const char *) name);
2739   if (BE (name_len != 1, 0))
2740     return REG_ECOLLATE;
2741   else
2742     {
2743       bitset_set (sbcset, name[0]);
2744       return REG_NOERROR;
2745     }
2746 }
2747 #endif /* not _LIBC */
2748
2749 /* This function parse bracket expression like "[abc]", "[a-c]",
2750    "[[.a-a.]]" etc.  */
2751
2752 static bin_tree_t *
2753 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2754                    reg_syntax_t syntax, reg_errcode_t *err)
2755 {
2756 #ifdef _LIBC
2757   const unsigned char *collseqmb;
2758   const char *collseqwc;
2759   uint32_t nrules;
2760   int32_t table_size;
2761   const int32_t *symb_table;
2762   const unsigned char *extra;
2763
2764   /* Local function for parse_bracket_exp used in _LIBC environement.
2765      Seek the collating symbol entry correspondings to NAME.
2766      Return the index of the symbol in the SYMB_TABLE.  */
2767
2768   auto inline int32_t
2769   __attribute ((always_inline))
2770   seek_collating_symbol_entry (name, name_len)
2771          const unsigned char *name;
2772          size_t name_len;
2773     {
2774       int32_t hash = elem_hash ((const char *) name, name_len);
2775       int32_t elem = hash % table_size;
2776       if (symb_table[2 * elem] != 0)
2777         {
2778           int32_t second = hash % (table_size - 2) + 1;
2779
2780           do
2781             {
2782               /* First compare the hashing value.  */
2783               if (symb_table[2 * elem] == hash
2784                   /* Compare the length of the name.  */
2785                   && name_len == extra[symb_table[2 * elem + 1]]
2786                   /* Compare the name.  */
2787                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2788                              name_len) == 0)
2789                 {
2790                   /* Yep, this is the entry.  */
2791                   break;
2792                 }
2793
2794               /* Next entry.  */
2795               elem += second;
2796             }
2797           while (symb_table[2 * elem] != 0);
2798         }
2799       return elem;
2800     }
2801
2802   /* Local function for parse_bracket_exp used in _LIBC environement.
2803      Look up the collation sequence value of BR_ELEM.
2804      Return the value if succeeded, UINT_MAX otherwise.  */
2805
2806   auto inline unsigned int
2807   __attribute ((always_inline))
2808   lookup_collation_sequence_value (br_elem)
2809          bracket_elem_t *br_elem;
2810     {
2811       if (br_elem->type == SB_CHAR)
2812         {
2813           /*
2814           if (MB_CUR_MAX == 1)
2815           */
2816           if (nrules == 0)
2817             return collseqmb[br_elem->opr.ch];
2818           else
2819             {
2820               wint_t wc = __btowc (br_elem->opr.ch);
2821               return __collseq_table_lookup (collseqwc, wc);
2822             }
2823         }
2824       else if (br_elem->type == MB_CHAR)
2825         {
2826           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2827         }
2828       else if (br_elem->type == COLL_SYM)
2829         {
2830           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2831           if (nrules != 0)
2832             {
2833               int32_t elem, idx;
2834               elem = seek_collating_symbol_entry (br_elem->opr.name,
2835                                                   sym_name_len);
2836               if (symb_table[2 * elem] != 0)
2837                 {
2838                   /* We found the entry.  */
2839                   idx = symb_table[2 * elem + 1];
2840                   /* Skip the name of collating element name.  */
2841                   idx += 1 + extra[idx];
2842                   /* Skip the byte sequence of the collating element.  */
2843                   idx += 1 + extra[idx];
2844                   /* Adjust for the alignment.  */
2845                   idx = (idx + 3) & ~3;
2846                   /* Skip the multibyte collation sequence value.  */
2847                   idx += sizeof (unsigned int);
2848                   /* Skip the wide char sequence of the collating element.  */
2849                   idx += sizeof (unsigned int) *
2850                     (1 + *(unsigned int *) (extra + idx));
2851                   /* Return the collation sequence value.  */
2852                   return *(unsigned int *) (extra + idx);
2853                 }
2854               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2855                 {
2856                   /* No valid character.  Match it as a single byte
2857                      character.  */
2858                   return collseqmb[br_elem->opr.name[0]];
2859                 }
2860             }
2861           else if (sym_name_len == 1)
2862             return collseqmb[br_elem->opr.name[0]];
2863         }
2864       return UINT_MAX;
2865     }
2866
2867   /* Local function for parse_bracket_exp used in _LIBC environement.
2868      Build the range expression which starts from START_ELEM, and ends
2869      at END_ELEM.  The result are written to MBCSET and SBCSET.
2870      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2871      mbcset->range_ends, is a pointer argument sinse we may
2872      update it.  */
2873
2874   auto inline reg_errcode_t
2875   __attribute ((always_inline))
2876   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2877          re_charset_t *mbcset;
2878          Idx *range_alloc;
2879          bitset_t sbcset;
2880          bracket_elem_t *start_elem, *end_elem;
2881     {
2882       unsigned int ch;
2883       uint32_t start_collseq;
2884       uint32_t end_collseq;
2885
2886       /* Equivalence Classes and Character Classes can't be a range
2887          start/end.  */
2888       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2889               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2890               0))
2891         return REG_ERANGE;
2892
2893       start_collseq = lookup_collation_sequence_value (start_elem);
2894       end_collseq = lookup_collation_sequence_value (end_elem);
2895       /* Check start/end collation sequence values.  */
2896       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2897         return REG_ECOLLATE;
2898       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2899         return REG_ERANGE;
2900
2901       /* Got valid collation sequence values, add them as a new entry.
2902          However, if we have no collation elements, and the character set
2903          is single byte, the single byte character set that we
2904          build below suffices. */
2905       if (nrules > 0 || dfa->mb_cur_max > 1)
2906         {
2907           /* Check the space of the arrays.  */
2908           if (BE (*range_alloc == mbcset->nranges, 0))
2909             {
2910               /* There is not enough space, need realloc.  */
2911               uint32_t *new_array_start;
2912               uint32_t *new_array_end;
2913               Idx new_nranges;
2914
2915               /* +1 in case of mbcset->nranges is 0.  */
2916               new_nranges = 2 * mbcset->nranges + 1;
2917               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2918                                             new_nranges);
2919               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2920                                           new_nranges);
2921
2922               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2923                 return REG_ESPACE;
2924
2925               mbcset->range_starts = new_array_start;
2926               mbcset->range_ends = new_array_end;
2927               *range_alloc = new_nranges;
2928             }
2929
2930           mbcset->range_starts[mbcset->nranges] = start_collseq;
2931           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2932         }
2933
2934       /* Build the table for single byte characters.  */
2935       for (ch = 0; ch < SBC_MAX; ch++)
2936         {
2937           uint32_t ch_collseq;
2938           /*
2939           if (MB_CUR_MAX == 1)
2940           */
2941           if (nrules == 0)
2942             ch_collseq = collseqmb[ch];
2943           else
2944             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2945           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2946             bitset_set (sbcset, ch);
2947         }
2948       return REG_NOERROR;
2949     }
2950
2951   /* Local function for parse_bracket_exp used in _LIBC environement.
2952      Build the collating element which is represented by NAME.
2953      The result are written to MBCSET and SBCSET.
2954      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2955      pointer argument sinse we may update it.  */
2956
2957   auto inline reg_errcode_t
2958   __attribute ((always_inline))
2959   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2960          re_charset_t *mbcset;
2961          Idx *coll_sym_alloc;
2962          bitset_t sbcset;
2963          const unsigned char *name;
2964     {
2965       int32_t elem, idx;
2966       size_t name_len = strlen ((const char *) name);
2967       if (nrules != 0)
2968         {
2969           elem = seek_collating_symbol_entry (name, name_len);
2970           if (symb_table[2 * elem] != 0)
2971             {
2972               /* We found the entry.  */
2973               idx = symb_table[2 * elem + 1];
2974               /* Skip the name of collating element name.  */
2975               idx += 1 + extra[idx];
2976             }
2977           else if (symb_table[2 * elem] == 0 && name_len == 1)
2978             {
2979               /* No valid character, treat it as a normal
2980                  character.  */
2981               bitset_set (sbcset, name[0]);
2982               return REG_NOERROR;
2983             }
2984           else
2985             return REG_ECOLLATE;
2986
2987           /* Got valid collation sequence, add it as a new entry.  */
2988           /* Check the space of the arrays.  */
2989           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2990             {
2991               /* Not enough, realloc it.  */
2992               /* +1 in case of mbcset->ncoll_syms is 0.  */
2993               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2994               /* Use realloc since mbcset->coll_syms is NULL
2995                  if *alloc == 0.  */
2996               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2997                                                    new_coll_sym_alloc);
2998               if (BE (new_coll_syms == NULL, 0))
2999                 return REG_ESPACE;
3000               mbcset->coll_syms = new_coll_syms;
3001               *coll_sym_alloc = new_coll_sym_alloc;
3002             }
3003           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3004           return REG_NOERROR;
3005         }
3006       else
3007         {
3008           if (BE (name_len != 1, 0))
3009             return REG_ECOLLATE;
3010           else
3011             {
3012               bitset_set (sbcset, name[0]);
3013               return REG_NOERROR;
3014             }
3015         }
3016     }
3017 #endif
3018
3019   re_token_t br_token;
3020   re_bitset_ptr_t sbcset;
3021 #ifdef RE_ENABLE_I18N
3022   re_charset_t *mbcset;
3023   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3024   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3025 #endif /* not RE_ENABLE_I18N */
3026   bool non_match = false;
3027   bin_tree_t *work_tree;
3028   int token_len;
3029   bool first_round = true;
3030 #ifdef _LIBC
3031   collseqmb = (const unsigned char *)
3032     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3033   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3034   if (nrules)
3035     {
3036       /*
3037       if (MB_CUR_MAX > 1)
3038       */
3039       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3040       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3041       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3042                                                   _NL_COLLATE_SYMB_TABLEMB);
3043       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3044                                                    _NL_COLLATE_SYMB_EXTRAMB);
3045     }
3046 #endif
3047   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3048 #ifdef RE_ENABLE_I18N
3049   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3050 #endif /* RE_ENABLE_I18N */
3051 #ifdef RE_ENABLE_I18N
3052   if (BE (sbcset == NULL || mbcset == NULL, 0))
3053 #else
3054   if (BE (sbcset == NULL, 0))
3055 #endif /* RE_ENABLE_I18N */
3056     {
3057       *err = REG_ESPACE;
3058       return NULL;
3059     }
3060
3061   token_len = peek_token_bracket (token, regexp, syntax);
3062   if (BE (token->type == END_OF_RE, 0))
3063     {
3064       *err = REG_BADPAT;
3065       goto parse_bracket_exp_free_return;
3066     }
3067   if (token->type == OP_NON_MATCH_LIST)
3068     {
3069 #ifdef RE_ENABLE_I18N
3070       mbcset->non_match = 1;
3071 #endif /* not RE_ENABLE_I18N */
3072       non_match = true;
3073       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3074         bitset_set (sbcset, '\n');
3075       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3076       token_len = peek_token_bracket (token, regexp, syntax);
3077       if (BE (token->type == END_OF_RE, 0))
3078         {
3079           *err = REG_BADPAT;
3080           goto parse_bracket_exp_free_return;
3081         }
3082     }
3083
3084   /* We treat the first ']' as a normal character.  */
3085   if (token->type == OP_CLOSE_BRACKET)
3086     token->type = CHARACTER;
3087
3088   while (1)
3089     {
3090       bracket_elem_t start_elem, end_elem;
3091       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3092       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3093       reg_errcode_t ret;
3094       int token_len2 = 0;
3095       bool is_range_exp = false;
3096       re_token_t token2;
3097
3098       start_elem.opr.name = start_name_buf;
3099       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3100                                    syntax, first_round);
3101       if (BE (ret != REG_NOERROR, 0))
3102         {
3103           *err = ret;
3104           goto parse_bracket_exp_free_return;
3105         }
3106       first_round = false;
3107
3108       /* Get information about the next token.  We need it in any case.  */
3109       token_len = peek_token_bracket (token, regexp, syntax);
3110
3111       /* Do not check for ranges if we know they are not allowed.  */
3112       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3113         {
3114           if (BE (token->type == END_OF_RE, 0))
3115             {
3116               *err = REG_EBRACK;
3117               goto parse_bracket_exp_free_return;
3118             }
3119           if (token->type == OP_CHARSET_RANGE)
3120             {
3121               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3122               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3123               if (BE (token2.type == END_OF_RE, 0))
3124                 {
3125                   *err = REG_EBRACK;
3126                   goto parse_bracket_exp_free_return;
3127                 }
3128               if (token2.type == OP_CLOSE_BRACKET)
3129                 {
3130                   /* We treat the last '-' as a normal character.  */
3131                   re_string_skip_bytes (regexp, -token_len);
3132                   token->type = CHARACTER;
3133                 }
3134               else
3135                 is_range_exp = true;
3136             }
3137         }
3138
3139       if (is_range_exp == true)
3140         {
3141           end_elem.opr.name = end_name_buf;
3142           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3143                                        dfa, syntax, true);
3144           if (BE (ret != REG_NOERROR, 0))
3145             {
3146               *err = ret;
3147               goto parse_bracket_exp_free_return;
3148             }
3149
3150           token_len = peek_token_bracket (token, regexp, syntax);
3151
3152 #ifdef _LIBC
3153           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3154                                   &start_elem, &end_elem);
3155 #else
3156 # ifdef RE_ENABLE_I18N
3157           *err = build_range_exp (sbcset,
3158                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3159                                   &range_alloc, &start_elem, &end_elem);
3160 # else
3161           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3162 # endif
3163 #endif /* RE_ENABLE_I18N */
3164           if (BE (*err != REG_NOERROR, 0))
3165             goto parse_bracket_exp_free_return;
3166         }
3167       else
3168         {
3169           switch (start_elem.type)
3170             {
3171             case SB_CHAR:
3172               bitset_set (sbcset, start_elem.opr.ch);
3173               break;
3174 #ifdef RE_ENABLE_I18N
3175             case MB_CHAR:
3176               /* Check whether the array has enough space.  */
3177               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3178                 {
3179                   wchar_t *new_mbchars;
3180                   /* Not enough, realloc it.  */
3181                   /* +1 in case of mbcset->nmbchars is 0.  */
3182                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3183                   /* Use realloc since array is NULL if *alloc == 0.  */
3184                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3185                                             mbchar_alloc);
3186                   if (BE (new_mbchars == NULL, 0))
3187                     goto parse_bracket_exp_espace;
3188                   mbcset->mbchars = new_mbchars;
3189                 }
3190               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3191               break;
3192 #endif /* RE_ENABLE_I18N */
3193             case EQUIV_CLASS:
3194               *err = build_equiv_class (sbcset,
3195 #ifdef RE_ENABLE_I18N
3196                                         mbcset, &equiv_class_alloc,
3197 #endif /* RE_ENABLE_I18N */
3198                                         start_elem.opr.name);
3199               if (BE (*err != REG_NOERROR, 0))
3200                 goto parse_bracket_exp_free_return;
3201               break;
3202             case COLL_SYM:
3203               *err = build_collating_symbol (sbcset,
3204 #ifdef RE_ENABLE_I18N
3205                                              mbcset, &coll_sym_alloc,
3206 #endif /* RE_ENABLE_I18N */
3207                                              start_elem.opr.name);
3208               if (BE (*err != REG_NOERROR, 0))
3209                 goto parse_bracket_exp_free_return;
3210               break;
3211             case CHAR_CLASS:
3212               *err = build_charclass (regexp->trans, sbcset,
3213 #ifdef RE_ENABLE_I18N
3214                                       mbcset, &char_class_alloc,
3215 #endif /* RE_ENABLE_I18N */
3216                                       start_elem.opr.name, syntax);
3217               if (BE (*err != REG_NOERROR, 0))
3218                goto parse_bracket_exp_free_return;
3219               break;
3220             default:
3221               assert (0);
3222               break;
3223             }
3224         }
3225       if (BE (token->type == END_OF_RE, 0))
3226         {
3227           *err = REG_EBRACK;
3228           goto parse_bracket_exp_free_return;
3229         }
3230       if (token->type == OP_CLOSE_BRACKET)
3231         break;
3232     }
3233
3234   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3235
3236   /* If it is non-matching list.  */
3237   if (non_match)
3238     bitset_not (sbcset);
3239
3240 #ifdef RE_ENABLE_I18N
3241   /* Ensure only single byte characters are set.  */
3242   if (dfa->mb_cur_max > 1)
3243     bitset_mask (sbcset, dfa->sb_char);
3244
3245   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3246       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3247                                                      || mbcset->non_match)))
3248     {
3249       bin_tree_t *mbc_tree;
3250       int sbc_idx;
3251       /* Build a tree for complex bracket.  */
3252       dfa->has_mb_node = 1;
3253       br_token.type = COMPLEX_BRACKET;
3254       br_token.opr.mbcset = mbcset;
3255       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3256       if (BE (mbc_tree == NULL, 0))
3257         goto parse_bracket_exp_espace;
3258       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3259         if (sbcset[sbc_idx])
3260           break;
3261       /* If there are no bits set in sbcset, there is no point
3262          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3263       if (sbc_idx < BITSET_WORDS)
3264         {
3265           /* Build a tree for simple bracket.  */
3266           br_token.type = SIMPLE_BRACKET;
3267           br_token.opr.sbcset = sbcset;
3268           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3269           if (BE (work_tree == NULL, 0))
3270             goto parse_bracket_exp_espace;
3271
3272           /* Then join them by ALT node.  */
3273           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3274           if (BE (work_tree == NULL, 0))
3275             goto parse_bracket_exp_espace;
3276         }
3277       else
3278         {
3279           re_free (sbcset);
3280           work_tree = mbc_tree;
3281         }
3282     }
3283   else
3284 #endif /* not RE_ENABLE_I18N */
3285     {
3286 #ifdef RE_ENABLE_I18N
3287       free_charset (mbcset);
3288 #endif
3289       /* Build a tree for simple bracket.  */
3290       br_token.type = SIMPLE_BRACKET;
3291       br_token.opr.sbcset = sbcset;
3292       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3293       if (BE (work_tree == NULL, 0))
3294         goto parse_bracket_exp_espace;
3295     }
3296   return work_tree;
3297
3298  parse_bracket_exp_espace:
3299   *err = REG_ESPACE;
3300  parse_bracket_exp_free_return:
3301   re_free (sbcset);
3302 #ifdef RE_ENABLE_I18N
3303   free_charset (mbcset);
3304 #endif /* RE_ENABLE_I18N */
3305   return NULL;
3306 }
3307
3308 /* Parse an element in the bracket expression.  */
3309
3310 static reg_errcode_t
3311 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3312                        re_token_t *token, int token_len, re_dfa_t *dfa,
3313                        reg_syntax_t syntax, bool accept_hyphen)
3314 {
3315 #ifdef RE_ENABLE_I18N
3316   int cur_char_size;
3317   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3318   if (cur_char_size > 1)
3319     {
3320       elem->type = MB_CHAR;
3321       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3322       re_string_skip_bytes (regexp, cur_char_size);
3323       return REG_NOERROR;
3324     }
3325 #endif /* RE_ENABLE_I18N */
3326   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3327   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3328       || token->type == OP_OPEN_EQUIV_CLASS)
3329     return parse_bracket_symbol (elem, regexp, token);
3330   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3331     {
3332       /* A '-' must only appear as anything but a range indicator before
3333          the closing bracket.  Everything else is an error.  */
3334       re_token_t token2;
3335       (void) peek_token_bracket (&token2, regexp, syntax);
3336       if (token2.type != OP_CLOSE_BRACKET)
3337         /* The actual error value is not standardized since this whole
3338            case is undefined.  But ERANGE makes good sense.  */
3339         return REG_ERANGE;
3340     }
3341   elem->type = SB_CHAR;
3342   elem->opr.ch = token->opr.c;
3343   return REG_NOERROR;
3344 }
3345
3346 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3347    such as [:<character_class>:], [.<collating_element>.], and
3348    [=<equivalent_class>=].  */
3349
3350 static reg_errcode_t
3351 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3352                       re_token_t *token)
3353 {
3354   unsigned char ch, delim = token->opr.c;
3355   int i = 0;
3356   if (re_string_eoi(regexp))
3357     return REG_EBRACK;
3358   for (;; ++i)
3359     {
3360       if (i >= BRACKET_NAME_BUF_SIZE)
3361         return REG_EBRACK;
3362       if (token->type == OP_OPEN_CHAR_CLASS)
3363         ch = re_string_fetch_byte_case (regexp);
3364       else
3365         ch = re_string_fetch_byte (regexp);
3366       if (re_string_eoi(regexp))
3367         return REG_EBRACK;
3368       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3369         break;
3370       elem->opr.name[i] = ch;
3371     }
3372   re_string_skip_bytes (regexp, 1);
3373   elem->opr.name[i] = '\0';
3374   switch (token->type)
3375     {
3376     case OP_OPEN_COLL_ELEM:
3377       elem->type = COLL_SYM;
3378       break;
3379     case OP_OPEN_EQUIV_CLASS:
3380       elem->type = EQUIV_CLASS;
3381       break;
3382     case OP_OPEN_CHAR_CLASS:
3383       elem->type = CHAR_CLASS;
3384       break;
3385     default:
3386       break;
3387     }
3388   return REG_NOERROR;
3389 }
3390
3391   /* Helper function for parse_bracket_exp.
3392      Build the equivalence class which is represented by NAME.
3393      The result are written to MBCSET and SBCSET.
3394      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3395      is a pointer argument sinse we may update it.  */
3396
3397 static reg_errcode_t
3398 #ifdef RE_ENABLE_I18N
3399 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3400                    Idx *equiv_class_alloc, const unsigned char *name)
3401 #else /* not RE_ENABLE_I18N */
3402 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3403 #endif /* not RE_ENABLE_I18N */
3404 {
3405 #ifdef _LIBC
3406   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3407   if (nrules != 0)
3408     {
3409       const int32_t *table, *indirect;
3410       const unsigned char *weights, *extra, *cp;
3411       unsigned char char_buf[2];
3412       int32_t idx1, idx2;
3413       unsigned int ch;
3414       size_t len;
3415       /* This #include defines a local function!  */
3416 # include <locale/weight.h>
3417       /* Calculate the index for equivalence class.  */
3418       cp = name;
3419       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3420       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3421                                                _NL_COLLATE_WEIGHTMB);
3422       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3423                                                    _NL_COLLATE_EXTRAMB);
3424       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3425                                                 _NL_COLLATE_INDIRECTMB);
3426       idx1 = findidx (&cp);
3427       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3428         /* This isn't a valid character.  */
3429         return REG_ECOLLATE;
3430
3431       /* Build single byte matcing table for this equivalence class.  */
3432       char_buf[1] = (unsigned char) '\0';
3433       len = weights[idx1];
3434       for (ch = 0; ch < SBC_MAX; ++ch)
3435         {
3436           char_buf[0] = ch;
3437           cp = char_buf;
3438           idx2 = findidx (&cp);
3439 /*
3440           idx2 = table[ch];
3441 */
3442           if (idx2 == 0)
3443             /* This isn't a valid character.  */
3444             continue;
3445           if (len == weights[idx2])
3446             {
3447               int cnt = 0;
3448               while (cnt <= len &&
3449                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3450                 ++cnt;
3451
3452               if (cnt > len)
3453                 bitset_set (sbcset, ch);
3454             }
3455         }
3456       /* Check whether the array has enough space.  */
3457       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3458         {
3459           /* Not enough, realloc it.  */
3460           /* +1 in case of mbcset->nequiv_classes is 0.  */
3461           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3462           /* Use realloc since the array is NULL if *alloc == 0.  */
3463           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3464                                                    int32_t,
3465                                                    new_equiv_class_alloc);
3466           if (BE (new_equiv_classes == NULL, 0))
3467             return REG_ESPACE;
3468           mbcset->equiv_classes = new_equiv_classes;
3469           *equiv_class_alloc = new_equiv_class_alloc;
3470         }
3471       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3472     }
3473   else
3474 #endif /* _LIBC */
3475     {
3476       if (BE (strlen ((const char *) name) != 1, 0))
3477         return REG_ECOLLATE;
3478       bitset_set (sbcset, *name);
3479     }
3480   return REG_NOERROR;
3481 }
3482
3483   /* Helper function for parse_bracket_exp.
3484      Build the character class which is represented by NAME.
3485      The result are written to MBCSET and SBCSET.
3486      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3487      is a pointer argument sinse we may update it.  */
3488
3489 static reg_errcode_t
3490 #ifdef RE_ENABLE_I18N
3491 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3492                  re_charset_t *mbcset, Idx *char_class_alloc,
3493                  const unsigned char *class_name, reg_syntax_t syntax)
3494 #else /* not RE_ENABLE_I18N */
3495 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3496                  const unsigned char *class_name, reg_syntax_t syntax)
3497 #endif /* not RE_ENABLE_I18N */
3498 {
3499   int i;
3500   const char *name = (const char *) class_name;
3501
3502   /* In case of REG_ICASE "upper" and "lower" match the both of
3503      upper and lower cases.  */
3504   if ((syntax & RE_ICASE)
3505       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3506     name = "alpha";
3507
3508 #ifdef RE_ENABLE_I18N
3509   /* Check the space of the arrays.  */
3510   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3511     {
3512       /* Not enough, realloc it.  */
3513       /* +1 in case of mbcset->nchar_classes is 0.  */
3514       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3515       /* Use realloc since array is NULL if *alloc == 0.  */
3516       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3517                                                new_char_class_alloc);
3518       if (BE (new_char_classes == NULL, 0))
3519         return REG_ESPACE;
3520       mbcset->char_classes = new_char_classes;
3521       *char_class_alloc = new_char_class_alloc;
3522     }
3523   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3524 #endif /* RE_ENABLE_I18N */
3525
3526 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3527   do {                                          \
3528     if (BE (trans != NULL, 0))                  \
3529       {                                         \
3530         for (i = 0; i < SBC_MAX; ++i)           \
3531           if (ctype_func (i))                   \
3532             bitset_set (sbcset, trans[i]);      \
3533       }                                         \
3534     else                                        \
3535       {                                         \
3536         for (i = 0; i < SBC_MAX; ++i)           \
3537           if (ctype_func (i))                   \
3538             bitset_set (sbcset, i);             \
3539       }                                         \
3540   } while (0)
3541
3542   if (strcmp (name, "alnum") == 0)
3543     BUILD_CHARCLASS_LOOP (isalnum);
3544   else if (strcmp (name, "cntrl") == 0)
3545     BUILD_CHARCLASS_LOOP (iscntrl);
3546   else if (strcmp (name, "lower") == 0)
3547     BUILD_CHARCLASS_LOOP (islower);
3548   else if (strcmp (name, "space") == 0)
3549     BUILD_CHARCLASS_LOOP (isspace);
3550   else if (strcmp (name, "alpha") == 0)
3551     BUILD_CHARCLASS_LOOP (isalpha);
3552   else if (strcmp (name, "digit") == 0)
3553     BUILD_CHARCLASS_LOOP (isdigit);
3554   else if (strcmp (name, "print") == 0)
3555     BUILD_CHARCLASS_LOOP (isprint);
3556   else if (strcmp (name, "upper") == 0)
3557     BUILD_CHARCLASS_LOOP (isupper);
3558   else if (strcmp (name, "blank") == 0)
3559     BUILD_CHARCLASS_LOOP (isblank);
3560   else if (strcmp (name, "graph") == 0)
3561     BUILD_CHARCLASS_LOOP (isgraph);
3562   else if (strcmp (name, "punct") == 0)
3563     BUILD_CHARCLASS_LOOP (ispunct);
3564   else if (strcmp (name, "xdigit") == 0)
3565     BUILD_CHARCLASS_LOOP (isxdigit);
3566   else
3567     return REG_ECTYPE;
3568
3569   return REG_NOERROR;
3570 }
3571
3572 static bin_tree_t *
3573 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3574                     const unsigned char *class_name,
3575                     const unsigned char *extra, bool non_match,
3576                     reg_errcode_t *err)
3577 {
3578   re_bitset_ptr_t sbcset;
3579 #ifdef RE_ENABLE_I18N
3580   re_charset_t *mbcset;
3581   Idx alloc = 0;
3582 #endif /* not RE_ENABLE_I18N */
3583   reg_errcode_t ret;
3584   re_token_t br_token;
3585   bin_tree_t *tree;
3586
3587   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3588 #ifdef RE_ENABLE_I18N
3589   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3590 #endif /* RE_ENABLE_I18N */
3591
3592 #ifdef RE_ENABLE_I18N
3593   if (BE (sbcset == NULL || mbcset == NULL, 0))
3594 #else /* not RE_ENABLE_I18N */
3595   if (BE (sbcset == NULL, 0))
3596 #endif /* not RE_ENABLE_I18N */
3597     {
3598       *err = REG_ESPACE;
3599       return NULL;
3600     }
3601
3602   if (non_match)
3603     {
3604 #ifdef RE_ENABLE_I18N
3605       mbcset->non_match = 1;
3606 #endif /* not RE_ENABLE_I18N */
3607     }
3608
3609   /* We don't care the syntax in this case.  */
3610   ret = build_charclass (trans, sbcset,
3611 #ifdef RE_ENABLE_I18N
3612                          mbcset, &alloc,
3613 #endif /* RE_ENABLE_I18N */
3614                          class_name, 0);
3615
3616   if (BE (ret != REG_NOERROR, 0))
3617     {
3618       re_free (sbcset);
3619 #ifdef RE_ENABLE_I18N
3620       free_charset (mbcset);
3621 #endif /* RE_ENABLE_I18N */
3622       *err = ret;
3623       return NULL;
3624     }
3625   /* \w match '_' also.  */
3626   for (; *extra; extra++)
3627     bitset_set (sbcset, *extra);
3628
3629   /* If it is non-matching list.  */
3630   if (non_match)
3631     bitset_not (sbcset);
3632
3633 #ifdef RE_ENABLE_I18N
3634   /* Ensure only single byte characters are set.  */
3635   if (dfa->mb_cur_max > 1)
3636     bitset_mask (sbcset, dfa->sb_char);
3637 #endif
3638
3639   /* Build a tree for simple bracket.  */
3640   br_token.type = SIMPLE_BRACKET;
3641   br_token.opr.sbcset = sbcset;
3642   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3643   if (BE (tree == NULL, 0))
3644     goto build_word_op_espace;
3645
3646 #ifdef RE_ENABLE_I18N
3647   if (dfa->mb_cur_max > 1)
3648     {
3649       bin_tree_t *mbc_tree;
3650       /* Build a tree for complex bracket.  */
3651       br_token.type = COMPLEX_BRACKET;
3652       br_token.opr.mbcset = mbcset;
3653       dfa->has_mb_node = 1;
3654       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3655       if (BE (mbc_tree == NULL, 0))
3656         goto build_word_op_espace;
3657       /* Then join them by ALT node.  */
3658       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3659       if (BE (mbc_tree != NULL, 1))
3660         return tree;
3661     }
3662   else
3663     {
3664       free_charset (mbcset);
3665       return tree;
3666     }
3667 #else /* not RE_ENABLE_I18N */
3668   return tree;
3669 #endif /* not RE_ENABLE_I18N */
3670
3671  build_word_op_espace:
3672   re_free (sbcset);
3673 #ifdef RE_ENABLE_I18N
3674   free_charset (mbcset);
3675 #endif /* RE_ENABLE_I18N */
3676   *err = REG_ESPACE;
3677   return NULL;
3678 }
3679
3680 /* This is intended for the expressions like "a{1,3}".
3681    Fetch a number from `input', and return the number.
3682    Return REG_MISSING if the number field is empty like "{,1}".
3683    Return REG_ERROR if an error occurred.  */
3684
3685 static Idx
3686 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3687 {
3688   Idx num = REG_MISSING;
3689   unsigned char c;
3690   while (1)
3691     {
3692       fetch_token (token, input, syntax);
3693       c = token->opr.c;
3694       if (BE (token->type == END_OF_RE, 0))
3695         return REG_ERROR;
3696       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3697         break;
3698       num = ((token->type != CHARACTER || c < '0' || '9' < c
3699               || num == REG_ERROR)
3700              ? REG_ERROR
3701              : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3702       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3703     }
3704   return num;
3705 }
3706 \f
3707 #ifdef RE_ENABLE_I18N
3708 static void
3709 free_charset (re_charset_t *cset)
3710 {
3711   re_free (cset->mbchars);
3712 # ifdef _LIBC
3713   re_free (cset->coll_syms);
3714   re_free (cset->equiv_classes);
3715   re_free (cset->range_starts);
3716   re_free (cset->range_ends);
3717 # endif
3718   re_free (cset->char_classes);
3719   re_free (cset);
3720 }
3721 #endif /* RE_ENABLE_I18N */
3722 \f
3723 /* Functions for binary tree operation.  */
3724
3725 /* Create a tree node.  */
3726
3727 static bin_tree_t *
3728 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3729              re_token_type_t type)
3730 {
3731   re_token_t t;
3732   t.type = type;
3733   return create_token_tree (dfa, left, right, &t);
3734 }
3735
3736 static bin_tree_t *
3737 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3738                    const re_token_t *token)
3739 {
3740   bin_tree_t *tree;
3741   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3742     {
3743       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3744
3745       if (storage == NULL)
3746         return NULL;
3747       storage->next = dfa->str_tree_storage;
3748       dfa->str_tree_storage = storage;
3749       dfa->str_tree_storage_idx = 0;
3750     }
3751   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3752
3753   tree->parent = NULL;
3754   tree->left = left;
3755   tree->right = right;
3756   tree->token = *token;
3757   tree->token.duplicated = 0;
3758   tree->token.opt_subexp = 0;
3759   tree->first = NULL;
3760   tree->next = NULL;
3761   tree->node_idx = REG_MISSING;
3762
3763   if (left != NULL)
3764     left->parent = tree;
3765   if (right != NULL)
3766     right->parent = tree;
3767   return tree;
3768 }
3769
3770 /* Mark the tree SRC as an optional subexpression.
3771    To be called from preorder or postorder.  */
3772
3773 static reg_errcode_t
3774 mark_opt_subexp (void *extra, bin_tree_t *node)
3775 {
3776   Idx idx = (Idx) (long) extra;
3777   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3778     node->token.opt_subexp = 1;
3779
3780   return REG_NOERROR;
3781 }
3782
3783 /* Free the allocated memory inside NODE. */
3784
3785 static void
3786 free_token (re_token_t *node)
3787 {
3788 #ifdef RE_ENABLE_I18N
3789   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3790     free_charset (node->opr.mbcset);
3791   else
3792 #endif /* RE_ENABLE_I18N */
3793     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3794       re_free (node->opr.sbcset);
3795 }
3796
3797 /* Worker function for tree walking.  Free the allocated memory inside NODE
3798    and its children. */
3799
3800 static reg_errcode_t
3801 free_tree (void *extra, bin_tree_t *node)
3802 {
3803   free_token (&node->token);
3804   return REG_NOERROR;
3805 }
3806
3807
3808 /* Duplicate the node SRC, and return new node.  This is a preorder
3809    visit similar to the one implemented by the generic visitor, but
3810    we need more infrastructure to maintain two parallel trees --- so,
3811    it's easier to duplicate.  */
3812
3813 static bin_tree_t *
3814 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3815 {
3816   const bin_tree_t *node;
3817   bin_tree_t *dup_root;
3818   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3819
3820   for (node = root; ; )
3821     {
3822       /* Create a new tree and link it back to the current parent.  */
3823       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3824       if (*p_new == NULL)
3825         return NULL;
3826       (*p_new)->parent = dup_node;
3827       (*p_new)->token.duplicated = 1;
3828       dup_node = *p_new;
3829
3830       /* Go to the left node, or up and to the right.  */
3831       if (node->left)
3832         {
3833           node = node->left;
3834           p_new = &dup_node->left;
3835         }
3836       else
3837         {
3838           const bin_tree_t *prev = NULL;
3839           while (node->right == prev || node->right == NULL)
3840             {
3841               prev = node;
3842               node = node->parent;
3843               dup_node = dup_node->parent;
3844               if (!node)
3845                 return dup_root;
3846             }
3847           node = node->right;
3848           p_new = &dup_node->right;
3849         }
3850     }
3851 }