posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002,2003,2004,2005,2006,2007 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           size_t length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  27 #ifdef RE_ENABLE_I18N
  28 static void free_charset (re_charset_t *cset);
  29 #endif /* RE_ENABLE_I18N */
  30 static void free_workarea_compile (regex_t *preg);
  31 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  32 #ifdef RE_ENABLE_I18N
  33 static void optimize_utf8 (re_dfa_t *dfa);
  34 #endif
  35 static reg_errcode_t analyze (regex_t *preg);
  36 static reg_errcode_t preorder (bin_tree_t *root,
  37                                reg_errcode_t (fn (void *, bin_tree_t *)),
  38                                void *extra);
  39 static reg_errcode_t postorder (bin_tree_t *root,
  40                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  41                                 void *extra);
  42 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  43 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  44 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  45                                  bin_tree_t *node);
  46 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  47 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  48 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  49 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  50 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  51                                    unsigned int constraint);
  52 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  53 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  54                                          int node, int root);
  55 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  56 static int fetch_number (re_string_t *input, re_token_t *token,
  57                          reg_syntax_t syntax);
  58 static int peek_token (re_token_t *token, re_string_t *input,
  59                         reg_syntax_t syntax) internal_function;
  60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  61                           reg_syntax_t syntax, reg_errcode_t *err);
  62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  63                                   re_token_t *token, reg_syntax_t syntax,
  64                                   int nest, reg_errcode_t *err);
  65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  66                                  re_token_t *token, reg_syntax_t syntax,
  67                                  int nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  69                                      re_token_t *token, reg_syntax_t syntax,
  70                                      int nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  72                                   re_token_t *token, reg_syntax_t syntax,
  73                                   int nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  75                                  re_dfa_t *dfa, re_token_t *token,
  76                                  reg_syntax_t syntax, reg_errcode_t *err);
  77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  78                                       re_token_t *token, reg_syntax_t syntax,
  79                                       reg_errcode_t *err);
  80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  81                                             re_string_t *regexp,
  82                                             re_token_t *token, int token_len,
  83                                             re_dfa_t *dfa,
  84                                             reg_syntax_t syntax,
  85                                             int accept_hyphen);
  86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  87                                           re_string_t *regexp,
  88                                           re_token_t *token);
  89 #ifdef RE_ENABLE_I18N
  90 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  91                                         re_charset_t *mbcset,
  92                                         int *equiv_class_alloc,
  93                                         const unsigned char *name);
  94 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  95                                       bitset_t sbcset,
  96                                       re_charset_t *mbcset,
  97                                       int *char_class_alloc,
  98                                       const unsigned char *class_name,
  99                                       reg_syntax_t syntax);
 100 #else  /* not RE_ENABLE_I18N */
 101 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 102                                         const unsigned char *name);
 103 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 104                                       bitset_t sbcset,
 105                                       const unsigned char *class_name,
 106                                       reg_syntax_t syntax);
 107 #endif /* not RE_ENABLE_I18N */
 108 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 109                                        RE_TRANSLATE_TYPE trans,
 110                                        const unsigned char *class_name,
 111                                        const unsigned char *extra,
 112                                        int non_match, reg_errcode_t *err);
 113 static bin_tree_t *create_tree (re_dfa_t *dfa,
 114                                 bin_tree_t *left, bin_tree_t *right,
 115                                 re_token_type_t type);
 116 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 117                                       bin_tree_t *left, bin_tree_t *right,
 118                                       const re_token_t *token);
 119 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 120 static void free_token (re_token_t *node);
 121 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 122 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 123 \f
 124 /* This table gives an error message for each of the error codes listed
 125    in regex.h.  Obviously the order here has to be same as there.
 126    POSIX doesn't require that we do anything for REG_NOERROR,
 127    but why not be nice?  */
 128
 129 const char __re_error_msgid[] attribute_hidden =
 130   {
 131 #define REG_NOERROR_IDX 0
 132     gettext_noop ("Success")    /* REG_NOERROR */
 133     "\0"
 134 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 135     gettext_noop ("No match")   /* REG_NOMATCH */
 136     "\0"
 137 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 138     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 139     "\0"
 140 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 141     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 142     "\0"
 143 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 144     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 145     "\0"
 146 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 147     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 148     "\0"
 149 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 150     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 151     "\0"
 152 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 153     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 154     "\0"
 155 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 156     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 157     "\0"
 158 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 159     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 160     "\0"
 161 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 162     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 163     "\0"
 164 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 165     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 166     "\0"
 167 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 168     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 169     "\0"
 170 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 171     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 172     "\0"
 173 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 174     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 175     "\0"
 176 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 177     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 178     "\0"
 179 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 180     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 181   };
 182
 183 const size_t __re_error_msgid_idx[] attribute_hidden =
 184   {
 185     REG_NOERROR_IDX,
 186     REG_NOMATCH_IDX,
 187     REG_BADPAT_IDX,
 188     REG_ECOLLATE_IDX,
 189     REG_ECTYPE_IDX,
 190     REG_EESCAPE_IDX,
 191     REG_ESUBREG_IDX,
 192     REG_EBRACK_IDX,
 193     REG_EPAREN_IDX,
 194     REG_EBRACE_IDX,
 195     REG_BADBR_IDX,
 196     REG_ERANGE_IDX,
 197     REG_ESPACE_IDX,
 198     REG_BADRPT_IDX,
 199     REG_EEND_IDX,
 200     REG_ESIZE_IDX,
 201     REG_ERPAREN_IDX
 202   };
 203 \f
 204 /* Entry points for GNU code.  */
 205
 206 /* re_compile_pattern is the GNU regular expression compiler: it
 207    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 208    Returns 0 if the pattern was valid, otherwise an error string.
 209
 210    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 211    are set in BUFP on entry.  */
 212
 213 const char *
 214 re_compile_pattern (pattern, length, bufp)
 215     const char *pattern;
 216     size_t length;
 217     struct re_pattern_buffer *bufp;
 218 {
 219   reg_errcode_t ret;
 220
 221   /* And GNU code determines whether or not to get register information
 222      by passing null for the REGS argument to re_match, etc., not by
 223      setting no_sub, unless RE_NO_SUB is set.  */
 224   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 225
 226   /* Match anchors at newline.  */
 227   bufp->newline_anchor = 1;
 228
 229   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 230
 231   if (!ret)
 232     return NULL;
 233   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 234 }
 235 #ifdef _LIBC
 236 weak_alias (__re_compile_pattern, re_compile_pattern)
 237 #endif
 238
 239 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 240    also be assigned to arbitrarily: each pattern buffer stores its own
 241    syntax, so it can be changed between regex compilations.  */
 242 /* This has no initializer because initialized variables in Emacs
 243    become read-only after dumping.  */
 244 reg_syntax_t re_syntax_options;
 245
 246
 247 /* Specify the precise syntax of regexps for compilation.  This provides
 248    for compatibility for various utilities which historically have
 249    different, incompatible syntaxes.
 250
 251    The argument SYNTAX is a bit mask comprised of the various bits
 252    defined in regex.h.  We return the old syntax.  */
 253
 254 reg_syntax_t
 255 re_set_syntax (syntax)
 256     reg_syntax_t syntax;
 257 {
 258   reg_syntax_t ret = re_syntax_options;
 259
 260   re_syntax_options = syntax;
 261   return ret;
 262 }
 263 #ifdef _LIBC
 264 weak_alias (__re_set_syntax, re_set_syntax)
 265 #endif
 266
 267 int
 268 re_compile_fastmap (bufp)
 269     struct re_pattern_buffer *bufp;
 270 {
 271   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 272   char *fastmap = bufp->fastmap;
 273
 274   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 275   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 276   if (dfa->init_state != dfa->init_state_word)
 277     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 278   if (dfa->init_state != dfa->init_state_nl)
 279     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 280   if (dfa->init_state != dfa->init_state_begbuf)
 281     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 282   bufp->fastmap_accurate = 1;
 283   return 0;
 284 }
 285 #ifdef _LIBC
 286 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 287 #endif
 288
 289 static inline void
 290 __attribute ((always_inline))
 291 re_set_fastmap (char *fastmap, int icase, int ch)
 292 {
 293   fastmap[ch] = 1;
 294   if (icase)
 295     fastmap[tolower (ch)] = 1;
 296 }
 297
 298 /* Helper function for re_compile_fastmap.
 299    Compile fastmap for the initial_state INIT_STATE.  */
 300
 301 static void
 302 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 303                          char *fastmap)
 304 {
 305   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 306   int node_cnt;
 307   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 308   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 309     {
 310       int node = init_state->nodes.elems[node_cnt];
 311       re_token_type_t type = dfa->nodes[node].type;
 312
 313       if (type == CHARACTER)
 314         {
 315           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 316 #ifdef RE_ENABLE_I18N
 317           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 318             {
 319               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 320               wchar_t wc;
 321               mbstate_t state;
 322
 323               p = buf;
 324               *p++ = dfa->nodes[node].opr.c;
 325               while (++node < dfa->nodes_len
 326                      && dfa->nodes[node].type == CHARACTER
 327                      && dfa->nodes[node].mb_partial)
 328                 *p++ = dfa->nodes[node].opr.c;
 329               memset (&state, '\0', sizeof (state));
 330               if (mbrtowc (&wc, (const char *) buf, p - buf,
 331                            &state) == p - buf
 332                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 333                       != (size_t) -1))
 334                 re_set_fastmap (fastmap, 0, buf[0]);
 335             }
 336 #endif
 337         }
 338       else if (type == SIMPLE_BRACKET)
 339         {
 340           int i, ch;
 341           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 342             {
 343               int j;
 344               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 345               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 346                 if (w & ((bitset_word_t) 1 << j))
 347                   re_set_fastmap (fastmap, icase, ch);
 348             }
 349         }
 350 #ifdef RE_ENABLE_I18N
 351       else if (type == COMPLEX_BRACKET)
 352         {
 353           int i;
 354           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 355           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 356               || cset->nranges || cset->nchar_classes)
 357             {
 358 # ifdef _LIBC
 359               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 360                 {
 361                   /* In this case we want to catch the bytes which are
 362                      the first byte of any collation elements.
 363                      e.g. In da_DK, we want to catch 'a' since "aa"
 364                           is a valid collation element, and don't catch
 365                           'b' since 'b' is the only collation element
 366                           which starts from 'b'.  */
 367                   const int32_t *table = (const int32_t *)
 368                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 369                   for (i = 0; i < SBC_MAX; ++i)
 370                     if (table[i] < 0)
 371                       re_set_fastmap (fastmap, icase, i);
 372                 }
 373 # else
 374               if (dfa->mb_cur_max > 1)
 375                 for (i = 0; i < SBC_MAX; ++i)
 376                   if (__btowc (i) == WEOF)
 377                     re_set_fastmap (fastmap, icase, i);
 378 # endif /* not _LIBC */
 379             }
 380           for (i = 0; i < cset->nmbchars; ++i)
 381             {
 382               char buf[256];
 383               mbstate_t state;
 384               memset (&state, '\0', sizeof (state));
 385               if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 386                 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 387               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 388                 {
 389                   if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 390                       != (size_t) -1)
 391                     re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 392                 }
 393             }
 394         }
 395 #endif /* RE_ENABLE_I18N */
 396       else if (type == OP_PERIOD
 397 #ifdef RE_ENABLE_I18N
 398                || type == OP_UTF8_PERIOD
 399 #endif /* RE_ENABLE_I18N */
 400                || type == END_OF_RE)
 401         {
 402           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 403           if (type == END_OF_RE)
 404             bufp->can_be_null = 1;
 405           return;
 406         }
 407     }
 408 }
 409 \f
 410 /* Entry point for POSIX code.  */
 411 /* regcomp takes a regular expression as a string and compiles it.
 412
 413    PREG is a regex_t *.  We do not expect any fields to be initialized,
 414    since POSIX says we shouldn't.  Thus, we set
 415
 416      `buffer' to the compiled pattern;
 417      `used' to the length of the compiled pattern;
 418      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 419        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 420        RE_SYNTAX_POSIX_BASIC;
 421      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 422      `fastmap' to an allocated space for the fastmap;
 423      `fastmap_accurate' to zero;
 424      `re_nsub' to the number of subexpressions in PATTERN.
 425
 426    PATTERN is the address of the pattern string.
 427
 428    CFLAGS is a series of bits which affect compilation.
 429
 430      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 431      use POSIX basic syntax.
 432
 433      If REG_NEWLINE is set, then . and [^...] don't match newline.
 434      Also, regexec will try a match beginning after every newline.
 435
 436      If REG_ICASE is set, then we considers upper- and lowercase
 437      versions of letters to be equivalent when matching.
 438
 439      If REG_NOSUB is set, then when PREG is passed to regexec, that
 440      routine will report only success or failure, and nothing about the
 441      registers.
 442
 443    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 444    the return codes and their meanings.)  */
 445
 446 int
 447 regcomp (preg, pattern, cflags)
 448     regex_t *__restrict preg;
 449     const char *__restrict pattern;
 450     int cflags;
 451 {
 452   reg_errcode_t ret;
 453   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 454                          : RE_SYNTAX_POSIX_BASIC);
 455
 456   preg->buffer = NULL;
 457   preg->allocated = 0;
 458   preg->used = 0;
 459
 460   /* Try to allocate space for the fastmap.  */
 461   preg->fastmap = re_malloc (char, SBC_MAX);
 462   if (BE (preg->fastmap == NULL, 0))
 463     return REG_ESPACE;
 464
 465   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 466
 467   /* If REG_NEWLINE is set, newlines are treated differently.  */
 468   if (cflags & REG_NEWLINE)
 469     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 470       syntax &= ~RE_DOT_NEWLINE;
 471       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 472       /* It also changes the matching behavior.  */
 473       preg->newline_anchor = 1;
 474     }
 475   else
 476     preg->newline_anchor = 0;
 477   preg->no_sub = !!(cflags & REG_NOSUB);
 478   preg->translate = NULL;
 479
 480   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 481
 482   /* POSIX doesn't distinguish between an unmatched open-group and an
 483      unmatched close-group: both are REG_EPAREN.  */
 484   if (ret == REG_ERPAREN)
 485     ret = REG_EPAREN;
 486
 487   /* We have already checked preg->fastmap != NULL.  */
 488   if (BE (ret == REG_NOERROR, 1))
 489     /* Compute the fastmap now, since regexec cannot modify the pattern
 490        buffer.  This function never fails in this implementation.  */
 491     (void) re_compile_fastmap (preg);
 492   else
 493     {
 494       /* Some error occurred while compiling the expression.  */
 495       re_free (preg->fastmap);
 496       preg->fastmap = NULL;
 497     }
 498
 499   return (int) ret;
 500 }
 501 #ifdef _LIBC
 502 weak_alias (__regcomp, regcomp)
 503 #endif
 504
 505 /* Returns a message corresponding to an error code, ERRCODE, returned
 506    from either regcomp or regexec.   We don't use PREG here.  */
 507
 508 size_t
 509 regerror (errcode, preg, errbuf, errbuf_size)
 510     int errcode;
 511     const regex_t *__restrict preg;
 512     char *__restrict errbuf;
 513     size_t errbuf_size;
 514 {
 515   const char *msg;
 516   size_t msg_size;
 517
 518   if (BE (errcode < 0
 519           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 520                                / sizeof (__re_error_msgid_idx[0])), 0))
 521     /* Only error codes returned by the rest of the code should be passed
 522        to this routine.  If we are given anything else, or if other regex
 523        code generates an invalid error code, then the program has a bug.
 524        Dump core so we can fix it.  */
 525     abort ();
 526
 527   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 528
 529   msg_size = strlen (msg) + 1; /* Includes the null.  */
 530
 531   if (BE (errbuf_size != 0, 1))
 532     {
 533       if (BE (msg_size > errbuf_size, 0))
 534         {
 535 #if defined HAVE_MEMPCPY || defined _LIBC
 536           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 537 #else
 538           memcpy (errbuf, msg, errbuf_size - 1);
 539           errbuf[errbuf_size - 1] = 0;
 540 #endif
 541         }
 542       else
 543         memcpy (errbuf, msg, msg_size);
 544     }
 545
 546   return msg_size;
 547 }
 548 #ifdef _LIBC
 549 weak_alias (__regerror, regerror)
 550 #endif
 551
 552
 553 #ifdef RE_ENABLE_I18N
 554 /* This static array is used for the map to single-byte characters when
 555    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 556    it the same all the time.  UTF-8 is the preferred encoding so this is
 557    a worthwhile optimization.  */
 558 static const bitset_t utf8_sb_map =
 559 {
 560   /* Set the first 128 bits.  */
 561   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 562 };
 563 #endif
 564
 565
 566 static void
 567 free_dfa_content (re_dfa_t *dfa)
 568 {
 569   int i, j;
 570
 571   if (dfa->nodes)
 572     for (i = 0; i < dfa->nodes_len; ++i)
 573       free_token (dfa->nodes + i);
 574   re_free (dfa->nexts);
 575   for (i = 0; i < dfa->nodes_len; ++i)
 576     {
 577       if (dfa->eclosures != NULL)
 578         re_node_set_free (dfa->eclosures + i);
 579       if (dfa->inveclosures != NULL)
 580         re_node_set_free (dfa->inveclosures + i);
 581       if (dfa->edests != NULL)
 582         re_node_set_free (dfa->edests + i);
 583     }
 584   re_free (dfa->edests);
 585   re_free (dfa->eclosures);
 586   re_free (dfa->inveclosures);
 587   re_free (dfa->nodes);
 588
 589   if (dfa->state_table)
 590     for (i = 0; i <= dfa->state_hash_mask; ++i)
 591       {
 592         struct re_state_table_entry *entry = dfa->state_table + i;
 593         for (j = 0; j < entry->num; ++j)
 594           {
 595             re_dfastate_t *state = entry->array[j];
 596             free_state (state);
 597           }
 598         re_free (entry->array);
 599       }
 600   re_free (dfa->state_table);
 601 #ifdef RE_ENABLE_I18N
 602   if (dfa->sb_char != utf8_sb_map)
 603     re_free (dfa->sb_char);
 604 #endif
 605   re_free (dfa->subexp_map);
 606 #ifdef DEBUG
 607   re_free (dfa->re_str);
 608 #endif
 609
 610   re_free (dfa);
 611 }
 612
 613
 614 /* Free dynamically allocated space used by PREG.  */
 615
 616 void
 617 regfree (preg)
 618     regex_t *preg;
 619 {
 620   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 621   if (BE (dfa != NULL, 1))
 622     free_dfa_content (dfa);
 623   preg->buffer = NULL;
 624   preg->allocated = 0;
 625
 626   re_free (preg->fastmap);
 627   preg->fastmap = NULL;
 628
 629   re_free (preg->translate);
 630   preg->translate = NULL;
 631 }
 632 #ifdef _LIBC
 633 weak_alias (__regfree, regfree)
 634 #endif
 635 \f
 636 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 637    them unless specifically requested.  */
 638
 639 #if defined _REGEX_RE_COMP || defined _LIBC
 640
 641 /* BSD has one and only one pattern buffer.  */
 642 static struct re_pattern_buffer re_comp_buf;
 643
 644 char *
 645 # ifdef _LIBC
 646 /* Make these definitions weak in libc, so POSIX programs can redefine
 647    these names if they don't use our functions, and still use
 648    regcomp/regexec above without link errors.  */
 649 weak_function
 650 # endif
 651 re_comp (s)
 652      const char *s;
 653 {
 654   reg_errcode_t ret;
 655   char *fastmap;
 656
 657   if (!s)
 658     {
 659       if (!re_comp_buf.buffer)
 660         return gettext ("No previous regular expression");
 661       return 0;
 662     }
 663
 664   if (re_comp_buf.buffer)
 665     {
 666       fastmap = re_comp_buf.fastmap;
 667       re_comp_buf.fastmap = NULL;
 668       __regfree (&re_comp_buf);
 669       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 670       re_comp_buf.fastmap = fastmap;
 671     }
 672
 673   if (re_comp_buf.fastmap == NULL)
 674     {
 675       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 676       if (re_comp_buf.fastmap == NULL)
 677         return (char *) gettext (__re_error_msgid
 678                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 679     }
 680
 681   /* Since `re_exec' always passes NULL for the `regs' argument, we
 682      don't need to initialize the pattern buffer fields which affect it.  */
 683
 684   /* Match anchors at newlines.  */
 685   re_comp_buf.newline_anchor = 1;
 686
 687   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 688
 689   if (!ret)
 690     return NULL;
 691
 692   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 693   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 694 }
 695
 696 #ifdef _LIBC
 697 libc_freeres_fn (free_mem)
 698 {
 699   __regfree (&re_comp_buf);
 700 }
 701 #endif
 702
 703 #endif /* _REGEX_RE_COMP */
 704 \f
 705 /* Internal entry point.
 706    Compile the regular expression PATTERN, whose length is LENGTH.
 707    SYNTAX indicate regular expression's syntax.  */
 708
 709 static reg_errcode_t
 710 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 711                      reg_syntax_t syntax)
 712 {
 713   reg_errcode_t err = REG_NOERROR;
 714   re_dfa_t *dfa;
 715   re_string_t regexp;
 716
 717   /* Initialize the pattern buffer.  */
 718   preg->fastmap_accurate = 0;
 719   preg->syntax = syntax;
 720   preg->not_bol = preg->not_eol = 0;
 721   preg->used = 0;
 722   preg->re_nsub = 0;
 723   preg->can_be_null = 0;
 724   preg->regs_allocated = REGS_UNALLOCATED;
 725
 726   /* Initialize the dfa.  */
 727   dfa = (re_dfa_t *) preg->buffer;
 728   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 729     {
 730       /* If zero allocated, but buffer is non-null, try to realloc
 731          enough space.  This loses if buffer's address is bogus, but
 732          that is the user's responsibility.  If ->buffer is NULL this
 733          is a simple allocation.  */
 734       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 735       if (dfa == NULL)
 736         return REG_ESPACE;
 737       preg->allocated = sizeof (re_dfa_t);
 738       preg->buffer = (unsigned char *) dfa;
 739     }
 740   preg->used = sizeof (re_dfa_t);
 741
 742   err = init_dfa (dfa, length);
 743   if (BE (err != REG_NOERROR, 0))
 744     {
 745       free_dfa_content (dfa);
 746       preg->buffer = NULL;
 747       preg->allocated = 0;
 748       return err;
 749     }
 750 #ifdef DEBUG
 751   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 752   dfa->re_str = re_malloc (char, length + 1);
 753   strncpy (dfa->re_str, pattern, length + 1);
 754 #endif
 755
 756   __libc_lock_init (dfa->lock);
 757
 758   err = re_string_construct (&regexp, pattern, length, preg->translate,
 759                              syntax & RE_ICASE, dfa);
 760   if (BE (err != REG_NOERROR, 0))
 761     {
 762     re_compile_internal_free_return:
 763       free_workarea_compile (preg);
 764       re_string_destruct (&regexp);
 765       free_dfa_content (dfa);
 766       preg->buffer = NULL;
 767       preg->allocated = 0;
 768       return err;
 769     }
 770
 771   /* Parse the regular expression, and build a structure tree.  */
 772   preg->re_nsub = 0;
 773   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 774   if (BE (dfa->str_tree == NULL, 0))
 775     goto re_compile_internal_free_return;
 776
 777   /* Analyze the tree and create the nfa.  */
 778   err = analyze (preg);
 779   if (BE (err != REG_NOERROR, 0))
 780     goto re_compile_internal_free_return;
 781
 782 #ifdef RE_ENABLE_I18N
 783   /* If possible, do searching in single byte encoding to speed things up.  */
 784   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 785     optimize_utf8 (dfa);
 786 #endif
 787
 788   /* Then create the initial state of the dfa.  */
 789   err = create_initial_state (dfa);
 790
 791   /* Release work areas.  */
 792   free_workarea_compile (preg);
 793   re_string_destruct (&regexp);
 794
 795   if (BE (err != REG_NOERROR, 0))
 796     {
 797       free_dfa_content (dfa);
 798       preg->buffer = NULL;
 799       preg->allocated = 0;
 800     }
 801
 802   return err;
 803 }
 804
 805 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 806    as the initial length of some arrays.  */
 807
 808 static reg_errcode_t
 809 init_dfa (re_dfa_t *dfa, size_t pat_len)
 810 {
 811   unsigned int table_size;
 812 #ifndef _LIBC
 813   char *codeset_name;
 814 #endif
 815
 816   memset (dfa, '\0', sizeof (re_dfa_t));
 817
 818   /* Force allocation of str_tree_storage the first time.  */
 819   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 820
 821   /* Avoid overflows.  */
 822   if (pat_len == SIZE_MAX)
 823     return REG_ESPACE;
 824
 825   dfa->nodes_alloc = pat_len + 1;
 826   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 827
 828   /*  table_size = 2 ^ ceil(log pat_len) */
 829   for (table_size = 1; ; table_size <<= 1)
 830     if (table_size > pat_len)
 831       break;
 832
 833   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 834   dfa->state_hash_mask = table_size - 1;
 835
 836   dfa->mb_cur_max = MB_CUR_MAX;
 837 #ifdef _LIBC
 838   if (dfa->mb_cur_max == 6
 839       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 840     dfa->is_utf8 = 1;
 841   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 842                        != 0);
 843 #else
 844 # ifdef HAVE_LANGINFO_CODESET
 845   codeset_name = nl_langinfo (CODESET);
 846 # else
 847   codeset_name = getenv ("LC_ALL");
 848   if (codeset_name == NULL || codeset_name[0] == '\0')
 849     codeset_name = getenv ("LC_CTYPE");
 850   if (codeset_name == NULL || codeset_name[0] == '\0')
 851     codeset_name = getenv ("LANG");
 852   if (codeset_name == NULL)
 853     codeset_name = "";
 854   else if (strchr (codeset_name, '.') !=  NULL)
 855     codeset_name = strchr (codeset_name, '.') + 1;
 856 # endif
 857
 858   if (strcasecmp (codeset_name, "UTF-8") == 0
 859       || strcasecmp (codeset_name, "UTF8") == 0)
 860     dfa->is_utf8 = 1;
 861
 862   /* We check exhaustively in the loop below if this charset is a
 863      superset of ASCII.  */
 864   dfa->map_notascii = 0;
 865 #endif
 866
 867 #ifdef RE_ENABLE_I18N
 868   if (dfa->mb_cur_max > 1)
 869     {
 870       if (dfa->is_utf8)
 871         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 872       else
 873         {
 874           int i, j, ch;
 875
 876           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 877           if (BE (dfa->sb_char == NULL, 0))
 878             return REG_ESPACE;
 879
 880           /* Set the bits corresponding to single byte chars.  */
 881           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 882             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 883               {
 884                 wint_t wch = __btowc (ch);
 885                 if (wch != WEOF)
 886                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 887 # ifndef _LIBC
 888                 if (isascii (ch) && wch != ch)
 889                   dfa->map_notascii = 1;
 890 # endif
 891               }
 892         }
 893     }
 894 #endif
 895
 896   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 897     return REG_ESPACE;
 898   return REG_NOERROR;
 899 }
 900
 901 /* Initialize WORD_CHAR table, which indicate which character is
 902    "word".  In this case "word" means that it is the word construction
 903    character used by some operators like "\<", "\>", etc.  */
 904
 905 static void
 906 internal_function
 907 init_word_char (re_dfa_t *dfa)
 908 {
 909   int i, j, ch;
 910   dfa->word_ops_used = 1;
 911   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 912     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 913       if (isalnum (ch) || ch == '_')
 914         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 915 }
 916
 917 /* Free the work area which are only used while compiling.  */
 918
 919 static void
 920 free_workarea_compile (regex_t *preg)
 921 {
 922   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 923   bin_tree_storage_t *storage, *next;
 924   for (storage = dfa->str_tree_storage; storage; storage = next)
 925     {
 926       next = storage->next;
 927       re_free (storage);
 928     }
 929   dfa->str_tree_storage = NULL;
 930   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 931   dfa->str_tree = NULL;
 932   re_free (dfa->org_indices);
 933   dfa->org_indices = NULL;
 934 }
 935
 936 /* Create initial states for all contexts.  */
 937
 938 static reg_errcode_t
 939 create_initial_state (re_dfa_t *dfa)
 940 {
 941   int first, i;
 942   reg_errcode_t err;
 943   re_node_set init_nodes;
 944
 945   /* Initial states have the epsilon closure of the node which is
 946      the first node of the regular expression.  */
 947   first = dfa->str_tree->first->node_idx;
 948   dfa->init_node = first;
 949   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 950   if (BE (err != REG_NOERROR, 0))
 951     return err;
 952
 953   /* The back-references which are in initial states can epsilon transit,
 954      since in this case all of the subexpressions can be null.
 955      Then we add epsilon closures of the nodes which are the next nodes of
 956      the back-references.  */
 957   if (dfa->nbackref > 0)
 958     for (i = 0; i < init_nodes.nelem; ++i)
 959       {
 960         int node_idx = init_nodes.elems[i];
 961         re_token_type_t type = dfa->nodes[node_idx].type;
 962
 963         int clexp_idx;
 964         if (type != OP_BACK_REF)
 965           continue;
 966         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 967           {
 968             re_token_t *clexp_node;
 969             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 970             if (clexp_node->type == OP_CLOSE_SUBEXP
 971                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
 972               break;
 973           }
 974         if (clexp_idx == init_nodes.nelem)
 975           continue;
 976
 977         if (type == OP_BACK_REF)
 978           {
 979             int dest_idx = dfa->edests[node_idx].elems[0];
 980             if (!re_node_set_contains (&init_nodes, dest_idx))
 981               {
 982                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
 983                 i = 0;
 984               }
 985           }
 986       }
 987
 988   /* It must be the first time to invoke acquire_state.  */
 989   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
 990   /* We don't check ERR here, since the initial state must not be NULL.  */
 991   if (BE (dfa->init_state == NULL, 0))
 992     return err;
 993   if (dfa->init_state->has_constraint)
 994     {
 995       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
 996                                                        CONTEXT_WORD);
 997       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
 998                                                      CONTEXT_NEWLINE);
 999       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1000                                                          &init_nodes,
1001                                                          CONTEXT_NEWLINE
1002                                                          | CONTEXT_BEGBUF);
1003       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1004               || dfa->init_state_begbuf == NULL, 0))
1005         return err;
1006     }
1007   else
1008     dfa->init_state_word = dfa->init_state_nl
1009       = dfa->init_state_begbuf = dfa->init_state;
1010
1011   re_node_set_free (&init_nodes);
1012   return REG_NOERROR;
1013 }
1014 \f
1015 #ifdef RE_ENABLE_I18N
1016 /* If it is possible to do searching in single byte encoding instead of UTF-8
1017    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1018    DFA nodes where needed.  */
1019
1020 static void
1021 optimize_utf8 (re_dfa_t *dfa)
1022 {
1023   int node, i, mb_chars = 0, has_period = 0;
1024
1025   for (node = 0; node < dfa->nodes_len; ++node)
1026     switch (dfa->nodes[node].type)
1027       {
1028       case CHARACTER:
1029         if (dfa->nodes[node].opr.c >= 0x80)
1030           mb_chars = 1;
1031         break;
1032       case ANCHOR:
1033         switch (dfa->nodes[node].opr.idx)
1034           {
1035           case LINE_FIRST:
1036           case LINE_LAST:
1037           case BUF_FIRST:
1038           case BUF_LAST:
1039             break;
1040           default:
1041             /* Word anchors etc. cannot be handled.  */
1042             return;
1043           }
1044         break;
1045       case OP_PERIOD:
1046         has_period = 1;
1047         break;
1048       case OP_BACK_REF:
1049       case OP_ALT:
1050       case END_OF_RE:
1051       case OP_DUP_ASTERISK:
1052       case OP_OPEN_SUBEXP:
1053       case OP_CLOSE_SUBEXP:
1054         break;
1055       case COMPLEX_BRACKET:
1056         return;
1057       case SIMPLE_BRACKET:
1058         /* Just double check.  The non-ASCII range starts at 0x80.  */
1059         assert (0x80 % BITSET_WORD_BITS == 0);
1060         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1061           if (dfa->nodes[node].opr.sbcset[i])
1062             return;
1063         break;
1064       default:
1065         abort ();
1066       }
1067
1068   if (mb_chars || has_period)
1069     for (node = 0; node < dfa->nodes_len; ++node)
1070       {
1071         if (dfa->nodes[node].type == CHARACTER
1072             && dfa->nodes[node].opr.c >= 0x80)
1073           dfa->nodes[node].mb_partial = 0;
1074         else if (dfa->nodes[node].type == OP_PERIOD)
1075           dfa->nodes[node].type = OP_UTF8_PERIOD;
1076       }
1077
1078   /* The search can be in single byte locale.  */
1079   dfa->mb_cur_max = 1;
1080   dfa->is_utf8 = 0;
1081   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1082 }
1083 #endif
1084 \f
1085 /* Analyze the structure tree, and calculate "first", "next", "edest",
1086    "eclosure", and "inveclosure".  */
1087
1088 static reg_errcode_t
1089 analyze (regex_t *preg)
1090 {
1091   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1092   reg_errcode_t ret;
1093
1094   /* Allocate arrays.  */
1095   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1096   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1097   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1098   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1099   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1100           || dfa->eclosures == NULL, 0))
1101     return REG_ESPACE;
1102
1103   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1104   if (dfa->subexp_map != NULL)
1105     {
1106       int i;
1107       for (i = 0; i < preg->re_nsub; i++)
1108         dfa->subexp_map[i] = i;
1109       preorder (dfa->str_tree, optimize_subexps, dfa);
1110       for (i = 0; i < preg->re_nsub; i++)
1111         if (dfa->subexp_map[i] != i)
1112           break;
1113       if (i == preg->re_nsub)
1114         {
1115           free (dfa->subexp_map);
1116           dfa->subexp_map = NULL;
1117         }
1118     }
1119
1120   ret = postorder (dfa->str_tree, lower_subexps, preg);
1121   if (BE (ret != REG_NOERROR, 0))
1122     return ret;
1123   ret = postorder (dfa->str_tree, calc_first, dfa);
1124   if (BE (ret != REG_NOERROR, 0))
1125     return ret;
1126   preorder (dfa->str_tree, calc_next, dfa);
1127   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1128   if (BE (ret != REG_NOERROR, 0))
1129     return ret;
1130   ret = calc_eclosure (dfa);
1131   if (BE (ret != REG_NOERROR, 0))
1132     return ret;
1133
1134   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1135      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1136   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1137       || dfa->nbackref)
1138     {
1139       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1140       if (BE (dfa->inveclosures == NULL, 0))
1141         return REG_ESPACE;
1142       ret = calc_inveclosure (dfa);
1143     }
1144
1145   return ret;
1146 }
1147
1148 /* Our parse trees are very unbalanced, so we cannot use a stack to
1149    implement parse tree visits.  Instead, we use parent pointers and
1150    some hairy code in these two functions.  */
1151 static reg_errcode_t
1152 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1153            void *extra)
1154 {
1155   bin_tree_t *node, *prev;
1156
1157   for (node = root; ; )
1158     {
1159       /* Descend down the tree, preferably to the left (or to the right
1160          if that's the only child).  */
1161       while (node->left || node->right)
1162         if (node->left)
1163           node = node->left;
1164         else
1165           node = node->right;
1166
1167       do
1168         {
1169           reg_errcode_t err = fn (extra, node);
1170           if (BE (err != REG_NOERROR, 0))
1171             return err;
1172           if (node->parent == NULL)
1173             return REG_NOERROR;
1174           prev = node;
1175           node = node->parent;
1176         }
1177       /* Go up while we have a node that is reached from the right.  */
1178       while (node->right == prev || node->right == NULL);
1179       node = node->right;
1180     }
1181 }
1182
1183 static reg_errcode_t
1184 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1185           void *extra)
1186 {
1187   bin_tree_t *node;
1188
1189   for (node = root; ; )
1190     {
1191       reg_errcode_t err = fn (extra, node);
1192       if (BE (err != REG_NOERROR, 0))
1193         return err;
1194
1195       /* Go to the left node, or up and to the right.  */
1196       if (node->left)
1197         node = node->left;
1198       else
1199         {
1200           bin_tree_t *prev = NULL;
1201           while (node->right == prev || node->right == NULL)
1202             {
1203               prev = node;
1204               node = node->parent;
1205               if (!node)
1206                 return REG_NOERROR;
1207             }
1208           node = node->right;
1209         }
1210     }
1211 }
1212
1213 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1214    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1215    backreferences as well.  Requires a preorder visit.  */
1216 static reg_errcode_t
1217 optimize_subexps (void *extra, bin_tree_t *node)
1218 {
1219   re_dfa_t *dfa = (re_dfa_t *) extra;
1220
1221   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1222     {
1223       int idx = node->token.opr.idx;
1224       node->token.opr.idx = dfa->subexp_map[idx];
1225       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1226     }
1227
1228   else if (node->token.type == SUBEXP
1229            && node->left && node->left->token.type == SUBEXP)
1230     {
1231       int other_idx = node->left->token.opr.idx;
1232
1233       node->left = node->left->left;
1234       if (node->left)
1235         node->left->parent = node;
1236
1237       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1238       if (other_idx < BITSET_WORD_BITS)
1239           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1240     }
1241
1242   return REG_NOERROR;
1243 }
1244
1245 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1246    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1247 static reg_errcode_t
1248 lower_subexps (void *extra, bin_tree_t *node)
1249 {
1250   regex_t *preg = (regex_t *) extra;
1251   reg_errcode_t err = REG_NOERROR;
1252
1253   if (node->left && node->left->token.type == SUBEXP)
1254     {
1255       node->left = lower_subexp (&err, preg, node->left);
1256       if (node->left)
1257         node->left->parent = node;
1258     }
1259   if (node->right && node->right->token.type == SUBEXP)
1260     {
1261       node->right = lower_subexp (&err, preg, node->right);
1262       if (node->right)
1263         node->right->parent = node;
1264     }
1265
1266   return err;
1267 }
1268
1269 static bin_tree_t *
1270 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1271 {
1272   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1273   bin_tree_t *body = node->left;
1274   bin_tree_t *op, *cls, *tree1, *tree;
1275
1276   if (preg->no_sub
1277       /* We do not optimize empty subexpressions, because otherwise we may
1278          have bad CONCAT nodes with NULL children.  This is obviously not
1279          very common, so we do not lose much.  An example that triggers
1280          this case is the sed "script" /\(\)/x.  */
1281       && node->left != NULL
1282       && (node->token.opr.idx >= BITSET_WORD_BITS
1283           || !(dfa->used_bkref_map
1284                & ((bitset_word_t) 1 << node->token.opr.idx))))
1285     return node->left;
1286
1287   /* Convert the SUBEXP node to the concatenation of an
1288      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1289   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1290   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1291   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1292   tree = create_tree (dfa, op, tree1, CONCAT);
1293   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1294     {
1295       *err = REG_ESPACE;
1296       return NULL;
1297     }
1298
1299   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1300   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1301   return tree;
1302 }
1303
1304 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1305    nodes.  Requires a postorder visit.  */
1306 static reg_errcode_t
1307 calc_first (void *extra, bin_tree_t *node)
1308 {
1309   re_dfa_t *dfa = (re_dfa_t *) extra;
1310   if (node->token.type == CONCAT)
1311     {
1312       node->first = node->left->first;
1313       node->node_idx = node->left->node_idx;
1314     }
1315   else
1316     {
1317       node->first = node;
1318       node->node_idx = re_dfa_add_node (dfa, node->token);
1319       if (BE (node->node_idx == -1, 0))
1320         return REG_ESPACE;
1321     }
1322   return REG_NOERROR;
1323 }
1324
1325 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1326 static reg_errcode_t
1327 calc_next (void *extra, bin_tree_t *node)
1328 {
1329   switch (node->token.type)
1330     {
1331     case OP_DUP_ASTERISK:
1332       node->left->next = node;
1333       break;
1334     case CONCAT:
1335       node->left->next = node->right->first;
1336       node->right->next = node->next;
1337       break;
1338     default:
1339       if (node->left)
1340         node->left->next = node->next;
1341       if (node->right)
1342         node->right->next = node->next;
1343       break;
1344     }
1345   return REG_NOERROR;
1346 }
1347
1348 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1349 static reg_errcode_t
1350 link_nfa_nodes (void *extra, bin_tree_t *node)
1351 {
1352   re_dfa_t *dfa = (re_dfa_t *) extra;
1353   int idx = node->node_idx;
1354   reg_errcode_t err = REG_NOERROR;
1355
1356   switch (node->token.type)
1357     {
1358     case CONCAT:
1359       break;
1360
1361     case END_OF_RE:
1362       assert (node->next == NULL);
1363       break;
1364
1365     case OP_DUP_ASTERISK:
1366     case OP_ALT:
1367       {
1368         int left, right;
1369         dfa->has_plural_match = 1;
1370         if (node->left != NULL)
1371           left = node->left->first->node_idx;
1372         else
1373           left = node->next->node_idx;
1374         if (node->right != NULL)
1375           right = node->right->first->node_idx;
1376         else
1377           right = node->next->node_idx;
1378         assert (left > -1);
1379         assert (right > -1);
1380         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1381       }
1382       break;
1383
1384     case ANCHOR:
1385     case OP_OPEN_SUBEXP:
1386     case OP_CLOSE_SUBEXP:
1387       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1388       break;
1389
1390     case OP_BACK_REF:
1391       dfa->nexts[idx] = node->next->node_idx;
1392       if (node->token.type == OP_BACK_REF)
1393         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1394       break;
1395
1396     default:
1397       assert (!IS_EPSILON_NODE (node->token.type));
1398       dfa->nexts[idx] = node->next->node_idx;
1399       break;
1400     }
1401
1402   return err;
1403 }
1404
1405 /* Duplicate the epsilon closure of the node ROOT_NODE.
1406    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1407    to their own constraint.  */
1408
1409 static reg_errcode_t
1410 internal_function
1411 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1412                         int root_node, unsigned int init_constraint)
1413 {
1414   int org_node, clone_node, ret;
1415   unsigned int constraint = init_constraint;
1416   for (org_node = top_org_node, clone_node = top_clone_node;;)
1417     {
1418       int org_dest, clone_dest;
1419       if (dfa->nodes[org_node].type == OP_BACK_REF)
1420         {
1421           /* If the back reference epsilon-transit, its destination must
1422              also have the constraint.  Then duplicate the epsilon closure
1423              of the destination of the back reference, and store it in
1424              edests of the back reference.  */
1425           org_dest = dfa->nexts[org_node];
1426           re_node_set_empty (dfa->edests + clone_node);
1427           clone_dest = duplicate_node (dfa, org_dest, constraint);
1428           if (BE (clone_dest == -1, 0))
1429             return REG_ESPACE;
1430           dfa->nexts[clone_node] = dfa->nexts[org_node];
1431           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1432           if (BE (ret < 0, 0))
1433             return REG_ESPACE;
1434         }
1435       else if (dfa->edests[org_node].nelem == 0)
1436         {
1437           /* In case of the node can't epsilon-transit, don't duplicate the
1438              destination and store the original destination as the
1439              destination of the node.  */
1440           dfa->nexts[clone_node] = dfa->nexts[org_node];
1441           break;
1442         }
1443       else if (dfa->edests[org_node].nelem == 1)
1444         {
1445           /* In case of the node can epsilon-transit, and it has only one
1446              destination.  */
1447           org_dest = dfa->edests[org_node].elems[0];
1448           re_node_set_empty (dfa->edests + clone_node);
1449           if (dfa->nodes[org_node].type == ANCHOR)
1450             {
1451               /* In case of the node has another constraint, append it.  */
1452               if (org_node == root_node && clone_node != org_node)
1453                 {
1454                   /* ...but if the node is root_node itself, it means the
1455                      epsilon closure have a loop, then tie it to the
1456                      destination of the root_node.  */
1457                   ret = re_node_set_insert (dfa->edests + clone_node,
1458                                             org_dest);
1459                   if (BE (ret < 0, 0))
1460                     return REG_ESPACE;
1461                   break;
1462                 }
1463               constraint |= dfa->nodes[org_node].opr.ctx_type;
1464             }
1465           clone_dest = duplicate_node (dfa, org_dest, constraint);
1466           if (BE (clone_dest == -1, 0))
1467             return REG_ESPACE;
1468           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1469           if (BE (ret < 0, 0))
1470             return REG_ESPACE;
1471         }
1472       else /* dfa->edests[org_node].nelem == 2 */
1473         {
1474           /* In case of the node can epsilon-transit, and it has two
1475              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1476           org_dest = dfa->edests[org_node].elems[0];
1477           re_node_set_empty (dfa->edests + clone_node);
1478           /* Search for a duplicated node which satisfies the constraint.  */
1479           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1480           if (clone_dest == -1)
1481             {
1482               /* There are no such a duplicated node, create a new one.  */
1483               reg_errcode_t err;
1484               clone_dest = duplicate_node (dfa, org_dest, constraint);
1485               if (BE (clone_dest == -1, 0))
1486                 return REG_ESPACE;
1487               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1488               if (BE (ret < 0, 0))
1489                 return REG_ESPACE;
1490               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1491                                             root_node, constraint);
1492               if (BE (err != REG_NOERROR, 0))
1493                 return err;
1494             }
1495           else
1496             {
1497               /* There are a duplicated node which satisfy the constraint,
1498                  use it to avoid infinite loop.  */
1499               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1500               if (BE (ret < 0, 0))
1501                 return REG_ESPACE;
1502             }
1503
1504           org_dest = dfa->edests[org_node].elems[1];
1505           clone_dest = duplicate_node (dfa, org_dest, constraint);
1506           if (BE (clone_dest == -1, 0))
1507             return REG_ESPACE;
1508           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1509           if (BE (ret < 0, 0))
1510             return REG_ESPACE;
1511         }
1512       org_node = org_dest;
1513       clone_node = clone_dest;
1514     }
1515   return REG_NOERROR;
1516 }
1517
1518 /* Search for a node which is duplicated from the node ORG_NODE, and
1519    satisfies the constraint CONSTRAINT.  */
1520
1521 static int
1522 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1523                         unsigned int constraint)
1524 {
1525   int idx;
1526   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1527     {
1528       if (org_node == dfa->org_indices[idx]
1529           && constraint == dfa->nodes[idx].constraint)
1530         return idx; /* Found.  */
1531     }
1532   return -1; /* Not found.  */
1533 }
1534
1535 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1536    Return the index of the new node, or -1 if insufficient storage is
1537    available.  */
1538
1539 static int
1540 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1541 {
1542   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1543   if (BE (dup_idx != -1, 1))
1544     {
1545       dfa->nodes[dup_idx].constraint = constraint;
1546       if (dfa->nodes[org_idx].type == ANCHOR)
1547         dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1548       dfa->nodes[dup_idx].duplicated = 1;
1549
1550       /* Store the index of the original node.  */
1551       dfa->org_indices[dup_idx] = org_idx;
1552     }
1553   return dup_idx;
1554 }
1555
1556 static reg_errcode_t
1557 calc_inveclosure (re_dfa_t *dfa)
1558 {
1559   int src, idx, ret;
1560   for (idx = 0; idx < dfa->nodes_len; ++idx)
1561     re_node_set_init_empty (dfa->inveclosures + idx);
1562
1563   for (src = 0; src < dfa->nodes_len; ++src)
1564     {
1565       int *elems = dfa->eclosures[src].elems;
1566       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1567         {
1568           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1569           if (BE (ret == -1, 0))
1570             return REG_ESPACE;
1571         }
1572     }
1573
1574   return REG_NOERROR;
1575 }
1576
1577 /* Calculate "eclosure" for all the node in DFA.  */
1578
1579 static reg_errcode_t
1580 calc_eclosure (re_dfa_t *dfa)
1581 {
1582   int node_idx, incomplete;
1583 #ifdef DEBUG
1584   assert (dfa->nodes_len > 0);
1585 #endif
1586   incomplete = 0;
1587   /* For each nodes, calculate epsilon closure.  */
1588   for (node_idx = 0; ; ++node_idx)
1589     {
1590       reg_errcode_t err;
1591       re_node_set eclosure_elem;
1592       if (node_idx == dfa->nodes_len)
1593         {
1594           if (!incomplete)
1595             break;
1596           incomplete = 0;
1597           node_idx = 0;
1598         }
1599
1600 #ifdef DEBUG
1601       assert (dfa->eclosures[node_idx].nelem != -1);
1602 #endif
1603
1604       /* If we have already calculated, skip it.  */
1605       if (dfa->eclosures[node_idx].nelem != 0)
1606         continue;
1607       /* Calculate epsilon closure of `node_idx'.  */
1608       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1609       if (BE (err != REG_NOERROR, 0))
1610         return err;
1611
1612       if (dfa->eclosures[node_idx].nelem == 0)
1613         {
1614           incomplete = 1;
1615           re_node_set_free (&eclosure_elem);
1616         }
1617     }
1618   return REG_NOERROR;
1619 }
1620
1621 /* Calculate epsilon closure of NODE.  */
1622
1623 static reg_errcode_t
1624 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1625 {
1626   reg_errcode_t err;
1627   unsigned int constraint;
1628   int i, incomplete;
1629   re_node_set eclosure;
1630   incomplete = 0;
1631   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1632   if (BE (err != REG_NOERROR, 0))
1633     return err;
1634
1635   /* This indicates that we are calculating this node now.
1636      We reference this value to avoid infinite loop.  */
1637   dfa->eclosures[node].nelem = -1;
1638
1639   constraint = ((dfa->nodes[node].type == ANCHOR)
1640                 ? dfa->nodes[node].opr.ctx_type : 0);
1641   /* If the current node has constraints, duplicate all nodes.
1642      Since they must inherit the constraints.  */
1643   if (constraint
1644       && dfa->edests[node].nelem
1645       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1646     {
1647       err = duplicate_node_closure (dfa, node, node, node, constraint);
1648       if (BE (err != REG_NOERROR, 0))
1649         return err;
1650     }
1651
1652   /* Expand each epsilon destination nodes.  */
1653   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1654     for (i = 0; i < dfa->edests[node].nelem; ++i)
1655       {
1656         re_node_set eclosure_elem;
1657         int edest = dfa->edests[node].elems[i];
1658         /* If calculating the epsilon closure of `edest' is in progress,
1659            return intermediate result.  */
1660         if (dfa->eclosures[edest].nelem == -1)
1661           {
1662             incomplete = 1;
1663             continue;
1664           }
1665         /* If we haven't calculated the epsilon closure of `edest' yet,
1666            calculate now. Otherwise use calculated epsilon closure.  */
1667         if (dfa->eclosures[edest].nelem == 0)
1668           {
1669             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1670             if (BE (err != REG_NOERROR, 0))
1671               return err;
1672           }
1673         else
1674           eclosure_elem = dfa->eclosures[edest];
1675         /* Merge the epsilon closure of `edest'.  */
1676         re_node_set_merge (&eclosure, &eclosure_elem);
1677         /* If the epsilon closure of `edest' is incomplete,
1678            the epsilon closure of this node is also incomplete.  */
1679         if (dfa->eclosures[edest].nelem == 0)
1680           {
1681             incomplete = 1;
1682             re_node_set_free (&eclosure_elem);
1683           }
1684       }
1685
1686   /* Epsilon closures include itself.  */
1687   re_node_set_insert (&eclosure, node);
1688   if (incomplete && !root)
1689     dfa->eclosures[node].nelem = 0;
1690   else
1691     dfa->eclosures[node] = eclosure;
1692   *new_set = eclosure;
1693   return REG_NOERROR;
1694 }
1695 \f
1696 /* Functions for token which are used in the parser.  */
1697
1698 /* Fetch a token from INPUT.
1699    We must not use this function inside bracket expressions.  */
1700
1701 static void
1702 internal_function
1703 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1704 {
1705   re_string_skip_bytes (input, peek_token (result, input, syntax));
1706 }
1707
1708 /* Peek a token from INPUT, and return the length of the token.
1709    We must not use this function inside bracket expressions.  */
1710
1711 static int
1712 internal_function
1713 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1714 {
1715   unsigned char c;
1716
1717   if (re_string_eoi (input))
1718     {
1719       token->type = END_OF_RE;
1720       return 0;
1721     }
1722
1723   c = re_string_peek_byte (input, 0);
1724   token->opr.c = c;
1725
1726   token->word_char = 0;
1727 #ifdef RE_ENABLE_I18N
1728   token->mb_partial = 0;
1729   if (input->mb_cur_max > 1 &&
1730       !re_string_first_byte (input, re_string_cur_idx (input)))
1731     {
1732       token->type = CHARACTER;
1733       token->mb_partial = 1;
1734       return 1;
1735     }
1736 #endif
1737   if (c == '\\')
1738     {
1739       unsigned char c2;
1740       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1741         {
1742           token->type = BACK_SLASH;
1743           return 1;
1744         }
1745
1746       c2 = re_string_peek_byte_case (input, 1);
1747       token->opr.c = c2;
1748       token->type = CHARACTER;
1749 #ifdef RE_ENABLE_I18N
1750       if (input->mb_cur_max > 1)
1751         {
1752           wint_t wc = re_string_wchar_at (input,
1753                                           re_string_cur_idx (input) + 1);
1754           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1755         }
1756       else
1757 #endif
1758         token->word_char = IS_WORD_CHAR (c2) != 0;
1759
1760       switch (c2)
1761         {
1762         case '|':
1763           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1764             token->type = OP_ALT;
1765           break;
1766         case '1': case '2': case '3': case '4': case '5':
1767         case '6': case '7': case '8': case '9':
1768           if (!(syntax & RE_NO_BK_REFS))
1769             {
1770               token->type = OP_BACK_REF;
1771               token->opr.idx = c2 - '1';
1772             }
1773           break;
1774         case '<':
1775           if (!(syntax & RE_NO_GNU_OPS))
1776             {
1777               token->type = ANCHOR;
1778               token->opr.ctx_type = WORD_FIRST;
1779             }
1780           break;
1781         case '>':
1782           if (!(syntax & RE_NO_GNU_OPS))
1783             {
1784               token->type = ANCHOR;
1785               token->opr.ctx_type = WORD_LAST;
1786             }
1787           break;
1788         case 'b':
1789           if (!(syntax & RE_NO_GNU_OPS))
1790             {
1791               token->type = ANCHOR;
1792               token->opr.ctx_type = WORD_DELIM;
1793             }
1794           break;
1795         case 'B':
1796           if (!(syntax & RE_NO_GNU_OPS))
1797             {
1798               token->type = ANCHOR;
1799               token->opr.ctx_type = NOT_WORD_DELIM;
1800             }
1801           break;
1802         case 'w':
1803           if (!(syntax & RE_NO_GNU_OPS))
1804             token->type = OP_WORD;
1805           break;
1806         case 'W':
1807           if (!(syntax & RE_NO_GNU_OPS))
1808             token->type = OP_NOTWORD;
1809           break;
1810         case 's':
1811           if (!(syntax & RE_NO_GNU_OPS))
1812             token->type = OP_SPACE;
1813           break;
1814         case 'S':
1815           if (!(syntax & RE_NO_GNU_OPS))
1816             token->type = OP_NOTSPACE;
1817           break;
1818         case '`':
1819           if (!(syntax & RE_NO_GNU_OPS))
1820             {
1821               token->type = ANCHOR;
1822               token->opr.ctx_type = BUF_FIRST;
1823             }
1824           break;
1825         case '\'':
1826           if (!(syntax & RE_NO_GNU_OPS))
1827             {
1828               token->type = ANCHOR;
1829               token->opr.ctx_type = BUF_LAST;
1830             }
1831           break;
1832         case '(':
1833           if (!(syntax & RE_NO_BK_PARENS))
1834             token->type = OP_OPEN_SUBEXP;
1835           break;
1836         case ')':
1837           if (!(syntax & RE_NO_BK_PARENS))
1838             token->type = OP_CLOSE_SUBEXP;
1839           break;
1840         case '+':
1841           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1842             token->type = OP_DUP_PLUS;
1843           break;
1844         case '?':
1845           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1846             token->type = OP_DUP_QUESTION;
1847           break;
1848         case '{':
1849           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1850             token->type = OP_OPEN_DUP_NUM;
1851           break;
1852         case '}':
1853           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1854             token->type = OP_CLOSE_DUP_NUM;
1855           break;
1856         default:
1857           break;
1858         }
1859       return 2;
1860     }
1861
1862   token->type = CHARACTER;
1863 #ifdef RE_ENABLE_I18N
1864   if (input->mb_cur_max > 1)
1865     {
1866       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1867       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1868     }
1869   else
1870 #endif
1871     token->word_char = IS_WORD_CHAR (token->opr.c);
1872
1873   switch (c)
1874     {
1875     case '\n':
1876       if (syntax & RE_NEWLINE_ALT)
1877         token->type = OP_ALT;
1878       break;
1879     case '|':
1880       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1881         token->type = OP_ALT;
1882       break;
1883     case '*':
1884       token->type = OP_DUP_ASTERISK;
1885       break;
1886     case '+':
1887       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1888         token->type = OP_DUP_PLUS;
1889       break;
1890     case '?':
1891       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1892         token->type = OP_DUP_QUESTION;
1893       break;
1894     case '{':
1895       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1896         token->type = OP_OPEN_DUP_NUM;
1897       break;
1898     case '}':
1899       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1900         token->type = OP_CLOSE_DUP_NUM;
1901       break;
1902     case '(':
1903       if (syntax & RE_NO_BK_PARENS)
1904         token->type = OP_OPEN_SUBEXP;
1905       break;
1906     case ')':
1907       if (syntax & RE_NO_BK_PARENS)
1908         token->type = OP_CLOSE_SUBEXP;
1909       break;
1910     case '[':
1911       token->type = OP_OPEN_BRACKET;
1912       break;
1913     case '.':
1914       token->type = OP_PERIOD;
1915       break;
1916     case '^':
1917       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1918           re_string_cur_idx (input) != 0)
1919         {
1920           char prev = re_string_peek_byte (input, -1);
1921           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1922             break;
1923         }
1924       token->type = ANCHOR;
1925       token->opr.ctx_type = LINE_FIRST;
1926       break;
1927     case '$':
1928       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1929           re_string_cur_idx (input) + 1 != re_string_length (input))
1930         {
1931           re_token_t next;
1932           re_string_skip_bytes (input, 1);
1933           peek_token (&next, input, syntax);
1934           re_string_skip_bytes (input, -1);
1935           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1936             break;
1937         }
1938       token->type = ANCHOR;
1939       token->opr.ctx_type = LINE_LAST;
1940       break;
1941     default:
1942       break;
1943     }
1944   return 1;
1945 }
1946
1947 /* Peek a token from INPUT, and return the length of the token.
1948    We must not use this function out of bracket expressions.  */
1949
1950 static int
1951 internal_function
1952 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1953 {
1954   unsigned char c;
1955   if (re_string_eoi (input))
1956     {
1957       token->type = END_OF_RE;
1958       return 0;
1959     }
1960   c = re_string_peek_byte (input, 0);
1961   token->opr.c = c;
1962
1963 #ifdef RE_ENABLE_I18N
1964   if (input->mb_cur_max > 1 &&
1965       !re_string_first_byte (input, re_string_cur_idx (input)))
1966     {
1967       token->type = CHARACTER;
1968       return 1;
1969     }
1970 #endif /* RE_ENABLE_I18N */
1971
1972   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1973       && re_string_cur_idx (input) + 1 < re_string_length (input))
1974     {
1975       /* In this case, '\' escape a character.  */
1976       unsigned char c2;
1977       re_string_skip_bytes (input, 1);
1978       c2 = re_string_peek_byte (input, 0);
1979       token->opr.c = c2;
1980       token->type = CHARACTER;
1981       return 1;
1982     }
1983   if (c == '[') /* '[' is a special char in a bracket exps.  */
1984     {
1985       unsigned char c2;
1986       int token_len;
1987       if (re_string_cur_idx (input) + 1 < re_string_length (input))
1988         c2 = re_string_peek_byte (input, 1);
1989       else
1990         c2 = 0;
1991       token->opr.c = c2;
1992       token_len = 2;
1993       switch (c2)
1994         {
1995         case '.':
1996           token->type = OP_OPEN_COLL_ELEM;
1997           break;
1998         case '=':
1999           token->type = OP_OPEN_EQUIV_CLASS;
2000           break;
2001         case ':':
2002           if (syntax & RE_CHAR_CLASSES)
2003             {
2004               token->type = OP_OPEN_CHAR_CLASS;
2005               break;
2006             }
2007           /* else fall through.  */
2008         default:
2009           token->type = CHARACTER;
2010           token->opr.c = c;
2011           token_len = 1;
2012           break;
2013         }
2014       return token_len;
2015     }
2016   switch (c)
2017     {
2018     case '-':
2019       token->type = OP_CHARSET_RANGE;
2020       break;
2021     case ']':
2022       token->type = OP_CLOSE_BRACKET;
2023       break;
2024     case '^':
2025       token->type = OP_NON_MATCH_LIST;
2026       break;
2027     default:
2028       token->type = CHARACTER;
2029     }
2030   return 1;
2031 }
2032 \f
2033 /* Functions for parser.  */
2034
2035 /* Entry point of the parser.
2036    Parse the regular expression REGEXP and return the structure tree.
2037    If an error is occured, ERR is set by error code, and return NULL.
2038    This function build the following tree, from regular expression <reg_exp>:
2039            CAT
2040            / \
2041           /   \
2042    <reg_exp>  EOR
2043
2044    CAT means concatenation.
2045    EOR means end of regular expression.  */
2046
2047 static bin_tree_t *
2048 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2049        reg_errcode_t *err)
2050 {
2051   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2052   bin_tree_t *tree, *eor, *root;
2053   re_token_t current_token;
2054   dfa->syntax = syntax;
2055   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2056   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2057   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2058     return NULL;
2059   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2060   if (tree != NULL)
2061     root = create_tree (dfa, tree, eor, CONCAT);
2062   else
2063     root = eor;
2064   if (BE (eor == NULL || root == NULL, 0))
2065     {
2066       *err = REG_ESPACE;
2067       return NULL;
2068     }
2069   return root;
2070 }
2071
2072 /* This function build the following tree, from regular expression
2073    <branch1>|<branch2>:
2074            ALT
2075            / \
2076           /   \
2077    <branch1> <branch2>
2078
2079    ALT means alternative, which represents the operator `|'.  */
2080
2081 static bin_tree_t *
2082 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2083                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2084 {
2085   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2086   bin_tree_t *tree, *branch = NULL;
2087   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2088   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2089     return NULL;
2090
2091   while (token->type == OP_ALT)
2092     {
2093       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2094       if (token->type != OP_ALT && token->type != END_OF_RE
2095           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2096         {
2097           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2098           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2099             return NULL;
2100         }
2101       else
2102         branch = NULL;
2103       tree = create_tree (dfa, tree, branch, OP_ALT);
2104       if (BE (tree == NULL, 0))
2105         {
2106           *err = REG_ESPACE;
2107           return NULL;
2108         }
2109     }
2110   return tree;
2111 }
2112
2113 /* This function build the following tree, from regular expression
2114    <exp1><exp2>:
2115         CAT
2116         / \
2117        /   \
2118    <exp1> <exp2>
2119
2120    CAT means concatenation.  */
2121
2122 static bin_tree_t *
2123 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2124               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2125 {
2126   bin_tree_t *tree, *exp;
2127   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2128   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2129   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2130     return NULL;
2131
2132   while (token->type != OP_ALT && token->type != END_OF_RE
2133          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2134     {
2135       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2136       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2137         {
2138           return NULL;
2139         }
2140       if (tree != NULL && exp != NULL)
2141         {
2142           tree = create_tree (dfa, tree, exp, CONCAT);
2143           if (tree == NULL)
2144             {
2145               *err = REG_ESPACE;
2146               return NULL;
2147             }
2148         }
2149       else if (tree == NULL)
2150         tree = exp;
2151       /* Otherwise exp == NULL, we don't need to create new tree.  */
2152     }
2153   return tree;
2154 }
2155
2156 /* This function build the following tree, from regular expression a*:
2157          *
2158          |
2159          a
2160 */
2161
2162 static bin_tree_t *
2163 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2164                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2165 {
2166   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2167   bin_tree_t *tree;
2168   switch (token->type)
2169     {
2170     case CHARACTER:
2171       tree = create_token_tree (dfa, NULL, NULL, token);
2172       if (BE (tree == NULL, 0))
2173         {
2174           *err = REG_ESPACE;
2175           return NULL;
2176         }
2177 #ifdef RE_ENABLE_I18N
2178       if (dfa->mb_cur_max > 1)
2179         {
2180           while (!re_string_eoi (regexp)
2181                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2182             {
2183               bin_tree_t *mbc_remain;
2184               fetch_token (token, regexp, syntax);
2185               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2186               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2187               if (BE (mbc_remain == NULL || tree == NULL, 0))
2188                 {
2189                   *err = REG_ESPACE;
2190                   return NULL;
2191                 }
2192             }
2193         }
2194 #endif
2195       break;
2196     case OP_OPEN_SUBEXP:
2197       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2198       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2199         return NULL;
2200       break;
2201     case OP_OPEN_BRACKET:
2202       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2203       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2204         return NULL;
2205       break;
2206     case OP_BACK_REF:
2207       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2208         {
2209           *err = REG_ESUBREG;
2210           return NULL;
2211         }
2212       dfa->used_bkref_map |= 1 << token->opr.idx;
2213       tree = create_token_tree (dfa, NULL, NULL, token);
2214       if (BE (tree == NULL, 0))
2215         {
2216           *err = REG_ESPACE;
2217           return NULL;
2218         }
2219       ++dfa->nbackref;
2220       dfa->has_mb_node = 1;
2221       break;
2222     case OP_OPEN_DUP_NUM:
2223       if (syntax & RE_CONTEXT_INVALID_DUP)
2224         {
2225           *err = REG_BADRPT;
2226           return NULL;
2227         }
2228       /* FALLTHROUGH */
2229     case OP_DUP_ASTERISK:
2230     case OP_DUP_PLUS:
2231     case OP_DUP_QUESTION:
2232       if (syntax & RE_CONTEXT_INVALID_OPS)
2233         {
2234           *err = REG_BADRPT;
2235           return NULL;
2236         }
2237       else if (syntax & RE_CONTEXT_INDEP_OPS)
2238         {
2239           fetch_token (token, regexp, syntax);
2240           return parse_expression (regexp, preg, token, syntax, nest, err);
2241         }
2242       /* else fall through  */
2243     case OP_CLOSE_SUBEXP:
2244       if ((token->type == OP_CLOSE_SUBEXP) &&
2245           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2246         {
2247           *err = REG_ERPAREN;
2248           return NULL;
2249         }
2250       /* else fall through  */
2251     case OP_CLOSE_DUP_NUM:
2252       /* We treat it as a normal character.  */
2253
2254       /* Then we can these characters as normal characters.  */
2255       token->type = CHARACTER;
2256       /* mb_partial and word_char bits should be initialized already
2257          by peek_token.  */
2258       tree = create_token_tree (dfa, NULL, NULL, token);
2259       if (BE (tree == NULL, 0))
2260         {
2261           *err = REG_ESPACE;
2262           return NULL;
2263         }
2264       break;
2265     case ANCHOR:
2266       if ((token->opr.ctx_type
2267            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2268           && dfa->word_ops_used == 0)
2269         init_word_char (dfa);
2270       if (token->opr.ctx_type == WORD_DELIM
2271           || token->opr.ctx_type == NOT_WORD_DELIM)
2272         {
2273           bin_tree_t *tree_first, *tree_last;
2274           if (token->opr.ctx_type == WORD_DELIM)
2275             {
2276               token->opr.ctx_type = WORD_FIRST;
2277               tree_first = create_token_tree (dfa, NULL, NULL, token);
2278               token->opr.ctx_type = WORD_LAST;
2279             }
2280           else
2281             {
2282               token->opr.ctx_type = INSIDE_WORD;
2283               tree_first = create_token_tree (dfa, NULL, NULL, token);
2284               token->opr.ctx_type = INSIDE_NOTWORD;
2285             }
2286           tree_last = create_token_tree (dfa, NULL, NULL, token);
2287           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2288           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2289             {
2290               *err = REG_ESPACE;
2291               return NULL;
2292             }
2293         }
2294       else
2295         {
2296           tree = create_token_tree (dfa, NULL, NULL, token);
2297           if (BE (tree == NULL, 0))
2298             {
2299               *err = REG_ESPACE;
2300               return NULL;
2301             }
2302         }
2303       /* We must return here, since ANCHORs can't be followed
2304          by repetition operators.
2305          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2306              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2307       fetch_token (token, regexp, syntax);
2308       return tree;
2309     case OP_PERIOD:
2310       tree = create_token_tree (dfa, NULL, NULL, token);
2311       if (BE (tree == NULL, 0))
2312         {
2313           *err = REG_ESPACE;
2314           return NULL;
2315         }
2316       if (dfa->mb_cur_max > 1)
2317         dfa->has_mb_node = 1;
2318       break;
2319     case OP_WORD:
2320     case OP_NOTWORD:
2321       tree = build_charclass_op (dfa, regexp->trans,
2322                                  (const unsigned char *) "alnum",
2323                                  (const unsigned char *) "_",
2324                                  token->type == OP_NOTWORD, err);
2325       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2326         return NULL;
2327       break;
2328     case OP_SPACE:
2329     case OP_NOTSPACE:
2330       tree = build_charclass_op (dfa, regexp->trans,
2331                                  (const unsigned char *) "space",
2332                                  (const unsigned char *) "",
2333                                  token->type == OP_NOTSPACE, err);
2334       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2335         return NULL;
2336       break;
2337     case OP_ALT:
2338     case END_OF_RE:
2339       return NULL;
2340     case BACK_SLASH:
2341       *err = REG_EESCAPE;
2342       return NULL;
2343     default:
2344       /* Must not happen?  */
2345 #ifdef DEBUG
2346       assert (0);
2347 #endif
2348       return NULL;
2349     }
2350   fetch_token (token, regexp, syntax);
2351
2352   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2353          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2354     {
2355       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2356       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2357         return NULL;
2358       /* In BRE consecutive duplications are not allowed.  */
2359       if ((syntax & RE_CONTEXT_INVALID_DUP)
2360           && (token->type == OP_DUP_ASTERISK
2361               || token->type == OP_OPEN_DUP_NUM))
2362         {
2363           *err = REG_BADRPT;
2364           return NULL;
2365         }
2366     }
2367
2368   return tree;
2369 }
2370
2371 /* This function build the following tree, from regular expression
2372    (<reg_exp>):
2373          SUBEXP
2374             |
2375         <reg_exp>
2376 */
2377
2378 static bin_tree_t *
2379 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2380                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2381 {
2382   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2383   bin_tree_t *tree;
2384   size_t cur_nsub;
2385   cur_nsub = preg->re_nsub++;
2386
2387   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2388
2389   /* The subexpression may be a null string.  */
2390   if (token->type == OP_CLOSE_SUBEXP)
2391     tree = NULL;
2392   else
2393     {
2394       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2395       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2396         *err = REG_EPAREN;
2397       if (BE (*err != REG_NOERROR, 0))
2398         return NULL;
2399     }
2400
2401   if (cur_nsub <= '9' - '1')
2402     dfa->completed_bkref_map |= 1 << cur_nsub;
2403
2404   tree = create_tree (dfa, tree, NULL, SUBEXP);
2405   if (BE (tree == NULL, 0))
2406     {
2407       *err = REG_ESPACE;
2408       return NULL;
2409     }
2410   tree->token.opr.idx = cur_nsub;
2411   return tree;
2412 }
2413
2414 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2415
2416 static bin_tree_t *
2417 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2418               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2419 {
2420   bin_tree_t *tree = NULL, *old_tree = NULL;
2421   int i, start, end, start_idx = re_string_cur_idx (regexp);
2422   re_token_t start_token = *token;
2423
2424   if (token->type == OP_OPEN_DUP_NUM)
2425     {
2426       end = 0;
2427       start = fetch_number (regexp, token, syntax);
2428       if (start == -1)
2429         {
2430           if (token->type == CHARACTER && token->opr.c == ',')
2431             start = 0; /* We treat "{,m}" as "{0,m}".  */
2432           else
2433             {
2434               *err = REG_BADBR; /* <re>{} is invalid.  */
2435               return NULL;
2436             }
2437         }
2438       if (BE (start != -2, 1))
2439         {
2440           /* We treat "{n}" as "{n,n}".  */
2441           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2442                  : ((token->type == CHARACTER && token->opr.c == ',')
2443                     ? fetch_number (regexp, token, syntax) : -2));
2444         }
2445       if (BE (start == -2 || end == -2, 0))
2446         {
2447           /* Invalid sequence.  */
2448           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2449             {
2450               if (token->type == END_OF_RE)
2451                 *err = REG_EBRACE;
2452               else
2453                 *err = REG_BADBR;
2454
2455               return NULL;
2456             }
2457
2458           /* If the syntax bit is set, rollback.  */
2459           re_string_set_index (regexp, start_idx);
2460           *token = start_token;
2461           token->type = CHARACTER;
2462           /* mb_partial and word_char bits should be already initialized by
2463              peek_token.  */
2464           return elem;
2465         }
2466
2467       if (BE (end != -1 && start > end, 0))
2468         {
2469           /* First number greater than second.  */
2470           *err = REG_BADBR;
2471           return NULL;
2472         }
2473     }
2474   else
2475     {
2476       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2477       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2478     }
2479
2480   fetch_token (token, regexp, syntax);
2481
2482   if (BE (elem == NULL, 0))
2483     return NULL;
2484   if (BE (start == 0 && end == 0, 0))
2485     {
2486       postorder (elem, free_tree, NULL);
2487       return NULL;
2488     }
2489
2490   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2491   if (BE (start > 0, 0))
2492     {
2493       tree = elem;
2494       for (i = 2; i <= start; ++i)
2495         {
2496           elem = duplicate_tree (elem, dfa);
2497           tree = create_tree (dfa, tree, elem, CONCAT);
2498           if (BE (elem == NULL || tree == NULL, 0))
2499             goto parse_dup_op_espace;
2500         }
2501
2502       if (start == end)
2503         return tree;
2504
2505       /* Duplicate ELEM before it is marked optional.  */
2506       elem = duplicate_tree (elem, dfa);
2507       old_tree = tree;
2508     }
2509   else
2510     old_tree = NULL;
2511
2512   if (elem->token.type == SUBEXP)
2513     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2514
2515   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2516   if (BE (tree == NULL, 0))
2517     goto parse_dup_op_espace;
2518
2519   /* This loop is actually executed only when end != -1,
2520      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2521      already created the start+1-th copy.  */
2522   for (i = start + 2; i <= end; ++i)
2523     {
2524       elem = duplicate_tree (elem, dfa);
2525       tree = create_tree (dfa, tree, elem, CONCAT);
2526       if (BE (elem == NULL || tree == NULL, 0))
2527         goto parse_dup_op_espace;
2528
2529       tree = create_tree (dfa, tree, NULL, OP_ALT);
2530       if (BE (tree == NULL, 0))
2531         goto parse_dup_op_espace;
2532     }
2533
2534   if (old_tree)
2535     tree = create_tree (dfa, old_tree, tree, CONCAT);
2536
2537   return tree;
2538
2539  parse_dup_op_espace:
2540   *err = REG_ESPACE;
2541   return NULL;
2542 }
2543
2544 /* Size of the names for collating symbol/equivalence_class/character_class.
2545    I'm not sure, but maybe enough.  */
2546 #define BRACKET_NAME_BUF_SIZE 32
2547
2548 #ifndef _LIBC
2549   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2550      Build the range expression which starts from START_ELEM, and ends
2551      at END_ELEM.  The result are written to MBCSET and SBCSET.
2552      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2553      mbcset->range_ends, is a pointer argument sinse we may
2554      update it.  */
2555
2556 static reg_errcode_t
2557 internal_function
2558 # ifdef RE_ENABLE_I18N
2559 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2560                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2561 # else /* not RE_ENABLE_I18N */
2562 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2563                  bracket_elem_t *end_elem)
2564 # endif /* not RE_ENABLE_I18N */
2565 {
2566   unsigned int start_ch, end_ch;
2567   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2568   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2569           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2570           0))
2571     return REG_ERANGE;
2572
2573   /* We can handle no multi character collating elements without libc
2574      support.  */
2575   if (BE ((start_elem->type == COLL_SYM
2576            && strlen ((char *) start_elem->opr.name) > 1)
2577           || (end_elem->type == COLL_SYM
2578               && strlen ((char *) end_elem->opr.name) > 1), 0))
2579     return REG_ECOLLATE;
2580
2581 # ifdef RE_ENABLE_I18N
2582   {
2583     wchar_t wc;
2584     wint_t start_wc;
2585     wint_t end_wc;
2586     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2587
2588     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2589                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2590                    : 0));
2591     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2592               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2593                  : 0));
2594     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2595                 ? __btowc (start_ch) : start_elem->opr.wch);
2596     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2597               ? __btowc (end_ch) : end_elem->opr.wch);
2598     if (start_wc == WEOF || end_wc == WEOF)
2599       return REG_ECOLLATE;
2600     cmp_buf[0] = start_wc;
2601     cmp_buf[4] = end_wc;
2602     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2603       return REG_ERANGE;
2604
2605     /* Got valid collation sequence values, add them as a new entry.
2606        However, for !_LIBC we have no collation elements: if the
2607        character set is single byte, the single byte character set
2608        that we build below suffices.  parse_bracket_exp passes
2609        no MBCSET if dfa->mb_cur_max == 1.  */
2610     if (mbcset)
2611       {
2612         /* Check the space of the arrays.  */
2613         if (BE (*range_alloc == mbcset->nranges, 0))
2614           {
2615             /* There is not enough space, need realloc.  */
2616             wchar_t *new_array_start, *new_array_end;
2617             int new_nranges;
2618
2619             /* +1 in case of mbcset->nranges is 0.  */
2620             new_nranges = 2 * mbcset->nranges + 1;
2621             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2622                are NULL if *range_alloc == 0.  */
2623             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2624                                           new_nranges);
2625             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2626                                         new_nranges);
2627
2628             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2629               return REG_ESPACE;
2630
2631             mbcset->range_starts = new_array_start;
2632             mbcset->range_ends = new_array_end;
2633             *range_alloc = new_nranges;
2634           }
2635
2636         mbcset->range_starts[mbcset->nranges] = start_wc;
2637         mbcset->range_ends[mbcset->nranges++] = end_wc;
2638       }
2639
2640     /* Build the table for single byte characters.  */
2641     for (wc = 0; wc < SBC_MAX; ++wc)
2642       {
2643         cmp_buf[2] = wc;
2644         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2645             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2646           bitset_set (sbcset, wc);
2647       }
2648   }
2649 # else /* not RE_ENABLE_I18N */
2650   {
2651     unsigned int ch;
2652     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2653                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2654                    : 0));
2655     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2656               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2657                  : 0));
2658     if (start_ch > end_ch)
2659       return REG_ERANGE;
2660     /* Build the table for single byte characters.  */
2661     for (ch = 0; ch < SBC_MAX; ++ch)
2662       if (start_ch <= ch  && ch <= end_ch)
2663         bitset_set (sbcset, ch);
2664   }
2665 # endif /* not RE_ENABLE_I18N */
2666   return REG_NOERROR;
2667 }
2668 #endif /* not _LIBC */
2669
2670 #ifndef _LIBC
2671 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2672    Build the collating element which is represented by NAME.
2673    The result are written to MBCSET and SBCSET.
2674    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2675    pointer argument since we may update it.  */
2676
2677 static reg_errcode_t
2678 internal_function
2679 # ifdef RE_ENABLE_I18N
2680 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2681                         int *coll_sym_alloc, const unsigned char *name)
2682 # else /* not RE_ENABLE_I18N */
2683 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2684 # endif /* not RE_ENABLE_I18N */
2685 {
2686   size_t name_len = strlen ((const char *) name);
2687   if (BE (name_len != 1, 0))
2688     return REG_ECOLLATE;
2689   else
2690     {
2691       bitset_set (sbcset, name[0]);
2692       return REG_NOERROR;
2693     }
2694 }
2695 #endif /* not _LIBC */
2696
2697 /* This function parse bracket expression like "[abc]", "[a-c]",
2698    "[[.a-a.]]" etc.  */
2699
2700 static bin_tree_t *
2701 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2702                    reg_syntax_t syntax, reg_errcode_t *err)
2703 {
2704 #ifdef _LIBC
2705   const unsigned char *collseqmb;
2706   const char *collseqwc;
2707   uint32_t nrules;
2708   int32_t table_size;
2709   const int32_t *symb_table;
2710   const unsigned char *extra;
2711
2712   /* Local function for parse_bracket_exp used in _LIBC environement.
2713      Seek the collating symbol entry correspondings to NAME.
2714      Return the index of the symbol in the SYMB_TABLE.  */
2715
2716   auto inline int32_t
2717   __attribute ((always_inline))
2718   seek_collating_symbol_entry (name, name_len)
2719          const unsigned char *name;
2720          size_t name_len;
2721     {
2722       int32_t hash = elem_hash ((const char *) name, name_len);
2723       int32_t elem = hash % table_size;
2724       if (symb_table[2 * elem] != 0)
2725         {
2726           int32_t second = hash % (table_size - 2) + 1;
2727
2728           do
2729             {
2730               /* First compare the hashing value.  */
2731               if (symb_table[2 * elem] == hash
2732                   /* Compare the length of the name.  */
2733                   && name_len == extra[symb_table[2 * elem + 1]]
2734                   /* Compare the name.  */
2735                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2736                              name_len) == 0)
2737                 {
2738                   /* Yep, this is the entry.  */
2739                   break;
2740                 }
2741
2742               /* Next entry.  */
2743               elem += second;
2744             }
2745           while (symb_table[2 * elem] != 0);
2746         }
2747       return elem;
2748     }
2749
2750   /* Local function for parse_bracket_exp used in _LIBC environment.
2751      Look up the collation sequence value of BR_ELEM.
2752      Return the value if succeeded, UINT_MAX otherwise.  */
2753
2754   auto inline unsigned int
2755   __attribute ((always_inline))
2756   lookup_collation_sequence_value (br_elem)
2757          bracket_elem_t *br_elem;
2758     {
2759       if (br_elem->type == SB_CHAR)
2760         {
2761           /*
2762           if (MB_CUR_MAX == 1)
2763           */
2764           if (nrules == 0)
2765             return collseqmb[br_elem->opr.ch];
2766           else
2767             {
2768               wint_t wc = __btowc (br_elem->opr.ch);
2769               return __collseq_table_lookup (collseqwc, wc);
2770             }
2771         }
2772       else if (br_elem->type == MB_CHAR)
2773         {
2774           if (nrules != 0)
2775             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2776         }
2777       else if (br_elem->type == COLL_SYM)
2778         {
2779           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2780           if (nrules != 0)
2781             {
2782               int32_t elem, idx;
2783               elem = seek_collating_symbol_entry (br_elem->opr.name,
2784                                                   sym_name_len);
2785               if (symb_table[2 * elem] != 0)
2786                 {
2787                   /* We found the entry.  */
2788                   idx = symb_table[2 * elem + 1];
2789                   /* Skip the name of collating element name.  */
2790                   idx += 1 + extra[idx];
2791                   /* Skip the byte sequence of the collating element.  */
2792                   idx += 1 + extra[idx];
2793                   /* Adjust for the alignment.  */
2794                   idx = (idx + 3) & ~3;
2795                   /* Skip the multibyte collation sequence value.  */
2796                   idx += sizeof (unsigned int);
2797                   /* Skip the wide char sequence of the collating element.  */
2798                   idx += sizeof (unsigned int) *
2799                     (1 + *(unsigned int *) (extra + idx));
2800                   /* Return the collation sequence value.  */
2801                   return *(unsigned int *) (extra + idx);
2802                 }
2803               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2804                 {
2805                   /* No valid character.  Match it as a single byte
2806                      character.  */
2807                   return collseqmb[br_elem->opr.name[0]];
2808                 }
2809             }
2810           else if (sym_name_len == 1)
2811             return collseqmb[br_elem->opr.name[0]];
2812         }
2813       return UINT_MAX;
2814     }
2815
2816   /* Local function for parse_bracket_exp used in _LIBC environement.
2817      Build the range expression which starts from START_ELEM, and ends
2818      at END_ELEM.  The result are written to MBCSET and SBCSET.
2819      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2820      mbcset->range_ends, is a pointer argument sinse we may
2821      update it.  */
2822
2823   auto inline reg_errcode_t
2824   __attribute ((always_inline))
2825   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2826          re_charset_t *mbcset;
2827          int *range_alloc;
2828          bitset_t sbcset;
2829          bracket_elem_t *start_elem, *end_elem;
2830     {
2831       unsigned int ch;
2832       uint32_t start_collseq;
2833       uint32_t end_collseq;
2834
2835       /* Equivalence Classes and Character Classes can't be a range
2836          start/end.  */
2837       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2838               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2839               0))
2840         return REG_ERANGE;
2841
2842       start_collseq = lookup_collation_sequence_value (start_elem);
2843       end_collseq = lookup_collation_sequence_value (end_elem);
2844       /* Check start/end collation sequence values.  */
2845       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2846         return REG_ECOLLATE;
2847       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2848         return REG_ERANGE;
2849
2850       /* Got valid collation sequence values, add them as a new entry.
2851          However, if we have no collation elements, and the character set
2852          is single byte, the single byte character set that we
2853          build below suffices. */
2854       if (nrules > 0 || dfa->mb_cur_max > 1)
2855         {
2856           /* Check the space of the arrays.  */
2857           if (BE (*range_alloc == mbcset->nranges, 0))
2858             {
2859               /* There is not enough space, need realloc.  */
2860               uint32_t *new_array_start;
2861               uint32_t *new_array_end;
2862               int new_nranges;
2863
2864               /* +1 in case of mbcset->nranges is 0.  */
2865               new_nranges = 2 * mbcset->nranges + 1;
2866               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2867                                             new_nranges);
2868               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2869                                           new_nranges);
2870
2871               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2872                 return REG_ESPACE;
2873
2874               mbcset->range_starts = new_array_start;
2875               mbcset->range_ends = new_array_end;
2876               *range_alloc = new_nranges;
2877             }
2878
2879           mbcset->range_starts[mbcset->nranges] = start_collseq;
2880           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2881         }
2882
2883       /* Build the table for single byte characters.  */
2884       for (ch = 0; ch < SBC_MAX; ch++)
2885         {
2886           uint32_t ch_collseq;
2887           /*
2888           if (MB_CUR_MAX == 1)
2889           */
2890           if (nrules == 0)
2891             ch_collseq = collseqmb[ch];
2892           else
2893             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2894           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2895             bitset_set (sbcset, ch);
2896         }
2897       return REG_NOERROR;
2898     }
2899
2900   /* Local function for parse_bracket_exp used in _LIBC environement.
2901      Build the collating element which is represented by NAME.
2902      The result are written to MBCSET and SBCSET.
2903      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2904      pointer argument sinse we may update it.  */
2905
2906   auto inline reg_errcode_t
2907   __attribute ((always_inline))
2908   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2909          re_charset_t *mbcset;
2910          int *coll_sym_alloc;
2911          bitset_t sbcset;
2912          const unsigned char *name;
2913     {
2914       int32_t elem, idx;
2915       size_t name_len = strlen ((const char *) name);
2916       if (nrules != 0)
2917         {
2918           elem = seek_collating_symbol_entry (name, name_len);
2919           if (symb_table[2 * elem] != 0)
2920             {
2921               /* We found the entry.  */
2922               idx = symb_table[2 * elem + 1];
2923               /* Skip the name of collating element name.  */
2924               idx += 1 + extra[idx];
2925             }
2926           else if (symb_table[2 * elem] == 0 && name_len == 1)
2927             {
2928               /* No valid character, treat it as a normal
2929                  character.  */
2930               bitset_set (sbcset, name[0]);
2931               return REG_NOERROR;
2932             }
2933           else
2934             return REG_ECOLLATE;
2935
2936           /* Got valid collation sequence, add it as a new entry.  */
2937           /* Check the space of the arrays.  */
2938           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2939             {
2940               /* Not enough, realloc it.  */
2941               /* +1 in case of mbcset->ncoll_syms is 0.  */
2942               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2943               /* Use realloc since mbcset->coll_syms is NULL
2944                  if *alloc == 0.  */
2945               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2946                                                    new_coll_sym_alloc);
2947               if (BE (new_coll_syms == NULL, 0))
2948                 return REG_ESPACE;
2949               mbcset->coll_syms = new_coll_syms;
2950               *coll_sym_alloc = new_coll_sym_alloc;
2951             }
2952           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2953           return REG_NOERROR;
2954         }
2955       else
2956         {
2957           if (BE (name_len != 1, 0))
2958             return REG_ECOLLATE;
2959           else
2960             {
2961               bitset_set (sbcset, name[0]);
2962               return REG_NOERROR;
2963             }
2964         }
2965     }
2966 #endif
2967
2968   re_token_t br_token;
2969   re_bitset_ptr_t sbcset;
2970 #ifdef RE_ENABLE_I18N
2971   re_charset_t *mbcset;
2972   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2973   int equiv_class_alloc = 0, char_class_alloc = 0;
2974 #endif /* not RE_ENABLE_I18N */
2975   int non_match = 0;
2976   bin_tree_t *work_tree;
2977   int token_len;
2978   int first_round = 1;
2979 #ifdef _LIBC
2980   collseqmb = (const unsigned char *)
2981     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2982   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2983   if (nrules)
2984     {
2985       /*
2986       if (MB_CUR_MAX > 1)
2987       */
2988       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2989       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2990       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2991                                                   _NL_COLLATE_SYMB_TABLEMB);
2992       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2993                                                    _NL_COLLATE_SYMB_EXTRAMB);
2994     }
2995 #endif
2996   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
2997 #ifdef RE_ENABLE_I18N
2998   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2999 #endif /* RE_ENABLE_I18N */
3000 #ifdef RE_ENABLE_I18N
3001   if (BE (sbcset == NULL || mbcset == NULL, 0))
3002 #else
3003   if (BE (sbcset == NULL, 0))
3004 #endif /* RE_ENABLE_I18N */
3005     {
3006       *err = REG_ESPACE;
3007       return NULL;
3008     }
3009
3010   token_len = peek_token_bracket (token, regexp, syntax);
3011   if (BE (token->type == END_OF_RE, 0))
3012     {
3013       *err = REG_BADPAT;
3014       goto parse_bracket_exp_free_return;
3015     }
3016   if (token->type == OP_NON_MATCH_LIST)
3017     {
3018 #ifdef RE_ENABLE_I18N
3019       mbcset->non_match = 1;
3020 #endif /* not RE_ENABLE_I18N */
3021       non_match = 1;
3022       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3023         bitset_set (sbcset, '\n');
3024       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3025       token_len = peek_token_bracket (token, regexp, syntax);
3026       if (BE (token->type == END_OF_RE, 0))
3027         {
3028           *err = REG_BADPAT;
3029           goto parse_bracket_exp_free_return;
3030         }
3031     }
3032
3033   /* We treat the first ']' as a normal character.  */
3034   if (token->type == OP_CLOSE_BRACKET)
3035     token->type = CHARACTER;
3036
3037   while (1)
3038     {
3039       bracket_elem_t start_elem, end_elem;
3040       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3041       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3042       reg_errcode_t ret;
3043       int token_len2 = 0, is_range_exp = 0;
3044       re_token_t token2;
3045
3046       start_elem.opr.name = start_name_buf;
3047       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3048                                    syntax, first_round);
3049       if (BE (ret != REG_NOERROR, 0))
3050         {
3051           *err = ret;
3052           goto parse_bracket_exp_free_return;
3053         }
3054       first_round = 0;
3055
3056       /* Get information about the next token.  We need it in any case.  */
3057       token_len = peek_token_bracket (token, regexp, syntax);
3058
3059       /* Do not check for ranges if we know they are not allowed.  */
3060       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3061         {
3062           if (BE (token->type == END_OF_RE, 0))
3063             {
3064               *err = REG_EBRACK;
3065               goto parse_bracket_exp_free_return;
3066             }
3067           if (token->type == OP_CHARSET_RANGE)
3068             {
3069               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3070               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3071               if (BE (token2.type == END_OF_RE, 0))
3072                 {
3073                   *err = REG_EBRACK;
3074                   goto parse_bracket_exp_free_return;
3075                 }
3076               if (token2.type == OP_CLOSE_BRACKET)
3077                 {
3078                   /* We treat the last '-' as a normal character.  */
3079                   re_string_skip_bytes (regexp, -token_len);
3080                   token->type = CHARACTER;
3081                 }
3082               else
3083                 is_range_exp = 1;
3084             }
3085         }
3086
3087       if (is_range_exp == 1)
3088         {
3089           end_elem.opr.name = end_name_buf;
3090           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3091                                        dfa, syntax, 1);
3092           if (BE (ret != REG_NOERROR, 0))
3093             {
3094               *err = ret;
3095               goto parse_bracket_exp_free_return;
3096             }
3097
3098           token_len = peek_token_bracket (token, regexp, syntax);
3099
3100 #ifdef _LIBC
3101           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3102                                   &start_elem, &end_elem);
3103 #else
3104 # ifdef RE_ENABLE_I18N
3105           *err = build_range_exp (sbcset,
3106                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3107                                   &range_alloc, &start_elem, &end_elem);
3108 # else
3109           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3110 # endif
3111 #endif /* RE_ENABLE_I18N */
3112           if (BE (*err != REG_NOERROR, 0))
3113             goto parse_bracket_exp_free_return;
3114         }
3115       else
3116         {
3117           switch (start_elem.type)
3118             {
3119             case SB_CHAR:
3120               bitset_set (sbcset, start_elem.opr.ch);
3121               break;
3122 #ifdef RE_ENABLE_I18N
3123             case MB_CHAR:
3124               /* Check whether the array has enough space.  */
3125               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3126                 {
3127                   wchar_t *new_mbchars;
3128                   /* Not enough, realloc it.  */
3129                   /* +1 in case of mbcset->nmbchars is 0.  */
3130                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3131                   /* Use realloc since array is NULL if *alloc == 0.  */
3132                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3133                                             mbchar_alloc);
3134                   if (BE (new_mbchars == NULL, 0))
3135                     goto parse_bracket_exp_espace;
3136                   mbcset->mbchars = new_mbchars;
3137                 }
3138               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3139               break;
3140 #endif /* RE_ENABLE_I18N */
3141             case EQUIV_CLASS:
3142               *err = build_equiv_class (sbcset,
3143 #ifdef RE_ENABLE_I18N
3144                                         mbcset, &equiv_class_alloc,
3145 #endif /* RE_ENABLE_I18N */
3146                                         start_elem.opr.name);
3147               if (BE (*err != REG_NOERROR, 0))
3148                 goto parse_bracket_exp_free_return;
3149               break;
3150             case COLL_SYM:
3151               *err = build_collating_symbol (sbcset,
3152 #ifdef RE_ENABLE_I18N
3153                                              mbcset, &coll_sym_alloc,
3154 #endif /* RE_ENABLE_I18N */
3155                                              start_elem.opr.name);
3156               if (BE (*err != REG_NOERROR, 0))
3157                 goto parse_bracket_exp_free_return;
3158               break;
3159             case CHAR_CLASS:
3160               *err = build_charclass (regexp->trans, sbcset,
3161 #ifdef RE_ENABLE_I18N
3162                                       mbcset, &char_class_alloc,
3163 #endif /* RE_ENABLE_I18N */
3164                                       start_elem.opr.name, syntax);
3165               if (BE (*err != REG_NOERROR, 0))
3166                goto parse_bracket_exp_free_return;
3167               break;
3168             default:
3169               assert (0);
3170               break;
3171             }
3172         }
3173       if (BE (token->type == END_OF_RE, 0))
3174         {
3175           *err = REG_EBRACK;
3176           goto parse_bracket_exp_free_return;
3177         }
3178       if (token->type == OP_CLOSE_BRACKET)
3179         break;
3180     }
3181
3182   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3183
3184   /* If it is non-matching list.  */
3185   if (non_match)
3186     bitset_not (sbcset);
3187
3188 #ifdef RE_ENABLE_I18N
3189   /* Ensure only single byte characters are set.  */
3190   if (dfa->mb_cur_max > 1)
3191     bitset_mask (sbcset, dfa->sb_char);
3192
3193   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3194       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3195                                                      || mbcset->non_match)))
3196     {
3197       bin_tree_t *mbc_tree;
3198       int sbc_idx;
3199       /* Build a tree for complex bracket.  */
3200       dfa->has_mb_node = 1;
3201       br_token.type = COMPLEX_BRACKET;
3202       br_token.opr.mbcset = mbcset;
3203       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3204       if (BE (mbc_tree == NULL, 0))
3205         goto parse_bracket_exp_espace;
3206       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3207         if (sbcset[sbc_idx])
3208           break;
3209       /* If there are no bits set in sbcset, there is no point
3210          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3211       if (sbc_idx < BITSET_WORDS)
3212         {
3213           /* Build a tree for simple bracket.  */
3214           br_token.type = SIMPLE_BRACKET;
3215           br_token.opr.sbcset = sbcset;
3216           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3217           if (BE (work_tree == NULL, 0))
3218             goto parse_bracket_exp_espace;
3219
3220           /* Then join them by ALT node.  */
3221           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3222           if (BE (work_tree == NULL, 0))
3223             goto parse_bracket_exp_espace;
3224         }
3225       else
3226         {
3227           re_free (sbcset);
3228           work_tree = mbc_tree;
3229         }
3230     }
3231   else
3232 #endif /* not RE_ENABLE_I18N */
3233     {
3234 #ifdef RE_ENABLE_I18N
3235       free_charset (mbcset);
3236 #endif
3237       /* Build a tree for simple bracket.  */
3238       br_token.type = SIMPLE_BRACKET;
3239       br_token.opr.sbcset = sbcset;
3240       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3241       if (BE (work_tree == NULL, 0))
3242         goto parse_bracket_exp_espace;
3243     }
3244   return work_tree;
3245
3246  parse_bracket_exp_espace:
3247   *err = REG_ESPACE;
3248  parse_bracket_exp_free_return:
3249   re_free (sbcset);
3250 #ifdef RE_ENABLE_I18N
3251   free_charset (mbcset);
3252 #endif /* RE_ENABLE_I18N */
3253   return NULL;
3254 }
3255
3256 /* Parse an element in the bracket expression.  */
3257
3258 static reg_errcode_t
3259 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3260                        re_token_t *token, int token_len, re_dfa_t *dfa,
3261                        reg_syntax_t syntax, int accept_hyphen)
3262 {
3263 #ifdef RE_ENABLE_I18N
3264   int cur_char_size;
3265   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3266   if (cur_char_size > 1)
3267     {
3268       elem->type = MB_CHAR;
3269       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3270       re_string_skip_bytes (regexp, cur_char_size);
3271       return REG_NOERROR;
3272     }
3273 #endif /* RE_ENABLE_I18N */
3274   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3275   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3276       || token->type == OP_OPEN_EQUIV_CLASS)
3277     return parse_bracket_symbol (elem, regexp, token);
3278   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3279     {
3280       /* A '-' must only appear as anything but a range indicator before
3281          the closing bracket.  Everything else is an error.  */
3282       re_token_t token2;
3283       (void) peek_token_bracket (&token2, regexp, syntax);
3284       if (token2.type != OP_CLOSE_BRACKET)
3285         /* The actual error value is not standardized since this whole
3286            case is undefined.  But ERANGE makes good sense.  */
3287         return REG_ERANGE;
3288     }
3289   elem->type = SB_CHAR;
3290   elem->opr.ch = token->opr.c;
3291   return REG_NOERROR;
3292 }
3293
3294 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3295    such as [:<character_class>:], [.<collating_element>.], and
3296    [=<equivalent_class>=].  */
3297
3298 static reg_errcode_t
3299 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3300                       re_token_t *token)
3301 {
3302   unsigned char ch, delim = token->opr.c;
3303   int i = 0;
3304   if (re_string_eoi(regexp))
3305     return REG_EBRACK;
3306   for (;; ++i)
3307     {
3308       if (i >= BRACKET_NAME_BUF_SIZE)
3309         return REG_EBRACK;
3310       if (token->type == OP_OPEN_CHAR_CLASS)
3311         ch = re_string_fetch_byte_case (regexp);
3312       else
3313         ch = re_string_fetch_byte (regexp);
3314       if (re_string_eoi(regexp))
3315         return REG_EBRACK;
3316       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3317         break;
3318       elem->opr.name[i] = ch;
3319     }
3320   re_string_skip_bytes (regexp, 1);
3321   elem->opr.name[i] = '\0';
3322   switch (token->type)
3323     {
3324     case OP_OPEN_COLL_ELEM:
3325       elem->type = COLL_SYM;
3326       break;
3327     case OP_OPEN_EQUIV_CLASS:
3328       elem->type = EQUIV_CLASS;
3329       break;
3330     case OP_OPEN_CHAR_CLASS:
3331       elem->type = CHAR_CLASS;
3332       break;
3333     default:
3334       break;
3335     }
3336   return REG_NOERROR;
3337 }
3338
3339   /* Helper function for parse_bracket_exp.
3340      Build the equivalence class which is represented by NAME.
3341      The result are written to MBCSET and SBCSET.
3342      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3343      is a pointer argument sinse we may update it.  */
3344
3345 static reg_errcode_t
3346 #ifdef RE_ENABLE_I18N
3347 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3348                    int *equiv_class_alloc, const unsigned char *name)
3349 #else /* not RE_ENABLE_I18N */
3350 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3351 #endif /* not RE_ENABLE_I18N */
3352 {
3353 #ifdef _LIBC
3354   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3355   if (nrules != 0)
3356     {
3357       const int32_t *table, *indirect;
3358       const unsigned char *weights, *extra, *cp;
3359       unsigned char char_buf[2];
3360       int32_t idx1, idx2;
3361       unsigned int ch;
3362       size_t len;
3363       /* This #include defines a local function!  */
3364 # include <locale/weight.h>
3365       /* Calculate the index for equivalence class.  */
3366       cp = name;
3367       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3368       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3369                                                _NL_COLLATE_WEIGHTMB);
3370       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3371                                                    _NL_COLLATE_EXTRAMB);
3372       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3373                                                 _NL_COLLATE_INDIRECTMB);
3374       idx1 = findidx (&cp);
3375       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3376         /* This isn't a valid character.  */
3377         return REG_ECOLLATE;
3378
3379       /* Build single byte matcing table for this equivalence class.  */
3380       char_buf[1] = (unsigned char) '\0';
3381       len = weights[idx1 & 0xffffff];
3382       for (ch = 0; ch < SBC_MAX; ++ch)
3383         {
3384           char_buf[0] = ch;
3385           cp = char_buf;
3386           idx2 = findidx (&cp);
3387 /*
3388           idx2 = table[ch];
3389 */
3390           if (idx2 == 0)
3391             /* This isn't a valid character.  */
3392             continue;
3393           /* Compare only if the length matches and the collation rule
3394              index is the same.  */
3395           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3396             {
3397               int cnt = 0;
3398
3399               while (cnt <= len &&
3400                      weights[(idx1 & 0xffffff) + 1 + cnt]
3401                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3402                 ++cnt;
3403
3404               if (cnt > len)
3405                 bitset_set (sbcset, ch);
3406             }
3407         }
3408       /* Check whether the array has enough space.  */
3409       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3410         {
3411           /* Not enough, realloc it.  */
3412           /* +1 in case of mbcset->nequiv_classes is 0.  */
3413           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3414           /* Use realloc since the array is NULL if *alloc == 0.  */
3415           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3416                                                    int32_t,
3417                                                    new_equiv_class_alloc);
3418           if (BE (new_equiv_classes == NULL, 0))
3419             return REG_ESPACE;
3420           mbcset->equiv_classes = new_equiv_classes;
3421           *equiv_class_alloc = new_equiv_class_alloc;
3422         }
3423       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3424     }
3425   else
3426 #endif /* _LIBC */
3427     {
3428       if (BE (strlen ((const char *) name) != 1, 0))
3429         return REG_ECOLLATE;
3430       bitset_set (sbcset, *name);
3431     }
3432   return REG_NOERROR;
3433 }
3434
3435   /* Helper function for parse_bracket_exp.
3436      Build the character class which is represented by NAME.
3437      The result are written to MBCSET and SBCSET.
3438      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3439      is a pointer argument sinse we may update it.  */
3440
3441 static reg_errcode_t
3442 #ifdef RE_ENABLE_I18N
3443 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3444                  re_charset_t *mbcset, int *char_class_alloc,
3445                  const unsigned char *class_name, reg_syntax_t syntax)
3446 #else /* not RE_ENABLE_I18N */
3447 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3448                  const unsigned char *class_name, reg_syntax_t syntax)
3449 #endif /* not RE_ENABLE_I18N */
3450 {
3451   int i;
3452   const char *name = (const char *) class_name;
3453
3454   /* In case of REG_ICASE "upper" and "lower" match the both of
3455      upper and lower cases.  */
3456   if ((syntax & RE_ICASE)
3457       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3458     name = "alpha";
3459
3460 #ifdef RE_ENABLE_I18N
3461   /* Check the space of the arrays.  */
3462   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3463     {
3464       /* Not enough, realloc it.  */
3465       /* +1 in case of mbcset->nchar_classes is 0.  */
3466       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3467       /* Use realloc since array is NULL if *alloc == 0.  */
3468       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3469                                                new_char_class_alloc);
3470       if (BE (new_char_classes == NULL, 0))
3471         return REG_ESPACE;
3472       mbcset->char_classes = new_char_classes;
3473       *char_class_alloc = new_char_class_alloc;
3474     }
3475   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3476 #endif /* RE_ENABLE_I18N */
3477
3478 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3479   do {                                          \
3480     if (BE (trans != NULL, 0))                  \
3481       {                                         \
3482         for (i = 0; i < SBC_MAX; ++i)           \
3483           if (ctype_func (i))                   \
3484             bitset_set (sbcset, trans[i]);      \
3485       }                                         \
3486     else                                        \
3487       {                                         \
3488         for (i = 0; i < SBC_MAX; ++i)           \
3489           if (ctype_func (i))                   \
3490             bitset_set (sbcset, i);             \
3491       }                                         \
3492   } while (0)
3493
3494   if (strcmp (name, "alnum") == 0)
3495     BUILD_CHARCLASS_LOOP (isalnum);
3496   else if (strcmp (name, "cntrl") == 0)
3497     BUILD_CHARCLASS_LOOP (iscntrl);
3498   else if (strcmp (name, "lower") == 0)
3499     BUILD_CHARCLASS_LOOP (islower);
3500   else if (strcmp (name, "space") == 0)
3501     BUILD_CHARCLASS_LOOP (isspace);
3502   else if (strcmp (name, "alpha") == 0)
3503     BUILD_CHARCLASS_LOOP (isalpha);
3504   else if (strcmp (name, "digit") == 0)
3505     BUILD_CHARCLASS_LOOP (isdigit);
3506   else if (strcmp (name, "print") == 0)
3507     BUILD_CHARCLASS_LOOP (isprint);
3508   else if (strcmp (name, "upper") == 0)
3509     BUILD_CHARCLASS_LOOP (isupper);
3510   else if (strcmp (name, "blank") == 0)
3511     BUILD_CHARCLASS_LOOP (isblank);
3512   else if (strcmp (name, "graph") == 0)
3513     BUILD_CHARCLASS_LOOP (isgraph);
3514   else if (strcmp (name, "punct") == 0)
3515     BUILD_CHARCLASS_LOOP (ispunct);
3516   else if (strcmp (name, "xdigit") == 0)
3517     BUILD_CHARCLASS_LOOP (isxdigit);
3518   else
3519     return REG_ECTYPE;
3520
3521   return REG_NOERROR;
3522 }
3523
3524 static bin_tree_t *
3525 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3526                     const unsigned char *class_name,
3527                     const unsigned char *extra, int non_match,
3528                     reg_errcode_t *err)
3529 {
3530   re_bitset_ptr_t sbcset;
3531 #ifdef RE_ENABLE_I18N
3532   re_charset_t *mbcset;
3533   int alloc = 0;
3534 #endif /* not RE_ENABLE_I18N */
3535   reg_errcode_t ret;
3536   re_token_t br_token;
3537   bin_tree_t *tree;
3538
3539   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3540 #ifdef RE_ENABLE_I18N
3541   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3542 #endif /* RE_ENABLE_I18N */
3543
3544 #ifdef RE_ENABLE_I18N
3545   if (BE (sbcset == NULL || mbcset == NULL, 0))
3546 #else /* not RE_ENABLE_I18N */
3547   if (BE (sbcset == NULL, 0))
3548 #endif /* not RE_ENABLE_I18N */
3549     {
3550       *err = REG_ESPACE;
3551       return NULL;
3552     }
3553
3554   if (non_match)
3555     {
3556 #ifdef RE_ENABLE_I18N
3557       mbcset->non_match = 1;
3558 #endif /* not RE_ENABLE_I18N */
3559     }
3560
3561   /* We don't care the syntax in this case.  */
3562   ret = build_charclass (trans, sbcset,
3563 #ifdef RE_ENABLE_I18N
3564                          mbcset, &alloc,
3565 #endif /* RE_ENABLE_I18N */
3566                          class_name, 0);
3567
3568   if (BE (ret != REG_NOERROR, 0))
3569     {
3570       re_free (sbcset);
3571 #ifdef RE_ENABLE_I18N
3572       free_charset (mbcset);
3573 #endif /* RE_ENABLE_I18N */
3574       *err = ret;
3575       return NULL;
3576     }
3577   /* \w match '_' also.  */
3578   for (; *extra; extra++)
3579     bitset_set (sbcset, *extra);
3580
3581   /* If it is non-matching list.  */
3582   if (non_match)
3583     bitset_not (sbcset);
3584
3585 #ifdef RE_ENABLE_I18N
3586   /* Ensure only single byte characters are set.  */
3587   if (dfa->mb_cur_max > 1)
3588     bitset_mask (sbcset, dfa->sb_char);
3589 #endif
3590
3591   /* Build a tree for simple bracket.  */
3592   br_token.type = SIMPLE_BRACKET;
3593   br_token.opr.sbcset = sbcset;
3594   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3595   if (BE (tree == NULL, 0))
3596     goto build_word_op_espace;
3597
3598 #ifdef RE_ENABLE_I18N
3599   if (dfa->mb_cur_max > 1)
3600     {
3601       bin_tree_t *mbc_tree;
3602       /* Build a tree for complex bracket.  */
3603       br_token.type = COMPLEX_BRACKET;
3604       br_token.opr.mbcset = mbcset;
3605       dfa->has_mb_node = 1;
3606       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3607       if (BE (mbc_tree == NULL, 0))
3608         goto build_word_op_espace;
3609       /* Then join them by ALT node.  */
3610       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3611       if (BE (mbc_tree != NULL, 1))
3612         return tree;
3613     }
3614   else
3615     {
3616       free_charset (mbcset);
3617       return tree;
3618     }
3619 #else /* not RE_ENABLE_I18N */
3620   return tree;
3621 #endif /* not RE_ENABLE_I18N */
3622
3623  build_word_op_espace:
3624   re_free (sbcset);
3625 #ifdef RE_ENABLE_I18N
3626   free_charset (mbcset);
3627 #endif /* RE_ENABLE_I18N */
3628   *err = REG_ESPACE;
3629   return NULL;
3630 }
3631
3632 /* This is intended for the expressions like "a{1,3}".
3633    Fetch a number from `input', and return the number.
3634    Return -1, if the number field is empty like "{,1}".
3635    Return -2, If an error is occured.  */
3636
3637 static int
3638 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3639 {
3640   int num = -1;
3641   unsigned char c;
3642   while (1)
3643     {
3644       fetch_token (token, input, syntax);
3645       c = token->opr.c;
3646       if (BE (token->type == END_OF_RE, 0))
3647         return -2;
3648       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3649         break;
3650       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3651              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3652       num = (num > RE_DUP_MAX) ? -2 : num;
3653     }
3654   return num;
3655 }
3656 \f
3657 #ifdef RE_ENABLE_I18N
3658 static void
3659 free_charset (re_charset_t *cset)
3660 {
3661   re_free (cset->mbchars);
3662 # ifdef _LIBC
3663   re_free (cset->coll_syms);
3664   re_free (cset->equiv_classes);
3665   re_free (cset->range_starts);
3666   re_free (cset->range_ends);
3667 # endif
3668   re_free (cset->char_classes);
3669   re_free (cset);
3670 }
3671 #endif /* RE_ENABLE_I18N */
3672 \f
3673 /* Functions for binary tree operation.  */
3674
3675 /* Create a tree node.  */
3676
3677 static bin_tree_t *
3678 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3679              re_token_type_t type)
3680 {
3681   re_token_t t;
3682   t.type = type;
3683   return create_token_tree (dfa, left, right, &t);
3684 }
3685
3686 static bin_tree_t *
3687 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3688                    const re_token_t *token)
3689 {
3690   bin_tree_t *tree;
3691   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3692     {
3693       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3694
3695       if (storage == NULL)
3696         return NULL;
3697       storage->next = dfa->str_tree_storage;
3698       dfa->str_tree_storage = storage;
3699       dfa->str_tree_storage_idx = 0;
3700     }
3701   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3702
3703   tree->parent = NULL;
3704   tree->left = left;
3705   tree->right = right;
3706   tree->token = *token;
3707   tree->token.duplicated = 0;
3708   tree->token.opt_subexp = 0;
3709   tree->first = NULL;
3710   tree->next = NULL;
3711   tree->node_idx = -1;
3712
3713   if (left != NULL)
3714     left->parent = tree;
3715   if (right != NULL)
3716     right->parent = tree;
3717   return tree;
3718 }
3719
3720 /* Mark the tree SRC as an optional subexpression.
3721    To be called from preorder or postorder.  */
3722
3723 static reg_errcode_t
3724 mark_opt_subexp (void *extra, bin_tree_t *node)
3725 {
3726   int idx = (int) (long) extra;
3727   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3728     node->token.opt_subexp = 1;
3729
3730   return REG_NOERROR;
3731 }
3732
3733 /* Free the allocated memory inside NODE. */
3734
3735 static void
3736 free_token (re_token_t *node)
3737 {
3738 #ifdef RE_ENABLE_I18N
3739   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3740     free_charset (node->opr.mbcset);
3741   else
3742 #endif /* RE_ENABLE_I18N */
3743     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3744       re_free (node->opr.sbcset);
3745 }
3746
3747 /* Worker function for tree walking.  Free the allocated memory inside NODE
3748    and its children. */
3749
3750 static reg_errcode_t
3751 free_tree (void *extra, bin_tree_t *node)
3752 {
3753   free_token (&node->token);
3754   return REG_NOERROR;
3755 }
3756
3757
3758 /* Duplicate the node SRC, and return new node.  This is a preorder
3759    visit similar to the one implemented by the generic visitor, but
3760    we need more infrastructure to maintain two parallel trees --- so,
3761    it's easier to duplicate.  */
3762
3763 static bin_tree_t *
3764 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3765 {
3766   const bin_tree_t *node;
3767   bin_tree_t *dup_root;
3768   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3769
3770   for (node = root; ; )
3771     {
3772       /* Create a new tree and link it back to the current parent.  */
3773       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3774       if (*p_new == NULL)
3775         return NULL;
3776       (*p_new)->parent = dup_node;
3777       (*p_new)->token.duplicated = 1;
3778       dup_node = *p_new;
3779
3780       /* Go to the left node, or up and to the right.  */
3781       if (node->left)
3782         {
3783           node = node->left;
3784           p_new = &dup_node->left;
3785         }
3786       else
3787         {
3788           const bin_tree_t *prev = NULL;
3789           while (node->right == prev || node->right == NULL)
3790             {
3791               prev = node;
3792               node = node->parent;
3793               dup_node = dup_node->parent;
3794               if (!node)
3795                 return dup_root;
3796             }
3797           node = node->right;
3798           p_new = &dup_node->right;
3799         }
3800     }
3801 }