posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           int length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  27 static void init_word_char (re_dfa_t *dfa);
  28 #ifdef RE_ENABLE_I18N
  29 static void free_charset (re_charset_t *cset);
  30 #endif /* RE_ENABLE_I18N */
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 #ifdef RE_ENABLE_I18N
  34 static void optimize_utf8 (re_dfa_t *dfa);
  35 #endif
  36 static reg_errcode_t analyze (re_dfa_t *dfa);
  37 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
  38 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
  39 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
  40 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
  41 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
  42                                              int top_clone_node, int root_node,
  43                                              unsigned int constraint);
  44 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
  45                                      unsigned int constraint);
  46 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
  47                                    unsigned int constraint);
  48 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  49 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  50                                          int node, int root);
  51 static void calc_inveclosure (re_dfa_t *dfa);
  52 static int fetch_number (re_string_t *input, re_token_t *token,
  53                          reg_syntax_t syntax);
  54 static void fetch_token (re_token_t *result, re_string_t *input,
  55                          reg_syntax_t syntax);
  56 static int peek_token (re_token_t *token, re_string_t *input,
  57                         reg_syntax_t syntax);
  58 static int peek_token_bracket (re_token_t *token, re_string_t *input,
  59                                reg_syntax_t syntax);
  60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  61                           reg_syntax_t syntax, reg_errcode_t *err);
  62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  63                                   re_token_t *token, reg_syntax_t syntax,
  64                                   int nest, reg_errcode_t *err);
  65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  66                                  re_token_t *token, reg_syntax_t syntax,
  67                                  int nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  69                                      re_token_t *token, reg_syntax_t syntax,
  70                                      int nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  72                                   re_token_t *token, reg_syntax_t syntax,
  73                                   int nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  75                                  re_dfa_t *dfa, re_token_t *token,
  76                                  reg_syntax_t syntax, reg_errcode_t *err);
  77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  78                                       re_token_t *token, reg_syntax_t syntax,
  79                                       reg_errcode_t *err);
  80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  81                                             re_string_t *regexp,
  82                                             re_token_t *token, int token_len,
  83                                             re_dfa_t *dfa,
  84                                             reg_syntax_t syntax,
  85                                             int accept_hyphen);
  86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  87                                           re_string_t *regexp,
  88                                           re_token_t *token);
  89 #ifndef _LIBC
  90 # ifdef RE_ENABLE_I18N
  91 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
  92                                       re_charset_t *mbcset, int *range_alloc,
  93                                       bracket_elem_t *start_elem,
  94                                       bracket_elem_t *end_elem);
  95 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
  96                                              re_charset_t *mbcset,
  97                                              int *coll_sym_alloc,
  98                                              const unsigned char *name);
  99 # else /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 101                                       bracket_elem_t *start_elem,
 102                                       bracket_elem_t *end_elem);
 103 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 104                                              const unsigned char *name);
 105 # endif /* not RE_ENABLE_I18N */
 106 #endif /* not _LIBC */
 107 #ifdef RE_ENABLE_I18N
 108 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 109                                         re_charset_t *mbcset,
 110                                         int *equiv_class_alloc,
 111                                         const unsigned char *name);
 112 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 113                                       re_bitset_ptr_t sbcset,
 114                                       re_charset_t *mbcset,
 115                                       int *char_class_alloc,
 116                                       const unsigned char *class_name,
 117                                       reg_syntax_t syntax);
 118 #else  /* not RE_ENABLE_I18N */
 119 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 120                                         const unsigned char *name);
 121 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 122                                       re_bitset_ptr_t sbcset,
 123                                       const unsigned char *class_name,
 124                                       reg_syntax_t syntax);
 125 #endif /* not RE_ENABLE_I18N */
 126 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 127                                        unsigned RE_TRANSLATE_TYPE trans,
 128                                        const unsigned char *class_name,
 129                                        const unsigned char *extra,
 130                                        int non_match, reg_errcode_t *err);
 131 static bin_tree_t *create_tree (re_dfa_t *dfa,
 132                                 bin_tree_t *left, bin_tree_t *right,
 133                                 re_token_type_t type, int index);
 134 static bin_tree_t *re_dfa_add_tree_node (re_dfa_t *dfa,
 135                                          bin_tree_t *left, bin_tree_t *right,
 136                                          const re_token_t *token)
 137   __attribute ((noinline));
 138 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 139 static void mark_opt_subexp (const bin_tree_t *src, re_dfa_t *dfa);
 140 static void mark_opt_subexp_iter (const bin_tree_t *src, re_dfa_t *dfa, int idx);
 141 \f
 142 /* This table gives an error message for each of the error codes listed
 143    in regex.h.  Obviously the order here has to be same as there.
 144    POSIX doesn't require that we do anything for REG_NOERROR,
 145    but why not be nice?  */
 146
 147 const char __re_error_msgid[] attribute_hidden =
 148   {
 149 #define REG_NOERROR_IDX 0
 150     gettext_noop ("Success")    /* REG_NOERROR */
 151     "\0"
 152 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 153     gettext_noop ("No match")   /* REG_NOMATCH */
 154     "\0"
 155 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 156     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 157     "\0"
 158 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 159     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 160     "\0"
 161 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 162     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 163     "\0"
 164 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 165     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 166     "\0"
 167 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 168     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 169     "\0"
 170 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 171     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 172     "\0"
 173 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 174     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 175     "\0"
 176 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 177     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 178     "\0"
 179 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 180     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 181     "\0"
 182 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 183     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 184     "\0"
 185 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 186     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 187     "\0"
 188 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 189     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 190     "\0"
 191 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 192     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 193     "\0"
 194 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 195     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 196     "\0"
 197 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 198     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 199   };
 200
 201 const size_t __re_error_msgid_idx[] attribute_hidden =
 202   {
 203     REG_NOERROR_IDX,
 204     REG_NOMATCH_IDX,
 205     REG_BADPAT_IDX,
 206     REG_ECOLLATE_IDX,
 207     REG_ECTYPE_IDX,
 208     REG_EESCAPE_IDX,
 209     REG_ESUBREG_IDX,
 210     REG_EBRACK_IDX,
 211     REG_EPAREN_IDX,
 212     REG_EBRACE_IDX,
 213     REG_BADBR_IDX,
 214     REG_ERANGE_IDX,
 215     REG_ESPACE_IDX,
 216     REG_BADRPT_IDX,
 217     REG_EEND_IDX,
 218     REG_ESIZE_IDX,
 219     REG_ERPAREN_IDX
 220   };
 221 \f
 222 /* Entry points for GNU code.  */
 223
 224 /* re_compile_pattern is the GNU regular expression compiler: it
 225    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 226    Returns 0 if the pattern was valid, otherwise an error string.
 227
 228    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 229    are set in BUFP on entry.  */
 230
 231 const char *
 232 re_compile_pattern (pattern, length, bufp)
 233     const char *pattern;
 234     size_t length;
 235     struct re_pattern_buffer *bufp;
 236 {
 237   reg_errcode_t ret;
 238
 239   /* And GNU code determines whether or not to get register information
 240      by passing null for the REGS argument to re_match, etc., not by
 241      setting no_sub.  */
 242   bufp->no_sub = 0;
 243
 244   /* Match anchors at newline.  */
 245   bufp->newline_anchor = 1;
 246
 247   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 248
 249   if (!ret)
 250     return NULL;
 251   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 252 }
 253 #ifdef _LIBC
 254 weak_alias (__re_compile_pattern, re_compile_pattern)
 255 #endif
 256
 257 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 258    also be assigned to arbitrarily: each pattern buffer stores its own
 259    syntax, so it can be changed between regex compilations.  */
 260 /* This has no initializer because initialized variables in Emacs
 261    become read-only after dumping.  */
 262 reg_syntax_t re_syntax_options;
 263
 264
 265 /* Specify the precise syntax of regexps for compilation.  This provides
 266    for compatibility for various utilities which historically have
 267    different, incompatible syntaxes.
 268
 269    The argument SYNTAX is a bit mask comprised of the various bits
 270    defined in regex.h.  We return the old syntax.  */
 271
 272 reg_syntax_t
 273 re_set_syntax (syntax)
 274     reg_syntax_t syntax;
 275 {
 276   reg_syntax_t ret = re_syntax_options;
 277
 278   re_syntax_options = syntax;
 279   return ret;
 280 }
 281 #ifdef _LIBC
 282 weak_alias (__re_set_syntax, re_set_syntax)
 283 #endif
 284
 285 int
 286 re_compile_fastmap (bufp)
 287     struct re_pattern_buffer *bufp;
 288 {
 289   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 290   char *fastmap = bufp->fastmap;
 291
 292   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 293   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 294   if (dfa->init_state != dfa->init_state_word)
 295     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 296   if (dfa->init_state != dfa->init_state_nl)
 297     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 298   if (dfa->init_state != dfa->init_state_begbuf)
 299     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 300   bufp->fastmap_accurate = 1;
 301   return 0;
 302 }
 303 #ifdef _LIBC
 304 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 305 #endif
 306
 307 static inline void
 308 __attribute ((always_inline))
 309 re_set_fastmap (char *fastmap, int icase, int ch)
 310 {
 311   fastmap[ch] = 1;
 312   if (icase)
 313     fastmap[tolower (ch)] = 1;
 314 }
 315
 316 /* Helper function for re_compile_fastmap.
 317    Compile fastmap for the initial_state INIT_STATE.  */
 318
 319 static void
 320 re_compile_fastmap_iter (bufp, init_state, fastmap)
 321      regex_t *bufp;
 322      const re_dfastate_t *init_state;
 323      char *fastmap;
 324 {
 325   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 326   int node_cnt;
 327   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 328   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 329     {
 330       int node = init_state->nodes.elems[node_cnt];
 331       re_token_type_t type = dfa->nodes[node].type;
 332
 333       if (type == CHARACTER)
 334         {
 335           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 336 #ifdef RE_ENABLE_I18N
 337           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 338             {
 339               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 340               wchar_t wc;
 341               mbstate_t state;
 342
 343               p = buf;
 344               *p++ = dfa->nodes[node].opr.c;
 345               while (++node < dfa->nodes_len
 346                      && dfa->nodes[node].type == CHARACTER
 347                      && dfa->nodes[node].mb_partial)
 348                 *p++ = dfa->nodes[node].opr.c;
 349               memset (&state, 0, sizeof (state));
 350               if (mbrtowc (&wc, (const char *) buf, p - buf,
 351                            &state) == p - buf
 352                   && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
 353                 re_set_fastmap (fastmap, 0, buf[0]);
 354             }
 355 #endif
 356         }
 357       else if (type == SIMPLE_BRACKET)
 358         {
 359           int i, j, ch;
 360           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 361             for (j = 0; j < UINT_BITS; ++j, ++ch)
 362               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 363                 re_set_fastmap (fastmap, icase, ch);
 364         }
 365 #ifdef RE_ENABLE_I18N
 366       else if (type == COMPLEX_BRACKET)
 367         {
 368           int i;
 369           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 370           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 371               || cset->nranges || cset->nchar_classes)
 372             {
 373 # ifdef _LIBC
 374               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 375                 {
 376                   /* In this case we want to catch the bytes which are
 377                      the first byte of any collation elements.
 378                      e.g. In da_DK, we want to catch 'a' since "aa"
 379                           is a valid collation element, and don't catch
 380                           'b' since 'b' is the only collation element
 381                           which starts from 'b'.  */
 382                   int j, ch;
 383                   const int32_t *table = (const int32_t *)
 384                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 385                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 386                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 387                       if (table[ch] < 0)
 388                         re_set_fastmap (fastmap, icase, ch);
 389                 }
 390 # else
 391               if (dfa->mb_cur_max > 1)
 392                 for (i = 0; i < SBC_MAX; ++i)
 393                   if (__btowc (i) == WEOF)
 394                     re_set_fastmap (fastmap, icase, i);
 395 # endif /* not _LIBC */
 396             }
 397           for (i = 0; i < cset->nmbchars; ++i)
 398             {
 399               char buf[256];
 400               mbstate_t state;
 401               memset (&state, '\0', sizeof (state));
 402               __wcrtomb (buf, cset->mbchars[i], &state);
 403               re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 404               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 405                 {
 406                   __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
 407                   re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 408                 }
 409             }
 410         }
 411 #endif /* RE_ENABLE_I18N */
 412       else if (type == OP_PERIOD
 413 #ifdef RE_ENABLE_I18N
 414                || type == OP_UTF8_PERIOD
 415 #endif /* RE_ENABLE_I18N */
 416                || type == END_OF_RE)
 417         {
 418           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 419           if (type == END_OF_RE)
 420             bufp->can_be_null = 1;
 421           return;
 422         }
 423     }
 424 }
 425 \f
 426 /* Entry point for POSIX code.  */
 427 /* regcomp takes a regular expression as a string and compiles it.
 428
 429    PREG is a regex_t *.  We do not expect any fields to be initialized,
 430    since POSIX says we shouldn't.  Thus, we set
 431
 432      `buffer' to the compiled pattern;
 433      `used' to the length of the compiled pattern;
 434      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 435        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 436        RE_SYNTAX_POSIX_BASIC;
 437      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 438      `fastmap' to an allocated space for the fastmap;
 439      `fastmap_accurate' to zero;
 440      `re_nsub' to the number of subexpressions in PATTERN.
 441
 442    PATTERN is the address of the pattern string.
 443
 444    CFLAGS is a series of bits which affect compilation.
 445
 446      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 447      use POSIX basic syntax.
 448
 449      If REG_NEWLINE is set, then . and [^...] don't match newline.
 450      Also, regexec will try a match beginning after every newline.
 451
 452      If REG_ICASE is set, then we considers upper- and lowercase
 453      versions of letters to be equivalent when matching.
 454
 455      If REG_NOSUB is set, then when PREG is passed to regexec, that
 456      routine will report only success or failure, and nothing about the
 457      registers.
 458
 459    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 460    the return codes and their meanings.)  */
 461
 462 int
 463 regcomp (preg, pattern, cflags)
 464     regex_t *__restrict preg;
 465     const char *__restrict pattern;
 466     int cflags;
 467 {
 468   reg_errcode_t ret;
 469   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 470                          : RE_SYNTAX_POSIX_BASIC);
 471
 472   preg->buffer = NULL;
 473   preg->allocated = 0;
 474   preg->used = 0;
 475
 476   /* Try to allocate space for the fastmap.  */
 477   preg->fastmap = re_malloc (char, SBC_MAX);
 478   if (BE (preg->fastmap == NULL, 0))
 479     return REG_ESPACE;
 480
 481   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 482
 483   /* If REG_NEWLINE is set, newlines are treated differently.  */
 484   if (cflags & REG_NEWLINE)
 485     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 486       syntax &= ~RE_DOT_NEWLINE;
 487       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 488       /* It also changes the matching behavior.  */
 489       preg->newline_anchor = 1;
 490     }
 491   else
 492     preg->newline_anchor = 0;
 493   preg->no_sub = !!(cflags & REG_NOSUB);
 494   preg->translate = NULL;
 495
 496   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 497
 498   /* POSIX doesn't distinguish between an unmatched open-group and an
 499      unmatched close-group: both are REG_EPAREN.  */
 500   if (ret == REG_ERPAREN)
 501     ret = REG_EPAREN;
 502
 503   /* We have already checked preg->fastmap != NULL.  */
 504   if (BE (ret == REG_NOERROR, 1))
 505     /* Compute the fastmap now, since regexec cannot modify the pattern
 506        buffer.  This function never fails in this implementation.  */
 507     (void) re_compile_fastmap (preg);
 508   else
 509     {
 510       /* Some error occurred while compiling the expression.  */
 511       re_free (preg->fastmap);
 512       preg->fastmap = NULL;
 513     }
 514
 515   return (int) ret;
 516 }
 517 #ifdef _LIBC
 518 weak_alias (__regcomp, regcomp)
 519 #endif
 520
 521 /* Returns a message corresponding to an error code, ERRCODE, returned
 522    from either regcomp or regexec.   We don't use PREG here.  */
 523
 524 size_t
 525 regerror (errcode, preg, errbuf, errbuf_size)
 526     int errcode;
 527     const regex_t *preg;
 528     char *errbuf;
 529     size_t errbuf_size;
 530 {
 531   const char *msg;
 532   size_t msg_size;
 533
 534   if (BE (errcode < 0
 535           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 536                                / sizeof (__re_error_msgid_idx[0])), 0))
 537     /* Only error codes returned by the rest of the code should be passed
 538        to this routine.  If we are given anything else, or if other regex
 539        code generates an invalid error code, then the program has a bug.
 540        Dump core so we can fix it.  */
 541     abort ();
 542
 543   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 544
 545   msg_size = strlen (msg) + 1; /* Includes the null.  */
 546
 547   if (BE (errbuf_size != 0, 1))
 548     {
 549       if (BE (msg_size > errbuf_size, 0))
 550         {
 551 #if defined HAVE_MEMPCPY || defined _LIBC
 552           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 553 #else
 554           memcpy (errbuf, msg, errbuf_size - 1);
 555           errbuf[errbuf_size - 1] = 0;
 556 #endif
 557         }
 558       else
 559         memcpy (errbuf, msg, msg_size);
 560     }
 561
 562   return msg_size;
 563 }
 564 #ifdef _LIBC
 565 weak_alias (__regerror, regerror)
 566 #endif
 567
 568
 569 static void
 570 free_dfa_content (re_dfa_t *dfa)
 571 {
 572   int i, j;
 573
 574   re_free (dfa->subexps);
 575
 576   if (dfa->nodes)
 577     for (i = 0; i < dfa->nodes_len; ++i)
 578       {
 579         re_token_t *node = dfa->nodes + i;
 580 #ifdef RE_ENABLE_I18N
 581         if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
 582           free_charset (node->opr.mbcset);
 583         else
 584 #endif /* RE_ENABLE_I18N */
 585           if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
 586             re_free (node->opr.sbcset);
 587       }
 588   re_free (dfa->nexts);
 589   for (i = 0; i < dfa->nodes_len; ++i)
 590     {
 591       if (dfa->eclosures != NULL)
 592         re_node_set_free (dfa->eclosures + i);
 593       if (dfa->inveclosures != NULL)
 594         re_node_set_free (dfa->inveclosures + i);
 595       if (dfa->edests != NULL)
 596         re_node_set_free (dfa->edests + i);
 597     }
 598   re_free (dfa->edests);
 599   re_free (dfa->eclosures);
 600   re_free (dfa->inveclosures);
 601   re_free (dfa->nodes);
 602
 603   if (dfa->state_table)
 604     for (i = 0; i <= dfa->state_hash_mask; ++i)
 605       {
 606         struct re_state_table_entry *entry = dfa->state_table + i;
 607         for (j = 0; j < entry->num; ++j)
 608           {
 609             re_dfastate_t *state = entry->array[j];
 610             free_state (state);
 611           }
 612         re_free (entry->array);
 613       }
 614   re_free (dfa->state_table);
 615 #ifdef RE_ENABLE_I18N
 616   re_free (dfa->sb_char);
 617 #endif
 618 #ifdef DEBUG
 619   re_free (dfa->re_str);
 620 #endif
 621
 622   re_free (dfa);
 623 }
 624
 625
 626 /* Free dynamically allocated space used by PREG.  */
 627
 628 void
 629 regfree (preg)
 630     regex_t *preg;
 631 {
 632   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 633   if (BE (dfa != NULL, 1))
 634     free_dfa_content (dfa);
 635   preg->buffer = NULL;
 636   preg->allocated = 0;
 637
 638   re_free (preg->fastmap);
 639   preg->fastmap = NULL;
 640
 641   re_free (preg->translate);
 642   preg->translate = NULL;
 643 }
 644 #ifdef _LIBC
 645 weak_alias (__regfree, regfree)
 646 #endif
 647 \f
 648 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 649    them unless specifically requested.  */
 650
 651 #if defined _REGEX_RE_COMP || defined _LIBC
 652
 653 /* BSD has one and only one pattern buffer.  */
 654 static struct re_pattern_buffer re_comp_buf;
 655
 656 char *
 657 # ifdef _LIBC
 658 /* Make these definitions weak in libc, so POSIX programs can redefine
 659    these names if they don't use our functions, and still use
 660    regcomp/regexec above without link errors.  */
 661 weak_function
 662 # endif
 663 re_comp (s)
 664      const char *s;
 665 {
 666   reg_errcode_t ret;
 667   char *fastmap;
 668
 669   if (!s)
 670     {
 671       if (!re_comp_buf.buffer)
 672         return gettext ("No previous regular expression");
 673       return 0;
 674     }
 675
 676   if (re_comp_buf.buffer)
 677     {
 678       fastmap = re_comp_buf.fastmap;
 679       re_comp_buf.fastmap = NULL;
 680       __regfree (&re_comp_buf);
 681       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 682       re_comp_buf.fastmap = fastmap;
 683     }
 684
 685   if (re_comp_buf.fastmap == NULL)
 686     {
 687       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 688       if (re_comp_buf.fastmap == NULL)
 689         return (char *) gettext (__re_error_msgid
 690                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 691     }
 692
 693   /* Since `re_exec' always passes NULL for the `regs' argument, we
 694      don't need to initialize the pattern buffer fields which affect it.  */
 695
 696   /* Match anchors at newlines.  */
 697   re_comp_buf.newline_anchor = 1;
 698
 699   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 700
 701   if (!ret)
 702     return NULL;
 703
 704   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 705   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 706 }
 707
 708 #ifdef _LIBC
 709 libc_freeres_fn (free_mem)
 710 {
 711   __regfree (&re_comp_buf);
 712 }
 713 #endif
 714
 715 #endif /* _REGEX_RE_COMP */
 716 \f
 717 /* Internal entry point.
 718    Compile the regular expression PATTERN, whose length is LENGTH.
 719    SYNTAX indicate regular expression's syntax.  */
 720
 721 static reg_errcode_t
 722 re_compile_internal (preg, pattern, length, syntax)
 723      regex_t *preg;
 724      const char * pattern;
 725      int length;
 726      reg_syntax_t syntax;
 727 {
 728   reg_errcode_t err = REG_NOERROR;
 729   re_dfa_t *dfa;
 730   re_string_t regexp;
 731
 732   /* Initialize the pattern buffer.  */
 733   preg->fastmap_accurate = 0;
 734   preg->syntax = syntax;
 735   preg->not_bol = preg->not_eol = 0;
 736   preg->used = 0;
 737   preg->re_nsub = 0;
 738   preg->can_be_null = 0;
 739   preg->regs_allocated = REGS_UNALLOCATED;
 740
 741   /* Initialize the dfa.  */
 742   dfa = (re_dfa_t *) preg->buffer;
 743   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 744     {
 745       /* If zero allocated, but buffer is non-null, try to realloc
 746          enough space.  This loses if buffer's address is bogus, but
 747          that is the user's responsibility.  If ->buffer is NULL this
 748          is a simple allocation.  */
 749       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 750       if (dfa == NULL)
 751         return REG_ESPACE;
 752       preg->allocated = sizeof (re_dfa_t);
 753       preg->buffer = (unsigned char *) dfa;
 754     }
 755   preg->used = sizeof (re_dfa_t);
 756
 757   err = init_dfa (dfa, length);
 758   if (BE (err != REG_NOERROR, 0))
 759     {
 760       free_dfa_content (dfa);
 761       preg->buffer = NULL;
 762       preg->allocated = 0;
 763       return err;
 764     }
 765 #ifdef DEBUG
 766   dfa->re_str = re_malloc (char, length + 1);
 767   strncpy (dfa->re_str, pattern, length + 1);
 768 #endif
 769
 770   err = re_string_construct (&regexp, pattern, length, preg->translate,
 771                              syntax & RE_ICASE, dfa);
 772   if (BE (err != REG_NOERROR, 0))
 773     {
 774     re_compile_internal_free_return:
 775       free_workarea_compile (preg);
 776       re_string_destruct (&regexp);
 777       free_dfa_content (dfa);
 778       preg->buffer = NULL;
 779       preg->allocated = 0;
 780       return err;
 781     }
 782
 783   /* Parse the regular expression, and build a structure tree.  */
 784   preg->re_nsub = 0;
 785   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 786   if (BE (dfa->str_tree == NULL, 0))
 787     goto re_compile_internal_free_return;
 788
 789 #ifdef RE_ENABLE_I18N
 790   /* If possible, do searching in single byte encoding to speed things up.  */
 791   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 792     optimize_utf8 (dfa);
 793 #endif
 794
 795   /* Analyze the tree and collect information which is necessary to
 796      create the dfa.  */
 797   err = analyze (dfa);
 798   if (BE (err != REG_NOERROR, 0))
 799     goto re_compile_internal_free_return;
 800
 801   /* Then create the initial state of the dfa.  */
 802   err = create_initial_state (dfa);
 803
 804   /* Release work areas.  */
 805   free_workarea_compile (preg);
 806   re_string_destruct (&regexp);
 807
 808   if (BE (err != REG_NOERROR, 0))
 809     {
 810       free_dfa_content (dfa);
 811       preg->buffer = NULL;
 812       preg->allocated = 0;
 813     }
 814
 815   return err;
 816 }
 817
 818 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 819    as the initial length of some arrays.  */
 820
 821 static reg_errcode_t
 822 init_dfa (dfa, pat_len)
 823      re_dfa_t *dfa;
 824      int pat_len;
 825 {
 826   int table_size;
 827
 828   memset (dfa, '\0', sizeof (re_dfa_t));
 829
 830   /* Force allocation of str_tree_storage the first time.  */
 831   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 832
 833   dfa->nodes_alloc = pat_len + 1;
 834   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 835
 836   dfa->states_alloc = pat_len + 1;
 837
 838   /*  table_size = 2 ^ ceil(log pat_len) */
 839   for (table_size = 1; table_size > 0; table_size <<= 1)
 840     if (table_size > pat_len)
 841       break;
 842
 843   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 844   dfa->state_hash_mask = table_size - 1;
 845
 846   dfa->subexps_alloc = 1;
 847   dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
 848
 849   dfa->mb_cur_max = MB_CUR_MAX;
 850 #ifdef _LIBC
 851   if (dfa->mb_cur_max == 6
 852       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 853     dfa->is_utf8 = 1;
 854   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 855                        != 0);
 856 #endif
 857 #ifdef RE_ENABLE_I18N
 858   if (dfa->mb_cur_max > 1)
 859     {
 860       int i, j, ch;
 861
 862       dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 863       if (BE (dfa->sb_char == NULL, 0))
 864         return REG_ESPACE;
 865       if (dfa->is_utf8)
 866         memset (dfa->sb_char, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
 867       else
 868         for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 869           for (j = 0; j < UINT_BITS; ++j, ++ch)
 870             if (__btowc (ch) != WEOF)
 871               dfa->sb_char[i] |= 1 << j;
 872     }
 873 #endif
 874
 875   if (BE (dfa->nodes == NULL || dfa->state_table == NULL
 876           || dfa->subexps == NULL, 0))
 877     return REG_ESPACE;
 878   return REG_NOERROR;
 879 }
 880
 881 /* Initialize WORD_CHAR table, which indicate which character is
 882    "word".  In this case "word" means that it is the word construction
 883    character used by some operators like "\<", "\>", etc.  */
 884
 885 static void
 886 init_word_char (dfa)
 887      re_dfa_t *dfa;
 888 {
 889   int i, j, ch;
 890   dfa->word_ops_used = 1;
 891   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 892     for (j = 0; j < UINT_BITS; ++j, ++ch)
 893       if (isalnum (ch) || ch == '_')
 894         dfa->word_char[i] |= 1 << j;
 895 }
 896
 897 /* Free the work area which are only used while compiling.  */
 898
 899 static void
 900 free_workarea_compile (preg)
 901      regex_t *preg;
 902 {
 903   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 904   bin_tree_storage_t *storage, *next;
 905   for (storage = dfa->str_tree_storage; storage; storage = next)
 906     {
 907       next = storage->next;
 908       re_free (storage);
 909     }
 910   dfa->str_tree_storage = NULL;
 911   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 912   dfa->str_tree = NULL;
 913   re_free (dfa->org_indices);
 914   dfa->org_indices = NULL;
 915 }
 916
 917 /* Create initial states for all contexts.  */
 918
 919 static reg_errcode_t
 920 create_initial_state (dfa)
 921      re_dfa_t *dfa;
 922 {
 923   int first, i;
 924   reg_errcode_t err;
 925   re_node_set init_nodes;
 926
 927   /* Initial states have the epsilon closure of the node which is
 928      the first node of the regular expression.  */
 929   first = dfa->str_tree->first;
 930   dfa->init_node = first;
 931   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 932   if (BE (err != REG_NOERROR, 0))
 933     return err;
 934
 935   /* The back-references which are in initial states can epsilon transit,
 936      since in this case all of the subexpressions can be null.
 937      Then we add epsilon closures of the nodes which are the next nodes of
 938      the back-references.  */
 939   if (dfa->nbackref > 0)
 940     for (i = 0; i < init_nodes.nelem; ++i)
 941       {
 942         int node_idx = init_nodes.elems[i];
 943         re_token_type_t type = dfa->nodes[node_idx].type;
 944
 945         int clexp_idx;
 946         if (type != OP_BACK_REF)
 947           continue;
 948         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 949           {
 950             re_token_t *clexp_node;
 951             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 952             if (clexp_node->type == OP_CLOSE_SUBEXP
 953                 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
 954               break;
 955           }
 956         if (clexp_idx == init_nodes.nelem)
 957           continue;
 958
 959         if (type == OP_BACK_REF)
 960           {
 961             int dest_idx = dfa->edests[node_idx].elems[0];
 962             if (!re_node_set_contains (&init_nodes, dest_idx))
 963               {
 964                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
 965                 i = 0;
 966               }
 967           }
 968       }
 969
 970   /* It must be the first time to invoke acquire_state.  */
 971   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
 972   /* We don't check ERR here, since the initial state must not be NULL.  */
 973   if (BE (dfa->init_state == NULL, 0))
 974     return err;
 975   if (dfa->init_state->has_constraint)
 976     {
 977       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
 978                                                        CONTEXT_WORD);
 979       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
 980                                                      CONTEXT_NEWLINE);
 981       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
 982                                                          &init_nodes,
 983                                                          CONTEXT_NEWLINE
 984                                                          | CONTEXT_BEGBUF);
 985       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 986               || dfa->init_state_begbuf == NULL, 0))
 987         return err;
 988     }
 989   else
 990     dfa->init_state_word = dfa->init_state_nl
 991       = dfa->init_state_begbuf = dfa->init_state;
 992
 993   re_node_set_free (&init_nodes);
 994   return REG_NOERROR;
 995 }
 996 \f
 997 #ifdef RE_ENABLE_I18N
 998 /* If it is possible to do searching in single byte encoding instead of UTF-8
 999    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1000    DFA nodes where needed.  */
1001
1002 static void
1003 optimize_utf8 (dfa)
1004      re_dfa_t *dfa;
1005 {
1006   int node, i, mb_chars = 0, has_period = 0;
1007
1008   for (node = 0; node < dfa->nodes_len; ++node)
1009     switch (dfa->nodes[node].type)
1010       {
1011       case CHARACTER:
1012         if (dfa->nodes[node].opr.c >= 0x80)
1013           mb_chars = 1;
1014         break;
1015       case ANCHOR:
1016         switch (dfa->nodes[node].opr.idx)
1017           {
1018           case LINE_FIRST:
1019           case LINE_LAST:
1020           case BUF_FIRST:
1021           case BUF_LAST:
1022             break;
1023           default:
1024             /* Word anchors etc. cannot be handled.  */
1025             return;
1026           }
1027         break;
1028       case OP_PERIOD:
1029         has_period = 1;
1030         break;
1031       case OP_BACK_REF:
1032       case OP_ALT:
1033       case END_OF_RE:
1034       case OP_DUP_ASTERISK:
1035       case OP_DUP_QUESTION:
1036       case OP_OPEN_SUBEXP:
1037       case OP_CLOSE_SUBEXP:
1038         break;
1039       case SIMPLE_BRACKET:
1040         /* Just double check.  */
1041         for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1042           if (dfa->nodes[node].opr.sbcset[i])
1043             return;
1044         break;
1045       default:
1046         return;
1047       }
1048
1049   if (mb_chars || has_period)
1050     for (node = 0; node < dfa->nodes_len; ++node)
1051       {
1052         if (dfa->nodes[node].type == CHARACTER
1053             && dfa->nodes[node].opr.c >= 0x80)
1054           dfa->nodes[node].mb_partial = 0;
1055         else if (dfa->nodes[node].type == OP_PERIOD)
1056           dfa->nodes[node].type = OP_UTF8_PERIOD;
1057       }
1058
1059   /* The search can be in single byte locale.  */
1060   dfa->mb_cur_max = 1;
1061   dfa->is_utf8 = 0;
1062   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1063 }
1064 #endif
1065 \f
1066 /* Analyze the structure tree, and calculate "first", "next", "edest",
1067    "eclosure", and "inveclosure".  */
1068
1069 static reg_errcode_t
1070 analyze (dfa)
1071      re_dfa_t *dfa;
1072 {
1073   int i;
1074   reg_errcode_t ret;
1075
1076   /* Allocate arrays.  */
1077   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1078   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1079   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1080   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1081   dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1082   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1083           || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
1084     return REG_ESPACE;
1085   /* Initialize them.  */
1086   for (i = 0; i < dfa->nodes_len; ++i)
1087     {
1088       dfa->nexts[i] = -1;
1089       re_node_set_init_empty (dfa->edests + i);
1090       re_node_set_init_empty (dfa->eclosures + i);
1091       re_node_set_init_empty (dfa->inveclosures + i);
1092     }
1093
1094   ret = analyze_tree (dfa, dfa->str_tree);
1095   if (BE (ret == REG_NOERROR, 1))
1096     {
1097       ret = calc_eclosure (dfa);
1098       if (ret == REG_NOERROR)
1099         calc_inveclosure (dfa);
1100     }
1101   return ret;
1102 }
1103
1104 /* Helper functions for analyze.
1105    This function calculate "first", "next", and "edest" for the subtree
1106    whose root is NODE.  */
1107
1108 static reg_errcode_t
1109 analyze_tree (dfa, node)
1110      re_dfa_t *dfa;
1111      bin_tree_t *node;
1112 {
1113   reg_errcode_t ret;
1114   if (node->first == -1)
1115     calc_first (dfa, node);
1116   if (node->next == -1)
1117     calc_next (dfa, node);
1118   if (node->eclosure.nelem == 0)
1119     calc_epsdest (dfa, node);
1120   /* Calculate "first" etc. for the left child.  */
1121   if (node->left != NULL)
1122     {
1123       ret = analyze_tree (dfa, node->left);
1124       if (BE (ret != REG_NOERROR, 0))
1125         return ret;
1126     }
1127   /* Calculate "first" etc. for the right child.  */
1128   if (node->right != NULL)
1129     {
1130       ret = analyze_tree (dfa, node->right);
1131       if (BE (ret != REG_NOERROR, 0))
1132         return ret;
1133     }
1134   return REG_NOERROR;
1135 }
1136
1137 /* Calculate "first" for the node NODE.  */
1138 static void
1139 calc_first (dfa, node)
1140      re_dfa_t *dfa;
1141      bin_tree_t *node;
1142 {
1143   int idx, type;
1144   idx = node->node_idx;
1145   type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1146
1147   switch (type)
1148     {
1149 #ifdef DEBUG
1150     case OP_OPEN_BRACKET:
1151     case OP_CLOSE_BRACKET:
1152     case OP_OPEN_DUP_NUM:
1153     case OP_CLOSE_DUP_NUM:
1154     case OP_DUP_PLUS:
1155     case OP_NON_MATCH_LIST:
1156     case OP_OPEN_COLL_ELEM:
1157     case OP_CLOSE_COLL_ELEM:
1158     case OP_OPEN_EQUIV_CLASS:
1159     case OP_CLOSE_EQUIV_CLASS:
1160     case OP_OPEN_CHAR_CLASS:
1161     case OP_CLOSE_CHAR_CLASS:
1162       /* These must not appear here.  */
1163       assert (0);
1164 #endif
1165     case END_OF_RE:
1166     case CHARACTER:
1167     case OP_PERIOD:
1168     case OP_DUP_ASTERISK:
1169     case OP_DUP_QUESTION:
1170 #ifdef RE_ENABLE_I18N
1171     case OP_UTF8_PERIOD:
1172     case COMPLEX_BRACKET:
1173 #endif /* RE_ENABLE_I18N */
1174     case SIMPLE_BRACKET:
1175     case OP_BACK_REF:
1176     case ANCHOR:
1177     case OP_OPEN_SUBEXP:
1178     case OP_CLOSE_SUBEXP:
1179       node->first = idx;
1180       break;
1181     case OP_ALT:
1182       node->first = idx;
1183       break;
1184       /* else fall through */
1185     default:
1186 #ifdef DEBUG
1187       assert (node->left != NULL);
1188 #endif
1189       if (node->left->first == -1)
1190         calc_first (dfa, node->left);
1191       node->first = node->left->first;
1192       break;
1193     }
1194 }
1195
1196 /* Calculate "next" for the node NODE.  */
1197
1198 static void
1199 calc_next (dfa, node)
1200      re_dfa_t *dfa;
1201      bin_tree_t *node;
1202 {
1203   int idx, type;
1204   bin_tree_t *parent = node->parent;
1205   if (parent == NULL)
1206     {
1207       node->next = -1;
1208       idx = node->node_idx;
1209       if (node->type == 0)
1210         dfa->nexts[idx] = node->next;
1211       return;
1212     }
1213
1214   idx = parent->node_idx;
1215   type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1216
1217   switch (type)
1218     {
1219     case OP_DUP_ASTERISK:
1220       node->next = idx;
1221       break;
1222     case CONCAT:
1223       if (parent->left == node)
1224         {
1225           if (parent->right->first == -1)
1226             calc_first (dfa, parent->right);
1227           node->next = parent->right->first;
1228           break;
1229         }
1230       /* else fall through */
1231     default:
1232       if (parent->next == -1)
1233         calc_next (dfa, parent);
1234       node->next = parent->next;
1235       break;
1236     }
1237   idx = node->node_idx;
1238   if (node->type == 0)
1239     dfa->nexts[idx] = node->next;
1240 }
1241
1242 /* Calculate "edest" for the node NODE.  */
1243
1244 static void
1245 calc_epsdest (dfa, node)
1246      re_dfa_t *dfa;
1247      bin_tree_t *node;
1248 {
1249   int idx;
1250   idx = node->node_idx;
1251   if (node->type == 0)
1252     {
1253       if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1254           || dfa->nodes[idx].type == OP_DUP_QUESTION)
1255         {
1256           if (node->left->first == -1)
1257             calc_first (dfa, node->left);
1258           if (node->next == -1)
1259             calc_next (dfa, node);
1260           re_node_set_init_2 (dfa->edests + idx, node->left->first,
1261                               node->next);
1262         }
1263       else if (dfa->nodes[idx].type == OP_ALT)
1264         {
1265           int left, right;
1266           if (node->left != NULL)
1267             {
1268               if (node->left->first == -1)
1269                 calc_first (dfa, node->left);
1270               left = node->left->first;
1271             }
1272           else
1273             {
1274               if (node->next == -1)
1275                 calc_next (dfa, node);
1276               left = node->next;
1277             }
1278           if (node->right != NULL)
1279             {
1280               if (node->right->first == -1)
1281                 calc_first (dfa, node->right);
1282               right = node->right->first;
1283             }
1284           else
1285             {
1286               if (node->next == -1)
1287                 calc_next (dfa, node);
1288               right = node->next;
1289             }
1290           re_node_set_init_2 (dfa->edests + idx, left, right);
1291         }
1292       else if (dfa->nodes[idx].type == ANCHOR
1293                || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1294                || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1295                || dfa->nodes[idx].type == OP_BACK_REF)
1296         re_node_set_init_1 (dfa->edests + idx, node->next);
1297       else
1298         assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
1299     }
1300 }
1301
1302 /* Duplicate the epsilon closure of the node ROOT_NODE.
1303    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1304    to their own constraint.  */
1305
1306 static reg_errcode_t
1307 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1308                         init_constraint)
1309      re_dfa_t *dfa;
1310      int top_org_node, top_clone_node, root_node;
1311      unsigned int init_constraint;
1312 {
1313   reg_errcode_t err;
1314   int org_node, clone_node, ret;
1315   unsigned int constraint = init_constraint;
1316   for (org_node = top_org_node, clone_node = top_clone_node;;)
1317     {
1318       int org_dest, clone_dest;
1319       if (dfa->nodes[org_node].type == OP_BACK_REF)
1320         {
1321           /* If the back reference epsilon-transit, its destination must
1322              also have the constraint.  Then duplicate the epsilon closure
1323              of the destination of the back reference, and store it in
1324              edests of the back reference.  */
1325           org_dest = dfa->nexts[org_node];
1326           re_node_set_empty (dfa->edests + clone_node);
1327           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1328           if (BE (err != REG_NOERROR, 0))
1329             return err;
1330           dfa->nexts[clone_node] = dfa->nexts[org_node];
1331           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1332           if (BE (ret < 0, 0))
1333             return REG_ESPACE;
1334         }
1335       else if (dfa->edests[org_node].nelem == 0)
1336         {
1337           /* In case of the node can't epsilon-transit, don't duplicate the
1338              destination and store the original destination as the
1339              destination of the node.  */
1340           dfa->nexts[clone_node] = dfa->nexts[org_node];
1341           break;
1342         }
1343       else if (dfa->edests[org_node].nelem == 1)
1344         {
1345           /* In case of the node can epsilon-transit, and it has only one
1346              destination.  */
1347           org_dest = dfa->edests[org_node].elems[0];
1348           re_node_set_empty (dfa->edests + clone_node);
1349           if (dfa->nodes[org_node].type == ANCHOR)
1350             {
1351               /* In case of the node has another constraint, append it.  */
1352               if (org_node == root_node && clone_node != org_node)
1353                 {
1354                   /* ...but if the node is root_node itself, it means the
1355                      epsilon closure have a loop, then tie it to the
1356                      destination of the root_node.  */
1357                   ret = re_node_set_insert (dfa->edests + clone_node,
1358                                             org_dest);
1359                   if (BE (ret < 0, 0))
1360                     return REG_ESPACE;
1361                   break;
1362                 }
1363               constraint |= dfa->nodes[org_node].opr.ctx_type;
1364             }
1365           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1366           if (BE (err != REG_NOERROR, 0))
1367             return err;
1368           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1369           if (BE (ret < 0, 0))
1370             return REG_ESPACE;
1371         }
1372       else /* dfa->edests[org_node].nelem == 2 */
1373         {
1374           /* In case of the node can epsilon-transit, and it has two
1375              destinations. E.g. '|', '*', '+', '?'.   */
1376           org_dest = dfa->edests[org_node].elems[0];
1377           re_node_set_empty (dfa->edests + clone_node);
1378           /* Search for a duplicated node which satisfies the constraint.  */
1379           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1380           if (clone_dest == -1)
1381             {
1382               /* There are no such a duplicated node, create a new one.  */
1383               err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1384               if (BE (err != REG_NOERROR, 0))
1385                 return err;
1386               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1387               if (BE (ret < 0, 0))
1388                 return REG_ESPACE;
1389               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1390                                             root_node, constraint);
1391               if (BE (err != REG_NOERROR, 0))
1392                 return err;
1393             }
1394           else
1395             {
1396               /* There are a duplicated node which satisfy the constraint,
1397                  use it to avoid infinite loop.  */
1398               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1399               if (BE (ret < 0, 0))
1400                 return REG_ESPACE;
1401             }
1402
1403           org_dest = dfa->edests[org_node].elems[1];
1404           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1405           if (BE (err != REG_NOERROR, 0))
1406             return err;
1407           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1408           if (BE (ret < 0, 0))
1409             return REG_ESPACE;
1410         }
1411       org_node = org_dest;
1412       clone_node = clone_dest;
1413     }
1414   return REG_NOERROR;
1415 }
1416
1417 /* Search for a node which is duplicated from the node ORG_NODE, and
1418    satisfies the constraint CONSTRAINT.  */
1419
1420 static int
1421 search_duplicated_node (dfa, org_node, constraint)
1422      re_dfa_t *dfa;
1423      int org_node;
1424      unsigned int constraint;
1425 {
1426   int idx;
1427   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1428     {
1429       if (org_node == dfa->org_indices[idx]
1430           && constraint == dfa->nodes[idx].constraint)
1431         return idx; /* Found.  */
1432     }
1433   return -1; /* Not found.  */
1434 }
1435
1436 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1437    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1438    otherwise return the error code.  */
1439
1440 static reg_errcode_t
1441 duplicate_node (new_idx, dfa, org_idx, constraint)
1442      re_dfa_t *dfa;
1443      int *new_idx, org_idx;
1444      unsigned int constraint;
1445 {
1446   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx], 1);
1447   if (BE (dup_idx == -1, 0))
1448     return REG_ESPACE;
1449   dfa->nodes[dup_idx].constraint = constraint;
1450   if (dfa->nodes[org_idx].type == ANCHOR)
1451     dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1452   dfa->nodes[dup_idx].duplicated = 1;
1453   re_node_set_init_empty (dfa->edests + dup_idx);
1454   re_node_set_init_empty (dfa->eclosures + dup_idx);
1455   re_node_set_init_empty (dfa->inveclosures + dup_idx);
1456
1457   /* Store the index of the original node.  */
1458   dfa->org_indices[dup_idx] = org_idx;
1459   *new_idx = dup_idx;
1460   return REG_NOERROR;
1461 }
1462
1463 static void
1464 calc_inveclosure (dfa)
1465      re_dfa_t *dfa;
1466 {
1467   int src, idx, dest;
1468   for (src = 0; src < dfa->nodes_len; ++src)
1469     {
1470       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1471         {
1472           dest = dfa->eclosures[src].elems[idx];
1473           re_node_set_insert (dfa->inveclosures + dest, src);
1474         }
1475     }
1476 }
1477
1478 /* Calculate "eclosure" for all the node in DFA.  */
1479
1480 static reg_errcode_t
1481 calc_eclosure (dfa)
1482      re_dfa_t *dfa;
1483 {
1484   int node_idx, incomplete;
1485 #ifdef DEBUG
1486   assert (dfa->nodes_len > 0);
1487 #endif
1488   incomplete = 0;
1489   /* For each nodes, calculate epsilon closure.  */
1490   for (node_idx = 0; ; ++node_idx)
1491     {
1492       reg_errcode_t err;
1493       re_node_set eclosure_elem;
1494       if (node_idx == dfa->nodes_len)
1495         {
1496           if (!incomplete)
1497             break;
1498           incomplete = 0;
1499           node_idx = 0;
1500         }
1501
1502 #ifdef DEBUG
1503       assert (dfa->eclosures[node_idx].nelem != -1);
1504 #endif
1505       /* If we have already calculated, skip it.  */
1506       if (dfa->eclosures[node_idx].nelem != 0)
1507         continue;
1508       /* Calculate epsilon closure of `node_idx'.  */
1509       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1510       if (BE (err != REG_NOERROR, 0))
1511         return err;
1512
1513       if (dfa->eclosures[node_idx].nelem == 0)
1514         {
1515           incomplete = 1;
1516           re_node_set_free (&eclosure_elem);
1517         }
1518     }
1519   return REG_NOERROR;
1520 }
1521
1522 /* Calculate epsilon closure of NODE.  */
1523
1524 static reg_errcode_t
1525 calc_eclosure_iter (new_set, dfa, node, root)
1526      re_node_set *new_set;
1527      re_dfa_t *dfa;
1528      int node, root;
1529 {
1530   reg_errcode_t err;
1531   unsigned int constraint;
1532   int i, incomplete;
1533   re_node_set eclosure;
1534   incomplete = 0;
1535   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1536   if (BE (err != REG_NOERROR, 0))
1537     return err;
1538
1539   /* This indicates that we are calculating this node now.
1540      We reference this value to avoid infinite loop.  */
1541   dfa->eclosures[node].nelem = -1;
1542
1543   constraint = ((dfa->nodes[node].type == ANCHOR)
1544                 ? dfa->nodes[node].opr.ctx_type : 0);
1545   /* If the current node has constraints, duplicate all nodes.
1546      Since they must inherit the constraints.  */
1547   if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1548     {
1549       int org_node, cur_node;
1550       org_node = cur_node = node;
1551       err = duplicate_node_closure (dfa, node, node, node, constraint);
1552       if (BE (err != REG_NOERROR, 0))
1553         return err;
1554     }
1555
1556   /* Expand each epsilon destination nodes.  */
1557   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1558     for (i = 0; i < dfa->edests[node].nelem; ++i)
1559       {
1560         re_node_set eclosure_elem;
1561         int edest = dfa->edests[node].elems[i];
1562         /* If calculating the epsilon closure of `edest' is in progress,
1563            return intermediate result.  */
1564         if (dfa->eclosures[edest].nelem == -1)
1565           {
1566             incomplete = 1;
1567             continue;
1568           }
1569         /* If we haven't calculated the epsilon closure of `edest' yet,
1570            calculate now. Otherwise use calculated epsilon closure.  */
1571         if (dfa->eclosures[edest].nelem == 0)
1572           {
1573             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1574             if (BE (err != REG_NOERROR, 0))
1575               return err;
1576           }
1577         else
1578           eclosure_elem = dfa->eclosures[edest];
1579         /* Merge the epsilon closure of `edest'.  */
1580         re_node_set_merge (&eclosure, &eclosure_elem);
1581         /* If the epsilon closure of `edest' is incomplete,
1582            the epsilon closure of this node is also incomplete.  */
1583         if (dfa->eclosures[edest].nelem == 0)
1584           {
1585             incomplete = 1;
1586             re_node_set_free (&eclosure_elem);
1587           }
1588       }
1589
1590   /* Epsilon closures include itself.  */
1591   re_node_set_insert (&eclosure, node);
1592   if (incomplete && !root)
1593     dfa->eclosures[node].nelem = 0;
1594   else
1595     dfa->eclosures[node] = eclosure;
1596   *new_set = eclosure;
1597   return REG_NOERROR;
1598 }
1599 \f
1600 /* Functions for token which are used in the parser.  */
1601
1602 /* Fetch a token from INPUT.
1603    We must not use this function inside bracket expressions.  */
1604
1605 static void
1606 fetch_token (result, input, syntax)
1607      re_token_t *result;
1608      re_string_t *input;
1609      reg_syntax_t syntax;
1610 {
1611   re_string_skip_bytes (input, peek_token (result, input, syntax));
1612 }
1613
1614 /* Peek a token from INPUT, and return the length of the token.
1615    We must not use this function inside bracket expressions.  */
1616
1617 static int
1618 peek_token (token, input, syntax)
1619      re_token_t *token;
1620      re_string_t *input;
1621      reg_syntax_t syntax;
1622 {
1623   unsigned char c;
1624
1625   if (re_string_eoi (input))
1626     {
1627       token->type = END_OF_RE;
1628       return 0;
1629     }
1630
1631   c = re_string_peek_byte (input, 0);
1632   token->opr.c = c;
1633
1634   token->word_char = 0;
1635 #ifdef RE_ENABLE_I18N
1636   token->mb_partial = 0;
1637   if (input->mb_cur_max > 1 &&
1638       !re_string_first_byte (input, re_string_cur_idx (input)))
1639     {
1640       token->type = CHARACTER;
1641       token->mb_partial = 1;
1642       return 1;
1643     }
1644 #endif
1645   if (c == '\\')
1646     {
1647       unsigned char c2;
1648       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1649         {
1650           token->type = BACK_SLASH;
1651           return 1;
1652         }
1653
1654       c2 = re_string_peek_byte_case (input, 1);
1655       token->opr.c = c2;
1656       token->type = CHARACTER;
1657 #ifdef RE_ENABLE_I18N
1658       if (input->mb_cur_max > 1)
1659         {
1660           wint_t wc = re_string_wchar_at (input,
1661                                           re_string_cur_idx (input) + 1);
1662           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1663         }
1664       else
1665 #endif
1666         token->word_char = IS_WORD_CHAR (c2) != 0;
1667
1668       switch (c2)
1669         {
1670         case '|':
1671           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1672             token->type = OP_ALT;
1673           break;
1674         case '1': case '2': case '3': case '4': case '5':
1675         case '6': case '7': case '8': case '9':
1676           if (!(syntax & RE_NO_BK_REFS))
1677             {
1678               token->type = OP_BACK_REF;
1679               token->opr.idx = c2 - '0';
1680             }
1681           break;
1682         case '<':
1683           if (!(syntax & RE_NO_GNU_OPS))
1684             {
1685               token->type = ANCHOR;
1686               token->opr.ctx_type = WORD_FIRST;
1687             }
1688           break;
1689         case '>':
1690           if (!(syntax & RE_NO_GNU_OPS))
1691             {
1692               token->type = ANCHOR;
1693               token->opr.ctx_type = WORD_LAST;
1694             }
1695           break;
1696         case 'b':
1697           if (!(syntax & RE_NO_GNU_OPS))
1698             {
1699               token->type = ANCHOR;
1700               token->opr.ctx_type = WORD_DELIM;
1701             }
1702           break;
1703         case 'B':
1704           if (!(syntax & RE_NO_GNU_OPS))
1705             {
1706               token->type = ANCHOR;
1707               token->opr.ctx_type = INSIDE_WORD;
1708             }
1709           break;
1710         case 'w':
1711           if (!(syntax & RE_NO_GNU_OPS))
1712             token->type = OP_WORD;
1713           break;
1714         case 'W':
1715           if (!(syntax & RE_NO_GNU_OPS))
1716             token->type = OP_NOTWORD;
1717           break;
1718         case 's':
1719           if (!(syntax & RE_NO_GNU_OPS))
1720             token->type = OP_SPACE;
1721           break;
1722         case 'S':
1723           if (!(syntax & RE_NO_GNU_OPS))
1724             token->type = OP_NOTSPACE;
1725           break;
1726         case '`':
1727           if (!(syntax & RE_NO_GNU_OPS))
1728             {
1729               token->type = ANCHOR;
1730               token->opr.ctx_type = BUF_FIRST;
1731             }
1732           break;
1733         case '\'':
1734           if (!(syntax & RE_NO_GNU_OPS))
1735             {
1736               token->type = ANCHOR;
1737               token->opr.ctx_type = BUF_LAST;
1738             }
1739           break;
1740         case '(':
1741           if (!(syntax & RE_NO_BK_PARENS))
1742             token->type = OP_OPEN_SUBEXP;
1743           break;
1744         case ')':
1745           if (!(syntax & RE_NO_BK_PARENS))
1746             token->type = OP_CLOSE_SUBEXP;
1747           break;
1748         case '+':
1749           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1750             token->type = OP_DUP_PLUS;
1751           break;
1752         case '?':
1753           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1754             token->type = OP_DUP_QUESTION;
1755           break;
1756         case '{':
1757           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1758             token->type = OP_OPEN_DUP_NUM;
1759           break;
1760         case '}':
1761           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1762             token->type = OP_CLOSE_DUP_NUM;
1763           break;
1764         default:
1765           break;
1766         }
1767       return 2;
1768     }
1769
1770   token->type = CHARACTER;
1771 #ifdef RE_ENABLE_I18N
1772   if (input->mb_cur_max > 1)
1773     {
1774       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1775       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1776     }
1777   else
1778 #endif
1779     token->word_char = IS_WORD_CHAR (token->opr.c);
1780
1781   switch (c)
1782     {
1783     case '\n':
1784       if (syntax & RE_NEWLINE_ALT)
1785         token->type = OP_ALT;
1786       break;
1787     case '|':
1788       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1789         token->type = OP_ALT;
1790       break;
1791     case '*':
1792       token->type = OP_DUP_ASTERISK;
1793       break;
1794     case '+':
1795       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1796         token->type = OP_DUP_PLUS;
1797       break;
1798     case '?':
1799       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1800         token->type = OP_DUP_QUESTION;
1801       break;
1802     case '{':
1803       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1804         token->type = OP_OPEN_DUP_NUM;
1805       break;
1806     case '}':
1807       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1808         token->type = OP_CLOSE_DUP_NUM;
1809       break;
1810     case '(':
1811       if (syntax & RE_NO_BK_PARENS)
1812         token->type = OP_OPEN_SUBEXP;
1813       break;
1814     case ')':
1815       if (syntax & RE_NO_BK_PARENS)
1816         token->type = OP_CLOSE_SUBEXP;
1817       break;
1818     case '[':
1819       token->type = OP_OPEN_BRACKET;
1820       break;
1821     case '.':
1822       token->type = OP_PERIOD;
1823       break;
1824     case '^':
1825       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1826           re_string_cur_idx (input) != 0)
1827         {
1828           char prev = re_string_peek_byte (input, -1);
1829           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1830             break;
1831         }
1832       token->type = ANCHOR;
1833       token->opr.ctx_type = LINE_FIRST;
1834       break;
1835     case '$':
1836       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1837           re_string_cur_idx (input) + 1 != re_string_length (input))
1838         {
1839           re_token_t next;
1840           re_string_skip_bytes (input, 1);
1841           peek_token (&next, input, syntax);
1842           re_string_skip_bytes (input, -1);
1843           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1844             break;
1845         }
1846       token->type = ANCHOR;
1847       token->opr.ctx_type = LINE_LAST;
1848       break;
1849     default:
1850       break;
1851     }
1852   return 1;
1853 }
1854
1855 /* Peek a token from INPUT, and return the length of the token.
1856    We must not use this function out of bracket expressions.  */
1857
1858 static int
1859 peek_token_bracket (token, input, syntax)
1860      re_token_t *token;
1861      re_string_t *input;
1862      reg_syntax_t syntax;
1863 {
1864   unsigned char c;
1865   if (re_string_eoi (input))
1866     {
1867       token->type = END_OF_RE;
1868       return 0;
1869     }
1870   c = re_string_peek_byte (input, 0);
1871   token->opr.c = c;
1872
1873 #ifdef RE_ENABLE_I18N
1874   if (input->mb_cur_max > 1 &&
1875       !re_string_first_byte (input, re_string_cur_idx (input)))
1876     {
1877       token->type = CHARACTER;
1878       return 1;
1879     }
1880 #endif /* RE_ENABLE_I18N */
1881
1882   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1883       && re_string_cur_idx (input) + 1 < re_string_length (input))
1884     {
1885       /* In this case, '\' escape a character.  */
1886       unsigned char c2;
1887       re_string_skip_bytes (input, 1);
1888       c2 = re_string_peek_byte (input, 0);
1889       token->opr.c = c2;
1890       token->type = CHARACTER;
1891       return 1;
1892     }
1893   if (c == '[') /* '[' is a special char in a bracket exps.  */
1894     {
1895       unsigned char c2;
1896       int token_len;
1897       if (re_string_cur_idx (input) + 1 < re_string_length (input))
1898         c2 = re_string_peek_byte (input, 1);
1899       else
1900         c2 = 0;
1901       token->opr.c = c2;
1902       token_len = 2;
1903       switch (c2)
1904         {
1905         case '.':
1906           token->type = OP_OPEN_COLL_ELEM;
1907           break;
1908         case '=':
1909           token->type = OP_OPEN_EQUIV_CLASS;
1910           break;
1911         case ':':
1912           if (syntax & RE_CHAR_CLASSES)
1913             {
1914               token->type = OP_OPEN_CHAR_CLASS;
1915               break;
1916             }
1917           /* else fall through.  */
1918         default:
1919           token->type = CHARACTER;
1920           token->opr.c = c;
1921           token_len = 1;
1922           break;
1923         }
1924       return token_len;
1925     }
1926   switch (c)
1927     {
1928     case '-':
1929       token->type = OP_CHARSET_RANGE;
1930       break;
1931     case ']':
1932       token->type = OP_CLOSE_BRACKET;
1933       break;
1934     case '^':
1935       token->type = OP_NON_MATCH_LIST;
1936       break;
1937     default:
1938       token->type = CHARACTER;
1939     }
1940   return 1;
1941 }
1942 \f
1943 /* Functions for parser.  */
1944
1945 /* Entry point of the parser.
1946    Parse the regular expression REGEXP and return the structure tree.
1947    If an error is occured, ERR is set by error code, and return NULL.
1948    This function build the following tree, from regular expression <reg_exp>:
1949            CAT
1950            / \
1951           /   \
1952    <reg_exp>  EOR
1953
1954    CAT means concatenation.
1955    EOR means end of regular expression.  */
1956
1957 static bin_tree_t *
1958 parse (regexp, preg, syntax, err)
1959      re_string_t *regexp;
1960      regex_t *preg;
1961      reg_syntax_t syntax;
1962      reg_errcode_t *err;
1963 {
1964   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1965   bin_tree_t *tree, *eor, *root;
1966   re_token_t current_token;
1967   dfa->syntax = syntax;
1968   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
1969   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1970   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1971     return NULL;
1972   eor = re_dfa_add_tree_node (dfa, NULL, NULL, &current_token);
1973   if (tree != NULL)
1974     root = create_tree (dfa, tree, eor, CONCAT, 0);
1975   else
1976     root = eor;
1977   if (BE (eor == NULL || root == NULL, 0))
1978     {
1979       *err = REG_ESPACE;
1980       return NULL;
1981     }
1982   return root;
1983 }
1984
1985 /* This function build the following tree, from regular expression
1986    <branch1>|<branch2>:
1987            ALT
1988            / \
1989           /   \
1990    <branch1> <branch2>
1991
1992    ALT means alternative, which represents the operator `|'.  */
1993
1994 static bin_tree_t *
1995 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1996      re_string_t *regexp;
1997      regex_t *preg;
1998      re_token_t *token;
1999      reg_syntax_t syntax;
2000      int nest;
2001      reg_errcode_t *err;
2002 {
2003   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2004   bin_tree_t *tree, *branch = NULL;
2005   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2006   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2007     return NULL;
2008
2009   while (token->type == OP_ALT)
2010     {
2011       re_token_t alt_token = *token;
2012       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2013       if (token->type != OP_ALT && token->type != END_OF_RE
2014           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2015         {
2016           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2017           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2018             return NULL;
2019         }
2020       else
2021         branch = NULL;
2022       tree = re_dfa_add_tree_node (dfa, tree, branch, &alt_token);
2023       if (BE (tree == NULL, 0))
2024         {
2025           *err = REG_ESPACE;
2026           return NULL;
2027         }
2028       dfa->has_plural_match = 1;
2029     }
2030   return tree;
2031 }
2032
2033 /* This function build the following tree, from regular expression
2034    <exp1><exp2>:
2035         CAT
2036         / \
2037        /   \
2038    <exp1> <exp2>
2039
2040    CAT means concatenation.  */
2041
2042 static bin_tree_t *
2043 parse_branch (regexp, preg, token, syntax, nest, err)
2044      re_string_t *regexp;
2045      regex_t *preg;
2046      re_token_t *token;
2047      reg_syntax_t syntax;
2048      int nest;
2049      reg_errcode_t *err;
2050 {
2051   bin_tree_t *tree, *exp;
2052   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2053   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2054   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2055     return NULL;
2056
2057   while (token->type != OP_ALT && token->type != END_OF_RE
2058          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2059     {
2060       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2061       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2062         {
2063           return NULL;
2064         }
2065       if (tree != NULL && exp != NULL)
2066         {
2067           tree = create_tree (dfa, tree, exp, CONCAT, 0);
2068           if (tree == NULL)
2069             {
2070               *err = REG_ESPACE;
2071               return NULL;
2072             }
2073         }
2074       else if (tree == NULL)
2075         tree = exp;
2076       /* Otherwise exp == NULL, we don't need to create new tree.  */
2077     }
2078   return tree;
2079 }
2080
2081 /* This function build the following tree, from regular expression a*:
2082          *
2083          |
2084          a
2085 */
2086
2087 static bin_tree_t *
2088 parse_expression (regexp, preg, token, syntax, nest, err)
2089      re_string_t *regexp;
2090      regex_t *preg;
2091      re_token_t *token;
2092      reg_syntax_t syntax;
2093      int nest;
2094      reg_errcode_t *err;
2095 {
2096   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2097   bin_tree_t *tree;
2098   switch (token->type)
2099     {
2100     case CHARACTER:
2101       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2102       if (BE (tree == NULL, 0))
2103         {
2104           *err = REG_ESPACE;
2105           return NULL;
2106         }
2107 #ifdef RE_ENABLE_I18N
2108       if (dfa->mb_cur_max > 1)
2109         {
2110           while (!re_string_eoi (regexp)
2111                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2112             {
2113               bin_tree_t *mbc_remain;
2114               fetch_token (token, regexp, syntax);
2115               mbc_remain = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2116               tree = create_tree (dfa, tree, mbc_remain, CONCAT, 0);
2117               if (BE (mbc_remain == NULL || tree == NULL, 0))
2118                 {
2119                   *err = REG_ESPACE;
2120                   return NULL;
2121                 }
2122             }
2123         }
2124 #endif
2125       break;
2126     case OP_OPEN_SUBEXP:
2127       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2128       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2129         return NULL;
2130       break;
2131     case OP_OPEN_BRACKET:
2132       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2133       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2134         return NULL;
2135       break;
2136     case OP_BACK_REF:
2137       if (BE (preg->re_nsub < token->opr.idx
2138               || dfa->subexps[token->opr.idx - 1].end == -1, 0))
2139         {
2140           *err = REG_ESUBREG;
2141           return NULL;
2142         }
2143       dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
2144       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2145       if (BE (tree == NULL, 0))
2146         {
2147           *err = REG_ESPACE;
2148           return NULL;
2149         }
2150       ++dfa->nbackref;
2151       dfa->has_mb_node = 1;
2152       break;
2153     case OP_OPEN_DUP_NUM:
2154       if (syntax & RE_CONTEXT_INVALID_DUP)
2155         {
2156           *err = REG_BADRPT;
2157           return NULL;
2158         }
2159       /* FALLTHROUGH */
2160     case OP_DUP_ASTERISK:
2161     case OP_DUP_PLUS:
2162     case OP_DUP_QUESTION:
2163       if (syntax & RE_CONTEXT_INVALID_OPS)
2164         {
2165           *err = REG_BADRPT;
2166           return NULL;
2167         }
2168       else if (syntax & RE_CONTEXT_INDEP_OPS)
2169         {
2170           fetch_token (token, regexp, syntax);
2171           return parse_expression (regexp, preg, token, syntax, nest, err);
2172         }
2173       /* else fall through  */
2174     case OP_CLOSE_SUBEXP:
2175       if ((token->type == OP_CLOSE_SUBEXP) &&
2176           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2177         {
2178           *err = REG_ERPAREN;
2179           return NULL;
2180         }
2181       /* else fall through  */
2182     case OP_CLOSE_DUP_NUM:
2183       /* We treat it as a normal character.  */
2184
2185       /* Then we can these characters as normal characters.  */
2186       token->type = CHARACTER;
2187       /* mb_partial and word_char bits should be initialized already
2188          by peek_token.  */
2189       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2190       if (BE (tree == NULL, 0))
2191         {
2192           *err = REG_ESPACE;
2193           return NULL;
2194         }
2195       break;
2196     case ANCHOR:
2197       if ((token->opr.ctx_type
2198            & (WORD_DELIM | INSIDE_WORD | WORD_FIRST | WORD_LAST))
2199           && dfa->word_ops_used == 0)
2200         init_word_char (dfa);
2201       if (token->opr.ctx_type == WORD_DELIM)
2202         {
2203           bin_tree_t *tree_first, *tree_last;
2204           token->opr.ctx_type = WORD_FIRST;
2205           tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2206           token->opr.ctx_type = WORD_LAST;
2207           tree_last = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2208           token->type = OP_ALT;
2209           tree = re_dfa_add_tree_node (dfa, tree_first, tree_last, token);
2210           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2211             {
2212               *err = REG_ESPACE;
2213               return NULL;
2214             }
2215         }
2216       else
2217         {
2218           tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2219           if (BE (tree == NULL, 0))
2220             {
2221               *err = REG_ESPACE;
2222               return NULL;
2223             }
2224         }
2225       /* We must return here, since ANCHORs can't be followed
2226          by repetition operators.
2227          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2228              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2229       fetch_token (token, regexp, syntax);
2230       return tree;
2231     case OP_PERIOD:
2232       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2233       if (BE (tree == NULL, 0))
2234         {
2235           *err = REG_ESPACE;
2236           return NULL;
2237         }
2238       if (dfa->mb_cur_max > 1)
2239         dfa->has_mb_node = 1;
2240       break;
2241     case OP_WORD:
2242     case OP_NOTWORD:
2243       tree = build_charclass_op (dfa, regexp->trans,
2244                                  (const unsigned char *) "alnum",
2245                                  (const unsigned char *) "_",
2246                                  token->type == OP_NOTWORD, err);
2247       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2248         return NULL;
2249       break;
2250     case OP_SPACE:
2251     case OP_NOTSPACE:
2252       tree = build_charclass_op (dfa, regexp->trans,
2253                                  (const unsigned char *) "space",
2254                                  (const unsigned char *) "",
2255                                  token->type == OP_NOTSPACE, err);
2256       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2257         return NULL;
2258       break;
2259     case OP_ALT:
2260     case END_OF_RE:
2261       return NULL;
2262     case BACK_SLASH:
2263       *err = REG_EESCAPE;
2264       return NULL;
2265     default:
2266       /* Must not happen?  */
2267 #ifdef DEBUG
2268       assert (0);
2269 #endif
2270       return NULL;
2271     }
2272   fetch_token (token, regexp, syntax);
2273
2274   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2275          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2276     {
2277       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2278       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2279         return NULL;
2280       /* In BRE consecutive duplications are not allowed.  */
2281       if ((syntax & RE_CONTEXT_INVALID_DUP)
2282           && (token->type == OP_DUP_ASTERISK
2283               || token->type == OP_OPEN_DUP_NUM))
2284         {
2285           *err = REG_BADRPT;
2286           return NULL;
2287         }
2288       dfa->has_plural_match = 1;
2289     }
2290
2291   return tree;
2292 }
2293
2294 /* This function build the following tree, from regular expression
2295    (<reg_exp>):
2296          SUBEXP
2297             |
2298         <reg_exp>
2299 */
2300
2301 static bin_tree_t *
2302 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2303      re_string_t *regexp;
2304      regex_t *preg;
2305      re_token_t *token;
2306      reg_syntax_t syntax;
2307      int nest;
2308      reg_errcode_t *err;
2309 {
2310   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2311   bin_tree_t *tree, *left_par, *right_par;
2312   size_t cur_nsub;
2313   cur_nsub = preg->re_nsub++;
2314   if (BE (dfa->subexps_alloc < preg->re_nsub, 0))
2315     {
2316       re_subexp_t *new_array;
2317       dfa->subexps_alloc *= 2;
2318       new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2319       if (BE (new_array == NULL, 0))
2320         {
2321           dfa->subexps_alloc /= 2;
2322           *err = REG_ESPACE;
2323           return NULL;
2324         }
2325       dfa->subexps = new_array;
2326     }
2327   dfa->subexps[cur_nsub].start = dfa->nodes_len;
2328   dfa->subexps[cur_nsub].end = -1;
2329
2330   left_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2331   if (BE (left_par == NULL, 0))
2332     {
2333       *err = REG_ESPACE;
2334       return NULL;
2335     }
2336   dfa->nodes[left_par->node_idx].opr.idx = cur_nsub;
2337   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2338
2339   /* The subexpression may be a null string.  */
2340   if (token->type == OP_CLOSE_SUBEXP)
2341     tree = NULL;
2342   else
2343     {
2344       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2345       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2346         return NULL;
2347     }
2348   if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2349     {
2350       *err = REG_EPAREN;
2351       return NULL;
2352     }
2353   right_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2354   dfa->subexps[cur_nsub].end = dfa->nodes_len;
2355   tree = ((tree == NULL) ? right_par
2356           : create_tree (dfa, tree, right_par, CONCAT, 0));
2357   tree = create_tree (dfa, left_par, tree, CONCAT, 0);
2358   if (BE (right_par == NULL || tree == NULL, 0))
2359     {
2360       *err = REG_ESPACE;
2361       return NULL;
2362     }
2363   dfa->nodes[right_par->node_idx].opr.idx = cur_nsub;
2364
2365   return tree;
2366 }
2367
2368 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2369
2370 static bin_tree_t *
2371 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2372      bin_tree_t *elem;
2373      re_string_t *regexp;
2374      re_dfa_t *dfa;
2375      re_token_t *token;
2376      reg_syntax_t syntax;
2377      reg_errcode_t *err;
2378 {
2379   re_token_t dup_token;
2380   bin_tree_t *tree = NULL;
2381   int i, start, end, start_idx = re_string_cur_idx (regexp);
2382   re_token_t start_token = *token;
2383
2384   if (token->type == OP_OPEN_DUP_NUM)
2385     {
2386       end = 0;
2387       start = fetch_number (regexp, token, syntax);
2388       if (start == -1)
2389         {
2390           if (token->type == CHARACTER && token->opr.c == ',')
2391             start = 0; /* We treat "{,m}" as "{0,m}".  */
2392           else
2393             {
2394               *err = REG_BADBR; /* <re>{} is invalid.  */
2395               return NULL;
2396             }
2397         }
2398       if (BE (start != -2, 1))
2399         {
2400           /* We treat "{n}" as "{n,n}".  */
2401           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2402                  : ((token->type == CHARACTER && token->opr.c == ',')
2403                     ? fetch_number (regexp, token, syntax) : -2));
2404         }
2405       if (BE (start == -2 || end == -2, 0))
2406         {
2407           /* Invalid sequence.  */
2408           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2409             {
2410               if (token->type == END_OF_RE)
2411                 *err = REG_EBRACE;
2412               else
2413                 *err = REG_BADBR;
2414
2415               return NULL;
2416             }
2417
2418           /* If the syntax bit is set, rollback.  */
2419           re_string_set_index (regexp, start_idx);
2420           *token = start_token;
2421           token->type = CHARACTER;
2422           /* mb_partial and word_char bits should be already initialized by
2423              peek_token.  */
2424           return elem;
2425         }
2426
2427       if (BE (end != -1 && start > end, 0))
2428         {
2429           /* First number greater than second.  */
2430           *err = REG_BADBR;
2431           return NULL;
2432         }
2433     }
2434   else
2435     {
2436       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2437       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2438     }
2439
2440   /* Treat "<re>{0}*" etc. as "<re>{0}".  */
2441   if (BE (elem == NULL, 0))
2442     start = end = 0;
2443
2444   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2445   else if (BE (start > 0, 0))
2446     {
2447       tree = elem;
2448       for (i = 2; i <= start; ++i)
2449         {
2450           elem = duplicate_tree (elem, dfa);
2451           tree = create_tree (dfa, tree, elem, CONCAT, 0);
2452           if (BE (elem == NULL || tree == NULL, 0))
2453             goto parse_dup_op_espace;
2454         }
2455     }
2456
2457   if (BE (end != start, 1))
2458     {
2459       dup_token.type = (end == -1 ? OP_DUP_ASTERISK : OP_DUP_QUESTION);
2460       if (BE (start > 0, 0))
2461         {
2462           elem = duplicate_tree (elem, dfa);
2463           if (BE (elem == NULL, 0))
2464             goto parse_dup_op_espace;
2465
2466           /* This subexpression will be marked as optional, so that
2467              empty matches do not touch the registers.  */
2468           mark_opt_subexp (elem, dfa);
2469
2470           /* Prepare the tree with the modifier.  */
2471           elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2472           tree = create_tree (dfa, tree, elem, CONCAT, 0);
2473         }
2474       else
2475         {
2476           /* We do not need to duplicate the tree because we have not
2477              created it yet.  */
2478           mark_opt_subexp (elem, dfa);
2479           tree = elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2480         }
2481
2482       if (BE (elem == NULL || tree == NULL, 0))
2483         goto parse_dup_op_espace;
2484
2485       /* This loop is actually executed only when end != -1,
2486          to rewrite <re>{0,n} as <re>?<re>?<re>?...  We have
2487          already created the start+1-th copy.  */
2488       for (i = start + 2; i <= end; ++i)
2489         {
2490           elem = duplicate_tree (elem, dfa);
2491           tree = create_tree (dfa, tree, elem, CONCAT, 0);
2492           if (BE (elem == NULL || tree == NULL, 0))
2493             {
2494               *err = REG_ESPACE;
2495               return NULL;
2496             }
2497         }
2498     }
2499
2500   fetch_token (token, regexp, syntax);
2501   return tree;
2502
2503  parse_dup_op_espace:
2504   *err = REG_ESPACE;
2505   return NULL;
2506 }
2507
2508 /* Size of the names for collating symbol/equivalence_class/character_class.
2509    I'm not sure, but maybe enough.  */
2510 #define BRACKET_NAME_BUF_SIZE 32
2511
2512 #ifndef _LIBC
2513   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2514      Build the range expression which starts from START_ELEM, and ends
2515      at END_ELEM.  The result are written to MBCSET and SBCSET.
2516      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2517      mbcset->range_ends, is a pointer argument sinse we may
2518      update it.  */
2519
2520 static reg_errcode_t
2521 # ifdef RE_ENABLE_I18N
2522 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2523      re_charset_t *mbcset;
2524      int *range_alloc;
2525 # else /* not RE_ENABLE_I18N */
2526 build_range_exp (sbcset, start_elem, end_elem)
2527 # endif /* not RE_ENABLE_I18N */
2528      re_bitset_ptr_t sbcset;
2529      bracket_elem_t *start_elem, *end_elem;
2530 {
2531   unsigned int start_ch, end_ch;
2532   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2533   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2534           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2535           0))
2536     return REG_ERANGE;
2537
2538   /* We can handle no multi character collating elements without libc
2539      support.  */
2540   if (BE ((start_elem->type == COLL_SYM
2541            && strlen ((char *) start_elem->opr.name) > 1)
2542           || (end_elem->type == COLL_SYM
2543               && strlen ((char *) end_elem->opr.name) > 1), 0))
2544     return REG_ECOLLATE;
2545
2546 # ifdef RE_ENABLE_I18N
2547   {
2548     wchar_t wc, start_wc, end_wc;
2549     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2550
2551     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2552                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2553                    : 0));
2554     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2555               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2556                  : 0));
2557     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2558                 ? __btowc (start_ch) : start_elem->opr.wch);
2559     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2560               ? __btowc (end_ch) : end_elem->opr.wch);
2561     if (start_wc == WEOF || end_wc == WEOF)
2562       return REG_ECOLLATE;
2563     cmp_buf[0] = start_wc;
2564     cmp_buf[4] = end_wc;
2565     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2566       return REG_ERANGE;
2567
2568     /* Got valid collation sequence values, add them as a new entry.
2569        However, for !_LIBC we have no collation elements: if the
2570        character set is single byte, the single byte character set
2571        that we build below suffices.  parse_bracket_exp passes
2572        no MBCSET if dfa->mb_cur_max == 1.  */
2573     if (mbcset)
2574       {
2575         /* Check the space of the arrays.  */
2576         if (BE (*range_alloc == mbcset->nranges, 0))
2577           {
2578             /* There is not enough space, need realloc.  */
2579             wchar_t *new_array_start, *new_array_end;
2580             int new_nranges;
2581
2582             /* +1 in case of mbcset->nranges is 0.  */
2583             new_nranges = 2 * mbcset->nranges + 1;
2584             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2585                are NULL if *range_alloc == 0.  */
2586             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2587                                           new_nranges);
2588             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2589                                         new_nranges);
2590
2591             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2592               return REG_ESPACE;
2593
2594             mbcset->range_starts = new_array_start;
2595             mbcset->range_ends = new_array_end;
2596             *range_alloc = new_nranges;
2597           }
2598
2599         mbcset->range_starts[mbcset->nranges] = start_wc;
2600         mbcset->range_ends[mbcset->nranges++] = end_wc;
2601       }
2602
2603     /* Build the table for single byte characters.  */
2604     for (wc = 0; wc < SBC_MAX; ++wc)
2605       {
2606         cmp_buf[2] = wc;
2607         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2608             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2609           bitset_set (sbcset, wc);
2610       }
2611   }
2612 # else /* not RE_ENABLE_I18N */
2613   {
2614     unsigned int ch;
2615     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2616                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2617                    : 0));
2618     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2619               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2620                  : 0));
2621     if (start_ch > end_ch)
2622       return REG_ERANGE;
2623     /* Build the table for single byte characters.  */
2624     for (ch = 0; ch < SBC_MAX; ++ch)
2625       if (start_ch <= ch  && ch <= end_ch)
2626         bitset_set (sbcset, ch);
2627   }
2628 # endif /* not RE_ENABLE_I18N */
2629   return REG_NOERROR;
2630 }
2631 #endif /* not _LIBC */
2632
2633 #ifndef _LIBC
2634 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2635    Build the collating element which is represented by NAME.
2636    The result are written to MBCSET and SBCSET.
2637    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2638    pointer argument since we may update it.  */
2639
2640 static reg_errcode_t
2641 # ifdef RE_ENABLE_I18N
2642 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2643      re_charset_t *mbcset;
2644      int *coll_sym_alloc;
2645 # else /* not RE_ENABLE_I18N */
2646 build_collating_symbol (sbcset, name)
2647 # endif /* not RE_ENABLE_I18N */
2648      re_bitset_ptr_t sbcset;
2649      const unsigned char *name;
2650 {
2651   size_t name_len = strlen ((const char *) name);
2652   if (BE (name_len != 1, 0))
2653     return REG_ECOLLATE;
2654   else
2655     {
2656       bitset_set (sbcset, name[0]);
2657       return REG_NOERROR;
2658     }
2659 }
2660 #endif /* not _LIBC */
2661
2662 /* This function parse bracket expression like "[abc]", "[a-c]",
2663    "[[.a-a.]]" etc.  */
2664
2665 static bin_tree_t *
2666 parse_bracket_exp (regexp, dfa, token, syntax, err)
2667      re_string_t *regexp;
2668      re_dfa_t *dfa;
2669      re_token_t *token;
2670      reg_syntax_t syntax;
2671      reg_errcode_t *err;
2672 {
2673 #ifdef _LIBC
2674   const unsigned char *collseqmb;
2675   const char *collseqwc;
2676   uint32_t nrules;
2677   int32_t table_size;
2678   const int32_t *symb_table;
2679   const unsigned char *extra;
2680
2681   /* Local function for parse_bracket_exp used in _LIBC environement.
2682      Seek the collating symbol entry correspondings to NAME.
2683      Return the index of the symbol in the SYMB_TABLE.  */
2684
2685   auto inline int32_t
2686   __attribute ((always_inline))
2687   seek_collating_symbol_entry (name, name_len)
2688          const unsigned char *name;
2689          size_t name_len;
2690     {
2691       int32_t hash = elem_hash ((const char *) name, name_len);
2692       int32_t elem = hash % table_size;
2693       int32_t second = hash % (table_size - 2);
2694       while (symb_table[2 * elem] != 0)
2695         {
2696           /* First compare the hashing value.  */
2697           if (symb_table[2 * elem] == hash
2698               /* Compare the length of the name.  */
2699               && name_len == extra[symb_table[2 * elem + 1]]
2700               /* Compare the name.  */
2701               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2702                          name_len) == 0)
2703             {
2704               /* Yep, this is the entry.  */
2705               break;
2706             }
2707
2708           /* Next entry.  */
2709           elem += second;
2710         }
2711       return elem;
2712     }
2713
2714   /* Local function for parse_bracket_exp used in _LIBC environement.
2715      Look up the collation sequence value of BR_ELEM.
2716      Return the value if succeeded, UINT_MAX otherwise.  */
2717
2718   auto inline unsigned int
2719   __attribute ((always_inline))
2720   lookup_collation_sequence_value (br_elem)
2721          bracket_elem_t *br_elem;
2722     {
2723       if (br_elem->type == SB_CHAR)
2724         {
2725           /*
2726           if (MB_CUR_MAX == 1)
2727           */
2728           if (nrules == 0)
2729             return collseqmb[br_elem->opr.ch];
2730           else
2731             {
2732               wint_t wc = __btowc (br_elem->opr.ch);
2733               return __collseq_table_lookup (collseqwc, wc);
2734             }
2735         }
2736       else if (br_elem->type == MB_CHAR)
2737         {
2738           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2739         }
2740       else if (br_elem->type == COLL_SYM)
2741         {
2742           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2743           if (nrules != 0)
2744             {
2745               int32_t elem, idx;
2746               elem = seek_collating_symbol_entry (br_elem->opr.name,
2747                                                   sym_name_len);
2748               if (symb_table[2 * elem] != 0)
2749                 {
2750                   /* We found the entry.  */
2751                   idx = symb_table[2 * elem + 1];
2752                   /* Skip the name of collating element name.  */
2753                   idx += 1 + extra[idx];
2754                   /* Skip the byte sequence of the collating element.  */
2755                   idx += 1 + extra[idx];
2756                   /* Adjust for the alignment.  */
2757                   idx = (idx + 3) & ~3;
2758                   /* Skip the multibyte collation sequence value.  */
2759                   idx += sizeof (unsigned int);
2760                   /* Skip the wide char sequence of the collating element.  */
2761                   idx += sizeof (unsigned int) *
2762                     (1 + *(unsigned int *) (extra + idx));
2763                   /* Return the collation sequence value.  */
2764                   return *(unsigned int *) (extra + idx);
2765                 }
2766               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2767                 {
2768                   /* No valid character.  Match it as a single byte
2769                      character.  */
2770                   return collseqmb[br_elem->opr.name[0]];
2771                 }
2772             }
2773           else if (sym_name_len == 1)
2774             return collseqmb[br_elem->opr.name[0]];
2775         }
2776       return UINT_MAX;
2777     }
2778
2779   /* Local function for parse_bracket_exp used in _LIBC environement.
2780      Build the range expression which starts from START_ELEM, and ends
2781      at END_ELEM.  The result are written to MBCSET and SBCSET.
2782      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2783      mbcset->range_ends, is a pointer argument sinse we may
2784      update it.  */
2785
2786   auto inline reg_errcode_t
2787   __attribute ((always_inline))
2788   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2789          re_charset_t *mbcset;
2790          int *range_alloc;
2791          re_bitset_ptr_t sbcset;
2792          bracket_elem_t *start_elem, *end_elem;
2793     {
2794       unsigned int ch;
2795       uint32_t start_collseq;
2796       uint32_t end_collseq;
2797
2798       /* Equivalence Classes and Character Classes can't be a range
2799          start/end.  */
2800       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2801               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2802               0))
2803         return REG_ERANGE;
2804
2805       start_collseq = lookup_collation_sequence_value (start_elem);
2806       end_collseq = lookup_collation_sequence_value (end_elem);
2807       /* Check start/end collation sequence values.  */
2808       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2809         return REG_ECOLLATE;
2810       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2811         return REG_ERANGE;
2812
2813       /* Got valid collation sequence values, add them as a new entry.
2814          However, if we have no collation elements, and the character set
2815          is single byte, the single byte character set that we
2816          build below suffices. */
2817       if (nrules > 0 || dfa->mb_cur_max > 1)
2818         {
2819           /* Check the space of the arrays.  */
2820           if (BE (*range_alloc == mbcset->nranges, 0))
2821             {
2822               /* There is not enough space, need realloc.  */
2823               uint32_t *new_array_start;
2824               uint32_t *new_array_end;
2825               int new_nranges;
2826
2827               /* +1 in case of mbcset->nranges is 0.  */
2828               new_nranges = 2 * mbcset->nranges + 1;
2829               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2830                                             new_nranges);
2831               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2832                                           new_nranges);
2833
2834               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2835                 return REG_ESPACE;
2836
2837               mbcset->range_starts = new_array_start;
2838               mbcset->range_ends = new_array_end;
2839               *range_alloc = new_nranges;
2840             }
2841
2842           mbcset->range_starts[mbcset->nranges] = start_collseq;
2843           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2844         }
2845
2846       /* Build the table for single byte characters.  */
2847       for (ch = 0; ch < SBC_MAX; ch++)
2848         {
2849           uint32_t ch_collseq;
2850           /*
2851           if (MB_CUR_MAX == 1)
2852           */
2853           if (nrules == 0)
2854             ch_collseq = collseqmb[ch];
2855           else
2856             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2857           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2858             bitset_set (sbcset, ch);
2859         }
2860       return REG_NOERROR;
2861     }
2862
2863   /* Local function for parse_bracket_exp used in _LIBC environement.
2864      Build the collating element which is represented by NAME.
2865      The result are written to MBCSET and SBCSET.
2866      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2867      pointer argument sinse we may update it.  */
2868
2869   auto inline reg_errcode_t
2870   __attribute ((always_inline))
2871   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2872          re_charset_t *mbcset;
2873          int *coll_sym_alloc;
2874          re_bitset_ptr_t sbcset;
2875          const unsigned char *name;
2876     {
2877       int32_t elem, idx;
2878       size_t name_len = strlen ((const char *) name);
2879       if (nrules != 0)
2880         {
2881           elem = seek_collating_symbol_entry (name, name_len);
2882           if (symb_table[2 * elem] != 0)
2883             {
2884               /* We found the entry.  */
2885               idx = symb_table[2 * elem + 1];
2886               /* Skip the name of collating element name.  */
2887               idx += 1 + extra[idx];
2888             }
2889           else if (symb_table[2 * elem] == 0 && name_len == 1)
2890             {
2891               /* No valid character, treat it as a normal
2892                  character.  */
2893               bitset_set (sbcset, name[0]);
2894               return REG_NOERROR;
2895             }
2896           else
2897             return REG_ECOLLATE;
2898
2899           /* Got valid collation sequence, add it as a new entry.  */
2900           /* Check the space of the arrays.  */
2901           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2902             {
2903               /* Not enough, realloc it.  */
2904               /* +1 in case of mbcset->ncoll_syms is 0.  */
2905               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2906               /* Use realloc since mbcset->coll_syms is NULL
2907                  if *alloc == 0.  */
2908               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2909                                                    new_coll_sym_alloc);
2910               if (BE (new_coll_syms == NULL, 0))
2911                 return REG_ESPACE;
2912               mbcset->coll_syms = new_coll_syms;
2913               *coll_sym_alloc = new_coll_sym_alloc;
2914             }
2915           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2916           return REG_NOERROR;
2917         }
2918       else
2919         {
2920           if (BE (name_len != 1, 0))
2921             return REG_ECOLLATE;
2922           else
2923             {
2924               bitset_set (sbcset, name[0]);
2925               return REG_NOERROR;
2926             }
2927         }
2928     }
2929 #endif
2930
2931   re_token_t br_token;
2932   re_bitset_ptr_t sbcset;
2933 #ifdef RE_ENABLE_I18N
2934   re_charset_t *mbcset;
2935   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2936   int equiv_class_alloc = 0, char_class_alloc = 0;
2937 #endif /* not RE_ENABLE_I18N */
2938   int non_match = 0;
2939   bin_tree_t *work_tree;
2940   int token_len;
2941   int first_round = 1;
2942 #ifdef _LIBC
2943   collseqmb = (const unsigned char *)
2944     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2945   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2946   if (nrules)
2947     {
2948       /*
2949       if (MB_CUR_MAX > 1)
2950       */
2951         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2952       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2953       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2954                                                   _NL_COLLATE_SYMB_TABLEMB);
2955       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2956                                                    _NL_COLLATE_SYMB_EXTRAMB);
2957     }
2958 #endif
2959   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2960 #ifdef RE_ENABLE_I18N
2961   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2962 #endif /* RE_ENABLE_I18N */
2963 #ifdef RE_ENABLE_I18N
2964   if (BE (sbcset == NULL || mbcset == NULL, 0))
2965 #else
2966   if (BE (sbcset == NULL, 0))
2967 #endif /* RE_ENABLE_I18N */
2968     {
2969       *err = REG_ESPACE;
2970       return NULL;
2971     }
2972
2973   token_len = peek_token_bracket (token, regexp, syntax);
2974   if (BE (token->type == END_OF_RE, 0))
2975     {
2976       *err = REG_BADPAT;
2977       goto parse_bracket_exp_free_return;
2978     }
2979   if (token->type == OP_NON_MATCH_LIST)
2980     {
2981 #ifdef RE_ENABLE_I18N
2982       mbcset->non_match = 1;
2983 #endif /* not RE_ENABLE_I18N */
2984       non_match = 1;
2985       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2986         bitset_set (sbcset, '\0');
2987       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2988       token_len = peek_token_bracket (token, regexp, syntax);
2989       if (BE (token->type == END_OF_RE, 0))
2990         {
2991           *err = REG_BADPAT;
2992           goto parse_bracket_exp_free_return;
2993         }
2994     }
2995
2996   /* We treat the first ']' as a normal character.  */
2997   if (token->type == OP_CLOSE_BRACKET)
2998     token->type = CHARACTER;
2999
3000   while (1)
3001     {
3002       bracket_elem_t start_elem, end_elem;
3003       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3004       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3005       reg_errcode_t ret;
3006       int token_len2 = 0, is_range_exp = 0;
3007       re_token_t token2;
3008
3009       start_elem.opr.name = start_name_buf;
3010       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3011                                    syntax, first_round);
3012       if (BE (ret != REG_NOERROR, 0))
3013         {
3014           *err = ret;
3015           goto parse_bracket_exp_free_return;
3016         }
3017       first_round = 0;
3018
3019       /* Get information about the next token.  We need it in any case.  */
3020       token_len = peek_token_bracket (token, regexp, syntax);
3021
3022       /* Do not check for ranges if we know they are not allowed.  */
3023       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3024         {
3025           if (BE (token->type == END_OF_RE, 0))
3026             {
3027               *err = REG_EBRACK;
3028               goto parse_bracket_exp_free_return;
3029             }
3030           if (token->type == OP_CHARSET_RANGE)
3031             {
3032               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3033               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3034               if (BE (token2.type == END_OF_RE, 0))
3035                 {
3036                   *err = REG_EBRACK;
3037                   goto parse_bracket_exp_free_return;
3038                 }
3039               if (token2.type == OP_CLOSE_BRACKET)
3040                 {
3041                   /* We treat the last '-' as a normal character.  */
3042                   re_string_skip_bytes (regexp, -token_len);
3043                   token->type = CHARACTER;
3044                 }
3045               else
3046                 is_range_exp = 1;
3047             }
3048         }
3049
3050       if (is_range_exp == 1)
3051         {
3052           end_elem.opr.name = end_name_buf;
3053           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3054                                        dfa, syntax, 1);
3055           if (BE (ret != REG_NOERROR, 0))
3056             {
3057               *err = ret;
3058               goto parse_bracket_exp_free_return;
3059             }
3060
3061           token_len = peek_token_bracket (token, regexp, syntax);
3062
3063 #ifdef _LIBC
3064           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3065                                   &start_elem, &end_elem);
3066 #else
3067 # ifdef RE_ENABLE_I18N
3068           *err = build_range_exp (sbcset,
3069                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3070                                   &range_alloc, &start_elem, &end_elem);
3071 # else
3072           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3073 # endif
3074 #endif /* RE_ENABLE_I18N */
3075           if (BE (*err != REG_NOERROR, 0))
3076             goto parse_bracket_exp_free_return;
3077         }
3078       else
3079         {
3080           switch (start_elem.type)
3081             {
3082             case SB_CHAR:
3083               bitset_set (sbcset, start_elem.opr.ch);
3084               break;
3085 #ifdef RE_ENABLE_I18N
3086             case MB_CHAR:
3087               /* Check whether the array has enough space.  */
3088               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3089                 {
3090                   wchar_t *new_mbchars;
3091                   /* Not enough, realloc it.  */
3092                   /* +1 in case of mbcset->nmbchars is 0.  */
3093                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3094                   /* Use realloc since array is NULL if *alloc == 0.  */
3095                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3096                                             mbchar_alloc);
3097                   if (BE (new_mbchars == NULL, 0))
3098                     goto parse_bracket_exp_espace;
3099                   mbcset->mbchars = new_mbchars;
3100                 }
3101               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3102               break;
3103 #endif /* RE_ENABLE_I18N */
3104             case EQUIV_CLASS:
3105               *err = build_equiv_class (sbcset,
3106 #ifdef RE_ENABLE_I18N
3107                                         mbcset, &equiv_class_alloc,
3108 #endif /* RE_ENABLE_I18N */
3109                                         start_elem.opr.name);
3110               if (BE (*err != REG_NOERROR, 0))
3111                 goto parse_bracket_exp_free_return;
3112               break;
3113             case COLL_SYM:
3114               *err = build_collating_symbol (sbcset,
3115 #ifdef RE_ENABLE_I18N
3116                                              mbcset, &coll_sym_alloc,
3117 #endif /* RE_ENABLE_I18N */
3118                                              start_elem.opr.name);
3119               if (BE (*err != REG_NOERROR, 0))
3120                 goto parse_bracket_exp_free_return;
3121               break;
3122             case CHAR_CLASS:
3123               *err = build_charclass (regexp->trans, sbcset,
3124 #ifdef RE_ENABLE_I18N
3125                                       mbcset, &char_class_alloc,
3126 #endif /* RE_ENABLE_I18N */
3127                                       start_elem.opr.name, syntax);
3128               if (BE (*err != REG_NOERROR, 0))
3129                goto parse_bracket_exp_free_return;
3130               break;
3131             default:
3132               assert (0);
3133               break;
3134             }
3135         }
3136       if (BE (token->type == END_OF_RE, 0))
3137         {
3138           *err = REG_EBRACK;
3139           goto parse_bracket_exp_free_return;
3140         }
3141       if (token->type == OP_CLOSE_BRACKET)
3142         break;
3143     }
3144
3145   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3146
3147   /* If it is non-matching list.  */
3148   if (non_match)
3149     bitset_not (sbcset);
3150
3151 #ifdef RE_ENABLE_I18N
3152   /* Ensure only single byte characters are set.  */
3153   if (dfa->mb_cur_max > 1)
3154     bitset_mask (sbcset, dfa->sb_char);
3155 #endif /* RE_ENABLE_I18N */
3156
3157   /* Build a tree for simple bracket.  */
3158   br_token.type = SIMPLE_BRACKET;
3159   br_token.opr.sbcset = sbcset;
3160   work_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3161   if (BE (work_tree == NULL, 0))
3162     goto parse_bracket_exp_espace;
3163
3164 #ifdef RE_ENABLE_I18N
3165   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3166       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3167                                                      || mbcset->non_match)))
3168     {
3169       re_token_t alt_token;
3170       bin_tree_t *mbc_tree;
3171       int sbc_idx;
3172       /* Build a tree for complex bracket.  */
3173       dfa->has_mb_node = 1;
3174       for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3175         if (sbcset[sbc_idx])
3176           break;
3177       /* If there are no bits set in sbcset, there is no point
3178          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3179       if (sbc_idx == BITSET_UINTS)
3180         {
3181           re_free (sbcset);
3182           dfa->nodes[work_tree->node_idx].type = COMPLEX_BRACKET;
3183           dfa->nodes[work_tree->node_idx].opr.mbcset = mbcset;
3184           return work_tree;
3185         }
3186       br_token.type = COMPLEX_BRACKET;
3187       br_token.opr.mbcset = mbcset;
3188       mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3189       if (BE (mbc_tree == NULL, 0))
3190         goto parse_bracket_exp_espace;
3191       /* Then join them by ALT node.  */
3192       alt_token.type = OP_ALT;
3193       dfa->has_plural_match = 1;
3194       work_tree = re_dfa_add_tree_node (dfa, work_tree, mbc_tree, &alt_token);
3195       if (BE (mbc_tree != NULL, 1))
3196         return work_tree;
3197     }
3198   else
3199     {
3200       free_charset (mbcset);
3201       return work_tree;
3202     }
3203 #else /* not RE_ENABLE_I18N */
3204   return work_tree;
3205 #endif /* not RE_ENABLE_I18N */
3206
3207  parse_bracket_exp_espace:
3208   *err = REG_ESPACE;
3209  parse_bracket_exp_free_return:
3210   re_free (sbcset);
3211 #ifdef RE_ENABLE_I18N
3212   free_charset (mbcset);
3213 #endif /* RE_ENABLE_I18N */
3214   return NULL;
3215 }
3216
3217 /* Parse an element in the bracket expression.  */
3218
3219 static reg_errcode_t
3220 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3221                        accept_hyphen)
3222      bracket_elem_t *elem;
3223      re_string_t *regexp;
3224      re_token_t *token;
3225      int token_len;
3226      re_dfa_t *dfa;
3227      reg_syntax_t syntax;
3228      int accept_hyphen;
3229 {
3230 #ifdef RE_ENABLE_I18N
3231   int cur_char_size;
3232   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3233   if (cur_char_size > 1)
3234     {
3235       elem->type = MB_CHAR;
3236       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3237       re_string_skip_bytes (regexp, cur_char_size);
3238       return REG_NOERROR;
3239     }
3240 #endif /* RE_ENABLE_I18N */
3241   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3242   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3243       || token->type == OP_OPEN_EQUIV_CLASS)
3244     return parse_bracket_symbol (elem, regexp, token);
3245   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3246     {
3247       /* A '-' must only appear as anything but a range indicator before
3248          the closing bracket.  Everything else is an error.  */
3249       re_token_t token2;
3250       (void) peek_token_bracket (&token2, regexp, syntax);
3251       if (token2.type != OP_CLOSE_BRACKET)
3252         /* The actual error value is not standardized since this whole
3253            case is undefined.  But ERANGE makes good sense.  */
3254         return REG_ERANGE;
3255     }
3256   elem->type = SB_CHAR;
3257   elem->opr.ch = token->opr.c;
3258   return REG_NOERROR;
3259 }
3260
3261 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3262    such as [:<character_class>:], [.<collating_element>.], and
3263    [=<equivalent_class>=].  */
3264
3265 static reg_errcode_t
3266 parse_bracket_symbol (elem, regexp, token)
3267      bracket_elem_t *elem;
3268      re_string_t *regexp;
3269      re_token_t *token;
3270 {
3271   unsigned char ch, delim = token->opr.c;
3272   int i = 0;
3273   if (re_string_eoi(regexp))
3274     return REG_EBRACK;
3275   for (;; ++i)
3276     {
3277       if (i >= BRACKET_NAME_BUF_SIZE)
3278         return REG_EBRACK;
3279       if (token->type == OP_OPEN_CHAR_CLASS)
3280         ch = re_string_fetch_byte_case (regexp);
3281       else
3282         ch = re_string_fetch_byte (regexp);
3283       if (re_string_eoi(regexp))
3284         return REG_EBRACK;
3285       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3286         break;
3287       elem->opr.name[i] = ch;
3288     }
3289   re_string_skip_bytes (regexp, 1);
3290   elem->opr.name[i] = '\0';
3291   switch (token->type)
3292     {
3293     case OP_OPEN_COLL_ELEM:
3294       elem->type = COLL_SYM;
3295       break;
3296     case OP_OPEN_EQUIV_CLASS:
3297       elem->type = EQUIV_CLASS;
3298       break;
3299     case OP_OPEN_CHAR_CLASS:
3300       elem->type = CHAR_CLASS;
3301       break;
3302     default:
3303       break;
3304     }
3305   return REG_NOERROR;
3306 }
3307
3308   /* Helper function for parse_bracket_exp.
3309      Build the equivalence class which is represented by NAME.
3310      The result are written to MBCSET and SBCSET.
3311      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3312      is a pointer argument sinse we may update it.  */
3313
3314 static reg_errcode_t
3315 #ifdef RE_ENABLE_I18N
3316 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3317      re_charset_t *mbcset;
3318      int *equiv_class_alloc;
3319 #else /* not RE_ENABLE_I18N */
3320 build_equiv_class (sbcset, name)
3321 #endif /* not RE_ENABLE_I18N */
3322      re_bitset_ptr_t sbcset;
3323      const unsigned char *name;
3324 {
3325 #if defined _LIBC
3326   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3327   if (nrules != 0)
3328     {
3329       const int32_t *table, *indirect;
3330       const unsigned char *weights, *extra, *cp;
3331       unsigned char char_buf[2];
3332       int32_t idx1, idx2;
3333       unsigned int ch;
3334       size_t len;
3335       /* This #include defines a local function!  */
3336 # include <locale/weight.h>
3337       /* Calculate the index for equivalence class.  */
3338       cp = name;
3339       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3340       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3341                                                _NL_COLLATE_WEIGHTMB);
3342       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3343                                                    _NL_COLLATE_EXTRAMB);
3344       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3345                                                 _NL_COLLATE_INDIRECTMB);
3346       idx1 = findidx (&cp);
3347       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3348         /* This isn't a valid character.  */
3349         return REG_ECOLLATE;
3350
3351       /* Build single byte matcing table for this equivalence class.  */
3352       char_buf[1] = (unsigned char) '\0';
3353       len = weights[idx1];
3354       for (ch = 0; ch < SBC_MAX; ++ch)
3355         {
3356           char_buf[0] = ch;
3357           cp = char_buf;
3358           idx2 = findidx (&cp);
3359 /*
3360           idx2 = table[ch];
3361 */
3362           if (idx2 == 0)
3363             /* This isn't a valid character.  */
3364             continue;
3365           if (len == weights[idx2])
3366             {
3367               int cnt = 0;
3368               while (cnt <= len &&
3369                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3370                 ++cnt;
3371
3372               if (cnt > len)
3373                 bitset_set (sbcset, ch);
3374             }
3375         }
3376       /* Check whether the array has enough space.  */
3377       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3378         {
3379           /* Not enough, realloc it.  */
3380           /* +1 in case of mbcset->nequiv_classes is 0.  */
3381           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3382           /* Use realloc since the array is NULL if *alloc == 0.  */
3383           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3384                                                    int32_t,
3385                                                    new_equiv_class_alloc);
3386           if (BE (new_equiv_classes == NULL, 0))
3387             return REG_ESPACE;
3388           mbcset->equiv_classes = new_equiv_classes;
3389           *equiv_class_alloc = new_equiv_class_alloc;
3390         }
3391       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3392     }
3393   else
3394 #endif /* _LIBC */
3395     {
3396       if (BE (strlen ((const char *) name) != 1, 0))
3397         return REG_ECOLLATE;
3398       bitset_set (sbcset, *name);
3399     }
3400   return REG_NOERROR;
3401 }
3402
3403   /* Helper function for parse_bracket_exp.
3404      Build the character class which is represented by NAME.
3405      The result are written to MBCSET and SBCSET.
3406      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3407      is a pointer argument sinse we may update it.  */
3408
3409 static reg_errcode_t
3410 #ifdef RE_ENABLE_I18N
3411 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3412      re_charset_t *mbcset;
3413      int *char_class_alloc;
3414 #else /* not RE_ENABLE_I18N */
3415 build_charclass (trans, sbcset, class_name, syntax)
3416 #endif /* not RE_ENABLE_I18N */
3417      unsigned RE_TRANSLATE_TYPE trans;
3418      re_bitset_ptr_t sbcset;
3419      const unsigned char *class_name;
3420      reg_syntax_t syntax;
3421 {
3422   int i;
3423   const char *name = (const char *) class_name;
3424
3425   /* In case of REG_ICASE "upper" and "lower" match the both of
3426      upper and lower cases.  */
3427   if ((syntax & RE_ICASE)
3428       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3429     name = "alpha";
3430
3431 #ifdef RE_ENABLE_I18N
3432   /* Check the space of the arrays.  */
3433   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3434     {
3435       /* Not enough, realloc it.  */
3436       /* +1 in case of mbcset->nchar_classes is 0.  */
3437       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3438       /* Use realloc since array is NULL if *alloc == 0.  */
3439       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3440                                                new_char_class_alloc);
3441       if (BE (new_char_classes == NULL, 0))
3442         return REG_ESPACE;
3443       mbcset->char_classes = new_char_classes;
3444       *char_class_alloc = new_char_class_alloc;
3445     }
3446   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3447 #endif /* RE_ENABLE_I18N */
3448
3449 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3450     for (i = 0; i < SBC_MAX; ++i)               \
3451       {                                         \
3452         if (ctype_func (i))                     \
3453           {                                     \
3454             int ch = trans ? trans[i] : i;      \
3455             bitset_set (sbcset, ch);            \
3456           }                                     \
3457       }
3458
3459   if (strcmp (name, "alnum") == 0)
3460     BUILD_CHARCLASS_LOOP (isalnum)
3461   else if (strcmp (name, "cntrl") == 0)
3462     BUILD_CHARCLASS_LOOP (iscntrl)
3463   else if (strcmp (name, "lower") == 0)
3464     BUILD_CHARCLASS_LOOP (islower)
3465   else if (strcmp (name, "space") == 0)
3466     BUILD_CHARCLASS_LOOP (isspace)
3467   else if (strcmp (name, "alpha") == 0)
3468     BUILD_CHARCLASS_LOOP (isalpha)
3469   else if (strcmp (name, "digit") == 0)
3470     BUILD_CHARCLASS_LOOP (isdigit)
3471   else if (strcmp (name, "print") == 0)
3472     BUILD_CHARCLASS_LOOP (isprint)
3473   else if (strcmp (name, "upper") == 0)
3474     BUILD_CHARCLASS_LOOP (isupper)
3475   else if (strcmp (name, "blank") == 0)
3476     BUILD_CHARCLASS_LOOP (isblank)
3477   else if (strcmp (name, "graph") == 0)
3478     BUILD_CHARCLASS_LOOP (isgraph)
3479   else if (strcmp (name, "punct") == 0)
3480     BUILD_CHARCLASS_LOOP (ispunct)
3481   else if (strcmp (name, "xdigit") == 0)
3482     BUILD_CHARCLASS_LOOP (isxdigit)
3483   else
3484     return REG_ECTYPE;
3485
3486   return REG_NOERROR;
3487 }
3488
3489 static bin_tree_t *
3490 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3491      re_dfa_t *dfa;
3492      unsigned RE_TRANSLATE_TYPE trans;
3493      const unsigned char *class_name;
3494      const unsigned char *extra;
3495      int non_match;
3496      reg_errcode_t *err;
3497 {
3498   re_bitset_ptr_t sbcset;
3499 #ifdef RE_ENABLE_I18N
3500   re_charset_t *mbcset;
3501   int alloc = 0;
3502 #endif /* not RE_ENABLE_I18N */
3503   reg_errcode_t ret;
3504   re_token_t br_token;
3505   bin_tree_t *tree;
3506
3507   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3508 #ifdef RE_ENABLE_I18N
3509   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3510 #endif /* RE_ENABLE_I18N */
3511
3512 #ifdef RE_ENABLE_I18N
3513   if (BE (sbcset == NULL || mbcset == NULL, 0))
3514 #else /* not RE_ENABLE_I18N */
3515   if (BE (sbcset == NULL, 0))
3516 #endif /* not RE_ENABLE_I18N */
3517     {
3518       *err = REG_ESPACE;
3519       return NULL;
3520     }
3521
3522   if (non_match)
3523     {
3524 #ifdef RE_ENABLE_I18N
3525       /*
3526       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3527         bitset_set(cset->sbcset, '\0');
3528       */
3529       mbcset->non_match = 1;
3530 #endif /* not RE_ENABLE_I18N */
3531     }
3532
3533   /* We don't care the syntax in this case.  */
3534   ret = build_charclass (trans, sbcset,
3535 #ifdef RE_ENABLE_I18N
3536                          mbcset, &alloc,
3537 #endif /* RE_ENABLE_I18N */
3538                          class_name, 0);
3539
3540   if (BE (ret != REG_NOERROR, 0))
3541     {
3542       re_free (sbcset);
3543 #ifdef RE_ENABLE_I18N
3544       free_charset (mbcset);
3545 #endif /* RE_ENABLE_I18N */
3546       *err = ret;
3547       return NULL;
3548     }
3549   /* \w match '_' also.  */
3550   for (; *extra; extra++)
3551     bitset_set (sbcset, *extra);
3552
3553   /* If it is non-matching list.  */
3554   if (non_match)
3555     bitset_not (sbcset);
3556
3557 #ifdef RE_ENABLE_I18N
3558   /* Ensure only single byte characters are set.  */
3559   if (dfa->mb_cur_max > 1)
3560     bitset_mask (sbcset, dfa->sb_char);
3561 #endif
3562
3563   /* Build a tree for simple bracket.  */
3564   br_token.type = SIMPLE_BRACKET;
3565   br_token.opr.sbcset = sbcset;
3566   tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3567   if (BE (tree == NULL, 0))
3568     goto build_word_op_espace;
3569
3570 #ifdef RE_ENABLE_I18N
3571   if (dfa->mb_cur_max > 1)
3572     {
3573       re_token_t alt_token;
3574       bin_tree_t *mbc_tree;
3575       /* Build a tree for complex bracket.  */
3576       br_token.type = COMPLEX_BRACKET;
3577       br_token.opr.mbcset = mbcset;
3578       dfa->has_mb_node = 1;
3579       mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3580       if (BE (mbc_tree == NULL, 0))
3581         goto build_word_op_espace;
3582       /* Then join them by ALT node.  */
3583       alt_token.type = OP_ALT;
3584       dfa->has_plural_match = 1;
3585       tree = re_dfa_add_tree_node (dfa, tree, mbc_tree, &alt_token);
3586       if (BE (mbc_tree != NULL, 1))
3587         return tree;
3588     }
3589   else
3590     {
3591       free_charset (mbcset);
3592       return tree;
3593     }
3594 #else /* not RE_ENABLE_I18N */
3595   return tree;
3596 #endif /* not RE_ENABLE_I18N */
3597
3598  build_word_op_espace:
3599   re_free (sbcset);
3600 #ifdef RE_ENABLE_I18N
3601   free_charset (mbcset);
3602 #endif /* RE_ENABLE_I18N */
3603   *err = REG_ESPACE;
3604   return NULL;
3605 }
3606
3607 /* This is intended for the expressions like "a{1,3}".
3608    Fetch a number from `input', and return the number.
3609    Return -1, if the number field is empty like "{,1}".
3610    Return -2, If an error is occured.  */
3611
3612 static int
3613 fetch_number (input, token, syntax)
3614      re_string_t *input;
3615      re_token_t *token;
3616      reg_syntax_t syntax;
3617 {
3618   int num = -1;
3619   unsigned char c;
3620   while (1)
3621     {
3622       fetch_token (token, input, syntax);
3623       c = token->opr.c;
3624       if (BE (token->type == END_OF_RE, 0))
3625         return -2;
3626       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3627         break;
3628       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3629              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3630       num = (num > RE_DUP_MAX) ? -2 : num;
3631     }
3632   return num;
3633 }
3634 \f
3635 #ifdef RE_ENABLE_I18N
3636 static void
3637 free_charset (re_charset_t *cset)
3638 {
3639   re_free (cset->mbchars);
3640 # ifdef _LIBC
3641   re_free (cset->coll_syms);
3642   re_free (cset->equiv_classes);
3643   re_free (cset->range_starts);
3644   re_free (cset->range_ends);
3645 # endif
3646   re_free (cset->char_classes);
3647   re_free (cset);
3648 }
3649 #endif /* RE_ENABLE_I18N */
3650 \f
3651 /* Functions for binary tree operation.  */
3652
3653 /* Create a tree node.  */
3654
3655 static bin_tree_t *
3656 create_tree (dfa, left, right, type, index)
3657      re_dfa_t *dfa;
3658      bin_tree_t *left;
3659      bin_tree_t *right;
3660      re_token_type_t type;
3661      int index;
3662 {
3663   bin_tree_t *tree;
3664   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3665     {
3666       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3667
3668       if (storage == NULL)
3669         return NULL;
3670       storage->next = dfa->str_tree_storage;
3671       dfa->str_tree_storage = storage;
3672       dfa->str_tree_storage_idx = 0;
3673     }
3674   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3675
3676   tree->parent = NULL;
3677   tree->left = left;
3678   tree->right = right;
3679   tree->type = type;
3680   tree->node_idx = index;
3681   tree->first = -1;
3682   tree->next = -1;
3683   re_node_set_init_empty (&tree->eclosure);
3684
3685   if (left != NULL)
3686     left->parent = tree;
3687   if (right != NULL)
3688     right->parent = tree;
3689   return tree;
3690 }
3691
3692 /* Create both a DFA node and a tree for it.  */
3693
3694 static bin_tree_t *
3695 re_dfa_add_tree_node (dfa, left, right, token)
3696      re_dfa_t *dfa;
3697      bin_tree_t *left;
3698      bin_tree_t *right;
3699      const re_token_t *token;
3700 {
3701   int new_idx = re_dfa_add_node (dfa, *token, 0);
3702
3703   if (new_idx == -1)
3704     return NULL;
3705
3706   return create_tree (dfa, left, right, 0, new_idx);
3707 }
3708
3709 /* Mark the tree SRC as an optional subexpression.  */
3710
3711 static void
3712 mark_opt_subexp (src, dfa)
3713      const bin_tree_t *src;
3714      re_dfa_t *dfa;
3715 {
3716   /* Pass an OPT_SUBEXP_IDX which is != 1 if the duplicated tree is
3717      a subexpression.  */
3718   if (src->type == CONCAT
3719       && src->left->type == NON_TYPE
3720       && dfa->nodes[src->left->node_idx].type == OP_OPEN_SUBEXP)
3721     mark_opt_subexp_iter (src, dfa, dfa->nodes[src->left->node_idx].opr.idx);
3722 }
3723
3724
3725 /* Recursive tree walker for mark_opt_subexp.  */
3726
3727 static void
3728 mark_opt_subexp_iter (src, dfa, idx)
3729      const bin_tree_t *src;
3730      re_dfa_t *dfa;
3731      int idx;
3732 {
3733   int node_idx;
3734
3735   if (src->type == NON_TYPE)
3736     {
3737       node_idx = src->node_idx;
3738       if ((dfa->nodes[node_idx].type == OP_OPEN_SUBEXP
3739            || dfa->nodes[node_idx].type == OP_CLOSE_SUBEXP)
3740           && dfa->nodes[node_idx].opr.idx == idx)
3741         dfa->nodes[node_idx].opt_subexp = 1;
3742      }
3743
3744   if (src->left != NULL)
3745     mark_opt_subexp_iter (src->left, dfa, idx);
3746
3747   if (src->right != NULL)
3748     mark_opt_subexp_iter (src->right, dfa, idx);
3749 }
3750
3751
3752 /* Duplicate the node SRC, and return new node.  */
3753
3754 static bin_tree_t *
3755 duplicate_tree (src, dfa)
3756      const bin_tree_t *src;
3757      re_dfa_t *dfa;
3758 {
3759   bin_tree_t *left = NULL, *right = NULL, *new_tree;
3760   int new_node_idx;
3761   /* Since node indies must be according to Post-order of the tree,
3762      we must duplicate the left at first.  */
3763   if (src->left != NULL)
3764     {
3765       left = duplicate_tree (src->left, dfa);
3766       if (left == NULL)
3767         return NULL;
3768     }
3769
3770   /* Secondaly, duplicate the right.  */
3771   if (src->right != NULL)
3772     {
3773       right = duplicate_tree (src->right, dfa);
3774       if (right == NULL)
3775         return NULL;
3776     }
3777
3778   /* At last, duplicate itself.  */
3779   if (src->type == NON_TYPE)
3780     {
3781       new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3782       dfa->nodes[new_node_idx].duplicated = 1;
3783       if (BE (new_node_idx == -1, 0))
3784         return NULL;
3785     }
3786   else
3787     new_node_idx = src->type;
3788
3789   new_tree = create_tree (dfa, left, right, src->type, new_node_idx);
3790   return new_tree;
3791 }