posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           int length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  27 static void init_word_char (re_dfa_t *dfa);
  28 #ifdef RE_ENABLE_I18N
  29 static void free_charset (re_charset_t *cset);
  30 #endif /* RE_ENABLE_I18N */
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 #ifdef RE_ENABLE_I18N
  34 static void optimize_utf8 (re_dfa_t *dfa);
  35 #endif
  36 static reg_errcode_t analyze (re_dfa_t *dfa);
  37 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
  38 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
  39 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
  40 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
  41 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
  42                                              int top_clone_node, int root_node,
  43                                              unsigned int constraint);
  44 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
  45                                      unsigned int constraint);
  46 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
  47                                    unsigned int constraint);
  48 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  49 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  50                                          int node, int root);
  51 static void calc_inveclosure (re_dfa_t *dfa);
  52 static int fetch_number (re_string_t *input, re_token_t *token,
  53                          reg_syntax_t syntax);
  54 static void fetch_token (re_token_t *result, re_string_t *input,
  55                          reg_syntax_t syntax);
  56 static int peek_token (re_token_t *token, re_string_t *input,
  57                         reg_syntax_t syntax);
  58 static int peek_token_bracket (re_token_t *token, re_string_t *input,
  59                                reg_syntax_t syntax);
  60 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  61                           reg_syntax_t syntax, reg_errcode_t *err);
  62 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  63                                   re_token_t *token, reg_syntax_t syntax,
  64                                   int nest, reg_errcode_t *err);
  65 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  66                                  re_token_t *token, reg_syntax_t syntax,
  67                                  int nest, reg_errcode_t *err);
  68 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  69                                      re_token_t *token, reg_syntax_t syntax,
  70                                      int nest, reg_errcode_t *err);
  71 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  72                                   re_token_t *token, reg_syntax_t syntax,
  73                                   int nest, reg_errcode_t *err);
  74 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  75                                  re_dfa_t *dfa, re_token_t *token,
  76                                  reg_syntax_t syntax, reg_errcode_t *err);
  77 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  78                                       re_token_t *token, reg_syntax_t syntax,
  79                                       reg_errcode_t *err);
  80 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  81                                             re_string_t *regexp,
  82                                             re_token_t *token, int token_len,
  83                                             re_dfa_t *dfa,
  84                                             reg_syntax_t syntax,
  85                                             int accept_hyphen);
  86 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  87                                           re_string_t *regexp,
  88                                           re_token_t *token);
  89 #ifndef _LIBC
  90 # ifdef RE_ENABLE_I18N
  91 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
  92                                       re_charset_t *mbcset, int *range_alloc,
  93                                       bracket_elem_t *start_elem,
  94                                       bracket_elem_t *end_elem);
  95 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
  96                                              re_charset_t *mbcset,
  97                                              int *coll_sym_alloc,
  98                                              const unsigned char *name);
  99 # else /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 101                                       bracket_elem_t *start_elem,
 102                                       bracket_elem_t *end_elem);
 103 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 104                                              const unsigned char *name);
 105 # endif /* not RE_ENABLE_I18N */
 106 #endif /* not _LIBC */
 107 #ifdef RE_ENABLE_I18N
 108 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 109                                         re_charset_t *mbcset,
 110                                         int *equiv_class_alloc,
 111                                         const unsigned char *name);
 112 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 113                                       re_bitset_ptr_t sbcset,
 114                                       re_charset_t *mbcset,
 115                                       int *char_class_alloc,
 116                                       const unsigned char *class_name,
 117                                       reg_syntax_t syntax);
 118 #else  /* not RE_ENABLE_I18N */
 119 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 120                                         const unsigned char *name);
 121 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 122                                       re_bitset_ptr_t sbcset,
 123                                       const unsigned char *class_name,
 124                                       reg_syntax_t syntax);
 125 #endif /* not RE_ENABLE_I18N */
 126 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 127                                        unsigned RE_TRANSLATE_TYPE trans,
 128                                        const unsigned char *class_name,
 129                                        const unsigned char *extra,
 130                                        int non_match, reg_errcode_t *err);
 131 static bin_tree_t *create_tree (re_dfa_t *dfa,
 132                                 bin_tree_t *left, bin_tree_t *right,
 133                                 re_token_type_t type, int index);
 134 static bin_tree_t *re_dfa_add_tree_node (re_dfa_t *dfa,
 135                                          bin_tree_t *left, bin_tree_t *right,
 136                                          const re_token_t *token)
 137   __attribute ((noinline));
 138 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 139 static void mark_opt_subexp (const bin_tree_t *src, re_dfa_t *dfa);
 140 static void mark_opt_subexp_iter (const bin_tree_t *src, re_dfa_t *dfa, int idx);
 141 \f
 142 /* This table gives an error message for each of the error codes listed
 143    in regex.h.  Obviously the order here has to be same as there.
 144    POSIX doesn't require that we do anything for REG_NOERROR,
 145    but why not be nice?  */
 146
 147 const char __re_error_msgid[] attribute_hidden =
 148   {
 149 #define REG_NOERROR_IDX 0
 150     gettext_noop ("Success")    /* REG_NOERROR */
 151     "\0"
 152 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 153     gettext_noop ("No match")   /* REG_NOMATCH */
 154     "\0"
 155 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 156     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 157     "\0"
 158 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 159     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 160     "\0"
 161 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 162     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 163     "\0"
 164 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 165     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 166     "\0"
 167 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 168     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 169     "\0"
 170 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 171     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 172     "\0"
 173 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 174     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 175     "\0"
 176 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 177     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 178     "\0"
 179 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 180     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 181     "\0"
 182 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 183     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 184     "\0"
 185 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 186     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 187     "\0"
 188 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 189     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 190     "\0"
 191 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 192     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 193     "\0"
 194 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 195     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 196     "\0"
 197 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 198     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 199   };
 200
 201 const size_t __re_error_msgid_idx[] attribute_hidden =
 202   {
 203     REG_NOERROR_IDX,
 204     REG_NOMATCH_IDX,
 205     REG_BADPAT_IDX,
 206     REG_ECOLLATE_IDX,
 207     REG_ECTYPE_IDX,
 208     REG_EESCAPE_IDX,
 209     REG_ESUBREG_IDX,
 210     REG_EBRACK_IDX,
 211     REG_EPAREN_IDX,
 212     REG_EBRACE_IDX,
 213     REG_BADBR_IDX,
 214     REG_ERANGE_IDX,
 215     REG_ESPACE_IDX,
 216     REG_BADRPT_IDX,
 217     REG_EEND_IDX,
 218     REG_ESIZE_IDX,
 219     REG_ERPAREN_IDX
 220   };
 221 \f
 222 /* Entry points for GNU code.  */
 223
 224 /* re_compile_pattern is the GNU regular expression compiler: it
 225    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 226    Returns 0 if the pattern was valid, otherwise an error string.
 227
 228    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 229    are set in BUFP on entry.  */
 230
 231 const char *
 232 re_compile_pattern (pattern, length, bufp)
 233     const char *pattern;
 234     size_t length;
 235     struct re_pattern_buffer *bufp;
 236 {
 237   reg_errcode_t ret;
 238
 239   /* And GNU code determines whether or not to get register information
 240      by passing null for the REGS argument to re_match, etc., not by
 241      setting no_sub.  */
 242   bufp->no_sub = 0;
 243
 244   /* Match anchors at newline.  */
 245   bufp->newline_anchor = 1;
 246
 247   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 248
 249   if (!ret)
 250     return NULL;
 251   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 252 }
 253 #ifdef _LIBC
 254 weak_alias (__re_compile_pattern, re_compile_pattern)
 255 #endif
 256
 257 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 258    also be assigned to arbitrarily: each pattern buffer stores its own
 259    syntax, so it can be changed between regex compilations.  */
 260 /* This has no initializer because initialized variables in Emacs
 261    become read-only after dumping.  */
 262 reg_syntax_t re_syntax_options;
 263
 264
 265 /* Specify the precise syntax of regexps for compilation.  This provides
 266    for compatibility for various utilities which historically have
 267    different, incompatible syntaxes.
 268
 269    The argument SYNTAX is a bit mask comprised of the various bits
 270    defined in regex.h.  We return the old syntax.  */
 271
 272 reg_syntax_t
 273 re_set_syntax (syntax)
 274     reg_syntax_t syntax;
 275 {
 276   reg_syntax_t ret = re_syntax_options;
 277
 278   re_syntax_options = syntax;
 279   return ret;
 280 }
 281 #ifdef _LIBC
 282 weak_alias (__re_set_syntax, re_set_syntax)
 283 #endif
 284
 285 int
 286 re_compile_fastmap (bufp)
 287     struct re_pattern_buffer *bufp;
 288 {
 289   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 290   char *fastmap = bufp->fastmap;
 291
 292   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 293   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 294   if (dfa->init_state != dfa->init_state_word)
 295     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 296   if (dfa->init_state != dfa->init_state_nl)
 297     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 298   if (dfa->init_state != dfa->init_state_begbuf)
 299     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 300   bufp->fastmap_accurate = 1;
 301   return 0;
 302 }
 303 #ifdef _LIBC
 304 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 305 #endif
 306
 307 static inline void
 308 __attribute ((always_inline))
 309 re_set_fastmap (char *fastmap, int icase, int ch)
 310 {
 311   fastmap[ch] = 1;
 312   if (icase)
 313     fastmap[tolower (ch)] = 1;
 314 }
 315
 316 /* Helper function for re_compile_fastmap.
 317    Compile fastmap for the initial_state INIT_STATE.  */
 318
 319 static void
 320 re_compile_fastmap_iter (bufp, init_state, fastmap)
 321      regex_t *bufp;
 322      const re_dfastate_t *init_state;
 323      char *fastmap;
 324 {
 325   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 326   int node_cnt;
 327   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 328   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 329     {
 330       int node = init_state->nodes.elems[node_cnt];
 331       re_token_type_t type = dfa->nodes[node].type;
 332
 333       if (type == CHARACTER)
 334         {
 335           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 336 #ifdef RE_ENABLE_I18N
 337           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 338             {
 339               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 340               wchar_t wc;
 341               mbstate_t state;
 342
 343               p = buf;
 344               *p++ = dfa->nodes[node].opr.c;
 345               while (++node < dfa->nodes_len
 346                      && dfa->nodes[node].type == CHARACTER
 347                      && dfa->nodes[node].mb_partial)
 348                 *p++ = dfa->nodes[node].opr.c;
 349               memset (&state, 0, sizeof (state));
 350               if (mbrtowc (&wc, (const char *) buf, p - buf,
 351                            &state) == p - buf
 352                   && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
 353                 re_set_fastmap (fastmap, 0, buf[0]);
 354             }
 355 #endif
 356         }
 357       else if (type == SIMPLE_BRACKET)
 358         {
 359           int i, j, ch;
 360           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 361             for (j = 0; j < UINT_BITS; ++j, ++ch)
 362               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 363                 re_set_fastmap (fastmap, icase, ch);
 364         }
 365 #ifdef RE_ENABLE_I18N
 366       else if (type == COMPLEX_BRACKET)
 367         {
 368           int i;
 369           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 370           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 371               || cset->nranges || cset->nchar_classes)
 372             {
 373 # ifdef _LIBC
 374               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 375                 {
 376                   /* In this case we want to catch the bytes which are
 377                      the first byte of any collation elements.
 378                      e.g. In da_DK, we want to catch 'a' since "aa"
 379                           is a valid collation element, and don't catch
 380                           'b' since 'b' is the only collation element
 381                           which starts from 'b'.  */
 382                   int j, ch;
 383                   const int32_t *table = (const int32_t *)
 384                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 385                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 386                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 387                       if (table[ch] < 0)
 388                         re_set_fastmap (fastmap, icase, ch);
 389                 }
 390 # else
 391               if (dfa->mb_cur_max > 1)
 392                 for (i = 0; i < SBC_MAX; ++i)
 393                   if (__btowc (i) == WEOF)
 394                     re_set_fastmap (fastmap, icase, i);
 395 # endif /* not _LIBC */
 396             }
 397           for (i = 0; i < cset->nmbchars; ++i)
 398             {
 399               char buf[256];
 400               mbstate_t state;
 401               memset (&state, '\0', sizeof (state));
 402               __wcrtomb (buf, cset->mbchars[i], &state);
 403               re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 404               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 405                 {
 406                   __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
 407                   re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 408                 }
 409             }
 410         }
 411 #endif /* RE_ENABLE_I18N */
 412       else if (type == OP_PERIOD
 413 #ifdef RE_ENABLE_I18N
 414                || type == OP_UTF8_PERIOD
 415 #endif /* RE_ENABLE_I18N */
 416                || type == END_OF_RE)
 417         {
 418           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 419           if (type == END_OF_RE)
 420             bufp->can_be_null = 1;
 421           return;
 422         }
 423     }
 424 }
 425 \f
 426 /* Entry point for POSIX code.  */
 427 /* regcomp takes a regular expression as a string and compiles it.
 428
 429    PREG is a regex_t *.  We do not expect any fields to be initialized,
 430    since POSIX says we shouldn't.  Thus, we set
 431
 432      `buffer' to the compiled pattern;
 433      `used' to the length of the compiled pattern;
 434      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 435        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 436        RE_SYNTAX_POSIX_BASIC;
 437      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 438      `fastmap' to an allocated space for the fastmap;
 439      `fastmap_accurate' to zero;
 440      `re_nsub' to the number of subexpressions in PATTERN.
 441
 442    PATTERN is the address of the pattern string.
 443
 444    CFLAGS is a series of bits which affect compilation.
 445
 446      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 447      use POSIX basic syntax.
 448
 449      If REG_NEWLINE is set, then . and [^...] don't match newline.
 450      Also, regexec will try a match beginning after every newline.
 451
 452      If REG_ICASE is set, then we considers upper- and lowercase
 453      versions of letters to be equivalent when matching.
 454
 455      If REG_NOSUB is set, then when PREG is passed to regexec, that
 456      routine will report only success or failure, and nothing about the
 457      registers.
 458
 459    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 460    the return codes and their meanings.)  */
 461
 462 int
 463 regcomp (preg, pattern, cflags)
 464     regex_t *__restrict preg;
 465     const char *__restrict pattern;
 466     int cflags;
 467 {
 468   reg_errcode_t ret;
 469   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 470                          : RE_SYNTAX_POSIX_BASIC);
 471
 472   preg->buffer = NULL;
 473   preg->allocated = 0;
 474   preg->used = 0;
 475
 476   /* Try to allocate space for the fastmap.  */
 477   preg->fastmap = re_malloc (char, SBC_MAX);
 478   if (BE (preg->fastmap == NULL, 0))
 479     return REG_ESPACE;
 480
 481   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 482
 483   /* If REG_NEWLINE is set, newlines are treated differently.  */
 484   if (cflags & REG_NEWLINE)
 485     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 486       syntax &= ~RE_DOT_NEWLINE;
 487       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 488       /* It also changes the matching behavior.  */
 489       preg->newline_anchor = 1;
 490     }
 491   else
 492     preg->newline_anchor = 0;
 493   preg->no_sub = !!(cflags & REG_NOSUB);
 494   preg->translate = NULL;
 495
 496   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 497
 498   /* POSIX doesn't distinguish between an unmatched open-group and an
 499      unmatched close-group: both are REG_EPAREN.  */
 500   if (ret == REG_ERPAREN)
 501     ret = REG_EPAREN;
 502
 503   /* We have already checked preg->fastmap != NULL.  */
 504   if (BE (ret == REG_NOERROR, 1))
 505     /* Compute the fastmap now, since regexec cannot modify the pattern
 506        buffer.  This function never fails in this implementation.  */
 507     (void) re_compile_fastmap (preg);
 508   else
 509     {
 510       /* Some error occurred while compiling the expression.  */
 511       re_free (preg->fastmap);
 512       preg->fastmap = NULL;
 513     }
 514
 515   return (int) ret;
 516 }
 517 #ifdef _LIBC
 518 weak_alias (__regcomp, regcomp)
 519 #endif
 520
 521 /* Returns a message corresponding to an error code, ERRCODE, returned
 522    from either regcomp or regexec.   We don't use PREG here.  */
 523
 524 size_t
 525 regerror (errcode, preg, errbuf, errbuf_size)
 526     int errcode;
 527     const regex_t *preg;
 528     char *errbuf;
 529     size_t errbuf_size;
 530 {
 531   const char *msg;
 532   size_t msg_size;
 533
 534   if (BE (errcode < 0
 535           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 536                                / sizeof (__re_error_msgid_idx[0])), 0))
 537     /* Only error codes returned by the rest of the code should be passed
 538        to this routine.  If we are given anything else, or if other regex
 539        code generates an invalid error code, then the program has a bug.
 540        Dump core so we can fix it.  */
 541     abort ();
 542
 543   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 544
 545   msg_size = strlen (msg) + 1; /* Includes the null.  */
 546
 547   if (BE (errbuf_size != 0, 1))
 548     {
 549       if (BE (msg_size > errbuf_size, 0))
 550         {
 551 #if defined HAVE_MEMPCPY || defined _LIBC
 552           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 553 #else
 554           memcpy (errbuf, msg, errbuf_size - 1);
 555           errbuf[errbuf_size - 1] = 0;
 556 #endif
 557         }
 558       else
 559         memcpy (errbuf, msg, msg_size);
 560     }
 561
 562   return msg_size;
 563 }
 564 #ifdef _LIBC
 565 weak_alias (__regerror, regerror)
 566 #endif
 567
 568
 569 #ifdef RE_ENABLE_I18N
 570 /* This static array is used for the map to single-byte characters when
 571    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 572    it the same all the time.  UTF-8 is the preferred encoding so this is
 573    a worthwhile optimization.  */
 574 static const bitset utf8_sb_map =
 575 {
 576   /* Set the first 128 bits.  */
 577 # if UINT_MAX == 0xffffffff
 578   0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
 579 # else
 580 #  error "Add case for new unsigned int size"
 581 # endif
 582 };
 583 #endif
 584
 585
 586 static void
 587 free_dfa_content (re_dfa_t *dfa)
 588 {
 589   int i, j;
 590
 591   re_free (dfa->subexps);
 592
 593   if (dfa->nodes)
 594     for (i = 0; i < dfa->nodes_len; ++i)
 595       {
 596         re_token_t *node = dfa->nodes + i;
 597 #ifdef RE_ENABLE_I18N
 598         if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
 599           free_charset (node->opr.mbcset);
 600         else
 601 #endif /* RE_ENABLE_I18N */
 602           if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
 603             re_free (node->opr.sbcset);
 604       }
 605   re_free (dfa->nexts);
 606   for (i = 0; i < dfa->nodes_len; ++i)
 607     {
 608       if (dfa->eclosures != NULL)
 609         re_node_set_free (dfa->eclosures + i);
 610       if (dfa->inveclosures != NULL)
 611         re_node_set_free (dfa->inveclosures + i);
 612       if (dfa->edests != NULL)
 613         re_node_set_free (dfa->edests + i);
 614     }
 615   re_free (dfa->edests);
 616   re_free (dfa->eclosures);
 617   re_free (dfa->inveclosures);
 618   re_free (dfa->nodes);
 619
 620   if (dfa->state_table)
 621     for (i = 0; i <= dfa->state_hash_mask; ++i)
 622       {
 623         struct re_state_table_entry *entry = dfa->state_table + i;
 624         for (j = 0; j < entry->num; ++j)
 625           {
 626             re_dfastate_t *state = entry->array[j];
 627             free_state (state);
 628           }
 629         re_free (entry->array);
 630       }
 631   re_free (dfa->state_table);
 632 #ifdef RE_ENABLE_I18N
 633   if (dfa->sb_char != utf8_sb_map)
 634     re_free (dfa->sb_char);
 635 #endif
 636 #ifdef DEBUG
 637   re_free (dfa->re_str);
 638 #endif
 639
 640   re_free (dfa);
 641 }
 642
 643
 644 /* Free dynamically allocated space used by PREG.  */
 645
 646 void
 647 regfree (preg)
 648     regex_t *preg;
 649 {
 650   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 651   if (BE (dfa != NULL, 1))
 652     free_dfa_content (dfa);
 653   preg->buffer = NULL;
 654   preg->allocated = 0;
 655
 656   re_free (preg->fastmap);
 657   preg->fastmap = NULL;
 658
 659   re_free (preg->translate);
 660   preg->translate = NULL;
 661 }
 662 #ifdef _LIBC
 663 weak_alias (__regfree, regfree)
 664 #endif
 665 \f
 666 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 667    them unless specifically requested.  */
 668
 669 #if defined _REGEX_RE_COMP || defined _LIBC
 670
 671 /* BSD has one and only one pattern buffer.  */
 672 static struct re_pattern_buffer re_comp_buf;
 673
 674 char *
 675 # ifdef _LIBC
 676 /* Make these definitions weak in libc, so POSIX programs can redefine
 677    these names if they don't use our functions, and still use
 678    regcomp/regexec above without link errors.  */
 679 weak_function
 680 # endif
 681 re_comp (s)
 682      const char *s;
 683 {
 684   reg_errcode_t ret;
 685   char *fastmap;
 686
 687   if (!s)
 688     {
 689       if (!re_comp_buf.buffer)
 690         return gettext ("No previous regular expression");
 691       return 0;
 692     }
 693
 694   if (re_comp_buf.buffer)
 695     {
 696       fastmap = re_comp_buf.fastmap;
 697       re_comp_buf.fastmap = NULL;
 698       __regfree (&re_comp_buf);
 699       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 700       re_comp_buf.fastmap = fastmap;
 701     }
 702
 703   if (re_comp_buf.fastmap == NULL)
 704     {
 705       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 706       if (re_comp_buf.fastmap == NULL)
 707         return (char *) gettext (__re_error_msgid
 708                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 709     }
 710
 711   /* Since `re_exec' always passes NULL for the `regs' argument, we
 712      don't need to initialize the pattern buffer fields which affect it.  */
 713
 714   /* Match anchors at newlines.  */
 715   re_comp_buf.newline_anchor = 1;
 716
 717   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 718
 719   if (!ret)
 720     return NULL;
 721
 722   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 723   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 724 }
 725
 726 #ifdef _LIBC
 727 libc_freeres_fn (free_mem)
 728 {
 729   __regfree (&re_comp_buf);
 730 }
 731 #endif
 732
 733 #endif /* _REGEX_RE_COMP */
 734 \f
 735 /* Internal entry point.
 736    Compile the regular expression PATTERN, whose length is LENGTH.
 737    SYNTAX indicate regular expression's syntax.  */
 738
 739 static reg_errcode_t
 740 re_compile_internal (preg, pattern, length, syntax)
 741      regex_t *preg;
 742      const char * pattern;
 743      int length;
 744      reg_syntax_t syntax;
 745 {
 746   reg_errcode_t err = REG_NOERROR;
 747   re_dfa_t *dfa;
 748   re_string_t regexp;
 749
 750   /* Initialize the pattern buffer.  */
 751   preg->fastmap_accurate = 0;
 752   preg->syntax = syntax;
 753   preg->not_bol = preg->not_eol = 0;
 754   preg->used = 0;
 755   preg->re_nsub = 0;
 756   preg->can_be_null = 0;
 757   preg->regs_allocated = REGS_UNALLOCATED;
 758
 759   /* Initialize the dfa.  */
 760   dfa = (re_dfa_t *) preg->buffer;
 761   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 762     {
 763       /* If zero allocated, but buffer is non-null, try to realloc
 764          enough space.  This loses if buffer's address is bogus, but
 765          that is the user's responsibility.  If ->buffer is NULL this
 766          is a simple allocation.  */
 767       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 768       if (dfa == NULL)
 769         return REG_ESPACE;
 770       preg->allocated = sizeof (re_dfa_t);
 771       preg->buffer = (unsigned char *) dfa;
 772     }
 773   preg->used = sizeof (re_dfa_t);
 774
 775   err = init_dfa (dfa, length);
 776   if (BE (err != REG_NOERROR, 0))
 777     {
 778       free_dfa_content (dfa);
 779       preg->buffer = NULL;
 780       preg->allocated = 0;
 781       return err;
 782     }
 783 #ifdef DEBUG
 784   dfa->re_str = re_malloc (char, length + 1);
 785   strncpy (dfa->re_str, pattern, length + 1);
 786 #endif
 787
 788   err = re_string_construct (&regexp, pattern, length, preg->translate,
 789                              syntax & RE_ICASE, dfa);
 790   if (BE (err != REG_NOERROR, 0))
 791     {
 792     re_compile_internal_free_return:
 793       free_workarea_compile (preg);
 794       re_string_destruct (&regexp);
 795       free_dfa_content (dfa);
 796       preg->buffer = NULL;
 797       preg->allocated = 0;
 798       return err;
 799     }
 800
 801   /* Parse the regular expression, and build a structure tree.  */
 802   preg->re_nsub = 0;
 803   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 804   if (BE (dfa->str_tree == NULL, 0))
 805     goto re_compile_internal_free_return;
 806
 807 #ifdef RE_ENABLE_I18N
 808   /* If possible, do searching in single byte encoding to speed things up.  */
 809   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 810     optimize_utf8 (dfa);
 811 #endif
 812
 813   /* Analyze the tree and collect information which is necessary to
 814      create the dfa.  */
 815   err = analyze (dfa);
 816   if (BE (err != REG_NOERROR, 0))
 817     goto re_compile_internal_free_return;
 818
 819   /* Then create the initial state of the dfa.  */
 820   err = create_initial_state (dfa);
 821
 822   /* Release work areas.  */
 823   free_workarea_compile (preg);
 824   re_string_destruct (&regexp);
 825
 826   if (BE (err != REG_NOERROR, 0))
 827     {
 828       free_dfa_content (dfa);
 829       preg->buffer = NULL;
 830       preg->allocated = 0;
 831     }
 832
 833   return err;
 834 }
 835
 836 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 837    as the initial length of some arrays.  */
 838
 839 static reg_errcode_t
 840 init_dfa (dfa, pat_len)
 841      re_dfa_t *dfa;
 842      int pat_len;
 843 {
 844   int table_size;
 845 #ifndef _LIBC
 846   char *codeset_name;
 847 #endif
 848
 849   memset (dfa, '\0', sizeof (re_dfa_t));
 850
 851   /* Force allocation of str_tree_storage the first time.  */
 852   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 853
 854   dfa->nodes_alloc = pat_len + 1;
 855   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 856
 857   dfa->states_alloc = pat_len + 1;
 858
 859   /*  table_size = 2 ^ ceil(log pat_len) */
 860   for (table_size = 1; table_size > 0; table_size <<= 1)
 861     if (table_size > pat_len)
 862       break;
 863
 864   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 865   dfa->state_hash_mask = table_size - 1;
 866
 867   dfa->subexps_alloc = 1;
 868   dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
 869
 870   dfa->mb_cur_max = MB_CUR_MAX;
 871 #ifdef _LIBC
 872   if (dfa->mb_cur_max == 6
 873       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 874     dfa->is_utf8 = 1;
 875   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 876                        != 0);
 877 #else
 878 # ifdef HAVE_LANGINFO_CODESET
 879   codeset_name = nl_langinfo (CODESET);
 880 # else
 881   codeset_name = getenv ("LC_ALL");
 882   if (codeset_name == NULL || codeset[0] == '\0')
 883     codeset_name = getenv ("LC_CTYPE");
 884   if (codeset_name == NULL || codeset[0] == '\0')
 885     codeset_name = getenv ("LANG");
 886   if (codeset_name == NULL)
 887     codeset_name = "";
 888   else if (strchr (codeset_name, '.') !=  NULL)
 889     codeset_name = strchr (codeset_name, '.') + 1;
 890 # endif
 891
 892   if (strcasecmp (codeset_name, "UTF-8") == 0
 893       || strcasecmp (codeset_name, "UTF8") == 0)
 894     dfa->is_utf8 = 1;
 895
 896   /* We check exhaustively in the loop below if this charset is a
 897      superset of ASCII.  */
 898   dfa->map_notascii = 0;
 899 #endif
 900
 901 #ifdef RE_ENABLE_I18N
 902   if (dfa->mb_cur_max > 1)
 903     {
 904       if (dfa->is_utf8)
 905         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 906       else
 907         {
 908           int i, j, ch;
 909
 910           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 911           if (BE (dfa->sb_char == NULL, 0))
 912             return REG_ESPACE;
 913
 914           /* Clear all bits by, then set those corresponding to single
 915              byte chars.  */
 916           bitset_empty (dfa->sb_char);
 917
 918           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 919             for (j = 0; j < UINT_BITS; ++j, ++ch)
 920               {
 921                 wchar_t wch = __btowc (ch);
 922                 if (wch != WEOF)
 923                   dfa->sb_char[i] |= 1 << j;
 924 # ifndef _LIBC
 925                 if (isascii (ch) && wch != (wchar_t) ch)
 926                   dfa->map_notascii = 1;
 927 # endif
 928               }
 929         }
 930     }
 931 #endif
 932
 933   if (BE (dfa->nodes == NULL || dfa->state_table == NULL
 934           || dfa->subexps == NULL, 0))
 935     return REG_ESPACE;
 936   return REG_NOERROR;
 937 }
 938
 939 /* Initialize WORD_CHAR table, which indicate which character is
 940    "word".  In this case "word" means that it is the word construction
 941    character used by some operators like "\<", "\>", etc.  */
 942
 943 static void
 944 init_word_char (dfa)
 945      re_dfa_t *dfa;
 946 {
 947   int i, j, ch;
 948   dfa->word_ops_used = 1;
 949   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 950     for (j = 0; j < UINT_BITS; ++j, ++ch)
 951       if (isalnum (ch) || ch == '_')
 952         dfa->word_char[i] |= 1 << j;
 953 }
 954
 955 /* Free the work area which are only used while compiling.  */
 956
 957 static void
 958 free_workarea_compile (preg)
 959      regex_t *preg;
 960 {
 961   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 962   bin_tree_storage_t *storage, *next;
 963   for (storage = dfa->str_tree_storage; storage; storage = next)
 964     {
 965       next = storage->next;
 966       re_free (storage);
 967     }
 968   dfa->str_tree_storage = NULL;
 969   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 970   dfa->str_tree = NULL;
 971   re_free (dfa->org_indices);
 972   dfa->org_indices = NULL;
 973 }
 974
 975 /* Create initial states for all contexts.  */
 976
 977 static reg_errcode_t
 978 create_initial_state (dfa)
 979      re_dfa_t *dfa;
 980 {
 981   int first, i;
 982   reg_errcode_t err;
 983   re_node_set init_nodes;
 984
 985   /* Initial states have the epsilon closure of the node which is
 986      the first node of the regular expression.  */
 987   first = dfa->str_tree->first;
 988   dfa->init_node = first;
 989   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 990   if (BE (err != REG_NOERROR, 0))
 991     return err;
 992
 993   /* The back-references which are in initial states can epsilon transit,
 994      since in this case all of the subexpressions can be null.
 995      Then we add epsilon closures of the nodes which are the next nodes of
 996      the back-references.  */
 997   if (dfa->nbackref > 0)
 998     for (i = 0; i < init_nodes.nelem; ++i)
 999       {
1000         int node_idx = init_nodes.elems[i];
1001         re_token_type_t type = dfa->nodes[node_idx].type;
1002
1003         int clexp_idx;
1004         if (type != OP_BACK_REF)
1005           continue;
1006         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1007           {
1008             re_token_t *clexp_node;
1009             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1010             if (clexp_node->type == OP_CLOSE_SUBEXP
1011                 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
1012               break;
1013           }
1014         if (clexp_idx == init_nodes.nelem)
1015           continue;
1016
1017         if (type == OP_BACK_REF)
1018           {
1019             int dest_idx = dfa->edests[node_idx].elems[0];
1020             if (!re_node_set_contains (&init_nodes, dest_idx))
1021               {
1022                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1023                 i = 0;
1024               }
1025           }
1026       }
1027
1028   /* It must be the first time to invoke acquire_state.  */
1029   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1030   /* We don't check ERR here, since the initial state must not be NULL.  */
1031   if (BE (dfa->init_state == NULL, 0))
1032     return err;
1033   if (dfa->init_state->has_constraint)
1034     {
1035       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1036                                                        CONTEXT_WORD);
1037       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1038                                                      CONTEXT_NEWLINE);
1039       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1040                                                          &init_nodes,
1041                                                          CONTEXT_NEWLINE
1042                                                          | CONTEXT_BEGBUF);
1043       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1044               || dfa->init_state_begbuf == NULL, 0))
1045         return err;
1046     }
1047   else
1048     dfa->init_state_word = dfa->init_state_nl
1049       = dfa->init_state_begbuf = dfa->init_state;
1050
1051   re_node_set_free (&init_nodes);
1052   return REG_NOERROR;
1053 }
1054 \f
1055 #ifdef RE_ENABLE_I18N
1056 /* If it is possible to do searching in single byte encoding instead of UTF-8
1057    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1058    DFA nodes where needed.  */
1059
1060 static void
1061 optimize_utf8 (dfa)
1062      re_dfa_t *dfa;
1063 {
1064   int node, i, mb_chars = 0, has_period = 0;
1065
1066   for (node = 0; node < dfa->nodes_len; ++node)
1067     switch (dfa->nodes[node].type)
1068       {
1069       case CHARACTER:
1070         if (dfa->nodes[node].opr.c >= 0x80)
1071           mb_chars = 1;
1072         break;
1073       case ANCHOR:
1074         switch (dfa->nodes[node].opr.idx)
1075           {
1076           case LINE_FIRST:
1077           case LINE_LAST:
1078           case BUF_FIRST:
1079           case BUF_LAST:
1080             break;
1081           default:
1082             /* Word anchors etc. cannot be handled.  */
1083             return;
1084           }
1085         break;
1086       case OP_PERIOD:
1087         has_period = 1;
1088         break;
1089       case OP_BACK_REF:
1090       case OP_ALT:
1091       case END_OF_RE:
1092       case OP_DUP_ASTERISK:
1093       case OP_DUP_QUESTION:
1094       case OP_OPEN_SUBEXP:
1095       case OP_CLOSE_SUBEXP:
1096         break;
1097       case SIMPLE_BRACKET:
1098         /* Just double check.  */
1099         for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1100           if (dfa->nodes[node].opr.sbcset[i])
1101             return;
1102         break;
1103       default:
1104         return;
1105       }
1106
1107   if (mb_chars || has_period)
1108     for (node = 0; node < dfa->nodes_len; ++node)
1109       {
1110         if (dfa->nodes[node].type == CHARACTER
1111             && dfa->nodes[node].opr.c >= 0x80)
1112           dfa->nodes[node].mb_partial = 0;
1113         else if (dfa->nodes[node].type == OP_PERIOD)
1114           dfa->nodes[node].type = OP_UTF8_PERIOD;
1115       }
1116
1117   /* The search can be in single byte locale.  */
1118   dfa->mb_cur_max = 1;
1119   dfa->is_utf8 = 0;
1120   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1121 }
1122 #endif
1123 \f
1124 /* Analyze the structure tree, and calculate "first", "next", "edest",
1125    "eclosure", and "inveclosure".  */
1126
1127 static reg_errcode_t
1128 analyze (dfa)
1129      re_dfa_t *dfa;
1130 {
1131   int i;
1132   reg_errcode_t ret;
1133
1134   /* Allocate arrays.  */
1135   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1136   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1137   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1138   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1139   dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1140   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1141           || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
1142     return REG_ESPACE;
1143   /* Initialize them.  */
1144   for (i = 0; i < dfa->nodes_len; ++i)
1145     {
1146       dfa->nexts[i] = -1;
1147       re_node_set_init_empty (dfa->edests + i);
1148       re_node_set_init_empty (dfa->eclosures + i);
1149       re_node_set_init_empty (dfa->inveclosures + i);
1150     }
1151
1152   ret = analyze_tree (dfa, dfa->str_tree);
1153   if (BE (ret == REG_NOERROR, 1))
1154     {
1155       ret = calc_eclosure (dfa);
1156       if (ret == REG_NOERROR)
1157         calc_inveclosure (dfa);
1158     }
1159   return ret;
1160 }
1161
1162 /* Helper functions for analyze.
1163    This function calculate "first", "next", and "edest" for the subtree
1164    whose root is NODE.  */
1165
1166 static reg_errcode_t
1167 analyze_tree (dfa, node)
1168      re_dfa_t *dfa;
1169      bin_tree_t *node;
1170 {
1171   reg_errcode_t ret;
1172   if (node->first == -1)
1173     calc_first (dfa, node);
1174   if (node->next == -1)
1175     calc_next (dfa, node);
1176   if (node->eclosure.nelem == 0)
1177     calc_epsdest (dfa, node);
1178   /* Calculate "first" etc. for the left child.  */
1179   if (node->left != NULL)
1180     {
1181       ret = analyze_tree (dfa, node->left);
1182       if (BE (ret != REG_NOERROR, 0))
1183         return ret;
1184     }
1185   /* Calculate "first" etc. for the right child.  */
1186   if (node->right != NULL)
1187     {
1188       ret = analyze_tree (dfa, node->right);
1189       if (BE (ret != REG_NOERROR, 0))
1190         return ret;
1191     }
1192   return REG_NOERROR;
1193 }
1194
1195 /* Calculate "first" for the node NODE.  */
1196 static void
1197 calc_first (dfa, node)
1198      re_dfa_t *dfa;
1199      bin_tree_t *node;
1200 {
1201   int idx, type;
1202   idx = node->node_idx;
1203   type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1204
1205   switch (type)
1206     {
1207 #ifdef DEBUG
1208     case OP_OPEN_BRACKET:
1209     case OP_CLOSE_BRACKET:
1210     case OP_OPEN_DUP_NUM:
1211     case OP_CLOSE_DUP_NUM:
1212     case OP_DUP_PLUS:
1213     case OP_NON_MATCH_LIST:
1214     case OP_OPEN_COLL_ELEM:
1215     case OP_CLOSE_COLL_ELEM:
1216     case OP_OPEN_EQUIV_CLASS:
1217     case OP_CLOSE_EQUIV_CLASS:
1218     case OP_OPEN_CHAR_CLASS:
1219     case OP_CLOSE_CHAR_CLASS:
1220       /* These must not appear here.  */
1221       assert (0);
1222 #endif
1223     case END_OF_RE:
1224     case CHARACTER:
1225     case OP_PERIOD:
1226     case OP_DUP_ASTERISK:
1227     case OP_DUP_QUESTION:
1228 #ifdef RE_ENABLE_I18N
1229     case OP_UTF8_PERIOD:
1230     case COMPLEX_BRACKET:
1231 #endif /* RE_ENABLE_I18N */
1232     case SIMPLE_BRACKET:
1233     case OP_BACK_REF:
1234     case ANCHOR:
1235     case OP_OPEN_SUBEXP:
1236     case OP_CLOSE_SUBEXP:
1237       node->first = idx;
1238       break;
1239     case OP_ALT:
1240       node->first = idx;
1241       break;
1242       /* else fall through */
1243     default:
1244 #ifdef DEBUG
1245       assert (node->left != NULL);
1246 #endif
1247       if (node->left->first == -1)
1248         calc_first (dfa, node->left);
1249       node->first = node->left->first;
1250       break;
1251     }
1252 }
1253
1254 /* Calculate "next" for the node NODE.  */
1255
1256 static void
1257 calc_next (dfa, node)
1258      re_dfa_t *dfa;
1259      bin_tree_t *node;
1260 {
1261   int idx, type;
1262   bin_tree_t *parent = node->parent;
1263   if (parent == NULL)
1264     {
1265       node->next = -1;
1266       idx = node->node_idx;
1267       if (node->type == 0)
1268         dfa->nexts[idx] = node->next;
1269       return;
1270     }
1271
1272   idx = parent->node_idx;
1273   type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1274
1275   switch (type)
1276     {
1277     case OP_DUP_ASTERISK:
1278       node->next = idx;
1279       break;
1280     case CONCAT:
1281       if (parent->left == node)
1282         {
1283           if (parent->right->first == -1)
1284             calc_first (dfa, parent->right);
1285           node->next = parent->right->first;
1286           break;
1287         }
1288       /* else fall through */
1289     default:
1290       if (parent->next == -1)
1291         calc_next (dfa, parent);
1292       node->next = parent->next;
1293       break;
1294     }
1295   idx = node->node_idx;
1296   if (node->type == 0)
1297     dfa->nexts[idx] = node->next;
1298 }
1299
1300 /* Calculate "edest" for the node NODE.  */
1301
1302 static void
1303 calc_epsdest (dfa, node)
1304      re_dfa_t *dfa;
1305      bin_tree_t *node;
1306 {
1307   int idx;
1308   idx = node->node_idx;
1309   if (node->type == 0)
1310     {
1311       if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1312           || dfa->nodes[idx].type == OP_DUP_QUESTION)
1313         {
1314           if (node->left->first == -1)
1315             calc_first (dfa, node->left);
1316           if (node->next == -1)
1317             calc_next (dfa, node);
1318           re_node_set_init_2 (dfa->edests + idx, node->left->first,
1319                               node->next);
1320         }
1321       else if (dfa->nodes[idx].type == OP_ALT)
1322         {
1323           int left, right;
1324           if (node->left != NULL)
1325             {
1326               if (node->left->first == -1)
1327                 calc_first (dfa, node->left);
1328               left = node->left->first;
1329             }
1330           else
1331             {
1332               if (node->next == -1)
1333                 calc_next (dfa, node);
1334               left = node->next;
1335             }
1336           if (node->right != NULL)
1337             {
1338               if (node->right->first == -1)
1339                 calc_first (dfa, node->right);
1340               right = node->right->first;
1341             }
1342           else
1343             {
1344               if (node->next == -1)
1345                 calc_next (dfa, node);
1346               right = node->next;
1347             }
1348           re_node_set_init_2 (dfa->edests + idx, left, right);
1349         }
1350       else if (dfa->nodes[idx].type == ANCHOR
1351                || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1352                || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1353                || dfa->nodes[idx].type == OP_BACK_REF)
1354         re_node_set_init_1 (dfa->edests + idx, node->next);
1355       else
1356         assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
1357     }
1358 }
1359
1360 /* Duplicate the epsilon closure of the node ROOT_NODE.
1361    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1362    to their own constraint.  */
1363
1364 static reg_errcode_t
1365 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1366                         init_constraint)
1367      re_dfa_t *dfa;
1368      int top_org_node, top_clone_node, root_node;
1369      unsigned int init_constraint;
1370 {
1371   reg_errcode_t err;
1372   int org_node, clone_node, ret;
1373   unsigned int constraint = init_constraint;
1374   for (org_node = top_org_node, clone_node = top_clone_node;;)
1375     {
1376       int org_dest, clone_dest;
1377       if (dfa->nodes[org_node].type == OP_BACK_REF)
1378         {
1379           /* If the back reference epsilon-transit, its destination must
1380              also have the constraint.  Then duplicate the epsilon closure
1381              of the destination of the back reference, and store it in
1382              edests of the back reference.  */
1383           org_dest = dfa->nexts[org_node];
1384           re_node_set_empty (dfa->edests + clone_node);
1385           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1386           if (BE (err != REG_NOERROR, 0))
1387             return err;
1388           dfa->nexts[clone_node] = dfa->nexts[org_node];
1389           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1390           if (BE (ret < 0, 0))
1391             return REG_ESPACE;
1392         }
1393       else if (dfa->edests[org_node].nelem == 0)
1394         {
1395           /* In case of the node can't epsilon-transit, don't duplicate the
1396              destination and store the original destination as the
1397              destination of the node.  */
1398           dfa->nexts[clone_node] = dfa->nexts[org_node];
1399           break;
1400         }
1401       else if (dfa->edests[org_node].nelem == 1)
1402         {
1403           /* In case of the node can epsilon-transit, and it has only one
1404              destination.  */
1405           org_dest = dfa->edests[org_node].elems[0];
1406           re_node_set_empty (dfa->edests + clone_node);
1407           if (dfa->nodes[org_node].type == ANCHOR)
1408             {
1409               /* In case of the node has another constraint, append it.  */
1410               if (org_node == root_node && clone_node != org_node)
1411                 {
1412                   /* ...but if the node is root_node itself, it means the
1413                      epsilon closure have a loop, then tie it to the
1414                      destination of the root_node.  */
1415                   ret = re_node_set_insert (dfa->edests + clone_node,
1416                                             org_dest);
1417                   if (BE (ret < 0, 0))
1418                     return REG_ESPACE;
1419                   break;
1420                 }
1421               constraint |= dfa->nodes[org_node].opr.ctx_type;
1422             }
1423           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1424           if (BE (err != REG_NOERROR, 0))
1425             return err;
1426           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1427           if (BE (ret < 0, 0))
1428             return REG_ESPACE;
1429         }
1430       else /* dfa->edests[org_node].nelem == 2 */
1431         {
1432           /* In case of the node can epsilon-transit, and it has two
1433              destinations. E.g. '|', '*', '+', '?'.   */
1434           org_dest = dfa->edests[org_node].elems[0];
1435           re_node_set_empty (dfa->edests + clone_node);
1436           /* Search for a duplicated node which satisfies the constraint.  */
1437           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1438           if (clone_dest == -1)
1439             {
1440               /* There are no such a duplicated node, create a new one.  */
1441               err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1442               if (BE (err != REG_NOERROR, 0))
1443                 return err;
1444               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1445               if (BE (ret < 0, 0))
1446                 return REG_ESPACE;
1447               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1448                                             root_node, constraint);
1449               if (BE (err != REG_NOERROR, 0))
1450                 return err;
1451             }
1452           else
1453             {
1454               /* There are a duplicated node which satisfy the constraint,
1455                  use it to avoid infinite loop.  */
1456               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1457               if (BE (ret < 0, 0))
1458                 return REG_ESPACE;
1459             }
1460
1461           org_dest = dfa->edests[org_node].elems[1];
1462           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1463           if (BE (err != REG_NOERROR, 0))
1464             return err;
1465           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1466           if (BE (ret < 0, 0))
1467             return REG_ESPACE;
1468         }
1469       org_node = org_dest;
1470       clone_node = clone_dest;
1471     }
1472   return REG_NOERROR;
1473 }
1474
1475 /* Search for a node which is duplicated from the node ORG_NODE, and
1476    satisfies the constraint CONSTRAINT.  */
1477
1478 static int
1479 search_duplicated_node (dfa, org_node, constraint)
1480      re_dfa_t *dfa;
1481      int org_node;
1482      unsigned int constraint;
1483 {
1484   int idx;
1485   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1486     {
1487       if (org_node == dfa->org_indices[idx]
1488           && constraint == dfa->nodes[idx].constraint)
1489         return idx; /* Found.  */
1490     }
1491   return -1; /* Not found.  */
1492 }
1493
1494 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1495    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1496    otherwise return the error code.  */
1497
1498 static reg_errcode_t
1499 duplicate_node (new_idx, dfa, org_idx, constraint)
1500      re_dfa_t *dfa;
1501      int *new_idx, org_idx;
1502      unsigned int constraint;
1503 {
1504   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx], 1);
1505   if (BE (dup_idx == -1, 0))
1506     return REG_ESPACE;
1507   dfa->nodes[dup_idx].constraint = constraint;
1508   if (dfa->nodes[org_idx].type == ANCHOR)
1509     dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1510   dfa->nodes[dup_idx].duplicated = 1;
1511   re_node_set_init_empty (dfa->edests + dup_idx);
1512   re_node_set_init_empty (dfa->eclosures + dup_idx);
1513   re_node_set_init_empty (dfa->inveclosures + dup_idx);
1514
1515   /* Store the index of the original node.  */
1516   dfa->org_indices[dup_idx] = org_idx;
1517   *new_idx = dup_idx;
1518   return REG_NOERROR;
1519 }
1520
1521 static void
1522 calc_inveclosure (dfa)
1523      re_dfa_t *dfa;
1524 {
1525   int src, idx, dest;
1526   for (src = 0; src < dfa->nodes_len; ++src)
1527     {
1528       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1529         {
1530           dest = dfa->eclosures[src].elems[idx];
1531           re_node_set_insert (dfa->inveclosures + dest, src);
1532         }
1533     }
1534 }
1535
1536 /* Calculate "eclosure" for all the node in DFA.  */
1537
1538 static reg_errcode_t
1539 calc_eclosure (dfa)
1540      re_dfa_t *dfa;
1541 {
1542   int node_idx, incomplete;
1543 #ifdef DEBUG
1544   assert (dfa->nodes_len > 0);
1545 #endif
1546   incomplete = 0;
1547   /* For each nodes, calculate epsilon closure.  */
1548   for (node_idx = 0; ; ++node_idx)
1549     {
1550       reg_errcode_t err;
1551       re_node_set eclosure_elem;
1552       if (node_idx == dfa->nodes_len)
1553         {
1554           if (!incomplete)
1555             break;
1556           incomplete = 0;
1557           node_idx = 0;
1558         }
1559
1560 #ifdef DEBUG
1561       assert (dfa->eclosures[node_idx].nelem != -1);
1562 #endif
1563       /* If we have already calculated, skip it.  */
1564       if (dfa->eclosures[node_idx].nelem != 0)
1565         continue;
1566       /* Calculate epsilon closure of `node_idx'.  */
1567       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1568       if (BE (err != REG_NOERROR, 0))
1569         return err;
1570
1571       if (dfa->eclosures[node_idx].nelem == 0)
1572         {
1573           incomplete = 1;
1574           re_node_set_free (&eclosure_elem);
1575         }
1576     }
1577   return REG_NOERROR;
1578 }
1579
1580 /* Calculate epsilon closure of NODE.  */
1581
1582 static reg_errcode_t
1583 calc_eclosure_iter (new_set, dfa, node, root)
1584      re_node_set *new_set;
1585      re_dfa_t *dfa;
1586      int node, root;
1587 {
1588   reg_errcode_t err;
1589   unsigned int constraint;
1590   int i, incomplete;
1591   re_node_set eclosure;
1592   incomplete = 0;
1593   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1594   if (BE (err != REG_NOERROR, 0))
1595     return err;
1596
1597   /* This indicates that we are calculating this node now.
1598      We reference this value to avoid infinite loop.  */
1599   dfa->eclosures[node].nelem = -1;
1600
1601   constraint = ((dfa->nodes[node].type == ANCHOR)
1602                 ? dfa->nodes[node].opr.ctx_type : 0);
1603   /* If the current node has constraints, duplicate all nodes.
1604      Since they must inherit the constraints.  */
1605   if (constraint
1606       && dfa->edests[node].nelem
1607       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1608     {
1609       int org_node, cur_node;
1610       org_node = cur_node = node;
1611       err = duplicate_node_closure (dfa, node, node, node, constraint);
1612       if (BE (err != REG_NOERROR, 0))
1613         return err;
1614     }
1615
1616   /* Expand each epsilon destination nodes.  */
1617   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1618     for (i = 0; i < dfa->edests[node].nelem; ++i)
1619       {
1620         re_node_set eclosure_elem;
1621         int edest = dfa->edests[node].elems[i];
1622         /* If calculating the epsilon closure of `edest' is in progress,
1623            return intermediate result.  */
1624         if (dfa->eclosures[edest].nelem == -1)
1625           {
1626             incomplete = 1;
1627             continue;
1628           }
1629         /* If we haven't calculated the epsilon closure of `edest' yet,
1630            calculate now. Otherwise use calculated epsilon closure.  */
1631         if (dfa->eclosures[edest].nelem == 0)
1632           {
1633             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1634             if (BE (err != REG_NOERROR, 0))
1635               return err;
1636           }
1637         else
1638           eclosure_elem = dfa->eclosures[edest];
1639         /* Merge the epsilon closure of `edest'.  */
1640         re_node_set_merge (&eclosure, &eclosure_elem);
1641         /* If the epsilon closure of `edest' is incomplete,
1642            the epsilon closure of this node is also incomplete.  */
1643         if (dfa->eclosures[edest].nelem == 0)
1644           {
1645             incomplete = 1;
1646             re_node_set_free (&eclosure_elem);
1647           }
1648       }
1649
1650   /* Epsilon closures include itself.  */
1651   re_node_set_insert (&eclosure, node);
1652   if (incomplete && !root)
1653     dfa->eclosures[node].nelem = 0;
1654   else
1655     dfa->eclosures[node] = eclosure;
1656   *new_set = eclosure;
1657   return REG_NOERROR;
1658 }
1659 \f
1660 /* Functions for token which are used in the parser.  */
1661
1662 /* Fetch a token from INPUT.
1663    We must not use this function inside bracket expressions.  */
1664
1665 static void
1666 fetch_token (result, input, syntax)
1667      re_token_t *result;
1668      re_string_t *input;
1669      reg_syntax_t syntax;
1670 {
1671   re_string_skip_bytes (input, peek_token (result, input, syntax));
1672 }
1673
1674 /* Peek a token from INPUT, and return the length of the token.
1675    We must not use this function inside bracket expressions.  */
1676
1677 static int
1678 peek_token (token, input, syntax)
1679      re_token_t *token;
1680      re_string_t *input;
1681      reg_syntax_t syntax;
1682 {
1683   unsigned char c;
1684
1685   if (re_string_eoi (input))
1686     {
1687       token->type = END_OF_RE;
1688       return 0;
1689     }
1690
1691   c = re_string_peek_byte (input, 0);
1692   token->opr.c = c;
1693
1694   token->word_char = 0;
1695 #ifdef RE_ENABLE_I18N
1696   token->mb_partial = 0;
1697   if (input->mb_cur_max > 1 &&
1698       !re_string_first_byte (input, re_string_cur_idx (input)))
1699     {
1700       token->type = CHARACTER;
1701       token->mb_partial = 1;
1702       return 1;
1703     }
1704 #endif
1705   if (c == '\\')
1706     {
1707       unsigned char c2;
1708       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1709         {
1710           token->type = BACK_SLASH;
1711           return 1;
1712         }
1713
1714       c2 = re_string_peek_byte_case (input, 1);
1715       token->opr.c = c2;
1716       token->type = CHARACTER;
1717 #ifdef RE_ENABLE_I18N
1718       if (input->mb_cur_max > 1)
1719         {
1720           wint_t wc = re_string_wchar_at (input,
1721                                           re_string_cur_idx (input) + 1);
1722           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1723         }
1724       else
1725 #endif
1726         token->word_char = IS_WORD_CHAR (c2) != 0;
1727
1728       switch (c2)
1729         {
1730         case '|':
1731           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1732             token->type = OP_ALT;
1733           break;
1734         case '1': case '2': case '3': case '4': case '5':
1735         case '6': case '7': case '8': case '9':
1736           if (!(syntax & RE_NO_BK_REFS))
1737             {
1738               token->type = OP_BACK_REF;
1739               token->opr.idx = c2 - '0';
1740             }
1741           break;
1742         case '<':
1743           if (!(syntax & RE_NO_GNU_OPS))
1744             {
1745               token->type = ANCHOR;
1746               token->opr.ctx_type = WORD_FIRST;
1747             }
1748           break;
1749         case '>':
1750           if (!(syntax & RE_NO_GNU_OPS))
1751             {
1752               token->type = ANCHOR;
1753               token->opr.ctx_type = WORD_LAST;
1754             }
1755           break;
1756         case 'b':
1757           if (!(syntax & RE_NO_GNU_OPS))
1758             {
1759               token->type = ANCHOR;
1760               token->opr.ctx_type = WORD_DELIM;
1761             }
1762           break;
1763         case 'B':
1764           if (!(syntax & RE_NO_GNU_OPS))
1765             {
1766               token->type = ANCHOR;
1767               token->opr.ctx_type = INSIDE_WORD;
1768             }
1769           break;
1770         case 'w':
1771           if (!(syntax & RE_NO_GNU_OPS))
1772             token->type = OP_WORD;
1773           break;
1774         case 'W':
1775           if (!(syntax & RE_NO_GNU_OPS))
1776             token->type = OP_NOTWORD;
1777           break;
1778         case 's':
1779           if (!(syntax & RE_NO_GNU_OPS))
1780             token->type = OP_SPACE;
1781           break;
1782         case 'S':
1783           if (!(syntax & RE_NO_GNU_OPS))
1784             token->type = OP_NOTSPACE;
1785           break;
1786         case '`':
1787           if (!(syntax & RE_NO_GNU_OPS))
1788             {
1789               token->type = ANCHOR;
1790               token->opr.ctx_type = BUF_FIRST;
1791             }
1792           break;
1793         case '\'':
1794           if (!(syntax & RE_NO_GNU_OPS))
1795             {
1796               token->type = ANCHOR;
1797               token->opr.ctx_type = BUF_LAST;
1798             }
1799           break;
1800         case '(':
1801           if (!(syntax & RE_NO_BK_PARENS))
1802             token->type = OP_OPEN_SUBEXP;
1803           break;
1804         case ')':
1805           if (!(syntax & RE_NO_BK_PARENS))
1806             token->type = OP_CLOSE_SUBEXP;
1807           break;
1808         case '+':
1809           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1810             token->type = OP_DUP_PLUS;
1811           break;
1812         case '?':
1813           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1814             token->type = OP_DUP_QUESTION;
1815           break;
1816         case '{':
1817           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1818             token->type = OP_OPEN_DUP_NUM;
1819           break;
1820         case '}':
1821           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1822             token->type = OP_CLOSE_DUP_NUM;
1823           break;
1824         default:
1825           break;
1826         }
1827       return 2;
1828     }
1829
1830   token->type = CHARACTER;
1831 #ifdef RE_ENABLE_I18N
1832   if (input->mb_cur_max > 1)
1833     {
1834       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1835       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1836     }
1837   else
1838 #endif
1839     token->word_char = IS_WORD_CHAR (token->opr.c);
1840
1841   switch (c)
1842     {
1843     case '\n':
1844       if (syntax & RE_NEWLINE_ALT)
1845         token->type = OP_ALT;
1846       break;
1847     case '|':
1848       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1849         token->type = OP_ALT;
1850       break;
1851     case '*':
1852       token->type = OP_DUP_ASTERISK;
1853       break;
1854     case '+':
1855       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1856         token->type = OP_DUP_PLUS;
1857       break;
1858     case '?':
1859       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1860         token->type = OP_DUP_QUESTION;
1861       break;
1862     case '{':
1863       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1864         token->type = OP_OPEN_DUP_NUM;
1865       break;
1866     case '}':
1867       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1868         token->type = OP_CLOSE_DUP_NUM;
1869       break;
1870     case '(':
1871       if (syntax & RE_NO_BK_PARENS)
1872         token->type = OP_OPEN_SUBEXP;
1873       break;
1874     case ')':
1875       if (syntax & RE_NO_BK_PARENS)
1876         token->type = OP_CLOSE_SUBEXP;
1877       break;
1878     case '[':
1879       token->type = OP_OPEN_BRACKET;
1880       break;
1881     case '.':
1882       token->type = OP_PERIOD;
1883       break;
1884     case '^':
1885       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1886           re_string_cur_idx (input) != 0)
1887         {
1888           char prev = re_string_peek_byte (input, -1);
1889           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1890             break;
1891         }
1892       token->type = ANCHOR;
1893       token->opr.ctx_type = LINE_FIRST;
1894       break;
1895     case '$':
1896       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1897           re_string_cur_idx (input) + 1 != re_string_length (input))
1898         {
1899           re_token_t next;
1900           re_string_skip_bytes (input, 1);
1901           peek_token (&next, input, syntax);
1902           re_string_skip_bytes (input, -1);
1903           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1904             break;
1905         }
1906       token->type = ANCHOR;
1907       token->opr.ctx_type = LINE_LAST;
1908       break;
1909     default:
1910       break;
1911     }
1912   return 1;
1913 }
1914
1915 /* Peek a token from INPUT, and return the length of the token.
1916    We must not use this function out of bracket expressions.  */
1917
1918 static int
1919 peek_token_bracket (token, input, syntax)
1920      re_token_t *token;
1921      re_string_t *input;
1922      reg_syntax_t syntax;
1923 {
1924   unsigned char c;
1925   if (re_string_eoi (input))
1926     {
1927       token->type = END_OF_RE;
1928       return 0;
1929     }
1930   c = re_string_peek_byte (input, 0);
1931   token->opr.c = c;
1932
1933 #ifdef RE_ENABLE_I18N
1934   if (input->mb_cur_max > 1 &&
1935       !re_string_first_byte (input, re_string_cur_idx (input)))
1936     {
1937       token->type = CHARACTER;
1938       return 1;
1939     }
1940 #endif /* RE_ENABLE_I18N */
1941
1942   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1943       && re_string_cur_idx (input) + 1 < re_string_length (input))
1944     {
1945       /* In this case, '\' escape a character.  */
1946       unsigned char c2;
1947       re_string_skip_bytes (input, 1);
1948       c2 = re_string_peek_byte (input, 0);
1949       token->opr.c = c2;
1950       token->type = CHARACTER;
1951       return 1;
1952     }
1953   if (c == '[') /* '[' is a special char in a bracket exps.  */
1954     {
1955       unsigned char c2;
1956       int token_len;
1957       if (re_string_cur_idx (input) + 1 < re_string_length (input))
1958         c2 = re_string_peek_byte (input, 1);
1959       else
1960         c2 = 0;
1961       token->opr.c = c2;
1962       token_len = 2;
1963       switch (c2)
1964         {
1965         case '.':
1966           token->type = OP_OPEN_COLL_ELEM;
1967           break;
1968         case '=':
1969           token->type = OP_OPEN_EQUIV_CLASS;
1970           break;
1971         case ':':
1972           if (syntax & RE_CHAR_CLASSES)
1973             {
1974               token->type = OP_OPEN_CHAR_CLASS;
1975               break;
1976             }
1977           /* else fall through.  */
1978         default:
1979           token->type = CHARACTER;
1980           token->opr.c = c;
1981           token_len = 1;
1982           break;
1983         }
1984       return token_len;
1985     }
1986   switch (c)
1987     {
1988     case '-':
1989       token->type = OP_CHARSET_RANGE;
1990       break;
1991     case ']':
1992       token->type = OP_CLOSE_BRACKET;
1993       break;
1994     case '^':
1995       token->type = OP_NON_MATCH_LIST;
1996       break;
1997     default:
1998       token->type = CHARACTER;
1999     }
2000   return 1;
2001 }
2002 \f
2003 /* Functions for parser.  */
2004
2005 /* Entry point of the parser.
2006    Parse the regular expression REGEXP and return the structure tree.
2007    If an error is occured, ERR is set by error code, and return NULL.
2008    This function build the following tree, from regular expression <reg_exp>:
2009            CAT
2010            / \
2011           /   \
2012    <reg_exp>  EOR
2013
2014    CAT means concatenation.
2015    EOR means end of regular expression.  */
2016
2017 static bin_tree_t *
2018 parse (regexp, preg, syntax, err)
2019      re_string_t *regexp;
2020      regex_t *preg;
2021      reg_syntax_t syntax;
2022      reg_errcode_t *err;
2023 {
2024   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2025   bin_tree_t *tree, *eor, *root;
2026   re_token_t current_token;
2027   dfa->syntax = syntax;
2028   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2029   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2030   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2031     return NULL;
2032   eor = re_dfa_add_tree_node (dfa, NULL, NULL, &current_token);
2033   if (tree != NULL)
2034     root = create_tree (dfa, tree, eor, CONCAT, 0);
2035   else
2036     root = eor;
2037   if (BE (eor == NULL || root == NULL, 0))
2038     {
2039       *err = REG_ESPACE;
2040       return NULL;
2041     }
2042   return root;
2043 }
2044
2045 /* This function build the following tree, from regular expression
2046    <branch1>|<branch2>:
2047            ALT
2048            / \
2049           /   \
2050    <branch1> <branch2>
2051
2052    ALT means alternative, which represents the operator `|'.  */
2053
2054 static bin_tree_t *
2055 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2056      re_string_t *regexp;
2057      regex_t *preg;
2058      re_token_t *token;
2059      reg_syntax_t syntax;
2060      int nest;
2061      reg_errcode_t *err;
2062 {
2063   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2064   bin_tree_t *tree, *branch = NULL;
2065   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2066   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2067     return NULL;
2068
2069   while (token->type == OP_ALT)
2070     {
2071       re_token_t alt_token = *token;
2072       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2073       if (token->type != OP_ALT && token->type != END_OF_RE
2074           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2075         {
2076           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2077           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2078             return NULL;
2079         }
2080       else
2081         branch = NULL;
2082       tree = re_dfa_add_tree_node (dfa, tree, branch, &alt_token);
2083       if (BE (tree == NULL, 0))
2084         {
2085           *err = REG_ESPACE;
2086           return NULL;
2087         }
2088       dfa->has_plural_match = 1;
2089     }
2090   return tree;
2091 }
2092
2093 /* This function build the following tree, from regular expression
2094    <exp1><exp2>:
2095         CAT
2096         / \
2097        /   \
2098    <exp1> <exp2>
2099
2100    CAT means concatenation.  */
2101
2102 static bin_tree_t *
2103 parse_branch (regexp, preg, token, syntax, nest, err)
2104      re_string_t *regexp;
2105      regex_t *preg;
2106      re_token_t *token;
2107      reg_syntax_t syntax;
2108      int nest;
2109      reg_errcode_t *err;
2110 {
2111   bin_tree_t *tree, *exp;
2112   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2113   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2114   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2115     return NULL;
2116
2117   while (token->type != OP_ALT && token->type != END_OF_RE
2118          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2119     {
2120       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2121       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2122         {
2123           return NULL;
2124         }
2125       if (tree != NULL && exp != NULL)
2126         {
2127           tree = create_tree (dfa, tree, exp, CONCAT, 0);
2128           if (tree == NULL)
2129             {
2130               *err = REG_ESPACE;
2131               return NULL;
2132             }
2133         }
2134       else if (tree == NULL)
2135         tree = exp;
2136       /* Otherwise exp == NULL, we don't need to create new tree.  */
2137     }
2138   return tree;
2139 }
2140
2141 /* This function build the following tree, from regular expression a*:
2142          *
2143          |
2144          a
2145 */
2146
2147 static bin_tree_t *
2148 parse_expression (regexp, preg, token, syntax, nest, err)
2149      re_string_t *regexp;
2150      regex_t *preg;
2151      re_token_t *token;
2152      reg_syntax_t syntax;
2153      int nest;
2154      reg_errcode_t *err;
2155 {
2156   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2157   bin_tree_t *tree;
2158   switch (token->type)
2159     {
2160     case CHARACTER:
2161       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2162       if (BE (tree == NULL, 0))
2163         {
2164           *err = REG_ESPACE;
2165           return NULL;
2166         }
2167 #ifdef RE_ENABLE_I18N
2168       if (dfa->mb_cur_max > 1)
2169         {
2170           while (!re_string_eoi (regexp)
2171                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2172             {
2173               bin_tree_t *mbc_remain;
2174               fetch_token (token, regexp, syntax);
2175               mbc_remain = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2176               tree = create_tree (dfa, tree, mbc_remain, CONCAT, 0);
2177               if (BE (mbc_remain == NULL || tree == NULL, 0))
2178                 {
2179                   *err = REG_ESPACE;
2180                   return NULL;
2181                 }
2182             }
2183         }
2184 #endif
2185       break;
2186     case OP_OPEN_SUBEXP:
2187       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2188       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2189         return NULL;
2190       break;
2191     case OP_OPEN_BRACKET:
2192       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2193       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2194         return NULL;
2195       break;
2196     case OP_BACK_REF:
2197       if (BE (preg->re_nsub < token->opr.idx
2198               || dfa->subexps[token->opr.idx - 1].end == -1, 0))
2199         {
2200           *err = REG_ESUBREG;
2201           return NULL;
2202         }
2203       dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
2204       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2205       if (BE (tree == NULL, 0))
2206         {
2207           *err = REG_ESPACE;
2208           return NULL;
2209         }
2210       ++dfa->nbackref;
2211       dfa->has_mb_node = 1;
2212       break;
2213     case OP_OPEN_DUP_NUM:
2214       if (syntax & RE_CONTEXT_INVALID_DUP)
2215         {
2216           *err = REG_BADRPT;
2217           return NULL;
2218         }
2219       /* FALLTHROUGH */
2220     case OP_DUP_ASTERISK:
2221     case OP_DUP_PLUS:
2222     case OP_DUP_QUESTION:
2223       if (syntax & RE_CONTEXT_INVALID_OPS)
2224         {
2225           *err = REG_BADRPT;
2226           return NULL;
2227         }
2228       else if (syntax & RE_CONTEXT_INDEP_OPS)
2229         {
2230           fetch_token (token, regexp, syntax);
2231           return parse_expression (regexp, preg, token, syntax, nest, err);
2232         }
2233       /* else fall through  */
2234     case OP_CLOSE_SUBEXP:
2235       if ((token->type == OP_CLOSE_SUBEXP) &&
2236           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2237         {
2238           *err = REG_ERPAREN;
2239           return NULL;
2240         }
2241       /* else fall through  */
2242     case OP_CLOSE_DUP_NUM:
2243       /* We treat it as a normal character.  */
2244
2245       /* Then we can these characters as normal characters.  */
2246       token->type = CHARACTER;
2247       /* mb_partial and word_char bits should be initialized already
2248          by peek_token.  */
2249       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2250       if (BE (tree == NULL, 0))
2251         {
2252           *err = REG_ESPACE;
2253           return NULL;
2254         }
2255       break;
2256     case ANCHOR:
2257       if ((token->opr.ctx_type
2258            & (WORD_DELIM | INSIDE_WORD | WORD_FIRST | WORD_LAST))
2259           && dfa->word_ops_used == 0)
2260         init_word_char (dfa);
2261       if (token->opr.ctx_type == WORD_DELIM)
2262         {
2263           bin_tree_t *tree_first, *tree_last;
2264           token->opr.ctx_type = WORD_FIRST;
2265           tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2266           token->opr.ctx_type = WORD_LAST;
2267           tree_last = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2268           token->type = OP_ALT;
2269           tree = re_dfa_add_tree_node (dfa, tree_first, tree_last, token);
2270           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2271             {
2272               *err = REG_ESPACE;
2273               return NULL;
2274             }
2275         }
2276       else
2277         {
2278           tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2279           if (BE (tree == NULL, 0))
2280             {
2281               *err = REG_ESPACE;
2282               return NULL;
2283             }
2284         }
2285       /* We must return here, since ANCHORs can't be followed
2286          by repetition operators.
2287          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2288              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2289       fetch_token (token, regexp, syntax);
2290       return tree;
2291     case OP_PERIOD:
2292       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2293       if (BE (tree == NULL, 0))
2294         {
2295           *err = REG_ESPACE;
2296           return NULL;
2297         }
2298       if (dfa->mb_cur_max > 1)
2299         dfa->has_mb_node = 1;
2300       break;
2301     case OP_WORD:
2302     case OP_NOTWORD:
2303       tree = build_charclass_op (dfa, regexp->trans,
2304                                  (const unsigned char *) "alnum",
2305                                  (const unsigned char *) "_",
2306                                  token->type == OP_NOTWORD, err);
2307       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2308         return NULL;
2309       break;
2310     case OP_SPACE:
2311     case OP_NOTSPACE:
2312       tree = build_charclass_op (dfa, regexp->trans,
2313                                  (const unsigned char *) "space",
2314                                  (const unsigned char *) "",
2315                                  token->type == OP_NOTSPACE, err);
2316       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2317         return NULL;
2318       break;
2319     case OP_ALT:
2320     case END_OF_RE:
2321       return NULL;
2322     case BACK_SLASH:
2323       *err = REG_EESCAPE;
2324       return NULL;
2325     default:
2326       /* Must not happen?  */
2327 #ifdef DEBUG
2328       assert (0);
2329 #endif
2330       return NULL;
2331     }
2332   fetch_token (token, regexp, syntax);
2333
2334   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2335          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2336     {
2337       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2338       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2339         return NULL;
2340       /* In BRE consecutive duplications are not allowed.  */
2341       if ((syntax & RE_CONTEXT_INVALID_DUP)
2342           && (token->type == OP_DUP_ASTERISK
2343               || token->type == OP_OPEN_DUP_NUM))
2344         {
2345           *err = REG_BADRPT;
2346           return NULL;
2347         }
2348       dfa->has_plural_match = 1;
2349     }
2350
2351   return tree;
2352 }
2353
2354 /* This function build the following tree, from regular expression
2355    (<reg_exp>):
2356          SUBEXP
2357             |
2358         <reg_exp>
2359 */
2360
2361 static bin_tree_t *
2362 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2363      re_string_t *regexp;
2364      regex_t *preg;
2365      re_token_t *token;
2366      reg_syntax_t syntax;
2367      int nest;
2368      reg_errcode_t *err;
2369 {
2370   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2371   bin_tree_t *tree, *left_par, *right_par;
2372   size_t cur_nsub;
2373   cur_nsub = preg->re_nsub++;
2374   if (BE (dfa->subexps_alloc < preg->re_nsub, 0))
2375     {
2376       re_subexp_t *new_array;
2377       dfa->subexps_alloc *= 2;
2378       new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2379       if (BE (new_array == NULL, 0))
2380         {
2381           dfa->subexps_alloc /= 2;
2382           *err = REG_ESPACE;
2383           return NULL;
2384         }
2385       dfa->subexps = new_array;
2386     }
2387   dfa->subexps[cur_nsub].start = dfa->nodes_len;
2388   dfa->subexps[cur_nsub].end = -1;
2389
2390   left_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2391   if (BE (left_par == NULL, 0))
2392     {
2393       *err = REG_ESPACE;
2394       return NULL;
2395     }
2396   dfa->nodes[left_par->node_idx].opr.idx = cur_nsub;
2397   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2398
2399   /* The subexpression may be a null string.  */
2400   if (token->type == OP_CLOSE_SUBEXP)
2401     tree = NULL;
2402   else
2403     {
2404       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2405       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2406         return NULL;
2407     }
2408   if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2409     {
2410       *err = REG_EPAREN;
2411       return NULL;
2412     }
2413   right_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2414   dfa->subexps[cur_nsub].end = dfa->nodes_len;
2415   tree = ((tree == NULL) ? right_par
2416           : create_tree (dfa, tree, right_par, CONCAT, 0));
2417   tree = create_tree (dfa, left_par, tree, CONCAT, 0);
2418   if (BE (right_par == NULL || tree == NULL, 0))
2419     {
2420       *err = REG_ESPACE;
2421       return NULL;
2422     }
2423   dfa->nodes[right_par->node_idx].opr.idx = cur_nsub;
2424
2425   return tree;
2426 }
2427
2428 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2429
2430 static bin_tree_t *
2431 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2432      bin_tree_t *elem;
2433      re_string_t *regexp;
2434      re_dfa_t *dfa;
2435      re_token_t *token;
2436      reg_syntax_t syntax;
2437      reg_errcode_t *err;
2438 {
2439   re_token_t dup_token;
2440   bin_tree_t *tree = NULL;
2441   int i, start, end, start_idx = re_string_cur_idx (regexp);
2442   re_token_t start_token = *token;
2443
2444   if (token->type == OP_OPEN_DUP_NUM)
2445     {
2446       end = 0;
2447       start = fetch_number (regexp, token, syntax);
2448       if (start == -1)
2449         {
2450           if (token->type == CHARACTER && token->opr.c == ',')
2451             start = 0; /* We treat "{,m}" as "{0,m}".  */
2452           else
2453             {
2454               *err = REG_BADBR; /* <re>{} is invalid.  */
2455               return NULL;
2456             }
2457         }
2458       if (BE (start != -2, 1))
2459         {
2460           /* We treat "{n}" as "{n,n}".  */
2461           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2462                  : ((token->type == CHARACTER && token->opr.c == ',')
2463                     ? fetch_number (regexp, token, syntax) : -2));
2464         }
2465       if (BE (start == -2 || end == -2, 0))
2466         {
2467           /* Invalid sequence.  */
2468           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2469             {
2470               if (token->type == END_OF_RE)
2471                 *err = REG_EBRACE;
2472               else
2473                 *err = REG_BADBR;
2474
2475               return NULL;
2476             }
2477
2478           /* If the syntax bit is set, rollback.  */
2479           re_string_set_index (regexp, start_idx);
2480           *token = start_token;
2481           token->type = CHARACTER;
2482           /* mb_partial and word_char bits should be already initialized by
2483              peek_token.  */
2484           return elem;
2485         }
2486
2487       if (BE (end != -1 && start > end, 0))
2488         {
2489           /* First number greater than second.  */
2490           *err = REG_BADBR;
2491           return NULL;
2492         }
2493     }
2494   else
2495     {
2496       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2497       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2498     }
2499
2500   /* Treat "<re>{0}*" etc. as "<re>{0}".  */
2501   if (BE (elem == NULL, 0))
2502     start = end = 0;
2503
2504   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2505   else if (BE (start > 0, 0))
2506     {
2507       tree = elem;
2508       for (i = 2; i <= start; ++i)
2509         {
2510           elem = duplicate_tree (elem, dfa);
2511           tree = create_tree (dfa, tree, elem, CONCAT, 0);
2512           if (BE (elem == NULL || tree == NULL, 0))
2513             goto parse_dup_op_espace;
2514         }
2515     }
2516
2517   if (BE (end != start, 1))
2518     {
2519       dup_token.type = (end == -1 ? OP_DUP_ASTERISK : OP_DUP_QUESTION);
2520       if (BE (start > 0, 0))
2521         {
2522           elem = duplicate_tree (elem, dfa);
2523           if (BE (elem == NULL, 0))
2524             goto parse_dup_op_espace;
2525
2526           /* This subexpression will be marked as optional, so that
2527              empty matches do not touch the registers.  */
2528           mark_opt_subexp (elem, dfa);
2529
2530           /* Prepare the tree with the modifier.  */
2531           elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2532           tree = create_tree (dfa, tree, elem, CONCAT, 0);
2533         }
2534       else
2535         {
2536           /* We do not need to duplicate the tree because we have not
2537              created it yet.  */
2538           mark_opt_subexp (elem, dfa);
2539           tree = elem = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2540         }
2541
2542       if (BE (elem == NULL || tree == NULL, 0))
2543         goto parse_dup_op_espace;
2544
2545       /* This loop is actually executed only when end != -1,
2546          to rewrite <re>{0,n} as <re>?<re>?<re>?...  We have
2547          already created the start+1-th copy.  */
2548       for (i = start + 2; i <= end; ++i)
2549         {
2550           elem = duplicate_tree (elem, dfa);
2551           tree = create_tree (dfa, tree, elem, CONCAT, 0);
2552           if (BE (elem == NULL || tree == NULL, 0))
2553             {
2554               *err = REG_ESPACE;
2555               return NULL;
2556             }
2557         }
2558     }
2559
2560   fetch_token (token, regexp, syntax);
2561   return tree;
2562
2563  parse_dup_op_espace:
2564   *err = REG_ESPACE;
2565   return NULL;
2566 }
2567
2568 /* Size of the names for collating symbol/equivalence_class/character_class.
2569    I'm not sure, but maybe enough.  */
2570 #define BRACKET_NAME_BUF_SIZE 32
2571
2572 #ifndef _LIBC
2573   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2574      Build the range expression which starts from START_ELEM, and ends
2575      at END_ELEM.  The result are written to MBCSET and SBCSET.
2576      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2577      mbcset->range_ends, is a pointer argument sinse we may
2578      update it.  */
2579
2580 static reg_errcode_t
2581 # ifdef RE_ENABLE_I18N
2582 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2583      re_charset_t *mbcset;
2584      int *range_alloc;
2585 # else /* not RE_ENABLE_I18N */
2586 build_range_exp (sbcset, start_elem, end_elem)
2587 # endif /* not RE_ENABLE_I18N */
2588      re_bitset_ptr_t sbcset;
2589      bracket_elem_t *start_elem, *end_elem;
2590 {
2591   unsigned int start_ch, end_ch;
2592   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2593   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2594           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2595           0))
2596     return REG_ERANGE;
2597
2598   /* We can handle no multi character collating elements without libc
2599      support.  */
2600   if (BE ((start_elem->type == COLL_SYM
2601            && strlen ((char *) start_elem->opr.name) > 1)
2602           || (end_elem->type == COLL_SYM
2603               && strlen ((char *) end_elem->opr.name) > 1), 0))
2604     return REG_ECOLLATE;
2605
2606 # ifdef RE_ENABLE_I18N
2607   {
2608     wchar_t wc, start_wc, end_wc;
2609     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2610
2611     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2612                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2613                    : 0));
2614     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2615               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2616                  : 0));
2617     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2618                 ? __btowc (start_ch) : start_elem->opr.wch);
2619     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2620               ? __btowc (end_ch) : end_elem->opr.wch);
2621     if (start_wc == WEOF || end_wc == WEOF)
2622       return REG_ECOLLATE;
2623     cmp_buf[0] = start_wc;
2624     cmp_buf[4] = end_wc;
2625     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2626       return REG_ERANGE;
2627
2628     /* Got valid collation sequence values, add them as a new entry.
2629        However, for !_LIBC we have no collation elements: if the
2630        character set is single byte, the single byte character set
2631        that we build below suffices.  parse_bracket_exp passes
2632        no MBCSET if dfa->mb_cur_max == 1.  */
2633     if (mbcset)
2634       {
2635         /* Check the space of the arrays.  */
2636         if (BE (*range_alloc == mbcset->nranges, 0))
2637           {
2638             /* There is not enough space, need realloc.  */
2639             wchar_t *new_array_start, *new_array_end;
2640             int new_nranges;
2641
2642             /* +1 in case of mbcset->nranges is 0.  */
2643             new_nranges = 2 * mbcset->nranges + 1;
2644             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2645                are NULL if *range_alloc == 0.  */
2646             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2647                                           new_nranges);
2648             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2649                                         new_nranges);
2650
2651             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2652               return REG_ESPACE;
2653
2654             mbcset->range_starts = new_array_start;
2655             mbcset->range_ends = new_array_end;
2656             *range_alloc = new_nranges;
2657           }
2658
2659         mbcset->range_starts[mbcset->nranges] = start_wc;
2660         mbcset->range_ends[mbcset->nranges++] = end_wc;
2661       }
2662
2663     /* Build the table for single byte characters.  */
2664     for (wc = 0; wc < SBC_MAX; ++wc)
2665       {
2666         cmp_buf[2] = wc;
2667         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2668             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2669           bitset_set (sbcset, wc);
2670       }
2671   }
2672 # else /* not RE_ENABLE_I18N */
2673   {
2674     unsigned int ch;
2675     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2676                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2677                    : 0));
2678     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2679               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2680                  : 0));
2681     if (start_ch > end_ch)
2682       return REG_ERANGE;
2683     /* Build the table for single byte characters.  */
2684     for (ch = 0; ch < SBC_MAX; ++ch)
2685       if (start_ch <= ch  && ch <= end_ch)
2686         bitset_set (sbcset, ch);
2687   }
2688 # endif /* not RE_ENABLE_I18N */
2689   return REG_NOERROR;
2690 }
2691 #endif /* not _LIBC */
2692
2693 #ifndef _LIBC
2694 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2695    Build the collating element which is represented by NAME.
2696    The result are written to MBCSET and SBCSET.
2697    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2698    pointer argument since we may update it.  */
2699
2700 static reg_errcode_t
2701 # ifdef RE_ENABLE_I18N
2702 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2703      re_charset_t *mbcset;
2704      int *coll_sym_alloc;
2705 # else /* not RE_ENABLE_I18N */
2706 build_collating_symbol (sbcset, name)
2707 # endif /* not RE_ENABLE_I18N */
2708      re_bitset_ptr_t sbcset;
2709      const unsigned char *name;
2710 {
2711   size_t name_len = strlen ((const char *) name);
2712   if (BE (name_len != 1, 0))
2713     return REG_ECOLLATE;
2714   else
2715     {
2716       bitset_set (sbcset, name[0]);
2717       return REG_NOERROR;
2718     }
2719 }
2720 #endif /* not _LIBC */
2721
2722 /* This function parse bracket expression like "[abc]", "[a-c]",
2723    "[[.a-a.]]" etc.  */
2724
2725 static bin_tree_t *
2726 parse_bracket_exp (regexp, dfa, token, syntax, err)
2727      re_string_t *regexp;
2728      re_dfa_t *dfa;
2729      re_token_t *token;
2730      reg_syntax_t syntax;
2731      reg_errcode_t *err;
2732 {
2733 #ifdef _LIBC
2734   const unsigned char *collseqmb;
2735   const char *collseqwc;
2736   uint32_t nrules;
2737   int32_t table_size;
2738   const int32_t *symb_table;
2739   const unsigned char *extra;
2740
2741   /* Local function for parse_bracket_exp used in _LIBC environement.
2742      Seek the collating symbol entry correspondings to NAME.
2743      Return the index of the symbol in the SYMB_TABLE.  */
2744
2745   auto inline int32_t
2746   __attribute ((always_inline))
2747   seek_collating_symbol_entry (name, name_len)
2748          const unsigned char *name;
2749          size_t name_len;
2750     {
2751       int32_t hash = elem_hash ((const char *) name, name_len);
2752       int32_t elem = hash % table_size;
2753       int32_t second = hash % (table_size - 2);
2754       while (symb_table[2 * elem] != 0)
2755         {
2756           /* First compare the hashing value.  */
2757           if (symb_table[2 * elem] == hash
2758               /* Compare the length of the name.  */
2759               && name_len == extra[symb_table[2 * elem + 1]]
2760               /* Compare the name.  */
2761               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2762                          name_len) == 0)
2763             {
2764               /* Yep, this is the entry.  */
2765               break;
2766             }
2767
2768           /* Next entry.  */
2769           elem += second;
2770         }
2771       return elem;
2772     }
2773
2774   /* Local function for parse_bracket_exp used in _LIBC environement.
2775      Look up the collation sequence value of BR_ELEM.
2776      Return the value if succeeded, UINT_MAX otherwise.  */
2777
2778   auto inline unsigned int
2779   __attribute ((always_inline))
2780   lookup_collation_sequence_value (br_elem)
2781          bracket_elem_t *br_elem;
2782     {
2783       if (br_elem->type == SB_CHAR)
2784         {
2785           /*
2786           if (MB_CUR_MAX == 1)
2787           */
2788           if (nrules == 0)
2789             return collseqmb[br_elem->opr.ch];
2790           else
2791             {
2792               wint_t wc = __btowc (br_elem->opr.ch);
2793               return __collseq_table_lookup (collseqwc, wc);
2794             }
2795         }
2796       else if (br_elem->type == MB_CHAR)
2797         {
2798           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2799         }
2800       else if (br_elem->type == COLL_SYM)
2801         {
2802           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2803           if (nrules != 0)
2804             {
2805               int32_t elem, idx;
2806               elem = seek_collating_symbol_entry (br_elem->opr.name,
2807                                                   sym_name_len);
2808               if (symb_table[2 * elem] != 0)
2809                 {
2810                   /* We found the entry.  */
2811                   idx = symb_table[2 * elem + 1];
2812                   /* Skip the name of collating element name.  */
2813                   idx += 1 + extra[idx];
2814                   /* Skip the byte sequence of the collating element.  */
2815                   idx += 1 + extra[idx];
2816                   /* Adjust for the alignment.  */
2817                   idx = (idx + 3) & ~3;
2818                   /* Skip the multibyte collation sequence value.  */
2819                   idx += sizeof (unsigned int);
2820                   /* Skip the wide char sequence of the collating element.  */
2821                   idx += sizeof (unsigned int) *
2822                     (1 + *(unsigned int *) (extra + idx));
2823                   /* Return the collation sequence value.  */
2824                   return *(unsigned int *) (extra + idx);
2825                 }
2826               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2827                 {
2828                   /* No valid character.  Match it as a single byte
2829                      character.  */
2830                   return collseqmb[br_elem->opr.name[0]];
2831                 }
2832             }
2833           else if (sym_name_len == 1)
2834             return collseqmb[br_elem->opr.name[0]];
2835         }
2836       return UINT_MAX;
2837     }
2838
2839   /* Local function for parse_bracket_exp used in _LIBC environement.
2840      Build the range expression which starts from START_ELEM, and ends
2841      at END_ELEM.  The result are written to MBCSET and SBCSET.
2842      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2843      mbcset->range_ends, is a pointer argument sinse we may
2844      update it.  */
2845
2846   auto inline reg_errcode_t
2847   __attribute ((always_inline))
2848   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2849          re_charset_t *mbcset;
2850          int *range_alloc;
2851          re_bitset_ptr_t sbcset;
2852          bracket_elem_t *start_elem, *end_elem;
2853     {
2854       unsigned int ch;
2855       uint32_t start_collseq;
2856       uint32_t end_collseq;
2857
2858       /* Equivalence Classes and Character Classes can't be a range
2859          start/end.  */
2860       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2861               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2862               0))
2863         return REG_ERANGE;
2864
2865       start_collseq = lookup_collation_sequence_value (start_elem);
2866       end_collseq = lookup_collation_sequence_value (end_elem);
2867       /* Check start/end collation sequence values.  */
2868       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2869         return REG_ECOLLATE;
2870       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2871         return REG_ERANGE;
2872
2873       /* Got valid collation sequence values, add them as a new entry.
2874          However, if we have no collation elements, and the character set
2875          is single byte, the single byte character set that we
2876          build below suffices. */
2877       if (nrules > 0 || dfa->mb_cur_max > 1)
2878         {
2879           /* Check the space of the arrays.  */
2880           if (BE (*range_alloc == mbcset->nranges, 0))
2881             {
2882               /* There is not enough space, need realloc.  */
2883               uint32_t *new_array_start;
2884               uint32_t *new_array_end;
2885               int new_nranges;
2886
2887               /* +1 in case of mbcset->nranges is 0.  */
2888               new_nranges = 2 * mbcset->nranges + 1;
2889               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2890                                             new_nranges);
2891               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2892                                           new_nranges);
2893
2894               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2895                 return REG_ESPACE;
2896
2897               mbcset->range_starts = new_array_start;
2898               mbcset->range_ends = new_array_end;
2899               *range_alloc = new_nranges;
2900             }
2901
2902           mbcset->range_starts[mbcset->nranges] = start_collseq;
2903           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2904         }
2905
2906       /* Build the table for single byte characters.  */
2907       for (ch = 0; ch < SBC_MAX; ch++)
2908         {
2909           uint32_t ch_collseq;
2910           /*
2911           if (MB_CUR_MAX == 1)
2912           */
2913           if (nrules == 0)
2914             ch_collseq = collseqmb[ch];
2915           else
2916             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2917           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2918             bitset_set (sbcset, ch);
2919         }
2920       return REG_NOERROR;
2921     }
2922
2923   /* Local function for parse_bracket_exp used in _LIBC environement.
2924      Build the collating element which is represented by NAME.
2925      The result are written to MBCSET and SBCSET.
2926      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2927      pointer argument sinse we may update it.  */
2928
2929   auto inline reg_errcode_t
2930   __attribute ((always_inline))
2931   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2932          re_charset_t *mbcset;
2933          int *coll_sym_alloc;
2934          re_bitset_ptr_t sbcset;
2935          const unsigned char *name;
2936     {
2937       int32_t elem, idx;
2938       size_t name_len = strlen ((const char *) name);
2939       if (nrules != 0)
2940         {
2941           elem = seek_collating_symbol_entry (name, name_len);
2942           if (symb_table[2 * elem] != 0)
2943             {
2944               /* We found the entry.  */
2945               idx = symb_table[2 * elem + 1];
2946               /* Skip the name of collating element name.  */
2947               idx += 1 + extra[idx];
2948             }
2949           else if (symb_table[2 * elem] == 0 && name_len == 1)
2950             {
2951               /* No valid character, treat it as a normal
2952                  character.  */
2953               bitset_set (sbcset, name[0]);
2954               return REG_NOERROR;
2955             }
2956           else
2957             return REG_ECOLLATE;
2958
2959           /* Got valid collation sequence, add it as a new entry.  */
2960           /* Check the space of the arrays.  */
2961           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2962             {
2963               /* Not enough, realloc it.  */
2964               /* +1 in case of mbcset->ncoll_syms is 0.  */
2965               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2966               /* Use realloc since mbcset->coll_syms is NULL
2967                  if *alloc == 0.  */
2968               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2969                                                    new_coll_sym_alloc);
2970               if (BE (new_coll_syms == NULL, 0))
2971                 return REG_ESPACE;
2972               mbcset->coll_syms = new_coll_syms;
2973               *coll_sym_alloc = new_coll_sym_alloc;
2974             }
2975           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2976           return REG_NOERROR;
2977         }
2978       else
2979         {
2980           if (BE (name_len != 1, 0))
2981             return REG_ECOLLATE;
2982           else
2983             {
2984               bitset_set (sbcset, name[0]);
2985               return REG_NOERROR;
2986             }
2987         }
2988     }
2989 #endif
2990
2991   re_token_t br_token;
2992   re_bitset_ptr_t sbcset;
2993 #ifdef RE_ENABLE_I18N
2994   re_charset_t *mbcset;
2995   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2996   int equiv_class_alloc = 0, char_class_alloc = 0;
2997 #endif /* not RE_ENABLE_I18N */
2998   int non_match = 0;
2999   bin_tree_t *work_tree;
3000   int token_len;
3001   int first_round = 1;
3002 #ifdef _LIBC
3003   collseqmb = (const unsigned char *)
3004     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3005   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3006   if (nrules)
3007     {
3008       /*
3009       if (MB_CUR_MAX > 1)
3010       */
3011         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3012       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3013       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3014                                                   _NL_COLLATE_SYMB_TABLEMB);
3015       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3016                                                    _NL_COLLATE_SYMB_EXTRAMB);
3017     }
3018 #endif
3019   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3020 #ifdef RE_ENABLE_I18N
3021   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3022 #endif /* RE_ENABLE_I18N */
3023 #ifdef RE_ENABLE_I18N
3024   if (BE (sbcset == NULL || mbcset == NULL, 0))
3025 #else
3026   if (BE (sbcset == NULL, 0))
3027 #endif /* RE_ENABLE_I18N */
3028     {
3029       *err = REG_ESPACE;
3030       return NULL;
3031     }
3032
3033   token_len = peek_token_bracket (token, regexp, syntax);
3034   if (BE (token->type == END_OF_RE, 0))
3035     {
3036       *err = REG_BADPAT;
3037       goto parse_bracket_exp_free_return;
3038     }
3039   if (token->type == OP_NON_MATCH_LIST)
3040     {
3041 #ifdef RE_ENABLE_I18N
3042       mbcset->non_match = 1;
3043 #endif /* not RE_ENABLE_I18N */
3044       non_match = 1;
3045       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3046         bitset_set (sbcset, '\0');
3047       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3048       token_len = peek_token_bracket (token, regexp, syntax);
3049       if (BE (token->type == END_OF_RE, 0))
3050         {
3051           *err = REG_BADPAT;
3052           goto parse_bracket_exp_free_return;
3053         }
3054     }
3055
3056   /* We treat the first ']' as a normal character.  */
3057   if (token->type == OP_CLOSE_BRACKET)
3058     token->type = CHARACTER;
3059
3060   while (1)
3061     {
3062       bracket_elem_t start_elem, end_elem;
3063       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3064       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3065       reg_errcode_t ret;
3066       int token_len2 = 0, is_range_exp = 0;
3067       re_token_t token2;
3068
3069       start_elem.opr.name = start_name_buf;
3070       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3071                                    syntax, first_round);
3072       if (BE (ret != REG_NOERROR, 0))
3073         {
3074           *err = ret;
3075           goto parse_bracket_exp_free_return;
3076         }
3077       first_round = 0;
3078
3079       /* Get information about the next token.  We need it in any case.  */
3080       token_len = peek_token_bracket (token, regexp, syntax);
3081
3082       /* Do not check for ranges if we know they are not allowed.  */
3083       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3084         {
3085           if (BE (token->type == END_OF_RE, 0))
3086             {
3087               *err = REG_EBRACK;
3088               goto parse_bracket_exp_free_return;
3089             }
3090           if (token->type == OP_CHARSET_RANGE)
3091             {
3092               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3093               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3094               if (BE (token2.type == END_OF_RE, 0))
3095                 {
3096                   *err = REG_EBRACK;
3097                   goto parse_bracket_exp_free_return;
3098                 }
3099               if (token2.type == OP_CLOSE_BRACKET)
3100                 {
3101                   /* We treat the last '-' as a normal character.  */
3102                   re_string_skip_bytes (regexp, -token_len);
3103                   token->type = CHARACTER;
3104                 }
3105               else
3106                 is_range_exp = 1;
3107             }
3108         }
3109
3110       if (is_range_exp == 1)
3111         {
3112           end_elem.opr.name = end_name_buf;
3113           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3114                                        dfa, syntax, 1);
3115           if (BE (ret != REG_NOERROR, 0))
3116             {
3117               *err = ret;
3118               goto parse_bracket_exp_free_return;
3119             }
3120
3121           token_len = peek_token_bracket (token, regexp, syntax);
3122
3123 #ifdef _LIBC
3124           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3125                                   &start_elem, &end_elem);
3126 #else
3127 # ifdef RE_ENABLE_I18N
3128           *err = build_range_exp (sbcset,
3129                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3130                                   &range_alloc, &start_elem, &end_elem);
3131 # else
3132           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3133 # endif
3134 #endif /* RE_ENABLE_I18N */
3135           if (BE (*err != REG_NOERROR, 0))
3136             goto parse_bracket_exp_free_return;
3137         }
3138       else
3139         {
3140           switch (start_elem.type)
3141             {
3142             case SB_CHAR:
3143               bitset_set (sbcset, start_elem.opr.ch);
3144               break;
3145 #ifdef RE_ENABLE_I18N
3146             case MB_CHAR:
3147               /* Check whether the array has enough space.  */
3148               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3149                 {
3150                   wchar_t *new_mbchars;
3151                   /* Not enough, realloc it.  */
3152                   /* +1 in case of mbcset->nmbchars is 0.  */
3153                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3154                   /* Use realloc since array is NULL if *alloc == 0.  */
3155                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3156                                             mbchar_alloc);
3157                   if (BE (new_mbchars == NULL, 0))
3158                     goto parse_bracket_exp_espace;
3159                   mbcset->mbchars = new_mbchars;
3160                 }
3161               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3162               break;
3163 #endif /* RE_ENABLE_I18N */
3164             case EQUIV_CLASS:
3165               *err = build_equiv_class (sbcset,
3166 #ifdef RE_ENABLE_I18N
3167                                         mbcset, &equiv_class_alloc,
3168 #endif /* RE_ENABLE_I18N */
3169                                         start_elem.opr.name);
3170               if (BE (*err != REG_NOERROR, 0))
3171                 goto parse_bracket_exp_free_return;
3172               break;
3173             case COLL_SYM:
3174               *err = build_collating_symbol (sbcset,
3175 #ifdef RE_ENABLE_I18N
3176                                              mbcset, &coll_sym_alloc,
3177 #endif /* RE_ENABLE_I18N */
3178                                              start_elem.opr.name);
3179               if (BE (*err != REG_NOERROR, 0))
3180                 goto parse_bracket_exp_free_return;
3181               break;
3182             case CHAR_CLASS:
3183               *err = build_charclass (regexp->trans, sbcset,
3184 #ifdef RE_ENABLE_I18N
3185                                       mbcset, &char_class_alloc,
3186 #endif /* RE_ENABLE_I18N */
3187                                       start_elem.opr.name, syntax);
3188               if (BE (*err != REG_NOERROR, 0))
3189                goto parse_bracket_exp_free_return;
3190               break;
3191             default:
3192               assert (0);
3193               break;
3194             }
3195         }
3196       if (BE (token->type == END_OF_RE, 0))
3197         {
3198           *err = REG_EBRACK;
3199           goto parse_bracket_exp_free_return;
3200         }
3201       if (token->type == OP_CLOSE_BRACKET)
3202         break;
3203     }
3204
3205   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3206
3207   /* If it is non-matching list.  */
3208   if (non_match)
3209     bitset_not (sbcset);
3210
3211 #ifdef RE_ENABLE_I18N
3212   /* Ensure only single byte characters are set.  */
3213   if (dfa->mb_cur_max > 1)
3214     bitset_mask (sbcset, dfa->sb_char);
3215 #endif /* RE_ENABLE_I18N */
3216
3217   /* Build a tree for simple bracket.  */
3218   br_token.type = SIMPLE_BRACKET;
3219   br_token.opr.sbcset = sbcset;
3220   work_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3221   if (BE (work_tree == NULL, 0))
3222     goto parse_bracket_exp_espace;
3223
3224 #ifdef RE_ENABLE_I18N
3225   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3226       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3227                                                      || mbcset->non_match)))
3228     {
3229       re_token_t alt_token;
3230       bin_tree_t *mbc_tree;
3231       int sbc_idx;
3232       /* Build a tree for complex bracket.  */
3233       dfa->has_mb_node = 1;
3234       for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3235         if (sbcset[sbc_idx])
3236           break;
3237       /* If there are no bits set in sbcset, there is no point
3238          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3239       if (sbc_idx == BITSET_UINTS)
3240         {
3241           re_free (sbcset);
3242           dfa->nodes[work_tree->node_idx].type = COMPLEX_BRACKET;
3243           dfa->nodes[work_tree->node_idx].opr.mbcset = mbcset;
3244           return work_tree;
3245         }
3246       br_token.type = COMPLEX_BRACKET;
3247       br_token.opr.mbcset = mbcset;
3248       mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3249       if (BE (mbc_tree == NULL, 0))
3250         goto parse_bracket_exp_espace;
3251       /* Then join them by ALT node.  */
3252       alt_token.type = OP_ALT;
3253       dfa->has_plural_match = 1;
3254       work_tree = re_dfa_add_tree_node (dfa, work_tree, mbc_tree, &alt_token);
3255       if (BE (mbc_tree != NULL, 1))
3256         return work_tree;
3257     }
3258   else
3259     {
3260       free_charset (mbcset);
3261       return work_tree;
3262     }
3263 #else /* not RE_ENABLE_I18N */
3264   return work_tree;
3265 #endif /* not RE_ENABLE_I18N */
3266
3267  parse_bracket_exp_espace:
3268   *err = REG_ESPACE;
3269  parse_bracket_exp_free_return:
3270   re_free (sbcset);
3271 #ifdef RE_ENABLE_I18N
3272   free_charset (mbcset);
3273 #endif /* RE_ENABLE_I18N */
3274   return NULL;
3275 }
3276
3277 /* Parse an element in the bracket expression.  */
3278
3279 static reg_errcode_t
3280 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3281                        accept_hyphen)
3282      bracket_elem_t *elem;
3283      re_string_t *regexp;
3284      re_token_t *token;
3285      int token_len;
3286      re_dfa_t *dfa;
3287      reg_syntax_t syntax;
3288      int accept_hyphen;
3289 {
3290 #ifdef RE_ENABLE_I18N
3291   int cur_char_size;
3292   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3293   if (cur_char_size > 1)
3294     {
3295       elem->type = MB_CHAR;
3296       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3297       re_string_skip_bytes (regexp, cur_char_size);
3298       return REG_NOERROR;
3299     }
3300 #endif /* RE_ENABLE_I18N */
3301   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3302   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3303       || token->type == OP_OPEN_EQUIV_CLASS)
3304     return parse_bracket_symbol (elem, regexp, token);
3305   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3306     {
3307       /* A '-' must only appear as anything but a range indicator before
3308          the closing bracket.  Everything else is an error.  */
3309       re_token_t token2;
3310       (void) peek_token_bracket (&token2, regexp, syntax);
3311       if (token2.type != OP_CLOSE_BRACKET)
3312         /* The actual error value is not standardized since this whole
3313            case is undefined.  But ERANGE makes good sense.  */
3314         return REG_ERANGE;
3315     }
3316   elem->type = SB_CHAR;
3317   elem->opr.ch = token->opr.c;
3318   return REG_NOERROR;
3319 }
3320
3321 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3322    such as [:<character_class>:], [.<collating_element>.], and
3323    [=<equivalent_class>=].  */
3324
3325 static reg_errcode_t
3326 parse_bracket_symbol (elem, regexp, token)
3327      bracket_elem_t *elem;
3328      re_string_t *regexp;
3329      re_token_t *token;
3330 {
3331   unsigned char ch, delim = token->opr.c;
3332   int i = 0;
3333   if (re_string_eoi(regexp))
3334     return REG_EBRACK;
3335   for (;; ++i)
3336     {
3337       if (i >= BRACKET_NAME_BUF_SIZE)
3338         return REG_EBRACK;
3339       if (token->type == OP_OPEN_CHAR_CLASS)
3340         ch = re_string_fetch_byte_case (regexp);
3341       else
3342         ch = re_string_fetch_byte (regexp);
3343       if (re_string_eoi(regexp))
3344         return REG_EBRACK;
3345       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3346         break;
3347       elem->opr.name[i] = ch;
3348     }
3349   re_string_skip_bytes (regexp, 1);
3350   elem->opr.name[i] = '\0';
3351   switch (token->type)
3352     {
3353     case OP_OPEN_COLL_ELEM:
3354       elem->type = COLL_SYM;
3355       break;
3356     case OP_OPEN_EQUIV_CLASS:
3357       elem->type = EQUIV_CLASS;
3358       break;
3359     case OP_OPEN_CHAR_CLASS:
3360       elem->type = CHAR_CLASS;
3361       break;
3362     default:
3363       break;
3364     }
3365   return REG_NOERROR;
3366 }
3367
3368   /* Helper function for parse_bracket_exp.
3369      Build the equivalence class which is represented by NAME.
3370      The result are written to MBCSET and SBCSET.
3371      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3372      is a pointer argument sinse we may update it.  */
3373
3374 static reg_errcode_t
3375 #ifdef RE_ENABLE_I18N
3376 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3377      re_charset_t *mbcset;
3378      int *equiv_class_alloc;
3379 #else /* not RE_ENABLE_I18N */
3380 build_equiv_class (sbcset, name)
3381 #endif /* not RE_ENABLE_I18N */
3382      re_bitset_ptr_t sbcset;
3383      const unsigned char *name;
3384 {
3385 #if defined _LIBC
3386   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3387   if (nrules != 0)
3388     {
3389       const int32_t *table, *indirect;
3390       const unsigned char *weights, *extra, *cp;
3391       unsigned char char_buf[2];
3392       int32_t idx1, idx2;
3393       unsigned int ch;
3394       size_t len;
3395       /* This #include defines a local function!  */
3396 # include <locale/weight.h>
3397       /* Calculate the index for equivalence class.  */
3398       cp = name;
3399       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3400       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3401                                                _NL_COLLATE_WEIGHTMB);
3402       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3403                                                    _NL_COLLATE_EXTRAMB);
3404       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3405                                                 _NL_COLLATE_INDIRECTMB);
3406       idx1 = findidx (&cp);
3407       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3408         /* This isn't a valid character.  */
3409         return REG_ECOLLATE;
3410
3411       /* Build single byte matcing table for this equivalence class.  */
3412       char_buf[1] = (unsigned char) '\0';
3413       len = weights[idx1];
3414       for (ch = 0; ch < SBC_MAX; ++ch)
3415         {
3416           char_buf[0] = ch;
3417           cp = char_buf;
3418           idx2 = findidx (&cp);
3419 /*
3420           idx2 = table[ch];
3421 */
3422           if (idx2 == 0)
3423             /* This isn't a valid character.  */
3424             continue;
3425           if (len == weights[idx2])
3426             {
3427               int cnt = 0;
3428               while (cnt <= len &&
3429                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3430                 ++cnt;
3431
3432               if (cnt > len)
3433                 bitset_set (sbcset, ch);
3434             }
3435         }
3436       /* Check whether the array has enough space.  */
3437       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3438         {
3439           /* Not enough, realloc it.  */
3440           /* +1 in case of mbcset->nequiv_classes is 0.  */
3441           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3442           /* Use realloc since the array is NULL if *alloc == 0.  */
3443           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3444                                                    int32_t,
3445                                                    new_equiv_class_alloc);
3446           if (BE (new_equiv_classes == NULL, 0))
3447             return REG_ESPACE;
3448           mbcset->equiv_classes = new_equiv_classes;
3449           *equiv_class_alloc = new_equiv_class_alloc;
3450         }
3451       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3452     }
3453   else
3454 #endif /* _LIBC */
3455     {
3456       if (BE (strlen ((const char *) name) != 1, 0))
3457         return REG_ECOLLATE;
3458       bitset_set (sbcset, *name);
3459     }
3460   return REG_NOERROR;
3461 }
3462
3463   /* Helper function for parse_bracket_exp.
3464      Build the character class which is represented by NAME.
3465      The result are written to MBCSET and SBCSET.
3466      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3467      is a pointer argument sinse we may update it.  */
3468
3469 static reg_errcode_t
3470 #ifdef RE_ENABLE_I18N
3471 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3472      re_charset_t *mbcset;
3473      int *char_class_alloc;
3474 #else /* not RE_ENABLE_I18N */
3475 build_charclass (trans, sbcset, class_name, syntax)
3476 #endif /* not RE_ENABLE_I18N */
3477      unsigned RE_TRANSLATE_TYPE trans;
3478      re_bitset_ptr_t sbcset;
3479      const unsigned char *class_name;
3480      reg_syntax_t syntax;
3481 {
3482   int i;
3483   const char *name = (const char *) class_name;
3484
3485   /* In case of REG_ICASE "upper" and "lower" match the both of
3486      upper and lower cases.  */
3487   if ((syntax & RE_ICASE)
3488       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3489     name = "alpha";
3490
3491 #ifdef RE_ENABLE_I18N
3492   /* Check the space of the arrays.  */
3493   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3494     {
3495       /* Not enough, realloc it.  */
3496       /* +1 in case of mbcset->nchar_classes is 0.  */
3497       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3498       /* Use realloc since array is NULL if *alloc == 0.  */
3499       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3500                                                new_char_class_alloc);
3501       if (BE (new_char_classes == NULL, 0))
3502         return REG_ESPACE;
3503       mbcset->char_classes = new_char_classes;
3504       *char_class_alloc = new_char_class_alloc;
3505     }
3506   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3507 #endif /* RE_ENABLE_I18N */
3508
3509 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3510     for (i = 0; i < SBC_MAX; ++i)               \
3511       {                                         \
3512         if (ctype_func (i))                     \
3513           {                                     \
3514             int ch = trans ? trans[i] : i;      \
3515             bitset_set (sbcset, ch);            \
3516           }                                     \
3517       }
3518
3519   if (strcmp (name, "alnum") == 0)
3520     BUILD_CHARCLASS_LOOP (isalnum)
3521   else if (strcmp (name, "cntrl") == 0)
3522     BUILD_CHARCLASS_LOOP (iscntrl)
3523   else if (strcmp (name, "lower") == 0)
3524     BUILD_CHARCLASS_LOOP (islower)
3525   else if (strcmp (name, "space") == 0)
3526     BUILD_CHARCLASS_LOOP (isspace)
3527   else if (strcmp (name, "alpha") == 0)
3528     BUILD_CHARCLASS_LOOP (isalpha)
3529   else if (strcmp (name, "digit") == 0)
3530     BUILD_CHARCLASS_LOOP (isdigit)
3531   else if (strcmp (name, "print") == 0)
3532     BUILD_CHARCLASS_LOOP (isprint)
3533   else if (strcmp (name, "upper") == 0)
3534     BUILD_CHARCLASS_LOOP (isupper)
3535   else if (strcmp (name, "blank") == 0)
3536     BUILD_CHARCLASS_LOOP (isblank)
3537   else if (strcmp (name, "graph") == 0)
3538     BUILD_CHARCLASS_LOOP (isgraph)
3539   else if (strcmp (name, "punct") == 0)
3540     BUILD_CHARCLASS_LOOP (ispunct)
3541   else if (strcmp (name, "xdigit") == 0)
3542     BUILD_CHARCLASS_LOOP (isxdigit)
3543   else
3544     return REG_ECTYPE;
3545
3546   return REG_NOERROR;
3547 }
3548
3549 static bin_tree_t *
3550 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3551      re_dfa_t *dfa;
3552      unsigned RE_TRANSLATE_TYPE trans;
3553      const unsigned char *class_name;
3554      const unsigned char *extra;
3555      int non_match;
3556      reg_errcode_t *err;
3557 {
3558   re_bitset_ptr_t sbcset;
3559 #ifdef RE_ENABLE_I18N
3560   re_charset_t *mbcset;
3561   int alloc = 0;
3562 #endif /* not RE_ENABLE_I18N */
3563   reg_errcode_t ret;
3564   re_token_t br_token;
3565   bin_tree_t *tree;
3566
3567   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3568 #ifdef RE_ENABLE_I18N
3569   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3570 #endif /* RE_ENABLE_I18N */
3571
3572 #ifdef RE_ENABLE_I18N
3573   if (BE (sbcset == NULL || mbcset == NULL, 0))
3574 #else /* not RE_ENABLE_I18N */
3575   if (BE (sbcset == NULL, 0))
3576 #endif /* not RE_ENABLE_I18N */
3577     {
3578       *err = REG_ESPACE;
3579       return NULL;
3580     }
3581
3582   if (non_match)
3583     {
3584 #ifdef RE_ENABLE_I18N
3585       /*
3586       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3587         bitset_set(cset->sbcset, '\0');
3588       */
3589       mbcset->non_match = 1;
3590 #endif /* not RE_ENABLE_I18N */
3591     }
3592
3593   /* We don't care the syntax in this case.  */
3594   ret = build_charclass (trans, sbcset,
3595 #ifdef RE_ENABLE_I18N
3596                          mbcset, &alloc,
3597 #endif /* RE_ENABLE_I18N */
3598                          class_name, 0);
3599
3600   if (BE (ret != REG_NOERROR, 0))
3601     {
3602       re_free (sbcset);
3603 #ifdef RE_ENABLE_I18N
3604       free_charset (mbcset);
3605 #endif /* RE_ENABLE_I18N */
3606       *err = ret;
3607       return NULL;
3608     }
3609   /* \w match '_' also.  */
3610   for (; *extra; extra++)
3611     bitset_set (sbcset, *extra);
3612
3613   /* If it is non-matching list.  */
3614   if (non_match)
3615     bitset_not (sbcset);
3616
3617 #ifdef RE_ENABLE_I18N
3618   /* Ensure only single byte characters are set.  */
3619   if (dfa->mb_cur_max > 1)
3620     bitset_mask (sbcset, dfa->sb_char);
3621 #endif
3622
3623   /* Build a tree for simple bracket.  */
3624   br_token.type = SIMPLE_BRACKET;
3625   br_token.opr.sbcset = sbcset;
3626   tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3627   if (BE (tree == NULL, 0))
3628     goto build_word_op_espace;
3629
3630 #ifdef RE_ENABLE_I18N
3631   if (dfa->mb_cur_max > 1)
3632     {
3633       re_token_t alt_token;
3634       bin_tree_t *mbc_tree;
3635       /* Build a tree for complex bracket.  */
3636       br_token.type = COMPLEX_BRACKET;
3637       br_token.opr.mbcset = mbcset;
3638       dfa->has_mb_node = 1;
3639       mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3640       if (BE (mbc_tree == NULL, 0))
3641         goto build_word_op_espace;
3642       /* Then join them by ALT node.  */
3643       alt_token.type = OP_ALT;
3644       dfa->has_plural_match = 1;
3645       tree = re_dfa_add_tree_node (dfa, tree, mbc_tree, &alt_token);
3646       if (BE (mbc_tree != NULL, 1))
3647         return tree;
3648     }
3649   else
3650     {
3651       free_charset (mbcset);
3652       return tree;
3653     }
3654 #else /* not RE_ENABLE_I18N */
3655   return tree;
3656 #endif /* not RE_ENABLE_I18N */
3657
3658  build_word_op_espace:
3659   re_free (sbcset);
3660 #ifdef RE_ENABLE_I18N
3661   free_charset (mbcset);
3662 #endif /* RE_ENABLE_I18N */
3663   *err = REG_ESPACE;
3664   return NULL;
3665 }
3666
3667 /* This is intended for the expressions like "a{1,3}".
3668    Fetch a number from `input', and return the number.
3669    Return -1, if the number field is empty like "{,1}".
3670    Return -2, If an error is occured.  */
3671
3672 static int
3673 fetch_number (input, token, syntax)
3674      re_string_t *input;
3675      re_token_t *token;
3676      reg_syntax_t syntax;
3677 {
3678   int num = -1;
3679   unsigned char c;
3680   while (1)
3681     {
3682       fetch_token (token, input, syntax);
3683       c = token->opr.c;
3684       if (BE (token->type == END_OF_RE, 0))
3685         return -2;
3686       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3687         break;
3688       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3689              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3690       num = (num > RE_DUP_MAX) ? -2 : num;
3691     }
3692   return num;
3693 }
3694 \f
3695 #ifdef RE_ENABLE_I18N
3696 static void
3697 free_charset (re_charset_t *cset)
3698 {
3699   re_free (cset->mbchars);
3700 # ifdef _LIBC
3701   re_free (cset->coll_syms);
3702   re_free (cset->equiv_classes);
3703   re_free (cset->range_starts);
3704   re_free (cset->range_ends);
3705 # endif
3706   re_free (cset->char_classes);
3707   re_free (cset);
3708 }
3709 #endif /* RE_ENABLE_I18N */
3710 \f
3711 /* Functions for binary tree operation.  */
3712
3713 /* Create a tree node.  */
3714
3715 static bin_tree_t *
3716 create_tree (dfa, left, right, type, index)
3717      re_dfa_t *dfa;
3718      bin_tree_t *left;
3719      bin_tree_t *right;
3720      re_token_type_t type;
3721      int index;
3722 {
3723   bin_tree_t *tree;
3724   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3725     {
3726       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3727
3728       if (storage == NULL)
3729         return NULL;
3730       storage->next = dfa->str_tree_storage;
3731       dfa->str_tree_storage = storage;
3732       dfa->str_tree_storage_idx = 0;
3733     }
3734   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3735
3736   tree->parent = NULL;
3737   tree->left = left;
3738   tree->right = right;
3739   tree->type = type;
3740   tree->node_idx = index;
3741   tree->first = -1;
3742   tree->next = -1;
3743   re_node_set_init_empty (&tree->eclosure);
3744
3745   if (left != NULL)
3746     left->parent = tree;
3747   if (right != NULL)
3748     right->parent = tree;
3749   return tree;
3750 }
3751
3752 /* Create both a DFA node and a tree for it.  */
3753
3754 static bin_tree_t *
3755 re_dfa_add_tree_node (dfa, left, right, token)
3756      re_dfa_t *dfa;
3757      bin_tree_t *left;
3758      bin_tree_t *right;
3759      const re_token_t *token;
3760 {
3761   int new_idx = re_dfa_add_node (dfa, *token, 0);
3762
3763   if (new_idx == -1)
3764     return NULL;
3765
3766   return create_tree (dfa, left, right, 0, new_idx);
3767 }
3768
3769 /* Mark the tree SRC as an optional subexpression.  */
3770
3771 static void
3772 mark_opt_subexp (src, dfa)
3773      const bin_tree_t *src;
3774      re_dfa_t *dfa;
3775 {
3776   /* Pass an OPT_SUBEXP_IDX which is != 1 if the duplicated tree is
3777      a subexpression.  */
3778   if (src->type == CONCAT
3779       && src->left->type == NON_TYPE
3780       && dfa->nodes[src->left->node_idx].type == OP_OPEN_SUBEXP)
3781     mark_opt_subexp_iter (src, dfa, dfa->nodes[src->left->node_idx].opr.idx);
3782 }
3783
3784
3785 /* Recursive tree walker for mark_opt_subexp.  */
3786
3787 static void
3788 mark_opt_subexp_iter (src, dfa, idx)
3789      const bin_tree_t *src;
3790      re_dfa_t *dfa;
3791      int idx;
3792 {
3793   int node_idx;
3794
3795   if (src->type == NON_TYPE)
3796     {
3797       node_idx = src->node_idx;
3798       if ((dfa->nodes[node_idx].type == OP_OPEN_SUBEXP
3799            || dfa->nodes[node_idx].type == OP_CLOSE_SUBEXP)
3800           && dfa->nodes[node_idx].opr.idx == idx)
3801         dfa->nodes[node_idx].opt_subexp = 1;
3802      }
3803
3804   if (src->left != NULL)
3805     mark_opt_subexp_iter (src->left, dfa, idx);
3806
3807   if (src->right != NULL)
3808     mark_opt_subexp_iter (src->right, dfa, idx);
3809 }
3810
3811
3812 /* Duplicate the node SRC, and return new node.  */
3813
3814 static bin_tree_t *
3815 duplicate_tree (src, dfa)
3816      const bin_tree_t *src;
3817      re_dfa_t *dfa;
3818 {
3819   bin_tree_t *left = NULL, *right = NULL, *new_tree;
3820   int new_node_idx;
3821   /* Since node indies must be according to Post-order of the tree,
3822      we must duplicate the left at first.  */
3823   if (src->left != NULL)
3824     {
3825       left = duplicate_tree (src->left, dfa);
3826       if (left == NULL)
3827         return NULL;
3828     }
3829
3830   /* Secondaly, duplicate the right.  */
3831   if (src->right != NULL)
3832     {
3833       right = duplicate_tree (src->right, dfa);
3834       if (right == NULL)
3835         return NULL;
3836     }
3837
3838   /* At last, duplicate itself.  */
3839   if (src->type == NON_TYPE)
3840     {
3841       new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3842       dfa->nodes[new_node_idx].duplicated = 1;
3843       if (BE (new_node_idx == -1, 0))
3844         return NULL;
3845     }
3846   else
3847     new_node_idx = src->type;
3848
3849   new_tree = create_tree (dfa, left, right, src->type, new_node_idx);
3850   return new_tree;
3851 }