posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           int length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  27 static void init_word_char (re_dfa_t *dfa);
  28 #ifdef RE_ENABLE_I18N
  29 static void free_charset (re_charset_t *cset);
  30 #endif /* RE_ENABLE_I18N */
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 #ifdef RE_ENABLE_I18N
  34 static void optimize_utf8 (re_dfa_t *dfa);
  35 #endif
  36 struct subexp_optimize
  37 {
  38   re_dfa_t *dfa;
  39   re_token_t *nodes;
  40   int no_sub, re_nsub;
  41 };
  42 static bin_tree_t *optimize_subexps (struct subexp_optimize *so,
  43                                      bin_tree_t *node, int sidx, int depth);
  44 static reg_errcode_t analyze (re_dfa_t *dfa);
  45 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
  46 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
  47 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
  48 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
  49 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
  50                                              int top_clone_node, int root_node,
  51                                              unsigned int constraint);
  52 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
  53                                      unsigned int constraint);
  54 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
  55                                    unsigned int constraint);
  56 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  57 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  58                                          int node, int root);
  59 static void calc_inveclosure (re_dfa_t *dfa);
  60 static int fetch_number (re_string_t *input, re_token_t *token,
  61                          reg_syntax_t syntax);
  62 static void fetch_token (re_token_t *result, re_string_t *input,
  63                          reg_syntax_t syntax);
  64 static int peek_token (re_token_t *token, re_string_t *input,
  65                         reg_syntax_t syntax);
  66 static int peek_token_bracket (re_token_t *token, re_string_t *input,
  67                                reg_syntax_t syntax);
  68 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  69                           reg_syntax_t syntax, reg_errcode_t *err);
  70 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  74                                  re_token_t *token, reg_syntax_t syntax,
  75                                  int nest, reg_errcode_t *err);
  76 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  77                                      re_token_t *token, reg_syntax_t syntax,
  78                                      int nest, reg_errcode_t *err);
  79 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  80                                   re_token_t *token, reg_syntax_t syntax,
  81                                   int nest, reg_errcode_t *err);
  82 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  83                                  re_dfa_t *dfa, re_token_t *token,
  84                                  reg_syntax_t syntax, reg_errcode_t *err);
  85 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  86                                       re_token_t *token, reg_syntax_t syntax,
  87                                       reg_errcode_t *err);
  88 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  89                                             re_string_t *regexp,
  90                                             re_token_t *token, int token_len,
  91                                             re_dfa_t *dfa,
  92                                             reg_syntax_t syntax,
  93                                             int accept_hyphen);
  94 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  95                                           re_string_t *regexp,
  96                                           re_token_t *token);
  97 #ifndef _LIBC
  98 # ifdef RE_ENABLE_I18N
  99 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 100                                       re_charset_t *mbcset, int *range_alloc,
 101                                       bracket_elem_t *start_elem,
 102                                       bracket_elem_t *end_elem);
 103 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 104                                              re_charset_t *mbcset,
 105                                              int *coll_sym_alloc,
 106                                              const unsigned char *name);
 107 # else /* not RE_ENABLE_I18N */
 108 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 109                                       bracket_elem_t *start_elem,
 110                                       bracket_elem_t *end_elem);
 111 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 112                                              const unsigned char *name);
 113 # endif /* not RE_ENABLE_I18N */
 114 #endif /* not _LIBC */
 115 #ifdef RE_ENABLE_I18N
 116 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 117                                         re_charset_t *mbcset,
 118                                         int *equiv_class_alloc,
 119                                         const unsigned char *name);
 120 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 121                                       re_bitset_ptr_t sbcset,
 122                                       re_charset_t *mbcset,
 123                                       int *char_class_alloc,
 124                                       const unsigned char *class_name,
 125                                       reg_syntax_t syntax);
 126 #else  /* not RE_ENABLE_I18N */
 127 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 128                                         const unsigned char *name);
 129 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 130                                       re_bitset_ptr_t sbcset,
 131                                       const unsigned char *class_name,
 132                                       reg_syntax_t syntax);
 133 #endif /* not RE_ENABLE_I18N */
 134 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 135                                        unsigned RE_TRANSLATE_TYPE trans,
 136                                        const unsigned char *class_name,
 137                                        const unsigned char *extra,
 138                                        int non_match, reg_errcode_t *err);
 139 static bin_tree_t *create_tree (re_dfa_t *dfa,
 140                                 bin_tree_t *left, bin_tree_t *right,
 141                                 re_token_type_t type, int index);
 142 static bin_tree_t *re_dfa_add_tree_node (re_dfa_t *dfa,
 143                                          bin_tree_t *left, bin_tree_t *right,
 144                                          const re_token_t *token)
 145   __attribute ((noinline));
 146 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 147 static void mark_opt_subexp (const bin_tree_t *src, re_dfa_t *dfa);
 148 static void mark_opt_subexp_iter (const bin_tree_t *src, re_dfa_t *dfa, int idx);
 149 \f
 150 /* This table gives an error message for each of the error codes listed
 151    in regex.h.  Obviously the order here has to be same as there.
 152    POSIX doesn't require that we do anything for REG_NOERROR,
 153    but why not be nice?  */
 154
 155 const char __re_error_msgid[] attribute_hidden =
 156   {
 157 #define REG_NOERROR_IDX 0
 158     gettext_noop ("Success")    /* REG_NOERROR */
 159     "\0"
 160 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 161     gettext_noop ("No match")   /* REG_NOMATCH */
 162     "\0"
 163 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 164     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 165     "\0"
 166 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 167     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 168     "\0"
 169 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 170     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 171     "\0"
 172 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 173     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 174     "\0"
 175 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 176     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 177     "\0"
 178 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 179     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 180     "\0"
 181 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 182     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 183     "\0"
 184 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 185     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 186     "\0"
 187 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 188     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 189     "\0"
 190 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 191     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 192     "\0"
 193 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 194     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 195     "\0"
 196 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 197     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 198     "\0"
 199 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 200     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 201     "\0"
 202 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 203     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 204     "\0"
 205 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 206     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 207   };
 208
 209 const size_t __re_error_msgid_idx[] attribute_hidden =
 210   {
 211     REG_NOERROR_IDX,
 212     REG_NOMATCH_IDX,
 213     REG_BADPAT_IDX,
 214     REG_ECOLLATE_IDX,
 215     REG_ECTYPE_IDX,
 216     REG_EESCAPE_IDX,
 217     REG_ESUBREG_IDX,
 218     REG_EBRACK_IDX,
 219     REG_EPAREN_IDX,
 220     REG_EBRACE_IDX,
 221     REG_BADBR_IDX,
 222     REG_ERANGE_IDX,
 223     REG_ESPACE_IDX,
 224     REG_BADRPT_IDX,
 225     REG_EEND_IDX,
 226     REG_ESIZE_IDX,
 227     REG_ERPAREN_IDX
 228   };
 229 \f
 230 /* Entry points for GNU code.  */
 231
 232 /* re_compile_pattern is the GNU regular expression compiler: it
 233    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 234    Returns 0 if the pattern was valid, otherwise an error string.
 235
 236    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 237    are set in BUFP on entry.  */
 238
 239 const char *
 240 re_compile_pattern (pattern, length, bufp)
 241     const char *pattern;
 242     size_t length;
 243     struct re_pattern_buffer *bufp;
 244 {
 245   reg_errcode_t ret;
 246
 247   /* And GNU code determines whether or not to get register information
 248      by passing null for the REGS argument to re_match, etc., not by
 249      setting no_sub, unless RE_NO_SUB is set.  */
 250   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 251
 252   /* Match anchors at newline.  */
 253   bufp->newline_anchor = 1;
 254
 255   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 256
 257   if (!ret)
 258     return NULL;
 259   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 260 }
 261 #ifdef _LIBC
 262 weak_alias (__re_compile_pattern, re_compile_pattern)
 263 #endif
 264
 265 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 266    also be assigned to arbitrarily: each pattern buffer stores its own
 267    syntax, so it can be changed between regex compilations.  */
 268 /* This has no initializer because initialized variables in Emacs
 269    become read-only after dumping.  */
 270 reg_syntax_t re_syntax_options;
 271
 272
 273 /* Specify the precise syntax of regexps for compilation.  This provides
 274    for compatibility for various utilities which historically have
 275    different, incompatible syntaxes.
 276
 277    The argument SYNTAX is a bit mask comprised of the various bits
 278    defined in regex.h.  We return the old syntax.  */
 279
 280 reg_syntax_t
 281 re_set_syntax (syntax)
 282     reg_syntax_t syntax;
 283 {
 284   reg_syntax_t ret = re_syntax_options;
 285
 286   re_syntax_options = syntax;
 287   return ret;
 288 }
 289 #ifdef _LIBC
 290 weak_alias (__re_set_syntax, re_set_syntax)
 291 #endif
 292
 293 int
 294 re_compile_fastmap (bufp)
 295     struct re_pattern_buffer *bufp;
 296 {
 297   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 298   char *fastmap = bufp->fastmap;
 299
 300   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 301   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 302   if (dfa->init_state != dfa->init_state_word)
 303     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 304   if (dfa->init_state != dfa->init_state_nl)
 305     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 306   if (dfa->init_state != dfa->init_state_begbuf)
 307     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 308   bufp->fastmap_accurate = 1;
 309   return 0;
 310 }
 311 #ifdef _LIBC
 312 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 313 #endif
 314
 315 static inline void
 316 __attribute ((always_inline))
 317 re_set_fastmap (char *fastmap, int icase, int ch)
 318 {
 319   fastmap[ch] = 1;
 320   if (icase)
 321     fastmap[tolower (ch)] = 1;
 322 }
 323
 324 /* Helper function for re_compile_fastmap.
 325    Compile fastmap for the initial_state INIT_STATE.  */
 326
 327 static void
 328 re_compile_fastmap_iter (bufp, init_state, fastmap)
 329      regex_t *bufp;
 330      const re_dfastate_t *init_state;
 331      char *fastmap;
 332 {
 333   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 334   int node_cnt;
 335   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 336   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 337     {
 338       int node = init_state->nodes.elems[node_cnt];
 339       re_token_type_t type = dfa->nodes[node].type;
 340
 341       if (type == CHARACTER)
 342         {
 343           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 344 #ifdef RE_ENABLE_I18N
 345           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 346             {
 347               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 348               wchar_t wc;
 349               mbstate_t state;
 350
 351               p = buf;
 352               *p++ = dfa->nodes[node].opr.c;
 353               while (++node < dfa->nodes_len
 354                      && dfa->nodes[node].type == CHARACTER
 355                      && dfa->nodes[node].mb_partial)
 356                 *p++ = dfa->nodes[node].opr.c;
 357               memset (&state, 0, sizeof (state));
 358               if (mbrtowc (&wc, (const char *) buf, p - buf,
 359                            &state) == p - buf
 360                   && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
 361                 re_set_fastmap (fastmap, 0, buf[0]);
 362             }
 363 #endif
 364         }
 365       else if (type == SIMPLE_BRACKET)
 366         {
 367           int i, j, ch;
 368           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 369             for (j = 0; j < UINT_BITS; ++j, ++ch)
 370               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 371                 re_set_fastmap (fastmap, icase, ch);
 372         }
 373 #ifdef RE_ENABLE_I18N
 374       else if (type == COMPLEX_BRACKET)
 375         {
 376           int i;
 377           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 378           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 379               || cset->nranges || cset->nchar_classes)
 380             {
 381 # ifdef _LIBC
 382               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 383                 {
 384                   /* In this case we want to catch the bytes which are
 385                      the first byte of any collation elements.
 386                      e.g. In da_DK, we want to catch 'a' since "aa"
 387                           is a valid collation element, and don't catch
 388                           'b' since 'b' is the only collation element
 389                           which starts from 'b'.  */
 390                   int j, ch;
 391                   const int32_t *table = (const int32_t *)
 392                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 393                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 394                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 395                       if (table[ch] < 0)
 396                         re_set_fastmap (fastmap, icase, ch);
 397                 }
 398 # else
 399               if (dfa->mb_cur_max > 1)
 400                 for (i = 0; i < SBC_MAX; ++i)
 401                   if (__btowc (i) == WEOF)
 402                     re_set_fastmap (fastmap, icase, i);
 403 # endif /* not _LIBC */
 404             }
 405           for (i = 0; i < cset->nmbchars; ++i)
 406             {
 407               char buf[256];
 408               mbstate_t state;
 409               memset (&state, '\0', sizeof (state));
 410               __wcrtomb (buf, cset->mbchars[i], &state);
 411               re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 412               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 413                 {
 414                   __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
 415                   re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 416                 }
 417             }
 418         }
 419 #endif /* RE_ENABLE_I18N */
 420       else if (type == OP_PERIOD
 421 #ifdef RE_ENABLE_I18N
 422                || type == OP_UTF8_PERIOD
 423 #endif /* RE_ENABLE_I18N */
 424                || type == END_OF_RE)
 425         {
 426           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 427           if (type == END_OF_RE)
 428             bufp->can_be_null = 1;
 429           return;
 430         }
 431     }
 432 }
 433 \f
 434 /* Entry point for POSIX code.  */
 435 /* regcomp takes a regular expression as a string and compiles it.
 436
 437    PREG is a regex_t *.  We do not expect any fields to be initialized,
 438    since POSIX says we shouldn't.  Thus, we set
 439
 440      `buffer' to the compiled pattern;
 441      `used' to the length of the compiled pattern;
 442      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 443        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 444        RE_SYNTAX_POSIX_BASIC;
 445      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 446      `fastmap' to an allocated space for the fastmap;
 447      `fastmap_accurate' to zero;
 448      `re_nsub' to the number of subexpressions in PATTERN.
 449
 450    PATTERN is the address of the pattern string.
 451
 452    CFLAGS is a series of bits which affect compilation.
 453
 454      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 455      use POSIX basic syntax.
 456
 457      If REG_NEWLINE is set, then . and [^...] don't match newline.
 458      Also, regexec will try a match beginning after every newline.
 459
 460      If REG_ICASE is set, then we considers upper- and lowercase
 461      versions of letters to be equivalent when matching.
 462
 463      If REG_NOSUB is set, then when PREG is passed to regexec, that
 464      routine will report only success or failure, and nothing about the
 465      registers.
 466
 467    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 468    the return codes and their meanings.)  */
 469
 470 int
 471 regcomp (preg, pattern, cflags)
 472     regex_t *__restrict preg;
 473     const char *__restrict pattern;
 474     int cflags;
 475 {
 476   reg_errcode_t ret;
 477   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 478                          : RE_SYNTAX_POSIX_BASIC);
 479
 480   preg->buffer = NULL;
 481   preg->allocated = 0;
 482   preg->used = 0;
 483
 484   /* Try to allocate space for the fastmap.  */
 485   preg->fastmap = re_malloc (char, SBC_MAX);
 486   if (BE (preg->fastmap == NULL, 0))
 487     return REG_ESPACE;
 488
 489   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 490
 491   /* If REG_NEWLINE is set, newlines are treated differently.  */
 492   if (cflags & REG_NEWLINE)
 493     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 494       syntax &= ~RE_DOT_NEWLINE;
 495       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 496       /* It also changes the matching behavior.  */
 497       preg->newline_anchor = 1;
 498     }
 499   else
 500     preg->newline_anchor = 0;
 501   preg->no_sub = !!(cflags & REG_NOSUB);
 502   preg->translate = NULL;
 503
 504   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 505
 506   /* POSIX doesn't distinguish between an unmatched open-group and an
 507      unmatched close-group: both are REG_EPAREN.  */
 508   if (ret == REG_ERPAREN)
 509     ret = REG_EPAREN;
 510
 511   /* We have already checked preg->fastmap != NULL.  */
 512   if (BE (ret == REG_NOERROR, 1))
 513     /* Compute the fastmap now, since regexec cannot modify the pattern
 514        buffer.  This function never fails in this implementation.  */
 515     (void) re_compile_fastmap (preg);
 516   else
 517     {
 518       /* Some error occurred while compiling the expression.  */
 519       re_free (preg->fastmap);
 520       preg->fastmap = NULL;
 521     }
 522
 523   return (int) ret;
 524 }
 525 #ifdef _LIBC
 526 weak_alias (__regcomp, regcomp)
 527 #endif
 528
 529 /* Returns a message corresponding to an error code, ERRCODE, returned
 530    from either regcomp or regexec.   We don't use PREG here.  */
 531
 532 size_t
 533 regerror (errcode, preg, errbuf, errbuf_size)
 534     int errcode;
 535     const regex_t *preg;
 536     char *errbuf;
 537     size_t errbuf_size;
 538 {
 539   const char *msg;
 540   size_t msg_size;
 541
 542   if (BE (errcode < 0
 543           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 544                                / sizeof (__re_error_msgid_idx[0])), 0))
 545     /* Only error codes returned by the rest of the code should be passed
 546        to this routine.  If we are given anything else, or if other regex
 547        code generates an invalid error code, then the program has a bug.
 548        Dump core so we can fix it.  */
 549     abort ();
 550
 551   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 552
 553   msg_size = strlen (msg) + 1; /* Includes the null.  */
 554
 555   if (BE (errbuf_size != 0, 1))
 556     {
 557       if (BE (msg_size > errbuf_size, 0))
 558         {
 559 #if defined HAVE_MEMPCPY || defined _LIBC
 560           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 561 #else
 562           memcpy (errbuf, msg, errbuf_size - 1);
 563           errbuf[errbuf_size - 1] = 0;
 564 #endif
 565         }
 566       else
 567         memcpy (errbuf, msg, msg_size);
 568     }
 569
 570   return msg_size;
 571 }
 572 #ifdef _LIBC
 573 weak_alias (__regerror, regerror)
 574 #endif
 575
 576
 577 #ifdef RE_ENABLE_I18N
 578 /* This static array is used for the map to single-byte characters when
 579    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 580    it the same all the time.  UTF-8 is the preferred encoding so this is
 581    a worthwhile optimization.  */
 582 static const bitset utf8_sb_map =
 583 {
 584   /* Set the first 128 bits.  */
 585 # if UINT_MAX == 0xffffffff
 586   0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
 587 # else
 588 #  error "Add case for new unsigned int size"
 589 # endif
 590 };
 591 #endif
 592
 593
 594 static void
 595 free_dfa_content (re_dfa_t *dfa)
 596 {
 597   int i, j;
 598
 599   if (dfa->nodes)
 600     for (i = 0; i < dfa->nodes_len; ++i)
 601       {
 602         re_token_t *node = dfa->nodes + i;
 603 #ifdef RE_ENABLE_I18N
 604         if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
 605           free_charset (node->opr.mbcset);
 606         else
 607 #endif /* RE_ENABLE_I18N */
 608           if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
 609             re_free (node->opr.sbcset);
 610       }
 611   re_free (dfa->nexts);
 612   for (i = 0; i < dfa->nodes_len; ++i)
 613     {
 614       if (dfa->eclosures != NULL)
 615         re_node_set_free (dfa->eclosures + i);
 616       if (dfa->inveclosures != NULL)
 617         re_node_set_free (dfa->inveclosures + i);
 618       if (dfa->edests != NULL)
 619         re_node_set_free (dfa->edests + i);
 620     }
 621   re_free (dfa->edests);
 622   re_free (dfa->eclosures);
 623   re_free (dfa->inveclosures);
 624   re_free (dfa->nodes);
 625
 626   if (dfa->state_table)
 627     for (i = 0; i <= dfa->state_hash_mask; ++i)
 628       {
 629         struct re_state_table_entry *entry = dfa->state_table + i;
 630         for (j = 0; j < entry->num; ++j)
 631           {
 632             re_dfastate_t *state = entry->array[j];
 633             free_state (state);
 634           }
 635         re_free (entry->array);
 636       }
 637   re_free (dfa->state_table);
 638 #ifdef RE_ENABLE_I18N
 639   if (dfa->sb_char != utf8_sb_map)
 640     re_free (dfa->sb_char);
 641 #endif
 642   re_free (dfa->subexp_map);
 643 #ifdef DEBUG
 644   re_free (dfa->re_str);
 645 #endif
 646
 647   re_free (dfa);
 648 }
 649
 650
 651 /* Free dynamically allocated space used by PREG.  */
 652
 653 void
 654 regfree (preg)
 655     regex_t *preg;
 656 {
 657   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 658   if (BE (dfa != NULL, 1))
 659     free_dfa_content (dfa);
 660   preg->buffer = NULL;
 661   preg->allocated = 0;
 662
 663   re_free (preg->fastmap);
 664   preg->fastmap = NULL;
 665
 666   re_free (preg->translate);
 667   preg->translate = NULL;
 668 }
 669 #ifdef _LIBC
 670 weak_alias (__regfree, regfree)
 671 #endif
 672 \f
 673 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 674    them unless specifically requested.  */
 675
 676 #if defined _REGEX_RE_COMP || defined _LIBC
 677
 678 /* BSD has one and only one pattern buffer.  */
 679 static struct re_pattern_buffer re_comp_buf;
 680
 681 char *
 682 # ifdef _LIBC
 683 /* Make these definitions weak in libc, so POSIX programs can redefine
 684    these names if they don't use our functions, and still use
 685    regcomp/regexec above without link errors.  */
 686 weak_function
 687 # endif
 688 re_comp (s)
 689      const char *s;
 690 {
 691   reg_errcode_t ret;
 692   char *fastmap;
 693
 694   if (!s)
 695     {
 696       if (!re_comp_buf.buffer)
 697         return gettext ("No previous regular expression");
 698       return 0;
 699     }
 700
 701   if (re_comp_buf.buffer)
 702     {
 703       fastmap = re_comp_buf.fastmap;
 704       re_comp_buf.fastmap = NULL;
 705       __regfree (&re_comp_buf);
 706       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 707       re_comp_buf.fastmap = fastmap;
 708     }
 709
 710   if (re_comp_buf.fastmap == NULL)
 711     {
 712       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 713       if (re_comp_buf.fastmap == NULL)
 714         return (char *) gettext (__re_error_msgid
 715                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 716     }
 717
 718   /* Since `re_exec' always passes NULL for the `regs' argument, we
 719      don't need to initialize the pattern buffer fields which affect it.  */
 720
 721   /* Match anchors at newlines.  */
 722   re_comp_buf.newline_anchor = 1;
 723
 724   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 725
 726   if (!ret)
 727     return NULL;
 728
 729   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 730   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 731 }
 732
 733 #ifdef _LIBC
 734 libc_freeres_fn (free_mem)
 735 {
 736   __regfree (&re_comp_buf);
 737 }
 738 #endif
 739
 740 #endif /* _REGEX_RE_COMP */
 741 \f
 742 /* Internal entry point.
 743    Compile the regular expression PATTERN, whose length is LENGTH.
 744    SYNTAX indicate regular expression's syntax.  */
 745
 746 static reg_errcode_t
 747 re_compile_internal (preg, pattern, length, syntax)
 748      regex_t *preg;
 749      const char * pattern;
 750      int length;
 751      reg_syntax_t syntax;
 752 {
 753   reg_errcode_t err = REG_NOERROR;
 754   re_dfa_t *dfa;
 755   re_string_t regexp;
 756
 757   /* Initialize the pattern buffer.  */
 758   preg->fastmap_accurate = 0;
 759   preg->syntax = syntax;
 760   preg->not_bol = preg->not_eol = 0;
 761   preg->used = 0;
 762   preg->re_nsub = 0;
 763   preg->can_be_null = 0;
 764   preg->regs_allocated = REGS_UNALLOCATED;
 765
 766   /* Initialize the dfa.  */
 767   dfa = (re_dfa_t *) preg->buffer;
 768   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 769     {
 770       /* If zero allocated, but buffer is non-null, try to realloc
 771          enough space.  This loses if buffer's address is bogus, but
 772          that is the user's responsibility.  If ->buffer is NULL this
 773          is a simple allocation.  */
 774       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 775       if (dfa == NULL)
 776         return REG_ESPACE;
 777       preg->allocated = sizeof (re_dfa_t);
 778       preg->buffer = (unsigned char *) dfa;
 779     }
 780   preg->used = sizeof (re_dfa_t);
 781
 782   err = init_dfa (dfa, length);
 783   if (BE (err != REG_NOERROR, 0))
 784     {
 785       free_dfa_content (dfa);
 786       preg->buffer = NULL;
 787       preg->allocated = 0;
 788       return err;
 789     }
 790 #ifdef DEBUG
 791   dfa->re_str = re_malloc (char, length + 1);
 792   strncpy (dfa->re_str, pattern, length + 1);
 793 #endif
 794
 795   err = re_string_construct (&regexp, pattern, length, preg->translate,
 796                              syntax & RE_ICASE, dfa);
 797   if (BE (err != REG_NOERROR, 0))
 798     {
 799     re_compile_internal_free_return:
 800       free_workarea_compile (preg);
 801       re_string_destruct (&regexp);
 802       free_dfa_content (dfa);
 803       preg->buffer = NULL;
 804       preg->allocated = 0;
 805       return err;
 806     }
 807
 808   /* Parse the regular expression, and build a structure tree.  */
 809   preg->re_nsub = 0;
 810   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 811   if (BE (dfa->str_tree == NULL, 0))
 812     goto re_compile_internal_free_return;
 813
 814 #ifdef RE_ENABLE_I18N
 815   /* If possible, do searching in single byte encoding to speed things up.  */
 816   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 817     optimize_utf8 (dfa);
 818 #endif
 819
 820   if (preg->re_nsub > 0)
 821     {
 822       struct subexp_optimize so;
 823
 824       so.dfa = dfa;
 825       so.nodes = dfa->nodes;
 826       so.no_sub = preg->no_sub;
 827       so.re_nsub = preg->re_nsub;
 828       dfa->str_tree = optimize_subexps (&so, dfa->str_tree, -1, 0);
 829     }
 830
 831   /* Analyze the tree and collect information which is necessary to
 832      create the dfa.  */
 833   err = analyze (dfa);
 834   if (BE (err != REG_NOERROR, 0))
 835     goto re_compile_internal_free_return;
 836
 837   /* Then create the initial state of the dfa.  */
 838   err = create_initial_state (dfa);
 839
 840   /* Release work areas.  */
 841   free_workarea_compile (preg);
 842   re_string_destruct (&regexp);
 843
 844   if (BE (err != REG_NOERROR, 0))
 845     {
 846       free_dfa_content (dfa);
 847       preg->buffer = NULL;
 848       preg->allocated = 0;
 849     }
 850
 851   return err;
 852 }
 853
 854 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 855    as the initial length of some arrays.  */
 856
 857 static reg_errcode_t
 858 init_dfa (dfa, pat_len)
 859      re_dfa_t *dfa;
 860      int pat_len;
 861 {
 862   int table_size;
 863 #ifndef _LIBC
 864   char *codeset_name;
 865 #endif
 866
 867   memset (dfa, '\0', sizeof (re_dfa_t));
 868
 869   /* Force allocation of str_tree_storage the first time.  */
 870   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 871
 872   dfa->nodes_alloc = pat_len + 1;
 873   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 874
 875   dfa->states_alloc = pat_len + 1;
 876
 877   /*  table_size = 2 ^ ceil(log pat_len) */
 878   for (table_size = 1; table_size > 0; table_size <<= 1)
 879     if (table_size > pat_len)
 880       break;
 881
 882   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 883   dfa->state_hash_mask = table_size - 1;
 884
 885   dfa->mb_cur_max = MB_CUR_MAX;
 886 #ifdef _LIBC
 887   if (dfa->mb_cur_max == 6
 888       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 889     dfa->is_utf8 = 1;
 890   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 891                        != 0);
 892 #else
 893 # ifdef HAVE_LANGINFO_CODESET
 894   codeset_name = nl_langinfo (CODESET);
 895 # else
 896   codeset_name = getenv ("LC_ALL");
 897   if (codeset_name == NULL || codeset[0] == '\0')
 898     codeset_name = getenv ("LC_CTYPE");
 899   if (codeset_name == NULL || codeset[0] == '\0')
 900     codeset_name = getenv ("LANG");
 901   if (codeset_name == NULL)
 902     codeset_name = "";
 903   else if (strchr (codeset_name, '.') !=  NULL)
 904     codeset_name = strchr (codeset_name, '.') + 1;
 905 # endif
 906
 907   if (strcasecmp (codeset_name, "UTF-8") == 0
 908       || strcasecmp (codeset_name, "UTF8") == 0)
 909     dfa->is_utf8 = 1;
 910
 911   /* We check exhaustively in the loop below if this charset is a
 912      superset of ASCII.  */
 913   dfa->map_notascii = 0;
 914 #endif
 915
 916 #ifdef RE_ENABLE_I18N
 917   if (dfa->mb_cur_max > 1)
 918     {
 919       if (dfa->is_utf8)
 920         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 921       else
 922         {
 923           int i, j, ch;
 924
 925           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 926           if (BE (dfa->sb_char == NULL, 0))
 927             return REG_ESPACE;
 928
 929           /* Clear all bits by, then set those corresponding to single
 930              byte chars.  */
 931           bitset_empty (dfa->sb_char);
 932
 933           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 934             for (j = 0; j < UINT_BITS; ++j, ++ch)
 935               {
 936                 wchar_t wch = __btowc (ch);
 937                 if (wch != WEOF)
 938                   dfa->sb_char[i] |= 1 << j;
 939 # ifndef _LIBC
 940                 if (isascii (ch) && wch != (wchar_t) ch)
 941                   dfa->map_notascii = 1;
 942 # endif
 943               }
 944         }
 945     }
 946 #endif
 947
 948   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 949     return REG_ESPACE;
 950   return REG_NOERROR;
 951 }
 952
 953 /* Initialize WORD_CHAR table, which indicate which character is
 954    "word".  In this case "word" means that it is the word construction
 955    character used by some operators like "\<", "\>", etc.  */
 956
 957 static void
 958 init_word_char (dfa)
 959      re_dfa_t *dfa;
 960 {
 961   int i, j, ch;
 962   dfa->word_ops_used = 1;
 963   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 964     for (j = 0; j < UINT_BITS; ++j, ++ch)
 965       if (isalnum (ch) || ch == '_')
 966         dfa->word_char[i] |= 1 << j;
 967 }
 968
 969 /* Free the work area which are only used while compiling.  */
 970
 971 static void
 972 free_workarea_compile (preg)
 973      regex_t *preg;
 974 {
 975   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 976   bin_tree_storage_t *storage, *next;
 977   for (storage = dfa->str_tree_storage; storage; storage = next)
 978     {
 979       next = storage->next;
 980       re_free (storage);
 981     }
 982   dfa->str_tree_storage = NULL;
 983   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 984   dfa->str_tree = NULL;
 985   re_free (dfa->org_indices);
 986   dfa->org_indices = NULL;
 987 }
 988
 989 /* Create initial states for all contexts.  */
 990
 991 static reg_errcode_t
 992 create_initial_state (dfa)
 993      re_dfa_t *dfa;
 994 {
 995   int first, i;
 996   reg_errcode_t err;
 997   re_node_set init_nodes;
 998
 999   /* Initial states have the epsilon closure of the node which is
1000      the first node of the regular expression.  */
1001   first = dfa->str_tree->first;
1002   dfa->init_node = first;
1003   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1004   if (BE (err != REG_NOERROR, 0))
1005     return err;
1006
1007   /* The back-references which are in initial states can epsilon transit,
1008      since in this case all of the subexpressions can be null.
1009      Then we add epsilon closures of the nodes which are the next nodes of
1010      the back-references.  */
1011   if (dfa->nbackref > 0)
1012     for (i = 0; i < init_nodes.nelem; ++i)
1013       {
1014         int node_idx = init_nodes.elems[i];
1015         re_token_type_t type = dfa->nodes[node_idx].type;
1016
1017         int clexp_idx;
1018         if (type != OP_BACK_REF)
1019           continue;
1020         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1021           {
1022             re_token_t *clexp_node;
1023             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1024             if (clexp_node->type == OP_CLOSE_SUBEXP
1025                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1026               break;
1027           }
1028         if (clexp_idx == init_nodes.nelem)
1029           continue;
1030
1031         if (type == OP_BACK_REF)
1032           {
1033             int dest_idx = dfa->edests[node_idx].elems[0];
1034             if (!re_node_set_contains (&init_nodes, dest_idx))
1035               {
1036                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1037                 i = 0;
1038               }
1039           }
1040       }
1041
1042   /* It must be the first time to invoke acquire_state.  */
1043   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1044   /* We don't check ERR here, since the initial state must not be NULL.  */
1045   if (BE (dfa->init_state == NULL, 0))
1046     return err;
1047   if (dfa->init_state->has_constraint)
1048     {
1049       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1050                                                        CONTEXT_WORD);
1051       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1052                                                      CONTEXT_NEWLINE);
1053       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1054                                                          &init_nodes,
1055                                                          CONTEXT_NEWLINE
1056                                                          | CONTEXT_BEGBUF);
1057       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1058               || dfa->init_state_begbuf == NULL, 0))
1059         return err;
1060     }
1061   else
1062     dfa->init_state_word = dfa->init_state_nl
1063       = dfa->init_state_begbuf = dfa->init_state;
1064
1065   re_node_set_free (&init_nodes);
1066   return REG_NOERROR;
1067 }
1068 \f
1069 #ifdef RE_ENABLE_I18N
1070 /* If it is possible to do searching in single byte encoding instead of UTF-8
1071    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1072    DFA nodes where needed.  */
1073
1074 static void
1075 optimize_utf8 (dfa)
1076      re_dfa_t *dfa;
1077 {
1078   int node, i, mb_chars = 0, has_period = 0;
1079
1080   for (node = 0; node < dfa->nodes_len; ++node)
1081     switch (dfa->nodes[node].type)
1082       {
1083       case CHARACTER:
1084         if (dfa->nodes[node].opr.c >= 0x80)
1085           mb_chars = 1;
1086         break;
1087       case ANCHOR:
1088         switch (dfa->nodes[node].opr.idx)
1089           {
1090           case LINE_FIRST:
1091           case LINE_LAST:
1092           case BUF_FIRST:
1093           case BUF_LAST:
1094             break;
1095           default:
1096             /* Word anchors etc. cannot be handled.  */
1097             return;
1098           }
1099         break;
1100       case OP_PERIOD:
1101         has_period = 1;
1102         break;
1103       case OP_BACK_REF:
1104       case OP_ALT:
1105       case END_OF_RE:
1106       case OP_DUP_ASTERISK:
1107       case OP_DUP_QUESTION:
1108       case OP_OPEN_SUBEXP:
1109       case OP_CLOSE_SUBEXP:
1110         break;
1111       case SIMPLE_BRACKET:
1112         /* Just double check.  */
1113         for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1114           if (dfa->nodes[node].opr.sbcset[i])
1115             return;
1116         break;
1117       default:
1118         return;
1119       }
1120
1121   if (mb_chars || has_period)
1122     for (node = 0; node < dfa->nodes_len; ++node)
1123       {
1124         if (dfa->nodes[node].type == CHARACTER
1125             && dfa->nodes[node].opr.c >= 0x80)
1126           dfa->nodes[node].mb_partial = 0;
1127         else if (dfa->nodes[node].type == OP_PERIOD)
1128           dfa->nodes[node].type = OP_UTF8_PERIOD;
1129       }
1130
1131   /* The search can be in single byte locale.  */
1132   dfa->mb_cur_max = 1;
1133   dfa->is_utf8 = 0;
1134   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1135 }
1136 #endif
1137 \f
1138 static bin_tree_t *
1139 optimize_subexps (so, node, sidx, depth)
1140      struct subexp_optimize *so;
1141      bin_tree_t *node;
1142      int sidx, depth;
1143 {
1144   int idx, new_depth, new_sidx;
1145   bin_tree_t *ret;
1146   if (node == NULL)
1147     return NULL;
1148
1149   new_depth = 0;
1150   new_sidx = sidx;
1151   if ((depth & 1) && node->type == CONCAT
1152       && node->right && node->right->type == 0
1153       && so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP)
1154     {
1155       new_depth = depth + 1;
1156       if (new_depth == 2
1157           || (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map)
1158               && so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx)))
1159         new_sidx = so->nodes[idx].opr.idx;
1160     }
1161   node->left = optimize_subexps (so, node->left, new_sidx, new_depth);
1162   new_depth = (depth & 1) == 0 && node->type == CONCAT
1163               && node->left && node->left->type == 0
1164               && so->nodes[node->left->node_idx].type == OP_OPEN_SUBEXP
1165               ? depth + 1 : 0;
1166   node->right = optimize_subexps (so, node->right, sidx, new_depth);
1167
1168   if (node->type != CONCAT)
1169     return node;
1170   if ((depth & 1) == 0
1171       && node->left
1172       && node->left->type == 0
1173       && so->nodes[idx = node->left->node_idx].type == OP_OPEN_SUBEXP)
1174     ret = node->right;
1175   else if ((depth & 1)
1176            && node->right
1177            && node->right->type == 0
1178            && so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP)
1179     ret = node->left;
1180   else
1181     return node;
1182
1183   if (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map)
1184       && so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx))
1185     return node;
1186
1187   if (!so->no_sub)
1188     {
1189       int i;
1190
1191       if (depth < 2)
1192         return node;
1193
1194       if (so->dfa->subexp_map == NULL)
1195         {
1196           so->dfa->subexp_map = re_malloc (int, so->re_nsub);
1197           if (so->dfa->subexp_map == NULL)
1198             return node;
1199
1200           for (i = 0; i < so->re_nsub; i++)
1201             so->dfa->subexp_map[i] = i;
1202         }
1203
1204       i = so->nodes[idx].opr.idx;
1205       assert (sidx < i);
1206       so->dfa->subexp_map[i] = sidx;
1207     }
1208
1209   so->nodes[idx].type = OP_DELETED_SUBEXP;
1210   ret->parent = node->parent;
1211   return ret;
1212 }
1213 \f
1214 /* Analyze the structure tree, and calculate "first", "next", "edest",
1215    "eclosure", and "inveclosure".  */
1216
1217 static reg_errcode_t
1218 analyze (dfa)
1219      re_dfa_t *dfa;
1220 {
1221   int i;
1222   reg_errcode_t ret;
1223
1224   /* Allocate arrays.  */
1225   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1226   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1227   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1228   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1229   dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1230   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1231           || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
1232     return REG_ESPACE;
1233   /* Initialize them.  */
1234   for (i = 0; i < dfa->nodes_len; ++i)
1235     {
1236       dfa->nexts[i] = -1;
1237       re_node_set_init_empty (dfa->edests + i);
1238       re_node_set_init_empty (dfa->eclosures + i);
1239       re_node_set_init_empty (dfa->inveclosures + i);
1240     }
1241
1242   ret = analyze_tree (dfa, dfa->str_tree);
1243   if (BE (ret == REG_NOERROR, 1))
1244     {
1245       ret = calc_eclosure (dfa);
1246       if (ret == REG_NOERROR)
1247         calc_inveclosure (dfa);
1248     }
1249   return ret;
1250 }
1251
1252 /* Helper functions for analyze.
1253    This function calculate "first", "next", and "edest" for the subtree
1254    whose root is NODE.  */
1255
1256 static reg_errcode_t
1257 analyze_tree (dfa, node)
1258      re_dfa_t *dfa;
1259      bin_tree_t *node;
1260 {
1261   reg_errcode_t ret;
1262   if (node->first == -1)
1263     calc_first (dfa, node);
1264   if (node->next == -1)
1265     calc_next (dfa, node);
1266   calc_epsdest (dfa, node);
1267
1268   /* Calculate "first" etc. for the left child.  */
1269   if (node->left != NULL)
1270     {
1271       ret = analyze_tree (dfa, node->left);
1272       if (BE (ret != REG_NOERROR, 0))
1273         return ret;
1274     }
1275   /* Calculate "first" etc. for the right child.  */
1276   if (node->right != NULL)
1277     {
1278       ret = analyze_tree (dfa, node->right);
1279       if (BE (ret != REG_NOERROR, 0))
1280         return ret;
1281     }
1282   return REG_NOERROR;
1283 }
1284
1285 /* Calculate "first" for the node NODE.  */
1286 static void
1287 calc_first (dfa, node)
1288      re_dfa_t *dfa;
1289      bin_tree_t *node;
1290 {
1291   int idx, type;
1292   idx = node->node_idx;
1293   type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1294
1295   switch (type)
1296     {
1297 #ifdef DEBUG
1298     case OP_OPEN_BRACKET:
1299     case OP_CLOSE_BRACKET:
1300     case OP_OPEN_DUP_NUM:
1301     case OP_CLOSE_DUP_NUM:
1302     case OP_DUP_PLUS:
1303     case OP_NON_MATCH_LIST:
1304     case OP_OPEN_COLL_ELEM:
1305     case OP_CLOSE_COLL_ELEM:
1306     case OP_OPEN_EQUIV_CLASS:
1307     case OP_CLOSE_EQUIV_CLASS:
1308     case OP_OPEN_CHAR_CLASS:
1309     case OP_CLOSE_CHAR_CLASS:
1310       /* These must not appear here.  */
1311       assert (0);
1312 #endif
1313     case END_OF_RE:
1314     case CHARACTER:
1315     case OP_PERIOD:
1316     case OP_DUP_ASTERISK:
1317     case OP_DUP_QUESTION:
1318 #ifdef RE_ENABLE_I18N
1319     case OP_UTF8_PERIOD:
1320     case COMPLEX_BRACKET:
1321 #endif /* RE_ENABLE_I18N */
1322     case SIMPLE_BRACKET:
1323     case OP_BACK_REF:
1324     case ANCHOR:
1325     case OP_OPEN_SUBEXP:
1326     case OP_CLOSE_SUBEXP:
1327       node->first = idx;
1328       break;
1329     case OP_ALT:
1330       node->first = idx;
1331       break;
1332       /* else fall through */
1333     default:
1334 #ifdef DEBUG
1335       assert (node->left != NULL);
1336 #endif
1337       if (node->left->first == -1)
1338         calc_first (dfa, node->left);
1339       node->first = node->left->first;
1340       break;
1341     }
1342 }
1343
1344 /* Calculate "next" for the node NODE.  */
1345
1346 static void
1347 calc_next (dfa, node)
1348      re_dfa_t *dfa;
1349      bin_tree_t *node;
1350 {
1351   int idx, type;
1352   bin_tree_t *parent = node->parent;
1353   if (parent == NULL)
1354     {
1355       node->next = -1;
1356       idx = node->node_idx;
1357       if (node->type == 0)
1358         dfa->nexts[idx] = node->next;
1359       return;
1360     }
1361
1362   idx = parent->node_idx;
1363   type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1364
1365   switch (type)
1366     {
1367     case OP_DUP_ASTERISK:
1368       node->next = idx;
1369       break;
1370     case CONCAT:
1371       if (parent->left == node)
1372         {
1373           if (parent->right->first == -1)
1374             calc_first (dfa, parent->right);
1375           node->next = parent->right->first;
1376           break;
1377         }
1378       /* else fall through */
1379     default:
1380       if (parent->next == -1)
1381         calc_next (dfa, parent);
1382       node->next = parent->next;
1383       break;
1384     }
1385   idx = node->node_idx;
1386   if (node->type == 0)
1387     dfa->nexts[idx] = node->next;
1388 }
1389
1390 /* Calculate "edest" for the node NODE.  */
1391
1392 static void
1393 calc_epsdest (dfa, node)
1394      re_dfa_t *dfa;
1395      bin_tree_t *node;
1396 {
1397   int idx;
1398   idx = node->node_idx;
1399   if (node->type == 0)
1400     {
1401       if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1402           || dfa->nodes[idx].type == OP_DUP_QUESTION)
1403         {
1404           if (node->left->first == -1)
1405             calc_first (dfa, node->left);
1406           if (node->next == -1)
1407             calc_next (dfa, node);
1408           re_node_set_init_2 (dfa->edests + idx, node->left->first,
1409                               node->next);
1410         }
1411       else if (dfa->nodes[idx].type == OP_ALT)
1412         {
1413           int left, right;
1414           if (node->left != NULL)
1415             {
1416               if (node->left->first == -1)
1417                 calc_first (dfa, node->left);
1418               left = node->left->first;
1419             }
1420           else
1421             {
1422               if (node->next == -1)
1423                 calc_next (dfa, node);
1424               left = node->next;
1425             }
1426           if (node->right != NULL)
1427             {
1428               if (node->right->first == -1)
1429                 calc_first (dfa, node->right);
1430               right = node->right->first;
1431             }
1432           else
1433             {
1434               if (node->next == -1)
1435                 calc_next (dfa, node);
1436               right = node->next;
1437             }
1438           re_node_set_init_2 (dfa->edests + idx, left, right);
1439         }
1440       else if (dfa->nodes[idx].type == ANCHOR
1441                || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1442                || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1443                || dfa->nodes[idx].type == OP_BACK_REF)
1444         re_node_set_init_1 (dfa->edests + idx, node->next);
1445       else
1446         assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
1447     }
1448 }
1449
1450 /* Duplicate the epsilon closure of the node ROOT_NODE.
1451    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1452    to their own constraint.  */
1453
1454 static reg_errcode_t
1455 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1456                         init_constraint)
1457      re_dfa_t *dfa;
1458      int top_org_node, top_clone_node, root_node;
1459      unsigned int init_constraint;
1460 {
1461   reg_errcode_t err;
1462   int org_node, clone_node, ret;
1463   unsigned int constraint = init_constraint;
1464   for (org_node = top_org_node, clone_node = top_clone_node;;)
1465     {
1466       int org_dest, clone_dest;
1467       if (dfa->nodes[org_node].type == OP_BACK_REF)
1468         {
1469           /* If the back reference epsilon-transit, its destination must
1470              also have the constraint.  Then duplicate the epsilon closure
1471              of the destination of the back reference, and store it in
1472              edests of the back reference.  */
1473           org_dest = dfa->nexts[org_node];
1474           re_node_set_empty (dfa->edests + clone_node);
1475           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1476           if (BE (err != REG_NOERROR, 0))
1477             return err;
1478           dfa->nexts[clone_node] = dfa->nexts[org_node];
1479           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1480           if (BE (ret < 0, 0))
1481             return REG_ESPACE;
1482         }
1483       else if (dfa->edests[org_node].nelem == 0)
1484         {
1485           /* In case of the node can't epsilon-transit, don't duplicate the
1486              destination and store the original destination as the
1487              destination of the node.  */
1488           dfa->nexts[clone_node] = dfa->nexts[org_node];
1489           break;
1490         }
1491       else if (dfa->edests[org_node].nelem == 1)
1492         {
1493           /* In case of the node can epsilon-transit, and it has only one
1494              destination.  */
1495           org_dest = dfa->edests[org_node].elems[0];
1496           re_node_set_empty (dfa->edests + clone_node);
1497           if (dfa->nodes[org_node].type == ANCHOR)
1498             {
1499               /* In case of the node has another constraint, append it.  */
1500               if (org_node == root_node && clone_node != org_node)
1501                 {
1502                   /* ...but if the node is root_node itself, it means the
1503                      epsilon closure have a loop, then tie it to the
1504                      destination of the root_node.  */
1505                   ret = re_node_set_insert (dfa->edests + clone_node,
1506                                             org_dest);
1507                   if (BE (ret < 0, 0))
1508                     return REG_ESPACE;
1509                   break;
1510                 }
1511               constraint |= dfa->nodes[org_node].opr.ctx_type;
1512             }
1513           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1514           if (BE (err != REG_NOERROR, 0))
1515             return err;
1516           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1517           if (BE (ret < 0, 0))
1518             return REG_ESPACE;
1519         }
1520       else /* dfa->edests[org_node].nelem == 2 */
1521         {
1522           /* In case of the node can epsilon-transit, and it has two
1523              destinations. E.g. '|', '*', '+', '?'.   */
1524           org_dest = dfa->edests[org_node].elems[0];
1525           re_node_set_empty (dfa->edests + clone_node);
1526           /* Search for a duplicated node which satisfies the constraint.  */
1527           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1528           if (clone_dest == -1)
1529             {
1530               /* There are no such a duplicated node, create a new one.  */
1531               err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1532               if (BE (err != REG_NOERROR, 0))
1533                 return err;
1534               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1535               if (BE (ret < 0, 0))
1536                 return REG_ESPACE;
1537               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1538                                             root_node, constraint);
1539               if (BE (err != REG_NOERROR, 0))
1540                 return err;
1541             }
1542           else
1543             {
1544               /* There are a duplicated node which satisfy the constraint,
1545                  use it to avoid infinite loop.  */
1546               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1547               if (BE (ret < 0, 0))
1548                 return REG_ESPACE;
1549             }
1550
1551           org_dest = dfa->edests[org_node].elems[1];
1552           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1553           if (BE (err != REG_NOERROR, 0))
1554             return err;
1555           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1556           if (BE (ret < 0, 0))
1557             return REG_ESPACE;
1558         }
1559       org_node = org_dest;
1560       clone_node = clone_dest;
1561     }
1562   return REG_NOERROR;
1563 }
1564
1565 /* Search for a node which is duplicated from the node ORG_NODE, and
1566    satisfies the constraint CONSTRAINT.  */
1567
1568 static int
1569 search_duplicated_node (dfa, org_node, constraint)
1570      re_dfa_t *dfa;
1571      int org_node;
1572      unsigned int constraint;
1573 {
1574   int idx;
1575   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1576     {
1577       if (org_node == dfa->org_indices[idx]
1578           && constraint == dfa->nodes[idx].constraint)
1579         return idx; /* Found.  */
1580     }
1581   return -1; /* Not found.  */
1582 }
1583
1584 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1585    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1586    otherwise return the error code.  */
1587
1588 static reg_errcode_t
1589 duplicate_node (new_idx, dfa, org_idx, constraint)
1590      re_dfa_t *dfa;
1591      int *new_idx, org_idx;
1592      unsigned int constraint;
1593 {
1594   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx], 1);
1595   if (BE (dup_idx == -1, 0))
1596     return REG_ESPACE;
1597   dfa->nodes[dup_idx].constraint = constraint;
1598   if (dfa->nodes[org_idx].type == ANCHOR)
1599     dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1600   dfa->nodes[dup_idx].duplicated = 1;
1601   re_node_set_init_empty (dfa->edests + dup_idx);
1602   re_node_set_init_empty (dfa->eclosures + dup_idx);
1603   re_node_set_init_empty (dfa->inveclosures + dup_idx);
1604
1605   /* Store the index of the original node.  */
1606   dfa->org_indices[dup_idx] = org_idx;
1607   *new_idx = dup_idx;
1608   return REG_NOERROR;
1609 }
1610
1611 static void
1612 calc_inveclosure (dfa)
1613      re_dfa_t *dfa;
1614 {
1615   int src, idx, dest;
1616   for (src = 0; src < dfa->nodes_len; ++src)
1617     {
1618       if (dfa->nodes[src].type == OP_DELETED_SUBEXP)
1619         continue;
1620       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1621         {
1622           dest = dfa->eclosures[src].elems[idx];
1623           re_node_set_insert_last (dfa->inveclosures + dest, src);
1624         }
1625     }
1626 }
1627
1628 /* Calculate "eclosure" for all the node in DFA.  */
1629
1630 static reg_errcode_t
1631 calc_eclosure (dfa)
1632      re_dfa_t *dfa;
1633 {
1634   int node_idx, incomplete;
1635 #ifdef DEBUG
1636   assert (dfa->nodes_len > 0);
1637 #endif
1638   incomplete = 0;
1639   /* For each nodes, calculate epsilon closure.  */
1640   for (node_idx = 0; ; ++node_idx)
1641     {
1642       reg_errcode_t err;
1643       re_node_set eclosure_elem;
1644       if (node_idx == dfa->nodes_len)
1645         {
1646           if (!incomplete)
1647             break;
1648           incomplete = 0;
1649           node_idx = 0;
1650         }
1651
1652 #ifdef DEBUG
1653       assert (dfa->eclosures[node_idx].nelem != -1);
1654 #endif
1655       if (dfa->nodes[node_idx].type == OP_DELETED_SUBEXP)
1656         continue;
1657
1658       /* If we have already calculated, skip it.  */
1659       if (dfa->eclosures[node_idx].nelem != 0)
1660         continue;
1661       /* Calculate epsilon closure of `node_idx'.  */
1662       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1663       if (BE (err != REG_NOERROR, 0))
1664         return err;
1665
1666       if (dfa->eclosures[node_idx].nelem == 0)
1667         {
1668           incomplete = 1;
1669           re_node_set_free (&eclosure_elem);
1670         }
1671     }
1672   return REG_NOERROR;
1673 }
1674
1675 /* Calculate epsilon closure of NODE.  */
1676
1677 static reg_errcode_t
1678 calc_eclosure_iter (new_set, dfa, node, root)
1679      re_node_set *new_set;
1680      re_dfa_t *dfa;
1681      int node, root;
1682 {
1683   reg_errcode_t err;
1684   unsigned int constraint;
1685   int i, incomplete;
1686   re_node_set eclosure;
1687   incomplete = 0;
1688   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1689   if (BE (err != REG_NOERROR, 0))
1690     return err;
1691
1692   /* This indicates that we are calculating this node now.
1693      We reference this value to avoid infinite loop.  */
1694   dfa->eclosures[node].nelem = -1;
1695
1696   constraint = ((dfa->nodes[node].type == ANCHOR)
1697                 ? dfa->nodes[node].opr.ctx_type : 0);
1698   /* If the current node has constraints, duplicate all nodes.
1699      Since they must inherit the constraints.  */
1700   if (constraint
1701       && dfa->edests[node].nelem
1702       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1703     {
1704       int org_node, cur_node;
1705       org_node = cur_node = node;
1706       err = duplicate_node_closure (dfa, node, node, node, constraint);
1707       if (BE (err != REG_NOERROR, 0))
1708         return err;
1709     }
1710
1711   /* Expand each epsilon destination nodes.  */
1712   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1713     for (i = 0; i < dfa->edests[node].nelem; ++i)
1714       {
1715         re_node_set eclosure_elem;
1716         int edest = dfa->edests[node].elems[i];
1717         /* If calculating the epsilon closure of `edest' is in progress,
1718            return intermediate result.  */
1719         if (dfa->eclosures[edest].nelem == -1)
1720           {
1721             incomplete = 1;
1722             continue;
1723           }
1724         /* If we haven't calculated the epsilon closure of `edest' yet,
1725            calculate now. Otherwise use calculated epsilon closure.  */
1726         if (dfa->eclosures[edest].nelem == 0)
1727           {
1728             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1729             if (BE (err != REG_NOERROR, 0))
1730               return err;
1731           }
1732         else
1733           eclosure_elem = dfa->eclosures[edest];
1734         /* Merge the epsilon closure of `edest'.  */
1735         re_node_set_merge (&eclosure, &eclosure_elem);
1736         /* If the epsilon closure of `edest' is incomplete,
1737            the epsilon closure of this node is also incomplete.  */
1738         if (dfa->eclosures[edest].nelem == 0)
1739           {
1740             incomplete = 1;
1741             re_node_set_free (&eclosure_elem);
1742           }
1743       }
1744
1745   /* Epsilon closures include itself.  */
1746   re_node_set_insert (&eclosure, node);
1747   if (incomplete && !root)
1748     dfa->eclosures[node].nelem = 0;
1749   else
1750     dfa->eclosures[node] = eclosure;
1751   *new_set = eclosure;
1752   return REG_NOERROR;
1753 }
1754 \f
1755 /* Functions for token which are used in the parser.  */
1756
1757 /* Fetch a token from INPUT.
1758    We must not use this function inside bracket expressions.  */
1759
1760 static void
1761 fetch_token (result, input, syntax)
1762      re_token_t *result;
1763      re_string_t *input;
1764      reg_syntax_t syntax;
1765 {
1766   re_string_skip_bytes (input, peek_token (result, input, syntax));
1767 }
1768
1769 /* Peek a token from INPUT, and return the length of the token.
1770    We must not use this function inside bracket expressions.  */
1771
1772 static int
1773 peek_token (token, input, syntax)
1774      re_token_t *token;
1775      re_string_t *input;
1776      reg_syntax_t syntax;
1777 {
1778   unsigned char c;
1779
1780   if (re_string_eoi (input))
1781     {
1782       token->type = END_OF_RE;
1783       return 0;
1784     }
1785
1786   c = re_string_peek_byte (input, 0);
1787   token->opr.c = c;
1788
1789   token->word_char = 0;
1790 #ifdef RE_ENABLE_I18N
1791   token->mb_partial = 0;
1792   if (input->mb_cur_max > 1 &&
1793       !re_string_first_byte (input, re_string_cur_idx (input)))
1794     {
1795       token->type = CHARACTER;
1796       token->mb_partial = 1;
1797       return 1;
1798     }
1799 #endif
1800   if (c == '\\')
1801     {
1802       unsigned char c2;
1803       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1804         {
1805           token->type = BACK_SLASH;
1806           return 1;
1807         }
1808
1809       c2 = re_string_peek_byte_case (input, 1);
1810       token->opr.c = c2;
1811       token->type = CHARACTER;
1812 #ifdef RE_ENABLE_I18N
1813       if (input->mb_cur_max > 1)
1814         {
1815           wint_t wc = re_string_wchar_at (input,
1816                                           re_string_cur_idx (input) + 1);
1817           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1818         }
1819       else
1820 #endif
1821         token->word_char = IS_WORD_CHAR (c2) != 0;
1822
1823       switch (c2)
1824         {
1825         case '|':
1826           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1827             token->type = OP_ALT;
1828           break;
1829         case '1': case '2': case '3': case '4': case '5':
1830         case '6': case '7': case '8': case '9':
1831           if (!(syntax & RE_NO_BK_REFS))
1832             {
1833               token->type = OP_BACK_REF;
1834               token->opr.idx = c2 - '1';
1835             }
1836           break;
1837         case '<':
1838           if (!(syntax & RE_NO_GNU_OPS))
1839             {
1840               token->type = ANCHOR;
1841               token->opr.ctx_type = WORD_FIRST;
1842             }
1843           break;
1844         case '>':
1845           if (!(syntax & RE_NO_GNU_OPS))
1846             {
1847               token->type = ANCHOR;
1848               token->opr.ctx_type = WORD_LAST;
1849             }
1850           break;
1851         case 'b':
1852           if (!(syntax & RE_NO_GNU_OPS))
1853             {
1854               token->type = ANCHOR;
1855               token->opr.ctx_type = WORD_DELIM;
1856             }
1857           break;
1858         case 'B':
1859           if (!(syntax & RE_NO_GNU_OPS))
1860             {
1861               token->type = ANCHOR;
1862               token->opr.ctx_type = INSIDE_WORD;
1863             }
1864           break;
1865         case 'w':
1866           if (!(syntax & RE_NO_GNU_OPS))
1867             token->type = OP_WORD;
1868           break;
1869         case 'W':
1870           if (!(syntax & RE_NO_GNU_OPS))
1871             token->type = OP_NOTWORD;
1872           break;
1873         case 's':
1874           if (!(syntax & RE_NO_GNU_OPS))
1875             token->type = OP_SPACE;
1876           break;
1877         case 'S':
1878           if (!(syntax & RE_NO_GNU_OPS))
1879             token->type = OP_NOTSPACE;
1880           break;
1881         case '`':
1882           if (!(syntax & RE_NO_GNU_OPS))
1883             {
1884               token->type = ANCHOR;
1885               token->opr.ctx_type = BUF_FIRST;
1886             }
1887           break;
1888         case '\'':
1889           if (!(syntax & RE_NO_GNU_OPS))
1890             {
1891               token->type = ANCHOR;
1892               token->opr.ctx_type = BUF_LAST;
1893             }
1894           break;
1895         case '(':
1896           if (!(syntax & RE_NO_BK_PARENS))
1897             token->type = OP_OPEN_SUBEXP;
1898           break;
1899         case ')':
1900           if (!(syntax & RE_NO_BK_PARENS))
1901             token->type = OP_CLOSE_SUBEXP;
1902           break;
1903         case '+':
1904           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1905             token->type = OP_DUP_PLUS;
1906           break;
1907         case '?':
1908           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1909             token->type = OP_DUP_QUESTION;
1910           break;
1911         case '{':
1912           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1913             token->type = OP_OPEN_DUP_NUM;
1914           break;
1915         case '}':
1916           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1917             token->type = OP_CLOSE_DUP_NUM;
1918           break;
1919         default:
1920           break;
1921         }
1922       return 2;
1923     }
1924
1925   token->type = CHARACTER;
1926 #ifdef RE_ENABLE_I18N
1927   if (input->mb_cur_max > 1)
1928     {
1929       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1930       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1931     }
1932   else
1933 #endif
1934     token->word_char = IS_WORD_CHAR (token->opr.c);
1935
1936   switch (c)
1937     {
1938     case '\n':
1939       if (syntax & RE_NEWLINE_ALT)
1940         token->type = OP_ALT;
1941       break;
1942     case '|':
1943       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1944         token->type = OP_ALT;
1945       break;
1946     case '*':
1947       token->type = OP_DUP_ASTERISK;
1948       break;
1949     case '+':
1950       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1951         token->type = OP_DUP_PLUS;
1952       break;
1953     case '?':
1954       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1955         token->type = OP_DUP_QUESTION;
1956       break;
1957     case '{':
1958       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1959         token->type = OP_OPEN_DUP_NUM;
1960       break;
1961     case '}':
1962       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1963         token->type = OP_CLOSE_DUP_NUM;
1964       break;
1965     case '(':
1966       if (syntax & RE_NO_BK_PARENS)
1967         token->type = OP_OPEN_SUBEXP;
1968       break;
1969     case ')':
1970       if (syntax & RE_NO_BK_PARENS)
1971         token->type = OP_CLOSE_SUBEXP;
1972       break;
1973     case '[':
1974       token->type = OP_OPEN_BRACKET;
1975       break;
1976     case '.':
1977       token->type = OP_PERIOD;
1978       break;
1979     case '^':
1980       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1981           re_string_cur_idx (input) != 0)
1982         {
1983           char prev = re_string_peek_byte (input, -1);
1984           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1985             break;
1986         }
1987       token->type = ANCHOR;
1988       token->opr.ctx_type = LINE_FIRST;
1989       break;
1990     case '$':
1991       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1992           re_string_cur_idx (input) + 1 != re_string_length (input))
1993         {
1994           re_token_t next;
1995           re_string_skip_bytes (input, 1);
1996           peek_token (&next, input, syntax);
1997           re_string_skip_bytes (input, -1);
1998           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1999             break;
2000         }
2001       token->type = ANCHOR;
2002       token->opr.ctx_type = LINE_LAST;
2003       break;
2004     default:
2005       break;
2006     }
2007   return 1;
2008 }
2009
2010 /* Peek a token from INPUT, and return the length of the token.
2011    We must not use this function out of bracket expressions.  */
2012
2013 static int
2014 peek_token_bracket (token, input, syntax)
2015      re_token_t *token;
2016      re_string_t *input;
2017      reg_syntax_t syntax;
2018 {
2019   unsigned char c;
2020   if (re_string_eoi (input))
2021     {
2022       token->type = END_OF_RE;
2023       return 0;
2024     }
2025   c = re_string_peek_byte (input, 0);
2026   token->opr.c = c;
2027
2028 #ifdef RE_ENABLE_I18N
2029   if (input->mb_cur_max > 1 &&
2030       !re_string_first_byte (input, re_string_cur_idx (input)))
2031     {
2032       token->type = CHARACTER;
2033       return 1;
2034     }
2035 #endif /* RE_ENABLE_I18N */
2036
2037   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2038       && re_string_cur_idx (input) + 1 < re_string_length (input))
2039     {
2040       /* In this case, '\' escape a character.  */
2041       unsigned char c2;
2042       re_string_skip_bytes (input, 1);
2043       c2 = re_string_peek_byte (input, 0);
2044       token->opr.c = c2;
2045       token->type = CHARACTER;
2046       return 1;
2047     }
2048   if (c == '[') /* '[' is a special char in a bracket exps.  */
2049     {
2050       unsigned char c2;
2051       int token_len;
2052       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2053         c2 = re_string_peek_byte (input, 1);
2054       else
2055         c2 = 0;
2056       token->opr.c = c2;
2057       token_len = 2;
2058       switch (c2)
2059         {
2060         case '.':
2061           token->type = OP_OPEN_COLL_ELEM;
2062           break;
2063         case '=':
2064           token->type = OP_OPEN_EQUIV_CLASS;
2065           break;
2066         case ':':
2067           if (syntax & RE_CHAR_CLASSES)
2068             {
2069               token->type = OP_OPEN_CHAR_CLASS;
2070               break;
2071             }
2072           /* else fall through.  */
2073         default:
2074           token->type = CHARACTER;
2075           token->opr.c = c;
2076           token_len = 1;
2077           break;
2078         }
2079       return token_len;
2080     }
2081   switch (c)
2082     {
2083     case '-':
2084       token->type = OP_CHARSET_RANGE;
2085       break;
2086     case ']':
2087       token->type = OP_CLOSE_BRACKET;
2088       break;
2089     case '^':
2090       token->type = OP_NON_MATCH_LIST;
2091       break;
2092     default:
2093       token->type = CHARACTER;
2094     }
2095   return 1;
2096 }
2097 \f
2098 /* Functions for parser.  */
2099
2100 /* Entry point of the parser.
2101    Parse the regular expression REGEXP and return the structure tree.
2102    If an error is occured, ERR is set by error code, and return NULL.
2103    This function build the following tree, from regular expression <reg_exp>:
2104            CAT
2105            / \
2106           /   \
2107    <reg_exp>  EOR
2108
2109    CAT means concatenation.
2110    EOR means end of regular expression.  */
2111
2112 static bin_tree_t *
2113 parse (regexp, preg, syntax, err)
2114      re_string_t *regexp;
2115      regex_t *preg;
2116      reg_syntax_t syntax;
2117      reg_errcode_t *err;
2118 {
2119   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2120   bin_tree_t *tree, *eor, *root;
2121   re_token_t current_token;
2122   dfa->syntax = syntax;
2123   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2124   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2125   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2126     return NULL;
2127   eor = re_dfa_add_tree_node (dfa, NULL, NULL, &current_token);
2128   if (tree != NULL)
2129     root = create_tree (dfa, tree, eor, CONCAT, 0);
2130   else
2131     root = eor;
2132   if (BE (eor == NULL || root == NULL, 0))
2133     {
2134       *err = REG_ESPACE;
2135       return NULL;
2136     }
2137   return root;
2138 }
2139
2140 /* This function build the following tree, from regular expression
2141    <branch1>|<branch2>:
2142            ALT
2143            / \
2144           /   \
2145    <branch1> <branch2>
2146
2147    ALT means alternative, which represents the operator `|'.  */
2148
2149 static bin_tree_t *
2150 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2151      re_string_t *regexp;
2152      regex_t *preg;
2153      re_token_t *token;
2154      reg_syntax_t syntax;
2155      int nest;
2156      reg_errcode_t *err;
2157 {
2158   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2159   bin_tree_t *tree, *branch = NULL;
2160   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2161   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2162     return NULL;
2163
2164   while (token->type == OP_ALT)
2165     {
2166       re_token_t alt_token = *token;
2167       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2168       if (token->type != OP_ALT && token->type != END_OF_RE
2169           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2170         {
2171           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2172           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2173             return NULL;
2174         }
2175       else
2176         branch = NULL;
2177       tree = re_dfa_add_tree_node (dfa, tree, branch, &alt_token);
2178       if (BE (tree == NULL, 0))
2179         {
2180           *err = REG_ESPACE;
2181           return NULL;
2182         }
2183       dfa->has_plural_match = 1;
2184     }
2185   return tree;
2186 }
2187
2188 /* This function build the following tree, from regular expression
2189    <exp1><exp2>:
2190         CAT
2191         / \
2192        /   \
2193    <exp1> <exp2>
2194
2195    CAT means concatenation.  */
2196
2197 static bin_tree_t *
2198 parse_branch (regexp, preg, token, syntax, nest, err)
2199      re_string_t *regexp;
2200      regex_t *preg;
2201      re_token_t *token;
2202      reg_syntax_t syntax;
2203      int nest;
2204      reg_errcode_t *err;
2205 {
2206   bin_tree_t *tree, *exp;
2207   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2208   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2209   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2210     return NULL;
2211
2212   while (token->type != OP_ALT && token->type != END_OF_RE
2213          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2214     {
2215       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2216       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2217         {
2218           return NULL;
2219         }
2220       if (tree != NULL && exp != NULL)
2221         {
2222           tree = create_tree (dfa, tree, exp, CONCAT, 0);
2223           if (tree == NULL)
2224             {
2225               *err = REG_ESPACE;
2226               return NULL;
2227             }
2228         }
2229       else if (tree == NULL)
2230         tree = exp;
2231       /* Otherwise exp == NULL, we don't need to create new tree.  */
2232     }
2233   return tree;
2234 }
2235
2236 /* This function build the following tree, from regular expression a*:
2237          *
2238          |
2239          a
2240 */
2241
2242 static bin_tree_t *
2243 parse_expression (regexp, preg, token, syntax, nest, err)
2244      re_string_t *regexp;
2245      regex_t *preg;
2246      re_token_t *token;
2247      reg_syntax_t syntax;
2248      int nest;
2249      reg_errcode_t *err;
2250 {
2251   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2252   bin_tree_t *tree;
2253   switch (token->type)
2254     {
2255     case CHARACTER:
2256       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2257       if (BE (tree == NULL, 0))
2258         {
2259           *err = REG_ESPACE;
2260           return NULL;
2261         }
2262 #ifdef RE_ENABLE_I18N
2263       if (dfa->mb_cur_max > 1)
2264         {
2265           while (!re_string_eoi (regexp)
2266                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2267             {
2268               bin_tree_t *mbc_remain;
2269               fetch_token (token, regexp, syntax);
2270               mbc_remain = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2271               tree = create_tree (dfa, tree, mbc_remain, CONCAT, 0);
2272               if (BE (mbc_remain == NULL || tree == NULL, 0))
2273                 {
2274                   *err = REG_ESPACE;
2275                   return NULL;
2276                 }
2277             }
2278         }
2279 #endif
2280       break;
2281     case OP_OPEN_SUBEXP:
2282       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2283       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2284         return NULL;
2285       break;
2286     case OP_OPEN_BRACKET:
2287       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2288       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2289         return NULL;
2290       break;
2291     case OP_BACK_REF:
2292       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2293         {
2294           *err = REG_ESUBREG;
2295           return NULL;
2296         }
2297       dfa->used_bkref_map |= 1 << token->opr.idx;
2298       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2299       if (BE (tree == NULL, 0))
2300         {
2301           *err = REG_ESPACE;
2302           return NULL;
2303         }
2304       ++dfa->nbackref;
2305       dfa->has_mb_node = 1;
2306       break;
2307     case OP_OPEN_DUP_NUM:
2308       if (syntax & RE_CONTEXT_INVALID_DUP)
2309         {
2310           *err = REG_BADRPT;
2311           return NULL;
2312         }
2313       /* FALLTHROUGH */
2314     case OP_DUP_ASTERISK:
2315     case OP_DUP_PLUS:
2316     case OP_DUP_QUESTION:
2317       if (syntax & RE_CONTEXT_INVALID_OPS)
2318         {
2319           *err = REG_BADRPT;
2320           return NULL;
2321         }
2322       else if (syntax & RE_CONTEXT_INDEP_OPS)
2323         {
2324           fetch_token (token, regexp, syntax);
2325           return parse_expression (regexp, preg, token, syntax, nest, err);
2326         }
2327       /* else fall through  */
2328     case OP_CLOSE_SUBEXP:
2329       if ((token->type == OP_CLOSE_SUBEXP) &&
2330           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2331         {
2332           *err = REG_ERPAREN;
2333           return NULL;
2334         }
2335       /* else fall through  */
2336     case OP_CLOSE_DUP_NUM:
2337       /* We treat it as a normal character.  */
2338
2339       /* Then we can these characters as normal characters.  */
2340       token->type = CHARACTER;
2341       /* mb_partial and word_char bits should be initialized already
2342          by peek_token.  */
2343       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2344       if (BE (tree == NULL, 0))
2345         {
2346           *err = REG_ESPACE;
2347           return NULL;
2348         }
2349       break;
2350     case ANCHOR:
2351       if ((token->opr.ctx_type
2352            & (WORD_DELIM | INSIDE_WORD | WORD_FIRST | WORD_LAST))
2353           && dfa->word_ops_used == 0)
2354         init_word_char (dfa);
2355       if (token->opr.ctx_type == WORD_DELIM)
2356         {
2357           bin_tree_t *tree_first, *tree_last;
2358           token->opr.ctx_type = WORD_FIRST;
2359           tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2360           token->opr.ctx_type = WORD_LAST;
2361           tree_last = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2362           token->type = OP_ALT;
2363           tree = re_dfa_add_tree_node (dfa, tree_first, tree_last, token);
2364           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2365             {
2366               *err = REG_ESPACE;
2367               return NULL;
2368             }
2369         }
2370       else
2371         {
2372           tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2373           if (BE (tree == NULL, 0))
2374             {
2375               *err = REG_ESPACE;
2376               return NULL;
2377             }
2378         }
2379       /* We must return here, since ANCHORs can't be followed
2380          by repetition operators.
2381          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2382              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2383       fetch_token (token, regexp, syntax);
2384       return tree;
2385     case OP_PERIOD:
2386       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2387       if (BE (tree == NULL, 0))
2388         {
2389           *err = REG_ESPACE;
2390           return NULL;
2391         }
2392       if (dfa->mb_cur_max > 1)
2393         dfa->has_mb_node = 1;
2394       break;
2395     case OP_WORD:
2396     case OP_NOTWORD:
2397       tree = build_charclass_op (dfa, regexp->trans,
2398                                  (const unsigned char *) "alnum",
2399                                  (const unsigned char *) "_",
2400                                  token->type == OP_NOTWORD, err);
2401       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2402         return NULL;
2403       break;
2404     case OP_SPACE:
2405     case OP_NOTSPACE:
2406       tree = build_charclass_op (dfa, regexp->trans,
2407                                  (const unsigned char *) "space",
2408                                  (const unsigned char *) "",
2409                                  token->type == OP_NOTSPACE, err);
2410       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2411         return NULL;
2412       break;
2413     case OP_ALT:
2414     case END_OF_RE:
2415       return NULL;
2416     case BACK_SLASH:
2417       *err = REG_EESCAPE;
2418       return NULL;
2419     default:
2420       /* Must not happen?  */
2421 #ifdef DEBUG
2422       assert (0);
2423 #endif
2424       return NULL;
2425     }
2426   fetch_token (token, regexp, syntax);
2427
2428   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2429          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2430     {
2431       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2432       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2433         return NULL;
2434       /* In BRE consecutive duplications are not allowed.  */
2435       if ((syntax & RE_CONTEXT_INVALID_DUP)
2436           && (token->type == OP_DUP_ASTERISK
2437               || token->type == OP_OPEN_DUP_NUM))
2438         {
2439           *err = REG_BADRPT;
2440           return NULL;
2441         }
2442       dfa->has_plural_match = 1;
2443     }
2444
2445   return tree;
2446 }
2447
2448 /* This function build the following tree, from regular expression
2449    (<reg_exp>):
2450          SUBEXP
2451             |
2452         <reg_exp>
2453 */
2454
2455 static bin_tree_t *
2456 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2457      re_string_t *regexp;
2458      regex_t *preg;
2459      re_token_t *token;
2460      reg_syntax_t syntax;
2461      int nest;
2462      reg_errcode_t *err;
2463 {
2464   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2465   bin_tree_t *tree, *left_par, *right_par;
2466   size_t cur_nsub;
2467   cur_nsub = preg->re_nsub++;
2468
2469   left_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2470   if (BE (left_par == NULL, 0))
2471     {
2472       *err = REG_ESPACE;
2473       return NULL;
2474     }
2475   dfa->nodes[left_par->node_idx].opr.idx = cur_nsub;
2476   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2477
2478   /* The subexpression may be a null string.  */
2479   if (token->type == OP_CLOSE_SUBEXP)
2480     tree = NULL;
2481   else
2482     {
2483       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2484       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2485         return NULL;
2486     }
2487   if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2488     {
2489       *err = REG_EPAREN;
2490       return NULL;
2491     }
2492   right_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2493   dfa->completed_bkref_map |= 1 << cur_nsub;
2494   tree = ((tree == NULL) ? right_par
2495           : create_tree (dfa, tree, right_par, CONCAT, 0));
2496   tree = create_tree (dfa, left_par, tree, CONCAT, 0);
2497   if (BE (right_par == NULL || tree == NULL, 0))
2498     {
2499       *err = REG_ESPACE;
2500       return NULL;
2501     }
2502   dfa->nodes[right_par->node_idx].opr.idx = cur_nsub;
2503
2504   return tree;
2505 }
2506
2507 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2508
2509 static bin_tree_t *
2510 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2511      bin_tree_t *elem;
2512      re_string_t *regexp;
2513      re_dfa_t *dfa;
2514      re_token_t *token;
2515      reg_syntax_t syntax;
2516      reg_errcode_t *err;
2517 {
2518   re_token_t dup_token;
2519   bin_tree_t *tree = NULL, *old_tree = NULL;
2520   int i, start, end, start_idx = re_string_cur_idx (regexp);
2521   re_token_t start_token = *token;
2522
2523   if (token->type == OP_OPEN_DUP_NUM)
2524     {
2525       end = 0;
2526       start = fetch_number (regexp, token, syntax);
2527       if (start == -1)
2528         {
2529           if (token->type == CHARACTER && token->opr.c == ',')
2530             start = 0; /* We treat "{,m}" as "{0,m}".  */
2531           else
2532             {
2533               *err = REG_BADBR; /* <re>{} is invalid.  */
2534               return NULL;
2535             }
2536         }
2537       if (BE (start != -2, 1))
2538         {
2539           /* We treat "{n}" as "{n,n}".  */
2540           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2541                  : ((token->type == CHARACTER && token->opr.c == ',')
2542                     ? fetch_number (regexp, token, syntax) : -2));
2543         }
2544       if (BE (start == -2 || end == -2, 0))
2545         {
2546           /* Invalid sequence.  */
2547           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2548             {
2549               if (token->type == END_OF_RE)
2550                 *err = REG_EBRACE;
2551               else
2552                 *err = REG_BADBR;
2553
2554               return NULL;
2555             }
2556
2557           /* If the syntax bit is set, rollback.  */
2558           re_string_set_index (regexp, start_idx);
2559           *token = start_token;
2560           token->type = CHARACTER;
2561           /* mb_partial and word_char bits should be already initialized by
2562              peek_token.  */
2563           return elem;
2564         }
2565
2566       if (BE (end != -1 && start > end, 0))
2567         {
2568           /* First number greater than second.  */
2569           *err = REG_BADBR;
2570           return NULL;
2571         }
2572     }
2573   else
2574     {
2575       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2576       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2577     }
2578
2579   fetch_token (token, regexp, syntax);
2580
2581   /* Treat "<re>{0}*" etc. as "<re>{0}".  */
2582   if (BE (elem == NULL || (start == 0 && end == 0), 0))
2583     return NULL;
2584
2585   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2586   if (BE (start > 0, 0))
2587     {
2588       tree = elem;
2589       for (i = 2; i <= start; ++i)
2590         {
2591           elem = duplicate_tree (elem, dfa);
2592           tree = create_tree (dfa, tree, elem, CONCAT, 0);
2593           if (BE (elem == NULL || tree == NULL, 0))
2594             goto parse_dup_op_espace;
2595         }
2596
2597       if (start == end)
2598         return tree;
2599
2600       /* Duplicate ELEM before it is marked optional.  */
2601       elem = duplicate_tree (elem, dfa);
2602       old_tree = tree;
2603     }
2604   else
2605     old_tree = NULL;
2606
2607   mark_opt_subexp (elem, dfa);
2608   dup_token.type = (end == -1 ? OP_DUP_ASTERISK : OP_DUP_QUESTION);
2609   tree = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2610   if (BE (tree == NULL, 0))
2611     goto parse_dup_op_espace;
2612
2613   /* This loop is actually executed only when end != -1,
2614      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2615      already created the start+1-th copy.  */
2616   for (i = start + 2; i <= end; ++i)
2617     {
2618       elem = duplicate_tree (elem, dfa);
2619       tree = create_tree (dfa, tree, elem, CONCAT, 0);
2620       if (BE (elem == NULL || tree == NULL, 0))
2621         goto parse_dup_op_espace;
2622
2623       tree = re_dfa_add_tree_node (dfa, tree, NULL, &dup_token);
2624       if (BE (tree == NULL, 0))
2625         goto parse_dup_op_espace;
2626     }
2627
2628   if (old_tree)
2629     tree = create_tree (dfa, old_tree, tree, CONCAT, 0);
2630
2631   return tree;
2632
2633  parse_dup_op_espace:
2634   *err = REG_ESPACE;
2635   return NULL;
2636 }
2637
2638 /* Size of the names for collating symbol/equivalence_class/character_class.
2639    I'm not sure, but maybe enough.  */
2640 #define BRACKET_NAME_BUF_SIZE 32
2641
2642 #ifndef _LIBC
2643   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2644      Build the range expression which starts from START_ELEM, and ends
2645      at END_ELEM.  The result are written to MBCSET and SBCSET.
2646      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2647      mbcset->range_ends, is a pointer argument sinse we may
2648      update it.  */
2649
2650 static reg_errcode_t
2651 # ifdef RE_ENABLE_I18N
2652 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2653      re_charset_t *mbcset;
2654      int *range_alloc;
2655 # else /* not RE_ENABLE_I18N */
2656 build_range_exp (sbcset, start_elem, end_elem)
2657 # endif /* not RE_ENABLE_I18N */
2658      re_bitset_ptr_t sbcset;
2659      bracket_elem_t *start_elem, *end_elem;
2660 {
2661   unsigned int start_ch, end_ch;
2662   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2663   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2664           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2665           0))
2666     return REG_ERANGE;
2667
2668   /* We can handle no multi character collating elements without libc
2669      support.  */
2670   if (BE ((start_elem->type == COLL_SYM
2671            && strlen ((char *) start_elem->opr.name) > 1)
2672           || (end_elem->type == COLL_SYM
2673               && strlen ((char *) end_elem->opr.name) > 1), 0))
2674     return REG_ECOLLATE;
2675
2676 # ifdef RE_ENABLE_I18N
2677   {
2678     wchar_t wc, start_wc, end_wc;
2679     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2680
2681     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2682                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2683                    : 0));
2684     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2685               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2686                  : 0));
2687     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2688                 ? __btowc (start_ch) : start_elem->opr.wch);
2689     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2690               ? __btowc (end_ch) : end_elem->opr.wch);
2691     if (start_wc == WEOF || end_wc == WEOF)
2692       return REG_ECOLLATE;
2693     cmp_buf[0] = start_wc;
2694     cmp_buf[4] = end_wc;
2695     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2696       return REG_ERANGE;
2697
2698     /* Got valid collation sequence values, add them as a new entry.
2699        However, for !_LIBC we have no collation elements: if the
2700        character set is single byte, the single byte character set
2701        that we build below suffices.  parse_bracket_exp passes
2702        no MBCSET if dfa->mb_cur_max == 1.  */
2703     if (mbcset)
2704       {
2705         /* Check the space of the arrays.  */
2706         if (BE (*range_alloc == mbcset->nranges, 0))
2707           {
2708             /* There is not enough space, need realloc.  */
2709             wchar_t *new_array_start, *new_array_end;
2710             int new_nranges;
2711
2712             /* +1 in case of mbcset->nranges is 0.  */
2713             new_nranges = 2 * mbcset->nranges + 1;
2714             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2715                are NULL if *range_alloc == 0.  */
2716             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2717                                           new_nranges);
2718             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2719                                         new_nranges);
2720
2721             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2722               return REG_ESPACE;
2723
2724             mbcset->range_starts = new_array_start;
2725             mbcset->range_ends = new_array_end;
2726             *range_alloc = new_nranges;
2727           }
2728
2729         mbcset->range_starts[mbcset->nranges] = start_wc;
2730         mbcset->range_ends[mbcset->nranges++] = end_wc;
2731       }
2732
2733     /* Build the table for single byte characters.  */
2734     for (wc = 0; wc < SBC_MAX; ++wc)
2735       {
2736         cmp_buf[2] = wc;
2737         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2738             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2739           bitset_set (sbcset, wc);
2740       }
2741   }
2742 # else /* not RE_ENABLE_I18N */
2743   {
2744     unsigned int ch;
2745     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2746                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2747                    : 0));
2748     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2749               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2750                  : 0));
2751     if (start_ch > end_ch)
2752       return REG_ERANGE;
2753     /* Build the table for single byte characters.  */
2754     for (ch = 0; ch < SBC_MAX; ++ch)
2755       if (start_ch <= ch  && ch <= end_ch)
2756         bitset_set (sbcset, ch);
2757   }
2758 # endif /* not RE_ENABLE_I18N */
2759   return REG_NOERROR;
2760 }
2761 #endif /* not _LIBC */
2762
2763 #ifndef _LIBC
2764 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2765    Build the collating element which is represented by NAME.
2766    The result are written to MBCSET and SBCSET.
2767    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2768    pointer argument since we may update it.  */
2769
2770 static reg_errcode_t
2771 # ifdef RE_ENABLE_I18N
2772 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2773      re_charset_t *mbcset;
2774      int *coll_sym_alloc;
2775 # else /* not RE_ENABLE_I18N */
2776 build_collating_symbol (sbcset, name)
2777 # endif /* not RE_ENABLE_I18N */
2778      re_bitset_ptr_t sbcset;
2779      const unsigned char *name;
2780 {
2781   size_t name_len = strlen ((const char *) name);
2782   if (BE (name_len != 1, 0))
2783     return REG_ECOLLATE;
2784   else
2785     {
2786       bitset_set (sbcset, name[0]);
2787       return REG_NOERROR;
2788     }
2789 }
2790 #endif /* not _LIBC */
2791
2792 /* This function parse bracket expression like "[abc]", "[a-c]",
2793    "[[.a-a.]]" etc.  */
2794
2795 static bin_tree_t *
2796 parse_bracket_exp (regexp, dfa, token, syntax, err)
2797      re_string_t *regexp;
2798      re_dfa_t *dfa;
2799      re_token_t *token;
2800      reg_syntax_t syntax;
2801      reg_errcode_t *err;
2802 {
2803 #ifdef _LIBC
2804   const unsigned char *collseqmb;
2805   const char *collseqwc;
2806   uint32_t nrules;
2807   int32_t table_size;
2808   const int32_t *symb_table;
2809   const unsigned char *extra;
2810
2811   /* Local function for parse_bracket_exp used in _LIBC environement.
2812      Seek the collating symbol entry correspondings to NAME.
2813      Return the index of the symbol in the SYMB_TABLE.  */
2814
2815   auto inline int32_t
2816   __attribute ((always_inline))
2817   seek_collating_symbol_entry (name, name_len)
2818          const unsigned char *name;
2819          size_t name_len;
2820     {
2821       int32_t hash = elem_hash ((const char *) name, name_len);
2822       int32_t elem = hash % table_size;
2823       int32_t second = hash % (table_size - 2);
2824       while (symb_table[2 * elem] != 0)
2825         {
2826           /* First compare the hashing value.  */
2827           if (symb_table[2 * elem] == hash
2828               /* Compare the length of the name.  */
2829               && name_len == extra[symb_table[2 * elem + 1]]
2830               /* Compare the name.  */
2831               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2832                          name_len) == 0)
2833             {
2834               /* Yep, this is the entry.  */
2835               break;
2836             }
2837
2838           /* Next entry.  */
2839           elem += second;
2840         }
2841       return elem;
2842     }
2843
2844   /* Local function for parse_bracket_exp used in _LIBC environement.
2845      Look up the collation sequence value of BR_ELEM.
2846      Return the value if succeeded, UINT_MAX otherwise.  */
2847
2848   auto inline unsigned int
2849   __attribute ((always_inline))
2850   lookup_collation_sequence_value (br_elem)
2851          bracket_elem_t *br_elem;
2852     {
2853       if (br_elem->type == SB_CHAR)
2854         {
2855           /*
2856           if (MB_CUR_MAX == 1)
2857           */
2858           if (nrules == 0)
2859             return collseqmb[br_elem->opr.ch];
2860           else
2861             {
2862               wint_t wc = __btowc (br_elem->opr.ch);
2863               return __collseq_table_lookup (collseqwc, wc);
2864             }
2865         }
2866       else if (br_elem->type == MB_CHAR)
2867         {
2868           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2869         }
2870       else if (br_elem->type == COLL_SYM)
2871         {
2872           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2873           if (nrules != 0)
2874             {
2875               int32_t elem, idx;
2876               elem = seek_collating_symbol_entry (br_elem->opr.name,
2877                                                   sym_name_len);
2878               if (symb_table[2 * elem] != 0)
2879                 {
2880                   /* We found the entry.  */
2881                   idx = symb_table[2 * elem + 1];
2882                   /* Skip the name of collating element name.  */
2883                   idx += 1 + extra[idx];
2884                   /* Skip the byte sequence of the collating element.  */
2885                   idx += 1 + extra[idx];
2886                   /* Adjust for the alignment.  */
2887                   idx = (idx + 3) & ~3;
2888                   /* Skip the multibyte collation sequence value.  */
2889                   idx += sizeof (unsigned int);
2890                   /* Skip the wide char sequence of the collating element.  */
2891                   idx += sizeof (unsigned int) *
2892                     (1 + *(unsigned int *) (extra + idx));
2893                   /* Return the collation sequence value.  */
2894                   return *(unsigned int *) (extra + idx);
2895                 }
2896               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2897                 {
2898                   /* No valid character.  Match it as a single byte
2899                      character.  */
2900                   return collseqmb[br_elem->opr.name[0]];
2901                 }
2902             }
2903           else if (sym_name_len == 1)
2904             return collseqmb[br_elem->opr.name[0]];
2905         }
2906       return UINT_MAX;
2907     }
2908
2909   /* Local function for parse_bracket_exp used in _LIBC environement.
2910      Build the range expression which starts from START_ELEM, and ends
2911      at END_ELEM.  The result are written to MBCSET and SBCSET.
2912      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2913      mbcset->range_ends, is a pointer argument sinse we may
2914      update it.  */
2915
2916   auto inline reg_errcode_t
2917   __attribute ((always_inline))
2918   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2919          re_charset_t *mbcset;
2920          int *range_alloc;
2921          re_bitset_ptr_t sbcset;
2922          bracket_elem_t *start_elem, *end_elem;
2923     {
2924       unsigned int ch;
2925       uint32_t start_collseq;
2926       uint32_t end_collseq;
2927
2928       /* Equivalence Classes and Character Classes can't be a range
2929          start/end.  */
2930       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2931               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2932               0))
2933         return REG_ERANGE;
2934
2935       start_collseq = lookup_collation_sequence_value (start_elem);
2936       end_collseq = lookup_collation_sequence_value (end_elem);
2937       /* Check start/end collation sequence values.  */
2938       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2939         return REG_ECOLLATE;
2940       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2941         return REG_ERANGE;
2942
2943       /* Got valid collation sequence values, add them as a new entry.
2944          However, if we have no collation elements, and the character set
2945          is single byte, the single byte character set that we
2946          build below suffices. */
2947       if (nrules > 0 || dfa->mb_cur_max > 1)
2948         {
2949           /* Check the space of the arrays.  */
2950           if (BE (*range_alloc == mbcset->nranges, 0))
2951             {
2952               /* There is not enough space, need realloc.  */
2953               uint32_t *new_array_start;
2954               uint32_t *new_array_end;
2955               int new_nranges;
2956
2957               /* +1 in case of mbcset->nranges is 0.  */
2958               new_nranges = 2 * mbcset->nranges + 1;
2959               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2960                                             new_nranges);
2961               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2962                                           new_nranges);
2963
2964               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2965                 return REG_ESPACE;
2966
2967               mbcset->range_starts = new_array_start;
2968               mbcset->range_ends = new_array_end;
2969               *range_alloc = new_nranges;
2970             }
2971
2972           mbcset->range_starts[mbcset->nranges] = start_collseq;
2973           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2974         }
2975
2976       /* Build the table for single byte characters.  */
2977       for (ch = 0; ch < SBC_MAX; ch++)
2978         {
2979           uint32_t ch_collseq;
2980           /*
2981           if (MB_CUR_MAX == 1)
2982           */
2983           if (nrules == 0)
2984             ch_collseq = collseqmb[ch];
2985           else
2986             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2987           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2988             bitset_set (sbcset, ch);
2989         }
2990       return REG_NOERROR;
2991     }
2992
2993   /* Local function for parse_bracket_exp used in _LIBC environement.
2994      Build the collating element which is represented by NAME.
2995      The result are written to MBCSET and SBCSET.
2996      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2997      pointer argument sinse we may update it.  */
2998
2999   auto inline reg_errcode_t
3000   __attribute ((always_inline))
3001   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3002          re_charset_t *mbcset;
3003          int *coll_sym_alloc;
3004          re_bitset_ptr_t sbcset;
3005          const unsigned char *name;
3006     {
3007       int32_t elem, idx;
3008       size_t name_len = strlen ((const char *) name);
3009       if (nrules != 0)
3010         {
3011           elem = seek_collating_symbol_entry (name, name_len);
3012           if (symb_table[2 * elem] != 0)
3013             {
3014               /* We found the entry.  */
3015               idx = symb_table[2 * elem + 1];
3016               /* Skip the name of collating element name.  */
3017               idx += 1 + extra[idx];
3018             }
3019           else if (symb_table[2 * elem] == 0 && name_len == 1)
3020             {
3021               /* No valid character, treat it as a normal
3022                  character.  */
3023               bitset_set (sbcset, name[0]);
3024               return REG_NOERROR;
3025             }
3026           else
3027             return REG_ECOLLATE;
3028
3029           /* Got valid collation sequence, add it as a new entry.  */
3030           /* Check the space of the arrays.  */
3031           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3032             {
3033               /* Not enough, realloc it.  */
3034               /* +1 in case of mbcset->ncoll_syms is 0.  */
3035               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3036               /* Use realloc since mbcset->coll_syms is NULL
3037                  if *alloc == 0.  */
3038               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3039                                                    new_coll_sym_alloc);
3040               if (BE (new_coll_syms == NULL, 0))
3041                 return REG_ESPACE;
3042               mbcset->coll_syms = new_coll_syms;
3043               *coll_sym_alloc = new_coll_sym_alloc;
3044             }
3045           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3046           return REG_NOERROR;
3047         }
3048       else
3049         {
3050           if (BE (name_len != 1, 0))
3051             return REG_ECOLLATE;
3052           else
3053             {
3054               bitset_set (sbcset, name[0]);
3055               return REG_NOERROR;
3056             }
3057         }
3058     }
3059 #endif
3060
3061   re_token_t br_token;
3062   re_bitset_ptr_t sbcset;
3063 #ifdef RE_ENABLE_I18N
3064   re_charset_t *mbcset;
3065   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3066   int equiv_class_alloc = 0, char_class_alloc = 0;
3067 #endif /* not RE_ENABLE_I18N */
3068   int non_match = 0;
3069   bin_tree_t *work_tree;
3070   int token_len;
3071   int first_round = 1;
3072 #ifdef _LIBC
3073   collseqmb = (const unsigned char *)
3074     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3075   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3076   if (nrules)
3077     {
3078       /*
3079       if (MB_CUR_MAX > 1)
3080       */
3081         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3082       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3083       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3084                                                   _NL_COLLATE_SYMB_TABLEMB);
3085       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3086                                                    _NL_COLLATE_SYMB_EXTRAMB);
3087     }
3088 #endif
3089   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3090 #ifdef RE_ENABLE_I18N
3091   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3092 #endif /* RE_ENABLE_I18N */
3093 #ifdef RE_ENABLE_I18N
3094   if (BE (sbcset == NULL || mbcset == NULL, 0))
3095 #else
3096   if (BE (sbcset == NULL, 0))
3097 #endif /* RE_ENABLE_I18N */
3098     {
3099       *err = REG_ESPACE;
3100       return NULL;
3101     }
3102
3103   token_len = peek_token_bracket (token, regexp, syntax);
3104   if (BE (token->type == END_OF_RE, 0))
3105     {
3106       *err = REG_BADPAT;
3107       goto parse_bracket_exp_free_return;
3108     }
3109   if (token->type == OP_NON_MATCH_LIST)
3110     {
3111 #ifdef RE_ENABLE_I18N
3112       mbcset->non_match = 1;
3113 #endif /* not RE_ENABLE_I18N */
3114       non_match = 1;
3115       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3116         bitset_set (sbcset, '\0');
3117       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3118       token_len = peek_token_bracket (token, regexp, syntax);
3119       if (BE (token->type == END_OF_RE, 0))
3120         {
3121           *err = REG_BADPAT;
3122           goto parse_bracket_exp_free_return;
3123         }
3124     }
3125
3126   /* We treat the first ']' as a normal character.  */
3127   if (token->type == OP_CLOSE_BRACKET)
3128     token->type = CHARACTER;
3129
3130   while (1)
3131     {
3132       bracket_elem_t start_elem, end_elem;
3133       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3134       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3135       reg_errcode_t ret;
3136       int token_len2 = 0, is_range_exp = 0;
3137       re_token_t token2;
3138
3139       start_elem.opr.name = start_name_buf;
3140       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3141                                    syntax, first_round);
3142       if (BE (ret != REG_NOERROR, 0))
3143         {
3144           *err = ret;
3145           goto parse_bracket_exp_free_return;
3146         }
3147       first_round = 0;
3148
3149       /* Get information about the next token.  We need it in any case.  */
3150       token_len = peek_token_bracket (token, regexp, syntax);
3151
3152       /* Do not check for ranges if we know they are not allowed.  */
3153       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3154         {
3155           if (BE (token->type == END_OF_RE, 0))
3156             {
3157               *err = REG_EBRACK;
3158               goto parse_bracket_exp_free_return;
3159             }
3160           if (token->type == OP_CHARSET_RANGE)
3161             {
3162               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3163               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3164               if (BE (token2.type == END_OF_RE, 0))
3165                 {
3166                   *err = REG_EBRACK;
3167                   goto parse_bracket_exp_free_return;
3168                 }
3169               if (token2.type == OP_CLOSE_BRACKET)
3170                 {
3171                   /* We treat the last '-' as a normal character.  */
3172                   re_string_skip_bytes (regexp, -token_len);
3173                   token->type = CHARACTER;
3174                 }
3175               else
3176                 is_range_exp = 1;
3177             }
3178         }
3179
3180       if (is_range_exp == 1)
3181         {
3182           end_elem.opr.name = end_name_buf;
3183           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3184                                        dfa, syntax, 1);
3185           if (BE (ret != REG_NOERROR, 0))
3186             {
3187               *err = ret;
3188               goto parse_bracket_exp_free_return;
3189             }
3190
3191           token_len = peek_token_bracket (token, regexp, syntax);
3192
3193 #ifdef _LIBC
3194           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3195                                   &start_elem, &end_elem);
3196 #else
3197 # ifdef RE_ENABLE_I18N
3198           *err = build_range_exp (sbcset,
3199                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3200                                   &range_alloc, &start_elem, &end_elem);
3201 # else
3202           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3203 # endif
3204 #endif /* RE_ENABLE_I18N */
3205           if (BE (*err != REG_NOERROR, 0))
3206             goto parse_bracket_exp_free_return;
3207         }
3208       else
3209         {
3210           switch (start_elem.type)
3211             {
3212             case SB_CHAR:
3213               bitset_set (sbcset, start_elem.opr.ch);
3214               break;
3215 #ifdef RE_ENABLE_I18N
3216             case MB_CHAR:
3217               /* Check whether the array has enough space.  */
3218               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3219                 {
3220                   wchar_t *new_mbchars;
3221                   /* Not enough, realloc it.  */
3222                   /* +1 in case of mbcset->nmbchars is 0.  */
3223                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3224                   /* Use realloc since array is NULL if *alloc == 0.  */
3225                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3226                                             mbchar_alloc);
3227                   if (BE (new_mbchars == NULL, 0))
3228                     goto parse_bracket_exp_espace;
3229                   mbcset->mbchars = new_mbchars;
3230                 }
3231               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3232               break;
3233 #endif /* RE_ENABLE_I18N */
3234             case EQUIV_CLASS:
3235               *err = build_equiv_class (sbcset,
3236 #ifdef RE_ENABLE_I18N
3237                                         mbcset, &equiv_class_alloc,
3238 #endif /* RE_ENABLE_I18N */
3239                                         start_elem.opr.name);
3240               if (BE (*err != REG_NOERROR, 0))
3241                 goto parse_bracket_exp_free_return;
3242               break;
3243             case COLL_SYM:
3244               *err = build_collating_symbol (sbcset,
3245 #ifdef RE_ENABLE_I18N
3246                                              mbcset, &coll_sym_alloc,
3247 #endif /* RE_ENABLE_I18N */
3248                                              start_elem.opr.name);
3249               if (BE (*err != REG_NOERROR, 0))
3250                 goto parse_bracket_exp_free_return;
3251               break;
3252             case CHAR_CLASS:
3253               *err = build_charclass (regexp->trans, sbcset,
3254 #ifdef RE_ENABLE_I18N
3255                                       mbcset, &char_class_alloc,
3256 #endif /* RE_ENABLE_I18N */
3257                                       start_elem.opr.name, syntax);
3258               if (BE (*err != REG_NOERROR, 0))
3259                goto parse_bracket_exp_free_return;
3260               break;
3261             default:
3262               assert (0);
3263               break;
3264             }
3265         }
3266       if (BE (token->type == END_OF_RE, 0))
3267         {
3268           *err = REG_EBRACK;
3269           goto parse_bracket_exp_free_return;
3270         }
3271       if (token->type == OP_CLOSE_BRACKET)
3272         break;
3273     }
3274
3275   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3276
3277   /* If it is non-matching list.  */
3278   if (non_match)
3279     bitset_not (sbcset);
3280
3281 #ifdef RE_ENABLE_I18N
3282   /* Ensure only single byte characters are set.  */
3283   if (dfa->mb_cur_max > 1)
3284     bitset_mask (sbcset, dfa->sb_char);
3285 #endif /* RE_ENABLE_I18N */
3286
3287   /* Build a tree for simple bracket.  */
3288   br_token.type = SIMPLE_BRACKET;
3289   br_token.opr.sbcset = sbcset;
3290   work_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3291   if (BE (work_tree == NULL, 0))
3292     goto parse_bracket_exp_espace;
3293
3294 #ifdef RE_ENABLE_I18N
3295   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3296       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3297                                                      || mbcset->non_match)))
3298     {
3299       re_token_t alt_token;
3300       bin_tree_t *mbc_tree;
3301       int sbc_idx;
3302       /* Build a tree for complex bracket.  */
3303       dfa->has_mb_node = 1;
3304       for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3305         if (sbcset[sbc_idx])
3306           break;
3307       /* If there are no bits set in sbcset, there is no point
3308          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3309       if (sbc_idx == BITSET_UINTS)
3310         {
3311           re_free (sbcset);
3312           dfa->nodes[work_tree->node_idx].type = COMPLEX_BRACKET;
3313           dfa->nodes[work_tree->node_idx].opr.mbcset = mbcset;
3314           return work_tree;
3315         }
3316       br_token.type = COMPLEX_BRACKET;
3317       br_token.opr.mbcset = mbcset;
3318       mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3319       if (BE (mbc_tree == NULL, 0))
3320         goto parse_bracket_exp_espace;
3321       /* Then join them by ALT node.  */
3322       alt_token.type = OP_ALT;
3323       dfa->has_plural_match = 1;
3324       work_tree = re_dfa_add_tree_node (dfa, work_tree, mbc_tree, &alt_token);
3325       if (BE (mbc_tree != NULL, 1))
3326         return work_tree;
3327     }
3328   else
3329     {
3330       free_charset (mbcset);
3331       return work_tree;
3332     }
3333 #else /* not RE_ENABLE_I18N */
3334   return work_tree;
3335 #endif /* not RE_ENABLE_I18N */
3336
3337  parse_bracket_exp_espace:
3338   *err = REG_ESPACE;
3339  parse_bracket_exp_free_return:
3340   re_free (sbcset);
3341 #ifdef RE_ENABLE_I18N
3342   free_charset (mbcset);
3343 #endif /* RE_ENABLE_I18N */
3344   return NULL;
3345 }
3346
3347 /* Parse an element in the bracket expression.  */
3348
3349 static reg_errcode_t
3350 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3351                        accept_hyphen)
3352      bracket_elem_t *elem;
3353      re_string_t *regexp;
3354      re_token_t *token;
3355      int token_len;
3356      re_dfa_t *dfa;
3357      reg_syntax_t syntax;
3358      int accept_hyphen;
3359 {
3360 #ifdef RE_ENABLE_I18N
3361   int cur_char_size;
3362   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3363   if (cur_char_size > 1)
3364     {
3365       elem->type = MB_CHAR;
3366       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3367       re_string_skip_bytes (regexp, cur_char_size);
3368       return REG_NOERROR;
3369     }
3370 #endif /* RE_ENABLE_I18N */
3371   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3372   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3373       || token->type == OP_OPEN_EQUIV_CLASS)
3374     return parse_bracket_symbol (elem, regexp, token);
3375   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3376     {
3377       /* A '-' must only appear as anything but a range indicator before
3378          the closing bracket.  Everything else is an error.  */
3379       re_token_t token2;
3380       (void) peek_token_bracket (&token2, regexp, syntax);
3381       if (token2.type != OP_CLOSE_BRACKET)
3382         /* The actual error value is not standardized since this whole
3383            case is undefined.  But ERANGE makes good sense.  */
3384         return REG_ERANGE;
3385     }
3386   elem->type = SB_CHAR;
3387   elem->opr.ch = token->opr.c;
3388   return REG_NOERROR;
3389 }
3390
3391 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3392    such as [:<character_class>:], [.<collating_element>.], and
3393    [=<equivalent_class>=].  */
3394
3395 static reg_errcode_t
3396 parse_bracket_symbol (elem, regexp, token)
3397      bracket_elem_t *elem;
3398      re_string_t *regexp;
3399      re_token_t *token;
3400 {
3401   unsigned char ch, delim = token->opr.c;
3402   int i = 0;
3403   if (re_string_eoi(regexp))
3404     return REG_EBRACK;
3405   for (;; ++i)
3406     {
3407       if (i >= BRACKET_NAME_BUF_SIZE)
3408         return REG_EBRACK;
3409       if (token->type == OP_OPEN_CHAR_CLASS)
3410         ch = re_string_fetch_byte_case (regexp);
3411       else
3412         ch = re_string_fetch_byte (regexp);
3413       if (re_string_eoi(regexp))
3414         return REG_EBRACK;
3415       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3416         break;
3417       elem->opr.name[i] = ch;
3418     }
3419   re_string_skip_bytes (regexp, 1);
3420   elem->opr.name[i] = '\0';
3421   switch (token->type)
3422     {
3423     case OP_OPEN_COLL_ELEM:
3424       elem->type = COLL_SYM;
3425       break;
3426     case OP_OPEN_EQUIV_CLASS:
3427       elem->type = EQUIV_CLASS;
3428       break;
3429     case OP_OPEN_CHAR_CLASS:
3430       elem->type = CHAR_CLASS;
3431       break;
3432     default:
3433       break;
3434     }
3435   return REG_NOERROR;
3436 }
3437
3438   /* Helper function for parse_bracket_exp.
3439      Build the equivalence class which is represented by NAME.
3440      The result are written to MBCSET and SBCSET.
3441      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3442      is a pointer argument sinse we may update it.  */
3443
3444 static reg_errcode_t
3445 #ifdef RE_ENABLE_I18N
3446 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3447      re_charset_t *mbcset;
3448      int *equiv_class_alloc;
3449 #else /* not RE_ENABLE_I18N */
3450 build_equiv_class (sbcset, name)
3451 #endif /* not RE_ENABLE_I18N */
3452      re_bitset_ptr_t sbcset;
3453      const unsigned char *name;
3454 {
3455 #if defined _LIBC
3456   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3457   if (nrules != 0)
3458     {
3459       const int32_t *table, *indirect;
3460       const unsigned char *weights, *extra, *cp;
3461       unsigned char char_buf[2];
3462       int32_t idx1, idx2;
3463       unsigned int ch;
3464       size_t len;
3465       /* This #include defines a local function!  */
3466 # include <locale/weight.h>
3467       /* Calculate the index for equivalence class.  */
3468       cp = name;
3469       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3470       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3471                                                _NL_COLLATE_WEIGHTMB);
3472       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3473                                                    _NL_COLLATE_EXTRAMB);
3474       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3475                                                 _NL_COLLATE_INDIRECTMB);
3476       idx1 = findidx (&cp);
3477       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3478         /* This isn't a valid character.  */
3479         return REG_ECOLLATE;
3480
3481       /* Build single byte matcing table for this equivalence class.  */
3482       char_buf[1] = (unsigned char) '\0';
3483       len = weights[idx1];
3484       for (ch = 0; ch < SBC_MAX; ++ch)
3485         {
3486           char_buf[0] = ch;
3487           cp = char_buf;
3488           idx2 = findidx (&cp);
3489 /*
3490           idx2 = table[ch];
3491 */
3492           if (idx2 == 0)
3493             /* This isn't a valid character.  */
3494             continue;
3495           if (len == weights[idx2])
3496             {
3497               int cnt = 0;
3498               while (cnt <= len &&
3499                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3500                 ++cnt;
3501
3502               if (cnt > len)
3503                 bitset_set (sbcset, ch);
3504             }
3505         }
3506       /* Check whether the array has enough space.  */
3507       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3508         {
3509           /* Not enough, realloc it.  */
3510           /* +1 in case of mbcset->nequiv_classes is 0.  */
3511           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3512           /* Use realloc since the array is NULL if *alloc == 0.  */
3513           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3514                                                    int32_t,
3515                                                    new_equiv_class_alloc);
3516           if (BE (new_equiv_classes == NULL, 0))
3517             return REG_ESPACE;
3518           mbcset->equiv_classes = new_equiv_classes;
3519           *equiv_class_alloc = new_equiv_class_alloc;
3520         }
3521       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3522     }
3523   else
3524 #endif /* _LIBC */
3525     {
3526       if (BE (strlen ((const char *) name) != 1, 0))
3527         return REG_ECOLLATE;
3528       bitset_set (sbcset, *name);
3529     }
3530   return REG_NOERROR;
3531 }
3532
3533   /* Helper function for parse_bracket_exp.
3534      Build the character class which is represented by NAME.
3535      The result are written to MBCSET and SBCSET.
3536      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3537      is a pointer argument sinse we may update it.  */
3538
3539 static reg_errcode_t
3540 #ifdef RE_ENABLE_I18N
3541 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3542      re_charset_t *mbcset;
3543      int *char_class_alloc;
3544 #else /* not RE_ENABLE_I18N */
3545 build_charclass (trans, sbcset, class_name, syntax)
3546 #endif /* not RE_ENABLE_I18N */
3547      unsigned RE_TRANSLATE_TYPE trans;
3548      re_bitset_ptr_t sbcset;
3549      const unsigned char *class_name;
3550      reg_syntax_t syntax;
3551 {
3552   int i;
3553   const char *name = (const char *) class_name;
3554
3555   /* In case of REG_ICASE "upper" and "lower" match the both of
3556      upper and lower cases.  */
3557   if ((syntax & RE_ICASE)
3558       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3559     name = "alpha";
3560
3561 #ifdef RE_ENABLE_I18N
3562   /* Check the space of the arrays.  */
3563   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3564     {
3565       /* Not enough, realloc it.  */
3566       /* +1 in case of mbcset->nchar_classes is 0.  */
3567       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3568       /* Use realloc since array is NULL if *alloc == 0.  */
3569       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3570                                                new_char_class_alloc);
3571       if (BE (new_char_classes == NULL, 0))
3572         return REG_ESPACE;
3573       mbcset->char_classes = new_char_classes;
3574       *char_class_alloc = new_char_class_alloc;
3575     }
3576   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3577 #endif /* RE_ENABLE_I18N */
3578
3579 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3580     for (i = 0; i < SBC_MAX; ++i)               \
3581       {                                         \
3582         if (ctype_func (i))                     \
3583           {                                     \
3584             int ch = trans ? trans[i] : i;      \
3585             bitset_set (sbcset, ch);            \
3586           }                                     \
3587       }
3588
3589   if (strcmp (name, "alnum") == 0)
3590     BUILD_CHARCLASS_LOOP (isalnum)
3591   else if (strcmp (name, "cntrl") == 0)
3592     BUILD_CHARCLASS_LOOP (iscntrl)
3593   else if (strcmp (name, "lower") == 0)
3594     BUILD_CHARCLASS_LOOP (islower)
3595   else if (strcmp (name, "space") == 0)
3596     BUILD_CHARCLASS_LOOP (isspace)
3597   else if (strcmp (name, "alpha") == 0)
3598     BUILD_CHARCLASS_LOOP (isalpha)
3599   else if (strcmp (name, "digit") == 0)
3600     BUILD_CHARCLASS_LOOP (isdigit)
3601   else if (strcmp (name, "print") == 0)
3602     BUILD_CHARCLASS_LOOP (isprint)
3603   else if (strcmp (name, "upper") == 0)
3604     BUILD_CHARCLASS_LOOP (isupper)
3605   else if (strcmp (name, "blank") == 0)
3606     BUILD_CHARCLASS_LOOP (isblank)
3607   else if (strcmp (name, "graph") == 0)
3608     BUILD_CHARCLASS_LOOP (isgraph)
3609   else if (strcmp (name, "punct") == 0)
3610     BUILD_CHARCLASS_LOOP (ispunct)
3611   else if (strcmp (name, "xdigit") == 0)
3612     BUILD_CHARCLASS_LOOP (isxdigit)
3613   else
3614     return REG_ECTYPE;
3615
3616   return REG_NOERROR;
3617 }
3618
3619 static bin_tree_t *
3620 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3621      re_dfa_t *dfa;
3622      unsigned RE_TRANSLATE_TYPE trans;
3623      const unsigned char *class_name;
3624      const unsigned char *extra;
3625      int non_match;
3626      reg_errcode_t *err;
3627 {
3628   re_bitset_ptr_t sbcset;
3629 #ifdef RE_ENABLE_I18N
3630   re_charset_t *mbcset;
3631   int alloc = 0;
3632 #endif /* not RE_ENABLE_I18N */
3633   reg_errcode_t ret;
3634   re_token_t br_token;
3635   bin_tree_t *tree;
3636
3637   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3638 #ifdef RE_ENABLE_I18N
3639   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3640 #endif /* RE_ENABLE_I18N */
3641
3642 #ifdef RE_ENABLE_I18N
3643   if (BE (sbcset == NULL || mbcset == NULL, 0))
3644 #else /* not RE_ENABLE_I18N */
3645   if (BE (sbcset == NULL, 0))
3646 #endif /* not RE_ENABLE_I18N */
3647     {
3648       *err = REG_ESPACE;
3649       return NULL;
3650     }
3651
3652   if (non_match)
3653     {
3654 #ifdef RE_ENABLE_I18N
3655       /*
3656       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3657         bitset_set(cset->sbcset, '\0');
3658       */
3659       mbcset->non_match = 1;
3660 #endif /* not RE_ENABLE_I18N */
3661     }
3662
3663   /* We don't care the syntax in this case.  */
3664   ret = build_charclass (trans, sbcset,
3665 #ifdef RE_ENABLE_I18N
3666                          mbcset, &alloc,
3667 #endif /* RE_ENABLE_I18N */
3668                          class_name, 0);
3669
3670   if (BE (ret != REG_NOERROR, 0))
3671     {
3672       re_free (sbcset);
3673 #ifdef RE_ENABLE_I18N
3674       free_charset (mbcset);
3675 #endif /* RE_ENABLE_I18N */
3676       *err = ret;
3677       return NULL;
3678     }
3679   /* \w match '_' also.  */
3680   for (; *extra; extra++)
3681     bitset_set (sbcset, *extra);
3682
3683   /* If it is non-matching list.  */
3684   if (non_match)
3685     bitset_not (sbcset);
3686
3687 #ifdef RE_ENABLE_I18N
3688   /* Ensure only single byte characters are set.  */
3689   if (dfa->mb_cur_max > 1)
3690     bitset_mask (sbcset, dfa->sb_char);
3691 #endif
3692
3693   /* Build a tree for simple bracket.  */
3694   br_token.type = SIMPLE_BRACKET;
3695   br_token.opr.sbcset = sbcset;
3696   tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3697   if (BE (tree == NULL, 0))
3698     goto build_word_op_espace;
3699
3700 #ifdef RE_ENABLE_I18N
3701   if (dfa->mb_cur_max > 1)
3702     {
3703       re_token_t alt_token;
3704       bin_tree_t *mbc_tree;
3705       /* Build a tree for complex bracket.  */
3706       br_token.type = COMPLEX_BRACKET;
3707       br_token.opr.mbcset = mbcset;
3708       dfa->has_mb_node = 1;
3709       mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3710       if (BE (mbc_tree == NULL, 0))
3711         goto build_word_op_espace;
3712       /* Then join them by ALT node.  */
3713       alt_token.type = OP_ALT;
3714       dfa->has_plural_match = 1;
3715       tree = re_dfa_add_tree_node (dfa, tree, mbc_tree, &alt_token);
3716       if (BE (mbc_tree != NULL, 1))
3717         return tree;
3718     }
3719   else
3720     {
3721       free_charset (mbcset);
3722       return tree;
3723     }
3724 #else /* not RE_ENABLE_I18N */
3725   return tree;
3726 #endif /* not RE_ENABLE_I18N */
3727
3728  build_word_op_espace:
3729   re_free (sbcset);
3730 #ifdef RE_ENABLE_I18N
3731   free_charset (mbcset);
3732 #endif /* RE_ENABLE_I18N */
3733   *err = REG_ESPACE;
3734   return NULL;
3735 }
3736
3737 /* This is intended for the expressions like "a{1,3}".
3738    Fetch a number from `input', and return the number.
3739    Return -1, if the number field is empty like "{,1}".
3740    Return -2, If an error is occured.  */
3741
3742 static int
3743 fetch_number (input, token, syntax)
3744      re_string_t *input;
3745      re_token_t *token;
3746      reg_syntax_t syntax;
3747 {
3748   int num = -1;
3749   unsigned char c;
3750   while (1)
3751     {
3752       fetch_token (token, input, syntax);
3753       c = token->opr.c;
3754       if (BE (token->type == END_OF_RE, 0))
3755         return -2;
3756       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3757         break;
3758       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3759              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3760       num = (num > RE_DUP_MAX) ? -2 : num;
3761     }
3762   return num;
3763 }
3764 \f
3765 #ifdef RE_ENABLE_I18N
3766 static void
3767 free_charset (re_charset_t *cset)
3768 {
3769   re_free (cset->mbchars);
3770 # ifdef _LIBC
3771   re_free (cset->coll_syms);
3772   re_free (cset->equiv_classes);
3773   re_free (cset->range_starts);
3774   re_free (cset->range_ends);
3775 # endif
3776   re_free (cset->char_classes);
3777   re_free (cset);
3778 }
3779 #endif /* RE_ENABLE_I18N */
3780 \f
3781 /* Functions for binary tree operation.  */
3782
3783 /* Create a tree node.  */
3784
3785 static bin_tree_t *
3786 create_tree (dfa, left, right, type, index)
3787      re_dfa_t *dfa;
3788      bin_tree_t *left;
3789      bin_tree_t *right;
3790      re_token_type_t type;
3791      int index;
3792 {
3793   bin_tree_t *tree;
3794   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3795     {
3796       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3797
3798       if (storage == NULL)
3799         return NULL;
3800       storage->next = dfa->str_tree_storage;
3801       dfa->str_tree_storage = storage;
3802       dfa->str_tree_storage_idx = 0;
3803     }
3804   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3805
3806   tree->parent = NULL;
3807   tree->left = left;
3808   tree->right = right;
3809   tree->type = type;
3810   tree->node_idx = index;
3811   tree->first = -1;
3812   tree->next = -1;
3813   re_node_set_init_empty (&tree->eclosure);
3814
3815   if (left != NULL)
3816     left->parent = tree;
3817   if (right != NULL)
3818     right->parent = tree;
3819   return tree;
3820 }
3821
3822 /* Create both a DFA node and a tree for it.  */
3823
3824 static bin_tree_t *
3825 re_dfa_add_tree_node (dfa, left, right, token)
3826      re_dfa_t *dfa;
3827      bin_tree_t *left;
3828      bin_tree_t *right;
3829      const re_token_t *token;
3830 {
3831   int new_idx = re_dfa_add_node (dfa, *token, 0);
3832
3833   if (new_idx == -1)
3834     return NULL;
3835
3836   return create_tree (dfa, left, right, 0, new_idx);
3837 }
3838
3839 /* Mark the tree SRC as an optional subexpression.  */
3840
3841 static void
3842 mark_opt_subexp (src, dfa)
3843      const bin_tree_t *src;
3844      re_dfa_t *dfa;
3845 {
3846   /* Pass an OPT_SUBEXP_IDX which is != 1 if the duplicated tree is
3847      a subexpression.  */
3848   if (src->type == CONCAT
3849       && src->left->type == NON_TYPE
3850       && dfa->nodes[src->left->node_idx].type == OP_OPEN_SUBEXP)
3851     mark_opt_subexp_iter (src, dfa, dfa->nodes[src->left->node_idx].opr.idx);
3852 }
3853
3854
3855 /* Recursive tree walker for mark_opt_subexp.  */
3856
3857 static void
3858 mark_opt_subexp_iter (src, dfa, idx)
3859      const bin_tree_t *src;
3860      re_dfa_t *dfa;
3861      int idx;
3862 {
3863   int node_idx;
3864
3865   if (src->type == NON_TYPE)
3866     {
3867       node_idx = src->node_idx;
3868       if ((dfa->nodes[node_idx].type == OP_OPEN_SUBEXP
3869            || dfa->nodes[node_idx].type == OP_CLOSE_SUBEXP)
3870           && dfa->nodes[node_idx].opr.idx == idx)
3871         dfa->nodes[node_idx].opt_subexp = 1;
3872      }
3873
3874   if (src->left != NULL)
3875     mark_opt_subexp_iter (src->left, dfa, idx);
3876
3877   if (src->right != NULL)
3878     mark_opt_subexp_iter (src->right, dfa, idx);
3879 }
3880
3881
3882 /* Duplicate the node SRC, and return new node.  */
3883
3884 static bin_tree_t *
3885 duplicate_tree (src, dfa)
3886      const bin_tree_t *src;
3887      re_dfa_t *dfa;
3888 {
3889   bin_tree_t *left = NULL, *right = NULL, *new_tree;
3890   int new_node_idx;
3891   /* Since node indies must be according to Post-order of the tree,
3892      we must duplicate the left at first.  */
3893   if (src->left != NULL)
3894     {
3895       left = duplicate_tree (src->left, dfa);
3896       if (left == NULL)
3897         return NULL;
3898     }
3899
3900   /* Secondaly, duplicate the right.  */
3901   if (src->right != NULL)
3902     {
3903       right = duplicate_tree (src->right, dfa);
3904       if (right == NULL)
3905         return NULL;
3906     }
3907
3908   /* At last, duplicate itself.  */
3909   if (src->type == NON_TYPE)
3910     {
3911       new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3912       dfa->nodes[new_node_idx].duplicated = 1;
3913       if (BE (new_node_idx == -1, 0))
3914         return NULL;
3915     }
3916   else
3917     new_node_idx = src->type;
3918
3919   new_tree = create_tree (dfa, left, right, src->type, new_node_idx);
3920   return new_tree;
3921 }