posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           int length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  27 static void init_word_char (re_dfa_t *dfa);
  28 #ifdef RE_ENABLE_I18N
  29 static void free_charset (re_charset_t *cset);
  30 #endif /* RE_ENABLE_I18N */
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 #ifdef RE_ENABLE_I18N
  34 static void optimize_utf8 (re_dfa_t *dfa);
  35 #endif
  36 static reg_errcode_t analyze (regex_t *preg);
  37 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  38 static reg_errcode_t preorder (bin_tree_t *root,
  39                                reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                void *extra);
  41 static reg_errcode_t postorder (bin_tree_t *root,
  42                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  43                                 void *extra);
  44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  47                                  bin_tree_t *node);
  48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  51 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
  52                                              int top_clone_node, int root_node,
  53                                              unsigned int constraint);
  54 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
  55                                      unsigned int constraint);
  56 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
  57                                    unsigned int constraint);
  58 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  59 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  60                                          int node, int root);
  61 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  62 static int fetch_number (re_string_t *input, re_token_t *token,
  63                          reg_syntax_t syntax);
  64 static void fetch_token (re_token_t *result, re_string_t *input,
  65                          reg_syntax_t syntax);
  66 static int peek_token (re_token_t *token, re_string_t *input,
  67                         reg_syntax_t syntax);
  68 static int peek_token_bracket (re_token_t *token, re_string_t *input,
  69                                reg_syntax_t syntax);
  70 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  71                           reg_syntax_t syntax, reg_errcode_t *err);
  72 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  73                                   re_token_t *token, reg_syntax_t syntax,
  74                                   int nest, reg_errcode_t *err);
  75 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  76                                  re_token_t *token, reg_syntax_t syntax,
  77                                  int nest, reg_errcode_t *err);
  78 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  79                                      re_token_t *token, reg_syntax_t syntax,
  80                                      int nest, reg_errcode_t *err);
  81 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  82                                   re_token_t *token, reg_syntax_t syntax,
  83                                   int nest, reg_errcode_t *err);
  84 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  85                                  re_dfa_t *dfa, re_token_t *token,
  86                                  reg_syntax_t syntax, reg_errcode_t *err);
  87 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  88                                       re_token_t *token, reg_syntax_t syntax,
  89                                       reg_errcode_t *err);
  90 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  91                                             re_string_t *regexp,
  92                                             re_token_t *token, int token_len,
  93                                             re_dfa_t *dfa,
  94                                             reg_syntax_t syntax,
  95                                             int accept_hyphen);
  96 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  97                                           re_string_t *regexp,
  98                                           re_token_t *token);
  99 #ifndef _LIBC
 100 # ifdef RE_ENABLE_I18N
 101 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 102                                       re_charset_t *mbcset, int *range_alloc,
 103                                       bracket_elem_t *start_elem,
 104                                       bracket_elem_t *end_elem);
 105 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 106                                              re_charset_t *mbcset,
 107                                              int *coll_sym_alloc,
 108                                              const unsigned char *name);
 109 # else /* not RE_ENABLE_I18N */
 110 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 111                                       bracket_elem_t *start_elem,
 112                                       bracket_elem_t *end_elem);
 113 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 114                                              const unsigned char *name);
 115 # endif /* not RE_ENABLE_I18N */
 116 #endif /* not _LIBC */
 117 #ifdef RE_ENABLE_I18N
 118 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 119                                         re_charset_t *mbcset,
 120                                         int *equiv_class_alloc,
 121                                         const unsigned char *name);
 122 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 123                                       re_bitset_ptr_t sbcset,
 124                                       re_charset_t *mbcset,
 125                                       int *char_class_alloc,
 126                                       const unsigned char *class_name,
 127                                       reg_syntax_t syntax);
 128 #else  /* not RE_ENABLE_I18N */
 129 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 130                                         const unsigned char *name);
 131 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 132                                       re_bitset_ptr_t sbcset,
 133                                       const unsigned char *class_name,
 134                                       reg_syntax_t syntax);
 135 #endif /* not RE_ENABLE_I18N */
 136 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 137                                        unsigned RE_TRANSLATE_TYPE trans,
 138                                        const unsigned char *class_name,
 139                                        const unsigned char *extra,
 140                                        int non_match, reg_errcode_t *err);
 141 static bin_tree_t *create_tree (re_dfa_t *dfa,
 142                                 bin_tree_t *left, bin_tree_t *right,
 143                                 re_token_type_t type);
 144 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 145                                       bin_tree_t *left, bin_tree_t *right,
 146                                       const re_token_t *token);
 147 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 148 static void free_token (re_token_t *node);
 149 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 150 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 151 \f
 152 /* This table gives an error message for each of the error codes listed
 153    in regex.h.  Obviously the order here has to be same as there.
 154    POSIX doesn't require that we do anything for REG_NOERROR,
 155    but why not be nice?  */
 156
 157 const char __re_error_msgid[] attribute_hidden =
 158   {
 159 #define REG_NOERROR_IDX 0
 160     gettext_noop ("Success")    /* REG_NOERROR */
 161     "\0"
 162 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 163     gettext_noop ("No match")   /* REG_NOMATCH */
 164     "\0"
 165 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 166     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 167     "\0"
 168 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 169     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 170     "\0"
 171 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 172     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 173     "\0"
 174 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 175     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 176     "\0"
 177 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 178     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 179     "\0"
 180 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 181     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 182     "\0"
 183 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 184     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 185     "\0"
 186 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 187     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 188     "\0"
 189 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 190     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 191     "\0"
 192 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 193     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 194     "\0"
 195 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 196     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 197     "\0"
 198 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 199     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 200     "\0"
 201 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 202     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 203     "\0"
 204 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 205     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 206     "\0"
 207 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 208     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 209   };
 210
 211 const size_t __re_error_msgid_idx[] attribute_hidden =
 212   {
 213     REG_NOERROR_IDX,
 214     REG_NOMATCH_IDX,
 215     REG_BADPAT_IDX,
 216     REG_ECOLLATE_IDX,
 217     REG_ECTYPE_IDX,
 218     REG_EESCAPE_IDX,
 219     REG_ESUBREG_IDX,
 220     REG_EBRACK_IDX,
 221     REG_EPAREN_IDX,
 222     REG_EBRACE_IDX,
 223     REG_BADBR_IDX,
 224     REG_ERANGE_IDX,
 225     REG_ESPACE_IDX,
 226     REG_BADRPT_IDX,
 227     REG_EEND_IDX,
 228     REG_ESIZE_IDX,
 229     REG_ERPAREN_IDX
 230   };
 231 \f
 232 /* Entry points for GNU code.  */
 233
 234 /* re_compile_pattern is the GNU regular expression compiler: it
 235    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 236    Returns 0 if the pattern was valid, otherwise an error string.
 237
 238    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 239    are set in BUFP on entry.  */
 240
 241 const char *
 242 re_compile_pattern (pattern, length, bufp)
 243     const char *pattern;
 244     size_t length;
 245     struct re_pattern_buffer *bufp;
 246 {
 247   reg_errcode_t ret;
 248
 249   /* And GNU code determines whether or not to get register information
 250      by passing null for the REGS argument to re_match, etc., not by
 251      setting no_sub, unless RE_NO_SUB is set.  */
 252   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 253
 254   /* Match anchors at newline.  */
 255   bufp->newline_anchor = 1;
 256
 257   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 258
 259   if (!ret)
 260     return NULL;
 261   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 262 }
 263 #ifdef _LIBC
 264 weak_alias (__re_compile_pattern, re_compile_pattern)
 265 #endif
 266
 267 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 268    also be assigned to arbitrarily: each pattern buffer stores its own
 269    syntax, so it can be changed between regex compilations.  */
 270 /* This has no initializer because initialized variables in Emacs
 271    become read-only after dumping.  */
 272 reg_syntax_t re_syntax_options;
 273
 274
 275 /* Specify the precise syntax of regexps for compilation.  This provides
 276    for compatibility for various utilities which historically have
 277    different, incompatible syntaxes.
 278
 279    The argument SYNTAX is a bit mask comprised of the various bits
 280    defined in regex.h.  We return the old syntax.  */
 281
 282 reg_syntax_t
 283 re_set_syntax (syntax)
 284     reg_syntax_t syntax;
 285 {
 286   reg_syntax_t ret = re_syntax_options;
 287
 288   re_syntax_options = syntax;
 289   return ret;
 290 }
 291 #ifdef _LIBC
 292 weak_alias (__re_set_syntax, re_set_syntax)
 293 #endif
 294
 295 int
 296 re_compile_fastmap (bufp)
 297     struct re_pattern_buffer *bufp;
 298 {
 299   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 300   char *fastmap = bufp->fastmap;
 301
 302   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 303   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 304   if (dfa->init_state != dfa->init_state_word)
 305     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 306   if (dfa->init_state != dfa->init_state_nl)
 307     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 308   if (dfa->init_state != dfa->init_state_begbuf)
 309     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 310   bufp->fastmap_accurate = 1;
 311   return 0;
 312 }
 313 #ifdef _LIBC
 314 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 315 #endif
 316
 317 static inline void
 318 __attribute ((always_inline))
 319 re_set_fastmap (char *fastmap, int icase, int ch)
 320 {
 321   fastmap[ch] = 1;
 322   if (icase)
 323     fastmap[tolower (ch)] = 1;
 324 }
 325
 326 /* Helper function for re_compile_fastmap.
 327    Compile fastmap for the initial_state INIT_STATE.  */
 328
 329 static void
 330 re_compile_fastmap_iter (bufp, init_state, fastmap)
 331      regex_t *bufp;
 332      const re_dfastate_t *init_state;
 333      char *fastmap;
 334 {
 335   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 336   int node_cnt;
 337   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 338   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 339     {
 340       int node = init_state->nodes.elems[node_cnt];
 341       re_token_type_t type = dfa->nodes[node].type;
 342
 343       if (type == CHARACTER)
 344         {
 345           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 346 #ifdef RE_ENABLE_I18N
 347           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 348             {
 349               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 350               wchar_t wc;
 351               mbstate_t state;
 352
 353               p = buf;
 354               *p++ = dfa->nodes[node].opr.c;
 355               while (++node < dfa->nodes_len
 356                      && dfa->nodes[node].type == CHARACTER
 357                      && dfa->nodes[node].mb_partial)
 358                 *p++ = dfa->nodes[node].opr.c;
 359               memset (&state, 0, sizeof (state));
 360               if (mbrtowc (&wc, (const char *) buf, p - buf,
 361                            &state) == p - buf
 362                   && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
 363                 re_set_fastmap (fastmap, 0, buf[0]);
 364             }
 365 #endif
 366         }
 367       else if (type == SIMPLE_BRACKET)
 368         {
 369           int i, j, ch;
 370           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 371             for (j = 0; j < UINT_BITS; ++j, ++ch)
 372               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 373                 re_set_fastmap (fastmap, icase, ch);
 374         }
 375 #ifdef RE_ENABLE_I18N
 376       else if (type == COMPLEX_BRACKET)
 377         {
 378           int i;
 379           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 380           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 381               || cset->nranges || cset->nchar_classes)
 382             {
 383 # ifdef _LIBC
 384               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 385                 {
 386                   /* In this case we want to catch the bytes which are
 387                      the first byte of any collation elements.
 388                      e.g. In da_DK, we want to catch 'a' since "aa"
 389                           is a valid collation element, and don't catch
 390                           'b' since 'b' is the only collation element
 391                           which starts from 'b'.  */
 392                   int j, ch;
 393                   const int32_t *table = (const int32_t *)
 394                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 395                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 396                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 397                       if (table[ch] < 0)
 398                         re_set_fastmap (fastmap, icase, ch);
 399                 }
 400 # else
 401               if (dfa->mb_cur_max > 1)
 402                 for (i = 0; i < SBC_MAX; ++i)
 403                   if (__btowc (i) == WEOF)
 404                     re_set_fastmap (fastmap, icase, i);
 405 # endif /* not _LIBC */
 406             }
 407           for (i = 0; i < cset->nmbchars; ++i)
 408             {
 409               char buf[256];
 410               mbstate_t state;
 411               memset (&state, '\0', sizeof (state));
 412               __wcrtomb (buf, cset->mbchars[i], &state);
 413               re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 414               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 415                 {
 416                   __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
 417                   re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 418                 }
 419             }
 420         }
 421 #endif /* RE_ENABLE_I18N */
 422       else if (type == OP_PERIOD
 423 #ifdef RE_ENABLE_I18N
 424                || type == OP_UTF8_PERIOD
 425 #endif /* RE_ENABLE_I18N */
 426                || type == END_OF_RE)
 427         {
 428           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 429           if (type == END_OF_RE)
 430             bufp->can_be_null = 1;
 431           return;
 432         }
 433     }
 434 }
 435 \f
 436 /* Entry point for POSIX code.  */
 437 /* regcomp takes a regular expression as a string and compiles it.
 438
 439    PREG is a regex_t *.  We do not expect any fields to be initialized,
 440    since POSIX says we shouldn't.  Thus, we set
 441
 442      `buffer' to the compiled pattern;
 443      `used' to the length of the compiled pattern;
 444      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 445        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 446        RE_SYNTAX_POSIX_BASIC;
 447      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 448      `fastmap' to an allocated space for the fastmap;
 449      `fastmap_accurate' to zero;
 450      `re_nsub' to the number of subexpressions in PATTERN.
 451
 452    PATTERN is the address of the pattern string.
 453
 454    CFLAGS is a series of bits which affect compilation.
 455
 456      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 457      use POSIX basic syntax.
 458
 459      If REG_NEWLINE is set, then . and [^...] don't match newline.
 460      Also, regexec will try a match beginning after every newline.
 461
 462      If REG_ICASE is set, then we considers upper- and lowercase
 463      versions of letters to be equivalent when matching.
 464
 465      If REG_NOSUB is set, then when PREG is passed to regexec, that
 466      routine will report only success or failure, and nothing about the
 467      registers.
 468
 469    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 470    the return codes and their meanings.)  */
 471
 472 int
 473 regcomp (preg, pattern, cflags)
 474     regex_t *__restrict preg;
 475     const char *__restrict pattern;
 476     int cflags;
 477 {
 478   reg_errcode_t ret;
 479   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 480                          : RE_SYNTAX_POSIX_BASIC);
 481
 482   preg->buffer = NULL;
 483   preg->allocated = 0;
 484   preg->used = 0;
 485
 486   /* Try to allocate space for the fastmap.  */
 487   preg->fastmap = re_malloc (char, SBC_MAX);
 488   if (BE (preg->fastmap == NULL, 0))
 489     return REG_ESPACE;
 490
 491   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 492
 493   /* If REG_NEWLINE is set, newlines are treated differently.  */
 494   if (cflags & REG_NEWLINE)
 495     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 496       syntax &= ~RE_DOT_NEWLINE;
 497       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 498       /* It also changes the matching behavior.  */
 499       preg->newline_anchor = 1;
 500     }
 501   else
 502     preg->newline_anchor = 0;
 503   preg->no_sub = !!(cflags & REG_NOSUB);
 504   preg->translate = NULL;
 505
 506   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 507
 508   /* POSIX doesn't distinguish between an unmatched open-group and an
 509      unmatched close-group: both are REG_EPAREN.  */
 510   if (ret == REG_ERPAREN)
 511     ret = REG_EPAREN;
 512
 513   /* We have already checked preg->fastmap != NULL.  */
 514   if (BE (ret == REG_NOERROR, 1))
 515     /* Compute the fastmap now, since regexec cannot modify the pattern
 516        buffer.  This function never fails in this implementation.  */
 517     (void) re_compile_fastmap (preg);
 518   else
 519     {
 520       /* Some error occurred while compiling the expression.  */
 521       re_free (preg->fastmap);
 522       preg->fastmap = NULL;
 523     }
 524
 525   return (int) ret;
 526 }
 527 #ifdef _LIBC
 528 weak_alias (__regcomp, regcomp)
 529 #endif
 530
 531 /* Returns a message corresponding to an error code, ERRCODE, returned
 532    from either regcomp or regexec.   We don't use PREG here.  */
 533
 534 size_t
 535 regerror (errcode, preg, errbuf, errbuf_size)
 536     int errcode;
 537     const regex_t *preg;
 538     char *errbuf;
 539     size_t errbuf_size;
 540 {
 541   const char *msg;
 542   size_t msg_size;
 543
 544   if (BE (errcode < 0
 545           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 546                                / sizeof (__re_error_msgid_idx[0])), 0))
 547     /* Only error codes returned by the rest of the code should be passed
 548        to this routine.  If we are given anything else, or if other regex
 549        code generates an invalid error code, then the program has a bug.
 550        Dump core so we can fix it.  */
 551     abort ();
 552
 553   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 554
 555   msg_size = strlen (msg) + 1; /* Includes the null.  */
 556
 557   if (BE (errbuf_size != 0, 1))
 558     {
 559       if (BE (msg_size > errbuf_size, 0))
 560         {
 561 #if defined HAVE_MEMPCPY || defined _LIBC
 562           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 563 #else
 564           memcpy (errbuf, msg, errbuf_size - 1);
 565           errbuf[errbuf_size - 1] = 0;
 566 #endif
 567         }
 568       else
 569         memcpy (errbuf, msg, msg_size);
 570     }
 571
 572   return msg_size;
 573 }
 574 #ifdef _LIBC
 575 weak_alias (__regerror, regerror)
 576 #endif
 577
 578
 579 #ifdef RE_ENABLE_I18N
 580 /* This static array is used for the map to single-byte characters when
 581    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 582    it the same all the time.  UTF-8 is the preferred encoding so this is
 583    a worthwhile optimization.  */
 584 static const bitset utf8_sb_map =
 585 {
 586   /* Set the first 128 bits.  */
 587 # if UINT_MAX == 0xffffffff
 588   0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
 589 # else
 590 #  error "Add case for new unsigned int size"
 591 # endif
 592 };
 593 #endif
 594
 595
 596 static void
 597 free_dfa_content (re_dfa_t *dfa)
 598 {
 599   int i, j;
 600
 601   if (dfa->nodes)
 602     for (i = 0; i < dfa->nodes_len; ++i)
 603       free_token (dfa->nodes + i);
 604   re_free (dfa->nexts);
 605   for (i = 0; i < dfa->nodes_len; ++i)
 606     {
 607       if (dfa->eclosures != NULL)
 608         re_node_set_free (dfa->eclosures + i);
 609       if (dfa->inveclosures != NULL)
 610         re_node_set_free (dfa->inveclosures + i);
 611       if (dfa->edests != NULL)
 612         re_node_set_free (dfa->edests + i);
 613     }
 614   re_free (dfa->edests);
 615   re_free (dfa->eclosures);
 616   re_free (dfa->inveclosures);
 617   re_free (dfa->nodes);
 618
 619   if (dfa->state_table)
 620     for (i = 0; i <= dfa->state_hash_mask; ++i)
 621       {
 622         struct re_state_table_entry *entry = dfa->state_table + i;
 623         for (j = 0; j < entry->num; ++j)
 624           {
 625             re_dfastate_t *state = entry->array[j];
 626             free_state (state);
 627           }
 628         re_free (entry->array);
 629       }
 630   re_free (dfa->state_table);
 631 #ifdef RE_ENABLE_I18N
 632   if (dfa->sb_char != utf8_sb_map)
 633     re_free (dfa->sb_char);
 634 #endif
 635   re_free (dfa->subexp_map);
 636 #ifdef DEBUG
 637   re_free (dfa->re_str);
 638 #endif
 639
 640   re_free (dfa);
 641 }
 642
 643
 644 /* Free dynamically allocated space used by PREG.  */
 645
 646 void
 647 regfree (preg)
 648     regex_t *preg;
 649 {
 650   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 651   if (BE (dfa != NULL, 1))
 652     free_dfa_content (dfa);
 653   preg->buffer = NULL;
 654   preg->allocated = 0;
 655
 656   re_free (preg->fastmap);
 657   preg->fastmap = NULL;
 658
 659   re_free (preg->translate);
 660   preg->translate = NULL;
 661 }
 662 #ifdef _LIBC
 663 weak_alias (__regfree, regfree)
 664 #endif
 665 \f
 666 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 667    them unless specifically requested.  */
 668
 669 #if defined _REGEX_RE_COMP || defined _LIBC
 670
 671 /* BSD has one and only one pattern buffer.  */
 672 static struct re_pattern_buffer re_comp_buf;
 673
 674 char *
 675 # ifdef _LIBC
 676 /* Make these definitions weak in libc, so POSIX programs can redefine
 677    these names if they don't use our functions, and still use
 678    regcomp/regexec above without link errors.  */
 679 weak_function
 680 # endif
 681 re_comp (s)
 682      const char *s;
 683 {
 684   reg_errcode_t ret;
 685   char *fastmap;
 686
 687   if (!s)
 688     {
 689       if (!re_comp_buf.buffer)
 690         return gettext ("No previous regular expression");
 691       return 0;
 692     }
 693
 694   if (re_comp_buf.buffer)
 695     {
 696       fastmap = re_comp_buf.fastmap;
 697       re_comp_buf.fastmap = NULL;
 698       __regfree (&re_comp_buf);
 699       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 700       re_comp_buf.fastmap = fastmap;
 701     }
 702
 703   if (re_comp_buf.fastmap == NULL)
 704     {
 705       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 706       if (re_comp_buf.fastmap == NULL)
 707         return (char *) gettext (__re_error_msgid
 708                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 709     }
 710
 711   /* Since `re_exec' always passes NULL for the `regs' argument, we
 712      don't need to initialize the pattern buffer fields which affect it.  */
 713
 714   /* Match anchors at newlines.  */
 715   re_comp_buf.newline_anchor = 1;
 716
 717   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 718
 719   if (!ret)
 720     return NULL;
 721
 722   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 723   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 724 }
 725
 726 #ifdef _LIBC
 727 libc_freeres_fn (free_mem)
 728 {
 729   __regfree (&re_comp_buf);
 730 }
 731 #endif
 732
 733 #endif /* _REGEX_RE_COMP */
 734 \f
 735 /* Internal entry point.
 736    Compile the regular expression PATTERN, whose length is LENGTH.
 737    SYNTAX indicate regular expression's syntax.  */
 738
 739 static reg_errcode_t
 740 re_compile_internal (preg, pattern, length, syntax)
 741      regex_t *preg;
 742      const char * pattern;
 743      int length;
 744      reg_syntax_t syntax;
 745 {
 746   reg_errcode_t err = REG_NOERROR;
 747   re_dfa_t *dfa;
 748   re_string_t regexp;
 749
 750   /* Initialize the pattern buffer.  */
 751   preg->fastmap_accurate = 0;
 752   preg->syntax = syntax;
 753   preg->not_bol = preg->not_eol = 0;
 754   preg->used = 0;
 755   preg->re_nsub = 0;
 756   preg->can_be_null = 0;
 757   preg->regs_allocated = REGS_UNALLOCATED;
 758
 759   /* Initialize the dfa.  */
 760   dfa = (re_dfa_t *) preg->buffer;
 761   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 762     {
 763       /* If zero allocated, but buffer is non-null, try to realloc
 764          enough space.  This loses if buffer's address is bogus, but
 765          that is the user's responsibility.  If ->buffer is NULL this
 766          is a simple allocation.  */
 767       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 768       if (dfa == NULL)
 769         return REG_ESPACE;
 770       preg->allocated = sizeof (re_dfa_t);
 771       preg->buffer = (unsigned char *) dfa;
 772     }
 773   preg->used = sizeof (re_dfa_t);
 774
 775   err = init_dfa (dfa, length);
 776   if (BE (err != REG_NOERROR, 0))
 777     {
 778       free_dfa_content (dfa);
 779       preg->buffer = NULL;
 780       preg->allocated = 0;
 781       return err;
 782     }
 783 #ifdef DEBUG
 784   dfa->re_str = re_malloc (char, length + 1);
 785   strncpy (dfa->re_str, pattern, length + 1);
 786 #endif
 787
 788   err = re_string_construct (&regexp, pattern, length, preg->translate,
 789                              syntax & RE_ICASE, dfa);
 790   if (BE (err != REG_NOERROR, 0))
 791     {
 792     re_compile_internal_free_return:
 793       free_workarea_compile (preg);
 794       re_string_destruct (&regexp);
 795       free_dfa_content (dfa);
 796       preg->buffer = NULL;
 797       preg->allocated = 0;
 798       return err;
 799     }
 800
 801   /* Parse the regular expression, and build a structure tree.  */
 802   preg->re_nsub = 0;
 803   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 804   if (BE (dfa->str_tree == NULL, 0))
 805     goto re_compile_internal_free_return;
 806
 807   /* Analyze the tree and create the nfa.  */
 808   err = analyze (preg);
 809   if (BE (err != REG_NOERROR, 0))
 810     goto re_compile_internal_free_return;
 811
 812 #ifdef RE_ENABLE_I18N
 813   /* If possible, do searching in single byte encoding to speed things up.  */
 814   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 815     optimize_utf8 (dfa);
 816 #endif
 817
 818   /* Then create the initial state of the dfa.  */
 819   err = create_initial_state (dfa);
 820
 821   /* Release work areas.  */
 822   free_workarea_compile (preg);
 823   re_string_destruct (&regexp);
 824
 825   if (BE (err != REG_NOERROR, 0))
 826     {
 827       free_dfa_content (dfa);
 828       preg->buffer = NULL;
 829       preg->allocated = 0;
 830     }
 831
 832   return err;
 833 }
 834
 835 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 836    as the initial length of some arrays.  */
 837
 838 static reg_errcode_t
 839 init_dfa (dfa, pat_len)
 840      re_dfa_t *dfa;
 841      int pat_len;
 842 {
 843   int table_size;
 844 #ifndef _LIBC
 845   char *codeset_name;
 846 #endif
 847
 848   memset (dfa, '\0', sizeof (re_dfa_t));
 849
 850   /* Force allocation of str_tree_storage the first time.  */
 851   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 852
 853   dfa->nodes_alloc = pat_len + 1;
 854   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 855
 856   dfa->states_alloc = pat_len + 1;
 857
 858   /*  table_size = 2 ^ ceil(log pat_len) */
 859   for (table_size = 1; table_size > 0; table_size <<= 1)
 860     if (table_size > pat_len)
 861       break;
 862
 863   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 864   dfa->state_hash_mask = table_size - 1;
 865
 866   dfa->mb_cur_max = MB_CUR_MAX;
 867 #ifdef _LIBC
 868   if (dfa->mb_cur_max == 6
 869       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 870     dfa->is_utf8 = 1;
 871   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 872                        != 0);
 873 #else
 874 # ifdef HAVE_LANGINFO_CODESET
 875   codeset_name = nl_langinfo (CODESET);
 876 # else
 877   codeset_name = getenv ("LC_ALL");
 878   if (codeset_name == NULL || codeset_name[0] == '\0')
 879     codeset_name = getenv ("LC_CTYPE");
 880   if (codeset_name == NULL || codeset_name[0] == '\0')
 881     codeset_name = getenv ("LANG");
 882   if (codeset_name == NULL)
 883     codeset_name = "";
 884   else if (strchr (codeset_name, '.') !=  NULL)
 885     codeset_name = strchr (codeset_name, '.') + 1;
 886 # endif
 887
 888   if (strcasecmp (codeset_name, "UTF-8") == 0
 889       || strcasecmp (codeset_name, "UTF8") == 0)
 890     dfa->is_utf8 = 1;
 891
 892   /* We check exhaustively in the loop below if this charset is a
 893      superset of ASCII.  */
 894   dfa->map_notascii = 0;
 895 #endif
 896
 897 #ifdef RE_ENABLE_I18N
 898   if (dfa->mb_cur_max > 1)
 899     {
 900       if (dfa->is_utf8)
 901         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 902       else
 903         {
 904           int i, j, ch;
 905
 906           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 907           if (BE (dfa->sb_char == NULL, 0))
 908             return REG_ESPACE;
 909
 910           /* Clear all bits by, then set those corresponding to single
 911              byte chars.  */
 912           bitset_empty (dfa->sb_char);
 913
 914           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 915             for (j = 0; j < UINT_BITS; ++j, ++ch)
 916               {
 917                 wchar_t wch = __btowc (ch);
 918                 if (wch != WEOF)
 919                   dfa->sb_char[i] |= 1 << j;
 920 # ifndef _LIBC
 921                 if (isascii (ch) && wch != (wchar_t) ch)
 922                   dfa->map_notascii = 1;
 923 # endif
 924               }
 925         }
 926     }
 927 #endif
 928
 929   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 930     return REG_ESPACE;
 931   return REG_NOERROR;
 932 }
 933
 934 /* Initialize WORD_CHAR table, which indicate which character is
 935    "word".  In this case "word" means that it is the word construction
 936    character used by some operators like "\<", "\>", etc.  */
 937
 938 static void
 939 init_word_char (dfa)
 940      re_dfa_t *dfa;
 941 {
 942   int i, j, ch;
 943   dfa->word_ops_used = 1;
 944   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 945     for (j = 0; j < UINT_BITS; ++j, ++ch)
 946       if (isalnum (ch) || ch == '_')
 947         dfa->word_char[i] |= 1 << j;
 948 }
 949
 950 /* Free the work area which are only used while compiling.  */
 951
 952 static void
 953 free_workarea_compile (preg)
 954      regex_t *preg;
 955 {
 956   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 957   bin_tree_storage_t *storage, *next;
 958   for (storage = dfa->str_tree_storage; storage; storage = next)
 959     {
 960       next = storage->next;
 961       re_free (storage);
 962     }
 963   dfa->str_tree_storage = NULL;
 964   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 965   dfa->str_tree = NULL;
 966   re_free (dfa->org_indices);
 967   dfa->org_indices = NULL;
 968 }
 969
 970 /* Create initial states for all contexts.  */
 971
 972 static reg_errcode_t
 973 create_initial_state (dfa)
 974      re_dfa_t *dfa;
 975 {
 976   int first, i;
 977   reg_errcode_t err;
 978   re_node_set init_nodes;
 979
 980   /* Initial states have the epsilon closure of the node which is
 981      the first node of the regular expression.  */
 982   first = dfa->str_tree->first->node_idx;
 983   dfa->init_node = first;
 984   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 985   if (BE (err != REG_NOERROR, 0))
 986     return err;
 987
 988   /* The back-references which are in initial states can epsilon transit,
 989      since in this case all of the subexpressions can be null.
 990      Then we add epsilon closures of the nodes which are the next nodes of
 991      the back-references.  */
 992   if (dfa->nbackref > 0)
 993     for (i = 0; i < init_nodes.nelem; ++i)
 994       {
 995         int node_idx = init_nodes.elems[i];
 996         re_token_type_t type = dfa->nodes[node_idx].type;
 997
 998         int clexp_idx;
 999         if (type != OP_BACK_REF)
1000           continue;
1001         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1002           {
1003             re_token_t *clexp_node;
1004             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1005             if (clexp_node->type == OP_CLOSE_SUBEXP
1006                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1007               break;
1008           }
1009         if (clexp_idx == init_nodes.nelem)
1010           continue;
1011
1012         if (type == OP_BACK_REF)
1013           {
1014             int dest_idx = dfa->edests[node_idx].elems[0];
1015             if (!re_node_set_contains (&init_nodes, dest_idx))
1016               {
1017                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1018                 i = 0;
1019               }
1020           }
1021       }
1022
1023   /* It must be the first time to invoke acquire_state.  */
1024   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1025   /* We don't check ERR here, since the initial state must not be NULL.  */
1026   if (BE (dfa->init_state == NULL, 0))
1027     return err;
1028   if (dfa->init_state->has_constraint)
1029     {
1030       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1031                                                        CONTEXT_WORD);
1032       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1033                                                      CONTEXT_NEWLINE);
1034       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1035                                                          &init_nodes,
1036                                                          CONTEXT_NEWLINE
1037                                                          | CONTEXT_BEGBUF);
1038       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1039               || dfa->init_state_begbuf == NULL, 0))
1040         return err;
1041     }
1042   else
1043     dfa->init_state_word = dfa->init_state_nl
1044       = dfa->init_state_begbuf = dfa->init_state;
1045
1046   re_node_set_free (&init_nodes);
1047   return REG_NOERROR;
1048 }
1049 \f
1050 #ifdef RE_ENABLE_I18N
1051 /* If it is possible to do searching in single byte encoding instead of UTF-8
1052    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1053    DFA nodes where needed.  */
1054
1055 static void
1056 optimize_utf8 (dfa)
1057      re_dfa_t *dfa;
1058 {
1059   int node, i, mb_chars = 0, has_period = 0;
1060
1061   for (node = 0; node < dfa->nodes_len; ++node)
1062     switch (dfa->nodes[node].type)
1063       {
1064       case CHARACTER:
1065         if (dfa->nodes[node].opr.c >= 0x80)
1066           mb_chars = 1;
1067         break;
1068       case ANCHOR:
1069         switch (dfa->nodes[node].opr.idx)
1070           {
1071           case LINE_FIRST:
1072           case LINE_LAST:
1073           case BUF_FIRST:
1074           case BUF_LAST:
1075             break;
1076           default:
1077             /* Word anchors etc. cannot be handled.  */
1078             return;
1079           }
1080         break;
1081       case OP_PERIOD:
1082         has_period = 1;
1083         break;
1084       case OP_BACK_REF:
1085       case OP_ALT:
1086       case END_OF_RE:
1087       case OP_DUP_ASTERISK:
1088       case OP_OPEN_SUBEXP:
1089       case OP_CLOSE_SUBEXP:
1090         break;
1091       case COMPLEX_BRACKET:
1092         return;
1093       case SIMPLE_BRACKET:
1094         /* Just double check.  */
1095         for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1096           if (dfa->nodes[node].opr.sbcset[i])
1097             return;
1098         break;
1099       default:
1100         abort ();
1101       }
1102
1103   if (mb_chars || has_period)
1104     for (node = 0; node < dfa->nodes_len; ++node)
1105       {
1106         if (dfa->nodes[node].type == CHARACTER
1107             && dfa->nodes[node].opr.c >= 0x80)
1108           dfa->nodes[node].mb_partial = 0;
1109         else if (dfa->nodes[node].type == OP_PERIOD)
1110           dfa->nodes[node].type = OP_UTF8_PERIOD;
1111       }
1112
1113   /* The search can be in single byte locale.  */
1114   dfa->mb_cur_max = 1;
1115   dfa->is_utf8 = 0;
1116   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1117 }
1118 #endif
1119 \f
1120 /* Analyze the structure tree, and calculate "first", "next", "edest",
1121    "eclosure", and "inveclosure".  */
1122
1123 static reg_errcode_t
1124 analyze (preg)
1125      regex_t *preg;
1126 {
1127   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1128   reg_errcode_t ret;
1129
1130   /* Allocate arrays.  */
1131   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1132   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1133   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1134   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1135   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1136           || dfa->eclosures == NULL, 0))
1137     return REG_ESPACE;
1138
1139   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1140   if (dfa->subexp_map != NULL)
1141     {
1142       int i;
1143       for (i = 0; i < preg->re_nsub; i++)
1144         dfa->subexp_map[i] = i;
1145       preorder (dfa->str_tree, optimize_subexps, dfa);
1146       for (i = 0; i < preg->re_nsub; i++)
1147         if (dfa->subexp_map[i] != i)
1148           break;
1149       if (i == preg->re_nsub)
1150         {
1151           free (dfa->subexp_map);
1152           dfa->subexp_map = NULL;
1153         }
1154     }
1155
1156   ret = postorder (dfa->str_tree, lower_subexps, preg);
1157   if (BE (ret != REG_NOERROR, 0))
1158     return ret;
1159   ret = postorder (dfa->str_tree, calc_first, dfa);
1160   if (BE (ret != REG_NOERROR, 0))
1161     return ret;
1162   preorder (dfa->str_tree, calc_next, dfa);
1163   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1164   if (BE (ret != REG_NOERROR, 0))
1165     return ret;
1166   ret = calc_eclosure (dfa);
1167   if (BE (ret != REG_NOERROR, 0))
1168     return ret;
1169
1170   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1171      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1172   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1173       || dfa->nbackref)
1174     {
1175       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1176       if (BE (dfa->inveclosures == NULL, 0))
1177         return REG_ESPACE;
1178       ret = calc_inveclosure (dfa);
1179     }
1180
1181   return ret;
1182 }
1183
1184 /* Our parse trees are very unbalanced, so we cannot use a stack to
1185    implement parse tree visits.  Instead, we use parent pointers and
1186    some hairy code in these two functions.  */
1187 static reg_errcode_t
1188 postorder (root, fn, extra)
1189      bin_tree_t *root;
1190      reg_errcode_t (fn (void *, bin_tree_t *));
1191      void *extra;
1192 {
1193   bin_tree_t *node, *prev;
1194
1195   for (node = root; ; )
1196     {
1197       /* Descend down the tree, preferably to the left (or to the right
1198          if that's the only child).  */
1199       while (node->left || node->right)
1200         if (node->left)
1201           node = node->left;
1202         else
1203           node = node->right;
1204
1205       do
1206         {
1207           reg_errcode_t err = fn (extra, node);
1208           if (BE (err != REG_NOERROR, 0))
1209             return err;
1210           if (node->parent == NULL)
1211             return REG_NOERROR;
1212           prev = node;
1213           node = node->parent;
1214         }
1215       /* Go up while we have a node that is reached from the right.  */
1216       while (node->right == prev || node->right == NULL);
1217       node = node->right;
1218     }
1219 }
1220
1221 static reg_errcode_t
1222 preorder (root, fn, extra)
1223      bin_tree_t *root;
1224      reg_errcode_t (fn (void *, bin_tree_t *));
1225      void *extra;
1226 {
1227   bin_tree_t *node;
1228
1229   for (node = root; ; )
1230     {
1231       reg_errcode_t err = fn (extra, node);
1232       if (BE (err != REG_NOERROR, 0))
1233         return err;
1234
1235       /* Go to the left node, or up and to the right.  */
1236       if (node->left)
1237         node = node->left;
1238       else
1239         {
1240           bin_tree_t *prev = NULL;
1241           while (node->right == prev || node->right == NULL)
1242             {
1243               prev = node;
1244               node = node->parent;
1245               if (!node)
1246                 return REG_NOERROR;
1247             }
1248           node = node->right;
1249         }
1250     }
1251 }
1252
1253 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1254    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1255    backreferences as well.  Requires a preorder visit.  */
1256 static reg_errcode_t
1257 optimize_subexps (extra, node)
1258      void *extra;
1259      bin_tree_t *node;
1260 {
1261   re_dfa_t *dfa = (re_dfa_t *) extra;
1262
1263   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1264     {
1265       int idx = node->token.opr.idx;
1266       node->token.opr.idx = dfa->subexp_map[idx];
1267       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1268     }
1269
1270   else if (node->token.type == SUBEXP
1271            && node->left && node->left->token.type == SUBEXP)
1272     {
1273       int other_idx = node->left->token.opr.idx;
1274
1275       node->left = node->left->left;
1276       if (node->left)
1277         node->left->parent = node;
1278
1279       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1280       if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1281         dfa->used_bkref_map &= ~(1 << other_idx);
1282     }
1283
1284   return REG_NOERROR;
1285 }
1286
1287 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1288    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1289 static reg_errcode_t
1290 lower_subexps (extra, node)
1291      void *extra;
1292      bin_tree_t *node;
1293 {
1294   regex_t *preg = (regex_t *) extra;
1295   reg_errcode_t err = REG_NOERROR;
1296
1297   if (node->left && node->left->token.type == SUBEXP)
1298     {
1299       node->left = lower_subexp (&err, preg, node->left);
1300       if (node->left)
1301         node->left->parent = node;
1302     }
1303   if (node->right && node->right->token.type == SUBEXP)
1304     {
1305       node->right = lower_subexp (&err, preg, node->right);
1306       if (node->right)
1307         node->right->parent = node;
1308     }
1309
1310   return err;
1311 }
1312
1313 static bin_tree_t *
1314 lower_subexp (err, preg, node)
1315      reg_errcode_t *err;
1316      regex_t *preg;
1317      bin_tree_t *node;
1318 {
1319   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1320   bin_tree_t *body = node->left;
1321   bin_tree_t *op, *cls, *tree1, *tree;
1322
1323   if (preg->no_sub
1324       /* We do not optimize empty subexpressions, because otherwise we may
1325          have bad CONCAT nodes with NULL children.  This is obviously not
1326          very common, so we do not lose much.  An example that triggers
1327          this case is the sed "script" /\(\)/x.  */
1328       && node->left != NULL
1329       && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1330           || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1331     return node->left;
1332
1333   /* Convert the SUBEXP node to the concatenation of an
1334      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1335   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1336   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1337   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1338   tree = create_tree (dfa, op, tree1, CONCAT);
1339   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1340     {
1341       *err = REG_ESPACE;
1342       return NULL;
1343     }
1344
1345   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1346   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1347   return tree;
1348 }
1349
1350 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1351    nodes.  Requires a postorder visit.  */
1352 static reg_errcode_t
1353 calc_first (extra, node)
1354      void *extra;
1355      bin_tree_t *node;
1356 {
1357   re_dfa_t *dfa = (re_dfa_t *) extra;
1358   if (node->token.type == CONCAT)
1359     {
1360       node->first = node->left->first;
1361       node->node_idx = node->left->node_idx;
1362     }
1363   else
1364     {
1365       node->first = node;
1366       node->node_idx = re_dfa_add_node (dfa, node->token);
1367       if (BE (node->node_idx == -1, 0))
1368         return REG_ESPACE;
1369     }
1370   return REG_NOERROR;
1371 }
1372
1373 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1374 static reg_errcode_t
1375 calc_next (extra, node)
1376      void *extra;
1377      bin_tree_t *node;
1378 {
1379   switch (node->token.type)
1380     {
1381     case OP_DUP_ASTERISK:
1382       node->left->next = node;
1383       break;
1384     case CONCAT:
1385       node->left->next = node->right->first;
1386       node->right->next = node->next;
1387       break;
1388     default:
1389       if (node->left)
1390         node->left->next = node->next;
1391       if (node->right)
1392         node->right->next = node->next;
1393       break;
1394     }
1395   return REG_NOERROR;
1396 }
1397
1398 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1399 static reg_errcode_t
1400 link_nfa_nodes (extra, node)
1401      void *extra;
1402      bin_tree_t *node;
1403 {
1404   re_dfa_t *dfa = (re_dfa_t *) extra;
1405   int idx = node->node_idx;
1406   reg_errcode_t err = REG_NOERROR;
1407
1408   switch (node->token.type)
1409     {
1410     case CONCAT:
1411       break;
1412
1413     case END_OF_RE:
1414       assert (node->next == NULL);
1415       break;
1416
1417     case OP_DUP_ASTERISK:
1418     case OP_ALT:
1419       {
1420         int left, right;
1421         dfa->has_plural_match = 1;
1422         if (node->left != NULL)
1423           left = node->left->first->node_idx;
1424         else
1425           left = node->next->node_idx;
1426         if (node->right != NULL)
1427           right = node->right->first->node_idx;
1428         else
1429           right = node->next->node_idx;
1430         assert (left > -1);
1431         assert (right > -1);
1432         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1433       }
1434       break;
1435
1436     case ANCHOR:
1437     case OP_OPEN_SUBEXP:
1438     case OP_CLOSE_SUBEXP:
1439       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1440       break;
1441
1442     case OP_BACK_REF:
1443       dfa->nexts[idx] = node->next->node_idx;
1444       if (node->token.type == OP_BACK_REF)
1445         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1446       break;
1447
1448     default:
1449       assert (!IS_EPSILON_NODE (node->token.type));
1450       dfa->nexts[idx] = node->next->node_idx;
1451       break;
1452     }
1453
1454   return err;
1455 }
1456
1457 /* Duplicate the epsilon closure of the node ROOT_NODE.
1458    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1459    to their own constraint.  */
1460
1461 static reg_errcode_t
1462 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1463                         init_constraint)
1464      re_dfa_t *dfa;
1465      int top_org_node, top_clone_node, root_node;
1466      unsigned int init_constraint;
1467 {
1468   reg_errcode_t err;
1469   int org_node, clone_node, ret;
1470   unsigned int constraint = init_constraint;
1471   for (org_node = top_org_node, clone_node = top_clone_node;;)
1472     {
1473       int org_dest, clone_dest;
1474       if (dfa->nodes[org_node].type == OP_BACK_REF)
1475         {
1476           /* If the back reference epsilon-transit, its destination must
1477              also have the constraint.  Then duplicate the epsilon closure
1478              of the destination of the back reference, and store it in
1479              edests of the back reference.  */
1480           org_dest = dfa->nexts[org_node];
1481           re_node_set_empty (dfa->edests + clone_node);
1482           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1483           if (BE (err != REG_NOERROR, 0))
1484             return err;
1485           dfa->nexts[clone_node] = dfa->nexts[org_node];
1486           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1487           if (BE (ret < 0, 0))
1488             return REG_ESPACE;
1489         }
1490       else if (dfa->edests[org_node].nelem == 0)
1491         {
1492           /* In case of the node can't epsilon-transit, don't duplicate the
1493              destination and store the original destination as the
1494              destination of the node.  */
1495           dfa->nexts[clone_node] = dfa->nexts[org_node];
1496           break;
1497         }
1498       else if (dfa->edests[org_node].nelem == 1)
1499         {
1500           /* In case of the node can epsilon-transit, and it has only one
1501              destination.  */
1502           org_dest = dfa->edests[org_node].elems[0];
1503           re_node_set_empty (dfa->edests + clone_node);
1504           if (dfa->nodes[org_node].type == ANCHOR)
1505             {
1506               /* In case of the node has another constraint, append it.  */
1507               if (org_node == root_node && clone_node != org_node)
1508                 {
1509                   /* ...but if the node is root_node itself, it means the
1510                      epsilon closure have a loop, then tie it to the
1511                      destination of the root_node.  */
1512                   ret = re_node_set_insert (dfa->edests + clone_node,
1513                                             org_dest);
1514                   if (BE (ret < 0, 0))
1515                     return REG_ESPACE;
1516                   break;
1517                 }
1518               constraint |= dfa->nodes[org_node].opr.ctx_type;
1519             }
1520           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1521           if (BE (err != REG_NOERROR, 0))
1522             return err;
1523           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1524           if (BE (ret < 0, 0))
1525             return REG_ESPACE;
1526         }
1527       else /* dfa->edests[org_node].nelem == 2 */
1528         {
1529           /* In case of the node can epsilon-transit, and it has two
1530              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1531           org_dest = dfa->edests[org_node].elems[0];
1532           re_node_set_empty (dfa->edests + clone_node);
1533           /* Search for a duplicated node which satisfies the constraint.  */
1534           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1535           if (clone_dest == -1)
1536             {
1537               /* There are no such a duplicated node, create a new one.  */
1538               err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1539               if (BE (err != REG_NOERROR, 0))
1540                 return err;
1541               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1542               if (BE (ret < 0, 0))
1543                 return REG_ESPACE;
1544               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1545                                             root_node, constraint);
1546               if (BE (err != REG_NOERROR, 0))
1547                 return err;
1548             }
1549           else
1550             {
1551               /* There are a duplicated node which satisfy the constraint,
1552                  use it to avoid infinite loop.  */
1553               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1554               if (BE (ret < 0, 0))
1555                 return REG_ESPACE;
1556             }
1557
1558           org_dest = dfa->edests[org_node].elems[1];
1559           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1560           if (BE (err != REG_NOERROR, 0))
1561             return err;
1562           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1563           if (BE (ret < 0, 0))
1564             return REG_ESPACE;
1565         }
1566       org_node = org_dest;
1567       clone_node = clone_dest;
1568     }
1569   return REG_NOERROR;
1570 }
1571
1572 /* Search for a node which is duplicated from the node ORG_NODE, and
1573    satisfies the constraint CONSTRAINT.  */
1574
1575 static int
1576 search_duplicated_node (dfa, org_node, constraint)
1577      re_dfa_t *dfa;
1578      int org_node;
1579      unsigned int constraint;
1580 {
1581   int idx;
1582   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1583     {
1584       if (org_node == dfa->org_indices[idx]
1585           && constraint == dfa->nodes[idx].constraint)
1586         return idx; /* Found.  */
1587     }
1588   return -1; /* Not found.  */
1589 }
1590
1591 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1592    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1593    otherwise return the error code.  */
1594
1595 static reg_errcode_t
1596 duplicate_node (new_idx, dfa, org_idx, constraint)
1597      re_dfa_t *dfa;
1598      int *new_idx, org_idx;
1599      unsigned int constraint;
1600 {
1601   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1602   if (BE (dup_idx == -1, 0))
1603     return REG_ESPACE;
1604   dfa->nodes[dup_idx].constraint = constraint;
1605   if (dfa->nodes[org_idx].type == ANCHOR)
1606     dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1607   dfa->nodes[dup_idx].duplicated = 1;
1608
1609   /* Store the index of the original node.  */
1610   dfa->org_indices[dup_idx] = org_idx;
1611   *new_idx = dup_idx;
1612   return REG_NOERROR;
1613 }
1614
1615 static reg_errcode_t
1616 calc_inveclosure (dfa)
1617      re_dfa_t *dfa;
1618 {
1619   int src, idx, ret;
1620   for (idx = 0; idx < dfa->nodes_len; ++idx)
1621     re_node_set_init_empty (dfa->inveclosures + idx);
1622
1623   for (src = 0; src < dfa->nodes_len; ++src)
1624     {
1625       int *elems = dfa->eclosures[src].elems;
1626       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1627         {
1628           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1629           if (BE (ret == -1, 0))
1630             return REG_ESPACE;
1631         }
1632     }
1633
1634   return REG_NOERROR;
1635 }
1636
1637 /* Calculate "eclosure" for all the node in DFA.  */
1638
1639 static reg_errcode_t
1640 calc_eclosure (dfa)
1641      re_dfa_t *dfa;
1642 {
1643   int node_idx, incomplete;
1644 #ifdef DEBUG
1645   assert (dfa->nodes_len > 0);
1646 #endif
1647   incomplete = 0;
1648   /* For each nodes, calculate epsilon closure.  */
1649   for (node_idx = 0; ; ++node_idx)
1650     {
1651       reg_errcode_t err;
1652       re_node_set eclosure_elem;
1653       if (node_idx == dfa->nodes_len)
1654         {
1655           if (!incomplete)
1656             break;
1657           incomplete = 0;
1658           node_idx = 0;
1659         }
1660
1661 #ifdef DEBUG
1662       assert (dfa->eclosures[node_idx].nelem != -1);
1663 #endif
1664
1665       /* If we have already calculated, skip it.  */
1666       if (dfa->eclosures[node_idx].nelem != 0)
1667         continue;
1668       /* Calculate epsilon closure of `node_idx'.  */
1669       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1670       if (BE (err != REG_NOERROR, 0))
1671         return err;
1672
1673       if (dfa->eclosures[node_idx].nelem == 0)
1674         {
1675           incomplete = 1;
1676           re_node_set_free (&eclosure_elem);
1677         }
1678     }
1679   return REG_NOERROR;
1680 }
1681
1682 /* Calculate epsilon closure of NODE.  */
1683
1684 static reg_errcode_t
1685 calc_eclosure_iter (new_set, dfa, node, root)
1686      re_node_set *new_set;
1687      re_dfa_t *dfa;
1688      int node, root;
1689 {
1690   reg_errcode_t err;
1691   unsigned int constraint;
1692   int i, incomplete;
1693   re_node_set eclosure;
1694   incomplete = 0;
1695   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1696   if (BE (err != REG_NOERROR, 0))
1697     return err;
1698
1699   /* This indicates that we are calculating this node now.
1700      We reference this value to avoid infinite loop.  */
1701   dfa->eclosures[node].nelem = -1;
1702
1703   constraint = ((dfa->nodes[node].type == ANCHOR)
1704                 ? dfa->nodes[node].opr.ctx_type : 0);
1705   /* If the current node has constraints, duplicate all nodes.
1706      Since they must inherit the constraints.  */
1707   if (constraint
1708       && dfa->edests[node].nelem
1709       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1710     {
1711       int org_node, cur_node;
1712       org_node = cur_node = node;
1713       err = duplicate_node_closure (dfa, node, node, node, constraint);
1714       if (BE (err != REG_NOERROR, 0))
1715         return err;
1716     }
1717
1718   /* Expand each epsilon destination nodes.  */
1719   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1720     for (i = 0; i < dfa->edests[node].nelem; ++i)
1721       {
1722         re_node_set eclosure_elem;
1723         int edest = dfa->edests[node].elems[i];
1724         /* If calculating the epsilon closure of `edest' is in progress,
1725            return intermediate result.  */
1726         if (dfa->eclosures[edest].nelem == -1)
1727           {
1728             incomplete = 1;
1729             continue;
1730           }
1731         /* If we haven't calculated the epsilon closure of `edest' yet,
1732            calculate now. Otherwise use calculated epsilon closure.  */
1733         if (dfa->eclosures[edest].nelem == 0)
1734           {
1735             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1736             if (BE (err != REG_NOERROR, 0))
1737               return err;
1738           }
1739         else
1740           eclosure_elem = dfa->eclosures[edest];
1741         /* Merge the epsilon closure of `edest'.  */
1742         re_node_set_merge (&eclosure, &eclosure_elem);
1743         /* If the epsilon closure of `edest' is incomplete,
1744            the epsilon closure of this node is also incomplete.  */
1745         if (dfa->eclosures[edest].nelem == 0)
1746           {
1747             incomplete = 1;
1748             re_node_set_free (&eclosure_elem);
1749           }
1750       }
1751
1752   /* Epsilon closures include itself.  */
1753   re_node_set_insert (&eclosure, node);
1754   if (incomplete && !root)
1755     dfa->eclosures[node].nelem = 0;
1756   else
1757     dfa->eclosures[node] = eclosure;
1758   *new_set = eclosure;
1759   return REG_NOERROR;
1760 }
1761 \f
1762 /* Functions for token which are used in the parser.  */
1763
1764 /* Fetch a token from INPUT.
1765    We must not use this function inside bracket expressions.  */
1766
1767 static void
1768 fetch_token (result, input, syntax)
1769      re_token_t *result;
1770      re_string_t *input;
1771      reg_syntax_t syntax;
1772 {
1773   re_string_skip_bytes (input, peek_token (result, input, syntax));
1774 }
1775
1776 /* Peek a token from INPUT, and return the length of the token.
1777    We must not use this function inside bracket expressions.  */
1778
1779 static int
1780 peek_token (token, input, syntax)
1781      re_token_t *token;
1782      re_string_t *input;
1783      reg_syntax_t syntax;
1784 {
1785   unsigned char c;
1786
1787   if (re_string_eoi (input))
1788     {
1789       token->type = END_OF_RE;
1790       return 0;
1791     }
1792
1793   c = re_string_peek_byte (input, 0);
1794   token->opr.c = c;
1795
1796   token->word_char = 0;
1797 #ifdef RE_ENABLE_I18N
1798   token->mb_partial = 0;
1799   if (input->mb_cur_max > 1 &&
1800       !re_string_first_byte (input, re_string_cur_idx (input)))
1801     {
1802       token->type = CHARACTER;
1803       token->mb_partial = 1;
1804       return 1;
1805     }
1806 #endif
1807   if (c == '\\')
1808     {
1809       unsigned char c2;
1810       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1811         {
1812           token->type = BACK_SLASH;
1813           return 1;
1814         }
1815
1816       c2 = re_string_peek_byte_case (input, 1);
1817       token->opr.c = c2;
1818       token->type = CHARACTER;
1819 #ifdef RE_ENABLE_I18N
1820       if (input->mb_cur_max > 1)
1821         {
1822           wint_t wc = re_string_wchar_at (input,
1823                                           re_string_cur_idx (input) + 1);
1824           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1825         }
1826       else
1827 #endif
1828         token->word_char = IS_WORD_CHAR (c2) != 0;
1829
1830       switch (c2)
1831         {
1832         case '|':
1833           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1834             token->type = OP_ALT;
1835           break;
1836         case '1': case '2': case '3': case '4': case '5':
1837         case '6': case '7': case '8': case '9':
1838           if (!(syntax & RE_NO_BK_REFS))
1839             {
1840               token->type = OP_BACK_REF;
1841               token->opr.idx = c2 - '1';
1842             }
1843           break;
1844         case '<':
1845           if (!(syntax & RE_NO_GNU_OPS))
1846             {
1847               token->type = ANCHOR;
1848               token->opr.ctx_type = WORD_FIRST;
1849             }
1850           break;
1851         case '>':
1852           if (!(syntax & RE_NO_GNU_OPS))
1853             {
1854               token->type = ANCHOR;
1855               token->opr.ctx_type = WORD_LAST;
1856             }
1857           break;
1858         case 'b':
1859           if (!(syntax & RE_NO_GNU_OPS))
1860             {
1861               token->type = ANCHOR;
1862               token->opr.ctx_type = WORD_DELIM;
1863             }
1864           break;
1865         case 'B':
1866           if (!(syntax & RE_NO_GNU_OPS))
1867             {
1868               token->type = ANCHOR;
1869               token->opr.ctx_type = NOT_WORD_DELIM;
1870             }
1871           break;
1872         case 'w':
1873           if (!(syntax & RE_NO_GNU_OPS))
1874             token->type = OP_WORD;
1875           break;
1876         case 'W':
1877           if (!(syntax & RE_NO_GNU_OPS))
1878             token->type = OP_NOTWORD;
1879           break;
1880         case 's':
1881           if (!(syntax & RE_NO_GNU_OPS))
1882             token->type = OP_SPACE;
1883           break;
1884         case 'S':
1885           if (!(syntax & RE_NO_GNU_OPS))
1886             token->type = OP_NOTSPACE;
1887           break;
1888         case '`':
1889           if (!(syntax & RE_NO_GNU_OPS))
1890             {
1891               token->type = ANCHOR;
1892               token->opr.ctx_type = BUF_FIRST;
1893             }
1894           break;
1895         case '\'':
1896           if (!(syntax & RE_NO_GNU_OPS))
1897             {
1898               token->type = ANCHOR;
1899               token->opr.ctx_type = BUF_LAST;
1900             }
1901           break;
1902         case '(':
1903           if (!(syntax & RE_NO_BK_PARENS))
1904             token->type = OP_OPEN_SUBEXP;
1905           break;
1906         case ')':
1907           if (!(syntax & RE_NO_BK_PARENS))
1908             token->type = OP_CLOSE_SUBEXP;
1909           break;
1910         case '+':
1911           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1912             token->type = OP_DUP_PLUS;
1913           break;
1914         case '?':
1915           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1916             token->type = OP_DUP_QUESTION;
1917           break;
1918         case '{':
1919           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1920             token->type = OP_OPEN_DUP_NUM;
1921           break;
1922         case '}':
1923           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1924             token->type = OP_CLOSE_DUP_NUM;
1925           break;
1926         default:
1927           break;
1928         }
1929       return 2;
1930     }
1931
1932   token->type = CHARACTER;
1933 #ifdef RE_ENABLE_I18N
1934   if (input->mb_cur_max > 1)
1935     {
1936       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1937       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1938     }
1939   else
1940 #endif
1941     token->word_char = IS_WORD_CHAR (token->opr.c);
1942
1943   switch (c)
1944     {
1945     case '\n':
1946       if (syntax & RE_NEWLINE_ALT)
1947         token->type = OP_ALT;
1948       break;
1949     case '|':
1950       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1951         token->type = OP_ALT;
1952       break;
1953     case '*':
1954       token->type = OP_DUP_ASTERISK;
1955       break;
1956     case '+':
1957       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1958         token->type = OP_DUP_PLUS;
1959       break;
1960     case '?':
1961       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1962         token->type = OP_DUP_QUESTION;
1963       break;
1964     case '{':
1965       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1966         token->type = OP_OPEN_DUP_NUM;
1967       break;
1968     case '}':
1969       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1970         token->type = OP_CLOSE_DUP_NUM;
1971       break;
1972     case '(':
1973       if (syntax & RE_NO_BK_PARENS)
1974         token->type = OP_OPEN_SUBEXP;
1975       break;
1976     case ')':
1977       if (syntax & RE_NO_BK_PARENS)
1978         token->type = OP_CLOSE_SUBEXP;
1979       break;
1980     case '[':
1981       token->type = OP_OPEN_BRACKET;
1982       break;
1983     case '.':
1984       token->type = OP_PERIOD;
1985       break;
1986     case '^':
1987       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1988           re_string_cur_idx (input) != 0)
1989         {
1990           char prev = re_string_peek_byte (input, -1);
1991           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1992             break;
1993         }
1994       token->type = ANCHOR;
1995       token->opr.ctx_type = LINE_FIRST;
1996       break;
1997     case '$':
1998       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1999           re_string_cur_idx (input) + 1 != re_string_length (input))
2000         {
2001           re_token_t next;
2002           re_string_skip_bytes (input, 1);
2003           peek_token (&next, input, syntax);
2004           re_string_skip_bytes (input, -1);
2005           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2006             break;
2007         }
2008       token->type = ANCHOR;
2009       token->opr.ctx_type = LINE_LAST;
2010       break;
2011     default:
2012       break;
2013     }
2014   return 1;
2015 }
2016
2017 /* Peek a token from INPUT, and return the length of the token.
2018    We must not use this function out of bracket expressions.  */
2019
2020 static int
2021 peek_token_bracket (token, input, syntax)
2022      re_token_t *token;
2023      re_string_t *input;
2024      reg_syntax_t syntax;
2025 {
2026   unsigned char c;
2027   if (re_string_eoi (input))
2028     {
2029       token->type = END_OF_RE;
2030       return 0;
2031     }
2032   c = re_string_peek_byte (input, 0);
2033   token->opr.c = c;
2034
2035 #ifdef RE_ENABLE_I18N
2036   if (input->mb_cur_max > 1 &&
2037       !re_string_first_byte (input, re_string_cur_idx (input)))
2038     {
2039       token->type = CHARACTER;
2040       return 1;
2041     }
2042 #endif /* RE_ENABLE_I18N */
2043
2044   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2045       && re_string_cur_idx (input) + 1 < re_string_length (input))
2046     {
2047       /* In this case, '\' escape a character.  */
2048       unsigned char c2;
2049       re_string_skip_bytes (input, 1);
2050       c2 = re_string_peek_byte (input, 0);
2051       token->opr.c = c2;
2052       token->type = CHARACTER;
2053       return 1;
2054     }
2055   if (c == '[') /* '[' is a special char in a bracket exps.  */
2056     {
2057       unsigned char c2;
2058       int token_len;
2059       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2060         c2 = re_string_peek_byte (input, 1);
2061       else
2062         c2 = 0;
2063       token->opr.c = c2;
2064       token_len = 2;
2065       switch (c2)
2066         {
2067         case '.':
2068           token->type = OP_OPEN_COLL_ELEM;
2069           break;
2070         case '=':
2071           token->type = OP_OPEN_EQUIV_CLASS;
2072           break;
2073         case ':':
2074           if (syntax & RE_CHAR_CLASSES)
2075             {
2076               token->type = OP_OPEN_CHAR_CLASS;
2077               break;
2078             }
2079           /* else fall through.  */
2080         default:
2081           token->type = CHARACTER;
2082           token->opr.c = c;
2083           token_len = 1;
2084           break;
2085         }
2086       return token_len;
2087     }
2088   switch (c)
2089     {
2090     case '-':
2091       token->type = OP_CHARSET_RANGE;
2092       break;
2093     case ']':
2094       token->type = OP_CLOSE_BRACKET;
2095       break;
2096     case '^':
2097       token->type = OP_NON_MATCH_LIST;
2098       break;
2099     default:
2100       token->type = CHARACTER;
2101     }
2102   return 1;
2103 }
2104 \f
2105 /* Functions for parser.  */
2106
2107 /* Entry point of the parser.
2108    Parse the regular expression REGEXP and return the structure tree.
2109    If an error is occured, ERR is set by error code, and return NULL.
2110    This function build the following tree, from regular expression <reg_exp>:
2111            CAT
2112            / \
2113           /   \
2114    <reg_exp>  EOR
2115
2116    CAT means concatenation.
2117    EOR means end of regular expression.  */
2118
2119 static bin_tree_t *
2120 parse (regexp, preg, syntax, err)
2121      re_string_t *regexp;
2122      regex_t *preg;
2123      reg_syntax_t syntax;
2124      reg_errcode_t *err;
2125 {
2126   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2127   bin_tree_t *tree, *eor, *root;
2128   re_token_t current_token;
2129   dfa->syntax = syntax;
2130   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2131   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2132   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2133     return NULL;
2134   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2135   if (tree != NULL)
2136     root = create_tree (dfa, tree, eor, CONCAT);
2137   else
2138     root = eor;
2139   if (BE (eor == NULL || root == NULL, 0))
2140     {
2141       *err = REG_ESPACE;
2142       return NULL;
2143     }
2144   return root;
2145 }
2146
2147 /* This function build the following tree, from regular expression
2148    <branch1>|<branch2>:
2149            ALT
2150            / \
2151           /   \
2152    <branch1> <branch2>
2153
2154    ALT means alternative, which represents the operator `|'.  */
2155
2156 static bin_tree_t *
2157 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2158      re_string_t *regexp;
2159      regex_t *preg;
2160      re_token_t *token;
2161      reg_syntax_t syntax;
2162      int nest;
2163      reg_errcode_t *err;
2164 {
2165   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2166   bin_tree_t *tree, *branch = NULL;
2167   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2168   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2169     return NULL;
2170
2171   while (token->type == OP_ALT)
2172     {
2173       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2174       if (token->type != OP_ALT && token->type != END_OF_RE
2175           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2176         {
2177           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2178           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2179             return NULL;
2180         }
2181       else
2182         branch = NULL;
2183       tree = create_tree (dfa, tree, branch, OP_ALT);
2184       if (BE (tree == NULL, 0))
2185         {
2186           *err = REG_ESPACE;
2187           return NULL;
2188         }
2189     }
2190   return tree;
2191 }
2192
2193 /* This function build the following tree, from regular expression
2194    <exp1><exp2>:
2195         CAT
2196         / \
2197        /   \
2198    <exp1> <exp2>
2199
2200    CAT means concatenation.  */
2201
2202 static bin_tree_t *
2203 parse_branch (regexp, preg, token, syntax, nest, err)
2204      re_string_t *regexp;
2205      regex_t *preg;
2206      re_token_t *token;
2207      reg_syntax_t syntax;
2208      int nest;
2209      reg_errcode_t *err;
2210 {
2211   bin_tree_t *tree, *exp;
2212   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2213   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2214   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2215     return NULL;
2216
2217   while (token->type != OP_ALT && token->type != END_OF_RE
2218          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2219     {
2220       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2221       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2222         {
2223           return NULL;
2224         }
2225       if (tree != NULL && exp != NULL)
2226         {
2227           tree = create_tree (dfa, tree, exp, CONCAT);
2228           if (tree == NULL)
2229             {
2230               *err = REG_ESPACE;
2231               return NULL;
2232             }
2233         }
2234       else if (tree == NULL)
2235         tree = exp;
2236       /* Otherwise exp == NULL, we don't need to create new tree.  */
2237     }
2238   return tree;
2239 }
2240
2241 /* This function build the following tree, from regular expression a*:
2242          *
2243          |
2244          a
2245 */
2246
2247 static bin_tree_t *
2248 parse_expression (regexp, preg, token, syntax, nest, err)
2249      re_string_t *regexp;
2250      regex_t *preg;
2251      re_token_t *token;
2252      reg_syntax_t syntax;
2253      int nest;
2254      reg_errcode_t *err;
2255 {
2256   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2257   bin_tree_t *tree;
2258   switch (token->type)
2259     {
2260     case CHARACTER:
2261       tree = create_token_tree (dfa, NULL, NULL, token);
2262       if (BE (tree == NULL, 0))
2263         {
2264           *err = REG_ESPACE;
2265           return NULL;
2266         }
2267 #ifdef RE_ENABLE_I18N
2268       if (dfa->mb_cur_max > 1)
2269         {
2270           while (!re_string_eoi (regexp)
2271                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2272             {
2273               bin_tree_t *mbc_remain;
2274               fetch_token (token, regexp, syntax);
2275               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2276               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2277               if (BE (mbc_remain == NULL || tree == NULL, 0))
2278                 {
2279                   *err = REG_ESPACE;
2280                   return NULL;
2281                 }
2282             }
2283         }
2284 #endif
2285       break;
2286     case OP_OPEN_SUBEXP:
2287       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2288       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2289         return NULL;
2290       break;
2291     case OP_OPEN_BRACKET:
2292       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2293       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2294         return NULL;
2295       break;
2296     case OP_BACK_REF:
2297       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2298         {
2299           *err = REG_ESUBREG;
2300           return NULL;
2301         }
2302       dfa->used_bkref_map |= 1 << token->opr.idx;
2303       tree = create_token_tree (dfa, NULL, NULL, token);
2304       if (BE (tree == NULL, 0))
2305         {
2306           *err = REG_ESPACE;
2307           return NULL;
2308         }
2309       ++dfa->nbackref;
2310       dfa->has_mb_node = 1;
2311       break;
2312     case OP_OPEN_DUP_NUM:
2313       if (syntax & RE_CONTEXT_INVALID_DUP)
2314         {
2315           *err = REG_BADRPT;
2316           return NULL;
2317         }
2318       /* FALLTHROUGH */
2319     case OP_DUP_ASTERISK:
2320     case OP_DUP_PLUS:
2321     case OP_DUP_QUESTION:
2322       if (syntax & RE_CONTEXT_INVALID_OPS)
2323         {
2324           *err = REG_BADRPT;
2325           return NULL;
2326         }
2327       else if (syntax & RE_CONTEXT_INDEP_OPS)
2328         {
2329           fetch_token (token, regexp, syntax);
2330           return parse_expression (regexp, preg, token, syntax, nest, err);
2331         }
2332       /* else fall through  */
2333     case OP_CLOSE_SUBEXP:
2334       if ((token->type == OP_CLOSE_SUBEXP) &&
2335           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2336         {
2337           *err = REG_ERPAREN;
2338           return NULL;
2339         }
2340       /* else fall through  */
2341     case OP_CLOSE_DUP_NUM:
2342       /* We treat it as a normal character.  */
2343
2344       /* Then we can these characters as normal characters.  */
2345       token->type = CHARACTER;
2346       /* mb_partial and word_char bits should be initialized already
2347          by peek_token.  */
2348       tree = create_token_tree (dfa, NULL, NULL, token);
2349       if (BE (tree == NULL, 0))
2350         {
2351           *err = REG_ESPACE;
2352           return NULL;
2353         }
2354       break;
2355     case ANCHOR:
2356       if ((token->opr.ctx_type
2357            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2358           && dfa->word_ops_used == 0)
2359         init_word_char (dfa);
2360       if (token->opr.ctx_type == WORD_DELIM
2361           || token->opr.ctx_type == NOT_WORD_DELIM)
2362         {
2363           bin_tree_t *tree_first, *tree_last;
2364           if (token->opr.ctx_type == WORD_DELIM)
2365             {
2366               token->opr.ctx_type = WORD_FIRST;
2367               tree_first = create_token_tree (dfa, NULL, NULL, token);
2368               token->opr.ctx_type = WORD_LAST;
2369             }
2370           else
2371             {
2372               token->opr.ctx_type = INSIDE_WORD;
2373               tree_first = create_token_tree (dfa, NULL, NULL, token);
2374               token->opr.ctx_type = INSIDE_NOTWORD;
2375             }
2376           tree_last = create_token_tree (dfa, NULL, NULL, token);
2377           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2378           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2379             {
2380               *err = REG_ESPACE;
2381               return NULL;
2382             }
2383         }
2384       else
2385         {
2386           tree = create_token_tree (dfa, NULL, NULL, token);
2387           if (BE (tree == NULL, 0))
2388             {
2389               *err = REG_ESPACE;
2390               return NULL;
2391             }
2392         }
2393       /* We must return here, since ANCHORs can't be followed
2394          by repetition operators.
2395          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2396              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2397       fetch_token (token, regexp, syntax);
2398       return tree;
2399     case OP_PERIOD:
2400       tree = create_token_tree (dfa, NULL, NULL, token);
2401       if (BE (tree == NULL, 0))
2402         {
2403           *err = REG_ESPACE;
2404           return NULL;
2405         }
2406       if (dfa->mb_cur_max > 1)
2407         dfa->has_mb_node = 1;
2408       break;
2409     case OP_WORD:
2410     case OP_NOTWORD:
2411       tree = build_charclass_op (dfa, regexp->trans,
2412                                  (const unsigned char *) "alnum",
2413                                  (const unsigned char *) "_",
2414                                  token->type == OP_NOTWORD, err);
2415       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2416         return NULL;
2417       break;
2418     case OP_SPACE:
2419     case OP_NOTSPACE:
2420       tree = build_charclass_op (dfa, regexp->trans,
2421                                  (const unsigned char *) "space",
2422                                  (const unsigned char *) "",
2423                                  token->type == OP_NOTSPACE, err);
2424       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2425         return NULL;
2426       break;
2427     case OP_ALT:
2428     case END_OF_RE:
2429       return NULL;
2430     case BACK_SLASH:
2431       *err = REG_EESCAPE;
2432       return NULL;
2433     default:
2434       /* Must not happen?  */
2435 #ifdef DEBUG
2436       assert (0);
2437 #endif
2438       return NULL;
2439     }
2440   fetch_token (token, regexp, syntax);
2441
2442   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2443          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2444     {
2445       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2446       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2447         return NULL;
2448       /* In BRE consecutive duplications are not allowed.  */
2449       if ((syntax & RE_CONTEXT_INVALID_DUP)
2450           && (token->type == OP_DUP_ASTERISK
2451               || token->type == OP_OPEN_DUP_NUM))
2452         {
2453           *err = REG_BADRPT;
2454           return NULL;
2455         }
2456     }
2457
2458   return tree;
2459 }
2460
2461 /* This function build the following tree, from regular expression
2462    (<reg_exp>):
2463          SUBEXP
2464             |
2465         <reg_exp>
2466 */
2467
2468 static bin_tree_t *
2469 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2470      re_string_t *regexp;
2471      regex_t *preg;
2472      re_token_t *token;
2473      reg_syntax_t syntax;
2474      int nest;
2475      reg_errcode_t *err;
2476 {
2477   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2478   bin_tree_t *tree;
2479   size_t cur_nsub;
2480   cur_nsub = preg->re_nsub++;
2481
2482   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2483
2484   /* The subexpression may be a null string.  */
2485   if (token->type == OP_CLOSE_SUBEXP)
2486     tree = NULL;
2487   else
2488     {
2489       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2490       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2491         *err = REG_EPAREN;
2492       if (BE (*err != REG_NOERROR, 0))
2493         return NULL;
2494     }
2495   dfa->completed_bkref_map |= 1 << cur_nsub;
2496
2497   tree = create_tree (dfa, tree, NULL, SUBEXP);
2498   if (BE (tree == NULL, 0))
2499     {
2500       *err = REG_ESPACE;
2501       return NULL;
2502     }
2503   tree->token.opr.idx = cur_nsub;
2504   return tree;
2505 }
2506
2507 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2508
2509 static bin_tree_t *
2510 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2511      bin_tree_t *elem;
2512      re_string_t *regexp;
2513      re_dfa_t *dfa;
2514      re_token_t *token;
2515      reg_syntax_t syntax;
2516      reg_errcode_t *err;
2517 {
2518   bin_tree_t *tree = NULL, *old_tree = NULL;
2519   int i, start, end, start_idx = re_string_cur_idx (regexp);
2520   re_token_t start_token = *token;
2521
2522   if (token->type == OP_OPEN_DUP_NUM)
2523     {
2524       end = 0;
2525       start = fetch_number (regexp, token, syntax);
2526       if (start == -1)
2527         {
2528           if (token->type == CHARACTER && token->opr.c == ',')
2529             start = 0; /* We treat "{,m}" as "{0,m}".  */
2530           else
2531             {
2532               *err = REG_BADBR; /* <re>{} is invalid.  */
2533               return NULL;
2534             }
2535         }
2536       if (BE (start != -2, 1))
2537         {
2538           /* We treat "{n}" as "{n,n}".  */
2539           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2540                  : ((token->type == CHARACTER && token->opr.c == ',')
2541                     ? fetch_number (regexp, token, syntax) : -2));
2542         }
2543       if (BE (start == -2 || end == -2, 0))
2544         {
2545           /* Invalid sequence.  */
2546           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2547             {
2548               if (token->type == END_OF_RE)
2549                 *err = REG_EBRACE;
2550               else
2551                 *err = REG_BADBR;
2552
2553               return NULL;
2554             }
2555
2556           /* If the syntax bit is set, rollback.  */
2557           re_string_set_index (regexp, start_idx);
2558           *token = start_token;
2559           token->type = CHARACTER;
2560           /* mb_partial and word_char bits should be already initialized by
2561              peek_token.  */
2562           return elem;
2563         }
2564
2565       if (BE (end != -1 && start > end, 0))
2566         {
2567           /* First number greater than second.  */
2568           *err = REG_BADBR;
2569           return NULL;
2570         }
2571     }
2572   else
2573     {
2574       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2575       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2576     }
2577
2578   fetch_token (token, regexp, syntax);
2579
2580   if (BE (elem == NULL, 0))
2581     return NULL;
2582   if (BE (start == 0 && end == 0, 0))
2583     {
2584       postorder (elem, free_tree, NULL);
2585       return NULL;
2586     }
2587
2588   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2589   if (BE (start > 0, 0))
2590     {
2591       tree = elem;
2592       for (i = 2; i <= start; ++i)
2593         {
2594           elem = duplicate_tree (elem, dfa);
2595           tree = create_tree (dfa, tree, elem, CONCAT);
2596           if (BE (elem == NULL || tree == NULL, 0))
2597             goto parse_dup_op_espace;
2598         }
2599
2600       if (start == end)
2601         return tree;
2602
2603       /* Duplicate ELEM before it is marked optional.  */
2604       elem = duplicate_tree (elem, dfa);
2605       old_tree = tree;
2606     }
2607   else
2608     old_tree = NULL;
2609
2610   if (elem->token.type == SUBEXP)
2611     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2612
2613   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2614   if (BE (tree == NULL, 0))
2615     goto parse_dup_op_espace;
2616
2617   /* This loop is actually executed only when end != -1,
2618      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2619      already created the start+1-th copy.  */
2620   for (i = start + 2; i <= end; ++i)
2621     {
2622       elem = duplicate_tree (elem, dfa);
2623       tree = create_tree (dfa, tree, elem, CONCAT);
2624       if (BE (elem == NULL || tree == NULL, 0))
2625         goto parse_dup_op_espace;
2626
2627       tree = create_tree (dfa, tree, NULL, OP_ALT);
2628       if (BE (tree == NULL, 0))
2629         goto parse_dup_op_espace;
2630     }
2631
2632   if (old_tree)
2633     tree = create_tree (dfa, old_tree, tree, CONCAT);
2634
2635   return tree;
2636
2637  parse_dup_op_espace:
2638   *err = REG_ESPACE;
2639   return NULL;
2640 }
2641
2642 /* Size of the names for collating symbol/equivalence_class/character_class.
2643    I'm not sure, but maybe enough.  */
2644 #define BRACKET_NAME_BUF_SIZE 32
2645
2646 #ifndef _LIBC
2647   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2648      Build the range expression which starts from START_ELEM, and ends
2649      at END_ELEM.  The result are written to MBCSET and SBCSET.
2650      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2651      mbcset->range_ends, is a pointer argument sinse we may
2652      update it.  */
2653
2654 static reg_errcode_t
2655 # ifdef RE_ENABLE_I18N
2656 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2657      re_charset_t *mbcset;
2658      int *range_alloc;
2659 # else /* not RE_ENABLE_I18N */
2660 build_range_exp (sbcset, start_elem, end_elem)
2661 # endif /* not RE_ENABLE_I18N */
2662      re_bitset_ptr_t sbcset;
2663      bracket_elem_t *start_elem, *end_elem;
2664 {
2665   unsigned int start_ch, end_ch;
2666   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2667   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2668           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2669           0))
2670     return REG_ERANGE;
2671
2672   /* We can handle no multi character collating elements without libc
2673      support.  */
2674   if (BE ((start_elem->type == COLL_SYM
2675            && strlen ((char *) start_elem->opr.name) > 1)
2676           || (end_elem->type == COLL_SYM
2677               && strlen ((char *) end_elem->opr.name) > 1), 0))
2678     return REG_ECOLLATE;
2679
2680 # ifdef RE_ENABLE_I18N
2681   {
2682     wchar_t wc, start_wc, end_wc;
2683     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2684
2685     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2686                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2687                    : 0));
2688     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2689               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2690                  : 0));
2691     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2692                 ? __btowc (start_ch) : start_elem->opr.wch);
2693     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2694               ? __btowc (end_ch) : end_elem->opr.wch);
2695     if (start_wc == WEOF || end_wc == WEOF)
2696       return REG_ECOLLATE;
2697     cmp_buf[0] = start_wc;
2698     cmp_buf[4] = end_wc;
2699     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2700       return REG_ERANGE;
2701
2702     /* Got valid collation sequence values, add them as a new entry.
2703        However, for !_LIBC we have no collation elements: if the
2704        character set is single byte, the single byte character set
2705        that we build below suffices.  parse_bracket_exp passes
2706        no MBCSET if dfa->mb_cur_max == 1.  */
2707     if (mbcset)
2708       {
2709         /* Check the space of the arrays.  */
2710         if (BE (*range_alloc == mbcset->nranges, 0))
2711           {
2712             /* There is not enough space, need realloc.  */
2713             wchar_t *new_array_start, *new_array_end;
2714             int new_nranges;
2715
2716             /* +1 in case of mbcset->nranges is 0.  */
2717             new_nranges = 2 * mbcset->nranges + 1;
2718             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2719                are NULL if *range_alloc == 0.  */
2720             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2721                                           new_nranges);
2722             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2723                                         new_nranges);
2724
2725             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2726               return REG_ESPACE;
2727
2728             mbcset->range_starts = new_array_start;
2729             mbcset->range_ends = new_array_end;
2730             *range_alloc = new_nranges;
2731           }
2732
2733         mbcset->range_starts[mbcset->nranges] = start_wc;
2734         mbcset->range_ends[mbcset->nranges++] = end_wc;
2735       }
2736
2737     /* Build the table for single byte characters.  */
2738     for (wc = 0; wc < SBC_MAX; ++wc)
2739       {
2740         cmp_buf[2] = wc;
2741         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2742             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2743           bitset_set (sbcset, wc);
2744       }
2745   }
2746 # else /* not RE_ENABLE_I18N */
2747   {
2748     unsigned int ch;
2749     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2750                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2751                    : 0));
2752     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2753               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2754                  : 0));
2755     if (start_ch > end_ch)
2756       return REG_ERANGE;
2757     /* Build the table for single byte characters.  */
2758     for (ch = 0; ch < SBC_MAX; ++ch)
2759       if (start_ch <= ch  && ch <= end_ch)
2760         bitset_set (sbcset, ch);
2761   }
2762 # endif /* not RE_ENABLE_I18N */
2763   return REG_NOERROR;
2764 }
2765 #endif /* not _LIBC */
2766
2767 #ifndef _LIBC
2768 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2769    Build the collating element which is represented by NAME.
2770    The result are written to MBCSET and SBCSET.
2771    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2772    pointer argument since we may update it.  */
2773
2774 static reg_errcode_t
2775 # ifdef RE_ENABLE_I18N
2776 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2777      re_charset_t *mbcset;
2778      int *coll_sym_alloc;
2779 # else /* not RE_ENABLE_I18N */
2780 build_collating_symbol (sbcset, name)
2781 # endif /* not RE_ENABLE_I18N */
2782      re_bitset_ptr_t sbcset;
2783      const unsigned char *name;
2784 {
2785   size_t name_len = strlen ((const char *) name);
2786   if (BE (name_len != 1, 0))
2787     return REG_ECOLLATE;
2788   else
2789     {
2790       bitset_set (sbcset, name[0]);
2791       return REG_NOERROR;
2792     }
2793 }
2794 #endif /* not _LIBC */
2795
2796 /* This function parse bracket expression like "[abc]", "[a-c]",
2797    "[[.a-a.]]" etc.  */
2798
2799 static bin_tree_t *
2800 parse_bracket_exp (regexp, dfa, token, syntax, err)
2801      re_string_t *regexp;
2802      re_dfa_t *dfa;
2803      re_token_t *token;
2804      reg_syntax_t syntax;
2805      reg_errcode_t *err;
2806 {
2807 #ifdef _LIBC
2808   const unsigned char *collseqmb;
2809   const char *collseqwc;
2810   uint32_t nrules;
2811   int32_t table_size;
2812   const int32_t *symb_table;
2813   const unsigned char *extra;
2814
2815   /* Local function for parse_bracket_exp used in _LIBC environement.
2816      Seek the collating symbol entry correspondings to NAME.
2817      Return the index of the symbol in the SYMB_TABLE.  */
2818
2819   auto inline int32_t
2820   __attribute ((always_inline))
2821   seek_collating_symbol_entry (name, name_len)
2822          const unsigned char *name;
2823          size_t name_len;
2824     {
2825       int32_t hash = elem_hash ((const char *) name, name_len);
2826       int32_t elem = hash % table_size;
2827       int32_t second = hash % (table_size - 2);
2828       while (symb_table[2 * elem] != 0)
2829         {
2830           /* First compare the hashing value.  */
2831           if (symb_table[2 * elem] == hash
2832               /* Compare the length of the name.  */
2833               && name_len == extra[symb_table[2 * elem + 1]]
2834               /* Compare the name.  */
2835               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2836                          name_len) == 0)
2837             {
2838               /* Yep, this is the entry.  */
2839               break;
2840             }
2841
2842           /* Next entry.  */
2843           elem += second;
2844         }
2845       return elem;
2846     }
2847
2848   /* Local function for parse_bracket_exp used in _LIBC environement.
2849      Look up the collation sequence value of BR_ELEM.
2850      Return the value if succeeded, UINT_MAX otherwise.  */
2851
2852   auto inline unsigned int
2853   __attribute ((always_inline))
2854   lookup_collation_sequence_value (br_elem)
2855          bracket_elem_t *br_elem;
2856     {
2857       if (br_elem->type == SB_CHAR)
2858         {
2859           /*
2860           if (MB_CUR_MAX == 1)
2861           */
2862           if (nrules == 0)
2863             return collseqmb[br_elem->opr.ch];
2864           else
2865             {
2866               wint_t wc = __btowc (br_elem->opr.ch);
2867               return __collseq_table_lookup (collseqwc, wc);
2868             }
2869         }
2870       else if (br_elem->type == MB_CHAR)
2871         {
2872           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2873         }
2874       else if (br_elem->type == COLL_SYM)
2875         {
2876           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2877           if (nrules != 0)
2878             {
2879               int32_t elem, idx;
2880               elem = seek_collating_symbol_entry (br_elem->opr.name,
2881                                                   sym_name_len);
2882               if (symb_table[2 * elem] != 0)
2883                 {
2884                   /* We found the entry.  */
2885                   idx = symb_table[2 * elem + 1];
2886                   /* Skip the name of collating element name.  */
2887                   idx += 1 + extra[idx];
2888                   /* Skip the byte sequence of the collating element.  */
2889                   idx += 1 + extra[idx];
2890                   /* Adjust for the alignment.  */
2891                   idx = (idx + 3) & ~3;
2892                   /* Skip the multibyte collation sequence value.  */
2893                   idx += sizeof (unsigned int);
2894                   /* Skip the wide char sequence of the collating element.  */
2895                   idx += sizeof (unsigned int) *
2896                     (1 + *(unsigned int *) (extra + idx));
2897                   /* Return the collation sequence value.  */
2898                   return *(unsigned int *) (extra + idx);
2899                 }
2900               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2901                 {
2902                   /* No valid character.  Match it as a single byte
2903                      character.  */
2904                   return collseqmb[br_elem->opr.name[0]];
2905                 }
2906             }
2907           else if (sym_name_len == 1)
2908             return collseqmb[br_elem->opr.name[0]];
2909         }
2910       return UINT_MAX;
2911     }
2912
2913   /* Local function for parse_bracket_exp used in _LIBC environement.
2914      Build the range expression which starts from START_ELEM, and ends
2915      at END_ELEM.  The result are written to MBCSET and SBCSET.
2916      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2917      mbcset->range_ends, is a pointer argument sinse we may
2918      update it.  */
2919
2920   auto inline reg_errcode_t
2921   __attribute ((always_inline))
2922   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2923          re_charset_t *mbcset;
2924          int *range_alloc;
2925          re_bitset_ptr_t sbcset;
2926          bracket_elem_t *start_elem, *end_elem;
2927     {
2928       unsigned int ch;
2929       uint32_t start_collseq;
2930       uint32_t end_collseq;
2931
2932       /* Equivalence Classes and Character Classes can't be a range
2933          start/end.  */
2934       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2935               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2936               0))
2937         return REG_ERANGE;
2938
2939       start_collseq = lookup_collation_sequence_value (start_elem);
2940       end_collseq = lookup_collation_sequence_value (end_elem);
2941       /* Check start/end collation sequence values.  */
2942       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2943         return REG_ECOLLATE;
2944       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2945         return REG_ERANGE;
2946
2947       /* Got valid collation sequence values, add them as a new entry.
2948          However, if we have no collation elements, and the character set
2949          is single byte, the single byte character set that we
2950          build below suffices. */
2951       if (nrules > 0 || dfa->mb_cur_max > 1)
2952         {
2953           /* Check the space of the arrays.  */
2954           if (BE (*range_alloc == mbcset->nranges, 0))
2955             {
2956               /* There is not enough space, need realloc.  */
2957               uint32_t *new_array_start;
2958               uint32_t *new_array_end;
2959               int new_nranges;
2960
2961               /* +1 in case of mbcset->nranges is 0.  */
2962               new_nranges = 2 * mbcset->nranges + 1;
2963               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2964                                             new_nranges);
2965               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2966                                           new_nranges);
2967
2968               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2969                 return REG_ESPACE;
2970
2971               mbcset->range_starts = new_array_start;
2972               mbcset->range_ends = new_array_end;
2973               *range_alloc = new_nranges;
2974             }
2975
2976           mbcset->range_starts[mbcset->nranges] = start_collseq;
2977           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2978         }
2979
2980       /* Build the table for single byte characters.  */
2981       for (ch = 0; ch < SBC_MAX; ch++)
2982         {
2983           uint32_t ch_collseq;
2984           /*
2985           if (MB_CUR_MAX == 1)
2986           */
2987           if (nrules == 0)
2988             ch_collseq = collseqmb[ch];
2989           else
2990             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2991           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2992             bitset_set (sbcset, ch);
2993         }
2994       return REG_NOERROR;
2995     }
2996
2997   /* Local function for parse_bracket_exp used in _LIBC environement.
2998      Build the collating element which is represented by NAME.
2999      The result are written to MBCSET and SBCSET.
3000      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3001      pointer argument sinse we may update it.  */
3002
3003   auto inline reg_errcode_t
3004   __attribute ((always_inline))
3005   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3006          re_charset_t *mbcset;
3007          int *coll_sym_alloc;
3008          re_bitset_ptr_t sbcset;
3009          const unsigned char *name;
3010     {
3011       int32_t elem, idx;
3012       size_t name_len = strlen ((const char *) name);
3013       if (nrules != 0)
3014         {
3015           elem = seek_collating_symbol_entry (name, name_len);
3016           if (symb_table[2 * elem] != 0)
3017             {
3018               /* We found the entry.  */
3019               idx = symb_table[2 * elem + 1];
3020               /* Skip the name of collating element name.  */
3021               idx += 1 + extra[idx];
3022             }
3023           else if (symb_table[2 * elem] == 0 && name_len == 1)
3024             {
3025               /* No valid character, treat it as a normal
3026                  character.  */
3027               bitset_set (sbcset, name[0]);
3028               return REG_NOERROR;
3029             }
3030           else
3031             return REG_ECOLLATE;
3032
3033           /* Got valid collation sequence, add it as a new entry.  */
3034           /* Check the space of the arrays.  */
3035           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3036             {
3037               /* Not enough, realloc it.  */
3038               /* +1 in case of mbcset->ncoll_syms is 0.  */
3039               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3040               /* Use realloc since mbcset->coll_syms is NULL
3041                  if *alloc == 0.  */
3042               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3043                                                    new_coll_sym_alloc);
3044               if (BE (new_coll_syms == NULL, 0))
3045                 return REG_ESPACE;
3046               mbcset->coll_syms = new_coll_syms;
3047               *coll_sym_alloc = new_coll_sym_alloc;
3048             }
3049           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3050           return REG_NOERROR;
3051         }
3052       else
3053         {
3054           if (BE (name_len != 1, 0))
3055             return REG_ECOLLATE;
3056           else
3057             {
3058               bitset_set (sbcset, name[0]);
3059               return REG_NOERROR;
3060             }
3061         }
3062     }
3063 #endif
3064
3065   re_token_t br_token;
3066   re_bitset_ptr_t sbcset;
3067 #ifdef RE_ENABLE_I18N
3068   re_charset_t *mbcset;
3069   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3070   int equiv_class_alloc = 0, char_class_alloc = 0;
3071 #endif /* not RE_ENABLE_I18N */
3072   int non_match = 0;
3073   bin_tree_t *work_tree;
3074   int token_len;
3075   int first_round = 1;
3076 #ifdef _LIBC
3077   collseqmb = (const unsigned char *)
3078     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3079   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3080   if (nrules)
3081     {
3082       /*
3083       if (MB_CUR_MAX > 1)
3084       */
3085         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3086       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3087       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3088                                                   _NL_COLLATE_SYMB_TABLEMB);
3089       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3090                                                    _NL_COLLATE_SYMB_EXTRAMB);
3091     }
3092 #endif
3093   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3094 #ifdef RE_ENABLE_I18N
3095   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3096 #endif /* RE_ENABLE_I18N */
3097 #ifdef RE_ENABLE_I18N
3098   if (BE (sbcset == NULL || mbcset == NULL, 0))
3099 #else
3100   if (BE (sbcset == NULL, 0))
3101 #endif /* RE_ENABLE_I18N */
3102     {
3103       *err = REG_ESPACE;
3104       return NULL;
3105     }
3106
3107   token_len = peek_token_bracket (token, regexp, syntax);
3108   if (BE (token->type == END_OF_RE, 0))
3109     {
3110       *err = REG_BADPAT;
3111       goto parse_bracket_exp_free_return;
3112     }
3113   if (token->type == OP_NON_MATCH_LIST)
3114     {
3115 #ifdef RE_ENABLE_I18N
3116       mbcset->non_match = 1;
3117 #endif /* not RE_ENABLE_I18N */
3118       non_match = 1;
3119       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3120         bitset_set (sbcset, '\0');
3121       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3122       token_len = peek_token_bracket (token, regexp, syntax);
3123       if (BE (token->type == END_OF_RE, 0))
3124         {
3125           *err = REG_BADPAT;
3126           goto parse_bracket_exp_free_return;
3127         }
3128     }
3129
3130   /* We treat the first ']' as a normal character.  */
3131   if (token->type == OP_CLOSE_BRACKET)
3132     token->type = CHARACTER;
3133
3134   while (1)
3135     {
3136       bracket_elem_t start_elem, end_elem;
3137       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3138       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3139       reg_errcode_t ret;
3140       int token_len2 = 0, is_range_exp = 0;
3141       re_token_t token2;
3142
3143       start_elem.opr.name = start_name_buf;
3144       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3145                                    syntax, first_round);
3146       if (BE (ret != REG_NOERROR, 0))
3147         {
3148           *err = ret;
3149           goto parse_bracket_exp_free_return;
3150         }
3151       first_round = 0;
3152
3153       /* Get information about the next token.  We need it in any case.  */
3154       token_len = peek_token_bracket (token, regexp, syntax);
3155
3156       /* Do not check for ranges if we know they are not allowed.  */
3157       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3158         {
3159           if (BE (token->type == END_OF_RE, 0))
3160             {
3161               *err = REG_EBRACK;
3162               goto parse_bracket_exp_free_return;
3163             }
3164           if (token->type == OP_CHARSET_RANGE)
3165             {
3166               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3167               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3168               if (BE (token2.type == END_OF_RE, 0))
3169                 {
3170                   *err = REG_EBRACK;
3171                   goto parse_bracket_exp_free_return;
3172                 }
3173               if (token2.type == OP_CLOSE_BRACKET)
3174                 {
3175                   /* We treat the last '-' as a normal character.  */
3176                   re_string_skip_bytes (regexp, -token_len);
3177                   token->type = CHARACTER;
3178                 }
3179               else
3180                 is_range_exp = 1;
3181             }
3182         }
3183
3184       if (is_range_exp == 1)
3185         {
3186           end_elem.opr.name = end_name_buf;
3187           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3188                                        dfa, syntax, 1);
3189           if (BE (ret != REG_NOERROR, 0))
3190             {
3191               *err = ret;
3192               goto parse_bracket_exp_free_return;
3193             }
3194
3195           token_len = peek_token_bracket (token, regexp, syntax);
3196
3197 #ifdef _LIBC
3198           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3199                                   &start_elem, &end_elem);
3200 #else
3201 # ifdef RE_ENABLE_I18N
3202           *err = build_range_exp (sbcset,
3203                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3204                                   &range_alloc, &start_elem, &end_elem);
3205 # else
3206           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3207 # endif
3208 #endif /* RE_ENABLE_I18N */
3209           if (BE (*err != REG_NOERROR, 0))
3210             goto parse_bracket_exp_free_return;
3211         }
3212       else
3213         {
3214           switch (start_elem.type)
3215             {
3216             case SB_CHAR:
3217               bitset_set (sbcset, start_elem.opr.ch);
3218               break;
3219 #ifdef RE_ENABLE_I18N
3220             case MB_CHAR:
3221               /* Check whether the array has enough space.  */
3222               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3223                 {
3224                   wchar_t *new_mbchars;
3225                   /* Not enough, realloc it.  */
3226                   /* +1 in case of mbcset->nmbchars is 0.  */
3227                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3228                   /* Use realloc since array is NULL if *alloc == 0.  */
3229                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3230                                             mbchar_alloc);
3231                   if (BE (new_mbchars == NULL, 0))
3232                     goto parse_bracket_exp_espace;
3233                   mbcset->mbchars = new_mbchars;
3234                 }
3235               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3236               break;
3237 #endif /* RE_ENABLE_I18N */
3238             case EQUIV_CLASS:
3239               *err = build_equiv_class (sbcset,
3240 #ifdef RE_ENABLE_I18N
3241                                         mbcset, &equiv_class_alloc,
3242 #endif /* RE_ENABLE_I18N */
3243                                         start_elem.opr.name);
3244               if (BE (*err != REG_NOERROR, 0))
3245                 goto parse_bracket_exp_free_return;
3246               break;
3247             case COLL_SYM:
3248               *err = build_collating_symbol (sbcset,
3249 #ifdef RE_ENABLE_I18N
3250                                              mbcset, &coll_sym_alloc,
3251 #endif /* RE_ENABLE_I18N */
3252                                              start_elem.opr.name);
3253               if (BE (*err != REG_NOERROR, 0))
3254                 goto parse_bracket_exp_free_return;
3255               break;
3256             case CHAR_CLASS:
3257               *err = build_charclass (regexp->trans, sbcset,
3258 #ifdef RE_ENABLE_I18N
3259                                       mbcset, &char_class_alloc,
3260 #endif /* RE_ENABLE_I18N */
3261                                       start_elem.opr.name, syntax);
3262               if (BE (*err != REG_NOERROR, 0))
3263                goto parse_bracket_exp_free_return;
3264               break;
3265             default:
3266               assert (0);
3267               break;
3268             }
3269         }
3270       if (BE (token->type == END_OF_RE, 0))
3271         {
3272           *err = REG_EBRACK;
3273           goto parse_bracket_exp_free_return;
3274         }
3275       if (token->type == OP_CLOSE_BRACKET)
3276         break;
3277     }
3278
3279   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3280
3281   /* If it is non-matching list.  */
3282   if (non_match)
3283     bitset_not (sbcset);
3284
3285 #ifdef RE_ENABLE_I18N
3286   /* Ensure only single byte characters are set.  */
3287   if (dfa->mb_cur_max > 1)
3288     bitset_mask (sbcset, dfa->sb_char);
3289
3290   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3291       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3292                                                      || mbcset->non_match)))
3293     {
3294       bin_tree_t *mbc_tree;
3295       int sbc_idx;
3296       /* Build a tree for complex bracket.  */
3297       dfa->has_mb_node = 1;
3298       br_token.type = COMPLEX_BRACKET;
3299       br_token.opr.mbcset = mbcset;
3300       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3301       if (BE (mbc_tree == NULL, 0))
3302         goto parse_bracket_exp_espace;
3303       for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3304         if (sbcset[sbc_idx])
3305           break;
3306       /* If there are no bits set in sbcset, there is no point
3307          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3308       if (sbc_idx < BITSET_UINTS)
3309         {
3310           /* Build a tree for simple bracket.  */
3311           br_token.type = SIMPLE_BRACKET;
3312           br_token.opr.sbcset = sbcset;
3313           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3314           if (BE (work_tree == NULL, 0))
3315             goto parse_bracket_exp_espace;
3316
3317           /* Then join them by ALT node.  */
3318           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3319           if (BE (work_tree == NULL, 0))
3320             goto parse_bracket_exp_espace;
3321         }
3322       else
3323         {
3324           re_free (sbcset);
3325           work_tree = mbc_tree;
3326         }
3327     }
3328   else
3329 #endif /* not RE_ENABLE_I18N */
3330     {
3331 #ifdef RE_ENABLE_I18N
3332       free_charset (mbcset);
3333 #endif
3334       /* Build a tree for simple bracket.  */
3335       br_token.type = SIMPLE_BRACKET;
3336       br_token.opr.sbcset = sbcset;
3337       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3338       if (BE (work_tree == NULL, 0))
3339         goto parse_bracket_exp_espace;
3340     }
3341   return work_tree;
3342
3343  parse_bracket_exp_espace:
3344   *err = REG_ESPACE;
3345  parse_bracket_exp_free_return:
3346   re_free (sbcset);
3347 #ifdef RE_ENABLE_I18N
3348   free_charset (mbcset);
3349 #endif /* RE_ENABLE_I18N */
3350   return NULL;
3351 }
3352
3353 /* Parse an element in the bracket expression.  */
3354
3355 static reg_errcode_t
3356 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3357                        accept_hyphen)
3358      bracket_elem_t *elem;
3359      re_string_t *regexp;
3360      re_token_t *token;
3361      int token_len;
3362      re_dfa_t *dfa;
3363      reg_syntax_t syntax;
3364      int accept_hyphen;
3365 {
3366 #ifdef RE_ENABLE_I18N
3367   int cur_char_size;
3368   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3369   if (cur_char_size > 1)
3370     {
3371       elem->type = MB_CHAR;
3372       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3373       re_string_skip_bytes (regexp, cur_char_size);
3374       return REG_NOERROR;
3375     }
3376 #endif /* RE_ENABLE_I18N */
3377   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3378   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3379       || token->type == OP_OPEN_EQUIV_CLASS)
3380     return parse_bracket_symbol (elem, regexp, token);
3381   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3382     {
3383       /* A '-' must only appear as anything but a range indicator before
3384          the closing bracket.  Everything else is an error.  */
3385       re_token_t token2;
3386       (void) peek_token_bracket (&token2, regexp, syntax);
3387       if (token2.type != OP_CLOSE_BRACKET)
3388         /* The actual error value is not standardized since this whole
3389            case is undefined.  But ERANGE makes good sense.  */
3390         return REG_ERANGE;
3391     }
3392   elem->type = SB_CHAR;
3393   elem->opr.ch = token->opr.c;
3394   return REG_NOERROR;
3395 }
3396
3397 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3398    such as [:<character_class>:], [.<collating_element>.], and
3399    [=<equivalent_class>=].  */
3400
3401 static reg_errcode_t
3402 parse_bracket_symbol (elem, regexp, token)
3403      bracket_elem_t *elem;
3404      re_string_t *regexp;
3405      re_token_t *token;
3406 {
3407   unsigned char ch, delim = token->opr.c;
3408   int i = 0;
3409   if (re_string_eoi(regexp))
3410     return REG_EBRACK;
3411   for (;; ++i)
3412     {
3413       if (i >= BRACKET_NAME_BUF_SIZE)
3414         return REG_EBRACK;
3415       if (token->type == OP_OPEN_CHAR_CLASS)
3416         ch = re_string_fetch_byte_case (regexp);
3417       else
3418         ch = re_string_fetch_byte (regexp);
3419       if (re_string_eoi(regexp))
3420         return REG_EBRACK;
3421       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3422         break;
3423       elem->opr.name[i] = ch;
3424     }
3425   re_string_skip_bytes (regexp, 1);
3426   elem->opr.name[i] = '\0';
3427   switch (token->type)
3428     {
3429     case OP_OPEN_COLL_ELEM:
3430       elem->type = COLL_SYM;
3431       break;
3432     case OP_OPEN_EQUIV_CLASS:
3433       elem->type = EQUIV_CLASS;
3434       break;
3435     case OP_OPEN_CHAR_CLASS:
3436       elem->type = CHAR_CLASS;
3437       break;
3438     default:
3439       break;
3440     }
3441   return REG_NOERROR;
3442 }
3443
3444   /* Helper function for parse_bracket_exp.
3445      Build the equivalence class which is represented by NAME.
3446      The result are written to MBCSET and SBCSET.
3447      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3448      is a pointer argument sinse we may update it.  */
3449
3450 static reg_errcode_t
3451 #ifdef RE_ENABLE_I18N
3452 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3453      re_charset_t *mbcset;
3454      int *equiv_class_alloc;
3455 #else /* not RE_ENABLE_I18N */
3456 build_equiv_class (sbcset, name)
3457 #endif /* not RE_ENABLE_I18N */
3458      re_bitset_ptr_t sbcset;
3459      const unsigned char *name;
3460 {
3461 #if defined _LIBC
3462   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3463   if (nrules != 0)
3464     {
3465       const int32_t *table, *indirect;
3466       const unsigned char *weights, *extra, *cp;
3467       unsigned char char_buf[2];
3468       int32_t idx1, idx2;
3469       unsigned int ch;
3470       size_t len;
3471       /* This #include defines a local function!  */
3472 # include <locale/weight.h>
3473       /* Calculate the index for equivalence class.  */
3474       cp = name;
3475       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3476       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3477                                                _NL_COLLATE_WEIGHTMB);
3478       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3479                                                    _NL_COLLATE_EXTRAMB);
3480       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3481                                                 _NL_COLLATE_INDIRECTMB);
3482       idx1 = findidx (&cp);
3483       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3484         /* This isn't a valid character.  */
3485         return REG_ECOLLATE;
3486
3487       /* Build single byte matcing table for this equivalence class.  */
3488       char_buf[1] = (unsigned char) '\0';
3489       len = weights[idx1];
3490       for (ch = 0; ch < SBC_MAX; ++ch)
3491         {
3492           char_buf[0] = ch;
3493           cp = char_buf;
3494           idx2 = findidx (&cp);
3495 /*
3496           idx2 = table[ch];
3497 */
3498           if (idx2 == 0)
3499             /* This isn't a valid character.  */
3500             continue;
3501           if (len == weights[idx2])
3502             {
3503               int cnt = 0;
3504               while (cnt <= len &&
3505                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3506                 ++cnt;
3507
3508               if (cnt > len)
3509                 bitset_set (sbcset, ch);
3510             }
3511         }
3512       /* Check whether the array has enough space.  */
3513       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3514         {
3515           /* Not enough, realloc it.  */
3516           /* +1 in case of mbcset->nequiv_classes is 0.  */
3517           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3518           /* Use realloc since the array is NULL if *alloc == 0.  */
3519           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3520                                                    int32_t,
3521                                                    new_equiv_class_alloc);
3522           if (BE (new_equiv_classes == NULL, 0))
3523             return REG_ESPACE;
3524           mbcset->equiv_classes = new_equiv_classes;
3525           *equiv_class_alloc = new_equiv_class_alloc;
3526         }
3527       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3528     }
3529   else
3530 #endif /* _LIBC */
3531     {
3532       if (BE (strlen ((const char *) name) != 1, 0))
3533         return REG_ECOLLATE;
3534       bitset_set (sbcset, *name);
3535     }
3536   return REG_NOERROR;
3537 }
3538
3539   /* Helper function for parse_bracket_exp.
3540      Build the character class which is represented by NAME.
3541      The result are written to MBCSET and SBCSET.
3542      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3543      is a pointer argument sinse we may update it.  */
3544
3545 static reg_errcode_t
3546 #ifdef RE_ENABLE_I18N
3547 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3548      re_charset_t *mbcset;
3549      int *char_class_alloc;
3550 #else /* not RE_ENABLE_I18N */
3551 build_charclass (trans, sbcset, class_name, syntax)
3552 #endif /* not RE_ENABLE_I18N */
3553      unsigned RE_TRANSLATE_TYPE trans;
3554      re_bitset_ptr_t sbcset;
3555      const unsigned char *class_name;
3556      reg_syntax_t syntax;
3557 {
3558   int i;
3559   const char *name = (const char *) class_name;
3560
3561   /* In case of REG_ICASE "upper" and "lower" match the both of
3562      upper and lower cases.  */
3563   if ((syntax & RE_ICASE)
3564       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3565     name = "alpha";
3566
3567 #ifdef RE_ENABLE_I18N
3568   /* Check the space of the arrays.  */
3569   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3570     {
3571       /* Not enough, realloc it.  */
3572       /* +1 in case of mbcset->nchar_classes is 0.  */
3573       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3574       /* Use realloc since array is NULL if *alloc == 0.  */
3575       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3576                                                new_char_class_alloc);
3577       if (BE (new_char_classes == NULL, 0))
3578         return REG_ESPACE;
3579       mbcset->char_classes = new_char_classes;
3580       *char_class_alloc = new_char_class_alloc;
3581     }
3582   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3583 #endif /* RE_ENABLE_I18N */
3584
3585 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3586     for (i = 0; i < SBC_MAX; ++i)               \
3587       {                                         \
3588         if (ctype_func (i))                     \
3589           {                                     \
3590             int ch = trans ? trans[i] : i;      \
3591             bitset_set (sbcset, ch);            \
3592           }                                     \
3593       }
3594
3595   if (strcmp (name, "alnum") == 0)
3596     BUILD_CHARCLASS_LOOP (isalnum)
3597   else if (strcmp (name, "cntrl") == 0)
3598     BUILD_CHARCLASS_LOOP (iscntrl)
3599   else if (strcmp (name, "lower") == 0)
3600     BUILD_CHARCLASS_LOOP (islower)
3601   else if (strcmp (name, "space") == 0)
3602     BUILD_CHARCLASS_LOOP (isspace)
3603   else if (strcmp (name, "alpha") == 0)
3604     BUILD_CHARCLASS_LOOP (isalpha)
3605   else if (strcmp (name, "digit") == 0)
3606     BUILD_CHARCLASS_LOOP (isdigit)
3607   else if (strcmp (name, "print") == 0)
3608     BUILD_CHARCLASS_LOOP (isprint)
3609   else if (strcmp (name, "upper") == 0)
3610     BUILD_CHARCLASS_LOOP (isupper)
3611   else if (strcmp (name, "blank") == 0)
3612     BUILD_CHARCLASS_LOOP (isblank)
3613   else if (strcmp (name, "graph") == 0)
3614     BUILD_CHARCLASS_LOOP (isgraph)
3615   else if (strcmp (name, "punct") == 0)
3616     BUILD_CHARCLASS_LOOP (ispunct)
3617   else if (strcmp (name, "xdigit") == 0)
3618     BUILD_CHARCLASS_LOOP (isxdigit)
3619   else
3620     return REG_ECTYPE;
3621
3622   return REG_NOERROR;
3623 }
3624
3625 static bin_tree_t *
3626 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3627      re_dfa_t *dfa;
3628      unsigned RE_TRANSLATE_TYPE trans;
3629      const unsigned char *class_name;
3630      const unsigned char *extra;
3631      int non_match;
3632      reg_errcode_t *err;
3633 {
3634   re_bitset_ptr_t sbcset;
3635 #ifdef RE_ENABLE_I18N
3636   re_charset_t *mbcset;
3637   int alloc = 0;
3638 #endif /* not RE_ENABLE_I18N */
3639   reg_errcode_t ret;
3640   re_token_t br_token;
3641   bin_tree_t *tree;
3642
3643   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3644 #ifdef RE_ENABLE_I18N
3645   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3646 #endif /* RE_ENABLE_I18N */
3647
3648 #ifdef RE_ENABLE_I18N
3649   if (BE (sbcset == NULL || mbcset == NULL, 0))
3650 #else /* not RE_ENABLE_I18N */
3651   if (BE (sbcset == NULL, 0))
3652 #endif /* not RE_ENABLE_I18N */
3653     {
3654       *err = REG_ESPACE;
3655       return NULL;
3656     }
3657
3658   if (non_match)
3659     {
3660 #ifdef RE_ENABLE_I18N
3661       /*
3662       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3663         bitset_set(cset->sbcset, '\0');
3664       */
3665       mbcset->non_match = 1;
3666 #endif /* not RE_ENABLE_I18N */
3667     }
3668
3669   /* We don't care the syntax in this case.  */
3670   ret = build_charclass (trans, sbcset,
3671 #ifdef RE_ENABLE_I18N
3672                          mbcset, &alloc,
3673 #endif /* RE_ENABLE_I18N */
3674                          class_name, 0);
3675
3676   if (BE (ret != REG_NOERROR, 0))
3677     {
3678       re_free (sbcset);
3679 #ifdef RE_ENABLE_I18N
3680       free_charset (mbcset);
3681 #endif /* RE_ENABLE_I18N */
3682       *err = ret;
3683       return NULL;
3684     }
3685   /* \w match '_' also.  */
3686   for (; *extra; extra++)
3687     bitset_set (sbcset, *extra);
3688
3689   /* If it is non-matching list.  */
3690   if (non_match)
3691     bitset_not (sbcset);
3692
3693 #ifdef RE_ENABLE_I18N
3694   /* Ensure only single byte characters are set.  */
3695   if (dfa->mb_cur_max > 1)
3696     bitset_mask (sbcset, dfa->sb_char);
3697 #endif
3698
3699   /* Build a tree for simple bracket.  */
3700   br_token.type = SIMPLE_BRACKET;
3701   br_token.opr.sbcset = sbcset;
3702   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3703   if (BE (tree == NULL, 0))
3704     goto build_word_op_espace;
3705
3706 #ifdef RE_ENABLE_I18N
3707   if (dfa->mb_cur_max > 1)
3708     {
3709       bin_tree_t *mbc_tree;
3710       /* Build a tree for complex bracket.  */
3711       br_token.type = COMPLEX_BRACKET;
3712       br_token.opr.mbcset = mbcset;
3713       dfa->has_mb_node = 1;
3714       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3715       if (BE (mbc_tree == NULL, 0))
3716         goto build_word_op_espace;
3717       /* Then join them by ALT node.  */
3718       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3719       if (BE (mbc_tree != NULL, 1))
3720         return tree;
3721     }
3722   else
3723     {
3724       free_charset (mbcset);
3725       return tree;
3726     }
3727 #else /* not RE_ENABLE_I18N */
3728   return tree;
3729 #endif /* not RE_ENABLE_I18N */
3730
3731  build_word_op_espace:
3732   re_free (sbcset);
3733 #ifdef RE_ENABLE_I18N
3734   free_charset (mbcset);
3735 #endif /* RE_ENABLE_I18N */
3736   *err = REG_ESPACE;
3737   return NULL;
3738 }
3739
3740 /* This is intended for the expressions like "a{1,3}".
3741    Fetch a number from `input', and return the number.
3742    Return -1, if the number field is empty like "{,1}".
3743    Return -2, If an error is occured.  */
3744
3745 static int
3746 fetch_number (input, token, syntax)
3747      re_string_t *input;
3748      re_token_t *token;
3749      reg_syntax_t syntax;
3750 {
3751   int num = -1;
3752   unsigned char c;
3753   while (1)
3754     {
3755       fetch_token (token, input, syntax);
3756       c = token->opr.c;
3757       if (BE (token->type == END_OF_RE, 0))
3758         return -2;
3759       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3760         break;
3761       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3762              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3763       num = (num > RE_DUP_MAX) ? -2 : num;
3764     }
3765   return num;
3766 }
3767 \f
3768 #ifdef RE_ENABLE_I18N
3769 static void
3770 free_charset (re_charset_t *cset)
3771 {
3772   re_free (cset->mbchars);
3773 # ifdef _LIBC
3774   re_free (cset->coll_syms);
3775   re_free (cset->equiv_classes);
3776   re_free (cset->range_starts);
3777   re_free (cset->range_ends);
3778 # endif
3779   re_free (cset->char_classes);
3780   re_free (cset);
3781 }
3782 #endif /* RE_ENABLE_I18N */
3783 \f
3784 /* Functions for binary tree operation.  */
3785
3786 /* Create a tree node.  */
3787
3788 static bin_tree_t *
3789 create_tree (dfa, left, right, type)
3790      re_dfa_t *dfa;
3791      bin_tree_t *left;
3792      bin_tree_t *right;
3793      re_token_type_t type;
3794 {
3795   re_token_t t;
3796   t.type = type;
3797   return create_token_tree (dfa, left, right, &t);
3798 }
3799
3800 static bin_tree_t *
3801 create_token_tree (dfa, left, right, token)
3802      re_dfa_t *dfa;
3803      bin_tree_t *left;
3804      bin_tree_t *right;
3805      const re_token_t *token;
3806 {
3807   bin_tree_t *tree;
3808   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3809     {
3810       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3811
3812       if (storage == NULL)
3813         return NULL;
3814       storage->next = dfa->str_tree_storage;
3815       dfa->str_tree_storage = storage;
3816       dfa->str_tree_storage_idx = 0;
3817     }
3818   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3819
3820   tree->parent = NULL;
3821   tree->left = left;
3822   tree->right = right;
3823   tree->token = *token;
3824   tree->token.duplicated = 0;
3825   tree->token.opt_subexp = 0;
3826   tree->first = NULL;
3827   tree->next = NULL;
3828   tree->node_idx = -1;
3829
3830   if (left != NULL)
3831     left->parent = tree;
3832   if (right != NULL)
3833     right->parent = tree;
3834   return tree;
3835 }
3836
3837 /* Mark the tree SRC as an optional subexpression.
3838    To be called from preorder or postorder.  */
3839
3840 static reg_errcode_t
3841 mark_opt_subexp (extra, node)
3842      void *extra;
3843      bin_tree_t *node;
3844 {
3845   int idx = (int) (long) extra;
3846   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3847     node->token.opt_subexp = 1;
3848
3849   return REG_NOERROR;
3850 }
3851
3852 /* Free the allocated memory inside NODE. */
3853
3854 static void
3855 free_token (re_token_t *node)
3856 {
3857 #ifdef RE_ENABLE_I18N
3858   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3859     free_charset (node->opr.mbcset);
3860   else
3861 #endif /* RE_ENABLE_I18N */
3862     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3863       re_free (node->opr.sbcset);
3864 }
3865
3866 /* Worker function for tree walking.  Free the allocated memory inside NODE
3867    and its children. */
3868
3869 static reg_errcode_t
3870 free_tree (void *extra, bin_tree_t *node)
3871 {
3872   free_token (&node->token);
3873   return REG_NOERROR;
3874 }
3875
3876
3877 /* Duplicate the node SRC, and return new node.  This is a preorder
3878    visit similar to the one implemented by the generic visitor, but
3879    we need more infrastructure to maintain two parallel trees --- so,
3880    it's easier to duplicate.  */
3881
3882 static bin_tree_t *
3883 duplicate_tree (root, dfa)
3884      const bin_tree_t *root;
3885      re_dfa_t *dfa;
3886 {
3887   const bin_tree_t *node;
3888   bin_tree_t *dup_root;
3889   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3890
3891   for (node = root; ; )
3892     {
3893       /* Create a new tree and link it back to the current parent.  */
3894       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3895       if (*p_new == NULL)
3896         return NULL;
3897       (*p_new)->parent = dup_node;
3898       (*p_new)->token.duplicated = 1;
3899       dup_node = *p_new;
3900
3901       /* Go to the left node, or up and to the right.  */
3902       if (node->left)
3903         {
3904           node = node->left;
3905           p_new = &dup_node->left;
3906         }
3907       else
3908         {
3909           const bin_tree_t *prev = NULL;
3910           while (node->right == prev || node->right == NULL)
3911             {
3912               prev = node;
3913               node = node->parent;
3914               dup_node = dup_node->parent;
3915               if (!node)
3916                 return dup_root;
3917             }
3918           node = node->right;
3919           p_new = &dup_node->right;
3920         }
3921     }
3922 }