posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           int length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  27 static void init_word_char (re_dfa_t *dfa);
  28 #ifdef RE_ENABLE_I18N
  29 static void free_charset (re_charset_t *cset);
  30 #endif /* RE_ENABLE_I18N */
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 #ifdef RE_ENABLE_I18N
  34 static void optimize_utf8 (re_dfa_t *dfa);
  35 #endif
  36 static reg_errcode_t analyze (regex_t *preg);
  37 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  38 static reg_errcode_t preorder (bin_tree_t *root,
  39                                reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                void *extra);
  41 static reg_errcode_t postorder (bin_tree_t *root,
  42                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  43                                 void *extra);
  44 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  45 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  46 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  47                                  bin_tree_t *node);
  48 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  49 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  50 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  51 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
  52                                              int top_clone_node, int root_node,
  53                                              unsigned int constraint);
  54 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
  55                                      unsigned int constraint);
  56 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
  57                                    unsigned int constraint);
  58 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  59 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  60                                          int node, int root);
  61 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  62 static int fetch_number (re_string_t *input, re_token_t *token,
  63                          reg_syntax_t syntax);
  64 static void fetch_token (re_token_t *result, re_string_t *input,
  65                          reg_syntax_t syntax);
  66 static int peek_token (re_token_t *token, re_string_t *input,
  67                         reg_syntax_t syntax);
  68 static int peek_token_bracket (re_token_t *token, re_string_t *input,
  69                                reg_syntax_t syntax);
  70 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  71                           reg_syntax_t syntax, reg_errcode_t *err);
  72 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  73                                   re_token_t *token, reg_syntax_t syntax,
  74                                   int nest, reg_errcode_t *err);
  75 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  76                                  re_token_t *token, reg_syntax_t syntax,
  77                                  int nest, reg_errcode_t *err);
  78 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  79                                      re_token_t *token, reg_syntax_t syntax,
  80                                      int nest, reg_errcode_t *err);
  81 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  82                                   re_token_t *token, reg_syntax_t syntax,
  83                                   int nest, reg_errcode_t *err);
  84 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  85                                  re_dfa_t *dfa, re_token_t *token,
  86                                  reg_syntax_t syntax, reg_errcode_t *err);
  87 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  88                                       re_token_t *token, reg_syntax_t syntax,
  89                                       reg_errcode_t *err);
  90 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  91                                             re_string_t *regexp,
  92                                             re_token_t *token, int token_len,
  93                                             re_dfa_t *dfa,
  94                                             reg_syntax_t syntax,
  95                                             int accept_hyphen);
  96 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  97                                           re_string_t *regexp,
  98                                           re_token_t *token);
  99 #ifndef _LIBC
 100 # ifdef RE_ENABLE_I18N
 101 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 102                                       re_charset_t *mbcset, int *range_alloc,
 103                                       bracket_elem_t *start_elem,
 104                                       bracket_elem_t *end_elem);
 105 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 106                                              re_charset_t *mbcset,
 107                                              int *coll_sym_alloc,
 108                                              const unsigned char *name);
 109 # else /* not RE_ENABLE_I18N */
 110 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 111                                       bracket_elem_t *start_elem,
 112                                       bracket_elem_t *end_elem);
 113 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 114                                              const unsigned char *name);
 115 # endif /* not RE_ENABLE_I18N */
 116 #endif /* not _LIBC */
 117 #ifdef RE_ENABLE_I18N
 118 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 119                                         re_charset_t *mbcset,
 120                                         int *equiv_class_alloc,
 121                                         const unsigned char *name);
 122 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 123                                       re_bitset_ptr_t sbcset,
 124                                       re_charset_t *mbcset,
 125                                       int *char_class_alloc,
 126                                       const unsigned char *class_name,
 127                                       reg_syntax_t syntax);
 128 #else  /* not RE_ENABLE_I18N */
 129 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 130                                         const unsigned char *name);
 131 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 132                                       re_bitset_ptr_t sbcset,
 133                                       const unsigned char *class_name,
 134                                       reg_syntax_t syntax);
 135 #endif /* not RE_ENABLE_I18N */
 136 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 137                                        unsigned RE_TRANSLATE_TYPE trans,
 138                                        const unsigned char *class_name,
 139                                        const unsigned char *extra,
 140                                        int non_match, reg_errcode_t *err);
 141 static bin_tree_t *create_tree (re_dfa_t *dfa,
 142                                 bin_tree_t *left, bin_tree_t *right,
 143                                 re_token_type_t type);
 144 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 145                                       bin_tree_t *left, bin_tree_t *right,
 146                                       const re_token_t *token);
 147 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 148 static void free_token (re_token_t *node);
 149 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 150 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 151 \f
 152 /* This table gives an error message for each of the error codes listed
 153    in regex.h.  Obviously the order here has to be same as there.
 154    POSIX doesn't require that we do anything for REG_NOERROR,
 155    but why not be nice?  */
 156
 157 const char __re_error_msgid[] attribute_hidden =
 158   {
 159 #define REG_NOERROR_IDX 0
 160     gettext_noop ("Success")    /* REG_NOERROR */
 161     "\0"
 162 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 163     gettext_noop ("No match")   /* REG_NOMATCH */
 164     "\0"
 165 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 166     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 167     "\0"
 168 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 169     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 170     "\0"
 171 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 172     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 173     "\0"
 174 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 175     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 176     "\0"
 177 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 178     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 179     "\0"
 180 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 181     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 182     "\0"
 183 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 184     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 185     "\0"
 186 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 187     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 188     "\0"
 189 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 190     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 191     "\0"
 192 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 193     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 194     "\0"
 195 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 196     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 197     "\0"
 198 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 199     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 200     "\0"
 201 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 202     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 203     "\0"
 204 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 205     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 206     "\0"
 207 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 208     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 209   };
 210
 211 const size_t __re_error_msgid_idx[] attribute_hidden =
 212   {
 213     REG_NOERROR_IDX,
 214     REG_NOMATCH_IDX,
 215     REG_BADPAT_IDX,
 216     REG_ECOLLATE_IDX,
 217     REG_ECTYPE_IDX,
 218     REG_EESCAPE_IDX,
 219     REG_ESUBREG_IDX,
 220     REG_EBRACK_IDX,
 221     REG_EPAREN_IDX,
 222     REG_EBRACE_IDX,
 223     REG_BADBR_IDX,
 224     REG_ERANGE_IDX,
 225     REG_ESPACE_IDX,
 226     REG_BADRPT_IDX,
 227     REG_EEND_IDX,
 228     REG_ESIZE_IDX,
 229     REG_ERPAREN_IDX
 230   };
 231 \f
 232 /* Entry points for GNU code.  */
 233
 234 /* re_compile_pattern is the GNU regular expression compiler: it
 235    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 236    Returns 0 if the pattern was valid, otherwise an error string.
 237
 238    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 239    are set in BUFP on entry.  */
 240
 241 const char *
 242 re_compile_pattern (pattern, length, bufp)
 243     const char *pattern;
 244     size_t length;
 245     struct re_pattern_buffer *bufp;
 246 {
 247   reg_errcode_t ret;
 248
 249   /* And GNU code determines whether or not to get register information
 250      by passing null for the REGS argument to re_match, etc., not by
 251      setting no_sub, unless RE_NO_SUB is set.  */
 252   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 253
 254   /* Match anchors at newline.  */
 255   bufp->newline_anchor = 1;
 256
 257   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 258
 259   if (!ret)
 260     return NULL;
 261   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 262 }
 263 #ifdef _LIBC
 264 weak_alias (__re_compile_pattern, re_compile_pattern)
 265 #endif
 266
 267 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 268    also be assigned to arbitrarily: each pattern buffer stores its own
 269    syntax, so it can be changed between regex compilations.  */
 270 /* This has no initializer because initialized variables in Emacs
 271    become read-only after dumping.  */
 272 reg_syntax_t re_syntax_options;
 273
 274
 275 /* Specify the precise syntax of regexps for compilation.  This provides
 276    for compatibility for various utilities which historically have
 277    different, incompatible syntaxes.
 278
 279    The argument SYNTAX is a bit mask comprised of the various bits
 280    defined in regex.h.  We return the old syntax.  */
 281
 282 reg_syntax_t
 283 re_set_syntax (syntax)
 284     reg_syntax_t syntax;
 285 {
 286   reg_syntax_t ret = re_syntax_options;
 287
 288   re_syntax_options = syntax;
 289   return ret;
 290 }
 291 #ifdef _LIBC
 292 weak_alias (__re_set_syntax, re_set_syntax)
 293 #endif
 294
 295 int
 296 re_compile_fastmap (bufp)
 297     struct re_pattern_buffer *bufp;
 298 {
 299   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 300   char *fastmap = bufp->fastmap;
 301
 302   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 303   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 304   if (dfa->init_state != dfa->init_state_word)
 305     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 306   if (dfa->init_state != dfa->init_state_nl)
 307     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 308   if (dfa->init_state != dfa->init_state_begbuf)
 309     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 310   bufp->fastmap_accurate = 1;
 311   return 0;
 312 }
 313 #ifdef _LIBC
 314 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 315 #endif
 316
 317 static inline void
 318 __attribute ((always_inline))
 319 re_set_fastmap (char *fastmap, int icase, int ch)
 320 {
 321   fastmap[ch] = 1;
 322   if (icase)
 323     fastmap[tolower (ch)] = 1;
 324 }
 325
 326 /* Helper function for re_compile_fastmap.
 327    Compile fastmap for the initial_state INIT_STATE.  */
 328
 329 static void
 330 re_compile_fastmap_iter (bufp, init_state, fastmap)
 331      regex_t *bufp;
 332      const re_dfastate_t *init_state;
 333      char *fastmap;
 334 {
 335   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 336   int node_cnt;
 337   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 338   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 339     {
 340       int node = init_state->nodes.elems[node_cnt];
 341       re_token_type_t type = dfa->nodes[node].type;
 342
 343       if (type == CHARACTER)
 344         {
 345           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 346 #ifdef RE_ENABLE_I18N
 347           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 348             {
 349               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 350               wchar_t wc;
 351               mbstate_t state;
 352
 353               p = buf;
 354               *p++ = dfa->nodes[node].opr.c;
 355               while (++node < dfa->nodes_len
 356                      && dfa->nodes[node].type == CHARACTER
 357                      && dfa->nodes[node].mb_partial)
 358                 *p++ = dfa->nodes[node].opr.c;
 359               memset (&state, 0, sizeof (state));
 360               if (mbrtowc (&wc, (const char *) buf, p - buf,
 361                            &state) == p - buf
 362                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 363                       != (size_t) -1))
 364                 re_set_fastmap (fastmap, 0, buf[0]);
 365             }
 366 #endif
 367         }
 368       else if (type == SIMPLE_BRACKET)
 369         {
 370           int i, j, ch;
 371           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 372             for (j = 0; j < UINT_BITS; ++j, ++ch)
 373               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 374                 re_set_fastmap (fastmap, icase, ch);
 375         }
 376 #ifdef RE_ENABLE_I18N
 377       else if (type == COMPLEX_BRACKET)
 378         {
 379           int i;
 380           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 381           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 382               || cset->nranges || cset->nchar_classes)
 383             {
 384 # ifdef _LIBC
 385               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 386                 {
 387                   /* In this case we want to catch the bytes which are
 388                      the first byte of any collation elements.
 389                      e.g. In da_DK, we want to catch 'a' since "aa"
 390                           is a valid collation element, and don't catch
 391                           'b' since 'b' is the only collation element
 392                           which starts from 'b'.  */
 393                   int j, ch;
 394                   const int32_t *table = (const int32_t *)
 395                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 396                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 397                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 398                       if (table[ch] < 0)
 399                         re_set_fastmap (fastmap, icase, ch);
 400                 }
 401 # else
 402               if (dfa->mb_cur_max > 1)
 403                 for (i = 0; i < SBC_MAX; ++i)
 404                   if (__btowc (i) == WEOF)
 405                     re_set_fastmap (fastmap, icase, i);
 406 # endif /* not _LIBC */
 407             }
 408           for (i = 0; i < cset->nmbchars; ++i)
 409             {
 410               char buf[256];
 411               mbstate_t state;
 412               memset (&state, '\0', sizeof (state));
 413               if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 414                 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 415               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 416                 {
 417                   if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 418                       != (size_t) -1)
 419                     re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 420                 }
 421             }
 422         }
 423 #endif /* RE_ENABLE_I18N */
 424       else if (type == OP_PERIOD
 425 #ifdef RE_ENABLE_I18N
 426                || type == OP_UTF8_PERIOD
 427 #endif /* RE_ENABLE_I18N */
 428                || type == END_OF_RE)
 429         {
 430           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 431           if (type == END_OF_RE)
 432             bufp->can_be_null = 1;
 433           return;
 434         }
 435     }
 436 }
 437 \f
 438 /* Entry point for POSIX code.  */
 439 /* regcomp takes a regular expression as a string and compiles it.
 440
 441    PREG is a regex_t *.  We do not expect any fields to be initialized,
 442    since POSIX says we shouldn't.  Thus, we set
 443
 444      `buffer' to the compiled pattern;
 445      `used' to the length of the compiled pattern;
 446      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 447        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 448        RE_SYNTAX_POSIX_BASIC;
 449      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 450      `fastmap' to an allocated space for the fastmap;
 451      `fastmap_accurate' to zero;
 452      `re_nsub' to the number of subexpressions in PATTERN.
 453
 454    PATTERN is the address of the pattern string.
 455
 456    CFLAGS is a series of bits which affect compilation.
 457
 458      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 459      use POSIX basic syntax.
 460
 461      If REG_NEWLINE is set, then . and [^...] don't match newline.
 462      Also, regexec will try a match beginning after every newline.
 463
 464      If REG_ICASE is set, then we considers upper- and lowercase
 465      versions of letters to be equivalent when matching.
 466
 467      If REG_NOSUB is set, then when PREG is passed to regexec, that
 468      routine will report only success or failure, and nothing about the
 469      registers.
 470
 471    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 472    the return codes and their meanings.)  */
 473
 474 int
 475 regcomp (preg, pattern, cflags)
 476     regex_t *__restrict preg;
 477     const char *__restrict pattern;
 478     int cflags;
 479 {
 480   reg_errcode_t ret;
 481   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 482                          : RE_SYNTAX_POSIX_BASIC);
 483
 484   preg->buffer = NULL;
 485   preg->allocated = 0;
 486   preg->used = 0;
 487
 488   /* Try to allocate space for the fastmap.  */
 489   preg->fastmap = re_malloc (char, SBC_MAX);
 490   if (BE (preg->fastmap == NULL, 0))
 491     return REG_ESPACE;
 492
 493   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 494
 495   /* If REG_NEWLINE is set, newlines are treated differently.  */
 496   if (cflags & REG_NEWLINE)
 497     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 498       syntax &= ~RE_DOT_NEWLINE;
 499       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 500       /* It also changes the matching behavior.  */
 501       preg->newline_anchor = 1;
 502     }
 503   else
 504     preg->newline_anchor = 0;
 505   preg->no_sub = !!(cflags & REG_NOSUB);
 506   preg->translate = NULL;
 507
 508   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 509
 510   /* POSIX doesn't distinguish between an unmatched open-group and an
 511      unmatched close-group: both are REG_EPAREN.  */
 512   if (ret == REG_ERPAREN)
 513     ret = REG_EPAREN;
 514
 515   /* We have already checked preg->fastmap != NULL.  */
 516   if (BE (ret == REG_NOERROR, 1))
 517     /* Compute the fastmap now, since regexec cannot modify the pattern
 518        buffer.  This function never fails in this implementation.  */
 519     (void) re_compile_fastmap (preg);
 520   else
 521     {
 522       /* Some error occurred while compiling the expression.  */
 523       re_free (preg->fastmap);
 524       preg->fastmap = NULL;
 525     }
 526
 527   return (int) ret;
 528 }
 529 #ifdef _LIBC
 530 weak_alias (__regcomp, regcomp)
 531 #endif
 532
 533 /* Returns a message corresponding to an error code, ERRCODE, returned
 534    from either regcomp or regexec.   We don't use PREG here.  */
 535
 536 size_t
 537 regerror (errcode, preg, errbuf, errbuf_size)
 538     int errcode;
 539     const regex_t *preg;
 540     char *errbuf;
 541     size_t errbuf_size;
 542 {
 543   const char *msg;
 544   size_t msg_size;
 545
 546   if (BE (errcode < 0
 547           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 548                                / sizeof (__re_error_msgid_idx[0])), 0))
 549     /* Only error codes returned by the rest of the code should be passed
 550        to this routine.  If we are given anything else, or if other regex
 551        code generates an invalid error code, then the program has a bug.
 552        Dump core so we can fix it.  */
 553     abort ();
 554
 555   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 556
 557   msg_size = strlen (msg) + 1; /* Includes the null.  */
 558
 559   if (BE (errbuf_size != 0, 1))
 560     {
 561       if (BE (msg_size > errbuf_size, 0))
 562         {
 563 #if defined HAVE_MEMPCPY || defined _LIBC
 564           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 565 #else
 566           memcpy (errbuf, msg, errbuf_size - 1);
 567           errbuf[errbuf_size - 1] = 0;
 568 #endif
 569         }
 570       else
 571         memcpy (errbuf, msg, msg_size);
 572     }
 573
 574   return msg_size;
 575 }
 576 #ifdef _LIBC
 577 weak_alias (__regerror, regerror)
 578 #endif
 579
 580
 581 #ifdef RE_ENABLE_I18N
 582 /* This static array is used for the map to single-byte characters when
 583    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 584    it the same all the time.  UTF-8 is the preferred encoding so this is
 585    a worthwhile optimization.  */
 586 static const bitset utf8_sb_map =
 587 {
 588   /* Set the first 128 bits.  */
 589 # if UINT_MAX == 0xffffffff
 590   0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
 591 # else
 592 #  error "Add case for new unsigned int size"
 593 # endif
 594 };
 595 #endif
 596
 597
 598 static void
 599 free_dfa_content (re_dfa_t *dfa)
 600 {
 601   int i, j;
 602
 603   if (dfa->nodes)
 604     for (i = 0; i < dfa->nodes_len; ++i)
 605       free_token (dfa->nodes + i);
 606   re_free (dfa->nexts);
 607   for (i = 0; i < dfa->nodes_len; ++i)
 608     {
 609       if (dfa->eclosures != NULL)
 610         re_node_set_free (dfa->eclosures + i);
 611       if (dfa->inveclosures != NULL)
 612         re_node_set_free (dfa->inveclosures + i);
 613       if (dfa->edests != NULL)
 614         re_node_set_free (dfa->edests + i);
 615     }
 616   re_free (dfa->edests);
 617   re_free (dfa->eclosures);
 618   re_free (dfa->inveclosures);
 619   re_free (dfa->nodes);
 620
 621   if (dfa->state_table)
 622     for (i = 0; i <= dfa->state_hash_mask; ++i)
 623       {
 624         struct re_state_table_entry *entry = dfa->state_table + i;
 625         for (j = 0; j < entry->num; ++j)
 626           {
 627             re_dfastate_t *state = entry->array[j];
 628             free_state (state);
 629           }
 630         re_free (entry->array);
 631       }
 632   re_free (dfa->state_table);
 633 #ifdef RE_ENABLE_I18N
 634   if (dfa->sb_char != utf8_sb_map)
 635     re_free (dfa->sb_char);
 636 #endif
 637   re_free (dfa->subexp_map);
 638 #ifdef DEBUG
 639   re_free (dfa->re_str);
 640 #endif
 641
 642   re_free (dfa);
 643 }
 644
 645
 646 /* Free dynamically allocated space used by PREG.  */
 647
 648 void
 649 regfree (preg)
 650     regex_t *preg;
 651 {
 652   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 653   if (BE (dfa != NULL, 1))
 654     free_dfa_content (dfa);
 655   preg->buffer = NULL;
 656   preg->allocated = 0;
 657
 658   re_free (preg->fastmap);
 659   preg->fastmap = NULL;
 660
 661   re_free (preg->translate);
 662   preg->translate = NULL;
 663 }
 664 #ifdef _LIBC
 665 weak_alias (__regfree, regfree)
 666 #endif
 667 \f
 668 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 669    them unless specifically requested.  */
 670
 671 #if defined _REGEX_RE_COMP || defined _LIBC
 672
 673 /* BSD has one and only one pattern buffer.  */
 674 static struct re_pattern_buffer re_comp_buf;
 675
 676 char *
 677 # ifdef _LIBC
 678 /* Make these definitions weak in libc, so POSIX programs can redefine
 679    these names if they don't use our functions, and still use
 680    regcomp/regexec above without link errors.  */
 681 weak_function
 682 # endif
 683 re_comp (s)
 684      const char *s;
 685 {
 686   reg_errcode_t ret;
 687   char *fastmap;
 688
 689   if (!s)
 690     {
 691       if (!re_comp_buf.buffer)
 692         return gettext ("No previous regular expression");
 693       return 0;
 694     }
 695
 696   if (re_comp_buf.buffer)
 697     {
 698       fastmap = re_comp_buf.fastmap;
 699       re_comp_buf.fastmap = NULL;
 700       __regfree (&re_comp_buf);
 701       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 702       re_comp_buf.fastmap = fastmap;
 703     }
 704
 705   if (re_comp_buf.fastmap == NULL)
 706     {
 707       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 708       if (re_comp_buf.fastmap == NULL)
 709         return (char *) gettext (__re_error_msgid
 710                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 711     }
 712
 713   /* Since `re_exec' always passes NULL for the `regs' argument, we
 714      don't need to initialize the pattern buffer fields which affect it.  */
 715
 716   /* Match anchors at newlines.  */
 717   re_comp_buf.newline_anchor = 1;
 718
 719   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 720
 721   if (!ret)
 722     return NULL;
 723
 724   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 725   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 726 }
 727
 728 #ifdef _LIBC
 729 libc_freeres_fn (free_mem)
 730 {
 731   __regfree (&re_comp_buf);
 732 }
 733 #endif
 734
 735 #endif /* _REGEX_RE_COMP */
 736 \f
 737 /* Internal entry point.
 738    Compile the regular expression PATTERN, whose length is LENGTH.
 739    SYNTAX indicate regular expression's syntax.  */
 740
 741 static reg_errcode_t
 742 re_compile_internal (preg, pattern, length, syntax)
 743      regex_t *preg;
 744      const char * pattern;
 745      int length;
 746      reg_syntax_t syntax;
 747 {
 748   reg_errcode_t err = REG_NOERROR;
 749   re_dfa_t *dfa;
 750   re_string_t regexp;
 751
 752   /* Initialize the pattern buffer.  */
 753   preg->fastmap_accurate = 0;
 754   preg->syntax = syntax;
 755   preg->not_bol = preg->not_eol = 0;
 756   preg->used = 0;
 757   preg->re_nsub = 0;
 758   preg->can_be_null = 0;
 759   preg->regs_allocated = REGS_UNALLOCATED;
 760
 761   /* Initialize the dfa.  */
 762   dfa = (re_dfa_t *) preg->buffer;
 763   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 764     {
 765       /* If zero allocated, but buffer is non-null, try to realloc
 766          enough space.  This loses if buffer's address is bogus, but
 767          that is the user's responsibility.  If ->buffer is NULL this
 768          is a simple allocation.  */
 769       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 770       if (dfa == NULL)
 771         return REG_ESPACE;
 772       preg->allocated = sizeof (re_dfa_t);
 773       preg->buffer = (unsigned char *) dfa;
 774     }
 775   preg->used = sizeof (re_dfa_t);
 776
 777   err = init_dfa (dfa, length);
 778   if (BE (err != REG_NOERROR, 0))
 779     {
 780       free_dfa_content (dfa);
 781       preg->buffer = NULL;
 782       preg->allocated = 0;
 783       return err;
 784     }
 785 #ifdef DEBUG
 786   dfa->re_str = re_malloc (char, length + 1);
 787   strncpy (dfa->re_str, pattern, length + 1);
 788 #endif
 789
 790   err = re_string_construct (&regexp, pattern, length, preg->translate,
 791                              syntax & RE_ICASE, dfa);
 792   if (BE (err != REG_NOERROR, 0))
 793     {
 794     re_compile_internal_free_return:
 795       free_workarea_compile (preg);
 796       re_string_destruct (&regexp);
 797       free_dfa_content (dfa);
 798       preg->buffer = NULL;
 799       preg->allocated = 0;
 800       return err;
 801     }
 802
 803   /* Parse the regular expression, and build a structure tree.  */
 804   preg->re_nsub = 0;
 805   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 806   if (BE (dfa->str_tree == NULL, 0))
 807     goto re_compile_internal_free_return;
 808
 809   /* Analyze the tree and create the nfa.  */
 810   err = analyze (preg);
 811   if (BE (err != REG_NOERROR, 0))
 812     goto re_compile_internal_free_return;
 813
 814 #ifdef RE_ENABLE_I18N
 815   /* If possible, do searching in single byte encoding to speed things up.  */
 816   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 817     optimize_utf8 (dfa);
 818 #endif
 819
 820   /* Then create the initial state of the dfa.  */
 821   err = create_initial_state (dfa);
 822
 823   /* Release work areas.  */
 824   free_workarea_compile (preg);
 825   re_string_destruct (&regexp);
 826
 827   if (BE (err != REG_NOERROR, 0))
 828     {
 829       free_dfa_content (dfa);
 830       preg->buffer = NULL;
 831       preg->allocated = 0;
 832     }
 833
 834   return err;
 835 }
 836
 837 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 838    as the initial length of some arrays.  */
 839
 840 static reg_errcode_t
 841 init_dfa (dfa, pat_len)
 842      re_dfa_t *dfa;
 843      int pat_len;
 844 {
 845   int table_size;
 846 #ifndef _LIBC
 847   char *codeset_name;
 848 #endif
 849
 850   memset (dfa, '\0', sizeof (re_dfa_t));
 851
 852   /* Force allocation of str_tree_storage the first time.  */
 853   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 854
 855   dfa->nodes_alloc = pat_len + 1;
 856   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 857
 858   dfa->states_alloc = pat_len + 1;
 859
 860   /*  table_size = 2 ^ ceil(log pat_len) */
 861   for (table_size = 1; table_size > 0; table_size <<= 1)
 862     if (table_size > pat_len)
 863       break;
 864
 865   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 866   dfa->state_hash_mask = table_size - 1;
 867
 868   dfa->mb_cur_max = MB_CUR_MAX;
 869 #ifdef _LIBC
 870   if (dfa->mb_cur_max == 6
 871       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 872     dfa->is_utf8 = 1;
 873   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 874                        != 0);
 875 #else
 876 # ifdef HAVE_LANGINFO_CODESET
 877   codeset_name = nl_langinfo (CODESET);
 878 # else
 879   codeset_name = getenv ("LC_ALL");
 880   if (codeset_name == NULL || codeset_name[0] == '\0')
 881     codeset_name = getenv ("LC_CTYPE");
 882   if (codeset_name == NULL || codeset_name[0] == '\0')
 883     codeset_name = getenv ("LANG");
 884   if (codeset_name == NULL)
 885     codeset_name = "";
 886   else if (strchr (codeset_name, '.') !=  NULL)
 887     codeset_name = strchr (codeset_name, '.') + 1;
 888 # endif
 889
 890   if (strcasecmp (codeset_name, "UTF-8") == 0
 891       || strcasecmp (codeset_name, "UTF8") == 0)
 892     dfa->is_utf8 = 1;
 893
 894   /* We check exhaustively in the loop below if this charset is a
 895      superset of ASCII.  */
 896   dfa->map_notascii = 0;
 897 #endif
 898
 899 #ifdef RE_ENABLE_I18N
 900   if (dfa->mb_cur_max > 1)
 901     {
 902       if (dfa->is_utf8)
 903         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 904       else
 905         {
 906           int i, j, ch;
 907
 908           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 909           if (BE (dfa->sb_char == NULL, 0))
 910             return REG_ESPACE;
 911
 912           /* Clear all bits by, then set those corresponding to single
 913              byte chars.  */
 914           bitset_empty (dfa->sb_char);
 915
 916           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 917             for (j = 0; j < UINT_BITS; ++j, ++ch)
 918               {
 919                 wchar_t wch = __btowc (ch);
 920                 if (wch != WEOF)
 921                   dfa->sb_char[i] |= 1 << j;
 922 # ifndef _LIBC
 923                 if (isascii (ch) && wch != (wchar_t) ch)
 924                   dfa->map_notascii = 1;
 925 # endif
 926               }
 927         }
 928     }
 929 #endif
 930
 931   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 932     return REG_ESPACE;
 933   return REG_NOERROR;
 934 }
 935
 936 /* Initialize WORD_CHAR table, which indicate which character is
 937    "word".  In this case "word" means that it is the word construction
 938    character used by some operators like "\<", "\>", etc.  */
 939
 940 static void
 941 init_word_char (dfa)
 942      re_dfa_t *dfa;
 943 {
 944   int i, j, ch;
 945   dfa->word_ops_used = 1;
 946   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 947     for (j = 0; j < UINT_BITS; ++j, ++ch)
 948       if (isalnum (ch) || ch == '_')
 949         dfa->word_char[i] |= 1 << j;
 950 }
 951
 952 /* Free the work area which are only used while compiling.  */
 953
 954 static void
 955 free_workarea_compile (preg)
 956      regex_t *preg;
 957 {
 958   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 959   bin_tree_storage_t *storage, *next;
 960   for (storage = dfa->str_tree_storage; storage; storage = next)
 961     {
 962       next = storage->next;
 963       re_free (storage);
 964     }
 965   dfa->str_tree_storage = NULL;
 966   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 967   dfa->str_tree = NULL;
 968   re_free (dfa->org_indices);
 969   dfa->org_indices = NULL;
 970 }
 971
 972 /* Create initial states for all contexts.  */
 973
 974 static reg_errcode_t
 975 create_initial_state (dfa)
 976      re_dfa_t *dfa;
 977 {
 978   int first, i;
 979   reg_errcode_t err;
 980   re_node_set init_nodes;
 981
 982   /* Initial states have the epsilon closure of the node which is
 983      the first node of the regular expression.  */
 984   first = dfa->str_tree->first->node_idx;
 985   dfa->init_node = first;
 986   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 987   if (BE (err != REG_NOERROR, 0))
 988     return err;
 989
 990   /* The back-references which are in initial states can epsilon transit,
 991      since in this case all of the subexpressions can be null.
 992      Then we add epsilon closures of the nodes which are the next nodes of
 993      the back-references.  */
 994   if (dfa->nbackref > 0)
 995     for (i = 0; i < init_nodes.nelem; ++i)
 996       {
 997         int node_idx = init_nodes.elems[i];
 998         re_token_type_t type = dfa->nodes[node_idx].type;
 999
1000         int clexp_idx;
1001         if (type != OP_BACK_REF)
1002           continue;
1003         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1004           {
1005             re_token_t *clexp_node;
1006             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1007             if (clexp_node->type == OP_CLOSE_SUBEXP
1008                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1009               break;
1010           }
1011         if (clexp_idx == init_nodes.nelem)
1012           continue;
1013
1014         if (type == OP_BACK_REF)
1015           {
1016             int dest_idx = dfa->edests[node_idx].elems[0];
1017             if (!re_node_set_contains (&init_nodes, dest_idx))
1018               {
1019                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1020                 i = 0;
1021               }
1022           }
1023       }
1024
1025   /* It must be the first time to invoke acquire_state.  */
1026   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1027   /* We don't check ERR here, since the initial state must not be NULL.  */
1028   if (BE (dfa->init_state == NULL, 0))
1029     return err;
1030   if (dfa->init_state->has_constraint)
1031     {
1032       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1033                                                        CONTEXT_WORD);
1034       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1035                                                      CONTEXT_NEWLINE);
1036       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1037                                                          &init_nodes,
1038                                                          CONTEXT_NEWLINE
1039                                                          | CONTEXT_BEGBUF);
1040       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1041               || dfa->init_state_begbuf == NULL, 0))
1042         return err;
1043     }
1044   else
1045     dfa->init_state_word = dfa->init_state_nl
1046       = dfa->init_state_begbuf = dfa->init_state;
1047
1048   re_node_set_free (&init_nodes);
1049   return REG_NOERROR;
1050 }
1051 \f
1052 #ifdef RE_ENABLE_I18N
1053 /* If it is possible to do searching in single byte encoding instead of UTF-8
1054    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1055    DFA nodes where needed.  */
1056
1057 static void
1058 optimize_utf8 (dfa)
1059      re_dfa_t *dfa;
1060 {
1061   int node, i, mb_chars = 0, has_period = 0;
1062
1063   for (node = 0; node < dfa->nodes_len; ++node)
1064     switch (dfa->nodes[node].type)
1065       {
1066       case CHARACTER:
1067         if (dfa->nodes[node].opr.c >= 0x80)
1068           mb_chars = 1;
1069         break;
1070       case ANCHOR:
1071         switch (dfa->nodes[node].opr.idx)
1072           {
1073           case LINE_FIRST:
1074           case LINE_LAST:
1075           case BUF_FIRST:
1076           case BUF_LAST:
1077             break;
1078           default:
1079             /* Word anchors etc. cannot be handled.  */
1080             return;
1081           }
1082         break;
1083       case OP_PERIOD:
1084         has_period = 1;
1085         break;
1086       case OP_BACK_REF:
1087       case OP_ALT:
1088       case END_OF_RE:
1089       case OP_DUP_ASTERISK:
1090       case OP_OPEN_SUBEXP:
1091       case OP_CLOSE_SUBEXP:
1092         break;
1093       case COMPLEX_BRACKET:
1094         return;
1095       case SIMPLE_BRACKET:
1096         /* Just double check.  */
1097         for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1098           if (dfa->nodes[node].opr.sbcset[i])
1099             return;
1100         break;
1101       default:
1102         abort ();
1103       }
1104
1105   if (mb_chars || has_period)
1106     for (node = 0; node < dfa->nodes_len; ++node)
1107       {
1108         if (dfa->nodes[node].type == CHARACTER
1109             && dfa->nodes[node].opr.c >= 0x80)
1110           dfa->nodes[node].mb_partial = 0;
1111         else if (dfa->nodes[node].type == OP_PERIOD)
1112           dfa->nodes[node].type = OP_UTF8_PERIOD;
1113       }
1114
1115   /* The search can be in single byte locale.  */
1116   dfa->mb_cur_max = 1;
1117   dfa->is_utf8 = 0;
1118   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1119 }
1120 #endif
1121 \f
1122 /* Analyze the structure tree, and calculate "first", "next", "edest",
1123    "eclosure", and "inveclosure".  */
1124
1125 static reg_errcode_t
1126 analyze (preg)
1127      regex_t *preg;
1128 {
1129   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1130   reg_errcode_t ret;
1131
1132   /* Allocate arrays.  */
1133   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1134   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1135   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1136   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1137   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1138           || dfa->eclosures == NULL, 0))
1139     return REG_ESPACE;
1140
1141   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1142   if (dfa->subexp_map != NULL)
1143     {
1144       int i;
1145       for (i = 0; i < preg->re_nsub; i++)
1146         dfa->subexp_map[i] = i;
1147       preorder (dfa->str_tree, optimize_subexps, dfa);
1148       for (i = 0; i < preg->re_nsub; i++)
1149         if (dfa->subexp_map[i] != i)
1150           break;
1151       if (i == preg->re_nsub)
1152         {
1153           free (dfa->subexp_map);
1154           dfa->subexp_map = NULL;
1155         }
1156     }
1157
1158   ret = postorder (dfa->str_tree, lower_subexps, preg);
1159   if (BE (ret != REG_NOERROR, 0))
1160     return ret;
1161   ret = postorder (dfa->str_tree, calc_first, dfa);
1162   if (BE (ret != REG_NOERROR, 0))
1163     return ret;
1164   preorder (dfa->str_tree, calc_next, dfa);
1165   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1166   if (BE (ret != REG_NOERROR, 0))
1167     return ret;
1168   ret = calc_eclosure (dfa);
1169   if (BE (ret != REG_NOERROR, 0))
1170     return ret;
1171
1172   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1173      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1174   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1175       || dfa->nbackref)
1176     {
1177       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1178       if (BE (dfa->inveclosures == NULL, 0))
1179         return REG_ESPACE;
1180       ret = calc_inveclosure (dfa);
1181     }
1182
1183   return ret;
1184 }
1185
1186 /* Our parse trees are very unbalanced, so we cannot use a stack to
1187    implement parse tree visits.  Instead, we use parent pointers and
1188    some hairy code in these two functions.  */
1189 static reg_errcode_t
1190 postorder (root, fn, extra)
1191      bin_tree_t *root;
1192      reg_errcode_t (fn (void *, bin_tree_t *));
1193      void *extra;
1194 {
1195   bin_tree_t *node, *prev;
1196
1197   for (node = root; ; )
1198     {
1199       /* Descend down the tree, preferably to the left (or to the right
1200          if that's the only child).  */
1201       while (node->left || node->right)
1202         if (node->left)
1203           node = node->left;
1204         else
1205           node = node->right;
1206
1207       do
1208         {
1209           reg_errcode_t err = fn (extra, node);
1210           if (BE (err != REG_NOERROR, 0))
1211             return err;
1212           if (node->parent == NULL)
1213             return REG_NOERROR;
1214           prev = node;
1215           node = node->parent;
1216         }
1217       /* Go up while we have a node that is reached from the right.  */
1218       while (node->right == prev || node->right == NULL);
1219       node = node->right;
1220     }
1221 }
1222
1223 static reg_errcode_t
1224 preorder (root, fn, extra)
1225      bin_tree_t *root;
1226      reg_errcode_t (fn (void *, bin_tree_t *));
1227      void *extra;
1228 {
1229   bin_tree_t *node;
1230
1231   for (node = root; ; )
1232     {
1233       reg_errcode_t err = fn (extra, node);
1234       if (BE (err != REG_NOERROR, 0))
1235         return err;
1236
1237       /* Go to the left node, or up and to the right.  */
1238       if (node->left)
1239         node = node->left;
1240       else
1241         {
1242           bin_tree_t *prev = NULL;
1243           while (node->right == prev || node->right == NULL)
1244             {
1245               prev = node;
1246               node = node->parent;
1247               if (!node)
1248                 return REG_NOERROR;
1249             }
1250           node = node->right;
1251         }
1252     }
1253 }
1254
1255 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1256    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1257    backreferences as well.  Requires a preorder visit.  */
1258 static reg_errcode_t
1259 optimize_subexps (extra, node)
1260      void *extra;
1261      bin_tree_t *node;
1262 {
1263   re_dfa_t *dfa = (re_dfa_t *) extra;
1264
1265   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1266     {
1267       int idx = node->token.opr.idx;
1268       node->token.opr.idx = dfa->subexp_map[idx];
1269       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1270     }
1271
1272   else if (node->token.type == SUBEXP
1273            && node->left && node->left->token.type == SUBEXP)
1274     {
1275       int other_idx = node->left->token.opr.idx;
1276
1277       node->left = node->left->left;
1278       if (node->left)
1279         node->left->parent = node;
1280
1281       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1282       if (other_idx < 8 * sizeof (dfa->used_bkref_map))
1283         dfa->used_bkref_map &= ~(1 << other_idx);
1284     }
1285
1286   return REG_NOERROR;
1287 }
1288
1289 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1290    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1291 static reg_errcode_t
1292 lower_subexps (extra, node)
1293      void *extra;
1294      bin_tree_t *node;
1295 {
1296   regex_t *preg = (regex_t *) extra;
1297   reg_errcode_t err = REG_NOERROR;
1298
1299   if (node->left && node->left->token.type == SUBEXP)
1300     {
1301       node->left = lower_subexp (&err, preg, node->left);
1302       if (node->left)
1303         node->left->parent = node;
1304     }
1305   if (node->right && node->right->token.type == SUBEXP)
1306     {
1307       node->right = lower_subexp (&err, preg, node->right);
1308       if (node->right)
1309         node->right->parent = node;
1310     }
1311
1312   return err;
1313 }
1314
1315 static bin_tree_t *
1316 lower_subexp (err, preg, node)
1317      reg_errcode_t *err;
1318      regex_t *preg;
1319      bin_tree_t *node;
1320 {
1321   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1322   bin_tree_t *body = node->left;
1323   bin_tree_t *op, *cls, *tree1, *tree;
1324
1325   if (preg->no_sub
1326       /* We do not optimize empty subexpressions, because otherwise we may
1327          have bad CONCAT nodes with NULL children.  This is obviously not
1328          very common, so we do not lose much.  An example that triggers
1329          this case is the sed "script" /\(\)/x.  */
1330       && node->left != NULL
1331       && (node->token.opr.idx >= 8 * sizeof (dfa->used_bkref_map)
1332           || !(dfa->used_bkref_map & (1 << node->token.opr.idx))))
1333     return node->left;
1334
1335   /* Convert the SUBEXP node to the concatenation of an
1336      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1337   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1338   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1339   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1340   tree = create_tree (dfa, op, tree1, CONCAT);
1341   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1342     {
1343       *err = REG_ESPACE;
1344       return NULL;
1345     }
1346
1347   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1348   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1349   return tree;
1350 }
1351
1352 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1353    nodes.  Requires a postorder visit.  */
1354 static reg_errcode_t
1355 calc_first (extra, node)
1356      void *extra;
1357      bin_tree_t *node;
1358 {
1359   re_dfa_t *dfa = (re_dfa_t *) extra;
1360   if (node->token.type == CONCAT)
1361     {
1362       node->first = node->left->first;
1363       node->node_idx = node->left->node_idx;
1364     }
1365   else
1366     {
1367       node->first = node;
1368       node->node_idx = re_dfa_add_node (dfa, node->token);
1369       if (BE (node->node_idx == -1, 0))
1370         return REG_ESPACE;
1371     }
1372   return REG_NOERROR;
1373 }
1374
1375 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1376 static reg_errcode_t
1377 calc_next (extra, node)
1378      void *extra;
1379      bin_tree_t *node;
1380 {
1381   switch (node->token.type)
1382     {
1383     case OP_DUP_ASTERISK:
1384       node->left->next = node;
1385       break;
1386     case CONCAT:
1387       node->left->next = node->right->first;
1388       node->right->next = node->next;
1389       break;
1390     default:
1391       if (node->left)
1392         node->left->next = node->next;
1393       if (node->right)
1394         node->right->next = node->next;
1395       break;
1396     }
1397   return REG_NOERROR;
1398 }
1399
1400 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1401 static reg_errcode_t
1402 link_nfa_nodes (extra, node)
1403      void *extra;
1404      bin_tree_t *node;
1405 {
1406   re_dfa_t *dfa = (re_dfa_t *) extra;
1407   int idx = node->node_idx;
1408   reg_errcode_t err = REG_NOERROR;
1409
1410   switch (node->token.type)
1411     {
1412     case CONCAT:
1413       break;
1414
1415     case END_OF_RE:
1416       assert (node->next == NULL);
1417       break;
1418
1419     case OP_DUP_ASTERISK:
1420     case OP_ALT:
1421       {
1422         int left, right;
1423         dfa->has_plural_match = 1;
1424         if (node->left != NULL)
1425           left = node->left->first->node_idx;
1426         else
1427           left = node->next->node_idx;
1428         if (node->right != NULL)
1429           right = node->right->first->node_idx;
1430         else
1431           right = node->next->node_idx;
1432         assert (left > -1);
1433         assert (right > -1);
1434         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1435       }
1436       break;
1437
1438     case ANCHOR:
1439     case OP_OPEN_SUBEXP:
1440     case OP_CLOSE_SUBEXP:
1441       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1442       break;
1443
1444     case OP_BACK_REF:
1445       dfa->nexts[idx] = node->next->node_idx;
1446       if (node->token.type == OP_BACK_REF)
1447         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1448       break;
1449
1450     default:
1451       assert (!IS_EPSILON_NODE (node->token.type));
1452       dfa->nexts[idx] = node->next->node_idx;
1453       break;
1454     }
1455
1456   return err;
1457 }
1458
1459 /* Duplicate the epsilon closure of the node ROOT_NODE.
1460    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1461    to their own constraint.  */
1462
1463 static reg_errcode_t
1464 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1465                         init_constraint)
1466      re_dfa_t *dfa;
1467      int top_org_node, top_clone_node, root_node;
1468      unsigned int init_constraint;
1469 {
1470   reg_errcode_t err;
1471   int org_node, clone_node, ret;
1472   unsigned int constraint = init_constraint;
1473   for (org_node = top_org_node, clone_node = top_clone_node;;)
1474     {
1475       int org_dest, clone_dest;
1476       if (dfa->nodes[org_node].type == OP_BACK_REF)
1477         {
1478           /* If the back reference epsilon-transit, its destination must
1479              also have the constraint.  Then duplicate the epsilon closure
1480              of the destination of the back reference, and store it in
1481              edests of the back reference.  */
1482           org_dest = dfa->nexts[org_node];
1483           re_node_set_empty (dfa->edests + clone_node);
1484           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1485           if (BE (err != REG_NOERROR, 0))
1486             return err;
1487           dfa->nexts[clone_node] = dfa->nexts[org_node];
1488           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1489           if (BE (ret < 0, 0))
1490             return REG_ESPACE;
1491         }
1492       else if (dfa->edests[org_node].nelem == 0)
1493         {
1494           /* In case of the node can't epsilon-transit, don't duplicate the
1495              destination and store the original destination as the
1496              destination of the node.  */
1497           dfa->nexts[clone_node] = dfa->nexts[org_node];
1498           break;
1499         }
1500       else if (dfa->edests[org_node].nelem == 1)
1501         {
1502           /* In case of the node can epsilon-transit, and it has only one
1503              destination.  */
1504           org_dest = dfa->edests[org_node].elems[0];
1505           re_node_set_empty (dfa->edests + clone_node);
1506           if (dfa->nodes[org_node].type == ANCHOR)
1507             {
1508               /* In case of the node has another constraint, append it.  */
1509               if (org_node == root_node && clone_node != org_node)
1510                 {
1511                   /* ...but if the node is root_node itself, it means the
1512                      epsilon closure have a loop, then tie it to the
1513                      destination of the root_node.  */
1514                   ret = re_node_set_insert (dfa->edests + clone_node,
1515                                             org_dest);
1516                   if (BE (ret < 0, 0))
1517                     return REG_ESPACE;
1518                   break;
1519                 }
1520               constraint |= dfa->nodes[org_node].opr.ctx_type;
1521             }
1522           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1523           if (BE (err != REG_NOERROR, 0))
1524             return err;
1525           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1526           if (BE (ret < 0, 0))
1527             return REG_ESPACE;
1528         }
1529       else /* dfa->edests[org_node].nelem == 2 */
1530         {
1531           /* In case of the node can epsilon-transit, and it has two
1532              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1533           org_dest = dfa->edests[org_node].elems[0];
1534           re_node_set_empty (dfa->edests + clone_node);
1535           /* Search for a duplicated node which satisfies the constraint.  */
1536           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1537           if (clone_dest == -1)
1538             {
1539               /* There are no such a duplicated node, create a new one.  */
1540               err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1541               if (BE (err != REG_NOERROR, 0))
1542                 return err;
1543               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1544               if (BE (ret < 0, 0))
1545                 return REG_ESPACE;
1546               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1547                                             root_node, constraint);
1548               if (BE (err != REG_NOERROR, 0))
1549                 return err;
1550             }
1551           else
1552             {
1553               /* There are a duplicated node which satisfy the constraint,
1554                  use it to avoid infinite loop.  */
1555               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1556               if (BE (ret < 0, 0))
1557                 return REG_ESPACE;
1558             }
1559
1560           org_dest = dfa->edests[org_node].elems[1];
1561           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1562           if (BE (err != REG_NOERROR, 0))
1563             return err;
1564           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1565           if (BE (ret < 0, 0))
1566             return REG_ESPACE;
1567         }
1568       org_node = org_dest;
1569       clone_node = clone_dest;
1570     }
1571   return REG_NOERROR;
1572 }
1573
1574 /* Search for a node which is duplicated from the node ORG_NODE, and
1575    satisfies the constraint CONSTRAINT.  */
1576
1577 static int
1578 search_duplicated_node (dfa, org_node, constraint)
1579      re_dfa_t *dfa;
1580      int org_node;
1581      unsigned int constraint;
1582 {
1583   int idx;
1584   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1585     {
1586       if (org_node == dfa->org_indices[idx]
1587           && constraint == dfa->nodes[idx].constraint)
1588         return idx; /* Found.  */
1589     }
1590   return -1; /* Not found.  */
1591 }
1592
1593 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1594    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1595    otherwise return the error code.  */
1596
1597 static reg_errcode_t
1598 duplicate_node (new_idx, dfa, org_idx, constraint)
1599      re_dfa_t *dfa;
1600      int *new_idx, org_idx;
1601      unsigned int constraint;
1602 {
1603   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1604   if (BE (dup_idx == -1, 0))
1605     return REG_ESPACE;
1606   dfa->nodes[dup_idx].constraint = constraint;
1607   if (dfa->nodes[org_idx].type == ANCHOR)
1608     dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1609   dfa->nodes[dup_idx].duplicated = 1;
1610
1611   /* Store the index of the original node.  */
1612   dfa->org_indices[dup_idx] = org_idx;
1613   *new_idx = dup_idx;
1614   return REG_NOERROR;
1615 }
1616
1617 static reg_errcode_t
1618 calc_inveclosure (dfa)
1619      re_dfa_t *dfa;
1620 {
1621   int src, idx, ret;
1622   for (idx = 0; idx < dfa->nodes_len; ++idx)
1623     re_node_set_init_empty (dfa->inveclosures + idx);
1624
1625   for (src = 0; src < dfa->nodes_len; ++src)
1626     {
1627       int *elems = dfa->eclosures[src].elems;
1628       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1629         {
1630           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1631           if (BE (ret == -1, 0))
1632             return REG_ESPACE;
1633         }
1634     }
1635
1636   return REG_NOERROR;
1637 }
1638
1639 /* Calculate "eclosure" for all the node in DFA.  */
1640
1641 static reg_errcode_t
1642 calc_eclosure (dfa)
1643      re_dfa_t *dfa;
1644 {
1645   int node_idx, incomplete;
1646 #ifdef DEBUG
1647   assert (dfa->nodes_len > 0);
1648 #endif
1649   incomplete = 0;
1650   /* For each nodes, calculate epsilon closure.  */
1651   for (node_idx = 0; ; ++node_idx)
1652     {
1653       reg_errcode_t err;
1654       re_node_set eclosure_elem;
1655       if (node_idx == dfa->nodes_len)
1656         {
1657           if (!incomplete)
1658             break;
1659           incomplete = 0;
1660           node_idx = 0;
1661         }
1662
1663 #ifdef DEBUG
1664       assert (dfa->eclosures[node_idx].nelem != -1);
1665 #endif
1666
1667       /* If we have already calculated, skip it.  */
1668       if (dfa->eclosures[node_idx].nelem != 0)
1669         continue;
1670       /* Calculate epsilon closure of `node_idx'.  */
1671       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1672       if (BE (err != REG_NOERROR, 0))
1673         return err;
1674
1675       if (dfa->eclosures[node_idx].nelem == 0)
1676         {
1677           incomplete = 1;
1678           re_node_set_free (&eclosure_elem);
1679         }
1680     }
1681   return REG_NOERROR;
1682 }
1683
1684 /* Calculate epsilon closure of NODE.  */
1685
1686 static reg_errcode_t
1687 calc_eclosure_iter (new_set, dfa, node, root)
1688      re_node_set *new_set;
1689      re_dfa_t *dfa;
1690      int node, root;
1691 {
1692   reg_errcode_t err;
1693   unsigned int constraint;
1694   int i, incomplete;
1695   re_node_set eclosure;
1696   incomplete = 0;
1697   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1698   if (BE (err != REG_NOERROR, 0))
1699     return err;
1700
1701   /* This indicates that we are calculating this node now.
1702      We reference this value to avoid infinite loop.  */
1703   dfa->eclosures[node].nelem = -1;
1704
1705   constraint = ((dfa->nodes[node].type == ANCHOR)
1706                 ? dfa->nodes[node].opr.ctx_type : 0);
1707   /* If the current node has constraints, duplicate all nodes.
1708      Since they must inherit the constraints.  */
1709   if (constraint
1710       && dfa->edests[node].nelem
1711       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1712     {
1713       int org_node, cur_node;
1714       org_node = cur_node = node;
1715       err = duplicate_node_closure (dfa, node, node, node, constraint);
1716       if (BE (err != REG_NOERROR, 0))
1717         return err;
1718     }
1719
1720   /* Expand each epsilon destination nodes.  */
1721   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1722     for (i = 0; i < dfa->edests[node].nelem; ++i)
1723       {
1724         re_node_set eclosure_elem;
1725         int edest = dfa->edests[node].elems[i];
1726         /* If calculating the epsilon closure of `edest' is in progress,
1727            return intermediate result.  */
1728         if (dfa->eclosures[edest].nelem == -1)
1729           {
1730             incomplete = 1;
1731             continue;
1732           }
1733         /* If we haven't calculated the epsilon closure of `edest' yet,
1734            calculate now. Otherwise use calculated epsilon closure.  */
1735         if (dfa->eclosures[edest].nelem == 0)
1736           {
1737             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1738             if (BE (err != REG_NOERROR, 0))
1739               return err;
1740           }
1741         else
1742           eclosure_elem = dfa->eclosures[edest];
1743         /* Merge the epsilon closure of `edest'.  */
1744         re_node_set_merge (&eclosure, &eclosure_elem);
1745         /* If the epsilon closure of `edest' is incomplete,
1746            the epsilon closure of this node is also incomplete.  */
1747         if (dfa->eclosures[edest].nelem == 0)
1748           {
1749             incomplete = 1;
1750             re_node_set_free (&eclosure_elem);
1751           }
1752       }
1753
1754   /* Epsilon closures include itself.  */
1755   re_node_set_insert (&eclosure, node);
1756   if (incomplete && !root)
1757     dfa->eclosures[node].nelem = 0;
1758   else
1759     dfa->eclosures[node] = eclosure;
1760   *new_set = eclosure;
1761   return REG_NOERROR;
1762 }
1763 \f
1764 /* Functions for token which are used in the parser.  */
1765
1766 /* Fetch a token from INPUT.
1767    We must not use this function inside bracket expressions.  */
1768
1769 static void
1770 fetch_token (result, input, syntax)
1771      re_token_t *result;
1772      re_string_t *input;
1773      reg_syntax_t syntax;
1774 {
1775   re_string_skip_bytes (input, peek_token (result, input, syntax));
1776 }
1777
1778 /* Peek a token from INPUT, and return the length of the token.
1779    We must not use this function inside bracket expressions.  */
1780
1781 static int
1782 peek_token (token, input, syntax)
1783      re_token_t *token;
1784      re_string_t *input;
1785      reg_syntax_t syntax;
1786 {
1787   unsigned char c;
1788
1789   if (re_string_eoi (input))
1790     {
1791       token->type = END_OF_RE;
1792       return 0;
1793     }
1794
1795   c = re_string_peek_byte (input, 0);
1796   token->opr.c = c;
1797
1798   token->word_char = 0;
1799 #ifdef RE_ENABLE_I18N
1800   token->mb_partial = 0;
1801   if (input->mb_cur_max > 1 &&
1802       !re_string_first_byte (input, re_string_cur_idx (input)))
1803     {
1804       token->type = CHARACTER;
1805       token->mb_partial = 1;
1806       return 1;
1807     }
1808 #endif
1809   if (c == '\\')
1810     {
1811       unsigned char c2;
1812       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1813         {
1814           token->type = BACK_SLASH;
1815           return 1;
1816         }
1817
1818       c2 = re_string_peek_byte_case (input, 1);
1819       token->opr.c = c2;
1820       token->type = CHARACTER;
1821 #ifdef RE_ENABLE_I18N
1822       if (input->mb_cur_max > 1)
1823         {
1824           wint_t wc = re_string_wchar_at (input,
1825                                           re_string_cur_idx (input) + 1);
1826           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1827         }
1828       else
1829 #endif
1830         token->word_char = IS_WORD_CHAR (c2) != 0;
1831
1832       switch (c2)
1833         {
1834         case '|':
1835           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1836             token->type = OP_ALT;
1837           break;
1838         case '1': case '2': case '3': case '4': case '5':
1839         case '6': case '7': case '8': case '9':
1840           if (!(syntax & RE_NO_BK_REFS))
1841             {
1842               token->type = OP_BACK_REF;
1843               token->opr.idx = c2 - '1';
1844             }
1845           break;
1846         case '<':
1847           if (!(syntax & RE_NO_GNU_OPS))
1848             {
1849               token->type = ANCHOR;
1850               token->opr.ctx_type = WORD_FIRST;
1851             }
1852           break;
1853         case '>':
1854           if (!(syntax & RE_NO_GNU_OPS))
1855             {
1856               token->type = ANCHOR;
1857               token->opr.ctx_type = WORD_LAST;
1858             }
1859           break;
1860         case 'b':
1861           if (!(syntax & RE_NO_GNU_OPS))
1862             {
1863               token->type = ANCHOR;
1864               token->opr.ctx_type = WORD_DELIM;
1865             }
1866           break;
1867         case 'B':
1868           if (!(syntax & RE_NO_GNU_OPS))
1869             {
1870               token->type = ANCHOR;
1871               token->opr.ctx_type = NOT_WORD_DELIM;
1872             }
1873           break;
1874         case 'w':
1875           if (!(syntax & RE_NO_GNU_OPS))
1876             token->type = OP_WORD;
1877           break;
1878         case 'W':
1879           if (!(syntax & RE_NO_GNU_OPS))
1880             token->type = OP_NOTWORD;
1881           break;
1882         case 's':
1883           if (!(syntax & RE_NO_GNU_OPS))
1884             token->type = OP_SPACE;
1885           break;
1886         case 'S':
1887           if (!(syntax & RE_NO_GNU_OPS))
1888             token->type = OP_NOTSPACE;
1889           break;
1890         case '`':
1891           if (!(syntax & RE_NO_GNU_OPS))
1892             {
1893               token->type = ANCHOR;
1894               token->opr.ctx_type = BUF_FIRST;
1895             }
1896           break;
1897         case '\'':
1898           if (!(syntax & RE_NO_GNU_OPS))
1899             {
1900               token->type = ANCHOR;
1901               token->opr.ctx_type = BUF_LAST;
1902             }
1903           break;
1904         case '(':
1905           if (!(syntax & RE_NO_BK_PARENS))
1906             token->type = OP_OPEN_SUBEXP;
1907           break;
1908         case ')':
1909           if (!(syntax & RE_NO_BK_PARENS))
1910             token->type = OP_CLOSE_SUBEXP;
1911           break;
1912         case '+':
1913           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1914             token->type = OP_DUP_PLUS;
1915           break;
1916         case '?':
1917           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1918             token->type = OP_DUP_QUESTION;
1919           break;
1920         case '{':
1921           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1922             token->type = OP_OPEN_DUP_NUM;
1923           break;
1924         case '}':
1925           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1926             token->type = OP_CLOSE_DUP_NUM;
1927           break;
1928         default:
1929           break;
1930         }
1931       return 2;
1932     }
1933
1934   token->type = CHARACTER;
1935 #ifdef RE_ENABLE_I18N
1936   if (input->mb_cur_max > 1)
1937     {
1938       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1939       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1940     }
1941   else
1942 #endif
1943     token->word_char = IS_WORD_CHAR (token->opr.c);
1944
1945   switch (c)
1946     {
1947     case '\n':
1948       if (syntax & RE_NEWLINE_ALT)
1949         token->type = OP_ALT;
1950       break;
1951     case '|':
1952       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1953         token->type = OP_ALT;
1954       break;
1955     case '*':
1956       token->type = OP_DUP_ASTERISK;
1957       break;
1958     case '+':
1959       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1960         token->type = OP_DUP_PLUS;
1961       break;
1962     case '?':
1963       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1964         token->type = OP_DUP_QUESTION;
1965       break;
1966     case '{':
1967       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1968         token->type = OP_OPEN_DUP_NUM;
1969       break;
1970     case '}':
1971       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1972         token->type = OP_CLOSE_DUP_NUM;
1973       break;
1974     case '(':
1975       if (syntax & RE_NO_BK_PARENS)
1976         token->type = OP_OPEN_SUBEXP;
1977       break;
1978     case ')':
1979       if (syntax & RE_NO_BK_PARENS)
1980         token->type = OP_CLOSE_SUBEXP;
1981       break;
1982     case '[':
1983       token->type = OP_OPEN_BRACKET;
1984       break;
1985     case '.':
1986       token->type = OP_PERIOD;
1987       break;
1988     case '^':
1989       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1990           re_string_cur_idx (input) != 0)
1991         {
1992           char prev = re_string_peek_byte (input, -1);
1993           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1994             break;
1995         }
1996       token->type = ANCHOR;
1997       token->opr.ctx_type = LINE_FIRST;
1998       break;
1999     case '$':
2000       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
2001           re_string_cur_idx (input) + 1 != re_string_length (input))
2002         {
2003           re_token_t next;
2004           re_string_skip_bytes (input, 1);
2005           peek_token (&next, input, syntax);
2006           re_string_skip_bytes (input, -1);
2007           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2008             break;
2009         }
2010       token->type = ANCHOR;
2011       token->opr.ctx_type = LINE_LAST;
2012       break;
2013     default:
2014       break;
2015     }
2016   return 1;
2017 }
2018
2019 /* Peek a token from INPUT, and return the length of the token.
2020    We must not use this function out of bracket expressions.  */
2021
2022 static int
2023 peek_token_bracket (token, input, syntax)
2024      re_token_t *token;
2025      re_string_t *input;
2026      reg_syntax_t syntax;
2027 {
2028   unsigned char c;
2029   if (re_string_eoi (input))
2030     {
2031       token->type = END_OF_RE;
2032       return 0;
2033     }
2034   c = re_string_peek_byte (input, 0);
2035   token->opr.c = c;
2036
2037 #ifdef RE_ENABLE_I18N
2038   if (input->mb_cur_max > 1 &&
2039       !re_string_first_byte (input, re_string_cur_idx (input)))
2040     {
2041       token->type = CHARACTER;
2042       return 1;
2043     }
2044 #endif /* RE_ENABLE_I18N */
2045
2046   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2047       && re_string_cur_idx (input) + 1 < re_string_length (input))
2048     {
2049       /* In this case, '\' escape a character.  */
2050       unsigned char c2;
2051       re_string_skip_bytes (input, 1);
2052       c2 = re_string_peek_byte (input, 0);
2053       token->opr.c = c2;
2054       token->type = CHARACTER;
2055       return 1;
2056     }
2057   if (c == '[') /* '[' is a special char in a bracket exps.  */
2058     {
2059       unsigned char c2;
2060       int token_len;
2061       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2062         c2 = re_string_peek_byte (input, 1);
2063       else
2064         c2 = 0;
2065       token->opr.c = c2;
2066       token_len = 2;
2067       switch (c2)
2068         {
2069         case '.':
2070           token->type = OP_OPEN_COLL_ELEM;
2071           break;
2072         case '=':
2073           token->type = OP_OPEN_EQUIV_CLASS;
2074           break;
2075         case ':':
2076           if (syntax & RE_CHAR_CLASSES)
2077             {
2078               token->type = OP_OPEN_CHAR_CLASS;
2079               break;
2080             }
2081           /* else fall through.  */
2082         default:
2083           token->type = CHARACTER;
2084           token->opr.c = c;
2085           token_len = 1;
2086           break;
2087         }
2088       return token_len;
2089     }
2090   switch (c)
2091     {
2092     case '-':
2093       token->type = OP_CHARSET_RANGE;
2094       break;
2095     case ']':
2096       token->type = OP_CLOSE_BRACKET;
2097       break;
2098     case '^':
2099       token->type = OP_NON_MATCH_LIST;
2100       break;
2101     default:
2102       token->type = CHARACTER;
2103     }
2104   return 1;
2105 }
2106 \f
2107 /* Functions for parser.  */
2108
2109 /* Entry point of the parser.
2110    Parse the regular expression REGEXP and return the structure tree.
2111    If an error is occured, ERR is set by error code, and return NULL.
2112    This function build the following tree, from regular expression <reg_exp>:
2113            CAT
2114            / \
2115           /   \
2116    <reg_exp>  EOR
2117
2118    CAT means concatenation.
2119    EOR means end of regular expression.  */
2120
2121 static bin_tree_t *
2122 parse (regexp, preg, syntax, err)
2123      re_string_t *regexp;
2124      regex_t *preg;
2125      reg_syntax_t syntax;
2126      reg_errcode_t *err;
2127 {
2128   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2129   bin_tree_t *tree, *eor, *root;
2130   re_token_t current_token;
2131   dfa->syntax = syntax;
2132   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2133   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2134   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2135     return NULL;
2136   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2137   if (tree != NULL)
2138     root = create_tree (dfa, tree, eor, CONCAT);
2139   else
2140     root = eor;
2141   if (BE (eor == NULL || root == NULL, 0))
2142     {
2143       *err = REG_ESPACE;
2144       return NULL;
2145     }
2146   return root;
2147 }
2148
2149 /* This function build the following tree, from regular expression
2150    <branch1>|<branch2>:
2151            ALT
2152            / \
2153           /   \
2154    <branch1> <branch2>
2155
2156    ALT means alternative, which represents the operator `|'.  */
2157
2158 static bin_tree_t *
2159 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2160      re_string_t *regexp;
2161      regex_t *preg;
2162      re_token_t *token;
2163      reg_syntax_t syntax;
2164      int nest;
2165      reg_errcode_t *err;
2166 {
2167   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2168   bin_tree_t *tree, *branch = NULL;
2169   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2170   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2171     return NULL;
2172
2173   while (token->type == OP_ALT)
2174     {
2175       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2176       if (token->type != OP_ALT && token->type != END_OF_RE
2177           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2178         {
2179           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2180           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2181             return NULL;
2182         }
2183       else
2184         branch = NULL;
2185       tree = create_tree (dfa, tree, branch, OP_ALT);
2186       if (BE (tree == NULL, 0))
2187         {
2188           *err = REG_ESPACE;
2189           return NULL;
2190         }
2191     }
2192   return tree;
2193 }
2194
2195 /* This function build the following tree, from regular expression
2196    <exp1><exp2>:
2197         CAT
2198         / \
2199        /   \
2200    <exp1> <exp2>
2201
2202    CAT means concatenation.  */
2203
2204 static bin_tree_t *
2205 parse_branch (regexp, preg, token, syntax, nest, err)
2206      re_string_t *regexp;
2207      regex_t *preg;
2208      re_token_t *token;
2209      reg_syntax_t syntax;
2210      int nest;
2211      reg_errcode_t *err;
2212 {
2213   bin_tree_t *tree, *exp;
2214   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2215   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2216   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2217     return NULL;
2218
2219   while (token->type != OP_ALT && token->type != END_OF_RE
2220          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2221     {
2222       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2223       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2224         {
2225           return NULL;
2226         }
2227       if (tree != NULL && exp != NULL)
2228         {
2229           tree = create_tree (dfa, tree, exp, CONCAT);
2230           if (tree == NULL)
2231             {
2232               *err = REG_ESPACE;
2233               return NULL;
2234             }
2235         }
2236       else if (tree == NULL)
2237         tree = exp;
2238       /* Otherwise exp == NULL, we don't need to create new tree.  */
2239     }
2240   return tree;
2241 }
2242
2243 /* This function build the following tree, from regular expression a*:
2244          *
2245          |
2246          a
2247 */
2248
2249 static bin_tree_t *
2250 parse_expression (regexp, preg, token, syntax, nest, err)
2251      re_string_t *regexp;
2252      regex_t *preg;
2253      re_token_t *token;
2254      reg_syntax_t syntax;
2255      int nest;
2256      reg_errcode_t *err;
2257 {
2258   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2259   bin_tree_t *tree;
2260   switch (token->type)
2261     {
2262     case CHARACTER:
2263       tree = create_token_tree (dfa, NULL, NULL, token);
2264       if (BE (tree == NULL, 0))
2265         {
2266           *err = REG_ESPACE;
2267           return NULL;
2268         }
2269 #ifdef RE_ENABLE_I18N
2270       if (dfa->mb_cur_max > 1)
2271         {
2272           while (!re_string_eoi (regexp)
2273                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2274             {
2275               bin_tree_t *mbc_remain;
2276               fetch_token (token, regexp, syntax);
2277               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2278               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2279               if (BE (mbc_remain == NULL || tree == NULL, 0))
2280                 {
2281                   *err = REG_ESPACE;
2282                   return NULL;
2283                 }
2284             }
2285         }
2286 #endif
2287       break;
2288     case OP_OPEN_SUBEXP:
2289       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2290       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2291         return NULL;
2292       break;
2293     case OP_OPEN_BRACKET:
2294       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2295       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2296         return NULL;
2297       break;
2298     case OP_BACK_REF:
2299       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2300         {
2301           *err = REG_ESUBREG;
2302           return NULL;
2303         }
2304       dfa->used_bkref_map |= 1 << token->opr.idx;
2305       tree = create_token_tree (dfa, NULL, NULL, token);
2306       if (BE (tree == NULL, 0))
2307         {
2308           *err = REG_ESPACE;
2309           return NULL;
2310         }
2311       ++dfa->nbackref;
2312       dfa->has_mb_node = 1;
2313       break;
2314     case OP_OPEN_DUP_NUM:
2315       if (syntax & RE_CONTEXT_INVALID_DUP)
2316         {
2317           *err = REG_BADRPT;
2318           return NULL;
2319         }
2320       /* FALLTHROUGH */
2321     case OP_DUP_ASTERISK:
2322     case OP_DUP_PLUS:
2323     case OP_DUP_QUESTION:
2324       if (syntax & RE_CONTEXT_INVALID_OPS)
2325         {
2326           *err = REG_BADRPT;
2327           return NULL;
2328         }
2329       else if (syntax & RE_CONTEXT_INDEP_OPS)
2330         {
2331           fetch_token (token, regexp, syntax);
2332           return parse_expression (regexp, preg, token, syntax, nest, err);
2333         }
2334       /* else fall through  */
2335     case OP_CLOSE_SUBEXP:
2336       if ((token->type == OP_CLOSE_SUBEXP) &&
2337           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2338         {
2339           *err = REG_ERPAREN;
2340           return NULL;
2341         }
2342       /* else fall through  */
2343     case OP_CLOSE_DUP_NUM:
2344       /* We treat it as a normal character.  */
2345
2346       /* Then we can these characters as normal characters.  */
2347       token->type = CHARACTER;
2348       /* mb_partial and word_char bits should be initialized already
2349          by peek_token.  */
2350       tree = create_token_tree (dfa, NULL, NULL, token);
2351       if (BE (tree == NULL, 0))
2352         {
2353           *err = REG_ESPACE;
2354           return NULL;
2355         }
2356       break;
2357     case ANCHOR:
2358       if ((token->opr.ctx_type
2359            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2360           && dfa->word_ops_used == 0)
2361         init_word_char (dfa);
2362       if (token->opr.ctx_type == WORD_DELIM
2363           || token->opr.ctx_type == NOT_WORD_DELIM)
2364         {
2365           bin_tree_t *tree_first, *tree_last;
2366           if (token->opr.ctx_type == WORD_DELIM)
2367             {
2368               token->opr.ctx_type = WORD_FIRST;
2369               tree_first = create_token_tree (dfa, NULL, NULL, token);
2370               token->opr.ctx_type = WORD_LAST;
2371             }
2372           else
2373             {
2374               token->opr.ctx_type = INSIDE_WORD;
2375               tree_first = create_token_tree (dfa, NULL, NULL, token);
2376               token->opr.ctx_type = INSIDE_NOTWORD;
2377             }
2378           tree_last = create_token_tree (dfa, NULL, NULL, token);
2379           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2380           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2381             {
2382               *err = REG_ESPACE;
2383               return NULL;
2384             }
2385         }
2386       else
2387         {
2388           tree = create_token_tree (dfa, NULL, NULL, token);
2389           if (BE (tree == NULL, 0))
2390             {
2391               *err = REG_ESPACE;
2392               return NULL;
2393             }
2394         }
2395       /* We must return here, since ANCHORs can't be followed
2396          by repetition operators.
2397          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2398              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2399       fetch_token (token, regexp, syntax);
2400       return tree;
2401     case OP_PERIOD:
2402       tree = create_token_tree (dfa, NULL, NULL, token);
2403       if (BE (tree == NULL, 0))
2404         {
2405           *err = REG_ESPACE;
2406           return NULL;
2407         }
2408       if (dfa->mb_cur_max > 1)
2409         dfa->has_mb_node = 1;
2410       break;
2411     case OP_WORD:
2412     case OP_NOTWORD:
2413       tree = build_charclass_op (dfa, regexp->trans,
2414                                  (const unsigned char *) "alnum",
2415                                  (const unsigned char *) "_",
2416                                  token->type == OP_NOTWORD, err);
2417       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2418         return NULL;
2419       break;
2420     case OP_SPACE:
2421     case OP_NOTSPACE:
2422       tree = build_charclass_op (dfa, regexp->trans,
2423                                  (const unsigned char *) "space",
2424                                  (const unsigned char *) "",
2425                                  token->type == OP_NOTSPACE, err);
2426       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2427         return NULL;
2428       break;
2429     case OP_ALT:
2430     case END_OF_RE:
2431       return NULL;
2432     case BACK_SLASH:
2433       *err = REG_EESCAPE;
2434       return NULL;
2435     default:
2436       /* Must not happen?  */
2437 #ifdef DEBUG
2438       assert (0);
2439 #endif
2440       return NULL;
2441     }
2442   fetch_token (token, regexp, syntax);
2443
2444   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2445          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2446     {
2447       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2448       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2449         return NULL;
2450       /* In BRE consecutive duplications are not allowed.  */
2451       if ((syntax & RE_CONTEXT_INVALID_DUP)
2452           && (token->type == OP_DUP_ASTERISK
2453               || token->type == OP_OPEN_DUP_NUM))
2454         {
2455           *err = REG_BADRPT;
2456           return NULL;
2457         }
2458     }
2459
2460   return tree;
2461 }
2462
2463 /* This function build the following tree, from regular expression
2464    (<reg_exp>):
2465          SUBEXP
2466             |
2467         <reg_exp>
2468 */
2469
2470 static bin_tree_t *
2471 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2472      re_string_t *regexp;
2473      regex_t *preg;
2474      re_token_t *token;
2475      reg_syntax_t syntax;
2476      int nest;
2477      reg_errcode_t *err;
2478 {
2479   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2480   bin_tree_t *tree;
2481   size_t cur_nsub;
2482   cur_nsub = preg->re_nsub++;
2483
2484   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2485
2486   /* The subexpression may be a null string.  */
2487   if (token->type == OP_CLOSE_SUBEXP)
2488     tree = NULL;
2489   else
2490     {
2491       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2492       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2493         *err = REG_EPAREN;
2494       if (BE (*err != REG_NOERROR, 0))
2495         return NULL;
2496     }
2497   dfa->completed_bkref_map |= 1 << cur_nsub;
2498
2499   tree = create_tree (dfa, tree, NULL, SUBEXP);
2500   if (BE (tree == NULL, 0))
2501     {
2502       *err = REG_ESPACE;
2503       return NULL;
2504     }
2505   tree->token.opr.idx = cur_nsub;
2506   return tree;
2507 }
2508
2509 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2510
2511 static bin_tree_t *
2512 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2513      bin_tree_t *elem;
2514      re_string_t *regexp;
2515      re_dfa_t *dfa;
2516      re_token_t *token;
2517      reg_syntax_t syntax;
2518      reg_errcode_t *err;
2519 {
2520   bin_tree_t *tree = NULL, *old_tree = NULL;
2521   int i, start, end, start_idx = re_string_cur_idx (regexp);
2522   re_token_t start_token = *token;
2523
2524   if (token->type == OP_OPEN_DUP_NUM)
2525     {
2526       end = 0;
2527       start = fetch_number (regexp, token, syntax);
2528       if (start == -1)
2529         {
2530           if (token->type == CHARACTER && token->opr.c == ',')
2531             start = 0; /* We treat "{,m}" as "{0,m}".  */
2532           else
2533             {
2534               *err = REG_BADBR; /* <re>{} is invalid.  */
2535               return NULL;
2536             }
2537         }
2538       if (BE (start != -2, 1))
2539         {
2540           /* We treat "{n}" as "{n,n}".  */
2541           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2542                  : ((token->type == CHARACTER && token->opr.c == ',')
2543                     ? fetch_number (regexp, token, syntax) : -2));
2544         }
2545       if (BE (start == -2 || end == -2, 0))
2546         {
2547           /* Invalid sequence.  */
2548           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2549             {
2550               if (token->type == END_OF_RE)
2551                 *err = REG_EBRACE;
2552               else
2553                 *err = REG_BADBR;
2554
2555               return NULL;
2556             }
2557
2558           /* If the syntax bit is set, rollback.  */
2559           re_string_set_index (regexp, start_idx);
2560           *token = start_token;
2561           token->type = CHARACTER;
2562           /* mb_partial and word_char bits should be already initialized by
2563              peek_token.  */
2564           return elem;
2565         }
2566
2567       if (BE (end != -1 && start > end, 0))
2568         {
2569           /* First number greater than second.  */
2570           *err = REG_BADBR;
2571           return NULL;
2572         }
2573     }
2574   else
2575     {
2576       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2577       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2578     }
2579
2580   fetch_token (token, regexp, syntax);
2581
2582   if (BE (elem == NULL, 0))
2583     return NULL;
2584   if (BE (start == 0 && end == 0, 0))
2585     {
2586       postorder (elem, free_tree, NULL);
2587       return NULL;
2588     }
2589
2590   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2591   if (BE (start > 0, 0))
2592     {
2593       tree = elem;
2594       for (i = 2; i <= start; ++i)
2595         {
2596           elem = duplicate_tree (elem, dfa);
2597           tree = create_tree (dfa, tree, elem, CONCAT);
2598           if (BE (elem == NULL || tree == NULL, 0))
2599             goto parse_dup_op_espace;
2600         }
2601
2602       if (start == end)
2603         return tree;
2604
2605       /* Duplicate ELEM before it is marked optional.  */
2606       elem = duplicate_tree (elem, dfa);
2607       old_tree = tree;
2608     }
2609   else
2610     old_tree = NULL;
2611
2612   if (elem->token.type == SUBEXP)
2613     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2614
2615   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2616   if (BE (tree == NULL, 0))
2617     goto parse_dup_op_espace;
2618
2619   /* This loop is actually executed only when end != -1,
2620      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2621      already created the start+1-th copy.  */
2622   for (i = start + 2; i <= end; ++i)
2623     {
2624       elem = duplicate_tree (elem, dfa);
2625       tree = create_tree (dfa, tree, elem, CONCAT);
2626       if (BE (elem == NULL || tree == NULL, 0))
2627         goto parse_dup_op_espace;
2628
2629       tree = create_tree (dfa, tree, NULL, OP_ALT);
2630       if (BE (tree == NULL, 0))
2631         goto parse_dup_op_espace;
2632     }
2633
2634   if (old_tree)
2635     tree = create_tree (dfa, old_tree, tree, CONCAT);
2636
2637   return tree;
2638
2639  parse_dup_op_espace:
2640   *err = REG_ESPACE;
2641   return NULL;
2642 }
2643
2644 /* Size of the names for collating symbol/equivalence_class/character_class.
2645    I'm not sure, but maybe enough.  */
2646 #define BRACKET_NAME_BUF_SIZE 32
2647
2648 #ifndef _LIBC
2649   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2650      Build the range expression which starts from START_ELEM, and ends
2651      at END_ELEM.  The result are written to MBCSET and SBCSET.
2652      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2653      mbcset->range_ends, is a pointer argument sinse we may
2654      update it.  */
2655
2656 static reg_errcode_t
2657 # ifdef RE_ENABLE_I18N
2658 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2659      re_charset_t *mbcset;
2660      int *range_alloc;
2661 # else /* not RE_ENABLE_I18N */
2662 build_range_exp (sbcset, start_elem, end_elem)
2663 # endif /* not RE_ENABLE_I18N */
2664      re_bitset_ptr_t sbcset;
2665      bracket_elem_t *start_elem, *end_elem;
2666 {
2667   unsigned int start_ch, end_ch;
2668   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2669   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2670           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2671           0))
2672     return REG_ERANGE;
2673
2674   /* We can handle no multi character collating elements without libc
2675      support.  */
2676   if (BE ((start_elem->type == COLL_SYM
2677            && strlen ((char *) start_elem->opr.name) > 1)
2678           || (end_elem->type == COLL_SYM
2679               && strlen ((char *) end_elem->opr.name) > 1), 0))
2680     return REG_ECOLLATE;
2681
2682 # ifdef RE_ENABLE_I18N
2683   {
2684     wchar_t wc, start_wc, end_wc;
2685     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2686
2687     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2688                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2689                    : 0));
2690     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2691               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2692                  : 0));
2693     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2694                 ? __btowc (start_ch) : start_elem->opr.wch);
2695     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2696               ? __btowc (end_ch) : end_elem->opr.wch);
2697     if (start_wc == WEOF || end_wc == WEOF)
2698       return REG_ECOLLATE;
2699     cmp_buf[0] = start_wc;
2700     cmp_buf[4] = end_wc;
2701     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2702       return REG_ERANGE;
2703
2704     /* Got valid collation sequence values, add them as a new entry.
2705        However, for !_LIBC we have no collation elements: if the
2706        character set is single byte, the single byte character set
2707        that we build below suffices.  parse_bracket_exp passes
2708        no MBCSET if dfa->mb_cur_max == 1.  */
2709     if (mbcset)
2710       {
2711         /* Check the space of the arrays.  */
2712         if (BE (*range_alloc == mbcset->nranges, 0))
2713           {
2714             /* There is not enough space, need realloc.  */
2715             wchar_t *new_array_start, *new_array_end;
2716             int new_nranges;
2717
2718             /* +1 in case of mbcset->nranges is 0.  */
2719             new_nranges = 2 * mbcset->nranges + 1;
2720             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2721                are NULL if *range_alloc == 0.  */
2722             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2723                                           new_nranges);
2724             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2725                                         new_nranges);
2726
2727             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2728               return REG_ESPACE;
2729
2730             mbcset->range_starts = new_array_start;
2731             mbcset->range_ends = new_array_end;
2732             *range_alloc = new_nranges;
2733           }
2734
2735         mbcset->range_starts[mbcset->nranges] = start_wc;
2736         mbcset->range_ends[mbcset->nranges++] = end_wc;
2737       }
2738
2739     /* Build the table for single byte characters.  */
2740     for (wc = 0; wc < SBC_MAX; ++wc)
2741       {
2742         cmp_buf[2] = wc;
2743         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2744             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2745           bitset_set (sbcset, wc);
2746       }
2747   }
2748 # else /* not RE_ENABLE_I18N */
2749   {
2750     unsigned int ch;
2751     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2752                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2753                    : 0));
2754     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2755               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2756                  : 0));
2757     if (start_ch > end_ch)
2758       return REG_ERANGE;
2759     /* Build the table for single byte characters.  */
2760     for (ch = 0; ch < SBC_MAX; ++ch)
2761       if (start_ch <= ch  && ch <= end_ch)
2762         bitset_set (sbcset, ch);
2763   }
2764 # endif /* not RE_ENABLE_I18N */
2765   return REG_NOERROR;
2766 }
2767 #endif /* not _LIBC */
2768
2769 #ifndef _LIBC
2770 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2771    Build the collating element which is represented by NAME.
2772    The result are written to MBCSET and SBCSET.
2773    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2774    pointer argument since we may update it.  */
2775
2776 static reg_errcode_t
2777 # ifdef RE_ENABLE_I18N
2778 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2779      re_charset_t *mbcset;
2780      int *coll_sym_alloc;
2781 # else /* not RE_ENABLE_I18N */
2782 build_collating_symbol (sbcset, name)
2783 # endif /* not RE_ENABLE_I18N */
2784      re_bitset_ptr_t sbcset;
2785      const unsigned char *name;
2786 {
2787   size_t name_len = strlen ((const char *) name);
2788   if (BE (name_len != 1, 0))
2789     return REG_ECOLLATE;
2790   else
2791     {
2792       bitset_set (sbcset, name[0]);
2793       return REG_NOERROR;
2794     }
2795 }
2796 #endif /* not _LIBC */
2797
2798 /* This function parse bracket expression like "[abc]", "[a-c]",
2799    "[[.a-a.]]" etc.  */
2800
2801 static bin_tree_t *
2802 parse_bracket_exp (regexp, dfa, token, syntax, err)
2803      re_string_t *regexp;
2804      re_dfa_t *dfa;
2805      re_token_t *token;
2806      reg_syntax_t syntax;
2807      reg_errcode_t *err;
2808 {
2809 #ifdef _LIBC
2810   const unsigned char *collseqmb;
2811   const char *collseqwc;
2812   uint32_t nrules;
2813   int32_t table_size;
2814   const int32_t *symb_table;
2815   const unsigned char *extra;
2816
2817   /* Local function for parse_bracket_exp used in _LIBC environement.
2818      Seek the collating symbol entry correspondings to NAME.
2819      Return the index of the symbol in the SYMB_TABLE.  */
2820
2821   auto inline int32_t
2822   __attribute ((always_inline))
2823   seek_collating_symbol_entry (name, name_len)
2824          const unsigned char *name;
2825          size_t name_len;
2826     {
2827       int32_t hash = elem_hash ((const char *) name, name_len);
2828       int32_t elem = hash % table_size;
2829       int32_t second = hash % (table_size - 2);
2830       while (symb_table[2 * elem] != 0)
2831         {
2832           /* First compare the hashing value.  */
2833           if (symb_table[2 * elem] == hash
2834               /* Compare the length of the name.  */
2835               && name_len == extra[symb_table[2 * elem + 1]]
2836               /* Compare the name.  */
2837               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2838                          name_len) == 0)
2839             {
2840               /* Yep, this is the entry.  */
2841               break;
2842             }
2843
2844           /* Next entry.  */
2845           elem += second;
2846         }
2847       return elem;
2848     }
2849
2850   /* Local function for parse_bracket_exp used in _LIBC environement.
2851      Look up the collation sequence value of BR_ELEM.
2852      Return the value if succeeded, UINT_MAX otherwise.  */
2853
2854   auto inline unsigned int
2855   __attribute ((always_inline))
2856   lookup_collation_sequence_value (br_elem)
2857          bracket_elem_t *br_elem;
2858     {
2859       if (br_elem->type == SB_CHAR)
2860         {
2861           /*
2862           if (MB_CUR_MAX == 1)
2863           */
2864           if (nrules == 0)
2865             return collseqmb[br_elem->opr.ch];
2866           else
2867             {
2868               wint_t wc = __btowc (br_elem->opr.ch);
2869               return __collseq_table_lookup (collseqwc, wc);
2870             }
2871         }
2872       else if (br_elem->type == MB_CHAR)
2873         {
2874           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2875         }
2876       else if (br_elem->type == COLL_SYM)
2877         {
2878           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2879           if (nrules != 0)
2880             {
2881               int32_t elem, idx;
2882               elem = seek_collating_symbol_entry (br_elem->opr.name,
2883                                                   sym_name_len);
2884               if (symb_table[2 * elem] != 0)
2885                 {
2886                   /* We found the entry.  */
2887                   idx = symb_table[2 * elem + 1];
2888                   /* Skip the name of collating element name.  */
2889                   idx += 1 + extra[idx];
2890                   /* Skip the byte sequence of the collating element.  */
2891                   idx += 1 + extra[idx];
2892                   /* Adjust for the alignment.  */
2893                   idx = (idx + 3) & ~3;
2894                   /* Skip the multibyte collation sequence value.  */
2895                   idx += sizeof (unsigned int);
2896                   /* Skip the wide char sequence of the collating element.  */
2897                   idx += sizeof (unsigned int) *
2898                     (1 + *(unsigned int *) (extra + idx));
2899                   /* Return the collation sequence value.  */
2900                   return *(unsigned int *) (extra + idx);
2901                 }
2902               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2903                 {
2904                   /* No valid character.  Match it as a single byte
2905                      character.  */
2906                   return collseqmb[br_elem->opr.name[0]];
2907                 }
2908             }
2909           else if (sym_name_len == 1)
2910             return collseqmb[br_elem->opr.name[0]];
2911         }
2912       return UINT_MAX;
2913     }
2914
2915   /* Local function for parse_bracket_exp used in _LIBC environement.
2916      Build the range expression which starts from START_ELEM, and ends
2917      at END_ELEM.  The result are written to MBCSET and SBCSET.
2918      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2919      mbcset->range_ends, is a pointer argument sinse we may
2920      update it.  */
2921
2922   auto inline reg_errcode_t
2923   __attribute ((always_inline))
2924   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2925          re_charset_t *mbcset;
2926          int *range_alloc;
2927          re_bitset_ptr_t sbcset;
2928          bracket_elem_t *start_elem, *end_elem;
2929     {
2930       unsigned int ch;
2931       uint32_t start_collseq;
2932       uint32_t end_collseq;
2933
2934       /* Equivalence Classes and Character Classes can't be a range
2935          start/end.  */
2936       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2937               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2938               0))
2939         return REG_ERANGE;
2940
2941       start_collseq = lookup_collation_sequence_value (start_elem);
2942       end_collseq = lookup_collation_sequence_value (end_elem);
2943       /* Check start/end collation sequence values.  */
2944       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2945         return REG_ECOLLATE;
2946       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2947         return REG_ERANGE;
2948
2949       /* Got valid collation sequence values, add them as a new entry.
2950          However, if we have no collation elements, and the character set
2951          is single byte, the single byte character set that we
2952          build below suffices. */
2953       if (nrules > 0 || dfa->mb_cur_max > 1)
2954         {
2955           /* Check the space of the arrays.  */
2956           if (BE (*range_alloc == mbcset->nranges, 0))
2957             {
2958               /* There is not enough space, need realloc.  */
2959               uint32_t *new_array_start;
2960               uint32_t *new_array_end;
2961               int new_nranges;
2962
2963               /* +1 in case of mbcset->nranges is 0.  */
2964               new_nranges = 2 * mbcset->nranges + 1;
2965               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2966                                             new_nranges);
2967               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2968                                           new_nranges);
2969
2970               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2971                 return REG_ESPACE;
2972
2973               mbcset->range_starts = new_array_start;
2974               mbcset->range_ends = new_array_end;
2975               *range_alloc = new_nranges;
2976             }
2977
2978           mbcset->range_starts[mbcset->nranges] = start_collseq;
2979           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2980         }
2981
2982       /* Build the table for single byte characters.  */
2983       for (ch = 0; ch < SBC_MAX; ch++)
2984         {
2985           uint32_t ch_collseq;
2986           /*
2987           if (MB_CUR_MAX == 1)
2988           */
2989           if (nrules == 0)
2990             ch_collseq = collseqmb[ch];
2991           else
2992             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2993           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2994             bitset_set (sbcset, ch);
2995         }
2996       return REG_NOERROR;
2997     }
2998
2999   /* Local function for parse_bracket_exp used in _LIBC environement.
3000      Build the collating element which is represented by NAME.
3001      The result are written to MBCSET and SBCSET.
3002      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3003      pointer argument sinse we may update it.  */
3004
3005   auto inline reg_errcode_t
3006   __attribute ((always_inline))
3007   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3008          re_charset_t *mbcset;
3009          int *coll_sym_alloc;
3010          re_bitset_ptr_t sbcset;
3011          const unsigned char *name;
3012     {
3013       int32_t elem, idx;
3014       size_t name_len = strlen ((const char *) name);
3015       if (nrules != 0)
3016         {
3017           elem = seek_collating_symbol_entry (name, name_len);
3018           if (symb_table[2 * elem] != 0)
3019             {
3020               /* We found the entry.  */
3021               idx = symb_table[2 * elem + 1];
3022               /* Skip the name of collating element name.  */
3023               idx += 1 + extra[idx];
3024             }
3025           else if (symb_table[2 * elem] == 0 && name_len == 1)
3026             {
3027               /* No valid character, treat it as a normal
3028                  character.  */
3029               bitset_set (sbcset, name[0]);
3030               return REG_NOERROR;
3031             }
3032           else
3033             return REG_ECOLLATE;
3034
3035           /* Got valid collation sequence, add it as a new entry.  */
3036           /* Check the space of the arrays.  */
3037           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3038             {
3039               /* Not enough, realloc it.  */
3040               /* +1 in case of mbcset->ncoll_syms is 0.  */
3041               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3042               /* Use realloc since mbcset->coll_syms is NULL
3043                  if *alloc == 0.  */
3044               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3045                                                    new_coll_sym_alloc);
3046               if (BE (new_coll_syms == NULL, 0))
3047                 return REG_ESPACE;
3048               mbcset->coll_syms = new_coll_syms;
3049               *coll_sym_alloc = new_coll_sym_alloc;
3050             }
3051           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3052           return REG_NOERROR;
3053         }
3054       else
3055         {
3056           if (BE (name_len != 1, 0))
3057             return REG_ECOLLATE;
3058           else
3059             {
3060               bitset_set (sbcset, name[0]);
3061               return REG_NOERROR;
3062             }
3063         }
3064     }
3065 #endif
3066
3067   re_token_t br_token;
3068   re_bitset_ptr_t sbcset;
3069 #ifdef RE_ENABLE_I18N
3070   re_charset_t *mbcset;
3071   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3072   int equiv_class_alloc = 0, char_class_alloc = 0;
3073 #endif /* not RE_ENABLE_I18N */
3074   int non_match = 0;
3075   bin_tree_t *work_tree;
3076   int token_len;
3077   int first_round = 1;
3078 #ifdef _LIBC
3079   collseqmb = (const unsigned char *)
3080     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3081   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3082   if (nrules)
3083     {
3084       /*
3085       if (MB_CUR_MAX > 1)
3086       */
3087         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3088       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3089       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3090                                                   _NL_COLLATE_SYMB_TABLEMB);
3091       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3092                                                    _NL_COLLATE_SYMB_EXTRAMB);
3093     }
3094 #endif
3095   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3096 #ifdef RE_ENABLE_I18N
3097   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3098 #endif /* RE_ENABLE_I18N */
3099 #ifdef RE_ENABLE_I18N
3100   if (BE (sbcset == NULL || mbcset == NULL, 0))
3101 #else
3102   if (BE (sbcset == NULL, 0))
3103 #endif /* RE_ENABLE_I18N */
3104     {
3105       *err = REG_ESPACE;
3106       return NULL;
3107     }
3108
3109   token_len = peek_token_bracket (token, regexp, syntax);
3110   if (BE (token->type == END_OF_RE, 0))
3111     {
3112       *err = REG_BADPAT;
3113       goto parse_bracket_exp_free_return;
3114     }
3115   if (token->type == OP_NON_MATCH_LIST)
3116     {
3117 #ifdef RE_ENABLE_I18N
3118       mbcset->non_match = 1;
3119 #endif /* not RE_ENABLE_I18N */
3120       non_match = 1;
3121       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3122         bitset_set (sbcset, '\0');
3123       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3124       token_len = peek_token_bracket (token, regexp, syntax);
3125       if (BE (token->type == END_OF_RE, 0))
3126         {
3127           *err = REG_BADPAT;
3128           goto parse_bracket_exp_free_return;
3129         }
3130     }
3131
3132   /* We treat the first ']' as a normal character.  */
3133   if (token->type == OP_CLOSE_BRACKET)
3134     token->type = CHARACTER;
3135
3136   while (1)
3137     {
3138       bracket_elem_t start_elem, end_elem;
3139       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3140       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3141       reg_errcode_t ret;
3142       int token_len2 = 0, is_range_exp = 0;
3143       re_token_t token2;
3144
3145       start_elem.opr.name = start_name_buf;
3146       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3147                                    syntax, first_round);
3148       if (BE (ret != REG_NOERROR, 0))
3149         {
3150           *err = ret;
3151           goto parse_bracket_exp_free_return;
3152         }
3153       first_round = 0;
3154
3155       /* Get information about the next token.  We need it in any case.  */
3156       token_len = peek_token_bracket (token, regexp, syntax);
3157
3158       /* Do not check for ranges if we know they are not allowed.  */
3159       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3160         {
3161           if (BE (token->type == END_OF_RE, 0))
3162             {
3163               *err = REG_EBRACK;
3164               goto parse_bracket_exp_free_return;
3165             }
3166           if (token->type == OP_CHARSET_RANGE)
3167             {
3168               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3169               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3170               if (BE (token2.type == END_OF_RE, 0))
3171                 {
3172                   *err = REG_EBRACK;
3173                   goto parse_bracket_exp_free_return;
3174                 }
3175               if (token2.type == OP_CLOSE_BRACKET)
3176                 {
3177                   /* We treat the last '-' as a normal character.  */
3178                   re_string_skip_bytes (regexp, -token_len);
3179                   token->type = CHARACTER;
3180                 }
3181               else
3182                 is_range_exp = 1;
3183             }
3184         }
3185
3186       if (is_range_exp == 1)
3187         {
3188           end_elem.opr.name = end_name_buf;
3189           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3190                                        dfa, syntax, 1);
3191           if (BE (ret != REG_NOERROR, 0))
3192             {
3193               *err = ret;
3194               goto parse_bracket_exp_free_return;
3195             }
3196
3197           token_len = peek_token_bracket (token, regexp, syntax);
3198
3199 #ifdef _LIBC
3200           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3201                                   &start_elem, &end_elem);
3202 #else
3203 # ifdef RE_ENABLE_I18N
3204           *err = build_range_exp (sbcset,
3205                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3206                                   &range_alloc, &start_elem, &end_elem);
3207 # else
3208           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3209 # endif
3210 #endif /* RE_ENABLE_I18N */
3211           if (BE (*err != REG_NOERROR, 0))
3212             goto parse_bracket_exp_free_return;
3213         }
3214       else
3215         {
3216           switch (start_elem.type)
3217             {
3218             case SB_CHAR:
3219               bitset_set (sbcset, start_elem.opr.ch);
3220               break;
3221 #ifdef RE_ENABLE_I18N
3222             case MB_CHAR:
3223               /* Check whether the array has enough space.  */
3224               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3225                 {
3226                   wchar_t *new_mbchars;
3227                   /* Not enough, realloc it.  */
3228                   /* +1 in case of mbcset->nmbchars is 0.  */
3229                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3230                   /* Use realloc since array is NULL if *alloc == 0.  */
3231                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3232                                             mbchar_alloc);
3233                   if (BE (new_mbchars == NULL, 0))
3234                     goto parse_bracket_exp_espace;
3235                   mbcset->mbchars = new_mbchars;
3236                 }
3237               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3238               break;
3239 #endif /* RE_ENABLE_I18N */
3240             case EQUIV_CLASS:
3241               *err = build_equiv_class (sbcset,
3242 #ifdef RE_ENABLE_I18N
3243                                         mbcset, &equiv_class_alloc,
3244 #endif /* RE_ENABLE_I18N */
3245                                         start_elem.opr.name);
3246               if (BE (*err != REG_NOERROR, 0))
3247                 goto parse_bracket_exp_free_return;
3248               break;
3249             case COLL_SYM:
3250               *err = build_collating_symbol (sbcset,
3251 #ifdef RE_ENABLE_I18N
3252                                              mbcset, &coll_sym_alloc,
3253 #endif /* RE_ENABLE_I18N */
3254                                              start_elem.opr.name);
3255               if (BE (*err != REG_NOERROR, 0))
3256                 goto parse_bracket_exp_free_return;
3257               break;
3258             case CHAR_CLASS:
3259               *err = build_charclass (regexp->trans, sbcset,
3260 #ifdef RE_ENABLE_I18N
3261                                       mbcset, &char_class_alloc,
3262 #endif /* RE_ENABLE_I18N */
3263                                       start_elem.opr.name, syntax);
3264               if (BE (*err != REG_NOERROR, 0))
3265                goto parse_bracket_exp_free_return;
3266               break;
3267             default:
3268               assert (0);
3269               break;
3270             }
3271         }
3272       if (BE (token->type == END_OF_RE, 0))
3273         {
3274           *err = REG_EBRACK;
3275           goto parse_bracket_exp_free_return;
3276         }
3277       if (token->type == OP_CLOSE_BRACKET)
3278         break;
3279     }
3280
3281   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3282
3283   /* If it is non-matching list.  */
3284   if (non_match)
3285     bitset_not (sbcset);
3286
3287 #ifdef RE_ENABLE_I18N
3288   /* Ensure only single byte characters are set.  */
3289   if (dfa->mb_cur_max > 1)
3290     bitset_mask (sbcset, dfa->sb_char);
3291
3292   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3293       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3294                                                      || mbcset->non_match)))
3295     {
3296       bin_tree_t *mbc_tree;
3297       int sbc_idx;
3298       /* Build a tree for complex bracket.  */
3299       dfa->has_mb_node = 1;
3300       br_token.type = COMPLEX_BRACKET;
3301       br_token.opr.mbcset = mbcset;
3302       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3303       if (BE (mbc_tree == NULL, 0))
3304         goto parse_bracket_exp_espace;
3305       for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3306         if (sbcset[sbc_idx])
3307           break;
3308       /* If there are no bits set in sbcset, there is no point
3309          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3310       if (sbc_idx < BITSET_UINTS)
3311         {
3312           /* Build a tree for simple bracket.  */
3313           br_token.type = SIMPLE_BRACKET;
3314           br_token.opr.sbcset = sbcset;
3315           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3316           if (BE (work_tree == NULL, 0))
3317             goto parse_bracket_exp_espace;
3318
3319           /* Then join them by ALT node.  */
3320           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3321           if (BE (work_tree == NULL, 0))
3322             goto parse_bracket_exp_espace;
3323         }
3324       else
3325         {
3326           re_free (sbcset);
3327           work_tree = mbc_tree;
3328         }
3329     }
3330   else
3331 #endif /* not RE_ENABLE_I18N */
3332     {
3333 #ifdef RE_ENABLE_I18N
3334       free_charset (mbcset);
3335 #endif
3336       /* Build a tree for simple bracket.  */
3337       br_token.type = SIMPLE_BRACKET;
3338       br_token.opr.sbcset = sbcset;
3339       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3340       if (BE (work_tree == NULL, 0))
3341         goto parse_bracket_exp_espace;
3342     }
3343   return work_tree;
3344
3345  parse_bracket_exp_espace:
3346   *err = REG_ESPACE;
3347  parse_bracket_exp_free_return:
3348   re_free (sbcset);
3349 #ifdef RE_ENABLE_I18N
3350   free_charset (mbcset);
3351 #endif /* RE_ENABLE_I18N */
3352   return NULL;
3353 }
3354
3355 /* Parse an element in the bracket expression.  */
3356
3357 static reg_errcode_t
3358 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3359                        accept_hyphen)
3360      bracket_elem_t *elem;
3361      re_string_t *regexp;
3362      re_token_t *token;
3363      int token_len;
3364      re_dfa_t *dfa;
3365      reg_syntax_t syntax;
3366      int accept_hyphen;
3367 {
3368 #ifdef RE_ENABLE_I18N
3369   int cur_char_size;
3370   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3371   if (cur_char_size > 1)
3372     {
3373       elem->type = MB_CHAR;
3374       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3375       re_string_skip_bytes (regexp, cur_char_size);
3376       return REG_NOERROR;
3377     }
3378 #endif /* RE_ENABLE_I18N */
3379   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3380   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3381       || token->type == OP_OPEN_EQUIV_CLASS)
3382     return parse_bracket_symbol (elem, regexp, token);
3383   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3384     {
3385       /* A '-' must only appear as anything but a range indicator before
3386          the closing bracket.  Everything else is an error.  */
3387       re_token_t token2;
3388       (void) peek_token_bracket (&token2, regexp, syntax);
3389       if (token2.type != OP_CLOSE_BRACKET)
3390         /* The actual error value is not standardized since this whole
3391            case is undefined.  But ERANGE makes good sense.  */
3392         return REG_ERANGE;
3393     }
3394   elem->type = SB_CHAR;
3395   elem->opr.ch = token->opr.c;
3396   return REG_NOERROR;
3397 }
3398
3399 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3400    such as [:<character_class>:], [.<collating_element>.], and
3401    [=<equivalent_class>=].  */
3402
3403 static reg_errcode_t
3404 parse_bracket_symbol (elem, regexp, token)
3405      bracket_elem_t *elem;
3406      re_string_t *regexp;
3407      re_token_t *token;
3408 {
3409   unsigned char ch, delim = token->opr.c;
3410   int i = 0;
3411   if (re_string_eoi(regexp))
3412     return REG_EBRACK;
3413   for (;; ++i)
3414     {
3415       if (i >= BRACKET_NAME_BUF_SIZE)
3416         return REG_EBRACK;
3417       if (token->type == OP_OPEN_CHAR_CLASS)
3418         ch = re_string_fetch_byte_case (regexp);
3419       else
3420         ch = re_string_fetch_byte (regexp);
3421       if (re_string_eoi(regexp))
3422         return REG_EBRACK;
3423       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3424         break;
3425       elem->opr.name[i] = ch;
3426     }
3427   re_string_skip_bytes (regexp, 1);
3428   elem->opr.name[i] = '\0';
3429   switch (token->type)
3430     {
3431     case OP_OPEN_COLL_ELEM:
3432       elem->type = COLL_SYM;
3433       break;
3434     case OP_OPEN_EQUIV_CLASS:
3435       elem->type = EQUIV_CLASS;
3436       break;
3437     case OP_OPEN_CHAR_CLASS:
3438       elem->type = CHAR_CLASS;
3439       break;
3440     default:
3441       break;
3442     }
3443   return REG_NOERROR;
3444 }
3445
3446   /* Helper function for parse_bracket_exp.
3447      Build the equivalence class which is represented by NAME.
3448      The result are written to MBCSET and SBCSET.
3449      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3450      is a pointer argument sinse we may update it.  */
3451
3452 static reg_errcode_t
3453 #ifdef RE_ENABLE_I18N
3454 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3455      re_charset_t *mbcset;
3456      int *equiv_class_alloc;
3457 #else /* not RE_ENABLE_I18N */
3458 build_equiv_class (sbcset, name)
3459 #endif /* not RE_ENABLE_I18N */
3460      re_bitset_ptr_t sbcset;
3461      const unsigned char *name;
3462 {
3463 #if defined _LIBC
3464   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3465   if (nrules != 0)
3466     {
3467       const int32_t *table, *indirect;
3468       const unsigned char *weights, *extra, *cp;
3469       unsigned char char_buf[2];
3470       int32_t idx1, idx2;
3471       unsigned int ch;
3472       size_t len;
3473       /* This #include defines a local function!  */
3474 # include <locale/weight.h>
3475       /* Calculate the index for equivalence class.  */
3476       cp = name;
3477       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3478       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3479                                                _NL_COLLATE_WEIGHTMB);
3480       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3481                                                    _NL_COLLATE_EXTRAMB);
3482       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3483                                                 _NL_COLLATE_INDIRECTMB);
3484       idx1 = findidx (&cp);
3485       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3486         /* This isn't a valid character.  */
3487         return REG_ECOLLATE;
3488
3489       /* Build single byte matcing table for this equivalence class.  */
3490       char_buf[1] = (unsigned char) '\0';
3491       len = weights[idx1];
3492       for (ch = 0; ch < SBC_MAX; ++ch)
3493         {
3494           char_buf[0] = ch;
3495           cp = char_buf;
3496           idx2 = findidx (&cp);
3497 /*
3498           idx2 = table[ch];
3499 */
3500           if (idx2 == 0)
3501             /* This isn't a valid character.  */
3502             continue;
3503           if (len == weights[idx2])
3504             {
3505               int cnt = 0;
3506               while (cnt <= len &&
3507                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3508                 ++cnt;
3509
3510               if (cnt > len)
3511                 bitset_set (sbcset, ch);
3512             }
3513         }
3514       /* Check whether the array has enough space.  */
3515       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3516         {
3517           /* Not enough, realloc it.  */
3518           /* +1 in case of mbcset->nequiv_classes is 0.  */
3519           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3520           /* Use realloc since the array is NULL if *alloc == 0.  */
3521           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3522                                                    int32_t,
3523                                                    new_equiv_class_alloc);
3524           if (BE (new_equiv_classes == NULL, 0))
3525             return REG_ESPACE;
3526           mbcset->equiv_classes = new_equiv_classes;
3527           *equiv_class_alloc = new_equiv_class_alloc;
3528         }
3529       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3530     }
3531   else
3532 #endif /* _LIBC */
3533     {
3534       if (BE (strlen ((const char *) name) != 1, 0))
3535         return REG_ECOLLATE;
3536       bitset_set (sbcset, *name);
3537     }
3538   return REG_NOERROR;
3539 }
3540
3541   /* Helper function for parse_bracket_exp.
3542      Build the character class which is represented by NAME.
3543      The result are written to MBCSET and SBCSET.
3544      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3545      is a pointer argument sinse we may update it.  */
3546
3547 static reg_errcode_t
3548 #ifdef RE_ENABLE_I18N
3549 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3550      re_charset_t *mbcset;
3551      int *char_class_alloc;
3552 #else /* not RE_ENABLE_I18N */
3553 build_charclass (trans, sbcset, class_name, syntax)
3554 #endif /* not RE_ENABLE_I18N */
3555      unsigned RE_TRANSLATE_TYPE trans;
3556      re_bitset_ptr_t sbcset;
3557      const unsigned char *class_name;
3558      reg_syntax_t syntax;
3559 {
3560   int i;
3561   const char *name = (const char *) class_name;
3562
3563   /* In case of REG_ICASE "upper" and "lower" match the both of
3564      upper and lower cases.  */
3565   if ((syntax & RE_ICASE)
3566       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3567     name = "alpha";
3568
3569 #ifdef RE_ENABLE_I18N
3570   /* Check the space of the arrays.  */
3571   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3572     {
3573       /* Not enough, realloc it.  */
3574       /* +1 in case of mbcset->nchar_classes is 0.  */
3575       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3576       /* Use realloc since array is NULL if *alloc == 0.  */
3577       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3578                                                new_char_class_alloc);
3579       if (BE (new_char_classes == NULL, 0))
3580         return REG_ESPACE;
3581       mbcset->char_classes = new_char_classes;
3582       *char_class_alloc = new_char_class_alloc;
3583     }
3584   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3585 #endif /* RE_ENABLE_I18N */
3586
3587 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3588     for (i = 0; i < SBC_MAX; ++i)               \
3589       {                                         \
3590         if (ctype_func (i))                     \
3591           {                                     \
3592             int ch = trans ? trans[i] : i;      \
3593             bitset_set (sbcset, ch);            \
3594           }                                     \
3595       }
3596
3597   if (strcmp (name, "alnum") == 0)
3598     BUILD_CHARCLASS_LOOP (isalnum)
3599   else if (strcmp (name, "cntrl") == 0)
3600     BUILD_CHARCLASS_LOOP (iscntrl)
3601   else if (strcmp (name, "lower") == 0)
3602     BUILD_CHARCLASS_LOOP (islower)
3603   else if (strcmp (name, "space") == 0)
3604     BUILD_CHARCLASS_LOOP (isspace)
3605   else if (strcmp (name, "alpha") == 0)
3606     BUILD_CHARCLASS_LOOP (isalpha)
3607   else if (strcmp (name, "digit") == 0)
3608     BUILD_CHARCLASS_LOOP (isdigit)
3609   else if (strcmp (name, "print") == 0)
3610     BUILD_CHARCLASS_LOOP (isprint)
3611   else if (strcmp (name, "upper") == 0)
3612     BUILD_CHARCLASS_LOOP (isupper)
3613   else if (strcmp (name, "blank") == 0)
3614     BUILD_CHARCLASS_LOOP (isblank)
3615   else if (strcmp (name, "graph") == 0)
3616     BUILD_CHARCLASS_LOOP (isgraph)
3617   else if (strcmp (name, "punct") == 0)
3618     BUILD_CHARCLASS_LOOP (ispunct)
3619   else if (strcmp (name, "xdigit") == 0)
3620     BUILD_CHARCLASS_LOOP (isxdigit)
3621   else
3622     return REG_ECTYPE;
3623
3624   return REG_NOERROR;
3625 }
3626
3627 static bin_tree_t *
3628 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3629      re_dfa_t *dfa;
3630      unsigned RE_TRANSLATE_TYPE trans;
3631      const unsigned char *class_name;
3632      const unsigned char *extra;
3633      int non_match;
3634      reg_errcode_t *err;
3635 {
3636   re_bitset_ptr_t sbcset;
3637 #ifdef RE_ENABLE_I18N
3638   re_charset_t *mbcset;
3639   int alloc = 0;
3640 #endif /* not RE_ENABLE_I18N */
3641   reg_errcode_t ret;
3642   re_token_t br_token;
3643   bin_tree_t *tree;
3644
3645   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3646 #ifdef RE_ENABLE_I18N
3647   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3648 #endif /* RE_ENABLE_I18N */
3649
3650 #ifdef RE_ENABLE_I18N
3651   if (BE (sbcset == NULL || mbcset == NULL, 0))
3652 #else /* not RE_ENABLE_I18N */
3653   if (BE (sbcset == NULL, 0))
3654 #endif /* not RE_ENABLE_I18N */
3655     {
3656       *err = REG_ESPACE;
3657       return NULL;
3658     }
3659
3660   if (non_match)
3661     {
3662 #ifdef RE_ENABLE_I18N
3663       /*
3664       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3665         bitset_set(cset->sbcset, '\0');
3666       */
3667       mbcset->non_match = 1;
3668 #endif /* not RE_ENABLE_I18N */
3669     }
3670
3671   /* We don't care the syntax in this case.  */
3672   ret = build_charclass (trans, sbcset,
3673 #ifdef RE_ENABLE_I18N
3674                          mbcset, &alloc,
3675 #endif /* RE_ENABLE_I18N */
3676                          class_name, 0);
3677
3678   if (BE (ret != REG_NOERROR, 0))
3679     {
3680       re_free (sbcset);
3681 #ifdef RE_ENABLE_I18N
3682       free_charset (mbcset);
3683 #endif /* RE_ENABLE_I18N */
3684       *err = ret;
3685       return NULL;
3686     }
3687   /* \w match '_' also.  */
3688   for (; *extra; extra++)
3689     bitset_set (sbcset, *extra);
3690
3691   /* If it is non-matching list.  */
3692   if (non_match)
3693     bitset_not (sbcset);
3694
3695 #ifdef RE_ENABLE_I18N
3696   /* Ensure only single byte characters are set.  */
3697   if (dfa->mb_cur_max > 1)
3698     bitset_mask (sbcset, dfa->sb_char);
3699 #endif
3700
3701   /* Build a tree for simple bracket.  */
3702   br_token.type = SIMPLE_BRACKET;
3703   br_token.opr.sbcset = sbcset;
3704   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3705   if (BE (tree == NULL, 0))
3706     goto build_word_op_espace;
3707
3708 #ifdef RE_ENABLE_I18N
3709   if (dfa->mb_cur_max > 1)
3710     {
3711       bin_tree_t *mbc_tree;
3712       /* Build a tree for complex bracket.  */
3713       br_token.type = COMPLEX_BRACKET;
3714       br_token.opr.mbcset = mbcset;
3715       dfa->has_mb_node = 1;
3716       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3717       if (BE (mbc_tree == NULL, 0))
3718         goto build_word_op_espace;
3719       /* Then join them by ALT node.  */
3720       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3721       if (BE (mbc_tree != NULL, 1))
3722         return tree;
3723     }
3724   else
3725     {
3726       free_charset (mbcset);
3727       return tree;
3728     }
3729 #else /* not RE_ENABLE_I18N */
3730   return tree;
3731 #endif /* not RE_ENABLE_I18N */
3732
3733  build_word_op_espace:
3734   re_free (sbcset);
3735 #ifdef RE_ENABLE_I18N
3736   free_charset (mbcset);
3737 #endif /* RE_ENABLE_I18N */
3738   *err = REG_ESPACE;
3739   return NULL;
3740 }
3741
3742 /* This is intended for the expressions like "a{1,3}".
3743    Fetch a number from `input', and return the number.
3744    Return -1, if the number field is empty like "{,1}".
3745    Return -2, If an error is occured.  */
3746
3747 static int
3748 fetch_number (input, token, syntax)
3749      re_string_t *input;
3750      re_token_t *token;
3751      reg_syntax_t syntax;
3752 {
3753   int num = -1;
3754   unsigned char c;
3755   while (1)
3756     {
3757       fetch_token (token, input, syntax);
3758       c = token->opr.c;
3759       if (BE (token->type == END_OF_RE, 0))
3760         return -2;
3761       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3762         break;
3763       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3764              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3765       num = (num > RE_DUP_MAX) ? -2 : num;
3766     }
3767   return num;
3768 }
3769 \f
3770 #ifdef RE_ENABLE_I18N
3771 static void
3772 free_charset (re_charset_t *cset)
3773 {
3774   re_free (cset->mbchars);
3775 # ifdef _LIBC
3776   re_free (cset->coll_syms);
3777   re_free (cset->equiv_classes);
3778   re_free (cset->range_starts);
3779   re_free (cset->range_ends);
3780 # endif
3781   re_free (cset->char_classes);
3782   re_free (cset);
3783 }
3784 #endif /* RE_ENABLE_I18N */
3785 \f
3786 /* Functions for binary tree operation.  */
3787
3788 /* Create a tree node.  */
3789
3790 static bin_tree_t *
3791 create_tree (dfa, left, right, type)
3792      re_dfa_t *dfa;
3793      bin_tree_t *left;
3794      bin_tree_t *right;
3795      re_token_type_t type;
3796 {
3797   re_token_t t;
3798   t.type = type;
3799   return create_token_tree (dfa, left, right, &t);
3800 }
3801
3802 static bin_tree_t *
3803 create_token_tree (dfa, left, right, token)
3804      re_dfa_t *dfa;
3805      bin_tree_t *left;
3806      bin_tree_t *right;
3807      const re_token_t *token;
3808 {
3809   bin_tree_t *tree;
3810   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3811     {
3812       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3813
3814       if (storage == NULL)
3815         return NULL;
3816       storage->next = dfa->str_tree_storage;
3817       dfa->str_tree_storage = storage;
3818       dfa->str_tree_storage_idx = 0;
3819     }
3820   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3821
3822   tree->parent = NULL;
3823   tree->left = left;
3824   tree->right = right;
3825   tree->token = *token;
3826   tree->token.duplicated = 0;
3827   tree->token.opt_subexp = 0;
3828   tree->first = NULL;
3829   tree->next = NULL;
3830   tree->node_idx = -1;
3831
3832   if (left != NULL)
3833     left->parent = tree;
3834   if (right != NULL)
3835     right->parent = tree;
3836   return tree;
3837 }
3838
3839 /* Mark the tree SRC as an optional subexpression.
3840    To be called from preorder or postorder.  */
3841
3842 static reg_errcode_t
3843 mark_opt_subexp (extra, node)
3844      void *extra;
3845      bin_tree_t *node;
3846 {
3847   int idx = (int) (long) extra;
3848   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3849     node->token.opt_subexp = 1;
3850
3851   return REG_NOERROR;
3852 }
3853
3854 /* Free the allocated memory inside NODE. */
3855
3856 static void
3857 free_token (re_token_t *node)
3858 {
3859 #ifdef RE_ENABLE_I18N
3860   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3861     free_charset (node->opr.mbcset);
3862   else
3863 #endif /* RE_ENABLE_I18N */
3864     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3865       re_free (node->opr.sbcset);
3866 }
3867
3868 /* Worker function for tree walking.  Free the allocated memory inside NODE
3869    and its children. */
3870
3871 static reg_errcode_t
3872 free_tree (void *extra, bin_tree_t *node)
3873 {
3874   free_token (&node->token);
3875   return REG_NOERROR;
3876 }
3877
3878
3879 /* Duplicate the node SRC, and return new node.  This is a preorder
3880    visit similar to the one implemented by the generic visitor, but
3881    we need more infrastructure to maintain two parallel trees --- so,
3882    it's easier to duplicate.  */
3883
3884 static bin_tree_t *
3885 duplicate_tree (root, dfa)
3886      const bin_tree_t *root;
3887      re_dfa_t *dfa;
3888 {
3889   const bin_tree_t *node;
3890   bin_tree_t *dup_root;
3891   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3892
3893   for (node = root; ; )
3894     {
3895       /* Create a new tree and link it back to the current parent.  */
3896       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3897       if (*p_new == NULL)
3898         return NULL;
3899       (*p_new)->parent = dup_node;
3900       (*p_new)->token.duplicated = 1;
3901       dup_node = *p_new;
3902
3903       /* Go to the left node, or up and to the right.  */
3904       if (node->left)
3905         {
3906           node = node->left;
3907           p_new = &dup_node->left;
3908         }
3909       else
3910         {
3911           const bin_tree_t *prev = NULL;
3912           while (node->right == prev || node->right == NULL)
3913             {
3914               prev = node;
3915               node = node->parent;
3916               dup_node = dup_node->parent;
3917               if (!node)
3918                 return dup_root;
3919             }
3920           node = node->right;
3921           p_new = &dup_node->right;
3922         }
3923     }
3924 }