posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  22                                           int length, reg_syntax_t syntax);
  23 static void re_compile_fastmap_iter (regex_t *bufp,
  24                                      const re_dfastate_t *init_state,
  25                                      char *fastmap);
  26 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  27 static void init_word_char (re_dfa_t *dfa);
  28 #ifdef RE_ENABLE_I18N
  29 static void free_charset (re_charset_t *cset);
  30 #endif /* RE_ENABLE_I18N */
  31 static void free_workarea_compile (regex_t *preg);
  32 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  33 #ifdef RE_ENABLE_I18N
  34 static void optimize_utf8 (re_dfa_t *dfa);
  35 #endif
  36 struct subexp_optimize
  37 {
  38   re_dfa_t *dfa;
  39   re_token_t *nodes;
  40   int no_sub, re_nsub;
  41 };
  42 static bin_tree_t *optimize_subexps (struct subexp_optimize *so,
  43                                      bin_tree_t *node, int sidx, int depth);
  44 static reg_errcode_t analyze (re_dfa_t *dfa);
  45 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
  46 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
  47 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
  48 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
  49 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
  50                                              int top_clone_node, int root_node,
  51                                              unsigned int constraint);
  52 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
  53                                      unsigned int constraint);
  54 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
  55                                    unsigned int constraint);
  56 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  57 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  58                                          int node, int root);
  59 static void calc_inveclosure (re_dfa_t *dfa);
  60 static int fetch_number (re_string_t *input, re_token_t *token,
  61                          reg_syntax_t syntax);
  62 static void fetch_token (re_token_t *result, re_string_t *input,
  63                          reg_syntax_t syntax);
  64 static int peek_token (re_token_t *token, re_string_t *input,
  65                         reg_syntax_t syntax);
  66 static int peek_token_bracket (re_token_t *token, re_string_t *input,
  67                                reg_syntax_t syntax);
  68 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  69                           reg_syntax_t syntax, reg_errcode_t *err);
  70 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  74                                  re_token_t *token, reg_syntax_t syntax,
  75                                  int nest, reg_errcode_t *err);
  76 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  77                                      re_token_t *token, reg_syntax_t syntax,
  78                                      int nest, reg_errcode_t *err);
  79 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  80                                   re_token_t *token, reg_syntax_t syntax,
  81                                   int nest, reg_errcode_t *err);
  82 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  83                                  re_dfa_t *dfa, re_token_t *token,
  84                                  reg_syntax_t syntax, reg_errcode_t *err);
  85 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  86                                       re_token_t *token, reg_syntax_t syntax,
  87                                       reg_errcode_t *err);
  88 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  89                                             re_string_t *regexp,
  90                                             re_token_t *token, int token_len,
  91                                             re_dfa_t *dfa,
  92                                             reg_syntax_t syntax,
  93                                             int accept_hyphen);
  94 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  95                                           re_string_t *regexp,
  96                                           re_token_t *token);
  97 #ifndef _LIBC
  98 # ifdef RE_ENABLE_I18N
  99 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 100                                       re_charset_t *mbcset, int *range_alloc,
 101                                       bracket_elem_t *start_elem,
 102                                       bracket_elem_t *end_elem);
 103 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 104                                              re_charset_t *mbcset,
 105                                              int *coll_sym_alloc,
 106                                              const unsigned char *name);
 107 # else /* not RE_ENABLE_I18N */
 108 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 109                                       bracket_elem_t *start_elem,
 110                                       bracket_elem_t *end_elem);
 111 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 112                                              const unsigned char *name);
 113 # endif /* not RE_ENABLE_I18N */
 114 #endif /* not _LIBC */
 115 #ifdef RE_ENABLE_I18N
 116 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 117                                         re_charset_t *mbcset,
 118                                         int *equiv_class_alloc,
 119                                         const unsigned char *name);
 120 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 121                                       re_bitset_ptr_t sbcset,
 122                                       re_charset_t *mbcset,
 123                                       int *char_class_alloc,
 124                                       const unsigned char *class_name,
 125                                       reg_syntax_t syntax);
 126 #else  /* not RE_ENABLE_I18N */
 127 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 128                                         const unsigned char *name);
 129 static reg_errcode_t build_charclass (unsigned RE_TRANSLATE_TYPE trans,
 130                                       re_bitset_ptr_t sbcset,
 131                                       const unsigned char *class_name,
 132                                       reg_syntax_t syntax);
 133 #endif /* not RE_ENABLE_I18N */
 134 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 135                                        unsigned RE_TRANSLATE_TYPE trans,
 136                                        const unsigned char *class_name,
 137                                        const unsigned char *extra,
 138                                        int non_match, reg_errcode_t *err);
 139 static bin_tree_t *create_tree (re_dfa_t *dfa,
 140                                 bin_tree_t *left, bin_tree_t *right,
 141                                 re_token_type_t type, int index);
 142 static bin_tree_t *re_dfa_add_tree_node (re_dfa_t *dfa,
 143                                          bin_tree_t *left, bin_tree_t *right,
 144                                          const re_token_t *token)
 145   __attribute ((noinline));
 146 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 147 static void mark_opt_subexp (const bin_tree_t *src, re_dfa_t *dfa);
 148 static void mark_opt_subexp_iter (const bin_tree_t *src, re_dfa_t *dfa, int idx);
 149 \f
 150 /* This table gives an error message for each of the error codes listed
 151    in regex.h.  Obviously the order here has to be same as there.
 152    POSIX doesn't require that we do anything for REG_NOERROR,
 153    but why not be nice?  */
 154
 155 const char __re_error_msgid[] attribute_hidden =
 156   {
 157 #define REG_NOERROR_IDX 0
 158     gettext_noop ("Success")    /* REG_NOERROR */
 159     "\0"
 160 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 161     gettext_noop ("No match")   /* REG_NOMATCH */
 162     "\0"
 163 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 164     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 165     "\0"
 166 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 167     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 168     "\0"
 169 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 170     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 171     "\0"
 172 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 173     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 174     "\0"
 175 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 176     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 177     "\0"
 178 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 179     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 180     "\0"
 181 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 182     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 183     "\0"
 184 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 185     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 186     "\0"
 187 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 188     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 189     "\0"
 190 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 191     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 192     "\0"
 193 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 194     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 195     "\0"
 196 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 197     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 198     "\0"
 199 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 200     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 201     "\0"
 202 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 203     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 204     "\0"
 205 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 206     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 207   };
 208
 209 const size_t __re_error_msgid_idx[] attribute_hidden =
 210   {
 211     REG_NOERROR_IDX,
 212     REG_NOMATCH_IDX,
 213     REG_BADPAT_IDX,
 214     REG_ECOLLATE_IDX,
 215     REG_ECTYPE_IDX,
 216     REG_EESCAPE_IDX,
 217     REG_ESUBREG_IDX,
 218     REG_EBRACK_IDX,
 219     REG_EPAREN_IDX,
 220     REG_EBRACE_IDX,
 221     REG_BADBR_IDX,
 222     REG_ERANGE_IDX,
 223     REG_ESPACE_IDX,
 224     REG_BADRPT_IDX,
 225     REG_EEND_IDX,
 226     REG_ESIZE_IDX,
 227     REG_ERPAREN_IDX
 228   };
 229 \f
 230 /* Entry points for GNU code.  */
 231
 232 /* re_compile_pattern is the GNU regular expression compiler: it
 233    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 234    Returns 0 if the pattern was valid, otherwise an error string.
 235
 236    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 237    are set in BUFP on entry.  */
 238
 239 const char *
 240 re_compile_pattern (pattern, length, bufp)
 241     const char *pattern;
 242     size_t length;
 243     struct re_pattern_buffer *bufp;
 244 {
 245   reg_errcode_t ret;
 246
 247   /* And GNU code determines whether or not to get register information
 248      by passing null for the REGS argument to re_match, etc., not by
 249      setting no_sub, unless RE_NO_SUB is set.  */
 250   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 251
 252   /* Match anchors at newline.  */
 253   bufp->newline_anchor = 1;
 254
 255   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 256
 257   if (!ret)
 258     return NULL;
 259   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 260 }
 261 #ifdef _LIBC
 262 weak_alias (__re_compile_pattern, re_compile_pattern)
 263 #endif
 264
 265 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 266    also be assigned to arbitrarily: each pattern buffer stores its own
 267    syntax, so it can be changed between regex compilations.  */
 268 /* This has no initializer because initialized variables in Emacs
 269    become read-only after dumping.  */
 270 reg_syntax_t re_syntax_options;
 271
 272
 273 /* Specify the precise syntax of regexps for compilation.  This provides
 274    for compatibility for various utilities which historically have
 275    different, incompatible syntaxes.
 276
 277    The argument SYNTAX is a bit mask comprised of the various bits
 278    defined in regex.h.  We return the old syntax.  */
 279
 280 reg_syntax_t
 281 re_set_syntax (syntax)
 282     reg_syntax_t syntax;
 283 {
 284   reg_syntax_t ret = re_syntax_options;
 285
 286   re_syntax_options = syntax;
 287   return ret;
 288 }
 289 #ifdef _LIBC
 290 weak_alias (__re_set_syntax, re_set_syntax)
 291 #endif
 292
 293 int
 294 re_compile_fastmap (bufp)
 295     struct re_pattern_buffer *bufp;
 296 {
 297   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 298   char *fastmap = bufp->fastmap;
 299
 300   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 301   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 302   if (dfa->init_state != dfa->init_state_word)
 303     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 304   if (dfa->init_state != dfa->init_state_nl)
 305     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 306   if (dfa->init_state != dfa->init_state_begbuf)
 307     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 308   bufp->fastmap_accurate = 1;
 309   return 0;
 310 }
 311 #ifdef _LIBC
 312 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 313 #endif
 314
 315 static inline void
 316 __attribute ((always_inline))
 317 re_set_fastmap (char *fastmap, int icase, int ch)
 318 {
 319   fastmap[ch] = 1;
 320   if (icase)
 321     fastmap[tolower (ch)] = 1;
 322 }
 323
 324 /* Helper function for re_compile_fastmap.
 325    Compile fastmap for the initial_state INIT_STATE.  */
 326
 327 static void
 328 re_compile_fastmap_iter (bufp, init_state, fastmap)
 329      regex_t *bufp;
 330      const re_dfastate_t *init_state;
 331      char *fastmap;
 332 {
 333   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 334   int node_cnt;
 335   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 336   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 337     {
 338       int node = init_state->nodes.elems[node_cnt];
 339       re_token_type_t type = dfa->nodes[node].type;
 340
 341       if (type == CHARACTER)
 342         {
 343           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 344 #ifdef RE_ENABLE_I18N
 345           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 346             {
 347               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 348               wchar_t wc;
 349               mbstate_t state;
 350
 351               p = buf;
 352               *p++ = dfa->nodes[node].opr.c;
 353               while (++node < dfa->nodes_len
 354                      && dfa->nodes[node].type == CHARACTER
 355                      && dfa->nodes[node].mb_partial)
 356                 *p++ = dfa->nodes[node].opr.c;
 357               memset (&state, 0, sizeof (state));
 358               if (mbrtowc (&wc, (const char *) buf, p - buf,
 359                            &state) == p - buf
 360                   && __wcrtomb ((char *) buf, towlower (wc), &state) > 0)
 361                 re_set_fastmap (fastmap, 0, buf[0]);
 362             }
 363 #endif
 364         }
 365       else if (type == SIMPLE_BRACKET)
 366         {
 367           int i, j, ch;
 368           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 369             for (j = 0; j < UINT_BITS; ++j, ++ch)
 370               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 371                 re_set_fastmap (fastmap, icase, ch);
 372         }
 373 #ifdef RE_ENABLE_I18N
 374       else if (type == COMPLEX_BRACKET)
 375         {
 376           int i;
 377           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 378           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 379               || cset->nranges || cset->nchar_classes)
 380             {
 381 # ifdef _LIBC
 382               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 383                 {
 384                   /* In this case we want to catch the bytes which are
 385                      the first byte of any collation elements.
 386                      e.g. In da_DK, we want to catch 'a' since "aa"
 387                           is a valid collation element, and don't catch
 388                           'b' since 'b' is the only collation element
 389                           which starts from 'b'.  */
 390                   int j, ch;
 391                   const int32_t *table = (const int32_t *)
 392                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 393                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 394                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 395                       if (table[ch] < 0)
 396                         re_set_fastmap (fastmap, icase, ch);
 397                 }
 398 # else
 399               if (dfa->mb_cur_max > 1)
 400                 for (i = 0; i < SBC_MAX; ++i)
 401                   if (__btowc (i) == WEOF)
 402                     re_set_fastmap (fastmap, icase, i);
 403 # endif /* not _LIBC */
 404             }
 405           for (i = 0; i < cset->nmbchars; ++i)
 406             {
 407               char buf[256];
 408               mbstate_t state;
 409               memset (&state, '\0', sizeof (state));
 410               __wcrtomb (buf, cset->mbchars[i], &state);
 411               re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 412               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 413                 {
 414                   __wcrtomb (buf, towlower (cset->mbchars[i]), &state);
 415                   re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
 416                 }
 417             }
 418         }
 419 #endif /* RE_ENABLE_I18N */
 420       else if (type == OP_PERIOD
 421 #ifdef RE_ENABLE_I18N
 422                || type == OP_UTF8_PERIOD
 423 #endif /* RE_ENABLE_I18N */
 424                || type == END_OF_RE)
 425         {
 426           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 427           if (type == END_OF_RE)
 428             bufp->can_be_null = 1;
 429           return;
 430         }
 431     }
 432 }
 433 \f
 434 /* Entry point for POSIX code.  */
 435 /* regcomp takes a regular expression as a string and compiles it.
 436
 437    PREG is a regex_t *.  We do not expect any fields to be initialized,
 438    since POSIX says we shouldn't.  Thus, we set
 439
 440      `buffer' to the compiled pattern;
 441      `used' to the length of the compiled pattern;
 442      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 443        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 444        RE_SYNTAX_POSIX_BASIC;
 445      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 446      `fastmap' to an allocated space for the fastmap;
 447      `fastmap_accurate' to zero;
 448      `re_nsub' to the number of subexpressions in PATTERN.
 449
 450    PATTERN is the address of the pattern string.
 451
 452    CFLAGS is a series of bits which affect compilation.
 453
 454      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 455      use POSIX basic syntax.
 456
 457      If REG_NEWLINE is set, then . and [^...] don't match newline.
 458      Also, regexec will try a match beginning after every newline.
 459
 460      If REG_ICASE is set, then we considers upper- and lowercase
 461      versions of letters to be equivalent when matching.
 462
 463      If REG_NOSUB is set, then when PREG is passed to regexec, that
 464      routine will report only success or failure, and nothing about the
 465      registers.
 466
 467    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 468    the return codes and their meanings.)  */
 469
 470 int
 471 regcomp (preg, pattern, cflags)
 472     regex_t *__restrict preg;
 473     const char *__restrict pattern;
 474     int cflags;
 475 {
 476   reg_errcode_t ret;
 477   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 478                          : RE_SYNTAX_POSIX_BASIC);
 479
 480   preg->buffer = NULL;
 481   preg->allocated = 0;
 482   preg->used = 0;
 483
 484   /* Try to allocate space for the fastmap.  */
 485   preg->fastmap = re_malloc (char, SBC_MAX);
 486   if (BE (preg->fastmap == NULL, 0))
 487     return REG_ESPACE;
 488
 489   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 490
 491   /* If REG_NEWLINE is set, newlines are treated differently.  */
 492   if (cflags & REG_NEWLINE)
 493     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 494       syntax &= ~RE_DOT_NEWLINE;
 495       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 496       /* It also changes the matching behavior.  */
 497       preg->newline_anchor = 1;
 498     }
 499   else
 500     preg->newline_anchor = 0;
 501   preg->no_sub = !!(cflags & REG_NOSUB);
 502   preg->translate = NULL;
 503
 504   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 505
 506   /* POSIX doesn't distinguish between an unmatched open-group and an
 507      unmatched close-group: both are REG_EPAREN.  */
 508   if (ret == REG_ERPAREN)
 509     ret = REG_EPAREN;
 510
 511   /* We have already checked preg->fastmap != NULL.  */
 512   if (BE (ret == REG_NOERROR, 1))
 513     /* Compute the fastmap now, since regexec cannot modify the pattern
 514        buffer.  This function never fails in this implementation.  */
 515     (void) re_compile_fastmap (preg);
 516   else
 517     {
 518       /* Some error occurred while compiling the expression.  */
 519       re_free (preg->fastmap);
 520       preg->fastmap = NULL;
 521     }
 522
 523   return (int) ret;
 524 }
 525 #ifdef _LIBC
 526 weak_alias (__regcomp, regcomp)
 527 #endif
 528
 529 /* Returns a message corresponding to an error code, ERRCODE, returned
 530    from either regcomp or regexec.   We don't use PREG here.  */
 531
 532 size_t
 533 regerror (errcode, preg, errbuf, errbuf_size)
 534     int errcode;
 535     const regex_t *preg;
 536     char *errbuf;
 537     size_t errbuf_size;
 538 {
 539   const char *msg;
 540   size_t msg_size;
 541
 542   if (BE (errcode < 0
 543           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 544                                / sizeof (__re_error_msgid_idx[0])), 0))
 545     /* Only error codes returned by the rest of the code should be passed
 546        to this routine.  If we are given anything else, or if other regex
 547        code generates an invalid error code, then the program has a bug.
 548        Dump core so we can fix it.  */
 549     abort ();
 550
 551   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 552
 553   msg_size = strlen (msg) + 1; /* Includes the null.  */
 554
 555   if (BE (errbuf_size != 0, 1))
 556     {
 557       if (BE (msg_size > errbuf_size, 0))
 558         {
 559 #if defined HAVE_MEMPCPY || defined _LIBC
 560           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 561 #else
 562           memcpy (errbuf, msg, errbuf_size - 1);
 563           errbuf[errbuf_size - 1] = 0;
 564 #endif
 565         }
 566       else
 567         memcpy (errbuf, msg, msg_size);
 568     }
 569
 570   return msg_size;
 571 }
 572 #ifdef _LIBC
 573 weak_alias (__regerror, regerror)
 574 #endif
 575
 576
 577 #ifdef RE_ENABLE_I18N
 578 /* This static array is used for the map to single-byte characters when
 579    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 580    it the same all the time.  UTF-8 is the preferred encoding so this is
 581    a worthwhile optimization.  */
 582 static const bitset utf8_sb_map =
 583 {
 584   /* Set the first 128 bits.  */
 585 # if UINT_MAX == 0xffffffff
 586   0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
 587 # else
 588 #  error "Add case for new unsigned int size"
 589 # endif
 590 };
 591 #endif
 592
 593
 594 static void
 595 free_dfa_content (re_dfa_t *dfa)
 596 {
 597   int i, j;
 598
 599   if (dfa->nodes)
 600     for (i = 0; i < dfa->nodes_len; ++i)
 601       {
 602         re_token_t *node = dfa->nodes + i;
 603 #ifdef RE_ENABLE_I18N
 604         if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
 605           free_charset (node->opr.mbcset);
 606         else
 607 #endif /* RE_ENABLE_I18N */
 608           if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
 609             re_free (node->opr.sbcset);
 610       }
 611   re_free (dfa->nexts);
 612   for (i = 0; i < dfa->nodes_len; ++i)
 613     {
 614       if (dfa->eclosures != NULL)
 615         re_node_set_free (dfa->eclosures + i);
 616       if (dfa->inveclosures != NULL)
 617         re_node_set_free (dfa->inveclosures + i);
 618       if (dfa->edests != NULL)
 619         re_node_set_free (dfa->edests + i);
 620     }
 621   re_free (dfa->edests);
 622   re_free (dfa->eclosures);
 623   re_free (dfa->inveclosures);
 624   re_free (dfa->nodes);
 625
 626   if (dfa->state_table)
 627     for (i = 0; i <= dfa->state_hash_mask; ++i)
 628       {
 629         struct re_state_table_entry *entry = dfa->state_table + i;
 630         for (j = 0; j < entry->num; ++j)
 631           {
 632             re_dfastate_t *state = entry->array[j];
 633             free_state (state);
 634           }
 635         re_free (entry->array);
 636       }
 637   re_free (dfa->state_table);
 638 #ifdef RE_ENABLE_I18N
 639   if (dfa->sb_char != utf8_sb_map)
 640     re_free (dfa->sb_char);
 641 #endif
 642   re_free (dfa->subexp_map);
 643 #ifdef DEBUG
 644   re_free (dfa->re_str);
 645 #endif
 646
 647   re_free (dfa);
 648 }
 649
 650
 651 /* Free dynamically allocated space used by PREG.  */
 652
 653 void
 654 regfree (preg)
 655     regex_t *preg;
 656 {
 657   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 658   if (BE (dfa != NULL, 1))
 659     free_dfa_content (dfa);
 660   preg->buffer = NULL;
 661   preg->allocated = 0;
 662
 663   re_free (preg->fastmap);
 664   preg->fastmap = NULL;
 665
 666   re_free (preg->translate);
 667   preg->translate = NULL;
 668 }
 669 #ifdef _LIBC
 670 weak_alias (__regfree, regfree)
 671 #endif
 672 \f
 673 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 674    them unless specifically requested.  */
 675
 676 #if defined _REGEX_RE_COMP || defined _LIBC
 677
 678 /* BSD has one and only one pattern buffer.  */
 679 static struct re_pattern_buffer re_comp_buf;
 680
 681 char *
 682 # ifdef _LIBC
 683 /* Make these definitions weak in libc, so POSIX programs can redefine
 684    these names if they don't use our functions, and still use
 685    regcomp/regexec above without link errors.  */
 686 weak_function
 687 # endif
 688 re_comp (s)
 689      const char *s;
 690 {
 691   reg_errcode_t ret;
 692   char *fastmap;
 693
 694   if (!s)
 695     {
 696       if (!re_comp_buf.buffer)
 697         return gettext ("No previous regular expression");
 698       return 0;
 699     }
 700
 701   if (re_comp_buf.buffer)
 702     {
 703       fastmap = re_comp_buf.fastmap;
 704       re_comp_buf.fastmap = NULL;
 705       __regfree (&re_comp_buf);
 706       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 707       re_comp_buf.fastmap = fastmap;
 708     }
 709
 710   if (re_comp_buf.fastmap == NULL)
 711     {
 712       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 713       if (re_comp_buf.fastmap == NULL)
 714         return (char *) gettext (__re_error_msgid
 715                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 716     }
 717
 718   /* Since `re_exec' always passes NULL for the `regs' argument, we
 719      don't need to initialize the pattern buffer fields which affect it.  */
 720
 721   /* Match anchors at newlines.  */
 722   re_comp_buf.newline_anchor = 1;
 723
 724   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 725
 726   if (!ret)
 727     return NULL;
 728
 729   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 730   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 731 }
 732
 733 #ifdef _LIBC
 734 libc_freeres_fn (free_mem)
 735 {
 736   __regfree (&re_comp_buf);
 737 }
 738 #endif
 739
 740 #endif /* _REGEX_RE_COMP */
 741 \f
 742 /* Internal entry point.
 743    Compile the regular expression PATTERN, whose length is LENGTH.
 744    SYNTAX indicate regular expression's syntax.  */
 745
 746 static reg_errcode_t
 747 re_compile_internal (preg, pattern, length, syntax)
 748      regex_t *preg;
 749      const char * pattern;
 750      int length;
 751      reg_syntax_t syntax;
 752 {
 753   reg_errcode_t err = REG_NOERROR;
 754   re_dfa_t *dfa;
 755   re_string_t regexp;
 756
 757   /* Initialize the pattern buffer.  */
 758   preg->fastmap_accurate = 0;
 759   preg->syntax = syntax;
 760   preg->not_bol = preg->not_eol = 0;
 761   preg->used = 0;
 762   preg->re_nsub = 0;
 763   preg->can_be_null = 0;
 764   preg->regs_allocated = REGS_UNALLOCATED;
 765
 766   /* Initialize the dfa.  */
 767   dfa = (re_dfa_t *) preg->buffer;
 768   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 769     {
 770       /* If zero allocated, but buffer is non-null, try to realloc
 771          enough space.  This loses if buffer's address is bogus, but
 772          that is the user's responsibility.  If ->buffer is NULL this
 773          is a simple allocation.  */
 774       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 775       if (dfa == NULL)
 776         return REG_ESPACE;
 777       preg->allocated = sizeof (re_dfa_t);
 778       preg->buffer = (unsigned char *) dfa;
 779     }
 780   preg->used = sizeof (re_dfa_t);
 781
 782   __libc_lock_init (dfa->lock);
 783
 784   err = init_dfa (dfa, length);
 785   if (BE (err != REG_NOERROR, 0))
 786     {
 787       free_dfa_content (dfa);
 788       preg->buffer = NULL;
 789       preg->allocated = 0;
 790       return err;
 791     }
 792 #ifdef DEBUG
 793   dfa->re_str = re_malloc (char, length + 1);
 794   strncpy (dfa->re_str, pattern, length + 1);
 795 #endif
 796
 797   err = re_string_construct (&regexp, pattern, length, preg->translate,
 798                              syntax & RE_ICASE, dfa);
 799   if (BE (err != REG_NOERROR, 0))
 800     {
 801     re_compile_internal_free_return:
 802       free_workarea_compile (preg);
 803       re_string_destruct (&regexp);
 804       free_dfa_content (dfa);
 805       preg->buffer = NULL;
 806       preg->allocated = 0;
 807       return err;
 808     }
 809
 810   /* Parse the regular expression, and build a structure tree.  */
 811   preg->re_nsub = 0;
 812   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 813   if (BE (dfa->str_tree == NULL, 0))
 814     goto re_compile_internal_free_return;
 815
 816 #ifdef RE_ENABLE_I18N
 817   /* If possible, do searching in single byte encoding to speed things up.  */
 818   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 819     optimize_utf8 (dfa);
 820 #endif
 821
 822   if (preg->re_nsub > 0)
 823     {
 824       struct subexp_optimize so;
 825
 826       so.dfa = dfa;
 827       so.nodes = dfa->nodes;
 828       so.no_sub = preg->no_sub;
 829       so.re_nsub = preg->re_nsub;
 830       dfa->str_tree = optimize_subexps (&so, dfa->str_tree, -1, 0);
 831     }
 832
 833   /* Analyze the tree and collect information which is necessary to
 834      create the dfa.  */
 835   err = analyze (dfa);
 836   if (BE (err != REG_NOERROR, 0))
 837     goto re_compile_internal_free_return;
 838
 839   /* Then create the initial state of the dfa.  */
 840   err = create_initial_state (dfa);
 841
 842   /* Release work areas.  */
 843   free_workarea_compile (preg);
 844   re_string_destruct (&regexp);
 845
 846   if (BE (err != REG_NOERROR, 0))
 847     {
 848       free_dfa_content (dfa);
 849       preg->buffer = NULL;
 850       preg->allocated = 0;
 851     }
 852
 853   return err;
 854 }
 855
 856 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 857    as the initial length of some arrays.  */
 858
 859 static reg_errcode_t
 860 init_dfa (dfa, pat_len)
 861      re_dfa_t *dfa;
 862      int pat_len;
 863 {
 864   int table_size;
 865 #ifndef _LIBC
 866   char *codeset_name;
 867 #endif
 868
 869   memset (dfa, '\0', sizeof (re_dfa_t));
 870
 871   /* Force allocation of str_tree_storage the first time.  */
 872   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 873
 874   dfa->nodes_alloc = pat_len + 1;
 875   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 876
 877   dfa->states_alloc = pat_len + 1;
 878
 879   /*  table_size = 2 ^ ceil(log pat_len) */
 880   for (table_size = 1; table_size > 0; table_size <<= 1)
 881     if (table_size > pat_len)
 882       break;
 883
 884   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 885   dfa->state_hash_mask = table_size - 1;
 886
 887   dfa->mb_cur_max = MB_CUR_MAX;
 888 #ifdef _LIBC
 889   if (dfa->mb_cur_max == 6
 890       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 891     dfa->is_utf8 = 1;
 892   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 893                        != 0);
 894 #else
 895 # ifdef HAVE_LANGINFO_CODESET
 896   codeset_name = nl_langinfo (CODESET);
 897 # else
 898   codeset_name = getenv ("LC_ALL");
 899   if (codeset_name == NULL || codeset[0] == '\0')
 900     codeset_name = getenv ("LC_CTYPE");
 901   if (codeset_name == NULL || codeset[0] == '\0')
 902     codeset_name = getenv ("LANG");
 903   if (codeset_name == NULL)
 904     codeset_name = "";
 905   else if (strchr (codeset_name, '.') !=  NULL)
 906     codeset_name = strchr (codeset_name, '.') + 1;
 907 # endif
 908
 909   if (strcasecmp (codeset_name, "UTF-8") == 0
 910       || strcasecmp (codeset_name, "UTF8") == 0)
 911     dfa->is_utf8 = 1;
 912
 913   /* We check exhaustively in the loop below if this charset is a
 914      superset of ASCII.  */
 915   dfa->map_notascii = 0;
 916 #endif
 917
 918 #ifdef RE_ENABLE_I18N
 919   if (dfa->mb_cur_max > 1)
 920     {
 921       if (dfa->is_utf8)
 922         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 923       else
 924         {
 925           int i, j, ch;
 926
 927           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 928           if (BE (dfa->sb_char == NULL, 0))
 929             return REG_ESPACE;
 930
 931           /* Clear all bits by, then set those corresponding to single
 932              byte chars.  */
 933           bitset_empty (dfa->sb_char);
 934
 935           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 936             for (j = 0; j < UINT_BITS; ++j, ++ch)
 937               {
 938                 wchar_t wch = __btowc (ch);
 939                 if (wch != WEOF)
 940                   dfa->sb_char[i] |= 1 << j;
 941 # ifndef _LIBC
 942                 if (isascii (ch) && wch != (wchar_t) ch)
 943                   dfa->map_notascii = 1;
 944 # endif
 945               }
 946         }
 947     }
 948 #endif
 949
 950   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 951     return REG_ESPACE;
 952   return REG_NOERROR;
 953 }
 954
 955 /* Initialize WORD_CHAR table, which indicate which character is
 956    "word".  In this case "word" means that it is the word construction
 957    character used by some operators like "\<", "\>", etc.  */
 958
 959 static void
 960 init_word_char (dfa)
 961      re_dfa_t *dfa;
 962 {
 963   int i, j, ch;
 964   dfa->word_ops_used = 1;
 965   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 966     for (j = 0; j < UINT_BITS; ++j, ++ch)
 967       if (isalnum (ch) || ch == '_')
 968         dfa->word_char[i] |= 1 << j;
 969 }
 970
 971 /* Free the work area which are only used while compiling.  */
 972
 973 static void
 974 free_workarea_compile (preg)
 975      regex_t *preg;
 976 {
 977   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 978   bin_tree_storage_t *storage, *next;
 979   for (storage = dfa->str_tree_storage; storage; storage = next)
 980     {
 981       next = storage->next;
 982       re_free (storage);
 983     }
 984   dfa->str_tree_storage = NULL;
 985   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 986   dfa->str_tree = NULL;
 987   re_free (dfa->org_indices);
 988   dfa->org_indices = NULL;
 989 }
 990
 991 /* Create initial states for all contexts.  */
 992
 993 static reg_errcode_t
 994 create_initial_state (dfa)
 995      re_dfa_t *dfa;
 996 {
 997   int first, i;
 998   reg_errcode_t err;
 999   re_node_set init_nodes;
1000
1001   /* Initial states have the epsilon closure of the node which is
1002      the first node of the regular expression.  */
1003   first = dfa->str_tree->first;
1004   dfa->init_node = first;
1005   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
1006   if (BE (err != REG_NOERROR, 0))
1007     return err;
1008
1009   /* The back-references which are in initial states can epsilon transit,
1010      since in this case all of the subexpressions can be null.
1011      Then we add epsilon closures of the nodes which are the next nodes of
1012      the back-references.  */
1013   if (dfa->nbackref > 0)
1014     for (i = 0; i < init_nodes.nelem; ++i)
1015       {
1016         int node_idx = init_nodes.elems[i];
1017         re_token_type_t type = dfa->nodes[node_idx].type;
1018
1019         int clexp_idx;
1020         if (type != OP_BACK_REF)
1021           continue;
1022         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1023           {
1024             re_token_t *clexp_node;
1025             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1026             if (clexp_node->type == OP_CLOSE_SUBEXP
1027                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1028               break;
1029           }
1030         if (clexp_idx == init_nodes.nelem)
1031           continue;
1032
1033         if (type == OP_BACK_REF)
1034           {
1035             int dest_idx = dfa->edests[node_idx].elems[0];
1036             if (!re_node_set_contains (&init_nodes, dest_idx))
1037               {
1038                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1039                 i = 0;
1040               }
1041           }
1042       }
1043
1044   /* It must be the first time to invoke acquire_state.  */
1045   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1046   /* We don't check ERR here, since the initial state must not be NULL.  */
1047   if (BE (dfa->init_state == NULL, 0))
1048     return err;
1049   if (dfa->init_state->has_constraint)
1050     {
1051       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1052                                                        CONTEXT_WORD);
1053       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1054                                                      CONTEXT_NEWLINE);
1055       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1056                                                          &init_nodes,
1057                                                          CONTEXT_NEWLINE
1058                                                          | CONTEXT_BEGBUF);
1059       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1060               || dfa->init_state_begbuf == NULL, 0))
1061         return err;
1062     }
1063   else
1064     dfa->init_state_word = dfa->init_state_nl
1065       = dfa->init_state_begbuf = dfa->init_state;
1066
1067   re_node_set_free (&init_nodes);
1068   return REG_NOERROR;
1069 }
1070 \f
1071 #ifdef RE_ENABLE_I18N
1072 /* If it is possible to do searching in single byte encoding instead of UTF-8
1073    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1074    DFA nodes where needed.  */
1075
1076 static void
1077 optimize_utf8 (dfa)
1078      re_dfa_t *dfa;
1079 {
1080   int node, i, mb_chars = 0, has_period = 0;
1081
1082   for (node = 0; node < dfa->nodes_len; ++node)
1083     switch (dfa->nodes[node].type)
1084       {
1085       case CHARACTER:
1086         if (dfa->nodes[node].opr.c >= 0x80)
1087           mb_chars = 1;
1088         break;
1089       case ANCHOR:
1090         switch (dfa->nodes[node].opr.idx)
1091           {
1092           case LINE_FIRST:
1093           case LINE_LAST:
1094           case BUF_FIRST:
1095           case BUF_LAST:
1096             break;
1097           default:
1098             /* Word anchors etc. cannot be handled.  */
1099             return;
1100           }
1101         break;
1102       case OP_PERIOD:
1103         has_period = 1;
1104         break;
1105       case OP_BACK_REF:
1106       case OP_ALT:
1107       case END_OF_RE:
1108       case OP_DUP_ASTERISK:
1109       case OP_DUP_QUESTION:
1110       case OP_OPEN_SUBEXP:
1111       case OP_CLOSE_SUBEXP:
1112         break;
1113       case SIMPLE_BRACKET:
1114         /* Just double check.  */
1115         for (i = 0x80 / UINT_BITS; i < BITSET_UINTS; ++i)
1116           if (dfa->nodes[node].opr.sbcset[i])
1117             return;
1118         break;
1119       default:
1120         return;
1121       }
1122
1123   if (mb_chars || has_period)
1124     for (node = 0; node < dfa->nodes_len; ++node)
1125       {
1126         if (dfa->nodes[node].type == CHARACTER
1127             && dfa->nodes[node].opr.c >= 0x80)
1128           dfa->nodes[node].mb_partial = 0;
1129         else if (dfa->nodes[node].type == OP_PERIOD)
1130           dfa->nodes[node].type = OP_UTF8_PERIOD;
1131       }
1132
1133   /* The search can be in single byte locale.  */
1134   dfa->mb_cur_max = 1;
1135   dfa->is_utf8 = 0;
1136   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1137 }
1138 #endif
1139 \f
1140 static bin_tree_t *
1141 optimize_subexps (so, node, sidx, depth)
1142      struct subexp_optimize *so;
1143      bin_tree_t *node;
1144      int sidx, depth;
1145 {
1146   int idx, new_depth, new_sidx;
1147   bin_tree_t *ret;
1148   if (node == NULL)
1149     return NULL;
1150
1151   new_depth = 0;
1152   new_sidx = sidx;
1153   if ((depth & 1) && node->type == CONCAT
1154       && node->right && node->right->type == 0
1155       && so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP)
1156     {
1157       new_depth = depth + 1;
1158       if (new_depth == 2
1159           || (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map)
1160               && so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx)))
1161         new_sidx = so->nodes[idx].opr.idx;
1162     }
1163   node->left = optimize_subexps (so, node->left, new_sidx, new_depth);
1164   new_depth = (depth & 1) == 0 && node->type == CONCAT
1165               && node->left && node->left->type == 0
1166               && so->nodes[node->left->node_idx].type == OP_OPEN_SUBEXP
1167               ? depth + 1 : 0;
1168   node->right = optimize_subexps (so, node->right, sidx, new_depth);
1169
1170   if (node->type != CONCAT)
1171     return node;
1172   if ((depth & 1) == 0
1173       && node->left
1174       && node->left->type == 0
1175       && so->nodes[idx = node->left->node_idx].type == OP_OPEN_SUBEXP)
1176     ret = node->right;
1177   else if ((depth & 1)
1178            && node->right
1179            && node->right->type == 0
1180            && so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP)
1181     ret = node->left;
1182   else
1183     return node;
1184
1185   if (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map)
1186       && so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx))
1187     return node;
1188
1189   if (!so->no_sub)
1190     {
1191       int i;
1192
1193       if (depth < 2)
1194         return node;
1195
1196       if (so->dfa->subexp_map == NULL)
1197         {
1198           so->dfa->subexp_map = re_malloc (int, so->re_nsub);
1199           if (so->dfa->subexp_map == NULL)
1200             return node;
1201
1202           for (i = 0; i < so->re_nsub; i++)
1203             so->dfa->subexp_map[i] = i;
1204         }
1205
1206       i = so->nodes[idx].opr.idx;
1207       assert (sidx < i);
1208       so->dfa->subexp_map[i] = sidx;
1209     }
1210
1211   so->nodes[idx].type = OP_DELETED_SUBEXP;
1212   ret->parent = node->parent;
1213   return ret;
1214 }
1215 \f
1216 /* Analyze the structure tree, and calculate "first", "next", "edest",
1217    "eclosure", and "inveclosure".  */
1218
1219 static reg_errcode_t
1220 analyze (dfa)
1221      re_dfa_t *dfa;
1222 {
1223   int i;
1224   reg_errcode_t ret;
1225
1226   /* Allocate arrays.  */
1227   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1228   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1229   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1230   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1231   dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1232   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1233           || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
1234     return REG_ESPACE;
1235   /* Initialize them.  */
1236   for (i = 0; i < dfa->nodes_len; ++i)
1237     {
1238       dfa->nexts[i] = -1;
1239       re_node_set_init_empty (dfa->edests + i);
1240       re_node_set_init_empty (dfa->eclosures + i);
1241       re_node_set_init_empty (dfa->inveclosures + i);
1242     }
1243
1244   ret = analyze_tree (dfa, dfa->str_tree);
1245   if (BE (ret == REG_NOERROR, 1))
1246     {
1247       ret = calc_eclosure (dfa);
1248       if (ret == REG_NOERROR)
1249         calc_inveclosure (dfa);
1250     }
1251   return ret;
1252 }
1253
1254 /* Helper functions for analyze.
1255    This function calculate "first", "next", and "edest" for the subtree
1256    whose root is NODE.  */
1257
1258 static reg_errcode_t
1259 analyze_tree (dfa, node)
1260      re_dfa_t *dfa;
1261      bin_tree_t *node;
1262 {
1263   reg_errcode_t ret;
1264   if (node->first == -1)
1265     calc_first (dfa, node);
1266   if (node->next == -1)
1267     calc_next (dfa, node);
1268   calc_epsdest (dfa, node);
1269
1270   /* Calculate "first" etc. for the left child.  */
1271   if (node->left != NULL)
1272     {
1273       ret = analyze_tree (dfa, node->left);
1274       if (BE (ret != REG_NOERROR, 0))
1275         return ret;
1276     }
1277   /* Calculate "first" etc. for the right child.  */
1278   if (node->right != NULL)
1279     {
1280       ret = analyze_tree (dfa, node->right);
1281       if (BE (ret != REG_NOERROR, 0))
1282         return ret;
1283     }
1284   return REG_NOERROR;
1285 }
1286
1287 /* Calculate "first" for the node NODE.  */
1288 static void
1289 calc_first (dfa, node)
1290      re_dfa_t *dfa;
1291      bin_tree_t *node;
1292 {
1293   int idx, type;
1294   idx = node->node_idx;
1295   type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1296
1297   switch (type)
1298     {
1299 #ifdef DEBUG
1300     case OP_OPEN_BRACKET:
1301     case OP_CLOSE_BRACKET:
1302     case OP_OPEN_DUP_NUM:
1303     case OP_CLOSE_DUP_NUM:
1304     case OP_DUP_PLUS:
1305     case OP_NON_MATCH_LIST:
1306     case OP_OPEN_COLL_ELEM:
1307     case OP_CLOSE_COLL_ELEM:
1308     case OP_OPEN_EQUIV_CLASS:
1309     case OP_CLOSE_EQUIV_CLASS:
1310     case OP_OPEN_CHAR_CLASS:
1311     case OP_CLOSE_CHAR_CLASS:
1312       /* These must not appear here.  */
1313       assert (0);
1314 #endif
1315     case END_OF_RE:
1316     case CHARACTER:
1317     case OP_PERIOD:
1318     case OP_DUP_ASTERISK:
1319     case OP_DUP_QUESTION:
1320 #ifdef RE_ENABLE_I18N
1321     case OP_UTF8_PERIOD:
1322     case COMPLEX_BRACKET:
1323 #endif /* RE_ENABLE_I18N */
1324     case SIMPLE_BRACKET:
1325     case OP_BACK_REF:
1326     case ANCHOR:
1327     case OP_OPEN_SUBEXP:
1328     case OP_CLOSE_SUBEXP:
1329       node->first = idx;
1330       break;
1331     case OP_ALT:
1332       node->first = idx;
1333       break;
1334       /* else fall through */
1335     default:
1336 #ifdef DEBUG
1337       assert (node->left != NULL);
1338 #endif
1339       if (node->left->first == -1)
1340         calc_first (dfa, node->left);
1341       node->first = node->left->first;
1342       break;
1343     }
1344 }
1345
1346 /* Calculate "next" for the node NODE.  */
1347
1348 static void
1349 calc_next (dfa, node)
1350      re_dfa_t *dfa;
1351      bin_tree_t *node;
1352 {
1353   int idx, type;
1354   bin_tree_t *parent = node->parent;
1355   if (parent == NULL)
1356     {
1357       node->next = -1;
1358       idx = node->node_idx;
1359       if (node->type == 0)
1360         dfa->nexts[idx] = node->next;
1361       return;
1362     }
1363
1364   idx = parent->node_idx;
1365   type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1366
1367   switch (type)
1368     {
1369     case OP_DUP_ASTERISK:
1370       node->next = idx;
1371       break;
1372     case CONCAT:
1373       if (parent->left == node)
1374         {
1375           if (parent->right->first == -1)
1376             calc_first (dfa, parent->right);
1377           node->next = parent->right->first;
1378           break;
1379         }
1380       /* else fall through */
1381     default:
1382       if (parent->next == -1)
1383         calc_next (dfa, parent);
1384       node->next = parent->next;
1385       break;
1386     }
1387   idx = node->node_idx;
1388   if (node->type == 0)
1389     dfa->nexts[idx] = node->next;
1390 }
1391
1392 /* Calculate "edest" for the node NODE.  */
1393
1394 static void
1395 calc_epsdest (dfa, node)
1396      re_dfa_t *dfa;
1397      bin_tree_t *node;
1398 {
1399   int idx;
1400   idx = node->node_idx;
1401   if (node->type == 0)
1402     {
1403       if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1404           || dfa->nodes[idx].type == OP_DUP_QUESTION)
1405         {
1406           if (node->left->first == -1)
1407             calc_first (dfa, node->left);
1408           if (node->next == -1)
1409             calc_next (dfa, node);
1410           re_node_set_init_2 (dfa->edests + idx, node->left->first,
1411                               node->next);
1412         }
1413       else if (dfa->nodes[idx].type == OP_ALT)
1414         {
1415           int left, right;
1416           if (node->left != NULL)
1417             {
1418               if (node->left->first == -1)
1419                 calc_first (dfa, node->left);
1420               left = node->left->first;
1421             }
1422           else
1423             {
1424               if (node->next == -1)
1425                 calc_next (dfa, node);
1426               left = node->next;
1427             }
1428           if (node->right != NULL)
1429             {
1430               if (node->right->first == -1)
1431                 calc_first (dfa, node->right);
1432               right = node->right->first;
1433             }
1434           else
1435             {
1436               if (node->next == -1)
1437                 calc_next (dfa, node);
1438               right = node->next;
1439             }
1440           re_node_set_init_2 (dfa->edests + idx, left, right);
1441         }
1442       else if (dfa->nodes[idx].type == ANCHOR
1443                || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1444                || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1445                || dfa->nodes[idx].type == OP_BACK_REF)
1446         re_node_set_init_1 (dfa->edests + idx, node->next);
1447       else
1448         assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
1449     }
1450 }
1451
1452 /* Duplicate the epsilon closure of the node ROOT_NODE.
1453    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1454    to their own constraint.  */
1455
1456 static reg_errcode_t
1457 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1458                         init_constraint)
1459      re_dfa_t *dfa;
1460      int top_org_node, top_clone_node, root_node;
1461      unsigned int init_constraint;
1462 {
1463   reg_errcode_t err;
1464   int org_node, clone_node, ret;
1465   unsigned int constraint = init_constraint;
1466   for (org_node = top_org_node, clone_node = top_clone_node;;)
1467     {
1468       int org_dest, clone_dest;
1469       if (dfa->nodes[org_node].type == OP_BACK_REF)
1470         {
1471           /* If the back reference epsilon-transit, its destination must
1472              also have the constraint.  Then duplicate the epsilon closure
1473              of the destination of the back reference, and store it in
1474              edests of the back reference.  */
1475           org_dest = dfa->nexts[org_node];
1476           re_node_set_empty (dfa->edests + clone_node);
1477           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1478           if (BE (err != REG_NOERROR, 0))
1479             return err;
1480           dfa->nexts[clone_node] = dfa->nexts[org_node];
1481           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1482           if (BE (ret < 0, 0))
1483             return REG_ESPACE;
1484         }
1485       else if (dfa->edests[org_node].nelem == 0)
1486         {
1487           /* In case of the node can't epsilon-transit, don't duplicate the
1488              destination and store the original destination as the
1489              destination of the node.  */
1490           dfa->nexts[clone_node] = dfa->nexts[org_node];
1491           break;
1492         }
1493       else if (dfa->edests[org_node].nelem == 1)
1494         {
1495           /* In case of the node can epsilon-transit, and it has only one
1496              destination.  */
1497           org_dest = dfa->edests[org_node].elems[0];
1498           re_node_set_empty (dfa->edests + clone_node);
1499           if (dfa->nodes[org_node].type == ANCHOR)
1500             {
1501               /* In case of the node has another constraint, append it.  */
1502               if (org_node == root_node && clone_node != org_node)
1503                 {
1504                   /* ...but if the node is root_node itself, it means the
1505                      epsilon closure have a loop, then tie it to the
1506                      destination of the root_node.  */
1507                   ret = re_node_set_insert (dfa->edests + clone_node,
1508                                             org_dest);
1509                   if (BE (ret < 0, 0))
1510                     return REG_ESPACE;
1511                   break;
1512                 }
1513               constraint |= dfa->nodes[org_node].opr.ctx_type;
1514             }
1515           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1516           if (BE (err != REG_NOERROR, 0))
1517             return err;
1518           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1519           if (BE (ret < 0, 0))
1520             return REG_ESPACE;
1521         }
1522       else /* dfa->edests[org_node].nelem == 2 */
1523         {
1524           /* In case of the node can epsilon-transit, and it has two
1525              destinations. E.g. '|', '*', '+', '?'.   */
1526           org_dest = dfa->edests[org_node].elems[0];
1527           re_node_set_empty (dfa->edests + clone_node);
1528           /* Search for a duplicated node which satisfies the constraint.  */
1529           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1530           if (clone_dest == -1)
1531             {
1532               /* There are no such a duplicated node, create a new one.  */
1533               err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1534               if (BE (err != REG_NOERROR, 0))
1535                 return err;
1536               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1537               if (BE (ret < 0, 0))
1538                 return REG_ESPACE;
1539               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1540                                             root_node, constraint);
1541               if (BE (err != REG_NOERROR, 0))
1542                 return err;
1543             }
1544           else
1545             {
1546               /* There are a duplicated node which satisfy the constraint,
1547                  use it to avoid infinite loop.  */
1548               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1549               if (BE (ret < 0, 0))
1550                 return REG_ESPACE;
1551             }
1552
1553           org_dest = dfa->edests[org_node].elems[1];
1554           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1555           if (BE (err != REG_NOERROR, 0))
1556             return err;
1557           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1558           if (BE (ret < 0, 0))
1559             return REG_ESPACE;
1560         }
1561       org_node = org_dest;
1562       clone_node = clone_dest;
1563     }
1564   return REG_NOERROR;
1565 }
1566
1567 /* Search for a node which is duplicated from the node ORG_NODE, and
1568    satisfies the constraint CONSTRAINT.  */
1569
1570 static int
1571 search_duplicated_node (dfa, org_node, constraint)
1572      re_dfa_t *dfa;
1573      int org_node;
1574      unsigned int constraint;
1575 {
1576   int idx;
1577   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1578     {
1579       if (org_node == dfa->org_indices[idx]
1580           && constraint == dfa->nodes[idx].constraint)
1581         return idx; /* Found.  */
1582     }
1583   return -1; /* Not found.  */
1584 }
1585
1586 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1587    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1588    otherwise return the error code.  */
1589
1590 static reg_errcode_t
1591 duplicate_node (new_idx, dfa, org_idx, constraint)
1592      re_dfa_t *dfa;
1593      int *new_idx, org_idx;
1594      unsigned int constraint;
1595 {
1596   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx], 1);
1597   if (BE (dup_idx == -1, 0))
1598     return REG_ESPACE;
1599   dfa->nodes[dup_idx].constraint = constraint;
1600   if (dfa->nodes[org_idx].type == ANCHOR)
1601     dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1602   dfa->nodes[dup_idx].duplicated = 1;
1603   re_node_set_init_empty (dfa->edests + dup_idx);
1604   re_node_set_init_empty (dfa->eclosures + dup_idx);
1605   re_node_set_init_empty (dfa->inveclosures + dup_idx);
1606
1607   /* Store the index of the original node.  */
1608   dfa->org_indices[dup_idx] = org_idx;
1609   *new_idx = dup_idx;
1610   return REG_NOERROR;
1611 }
1612
1613 static void
1614 calc_inveclosure (dfa)
1615      re_dfa_t *dfa;
1616 {
1617   int src, idx, dest;
1618   for (src = 0; src < dfa->nodes_len; ++src)
1619     {
1620       if (dfa->nodes[src].type == OP_DELETED_SUBEXP)
1621         continue;
1622       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1623         {
1624           dest = dfa->eclosures[src].elems[idx];
1625           re_node_set_insert_last (dfa->inveclosures + dest, src);
1626         }
1627     }
1628 }
1629
1630 /* Calculate "eclosure" for all the node in DFA.  */
1631
1632 static reg_errcode_t
1633 calc_eclosure (dfa)
1634      re_dfa_t *dfa;
1635 {
1636   int node_idx, incomplete;
1637 #ifdef DEBUG
1638   assert (dfa->nodes_len > 0);
1639 #endif
1640   incomplete = 0;
1641   /* For each nodes, calculate epsilon closure.  */
1642   for (node_idx = 0; ; ++node_idx)
1643     {
1644       reg_errcode_t err;
1645       re_node_set eclosure_elem;
1646       if (node_idx == dfa->nodes_len)
1647         {
1648           if (!incomplete)
1649             break;
1650           incomplete = 0;
1651           node_idx = 0;
1652         }
1653
1654 #ifdef DEBUG
1655       assert (dfa->eclosures[node_idx].nelem != -1);
1656 #endif
1657       if (dfa->nodes[node_idx].type == OP_DELETED_SUBEXP)
1658         continue;
1659
1660       /* If we have already calculated, skip it.  */
1661       if (dfa->eclosures[node_idx].nelem != 0)
1662         continue;
1663       /* Calculate epsilon closure of `node_idx'.  */
1664       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1665       if (BE (err != REG_NOERROR, 0))
1666         return err;
1667
1668       if (dfa->eclosures[node_idx].nelem == 0)
1669         {
1670           incomplete = 1;
1671           re_node_set_free (&eclosure_elem);
1672         }
1673     }
1674   return REG_NOERROR;
1675 }
1676
1677 /* Calculate epsilon closure of NODE.  */
1678
1679 static reg_errcode_t
1680 calc_eclosure_iter (new_set, dfa, node, root)
1681      re_node_set *new_set;
1682      re_dfa_t *dfa;
1683      int node, root;
1684 {
1685   reg_errcode_t err;
1686   unsigned int constraint;
1687   int i, incomplete;
1688   re_node_set eclosure;
1689   incomplete = 0;
1690   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1691   if (BE (err != REG_NOERROR, 0))
1692     return err;
1693
1694   /* This indicates that we are calculating this node now.
1695      We reference this value to avoid infinite loop.  */
1696   dfa->eclosures[node].nelem = -1;
1697
1698   constraint = ((dfa->nodes[node].type == ANCHOR)
1699                 ? dfa->nodes[node].opr.ctx_type : 0);
1700   /* If the current node has constraints, duplicate all nodes.
1701      Since they must inherit the constraints.  */
1702   if (constraint
1703       && dfa->edests[node].nelem
1704       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1705     {
1706       int org_node, cur_node;
1707       org_node = cur_node = node;
1708       err = duplicate_node_closure (dfa, node, node, node, constraint);
1709       if (BE (err != REG_NOERROR, 0))
1710         return err;
1711     }
1712
1713   /* Expand each epsilon destination nodes.  */
1714   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1715     for (i = 0; i < dfa->edests[node].nelem; ++i)
1716       {
1717         re_node_set eclosure_elem;
1718         int edest = dfa->edests[node].elems[i];
1719         /* If calculating the epsilon closure of `edest' is in progress,
1720            return intermediate result.  */
1721         if (dfa->eclosures[edest].nelem == -1)
1722           {
1723             incomplete = 1;
1724             continue;
1725           }
1726         /* If we haven't calculated the epsilon closure of `edest' yet,
1727            calculate now. Otherwise use calculated epsilon closure.  */
1728         if (dfa->eclosures[edest].nelem == 0)
1729           {
1730             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1731             if (BE (err != REG_NOERROR, 0))
1732               return err;
1733           }
1734         else
1735           eclosure_elem = dfa->eclosures[edest];
1736         /* Merge the epsilon closure of `edest'.  */
1737         re_node_set_merge (&eclosure, &eclosure_elem);
1738         /* If the epsilon closure of `edest' is incomplete,
1739            the epsilon closure of this node is also incomplete.  */
1740         if (dfa->eclosures[edest].nelem == 0)
1741           {
1742             incomplete = 1;
1743             re_node_set_free (&eclosure_elem);
1744           }
1745       }
1746
1747   /* Epsilon closures include itself.  */
1748   re_node_set_insert (&eclosure, node);
1749   if (incomplete && !root)
1750     dfa->eclosures[node].nelem = 0;
1751   else
1752     dfa->eclosures[node] = eclosure;
1753   *new_set = eclosure;
1754   return REG_NOERROR;
1755 }
1756 \f
1757 /* Functions for token which are used in the parser.  */
1758
1759 /* Fetch a token from INPUT.
1760    We must not use this function inside bracket expressions.  */
1761
1762 static void
1763 fetch_token (result, input, syntax)
1764      re_token_t *result;
1765      re_string_t *input;
1766      reg_syntax_t syntax;
1767 {
1768   re_string_skip_bytes (input, peek_token (result, input, syntax));
1769 }
1770
1771 /* Peek a token from INPUT, and return the length of the token.
1772    We must not use this function inside bracket expressions.  */
1773
1774 static int
1775 peek_token (token, input, syntax)
1776      re_token_t *token;
1777      re_string_t *input;
1778      reg_syntax_t syntax;
1779 {
1780   unsigned char c;
1781
1782   if (re_string_eoi (input))
1783     {
1784       token->type = END_OF_RE;
1785       return 0;
1786     }
1787
1788   c = re_string_peek_byte (input, 0);
1789   token->opr.c = c;
1790
1791   token->word_char = 0;
1792 #ifdef RE_ENABLE_I18N
1793   token->mb_partial = 0;
1794   if (input->mb_cur_max > 1 &&
1795       !re_string_first_byte (input, re_string_cur_idx (input)))
1796     {
1797       token->type = CHARACTER;
1798       token->mb_partial = 1;
1799       return 1;
1800     }
1801 #endif
1802   if (c == '\\')
1803     {
1804       unsigned char c2;
1805       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1806         {
1807           token->type = BACK_SLASH;
1808           return 1;
1809         }
1810
1811       c2 = re_string_peek_byte_case (input, 1);
1812       token->opr.c = c2;
1813       token->type = CHARACTER;
1814 #ifdef RE_ENABLE_I18N
1815       if (input->mb_cur_max > 1)
1816         {
1817           wint_t wc = re_string_wchar_at (input,
1818                                           re_string_cur_idx (input) + 1);
1819           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1820         }
1821       else
1822 #endif
1823         token->word_char = IS_WORD_CHAR (c2) != 0;
1824
1825       switch (c2)
1826         {
1827         case '|':
1828           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1829             token->type = OP_ALT;
1830           break;
1831         case '1': case '2': case '3': case '4': case '5':
1832         case '6': case '7': case '8': case '9':
1833           if (!(syntax & RE_NO_BK_REFS))
1834             {
1835               token->type = OP_BACK_REF;
1836               token->opr.idx = c2 - '1';
1837             }
1838           break;
1839         case '<':
1840           if (!(syntax & RE_NO_GNU_OPS))
1841             {
1842               token->type = ANCHOR;
1843               token->opr.ctx_type = WORD_FIRST;
1844             }
1845           break;
1846         case '>':
1847           if (!(syntax & RE_NO_GNU_OPS))
1848             {
1849               token->type = ANCHOR;
1850               token->opr.ctx_type = WORD_LAST;
1851             }
1852           break;
1853         case 'b':
1854           if (!(syntax & RE_NO_GNU_OPS))
1855             {
1856               token->type = ANCHOR;
1857               token->opr.ctx_type = WORD_DELIM;
1858             }
1859           break;
1860         case 'B':
1861           if (!(syntax & RE_NO_GNU_OPS))
1862             {
1863               token->type = ANCHOR;
1864               token->opr.ctx_type = NOT_WORD_DELIM;
1865             }
1866           break;
1867         case 'w':
1868           if (!(syntax & RE_NO_GNU_OPS))
1869             token->type = OP_WORD;
1870           break;
1871         case 'W':
1872           if (!(syntax & RE_NO_GNU_OPS))
1873             token->type = OP_NOTWORD;
1874           break;
1875         case 's':
1876           if (!(syntax & RE_NO_GNU_OPS))
1877             token->type = OP_SPACE;
1878           break;
1879         case 'S':
1880           if (!(syntax & RE_NO_GNU_OPS))
1881             token->type = OP_NOTSPACE;
1882           break;
1883         case '`':
1884           if (!(syntax & RE_NO_GNU_OPS))
1885             {
1886               token->type = ANCHOR;
1887               token->opr.ctx_type = BUF_FIRST;
1888             }
1889           break;
1890         case '\'':
1891           if (!(syntax & RE_NO_GNU_OPS))
1892             {
1893               token->type = ANCHOR;
1894               token->opr.ctx_type = BUF_LAST;
1895             }
1896           break;
1897         case '(':
1898           if (!(syntax & RE_NO_BK_PARENS))
1899             token->type = OP_OPEN_SUBEXP;
1900           break;
1901         case ')':
1902           if (!(syntax & RE_NO_BK_PARENS))
1903             token->type = OP_CLOSE_SUBEXP;
1904           break;
1905         case '+':
1906           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1907             token->type = OP_DUP_PLUS;
1908           break;
1909         case '?':
1910           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1911             token->type = OP_DUP_QUESTION;
1912           break;
1913         case '{':
1914           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1915             token->type = OP_OPEN_DUP_NUM;
1916           break;
1917         case '}':
1918           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1919             token->type = OP_CLOSE_DUP_NUM;
1920           break;
1921         default:
1922           break;
1923         }
1924       return 2;
1925     }
1926
1927   token->type = CHARACTER;
1928 #ifdef RE_ENABLE_I18N
1929   if (input->mb_cur_max > 1)
1930     {
1931       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1932       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1933     }
1934   else
1935 #endif
1936     token->word_char = IS_WORD_CHAR (token->opr.c);
1937
1938   switch (c)
1939     {
1940     case '\n':
1941       if (syntax & RE_NEWLINE_ALT)
1942         token->type = OP_ALT;
1943       break;
1944     case '|':
1945       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1946         token->type = OP_ALT;
1947       break;
1948     case '*':
1949       token->type = OP_DUP_ASTERISK;
1950       break;
1951     case '+':
1952       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1953         token->type = OP_DUP_PLUS;
1954       break;
1955     case '?':
1956       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1957         token->type = OP_DUP_QUESTION;
1958       break;
1959     case '{':
1960       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1961         token->type = OP_OPEN_DUP_NUM;
1962       break;
1963     case '}':
1964       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1965         token->type = OP_CLOSE_DUP_NUM;
1966       break;
1967     case '(':
1968       if (syntax & RE_NO_BK_PARENS)
1969         token->type = OP_OPEN_SUBEXP;
1970       break;
1971     case ')':
1972       if (syntax & RE_NO_BK_PARENS)
1973         token->type = OP_CLOSE_SUBEXP;
1974       break;
1975     case '[':
1976       token->type = OP_OPEN_BRACKET;
1977       break;
1978     case '.':
1979       token->type = OP_PERIOD;
1980       break;
1981     case '^':
1982       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1983           re_string_cur_idx (input) != 0)
1984         {
1985           char prev = re_string_peek_byte (input, -1);
1986           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1987             break;
1988         }
1989       token->type = ANCHOR;
1990       token->opr.ctx_type = LINE_FIRST;
1991       break;
1992     case '$':
1993       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1994           re_string_cur_idx (input) + 1 != re_string_length (input))
1995         {
1996           re_token_t next;
1997           re_string_skip_bytes (input, 1);
1998           peek_token (&next, input, syntax);
1999           re_string_skip_bytes (input, -1);
2000           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
2001             break;
2002         }
2003       token->type = ANCHOR;
2004       token->opr.ctx_type = LINE_LAST;
2005       break;
2006     default:
2007       break;
2008     }
2009   return 1;
2010 }
2011
2012 /* Peek a token from INPUT, and return the length of the token.
2013    We must not use this function out of bracket expressions.  */
2014
2015 static int
2016 peek_token_bracket (token, input, syntax)
2017      re_token_t *token;
2018      re_string_t *input;
2019      reg_syntax_t syntax;
2020 {
2021   unsigned char c;
2022   if (re_string_eoi (input))
2023     {
2024       token->type = END_OF_RE;
2025       return 0;
2026     }
2027   c = re_string_peek_byte (input, 0);
2028   token->opr.c = c;
2029
2030 #ifdef RE_ENABLE_I18N
2031   if (input->mb_cur_max > 1 &&
2032       !re_string_first_byte (input, re_string_cur_idx (input)))
2033     {
2034       token->type = CHARACTER;
2035       return 1;
2036     }
2037 #endif /* RE_ENABLE_I18N */
2038
2039   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2040       && re_string_cur_idx (input) + 1 < re_string_length (input))
2041     {
2042       /* In this case, '\' escape a character.  */
2043       unsigned char c2;
2044       re_string_skip_bytes (input, 1);
2045       c2 = re_string_peek_byte (input, 0);
2046       token->opr.c = c2;
2047       token->type = CHARACTER;
2048       return 1;
2049     }
2050   if (c == '[') /* '[' is a special char in a bracket exps.  */
2051     {
2052       unsigned char c2;
2053       int token_len;
2054       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2055         c2 = re_string_peek_byte (input, 1);
2056       else
2057         c2 = 0;
2058       token->opr.c = c2;
2059       token_len = 2;
2060       switch (c2)
2061         {
2062         case '.':
2063           token->type = OP_OPEN_COLL_ELEM;
2064           break;
2065         case '=':
2066           token->type = OP_OPEN_EQUIV_CLASS;
2067           break;
2068         case ':':
2069           if (syntax & RE_CHAR_CLASSES)
2070             {
2071               token->type = OP_OPEN_CHAR_CLASS;
2072               break;
2073             }
2074           /* else fall through.  */
2075         default:
2076           token->type = CHARACTER;
2077           token->opr.c = c;
2078           token_len = 1;
2079           break;
2080         }
2081       return token_len;
2082     }
2083   switch (c)
2084     {
2085     case '-':
2086       token->type = OP_CHARSET_RANGE;
2087       break;
2088     case ']':
2089       token->type = OP_CLOSE_BRACKET;
2090       break;
2091     case '^':
2092       token->type = OP_NON_MATCH_LIST;
2093       break;
2094     default:
2095       token->type = CHARACTER;
2096     }
2097   return 1;
2098 }
2099 \f
2100 /* Functions for parser.  */
2101
2102 /* Entry point of the parser.
2103    Parse the regular expression REGEXP and return the structure tree.
2104    If an error is occured, ERR is set by error code, and return NULL.
2105    This function build the following tree, from regular expression <reg_exp>:
2106            CAT
2107            / \
2108           /   \
2109    <reg_exp>  EOR
2110
2111    CAT means concatenation.
2112    EOR means end of regular expression.  */
2113
2114 static bin_tree_t *
2115 parse (regexp, preg, syntax, err)
2116      re_string_t *regexp;
2117      regex_t *preg;
2118      reg_syntax_t syntax;
2119      reg_errcode_t *err;
2120 {
2121   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2122   bin_tree_t *tree, *eor, *root;
2123   re_token_t current_token;
2124   dfa->syntax = syntax;
2125   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2126   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2127   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2128     return NULL;
2129   eor = re_dfa_add_tree_node (dfa, NULL, NULL, &current_token);
2130   if (tree != NULL)
2131     root = create_tree (dfa, tree, eor, CONCAT, 0);
2132   else
2133     root = eor;
2134   if (BE (eor == NULL || root == NULL, 0))
2135     {
2136       *err = REG_ESPACE;
2137       return NULL;
2138     }
2139   return root;
2140 }
2141
2142 /* This function build the following tree, from regular expression
2143    <branch1>|<branch2>:
2144            ALT
2145            / \
2146           /   \
2147    <branch1> <branch2>
2148
2149    ALT means alternative, which represents the operator `|'.  */
2150
2151 static bin_tree_t *
2152 parse_reg_exp (regexp, preg, token, syntax, nest, err)
2153      re_string_t *regexp;
2154      regex_t *preg;
2155      re_token_t *token;
2156      reg_syntax_t syntax;
2157      int nest;
2158      reg_errcode_t *err;
2159 {
2160   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2161   bin_tree_t *tree, *branch = NULL;
2162   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2163   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2164     return NULL;
2165
2166   while (token->type == OP_ALT)
2167     {
2168       re_token_t alt_token = *token;
2169       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2170       if (token->type != OP_ALT && token->type != END_OF_RE
2171           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2172         {
2173           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2174           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2175             return NULL;
2176         }
2177       else
2178         branch = NULL;
2179       tree = re_dfa_add_tree_node (dfa, tree, branch, &alt_token);
2180       if (BE (tree == NULL, 0))
2181         {
2182           *err = REG_ESPACE;
2183           return NULL;
2184         }
2185       dfa->has_plural_match = 1;
2186     }
2187   return tree;
2188 }
2189
2190 /* This function build the following tree, from regular expression
2191    <exp1><exp2>:
2192         CAT
2193         / \
2194        /   \
2195    <exp1> <exp2>
2196
2197    CAT means concatenation.  */
2198
2199 static bin_tree_t *
2200 parse_branch (regexp, preg, token, syntax, nest, err)
2201      re_string_t *regexp;
2202      regex_t *preg;
2203      re_token_t *token;
2204      reg_syntax_t syntax;
2205      int nest;
2206      reg_errcode_t *err;
2207 {
2208   bin_tree_t *tree, *exp;
2209   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2210   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2211   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2212     return NULL;
2213
2214   while (token->type != OP_ALT && token->type != END_OF_RE
2215          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2216     {
2217       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2218       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2219         {
2220           return NULL;
2221         }
2222       if (tree != NULL && exp != NULL)
2223         {
2224           tree = create_tree (dfa, tree, exp, CONCAT, 0);
2225           if (tree == NULL)
2226             {
2227               *err = REG_ESPACE;
2228               return NULL;
2229             }
2230         }
2231       else if (tree == NULL)
2232         tree = exp;
2233       /* Otherwise exp == NULL, we don't need to create new tree.  */
2234     }
2235   return tree;
2236 }
2237
2238 /* This function build the following tree, from regular expression a*:
2239          *
2240          |
2241          a
2242 */
2243
2244 static bin_tree_t *
2245 parse_expression (regexp, preg, token, syntax, nest, err)
2246      re_string_t *regexp;
2247      regex_t *preg;
2248      re_token_t *token;
2249      reg_syntax_t syntax;
2250      int nest;
2251      reg_errcode_t *err;
2252 {
2253   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2254   bin_tree_t *tree;
2255   switch (token->type)
2256     {
2257     case CHARACTER:
2258       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2259       if (BE (tree == NULL, 0))
2260         {
2261           *err = REG_ESPACE;
2262           return NULL;
2263         }
2264 #ifdef RE_ENABLE_I18N
2265       if (dfa->mb_cur_max > 1)
2266         {
2267           while (!re_string_eoi (regexp)
2268                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2269             {
2270               bin_tree_t *mbc_remain;
2271               fetch_token (token, regexp, syntax);
2272               mbc_remain = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2273               tree = create_tree (dfa, tree, mbc_remain, CONCAT, 0);
2274               if (BE (mbc_remain == NULL || tree == NULL, 0))
2275                 {
2276                   *err = REG_ESPACE;
2277                   return NULL;
2278                 }
2279             }
2280         }
2281 #endif
2282       break;
2283     case OP_OPEN_SUBEXP:
2284       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2285       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2286         return NULL;
2287       break;
2288     case OP_OPEN_BRACKET:
2289       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2290       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2291         return NULL;
2292       break;
2293     case OP_BACK_REF:
2294       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2295         {
2296           *err = REG_ESUBREG;
2297           return NULL;
2298         }
2299       dfa->used_bkref_map |= 1 << token->opr.idx;
2300       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2301       if (BE (tree == NULL, 0))
2302         {
2303           *err = REG_ESPACE;
2304           return NULL;
2305         }
2306       ++dfa->nbackref;
2307       dfa->has_mb_node = 1;
2308       break;
2309     case OP_OPEN_DUP_NUM:
2310       if (syntax & RE_CONTEXT_INVALID_DUP)
2311         {
2312           *err = REG_BADRPT;
2313           return NULL;
2314         }
2315       /* FALLTHROUGH */
2316     case OP_DUP_ASTERISK:
2317     case OP_DUP_PLUS:
2318     case OP_DUP_QUESTION:
2319       if (syntax & RE_CONTEXT_INVALID_OPS)
2320         {
2321           *err = REG_BADRPT;
2322           return NULL;
2323         }
2324       else if (syntax & RE_CONTEXT_INDEP_OPS)
2325         {
2326           fetch_token (token, regexp, syntax);
2327           return parse_expression (regexp, preg, token, syntax, nest, err);
2328         }
2329       /* else fall through  */
2330     case OP_CLOSE_SUBEXP:
2331       if ((token->type == OP_CLOSE_SUBEXP) &&
2332           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2333         {
2334           *err = REG_ERPAREN;
2335           return NULL;
2336         }
2337       /* else fall through  */
2338     case OP_CLOSE_DUP_NUM:
2339       /* We treat it as a normal character.  */
2340
2341       /* Then we can these characters as normal characters.  */
2342       token->type = CHARACTER;
2343       /* mb_partial and word_char bits should be initialized already
2344          by peek_token.  */
2345       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2346       if (BE (tree == NULL, 0))
2347         {
2348           *err = REG_ESPACE;
2349           return NULL;
2350         }
2351       break;
2352     case ANCHOR:
2353       if ((token->opr.ctx_type
2354            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2355           && dfa->word_ops_used == 0)
2356         init_word_char (dfa);
2357       if (token->opr.ctx_type == WORD_DELIM
2358           || token->opr.ctx_type == NOT_WORD_DELIM)
2359         {
2360           bin_tree_t *tree_first, *tree_last;
2361           if (token->opr.ctx_type == WORD_DELIM)
2362             {
2363               token->opr.ctx_type = WORD_FIRST;
2364               tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2365               token->opr.ctx_type = WORD_LAST;
2366             }
2367           else
2368             {
2369               token->opr.ctx_type = INSIDE_WORD;
2370               tree_first = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2371               token->opr.ctx_type = INSIDE_NOTWORD;
2372             }
2373           tree_last = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2374           token->type = OP_ALT;
2375           tree = re_dfa_add_tree_node (dfa, tree_first, tree_last, token);
2376           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2377             {
2378               *err = REG_ESPACE;
2379               return NULL;
2380             }
2381         }
2382       else
2383         {
2384           tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2385           if (BE (tree == NULL, 0))
2386             {
2387               *err = REG_ESPACE;
2388               return NULL;
2389             }
2390         }
2391       /* We must return here, since ANCHORs can't be followed
2392          by repetition operators.
2393          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2394              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2395       fetch_token (token, regexp, syntax);
2396       return tree;
2397     case OP_PERIOD:
2398       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2399       if (BE (tree == NULL, 0))
2400         {
2401           *err = REG_ESPACE;
2402           return NULL;
2403         }
2404       if (dfa->mb_cur_max > 1)
2405         dfa->has_mb_node = 1;
2406       break;
2407     case OP_WORD:
2408     case OP_NOTWORD:
2409       tree = build_charclass_op (dfa, regexp->trans,
2410                                  (const unsigned char *) "alnum",
2411                                  (const unsigned char *) "_",
2412                                  token->type == OP_NOTWORD, err);
2413       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2414         return NULL;
2415       break;
2416     case OP_SPACE:
2417     case OP_NOTSPACE:
2418       tree = build_charclass_op (dfa, regexp->trans,
2419                                  (const unsigned char *) "space",
2420                                  (const unsigned char *) "",
2421                                  token->type == OP_NOTSPACE, err);
2422       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2423         return NULL;
2424       break;
2425     case OP_ALT:
2426     case END_OF_RE:
2427       return NULL;
2428     case BACK_SLASH:
2429       *err = REG_EESCAPE;
2430       return NULL;
2431     default:
2432       /* Must not happen?  */
2433 #ifdef DEBUG
2434       assert (0);
2435 #endif
2436       return NULL;
2437     }
2438   fetch_token (token, regexp, syntax);
2439
2440   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2441          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2442     {
2443       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2444       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2445         return NULL;
2446       /* In BRE consecutive duplications are not allowed.  */
2447       if ((syntax & RE_CONTEXT_INVALID_DUP)
2448           && (token->type == OP_DUP_ASTERISK
2449               || token->type == OP_OPEN_DUP_NUM))
2450         {
2451           *err = REG_BADRPT;
2452           return NULL;
2453         }
2454       dfa->has_plural_match = 1;
2455     }
2456
2457   return tree;
2458 }
2459
2460 /* This function build the following tree, from regular expression
2461    (<reg_exp>):
2462          SUBEXP
2463             |
2464         <reg_exp>
2465 */
2466
2467 static bin_tree_t *
2468 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2469      re_string_t *regexp;
2470      regex_t *preg;
2471      re_token_t *token;
2472      reg_syntax_t syntax;
2473      int nest;
2474      reg_errcode_t *err;
2475 {
2476   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2477   bin_tree_t *tree, *left_par, *right_par;
2478   size_t cur_nsub;
2479   cur_nsub = preg->re_nsub++;
2480
2481   left_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2482   if (BE (left_par == NULL, 0))
2483     {
2484       *err = REG_ESPACE;
2485       return NULL;
2486     }
2487   dfa->nodes[left_par->node_idx].opr.idx = cur_nsub;
2488   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2489
2490   /* The subexpression may be a null string.  */
2491   if (token->type == OP_CLOSE_SUBEXP)
2492     tree = NULL;
2493   else
2494     {
2495       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2496       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2497         return NULL;
2498     }
2499   if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2500     {
2501       *err = REG_EPAREN;
2502       return NULL;
2503     }
2504   right_par = re_dfa_add_tree_node (dfa, NULL, NULL, token);
2505   dfa->completed_bkref_map |= 1 << cur_nsub;
2506   tree = ((tree == NULL) ? right_par
2507           : create_tree (dfa, tree, right_par, CONCAT, 0));
2508   tree = create_tree (dfa, left_par, tree, CONCAT, 0);
2509   if (BE (right_par == NULL || tree == NULL, 0))
2510     {
2511       *err = REG_ESPACE;
2512       return NULL;
2513     }
2514   dfa->nodes[right_par->node_idx].opr.idx = cur_nsub;
2515
2516   return tree;
2517 }
2518
2519 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2520
2521 static bin_tree_t *
2522 parse_dup_op (elem, regexp, dfa, token, syntax, err)
2523      bin_tree_t *elem;
2524      re_string_t *regexp;
2525      re_dfa_t *dfa;
2526      re_token_t *token;
2527      reg_syntax_t syntax;
2528      reg_errcode_t *err;
2529 {
2530   re_token_t dup_token;
2531   bin_tree_t *tree = NULL, *old_tree = NULL;
2532   int i, start, end, start_idx = re_string_cur_idx (regexp);
2533   re_token_t start_token = *token;
2534
2535   if (token->type == OP_OPEN_DUP_NUM)
2536     {
2537       end = 0;
2538       start = fetch_number (regexp, token, syntax);
2539       if (start == -1)
2540         {
2541           if (token->type == CHARACTER && token->opr.c == ',')
2542             start = 0; /* We treat "{,m}" as "{0,m}".  */
2543           else
2544             {
2545               *err = REG_BADBR; /* <re>{} is invalid.  */
2546               return NULL;
2547             }
2548         }
2549       if (BE (start != -2, 1))
2550         {
2551           /* We treat "{n}" as "{n,n}".  */
2552           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2553                  : ((token->type == CHARACTER && token->opr.c == ',')
2554                     ? fetch_number (regexp, token, syntax) : -2));
2555         }
2556       if (BE (start == -2 || end == -2, 0))
2557         {
2558           /* Invalid sequence.  */
2559           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2560             {
2561               if (token->type == END_OF_RE)
2562                 *err = REG_EBRACE;
2563               else
2564                 *err = REG_BADBR;
2565
2566               return NULL;
2567             }
2568
2569           /* If the syntax bit is set, rollback.  */
2570           re_string_set_index (regexp, start_idx);
2571           *token = start_token;
2572           token->type = CHARACTER;
2573           /* mb_partial and word_char bits should be already initialized by
2574              peek_token.  */
2575           return elem;
2576         }
2577
2578       if (BE (end != -1 && start > end, 0))
2579         {
2580           /* First number greater than second.  */
2581           *err = REG_BADBR;
2582           return NULL;
2583         }
2584     }
2585   else
2586     {
2587       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2588       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2589     }
2590
2591   fetch_token (token, regexp, syntax);
2592
2593   /* Treat "<re>{0}*" etc. as "<re>{0}".  */
2594   if (BE (elem == NULL || (start == 0 && end == 0), 0))
2595     return NULL;
2596
2597   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2598   if (BE (start > 0, 0))
2599     {
2600       tree = elem;
2601       for (i = 2; i <= start; ++i)
2602         {
2603           elem = duplicate_tree (elem, dfa);
2604           tree = create_tree (dfa, tree, elem, CONCAT, 0);
2605           if (BE (elem == NULL || tree == NULL, 0))
2606             goto parse_dup_op_espace;
2607         }
2608
2609       if (start == end)
2610         return tree;
2611
2612       /* Duplicate ELEM before it is marked optional.  */
2613       elem = duplicate_tree (elem, dfa);
2614       old_tree = tree;
2615     }
2616   else
2617     old_tree = NULL;
2618
2619   mark_opt_subexp (elem, dfa);
2620   dup_token.type = (end == -1 ? OP_DUP_ASTERISK : OP_DUP_QUESTION);
2621   tree = re_dfa_add_tree_node (dfa, elem, NULL, &dup_token);
2622   if (BE (tree == NULL, 0))
2623     goto parse_dup_op_espace;
2624
2625   /* This loop is actually executed only when end != -1,
2626      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2627      already created the start+1-th copy.  */
2628   for (i = start + 2; i <= end; ++i)
2629     {
2630       elem = duplicate_tree (elem, dfa);
2631       tree = create_tree (dfa, tree, elem, CONCAT, 0);
2632       if (BE (elem == NULL || tree == NULL, 0))
2633         goto parse_dup_op_espace;
2634
2635       tree = re_dfa_add_tree_node (dfa, tree, NULL, &dup_token);
2636       if (BE (tree == NULL, 0))
2637         goto parse_dup_op_espace;
2638     }
2639
2640   if (old_tree)
2641     tree = create_tree (dfa, old_tree, tree, CONCAT, 0);
2642
2643   return tree;
2644
2645  parse_dup_op_espace:
2646   *err = REG_ESPACE;
2647   return NULL;
2648 }
2649
2650 /* Size of the names for collating symbol/equivalence_class/character_class.
2651    I'm not sure, but maybe enough.  */
2652 #define BRACKET_NAME_BUF_SIZE 32
2653
2654 #ifndef _LIBC
2655   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2656      Build the range expression which starts from START_ELEM, and ends
2657      at END_ELEM.  The result are written to MBCSET and SBCSET.
2658      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2659      mbcset->range_ends, is a pointer argument sinse we may
2660      update it.  */
2661
2662 static reg_errcode_t
2663 # ifdef RE_ENABLE_I18N
2664 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2665      re_charset_t *mbcset;
2666      int *range_alloc;
2667 # else /* not RE_ENABLE_I18N */
2668 build_range_exp (sbcset, start_elem, end_elem)
2669 # endif /* not RE_ENABLE_I18N */
2670      re_bitset_ptr_t sbcset;
2671      bracket_elem_t *start_elem, *end_elem;
2672 {
2673   unsigned int start_ch, end_ch;
2674   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2675   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2676           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2677           0))
2678     return REG_ERANGE;
2679
2680   /* We can handle no multi character collating elements without libc
2681      support.  */
2682   if (BE ((start_elem->type == COLL_SYM
2683            && strlen ((char *) start_elem->opr.name) > 1)
2684           || (end_elem->type == COLL_SYM
2685               && strlen ((char *) end_elem->opr.name) > 1), 0))
2686     return REG_ECOLLATE;
2687
2688 # ifdef RE_ENABLE_I18N
2689   {
2690     wchar_t wc, start_wc, end_wc;
2691     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2692
2693     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2694                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2695                    : 0));
2696     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2697               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2698                  : 0));
2699     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2700                 ? __btowc (start_ch) : start_elem->opr.wch);
2701     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2702               ? __btowc (end_ch) : end_elem->opr.wch);
2703     if (start_wc == WEOF || end_wc == WEOF)
2704       return REG_ECOLLATE;
2705     cmp_buf[0] = start_wc;
2706     cmp_buf[4] = end_wc;
2707     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2708       return REG_ERANGE;
2709
2710     /* Got valid collation sequence values, add them as a new entry.
2711        However, for !_LIBC we have no collation elements: if the
2712        character set is single byte, the single byte character set
2713        that we build below suffices.  parse_bracket_exp passes
2714        no MBCSET if dfa->mb_cur_max == 1.  */
2715     if (mbcset)
2716       {
2717         /* Check the space of the arrays.  */
2718         if (BE (*range_alloc == mbcset->nranges, 0))
2719           {
2720             /* There is not enough space, need realloc.  */
2721             wchar_t *new_array_start, *new_array_end;
2722             int new_nranges;
2723
2724             /* +1 in case of mbcset->nranges is 0.  */
2725             new_nranges = 2 * mbcset->nranges + 1;
2726             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2727                are NULL if *range_alloc == 0.  */
2728             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2729                                           new_nranges);
2730             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2731                                         new_nranges);
2732
2733             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2734               return REG_ESPACE;
2735
2736             mbcset->range_starts = new_array_start;
2737             mbcset->range_ends = new_array_end;
2738             *range_alloc = new_nranges;
2739           }
2740
2741         mbcset->range_starts[mbcset->nranges] = start_wc;
2742         mbcset->range_ends[mbcset->nranges++] = end_wc;
2743       }
2744
2745     /* Build the table for single byte characters.  */
2746     for (wc = 0; wc < SBC_MAX; ++wc)
2747       {
2748         cmp_buf[2] = wc;
2749         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2750             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2751           bitset_set (sbcset, wc);
2752       }
2753   }
2754 # else /* not RE_ENABLE_I18N */
2755   {
2756     unsigned int ch;
2757     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2758                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2759                    : 0));
2760     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2761               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2762                  : 0));
2763     if (start_ch > end_ch)
2764       return REG_ERANGE;
2765     /* Build the table for single byte characters.  */
2766     for (ch = 0; ch < SBC_MAX; ++ch)
2767       if (start_ch <= ch  && ch <= end_ch)
2768         bitset_set (sbcset, ch);
2769   }
2770 # endif /* not RE_ENABLE_I18N */
2771   return REG_NOERROR;
2772 }
2773 #endif /* not _LIBC */
2774
2775 #ifndef _LIBC
2776 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2777    Build the collating element which is represented by NAME.
2778    The result are written to MBCSET and SBCSET.
2779    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2780    pointer argument since we may update it.  */
2781
2782 static reg_errcode_t
2783 # ifdef RE_ENABLE_I18N
2784 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2785      re_charset_t *mbcset;
2786      int *coll_sym_alloc;
2787 # else /* not RE_ENABLE_I18N */
2788 build_collating_symbol (sbcset, name)
2789 # endif /* not RE_ENABLE_I18N */
2790      re_bitset_ptr_t sbcset;
2791      const unsigned char *name;
2792 {
2793   size_t name_len = strlen ((const char *) name);
2794   if (BE (name_len != 1, 0))
2795     return REG_ECOLLATE;
2796   else
2797     {
2798       bitset_set (sbcset, name[0]);
2799       return REG_NOERROR;
2800     }
2801 }
2802 #endif /* not _LIBC */
2803
2804 /* This function parse bracket expression like "[abc]", "[a-c]",
2805    "[[.a-a.]]" etc.  */
2806
2807 static bin_tree_t *
2808 parse_bracket_exp (regexp, dfa, token, syntax, err)
2809      re_string_t *regexp;
2810      re_dfa_t *dfa;
2811      re_token_t *token;
2812      reg_syntax_t syntax;
2813      reg_errcode_t *err;
2814 {
2815 #ifdef _LIBC
2816   const unsigned char *collseqmb;
2817   const char *collseqwc;
2818   uint32_t nrules;
2819   int32_t table_size;
2820   const int32_t *symb_table;
2821   const unsigned char *extra;
2822
2823   /* Local function for parse_bracket_exp used in _LIBC environement.
2824      Seek the collating symbol entry correspondings to NAME.
2825      Return the index of the symbol in the SYMB_TABLE.  */
2826
2827   auto inline int32_t
2828   __attribute ((always_inline))
2829   seek_collating_symbol_entry (name, name_len)
2830          const unsigned char *name;
2831          size_t name_len;
2832     {
2833       int32_t hash = elem_hash ((const char *) name, name_len);
2834       int32_t elem = hash % table_size;
2835       int32_t second = hash % (table_size - 2);
2836       while (symb_table[2 * elem] != 0)
2837         {
2838           /* First compare the hashing value.  */
2839           if (symb_table[2 * elem] == hash
2840               /* Compare the length of the name.  */
2841               && name_len == extra[symb_table[2 * elem + 1]]
2842               /* Compare the name.  */
2843               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2844                          name_len) == 0)
2845             {
2846               /* Yep, this is the entry.  */
2847               break;
2848             }
2849
2850           /* Next entry.  */
2851           elem += second;
2852         }
2853       return elem;
2854     }
2855
2856   /* Local function for parse_bracket_exp used in _LIBC environement.
2857      Look up the collation sequence value of BR_ELEM.
2858      Return the value if succeeded, UINT_MAX otherwise.  */
2859
2860   auto inline unsigned int
2861   __attribute ((always_inline))
2862   lookup_collation_sequence_value (br_elem)
2863          bracket_elem_t *br_elem;
2864     {
2865       if (br_elem->type == SB_CHAR)
2866         {
2867           /*
2868           if (MB_CUR_MAX == 1)
2869           */
2870           if (nrules == 0)
2871             return collseqmb[br_elem->opr.ch];
2872           else
2873             {
2874               wint_t wc = __btowc (br_elem->opr.ch);
2875               return __collseq_table_lookup (collseqwc, wc);
2876             }
2877         }
2878       else if (br_elem->type == MB_CHAR)
2879         {
2880           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2881         }
2882       else if (br_elem->type == COLL_SYM)
2883         {
2884           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2885           if (nrules != 0)
2886             {
2887               int32_t elem, idx;
2888               elem = seek_collating_symbol_entry (br_elem->opr.name,
2889                                                   sym_name_len);
2890               if (symb_table[2 * elem] != 0)
2891                 {
2892                   /* We found the entry.  */
2893                   idx = symb_table[2 * elem + 1];
2894                   /* Skip the name of collating element name.  */
2895                   idx += 1 + extra[idx];
2896                   /* Skip the byte sequence of the collating element.  */
2897                   idx += 1 + extra[idx];
2898                   /* Adjust for the alignment.  */
2899                   idx = (idx + 3) & ~3;
2900                   /* Skip the multibyte collation sequence value.  */
2901                   idx += sizeof (unsigned int);
2902                   /* Skip the wide char sequence of the collating element.  */
2903                   idx += sizeof (unsigned int) *
2904                     (1 + *(unsigned int *) (extra + idx));
2905                   /* Return the collation sequence value.  */
2906                   return *(unsigned int *) (extra + idx);
2907                 }
2908               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2909                 {
2910                   /* No valid character.  Match it as a single byte
2911                      character.  */
2912                   return collseqmb[br_elem->opr.name[0]];
2913                 }
2914             }
2915           else if (sym_name_len == 1)
2916             return collseqmb[br_elem->opr.name[0]];
2917         }
2918       return UINT_MAX;
2919     }
2920
2921   /* Local function for parse_bracket_exp used in _LIBC environement.
2922      Build the range expression which starts from START_ELEM, and ends
2923      at END_ELEM.  The result are written to MBCSET and SBCSET.
2924      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2925      mbcset->range_ends, is a pointer argument sinse we may
2926      update it.  */
2927
2928   auto inline reg_errcode_t
2929   __attribute ((always_inline))
2930   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2931          re_charset_t *mbcset;
2932          int *range_alloc;
2933          re_bitset_ptr_t sbcset;
2934          bracket_elem_t *start_elem, *end_elem;
2935     {
2936       unsigned int ch;
2937       uint32_t start_collseq;
2938       uint32_t end_collseq;
2939
2940       /* Equivalence Classes and Character Classes can't be a range
2941          start/end.  */
2942       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2943               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2944               0))
2945         return REG_ERANGE;
2946
2947       start_collseq = lookup_collation_sequence_value (start_elem);
2948       end_collseq = lookup_collation_sequence_value (end_elem);
2949       /* Check start/end collation sequence values.  */
2950       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2951         return REG_ECOLLATE;
2952       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2953         return REG_ERANGE;
2954
2955       /* Got valid collation sequence values, add them as a new entry.
2956          However, if we have no collation elements, and the character set
2957          is single byte, the single byte character set that we
2958          build below suffices. */
2959       if (nrules > 0 || dfa->mb_cur_max > 1)
2960         {
2961           /* Check the space of the arrays.  */
2962           if (BE (*range_alloc == mbcset->nranges, 0))
2963             {
2964               /* There is not enough space, need realloc.  */
2965               uint32_t *new_array_start;
2966               uint32_t *new_array_end;
2967               int new_nranges;
2968
2969               /* +1 in case of mbcset->nranges is 0.  */
2970               new_nranges = 2 * mbcset->nranges + 1;
2971               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2972                                             new_nranges);
2973               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2974                                           new_nranges);
2975
2976               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2977                 return REG_ESPACE;
2978
2979               mbcset->range_starts = new_array_start;
2980               mbcset->range_ends = new_array_end;
2981               *range_alloc = new_nranges;
2982             }
2983
2984           mbcset->range_starts[mbcset->nranges] = start_collseq;
2985           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2986         }
2987
2988       /* Build the table for single byte characters.  */
2989       for (ch = 0; ch < SBC_MAX; ch++)
2990         {
2991           uint32_t ch_collseq;
2992           /*
2993           if (MB_CUR_MAX == 1)
2994           */
2995           if (nrules == 0)
2996             ch_collseq = collseqmb[ch];
2997           else
2998             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2999           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
3000             bitset_set (sbcset, ch);
3001         }
3002       return REG_NOERROR;
3003     }
3004
3005   /* Local function for parse_bracket_exp used in _LIBC environement.
3006      Build the collating element which is represented by NAME.
3007      The result are written to MBCSET and SBCSET.
3008      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
3009      pointer argument sinse we may update it.  */
3010
3011   auto inline reg_errcode_t
3012   __attribute ((always_inline))
3013   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
3014          re_charset_t *mbcset;
3015          int *coll_sym_alloc;
3016          re_bitset_ptr_t sbcset;
3017          const unsigned char *name;
3018     {
3019       int32_t elem, idx;
3020       size_t name_len = strlen ((const char *) name);
3021       if (nrules != 0)
3022         {
3023           elem = seek_collating_symbol_entry (name, name_len);
3024           if (symb_table[2 * elem] != 0)
3025             {
3026               /* We found the entry.  */
3027               idx = symb_table[2 * elem + 1];
3028               /* Skip the name of collating element name.  */
3029               idx += 1 + extra[idx];
3030             }
3031           else if (symb_table[2 * elem] == 0 && name_len == 1)
3032             {
3033               /* No valid character, treat it as a normal
3034                  character.  */
3035               bitset_set (sbcset, name[0]);
3036               return REG_NOERROR;
3037             }
3038           else
3039             return REG_ECOLLATE;
3040
3041           /* Got valid collation sequence, add it as a new entry.  */
3042           /* Check the space of the arrays.  */
3043           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
3044             {
3045               /* Not enough, realloc it.  */
3046               /* +1 in case of mbcset->ncoll_syms is 0.  */
3047               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
3048               /* Use realloc since mbcset->coll_syms is NULL
3049                  if *alloc == 0.  */
3050               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3051                                                    new_coll_sym_alloc);
3052               if (BE (new_coll_syms == NULL, 0))
3053                 return REG_ESPACE;
3054               mbcset->coll_syms = new_coll_syms;
3055               *coll_sym_alloc = new_coll_sym_alloc;
3056             }
3057           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3058           return REG_NOERROR;
3059         }
3060       else
3061         {
3062           if (BE (name_len != 1, 0))
3063             return REG_ECOLLATE;
3064           else
3065             {
3066               bitset_set (sbcset, name[0]);
3067               return REG_NOERROR;
3068             }
3069         }
3070     }
3071 #endif
3072
3073   re_token_t br_token;
3074   re_bitset_ptr_t sbcset;
3075 #ifdef RE_ENABLE_I18N
3076   re_charset_t *mbcset;
3077   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3078   int equiv_class_alloc = 0, char_class_alloc = 0;
3079 #endif /* not RE_ENABLE_I18N */
3080   int non_match = 0;
3081   bin_tree_t *work_tree;
3082   int token_len;
3083   int first_round = 1;
3084 #ifdef _LIBC
3085   collseqmb = (const unsigned char *)
3086     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3087   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3088   if (nrules)
3089     {
3090       /*
3091       if (MB_CUR_MAX > 1)
3092       */
3093         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3094       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3095       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3096                                                   _NL_COLLATE_SYMB_TABLEMB);
3097       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3098                                                    _NL_COLLATE_SYMB_EXTRAMB);
3099     }
3100 #endif
3101   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3102 #ifdef RE_ENABLE_I18N
3103   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3104 #endif /* RE_ENABLE_I18N */
3105 #ifdef RE_ENABLE_I18N
3106   if (BE (sbcset == NULL || mbcset == NULL, 0))
3107 #else
3108   if (BE (sbcset == NULL, 0))
3109 #endif /* RE_ENABLE_I18N */
3110     {
3111       *err = REG_ESPACE;
3112       return NULL;
3113     }
3114
3115   token_len = peek_token_bracket (token, regexp, syntax);
3116   if (BE (token->type == END_OF_RE, 0))
3117     {
3118       *err = REG_BADPAT;
3119       goto parse_bracket_exp_free_return;
3120     }
3121   if (token->type == OP_NON_MATCH_LIST)
3122     {
3123 #ifdef RE_ENABLE_I18N
3124       mbcset->non_match = 1;
3125 #endif /* not RE_ENABLE_I18N */
3126       non_match = 1;
3127       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3128         bitset_set (sbcset, '\0');
3129       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3130       token_len = peek_token_bracket (token, regexp, syntax);
3131       if (BE (token->type == END_OF_RE, 0))
3132         {
3133           *err = REG_BADPAT;
3134           goto parse_bracket_exp_free_return;
3135         }
3136     }
3137
3138   /* We treat the first ']' as a normal character.  */
3139   if (token->type == OP_CLOSE_BRACKET)
3140     token->type = CHARACTER;
3141
3142   while (1)
3143     {
3144       bracket_elem_t start_elem, end_elem;
3145       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3146       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3147       reg_errcode_t ret;
3148       int token_len2 = 0, is_range_exp = 0;
3149       re_token_t token2;
3150
3151       start_elem.opr.name = start_name_buf;
3152       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3153                                    syntax, first_round);
3154       if (BE (ret != REG_NOERROR, 0))
3155         {
3156           *err = ret;
3157           goto parse_bracket_exp_free_return;
3158         }
3159       first_round = 0;
3160
3161       /* Get information about the next token.  We need it in any case.  */
3162       token_len = peek_token_bracket (token, regexp, syntax);
3163
3164       /* Do not check for ranges if we know they are not allowed.  */
3165       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3166         {
3167           if (BE (token->type == END_OF_RE, 0))
3168             {
3169               *err = REG_EBRACK;
3170               goto parse_bracket_exp_free_return;
3171             }
3172           if (token->type == OP_CHARSET_RANGE)
3173             {
3174               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3175               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3176               if (BE (token2.type == END_OF_RE, 0))
3177                 {
3178                   *err = REG_EBRACK;
3179                   goto parse_bracket_exp_free_return;
3180                 }
3181               if (token2.type == OP_CLOSE_BRACKET)
3182                 {
3183                   /* We treat the last '-' as a normal character.  */
3184                   re_string_skip_bytes (regexp, -token_len);
3185                   token->type = CHARACTER;
3186                 }
3187               else
3188                 is_range_exp = 1;
3189             }
3190         }
3191
3192       if (is_range_exp == 1)
3193         {
3194           end_elem.opr.name = end_name_buf;
3195           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3196                                        dfa, syntax, 1);
3197           if (BE (ret != REG_NOERROR, 0))
3198             {
3199               *err = ret;
3200               goto parse_bracket_exp_free_return;
3201             }
3202
3203           token_len = peek_token_bracket (token, regexp, syntax);
3204
3205 #ifdef _LIBC
3206           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3207                                   &start_elem, &end_elem);
3208 #else
3209 # ifdef RE_ENABLE_I18N
3210           *err = build_range_exp (sbcset,
3211                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3212                                   &range_alloc, &start_elem, &end_elem);
3213 # else
3214           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3215 # endif
3216 #endif /* RE_ENABLE_I18N */
3217           if (BE (*err != REG_NOERROR, 0))
3218             goto parse_bracket_exp_free_return;
3219         }
3220       else
3221         {
3222           switch (start_elem.type)
3223             {
3224             case SB_CHAR:
3225               bitset_set (sbcset, start_elem.opr.ch);
3226               break;
3227 #ifdef RE_ENABLE_I18N
3228             case MB_CHAR:
3229               /* Check whether the array has enough space.  */
3230               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3231                 {
3232                   wchar_t *new_mbchars;
3233                   /* Not enough, realloc it.  */
3234                   /* +1 in case of mbcset->nmbchars is 0.  */
3235                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3236                   /* Use realloc since array is NULL if *alloc == 0.  */
3237                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3238                                             mbchar_alloc);
3239                   if (BE (new_mbchars == NULL, 0))
3240                     goto parse_bracket_exp_espace;
3241                   mbcset->mbchars = new_mbchars;
3242                 }
3243               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3244               break;
3245 #endif /* RE_ENABLE_I18N */
3246             case EQUIV_CLASS:
3247               *err = build_equiv_class (sbcset,
3248 #ifdef RE_ENABLE_I18N
3249                                         mbcset, &equiv_class_alloc,
3250 #endif /* RE_ENABLE_I18N */
3251                                         start_elem.opr.name);
3252               if (BE (*err != REG_NOERROR, 0))
3253                 goto parse_bracket_exp_free_return;
3254               break;
3255             case COLL_SYM:
3256               *err = build_collating_symbol (sbcset,
3257 #ifdef RE_ENABLE_I18N
3258                                              mbcset, &coll_sym_alloc,
3259 #endif /* RE_ENABLE_I18N */
3260                                              start_elem.opr.name);
3261               if (BE (*err != REG_NOERROR, 0))
3262                 goto parse_bracket_exp_free_return;
3263               break;
3264             case CHAR_CLASS:
3265               *err = build_charclass (regexp->trans, sbcset,
3266 #ifdef RE_ENABLE_I18N
3267                                       mbcset, &char_class_alloc,
3268 #endif /* RE_ENABLE_I18N */
3269                                       start_elem.opr.name, syntax);
3270               if (BE (*err != REG_NOERROR, 0))
3271                goto parse_bracket_exp_free_return;
3272               break;
3273             default:
3274               assert (0);
3275               break;
3276             }
3277         }
3278       if (BE (token->type == END_OF_RE, 0))
3279         {
3280           *err = REG_EBRACK;
3281           goto parse_bracket_exp_free_return;
3282         }
3283       if (token->type == OP_CLOSE_BRACKET)
3284         break;
3285     }
3286
3287   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3288
3289   /* If it is non-matching list.  */
3290   if (non_match)
3291     bitset_not (sbcset);
3292
3293 #ifdef RE_ENABLE_I18N
3294   /* Ensure only single byte characters are set.  */
3295   if (dfa->mb_cur_max > 1)
3296     bitset_mask (sbcset, dfa->sb_char);
3297 #endif /* RE_ENABLE_I18N */
3298
3299   /* Build a tree for simple bracket.  */
3300   br_token.type = SIMPLE_BRACKET;
3301   br_token.opr.sbcset = sbcset;
3302   work_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3303   if (BE (work_tree == NULL, 0))
3304     goto parse_bracket_exp_espace;
3305
3306 #ifdef RE_ENABLE_I18N
3307   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3308       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3309                                                      || mbcset->non_match)))
3310     {
3311       re_token_t alt_token;
3312       bin_tree_t *mbc_tree;
3313       int sbc_idx;
3314       /* Build a tree for complex bracket.  */
3315       dfa->has_mb_node = 1;
3316       for (sbc_idx = 0; sbc_idx < BITSET_UINTS; ++sbc_idx)
3317         if (sbcset[sbc_idx])
3318           break;
3319       /* If there are no bits set in sbcset, there is no point
3320          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3321       if (sbc_idx == BITSET_UINTS)
3322         {
3323           re_free (sbcset);
3324           dfa->nodes[work_tree->node_idx].type = COMPLEX_BRACKET;
3325           dfa->nodes[work_tree->node_idx].opr.mbcset = mbcset;
3326           return work_tree;
3327         }
3328       br_token.type = COMPLEX_BRACKET;
3329       br_token.opr.mbcset = mbcset;
3330       mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3331       if (BE (mbc_tree == NULL, 0))
3332         goto parse_bracket_exp_espace;
3333       /* Then join them by ALT node.  */
3334       alt_token.type = OP_ALT;
3335       dfa->has_plural_match = 1;
3336       work_tree = re_dfa_add_tree_node (dfa, work_tree, mbc_tree, &alt_token);
3337       if (BE (mbc_tree != NULL, 1))
3338         return work_tree;
3339     }
3340   else
3341     {
3342       free_charset (mbcset);
3343       return work_tree;
3344     }
3345 #else /* not RE_ENABLE_I18N */
3346   return work_tree;
3347 #endif /* not RE_ENABLE_I18N */
3348
3349  parse_bracket_exp_espace:
3350   *err = REG_ESPACE;
3351  parse_bracket_exp_free_return:
3352   re_free (sbcset);
3353 #ifdef RE_ENABLE_I18N
3354   free_charset (mbcset);
3355 #endif /* RE_ENABLE_I18N */
3356   return NULL;
3357 }
3358
3359 /* Parse an element in the bracket expression.  */
3360
3361 static reg_errcode_t
3362 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax,
3363                        accept_hyphen)
3364      bracket_elem_t *elem;
3365      re_string_t *regexp;
3366      re_token_t *token;
3367      int token_len;
3368      re_dfa_t *dfa;
3369      reg_syntax_t syntax;
3370      int accept_hyphen;
3371 {
3372 #ifdef RE_ENABLE_I18N
3373   int cur_char_size;
3374   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3375   if (cur_char_size > 1)
3376     {
3377       elem->type = MB_CHAR;
3378       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3379       re_string_skip_bytes (regexp, cur_char_size);
3380       return REG_NOERROR;
3381     }
3382 #endif /* RE_ENABLE_I18N */
3383   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3384   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3385       || token->type == OP_OPEN_EQUIV_CLASS)
3386     return parse_bracket_symbol (elem, regexp, token);
3387   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3388     {
3389       /* A '-' must only appear as anything but a range indicator before
3390          the closing bracket.  Everything else is an error.  */
3391       re_token_t token2;
3392       (void) peek_token_bracket (&token2, regexp, syntax);
3393       if (token2.type != OP_CLOSE_BRACKET)
3394         /* The actual error value is not standardized since this whole
3395            case is undefined.  But ERANGE makes good sense.  */
3396         return REG_ERANGE;
3397     }
3398   elem->type = SB_CHAR;
3399   elem->opr.ch = token->opr.c;
3400   return REG_NOERROR;
3401 }
3402
3403 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3404    such as [:<character_class>:], [.<collating_element>.], and
3405    [=<equivalent_class>=].  */
3406
3407 static reg_errcode_t
3408 parse_bracket_symbol (elem, regexp, token)
3409      bracket_elem_t *elem;
3410      re_string_t *regexp;
3411      re_token_t *token;
3412 {
3413   unsigned char ch, delim = token->opr.c;
3414   int i = 0;
3415   if (re_string_eoi(regexp))
3416     return REG_EBRACK;
3417   for (;; ++i)
3418     {
3419       if (i >= BRACKET_NAME_BUF_SIZE)
3420         return REG_EBRACK;
3421       if (token->type == OP_OPEN_CHAR_CLASS)
3422         ch = re_string_fetch_byte_case (regexp);
3423       else
3424         ch = re_string_fetch_byte (regexp);
3425       if (re_string_eoi(regexp))
3426         return REG_EBRACK;
3427       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3428         break;
3429       elem->opr.name[i] = ch;
3430     }
3431   re_string_skip_bytes (regexp, 1);
3432   elem->opr.name[i] = '\0';
3433   switch (token->type)
3434     {
3435     case OP_OPEN_COLL_ELEM:
3436       elem->type = COLL_SYM;
3437       break;
3438     case OP_OPEN_EQUIV_CLASS:
3439       elem->type = EQUIV_CLASS;
3440       break;
3441     case OP_OPEN_CHAR_CLASS:
3442       elem->type = CHAR_CLASS;
3443       break;
3444     default:
3445       break;
3446     }
3447   return REG_NOERROR;
3448 }
3449
3450   /* Helper function for parse_bracket_exp.
3451      Build the equivalence class which is represented by NAME.
3452      The result are written to MBCSET and SBCSET.
3453      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3454      is a pointer argument sinse we may update it.  */
3455
3456 static reg_errcode_t
3457 #ifdef RE_ENABLE_I18N
3458 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3459      re_charset_t *mbcset;
3460      int *equiv_class_alloc;
3461 #else /* not RE_ENABLE_I18N */
3462 build_equiv_class (sbcset, name)
3463 #endif /* not RE_ENABLE_I18N */
3464      re_bitset_ptr_t sbcset;
3465      const unsigned char *name;
3466 {
3467 #if defined _LIBC
3468   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3469   if (nrules != 0)
3470     {
3471       const int32_t *table, *indirect;
3472       const unsigned char *weights, *extra, *cp;
3473       unsigned char char_buf[2];
3474       int32_t idx1, idx2;
3475       unsigned int ch;
3476       size_t len;
3477       /* This #include defines a local function!  */
3478 # include <locale/weight.h>
3479       /* Calculate the index for equivalence class.  */
3480       cp = name;
3481       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3482       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3483                                                _NL_COLLATE_WEIGHTMB);
3484       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3485                                                    _NL_COLLATE_EXTRAMB);
3486       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3487                                                 _NL_COLLATE_INDIRECTMB);
3488       idx1 = findidx (&cp);
3489       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3490         /* This isn't a valid character.  */
3491         return REG_ECOLLATE;
3492
3493       /* Build single byte matcing table for this equivalence class.  */
3494       char_buf[1] = (unsigned char) '\0';
3495       len = weights[idx1];
3496       for (ch = 0; ch < SBC_MAX; ++ch)
3497         {
3498           char_buf[0] = ch;
3499           cp = char_buf;
3500           idx2 = findidx (&cp);
3501 /*
3502           idx2 = table[ch];
3503 */
3504           if (idx2 == 0)
3505             /* This isn't a valid character.  */
3506             continue;
3507           if (len == weights[idx2])
3508             {
3509               int cnt = 0;
3510               while (cnt <= len &&
3511                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3512                 ++cnt;
3513
3514               if (cnt > len)
3515                 bitset_set (sbcset, ch);
3516             }
3517         }
3518       /* Check whether the array has enough space.  */
3519       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3520         {
3521           /* Not enough, realloc it.  */
3522           /* +1 in case of mbcset->nequiv_classes is 0.  */
3523           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3524           /* Use realloc since the array is NULL if *alloc == 0.  */
3525           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3526                                                    int32_t,
3527                                                    new_equiv_class_alloc);
3528           if (BE (new_equiv_classes == NULL, 0))
3529             return REG_ESPACE;
3530           mbcset->equiv_classes = new_equiv_classes;
3531           *equiv_class_alloc = new_equiv_class_alloc;
3532         }
3533       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3534     }
3535   else
3536 #endif /* _LIBC */
3537     {
3538       if (BE (strlen ((const char *) name) != 1, 0))
3539         return REG_ECOLLATE;
3540       bitset_set (sbcset, *name);
3541     }
3542   return REG_NOERROR;
3543 }
3544
3545   /* Helper function for parse_bracket_exp.
3546      Build the character class which is represented by NAME.
3547      The result are written to MBCSET and SBCSET.
3548      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3549      is a pointer argument sinse we may update it.  */
3550
3551 static reg_errcode_t
3552 #ifdef RE_ENABLE_I18N
3553 build_charclass (trans, sbcset, mbcset, char_class_alloc, class_name, syntax)
3554      re_charset_t *mbcset;
3555      int *char_class_alloc;
3556 #else /* not RE_ENABLE_I18N */
3557 build_charclass (trans, sbcset, class_name, syntax)
3558 #endif /* not RE_ENABLE_I18N */
3559      unsigned RE_TRANSLATE_TYPE trans;
3560      re_bitset_ptr_t sbcset;
3561      const unsigned char *class_name;
3562      reg_syntax_t syntax;
3563 {
3564   int i;
3565   const char *name = (const char *) class_name;
3566
3567   /* In case of REG_ICASE "upper" and "lower" match the both of
3568      upper and lower cases.  */
3569   if ((syntax & RE_ICASE)
3570       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3571     name = "alpha";
3572
3573 #ifdef RE_ENABLE_I18N
3574   /* Check the space of the arrays.  */
3575   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3576     {
3577       /* Not enough, realloc it.  */
3578       /* +1 in case of mbcset->nchar_classes is 0.  */
3579       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3580       /* Use realloc since array is NULL if *alloc == 0.  */
3581       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3582                                                new_char_class_alloc);
3583       if (BE (new_char_classes == NULL, 0))
3584         return REG_ESPACE;
3585       mbcset->char_classes = new_char_classes;
3586       *char_class_alloc = new_char_class_alloc;
3587     }
3588   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3589 #endif /* RE_ENABLE_I18N */
3590
3591 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3592     for (i = 0; i < SBC_MAX; ++i)               \
3593       {                                         \
3594         if (ctype_func (i))                     \
3595           {                                     \
3596             int ch = trans ? trans[i] : i;      \
3597             bitset_set (sbcset, ch);            \
3598           }                                     \
3599       }
3600
3601   if (strcmp (name, "alnum") == 0)
3602     BUILD_CHARCLASS_LOOP (isalnum)
3603   else if (strcmp (name, "cntrl") == 0)
3604     BUILD_CHARCLASS_LOOP (iscntrl)
3605   else if (strcmp (name, "lower") == 0)
3606     BUILD_CHARCLASS_LOOP (islower)
3607   else if (strcmp (name, "space") == 0)
3608     BUILD_CHARCLASS_LOOP (isspace)
3609   else if (strcmp (name, "alpha") == 0)
3610     BUILD_CHARCLASS_LOOP (isalpha)
3611   else if (strcmp (name, "digit") == 0)
3612     BUILD_CHARCLASS_LOOP (isdigit)
3613   else if (strcmp (name, "print") == 0)
3614     BUILD_CHARCLASS_LOOP (isprint)
3615   else if (strcmp (name, "upper") == 0)
3616     BUILD_CHARCLASS_LOOP (isupper)
3617   else if (strcmp (name, "blank") == 0)
3618     BUILD_CHARCLASS_LOOP (isblank)
3619   else if (strcmp (name, "graph") == 0)
3620     BUILD_CHARCLASS_LOOP (isgraph)
3621   else if (strcmp (name, "punct") == 0)
3622     BUILD_CHARCLASS_LOOP (ispunct)
3623   else if (strcmp (name, "xdigit") == 0)
3624     BUILD_CHARCLASS_LOOP (isxdigit)
3625   else
3626     return REG_ECTYPE;
3627
3628   return REG_NOERROR;
3629 }
3630
3631 static bin_tree_t *
3632 build_charclass_op (dfa, trans, class_name, extra, non_match, err)
3633      re_dfa_t *dfa;
3634      unsigned RE_TRANSLATE_TYPE trans;
3635      const unsigned char *class_name;
3636      const unsigned char *extra;
3637      int non_match;
3638      reg_errcode_t *err;
3639 {
3640   re_bitset_ptr_t sbcset;
3641 #ifdef RE_ENABLE_I18N
3642   re_charset_t *mbcset;
3643   int alloc = 0;
3644 #endif /* not RE_ENABLE_I18N */
3645   reg_errcode_t ret;
3646   re_token_t br_token;
3647   bin_tree_t *tree;
3648
3649   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3650 #ifdef RE_ENABLE_I18N
3651   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3652 #endif /* RE_ENABLE_I18N */
3653
3654 #ifdef RE_ENABLE_I18N
3655   if (BE (sbcset == NULL || mbcset == NULL, 0))
3656 #else /* not RE_ENABLE_I18N */
3657   if (BE (sbcset == NULL, 0))
3658 #endif /* not RE_ENABLE_I18N */
3659     {
3660       *err = REG_ESPACE;
3661       return NULL;
3662     }
3663
3664   if (non_match)
3665     {
3666 #ifdef RE_ENABLE_I18N
3667       /*
3668       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3669         bitset_set(cset->sbcset, '\0');
3670       */
3671       mbcset->non_match = 1;
3672 #endif /* not RE_ENABLE_I18N */
3673     }
3674
3675   /* We don't care the syntax in this case.  */
3676   ret = build_charclass (trans, sbcset,
3677 #ifdef RE_ENABLE_I18N
3678                          mbcset, &alloc,
3679 #endif /* RE_ENABLE_I18N */
3680                          class_name, 0);
3681
3682   if (BE (ret != REG_NOERROR, 0))
3683     {
3684       re_free (sbcset);
3685 #ifdef RE_ENABLE_I18N
3686       free_charset (mbcset);
3687 #endif /* RE_ENABLE_I18N */
3688       *err = ret;
3689       return NULL;
3690     }
3691   /* \w match '_' also.  */
3692   for (; *extra; extra++)
3693     bitset_set (sbcset, *extra);
3694
3695   /* If it is non-matching list.  */
3696   if (non_match)
3697     bitset_not (sbcset);
3698
3699 #ifdef RE_ENABLE_I18N
3700   /* Ensure only single byte characters are set.  */
3701   if (dfa->mb_cur_max > 1)
3702     bitset_mask (sbcset, dfa->sb_char);
3703 #endif
3704
3705   /* Build a tree for simple bracket.  */
3706   br_token.type = SIMPLE_BRACKET;
3707   br_token.opr.sbcset = sbcset;
3708   tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3709   if (BE (tree == NULL, 0))
3710     goto build_word_op_espace;
3711
3712 #ifdef RE_ENABLE_I18N
3713   if (dfa->mb_cur_max > 1)
3714     {
3715       re_token_t alt_token;
3716       bin_tree_t *mbc_tree;
3717       /* Build a tree for complex bracket.  */
3718       br_token.type = COMPLEX_BRACKET;
3719       br_token.opr.mbcset = mbcset;
3720       dfa->has_mb_node = 1;
3721       mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
3722       if (BE (mbc_tree == NULL, 0))
3723         goto build_word_op_espace;
3724       /* Then join them by ALT node.  */
3725       alt_token.type = OP_ALT;
3726       dfa->has_plural_match = 1;
3727       tree = re_dfa_add_tree_node (dfa, tree, mbc_tree, &alt_token);
3728       if (BE (mbc_tree != NULL, 1))
3729         return tree;
3730     }
3731   else
3732     {
3733       free_charset (mbcset);
3734       return tree;
3735     }
3736 #else /* not RE_ENABLE_I18N */
3737   return tree;
3738 #endif /* not RE_ENABLE_I18N */
3739
3740  build_word_op_espace:
3741   re_free (sbcset);
3742 #ifdef RE_ENABLE_I18N
3743   free_charset (mbcset);
3744 #endif /* RE_ENABLE_I18N */
3745   *err = REG_ESPACE;
3746   return NULL;
3747 }
3748
3749 /* This is intended for the expressions like "a{1,3}".
3750    Fetch a number from `input', and return the number.
3751    Return -1, if the number field is empty like "{,1}".
3752    Return -2, If an error is occured.  */
3753
3754 static int
3755 fetch_number (input, token, syntax)
3756      re_string_t *input;
3757      re_token_t *token;
3758      reg_syntax_t syntax;
3759 {
3760   int num = -1;
3761   unsigned char c;
3762   while (1)
3763     {
3764       fetch_token (token, input, syntax);
3765       c = token->opr.c;
3766       if (BE (token->type == END_OF_RE, 0))
3767         return -2;
3768       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3769         break;
3770       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3771              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3772       num = (num > RE_DUP_MAX) ? -2 : num;
3773     }
3774   return num;
3775 }
3776 \f
3777 #ifdef RE_ENABLE_I18N
3778 static void
3779 free_charset (re_charset_t *cset)
3780 {
3781   re_free (cset->mbchars);
3782 # ifdef _LIBC
3783   re_free (cset->coll_syms);
3784   re_free (cset->equiv_classes);
3785   re_free (cset->range_starts);
3786   re_free (cset->range_ends);
3787 # endif
3788   re_free (cset->char_classes);
3789   re_free (cset);
3790 }
3791 #endif /* RE_ENABLE_I18N */
3792 \f
3793 /* Functions for binary tree operation.  */
3794
3795 /* Create a tree node.  */
3796
3797 static bin_tree_t *
3798 create_tree (dfa, left, right, type, index)
3799      re_dfa_t *dfa;
3800      bin_tree_t *left;
3801      bin_tree_t *right;
3802      re_token_type_t type;
3803      int index;
3804 {
3805   bin_tree_t *tree;
3806   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3807     {
3808       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3809
3810       if (storage == NULL)
3811         return NULL;
3812       storage->next = dfa->str_tree_storage;
3813       dfa->str_tree_storage = storage;
3814       dfa->str_tree_storage_idx = 0;
3815     }
3816   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3817
3818   tree->parent = NULL;
3819   tree->left = left;
3820   tree->right = right;
3821   tree->type = type;
3822   tree->node_idx = index;
3823   tree->first = -1;
3824   tree->next = -1;
3825   re_node_set_init_empty (&tree->eclosure);
3826
3827   if (left != NULL)
3828     left->parent = tree;
3829   if (right != NULL)
3830     right->parent = tree;
3831   return tree;
3832 }
3833
3834 /* Create both a DFA node and a tree for it.  */
3835
3836 static bin_tree_t *
3837 re_dfa_add_tree_node (dfa, left, right, token)
3838      re_dfa_t *dfa;
3839      bin_tree_t *left;
3840      bin_tree_t *right;
3841      const re_token_t *token;
3842 {
3843   int new_idx = re_dfa_add_node (dfa, *token, 0);
3844
3845   if (new_idx == -1)
3846     return NULL;
3847
3848   return create_tree (dfa, left, right, 0, new_idx);
3849 }
3850
3851 /* Mark the tree SRC as an optional subexpression.  */
3852
3853 static void
3854 mark_opt_subexp (src, dfa)
3855      const bin_tree_t *src;
3856      re_dfa_t *dfa;
3857 {
3858   /* Pass an OPT_SUBEXP_IDX which is != 1 if the duplicated tree is
3859      a subexpression.  */
3860   if (src->type == CONCAT
3861       && src->left->type == NON_TYPE
3862       && dfa->nodes[src->left->node_idx].type == OP_OPEN_SUBEXP)
3863     mark_opt_subexp_iter (src, dfa, dfa->nodes[src->left->node_idx].opr.idx);
3864 }
3865
3866
3867 /* Recursive tree walker for mark_opt_subexp.  */
3868
3869 static void
3870 mark_opt_subexp_iter (src, dfa, idx)
3871      const bin_tree_t *src;
3872      re_dfa_t *dfa;
3873      int idx;
3874 {
3875   int node_idx;
3876
3877   if (src->type == NON_TYPE)
3878     {
3879       node_idx = src->node_idx;
3880       if ((dfa->nodes[node_idx].type == OP_OPEN_SUBEXP
3881            || dfa->nodes[node_idx].type == OP_CLOSE_SUBEXP)
3882           && dfa->nodes[node_idx].opr.idx == idx)
3883         dfa->nodes[node_idx].opt_subexp = 1;
3884      }
3885
3886   if (src->left != NULL)
3887     mark_opt_subexp_iter (src->left, dfa, idx);
3888
3889   if (src->right != NULL)
3890     mark_opt_subexp_iter (src->right, dfa, idx);
3891 }
3892
3893
3894 /* Duplicate the node SRC, and return new node.  */
3895
3896 static bin_tree_t *
3897 duplicate_tree (src, dfa)
3898      const bin_tree_t *src;
3899      re_dfa_t *dfa;
3900 {
3901   bin_tree_t *left = NULL, *right = NULL, *new_tree;
3902   int new_node_idx;
3903   /* Since node indies must be according to Post-order of the tree,
3904      we must duplicate the left at first.  */
3905   if (src->left != NULL)
3906     {
3907       left = duplicate_tree (src->left, dfa);
3908       if (left == NULL)
3909         return NULL;
3910     }
3911
3912   /* Secondaly, duplicate the right.  */
3913   if (src->right != NULL)
3914     {
3915       right = duplicate_tree (src->right, dfa);
3916       if (right == NULL)
3917         return NULL;
3918     }
3919
3920   /* At last, duplicate itself.  */
3921   if (src->type == NON_TYPE)
3922     {
3923       new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3924       dfa->nodes[new_node_idx].duplicated = 1;
3925       if (BE (new_node_idx == -1, 0))
3926         return NULL;
3927     }
3928   else
3929     new_node_idx = src->type;
3930
3931   new_tree = create_tree (dfa, left, right, src->type, new_node_idx);
3932   return new_tree;
3933 }