posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 #include <assert.h>
  22 #include <ctype.h>
  23 #include <limits.h>
  24 #include <locale.h>
  25 #include <stdio.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28
  29 #if defined HAVE_WCHAR_H || defined _LIBC
  30 # include <wchar.h>
  31 #endif /* HAVE_WCHAR_H || _LIBC */
  32 #if defined HAVE_WCTYPE_H || defined _LIBC
  33 # include <wctype.h>
  34 #endif /* HAVE_WCTYPE_H || _LIBC */
  35
  36 /* In case that the system doesn't have isblank().  */
  37 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
  38 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
  39 #endif
  40
  41 #ifdef _LIBC
  42 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
  43 #  define _RE_DEFINE_LOCALE_FUNCTIONS 1
  44 #   include <locale/localeinfo.h>
  45 #   include <locale/elem-hash.h>
  46 #   include <locale/coll-lookup.h>
  47 # endif
  48 #endif
  49
  50 /* This is for other GNU distributions with internationalized messages.  */
  51 #if HAVE_LIBINTL_H || defined _LIBC
  52 # include <libintl.h>
  53 # ifdef _LIBC
  54 #  undef gettext
  55 #  define gettext(msgid) \
  56   INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
  57 # endif
  58 #else
  59 # define gettext(msgid) (msgid)
  60 #endif
  61
  62 #ifndef gettext_noop
  63 /* This define is so xgettext can find the internationalizable
  64    strings.  */
  65 # define gettext_noop(String) String
  66 #endif
  67
  68 #include <regex.h>
  69 #include "regex_internal.h"
  70
  71 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  72                                           int length, reg_syntax_t syntax);
  73 static void re_compile_fastmap_iter (regex_t *bufp,
  74                                      const re_dfastate_t *init_state,
  75                                      char *fastmap);
  76 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  77 static reg_errcode_t init_word_char (re_dfa_t *dfa);
  78 #ifdef RE_ENABLE_I18N
  79 static void free_charset (re_charset_t *cset);
  80 #endif /* RE_ENABLE_I18N */
  81 static void free_workarea_compile (regex_t *preg);
  82 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  83 static reg_errcode_t analyze (re_dfa_t *dfa);
  84 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
  85 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
  86 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
  87 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
  88 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
  89                                              int top_clone_node, int root_node,
  90                                              unsigned int constraint);
  91 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
  92                                      unsigned int constraint);
  93 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  94 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  95                                          int node, int root);
  96 static void calc_inveclosure (re_dfa_t *dfa);
  97 static int fetch_number (re_string_t *input, re_token_t *token,
  98                          reg_syntax_t syntax);
  99 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
 100 static int peek_token (re_token_t *token, re_string_t *input,
 101                         reg_syntax_t syntax);
 102 static int peek_token_bracket (re_token_t *token, re_string_t *input,
 103                                reg_syntax_t syntax);
 104 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
 105                           reg_syntax_t syntax, reg_errcode_t *err);
 106 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
 107                                   re_token_t *token, reg_syntax_t syntax,
 108                                   int nest, reg_errcode_t *err);
 109 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
 110                                  re_token_t *token, reg_syntax_t syntax,
 111                                  int nest, reg_errcode_t *err);
 112 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
 113                                      re_token_t *token, reg_syntax_t syntax,
 114                                      int nest, reg_errcode_t *err);
 115 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
 116                                   re_token_t *token, reg_syntax_t syntax,
 117                                   int nest, reg_errcode_t *err);
 118 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
 119                                  re_dfa_t *dfa, re_token_t *token,
 120                                  reg_syntax_t syntax, reg_errcode_t *err);
 121 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
 122                                       re_token_t *token, reg_syntax_t syntax,
 123                                       reg_errcode_t *err);
 124 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
 125                                             re_string_t *regexp,
 126                                             re_token_t *token, int token_len,
 127                                             re_dfa_t *dfa,
 128                                             reg_syntax_t syntax);
 129 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
 130                                           re_string_t *regexp,
 131                                           re_token_t *token);
 132 #ifndef _LIBC
 133 # ifdef RE_ENABLE_I18N
 134 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 135                                       re_charset_t *mbcset, int *range_alloc,
 136                                       bracket_elem_t *start_elem,
 137                                       bracket_elem_t *end_elem);
 138 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 139                                              re_charset_t *mbcset,
 140                                              int *coll_sym_alloc,
 141                                              const unsigned char *name);
 142 # else /* not RE_ENABLE_I18N */
 143 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 144                                       bracket_elem_t *start_elem,
 145                                       bracket_elem_t *end_elem);
 146 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 147                                              const unsigned char *name);
 148 # endif /* not RE_ENABLE_I18N */
 149 #endif /* not _LIBC */
 150 #ifdef RE_ENABLE_I18N
 151 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 152                                         re_charset_t *mbcset,
 153                                         int *equiv_class_alloc,
 154                                         const unsigned char *name);
 155 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
 156                                       re_charset_t *mbcset,
 157                                       int *char_class_alloc,
 158                                       const unsigned char *class_name,
 159                                       reg_syntax_t syntax);
 160 #else  /* not RE_ENABLE_I18N */
 161 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 162                                         const unsigned char *name);
 163 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
 164                                       const unsigned char *class_name,
 165                                       reg_syntax_t syntax);
 166 #endif /* not RE_ENABLE_I18N */
 167 static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
 168 static void free_bin_tree (bin_tree_t *tree);
 169 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
 170                                 re_token_type_t type, int index);
 171 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 172 \f
 173 /* This table gives an error message for each of the error codes listed
 174    in regex.h.  Obviously the order here has to be same as there.
 175    POSIX doesn't require that we do anything for REG_NOERROR,
 176    but why not be nice?  */
 177
 178 const char __re_error_msgid[] attribute_hidden =
 179   {
 180 #define REG_NOERROR_IDX 0
 181     gettext_noop ("Success")    /* REG_NOERROR */
 182     "\0"
 183 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 184     gettext_noop ("No match")   /* REG_NOMATCH */
 185     "\0"
 186 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 187     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 188     "\0"
 189 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 190     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 191     "\0"
 192 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 193     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 194     "\0"
 195 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 196     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 197     "\0"
 198 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 199     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 200     "\0"
 201 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 202     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 203     "\0"
 204 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 205     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 206     "\0"
 207 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 208     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 209     "\0"
 210 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 211     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 212     "\0"
 213 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 214     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 215     "\0"
 216 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 217     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 218     "\0"
 219 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 220     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 221     "\0"
 222 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 223     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 224     "\0"
 225 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 226     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 227     "\0"
 228 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 229     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 230   };
 231
 232 const size_t __re_error_msgid_idx[] attribute_hidden =
 233   {
 234     REG_NOERROR_IDX,
 235     REG_NOMATCH_IDX,
 236     REG_BADPAT_IDX,
 237     REG_ECOLLATE_IDX,
 238     REG_ECTYPE_IDX,
 239     REG_EESCAPE_IDX,
 240     REG_ESUBREG_IDX,
 241     REG_EBRACK_IDX,
 242     REG_EPAREN_IDX,
 243     REG_EBRACE_IDX,
 244     REG_BADBR_IDX,
 245     REG_ERANGE_IDX,
 246     REG_ESPACE_IDX,
 247     REG_BADRPT_IDX,
 248     REG_EEND_IDX,
 249     REG_ESIZE_IDX,
 250     REG_ERPAREN_IDX
 251   };
 252 \f
 253 /* Entry points for GNU code.  */
 254
 255 /* re_compile_pattern is the GNU regular expression compiler: it
 256    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 257    Returns 0 if the pattern was valid, otherwise an error string.
 258
 259    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 260    are set in BUFP on entry.  */
 261
 262 const char *
 263 re_compile_pattern (pattern, length, bufp)
 264     const char *pattern;
 265     size_t length;
 266     struct re_pattern_buffer *bufp;
 267 {
 268   reg_errcode_t ret;
 269
 270   /* And GNU code determines whether or not to get register information
 271      by passing null for the REGS argument to re_match, etc., not by
 272      setting no_sub.  */
 273   bufp->no_sub = 0;
 274
 275   /* Match anchors at newline.  */
 276   bufp->newline_anchor = 1;
 277
 278   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 279
 280   if (!ret)
 281     return NULL;
 282   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 283 }
 284 #ifdef _LIBC
 285 weak_alias (__re_compile_pattern, re_compile_pattern)
 286 #endif
 287
 288 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 289    also be assigned to arbitrarily: each pattern buffer stores its own
 290    syntax, so it can be changed between regex compilations.  */
 291 /* This has no initializer because initialized variables in Emacs
 292    become read-only after dumping.  */
 293 reg_syntax_t re_syntax_options;
 294
 295
 296 /* Specify the precise syntax of regexps for compilation.  This provides
 297    for compatibility for various utilities which historically have
 298    different, incompatible syntaxes.
 299
 300    The argument SYNTAX is a bit mask comprised of the various bits
 301    defined in regex.h.  We return the old syntax.  */
 302
 303 reg_syntax_t
 304 re_set_syntax (syntax)
 305     reg_syntax_t syntax;
 306 {
 307   reg_syntax_t ret = re_syntax_options;
 308
 309   re_syntax_options = syntax;
 310   return ret;
 311 }
 312 #ifdef _LIBC
 313 weak_alias (__re_set_syntax, re_set_syntax)
 314 #endif
 315
 316 int
 317 re_compile_fastmap (bufp)
 318     struct re_pattern_buffer *bufp;
 319 {
 320   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 321   char *fastmap = bufp->fastmap;
 322
 323   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 324   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 325   if (dfa->init_state != dfa->init_state_word)
 326     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 327   if (dfa->init_state != dfa->init_state_nl)
 328     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 329   if (dfa->init_state != dfa->init_state_begbuf)
 330     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 331   bufp->fastmap_accurate = 1;
 332   return 0;
 333 }
 334 #ifdef _LIBC
 335 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 336 #endif
 337
 338 static inline void
 339 re_set_fastmap (char *fastmap, int icase, int ch)
 340 {
 341   fastmap[ch] = 1;
 342   if (icase)
 343     fastmap[tolower (ch)] = 1;
 344 }
 345
 346 /* Helper function for re_compile_fastmap.
 347    Compile fastmap for the initial_state INIT_STATE.  */
 348
 349 static void
 350 re_compile_fastmap_iter (bufp, init_state, fastmap)
 351      regex_t *bufp;
 352      const re_dfastate_t *init_state;
 353      char *fastmap;
 354 {
 355   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 356   int node_cnt;
 357   int icase = (MB_CUR_MAX == 1 && (bufp->syntax & RE_ICASE));
 358   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 359     {
 360       int node = init_state->nodes.elems[node_cnt];
 361       re_token_type_t type = dfa->nodes[node].type;
 362
 363       if (type == CHARACTER)
 364         re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 365       else if (type == SIMPLE_BRACKET)
 366         {
 367           int i, j, ch;
 368           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 369             for (j = 0; j < UINT_BITS; ++j, ++ch)
 370               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 371                 re_set_fastmap (fastmap, icase, ch);
 372         }
 373 #ifdef RE_ENABLE_I18N
 374       else if (type == COMPLEX_BRACKET)
 375         {
 376           int i;
 377           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 378           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 379               || cset->nranges || cset->nchar_classes)
 380             {
 381 # ifdef _LIBC
 382               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 383                 {
 384                   /* In this case we want to catch the bytes which are
 385                      the first byte of any collation elements.
 386                      e.g. In da_DK, we want to catch 'a' since "aa"
 387                           is a valid collation element, and don't catch
 388                           'b' since 'b' is the only collation element
 389                           which starts from 'b'.  */
 390                   int j, ch;
 391                   const int32_t *table = (const int32_t *)
 392                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 393                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 394                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 395                       if (table[ch] < 0)
 396                         re_set_fastmap (fastmap, icase, ch);
 397                 }
 398 # else
 399               if (MB_CUR_MAX > 1)
 400                 for (i = 0; i < SBC_MAX; ++i)
 401                   if (__btowc (i) == WEOF)
 402                     re_set_fastmap (fastmap, icase, i);
 403 # endif /* not _LIBC */
 404             }
 405           for (i = 0; i < cset->nmbchars; ++i)
 406             {
 407               char buf[256];
 408               mbstate_t state;
 409               memset (&state, '\0', sizeof (state));
 410               __wcrtomb (buf, cset->mbchars[i], &state);
 411               re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 412             }
 413         }
 414 #endif /* RE_ENABLE_I18N */
 415       else if (type == END_OF_RE || type == OP_PERIOD)
 416         {
 417           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 418           if (type == END_OF_RE)
 419             bufp->can_be_null = 1;
 420           return;
 421         }
 422     }
 423 }
 424 \f
 425 /* Entry point for POSIX code.  */
 426 /* regcomp takes a regular expression as a string and compiles it.
 427
 428    PREG is a regex_t *.  We do not expect any fields to be initialized,
 429    since POSIX says we shouldn't.  Thus, we set
 430
 431      `buffer' to the compiled pattern;
 432      `used' to the length of the compiled pattern;
 433      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 434        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 435        RE_SYNTAX_POSIX_BASIC;
 436      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 437      `fastmap' to an allocated space for the fastmap;
 438      `fastmap_accurate' to zero;
 439      `re_nsub' to the number of subexpressions in PATTERN.
 440
 441    PATTERN is the address of the pattern string.
 442
 443    CFLAGS is a series of bits which affect compilation.
 444
 445      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 446      use POSIX basic syntax.
 447
 448      If REG_NEWLINE is set, then . and [^...] don't match newline.
 449      Also, regexec will try a match beginning after every newline.
 450
 451      If REG_ICASE is set, then we considers upper- and lowercase
 452      versions of letters to be equivalent when matching.
 453
 454      If REG_NOSUB is set, then when PREG is passed to regexec, that
 455      routine will report only success or failure, and nothing about the
 456      registers.
 457
 458    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 459    the return codes and their meanings.)  */
 460
 461 int
 462 regcomp (preg, pattern, cflags)
 463     regex_t *__restrict preg;
 464     const char *__restrict pattern;
 465     int cflags;
 466 {
 467   reg_errcode_t ret;
 468   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 469                          : RE_SYNTAX_POSIX_BASIC);
 470
 471   preg->buffer = NULL;
 472   preg->allocated = 0;
 473   preg->used = 0;
 474
 475   /* Try to allocate space for the fastmap.  */
 476   preg->fastmap = re_malloc (char, SBC_MAX);
 477   if (BE (preg->fastmap == NULL, 0))
 478     return REG_ESPACE;
 479
 480   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 481
 482   /* If REG_NEWLINE is set, newlines are treated differently.  */
 483   if (cflags & REG_NEWLINE)
 484     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 485       syntax &= ~RE_DOT_NEWLINE;
 486       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 487       /* It also changes the matching behavior.  */
 488       preg->newline_anchor = 1;
 489     }
 490   else
 491     preg->newline_anchor = 0;
 492   preg->no_sub = !!(cflags & REG_NOSUB);
 493   preg->translate = NULL;
 494
 495   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 496
 497   /* POSIX doesn't distinguish between an unmatched open-group and an
 498      unmatched close-group: both are REG_EPAREN.  */
 499   if (ret == REG_ERPAREN)
 500     ret = REG_EPAREN;
 501
 502   /* We have already checked preg->fastmap != NULL.  */
 503   if (BE (ret == REG_NOERROR, 1))
 504     /* Compute the fastmap now, since regexec cannot modify the pattern
 505        buffer.  This function nevers fails in this implementation.  */
 506     (void) __re_compile_fastmap (preg);
 507   else
 508     {
 509       /* Some error occurred while compiling the expression.  */
 510       re_free (preg->fastmap);
 511       preg->fastmap = NULL;
 512     }
 513
 514   return (int) ret;
 515 }
 516 #ifdef _LIBC
 517 weak_alias (__regcomp, regcomp)
 518 #endif
 519
 520 /* Returns a message corresponding to an error code, ERRCODE, returned
 521    from either regcomp or regexec.   We don't use PREG here.  */
 522
 523 size_t
 524 regerror (errcode, preg, errbuf, errbuf_size)
 525     int errcode;
 526     const regex_t *preg;
 527     char *errbuf;
 528     size_t errbuf_size;
 529 {
 530   const char *msg;
 531   size_t msg_size;
 532
 533   if (BE (errcode < 0
 534           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 535                                / sizeof (__re_error_msgid_idx[0])), 0))
 536     /* Only error codes returned by the rest of the code should be passed
 537        to this routine.  If we are given anything else, or if other regex
 538        code generates an invalid error code, then the program has a bug.
 539        Dump core so we can fix it.  */
 540     abort ();
 541
 542   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 543
 544   msg_size = strlen (msg) + 1; /* Includes the null.  */
 545
 546   if (BE (errbuf_size != 0, 1))
 547     {
 548       if (BE (msg_size > errbuf_size, 0))
 549         {
 550 #if defined HAVE_MEMPCPY || defined _LIBC
 551           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 552 #else
 553           memcpy (errbuf, msg, errbuf_size - 1);
 554           errbuf[errbuf_size - 1] = 0;
 555 #endif
 556         }
 557       else
 558         memcpy (errbuf, msg, msg_size);
 559     }
 560
 561   return msg_size;
 562 }
 563 #ifdef _LIBC
 564 weak_alias (__regerror, regerror)
 565 #endif
 566
 567
 568 static void
 569 free_dfa_content (re_dfa_t *dfa)
 570 {
 571   int i, j;
 572
 573   re_free (dfa->subexps);
 574
 575   for (i = 0; i < dfa->nodes_len; ++i)
 576     {
 577       re_token_t *node = dfa->nodes + i;
 578 #ifdef RE_ENABLE_I18N
 579       if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
 580         free_charset (node->opr.mbcset);
 581       else
 582 #endif /* RE_ENABLE_I18N */
 583         if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
 584           re_free (node->opr.sbcset);
 585     }
 586   re_free (dfa->nexts);
 587   for (i = 0; i < dfa->nodes_len; ++i)
 588     {
 589       if (dfa->eclosures != NULL)
 590         re_node_set_free (dfa->eclosures + i);
 591       if (dfa->inveclosures != NULL)
 592         re_node_set_free (dfa->inveclosures + i);
 593       if (dfa->edests != NULL)
 594         re_node_set_free (dfa->edests + i);
 595     }
 596   re_free (dfa->edests);
 597   re_free (dfa->eclosures);
 598   re_free (dfa->inveclosures);
 599   re_free (dfa->nodes);
 600
 601   for (i = 0; i <= dfa->state_hash_mask; ++i)
 602     {
 603       struct re_state_table_entry *entry = dfa->state_table + i;
 604       for (j = 0; j < entry->num; ++j)
 605         {
 606           re_dfastate_t *state = entry->array[j];
 607           free_state (state);
 608         }
 609       re_free (entry->array);
 610     }
 611   re_free (dfa->state_table);
 612
 613   if (dfa->word_char != NULL)
 614     re_free (dfa->word_char);
 615 #ifdef DEBUG
 616   re_free (dfa->re_str);
 617 #endif
 618
 619   re_free (dfa);
 620 }
 621
 622
 623 /* Free dynamically allocated space used by PREG.  */
 624
 625 void
 626 regfree (preg)
 627     regex_t *preg;
 628 {
 629   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 630   if (BE (dfa != NULL, 1))
 631     free_dfa_content (dfa);
 632
 633   re_free (preg->fastmap);
 634 }
 635 #ifdef _LIBC
 636 weak_alias (__regfree, regfree)
 637 #endif
 638 \f
 639 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 640    them unless specifically requested.  */
 641
 642 #if defined _REGEX_RE_COMP || defined _LIBC
 643
 644 /* BSD has one and only one pattern buffer.  */
 645 static struct re_pattern_buffer re_comp_buf;
 646
 647 char *
 648 # ifdef _LIBC
 649 /* Make these definitions weak in libc, so POSIX programs can redefine
 650    these names if they don't use our functions, and still use
 651    regcomp/regexec above without link errors.  */
 652 weak_function
 653 # endif
 654 re_comp (s)
 655      const char *s;
 656 {
 657   reg_errcode_t ret;
 658   char *fastmap;
 659
 660   if (!s)
 661     {
 662       if (!re_comp_buf.buffer)
 663         return gettext ("No previous regular expression");
 664       return 0;
 665     }
 666
 667   if (re_comp_buf.buffer)
 668     {
 669       fastmap = re_comp_buf.fastmap;
 670       re_comp_buf.fastmap = NULL;
 671       __regfree (&re_comp_buf);
 672       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 673       re_comp_buf.fastmap = fastmap;
 674     }
 675
 676   if (re_comp_buf.fastmap == NULL)
 677     {
 678       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 679       if (re_comp_buf.fastmap == NULL)
 680         return (char *) gettext (__re_error_msgid
 681                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 682     }
 683
 684   /* Since `re_exec' always passes NULL for the `regs' argument, we
 685      don't need to initialize the pattern buffer fields which affect it.  */
 686
 687   /* Match anchors at newlines.  */
 688   re_comp_buf.newline_anchor = 1;
 689
 690   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 691
 692   if (!ret)
 693     return NULL;
 694
 695   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 696   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 697 }
 698
 699 #ifdef _LIBC
 700 libc_freeres_fn (free_mem)
 701 {
 702   __regfree (&re_comp_buf);
 703 }
 704 #endif
 705
 706 #endif /* _REGEX_RE_COMP */
 707 \f
 708 /* Internal entry point.
 709    Compile the regular expression PATTERN, whose length is LENGTH.
 710    SYNTAX indicate regular expression's syntax.  */
 711
 712 static reg_errcode_t
 713 re_compile_internal (preg, pattern, length, syntax)
 714      regex_t *preg;
 715      const char * pattern;
 716      int length;
 717      reg_syntax_t syntax;
 718 {
 719   reg_errcode_t err = REG_NOERROR;
 720   re_dfa_t *dfa;
 721   re_string_t regexp;
 722
 723   /* Initialize the pattern buffer.  */
 724   preg->fastmap_accurate = 0;
 725   preg->syntax = syntax;
 726   preg->not_bol = preg->not_eol = 0;
 727   preg->used = 0;
 728   preg->re_nsub = 0;
 729   preg->can_be_null = 0;
 730   preg->regs_allocated = REGS_UNALLOCATED;
 731
 732   /* Initialize the dfa.  */
 733   dfa = (re_dfa_t *) preg->buffer;
 734   if (preg->allocated < sizeof (re_dfa_t))
 735     {
 736       /* If zero allocated, but buffer is non-null, try to realloc
 737          enough space.  This loses if buffer's address is bogus, but
 738          that is the user's responsibility.  If ->buffer is NULL this
 739          is a simple allocation.  */
 740       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 741       if (dfa == NULL)
 742         return REG_ESPACE;
 743       preg->allocated = sizeof (re_dfa_t);
 744     }
 745   preg->buffer = (unsigned char *) dfa;
 746   preg->used = sizeof (re_dfa_t);
 747
 748   err = init_dfa (dfa, length);
 749   if (BE (err != REG_NOERROR, 0))
 750     {
 751       re_free (dfa);
 752       preg->buffer = NULL;
 753       return err;
 754     }
 755 #ifdef DEBUG
 756   dfa->re_str = re_malloc (char, length + 1);
 757   strncpy (dfa->re_str, pattern, length + 1);
 758 #endif
 759
 760   err = re_string_construct (&regexp, pattern, length, preg->translate,
 761                              syntax & RE_ICASE);
 762   if (BE (err != REG_NOERROR, 0))
 763     {
 764       re_free (dfa);
 765       preg->buffer = NULL;
 766       return err;
 767     }
 768
 769   /* Parse the regular expression, and build a structure tree.  */
 770   preg->re_nsub = 0;
 771   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 772   if (BE (dfa->str_tree == NULL, 0))
 773     goto re_compile_internal_free_return;
 774
 775   /* Analyze the tree and collect information which is necessary to
 776      create the dfa.  */
 777   err = analyze (dfa);
 778   if (BE (err != REG_NOERROR, 0))
 779     goto re_compile_internal_free_return;
 780
 781   /* Then create the initial state of the dfa.  */
 782   err = create_initial_state (dfa);
 783
 784   /* Release work areas.  */
 785   free_workarea_compile (preg);
 786   re_string_destruct (&regexp);
 787
 788   if (BE (err != REG_NOERROR, 0))
 789     {
 790     re_compile_internal_free_return:
 791       free_dfa_content (dfa);
 792       preg->buffer = NULL;
 793     }
 794
 795   return err;
 796 }
 797
 798 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 799    as the initial length of some arrays.  */
 800
 801 static reg_errcode_t
 802 init_dfa (dfa, pat_len)
 803      re_dfa_t *dfa;
 804      int pat_len;
 805 {
 806   int table_size;
 807
 808   memset (dfa, '\0', sizeof (re_dfa_t));
 809
 810   dfa->nodes_alloc = pat_len + 1;
 811   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 812
 813   dfa->states_alloc = pat_len + 1;
 814
 815   /*  table_size = 2 ^ ceil(log pat_len) */
 816   for (table_size = 1; table_size > 0; table_size <<= 1)
 817     if (table_size > pat_len)
 818       break;
 819
 820   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 821   dfa->state_hash_mask = table_size - 1;
 822
 823   dfa->subexps_alloc = 1;
 824   dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
 825   dfa->word_char = NULL;
 826
 827   if (BE (dfa->nodes == NULL || dfa->state_table == NULL
 828           || dfa->subexps == NULL, 0))
 829     {
 830       /* We don't bother to free anything which was allocated.  Very
 831          soon the process will go down anyway.  */
 832       dfa->subexps = NULL;
 833       dfa->state_table = NULL;
 834       dfa->nodes = NULL;
 835       return REG_ESPACE;
 836     }
 837   return REG_NOERROR;
 838 }
 839
 840 /* Initialize WORD_CHAR table, which indicate which character is
 841    "word".  In this case "word" means that it is the word construction
 842    character used by some operators like "\<", "\>", etc.  */
 843
 844 static reg_errcode_t
 845 init_word_char (dfa)
 846      re_dfa_t *dfa;
 847 {
 848   int i, j, ch;
 849   dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 850   if (BE (dfa->word_char == NULL, 0))
 851     return REG_ESPACE;
 852   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 853     for (j = 0; j < UINT_BITS; ++j, ++ch)
 854       if (isalnum (ch) || ch == '_')
 855         dfa->word_char[i] |= 1 << j;
 856   return REG_NOERROR;
 857 }
 858
 859 /* Free the work area which are only used while compiling.  */
 860
 861 static void
 862 free_workarea_compile (preg)
 863      regex_t *preg;
 864 {
 865   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 866   free_bin_tree (dfa->str_tree);
 867   dfa->str_tree = NULL;
 868 }
 869
 870 /* Create initial states for all contexts.  */
 871
 872 static reg_errcode_t
 873 create_initial_state (dfa)
 874      re_dfa_t *dfa;
 875 {
 876   int first, i;
 877   reg_errcode_t err;
 878   re_node_set init_nodes;
 879
 880   /* Initial states have the epsilon closure of the node which is
 881      the first node of the regular expression.  */
 882   first = dfa->str_tree->first;
 883   dfa->init_node = first;
 884   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 885   if (BE (err != REG_NOERROR, 0))
 886     return err;
 887
 888   /* The back-references which are in initial states can epsilon transit,
 889      since in this case all of the subexpressions can be null.
 890      Then we add epsilon closures of the nodes which are the next nodes of
 891      the back-references.  */
 892   if (dfa->nbackref > 0)
 893     for (i = 0; i < init_nodes.nelem; ++i)
 894       {
 895         int node_idx = init_nodes.elems[i];
 896         re_token_type_t type = dfa->nodes[node_idx].type;
 897
 898         int clexp_idx;
 899         if (type != OP_BACK_REF)
 900           continue;
 901         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 902           {
 903             re_token_t *clexp_node;
 904             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 905             if (clexp_node->type == OP_CLOSE_SUBEXP
 906                 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
 907               break;
 908           }
 909         if (clexp_idx == init_nodes.nelem)
 910           continue;
 911
 912         if (type == OP_BACK_REF)
 913           {
 914             int dest_idx = dfa->edests[node_idx].elems[0];
 915             if (!re_node_set_contains (&init_nodes, dest_idx))
 916               {
 917                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
 918                 i = 0;
 919               }
 920           }
 921       }
 922
 923   /* It must be the first time to invoke acquire_state.  */
 924   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
 925   /* We don't check ERR here, since the initial state must not be NULL.  */
 926   if (BE (dfa->init_state == NULL, 0))
 927     return err;
 928   if (dfa->init_state->has_constraint)
 929     {
 930       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
 931                                                        CONTEXT_WORD);
 932       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
 933                                                      CONTEXT_NEWLINE);
 934       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
 935                                                          &init_nodes,
 936                                                          CONTEXT_NEWLINE
 937                                                          | CONTEXT_BEGBUF);
 938       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 939               || dfa->init_state_begbuf == NULL, 0))
 940         return err;
 941     }
 942   else
 943     dfa->init_state_word = dfa->init_state_nl
 944       = dfa->init_state_begbuf = dfa->init_state;
 945
 946   re_node_set_free (&init_nodes);
 947   return REG_NOERROR;
 948 }
 949 \f
 950 /* Analyze the structure tree, and calculate "first", "next", "edest",
 951    "eclosure", and "inveclosure".  */
 952
 953 static reg_errcode_t
 954 analyze (dfa)
 955      re_dfa_t *dfa;
 956 {
 957   int i;
 958   reg_errcode_t ret;
 959
 960   /* Allocate arrays.  */
 961   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
 962   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
 963   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
 964   dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
 965   if (BE (dfa->nexts == NULL || dfa->edests == NULL
 966           || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
 967     return REG_ESPACE;
 968   /* Initialize them.  */
 969   for (i = 0; i < dfa->nodes_len; ++i)
 970     {
 971       dfa->nexts[i] = -1;
 972       re_node_set_init_empty (dfa->edests + i);
 973       re_node_set_init_empty (dfa->eclosures + i);
 974       re_node_set_init_empty (dfa->inveclosures + i);
 975     }
 976
 977   ret = analyze_tree (dfa, dfa->str_tree);
 978   if (BE (ret == REG_NOERROR, 1))
 979     {
 980       ret = calc_eclosure (dfa);
 981       if (ret == REG_NOERROR)
 982         calc_inveclosure (dfa);
 983     }
 984   return ret;
 985 }
 986
 987 /* Helper functions for analyze.
 988    This function calculate "first", "next", and "edest" for the subtree
 989    whose root is NODE.  */
 990
 991 static reg_errcode_t
 992 analyze_tree (dfa, node)
 993      re_dfa_t *dfa;
 994      bin_tree_t *node;
 995 {
 996   reg_errcode_t ret;
 997   if (node->first == -1)
 998     calc_first (dfa, node);
 999   if (node->next == -1)
1000     calc_next (dfa, node);
1001   if (node->eclosure.nelem == 0)
1002     calc_epsdest (dfa, node);
1003   /* Calculate "first" etc. for the left child.  */
1004   if (node->left != NULL)
1005     {
1006       ret = analyze_tree (dfa, node->left);
1007       if (BE (ret != REG_NOERROR, 0))
1008         return ret;
1009     }
1010   /* Calculate "first" etc. for the right child.  */
1011   if (node->right != NULL)
1012     {
1013       ret = analyze_tree (dfa, node->right);
1014       if (BE (ret != REG_NOERROR, 0))
1015         return ret;
1016     }
1017   return REG_NOERROR;
1018 }
1019
1020 /* Calculate "first" for the node NODE.  */
1021 static void
1022 calc_first (dfa, node)
1023      re_dfa_t *dfa;
1024      bin_tree_t *node;
1025 {
1026   int idx, type;
1027   idx = node->node_idx;
1028   type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1029
1030   switch (type)
1031     {
1032 #ifdef DEBUG
1033     case OP_OPEN_BRACKET:
1034     case OP_CLOSE_BRACKET:
1035     case OP_OPEN_DUP_NUM:
1036     case OP_CLOSE_DUP_NUM:
1037     case OP_NON_MATCH_LIST:
1038     case OP_OPEN_COLL_ELEM:
1039     case OP_CLOSE_COLL_ELEM:
1040     case OP_OPEN_EQUIV_CLASS:
1041     case OP_CLOSE_EQUIV_CLASS:
1042     case OP_OPEN_CHAR_CLASS:
1043     case OP_CLOSE_CHAR_CLASS:
1044       /* These must not be appeared here.  */
1045       assert (0);
1046 #endif
1047     case END_OF_RE:
1048     case CHARACTER:
1049     case OP_PERIOD:
1050     case OP_DUP_ASTERISK:
1051     case OP_DUP_QUESTION:
1052 #ifdef RE_ENABLE_I18N
1053     case COMPLEX_BRACKET:
1054 #endif /* RE_ENABLE_I18N */
1055     case SIMPLE_BRACKET:
1056     case OP_BACK_REF:
1057     case ANCHOR:
1058     case OP_OPEN_SUBEXP:
1059     case OP_CLOSE_SUBEXP:
1060       node->first = idx;
1061       break;
1062     case OP_DUP_PLUS:
1063 #ifdef DEBUG
1064       assert (node->left != NULL);
1065 #endif
1066       if (node->left->first == -1)
1067         calc_first (dfa, node->left);
1068       node->first = node->left->first;
1069       break;
1070     case OP_ALT:
1071       node->first = idx;
1072       break;
1073       /* else fall through */
1074     default:
1075 #ifdef DEBUG
1076       assert (node->left != NULL);
1077 #endif
1078       if (node->left->first == -1)
1079         calc_first (dfa, node->left);
1080       node->first = node->left->first;
1081       break;
1082     }
1083 }
1084
1085 /* Calculate "next" for the node NODE.  */
1086
1087 static void
1088 calc_next (dfa, node)
1089      re_dfa_t *dfa;
1090      bin_tree_t *node;
1091 {
1092   int idx, type;
1093   bin_tree_t *parent = node->parent;
1094   if (parent == NULL)
1095     {
1096       node->next = -1;
1097       idx = node->node_idx;
1098       if (node->type == 0)
1099         dfa->nexts[idx] = node->next;
1100       return;
1101     }
1102
1103   idx = parent->node_idx;
1104   type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1105
1106   switch (type)
1107     {
1108     case OP_DUP_ASTERISK:
1109     case OP_DUP_PLUS:
1110       node->next = idx;
1111       break;
1112     case CONCAT:
1113       if (parent->left == node)
1114         {
1115           if (parent->right->first == -1)
1116             calc_first (dfa, parent->right);
1117           node->next = parent->right->first;
1118           break;
1119         }
1120       /* else fall through */
1121     default:
1122       if (parent->next == -1)
1123         calc_next (dfa, parent);
1124       node->next = parent->next;
1125       break;
1126     }
1127   idx = node->node_idx;
1128   if (node->type == 0)
1129     dfa->nexts[idx] = node->next;
1130 }
1131
1132 /* Calculate "edest" for the node NODE.  */
1133
1134 static void
1135 calc_epsdest (dfa, node)
1136      re_dfa_t *dfa;
1137      bin_tree_t *node;
1138 {
1139   int idx;
1140   idx = node->node_idx;
1141   if (node->type == 0)
1142     {
1143       if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1144           || dfa->nodes[idx].type == OP_DUP_PLUS
1145           || dfa->nodes[idx].type == OP_DUP_QUESTION)
1146         {
1147           if (node->left->first == -1)
1148             calc_first (dfa, node->left);
1149           if (node->next == -1)
1150             calc_next (dfa, node);
1151           re_node_set_init_2 (dfa->edests + idx, node->left->first,
1152                               node->next);
1153         }
1154       else if (dfa->nodes[idx].type == OP_ALT)
1155         {
1156           int left, right;
1157           if (node->left != NULL)
1158             {
1159               if (node->left->first == -1)
1160                 calc_first (dfa, node->left);
1161               left = node->left->first;
1162             }
1163           else
1164             {
1165               if (node->next == -1)
1166                 calc_next (dfa, node);
1167               left = node->next;
1168             }
1169           if (node->right != NULL)
1170             {
1171               if (node->right->first == -1)
1172                 calc_first (dfa, node->right);
1173               right = node->right->first;
1174             }
1175           else
1176             {
1177               if (node->next == -1)
1178                 calc_next (dfa, node);
1179               right = node->next;
1180             }
1181           re_node_set_init_2 (dfa->edests + idx, left, right);
1182         }
1183       else if (dfa->nodes[idx].type == ANCHOR
1184                || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1185                || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1186                || dfa->nodes[idx].type == OP_BACK_REF)
1187         re_node_set_init_1 (dfa->edests + idx, node->next);
1188     }
1189 }
1190
1191 /* Duplicate the epsilon closure of the node ROOT_NODE.
1192    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1193    to their own constraint.  */
1194
1195 static reg_errcode_t
1196 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1197                         init_constraint)
1198      re_dfa_t *dfa;
1199      int top_org_node, top_clone_node, root_node;
1200      unsigned int init_constraint;
1201 {
1202   reg_errcode_t err;
1203   int org_node, clone_node, ret;
1204   unsigned int constraint = init_constraint;
1205   for (org_node = top_org_node, clone_node = top_clone_node;;)
1206     {
1207       int org_dest, clone_dest;
1208       if (dfa->nodes[org_node].type == OP_BACK_REF)
1209         {
1210           /* If the back reference epsilon-transit, its destination must
1211              also have the constraint.  Then duplicate the epsilon closure
1212              of the destination of the back reference, and store it in
1213              edests of the back reference.  */
1214           org_dest = dfa->nexts[org_node];
1215           re_node_set_empty (dfa->edests + clone_node);
1216           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1217           if (BE (err != REG_NOERROR, 0))
1218             return err;
1219           dfa->nexts[clone_node] = dfa->nexts[org_node];
1220           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1221           if (BE (ret < 0, 0))
1222             return REG_ESPACE;
1223         }
1224       else if (dfa->edests[org_node].nelem == 0)
1225         {
1226           /* In case of the node can't epsilon-transit, don't duplicate the
1227              destination and store the original destination as the
1228              destination of the node.  */
1229           dfa->nexts[clone_node] = dfa->nexts[org_node];
1230           break;
1231         }
1232       else if (dfa->edests[org_node].nelem == 1)
1233         {
1234           /* In case of the node can epsilon-transit, and it has only one
1235              destination.  */
1236           org_dest = dfa->edests[org_node].elems[0];
1237           re_node_set_empty (dfa->edests + clone_node);
1238           if (dfa->nodes[org_node].type == ANCHOR)
1239             {
1240               /* In case of the node has another constraint, append it.  */
1241               if (org_node == root_node && clone_node != org_node)
1242                 {
1243                   /* ...but if the node is root_node itself, it means the
1244                      epsilon closure have a loop, then tie it to the
1245                      destination of the root_node.  */
1246                   ret = re_node_set_insert (dfa->edests + clone_node,
1247                                             org_dest);
1248                   if (BE (ret < 0, 0))
1249                     return REG_ESPACE;
1250                   break;
1251                 }
1252               constraint |= dfa->nodes[org_node].opr.ctx_type;
1253             }
1254           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1255           if (BE (err != REG_NOERROR, 0))
1256             return err;
1257           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1258           if (BE (ret < 0, 0))
1259             return REG_ESPACE;
1260         }
1261       else /* dfa->edests[org_node].nelem == 2 */
1262         {
1263           /* In case of the node can epsilon-transit, and it has two
1264              destinations.  */
1265           org_dest = dfa->edests[org_node].elems[0];
1266           re_node_set_empty (dfa->edests + clone_node);
1267           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1268           if (BE (err != REG_NOERROR, 0))
1269             return err;
1270           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1271           if (BE (ret < 0, 0))
1272             return REG_ESPACE;
1273
1274           err = duplicate_node_closure (dfa, org_dest, clone_dest, root_node,
1275                                         constraint);
1276           if (BE (err != REG_NOERROR, 0))
1277             return err;
1278
1279           org_dest = dfa->edests[org_node].elems[1];
1280           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1281           if (BE (err != REG_NOERROR, 0))
1282             return err;
1283           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1284           if (BE (ret < 0, 0))
1285             return REG_ESPACE;
1286         }
1287       org_node = org_dest;
1288       clone_node = clone_dest;
1289     }
1290   return REG_NOERROR;
1291 }
1292
1293 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1294    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1295    otherwise return the error code.  */
1296
1297 static reg_errcode_t
1298 duplicate_node (new_idx, dfa, org_idx, constraint)
1299      re_dfa_t *dfa;
1300      int *new_idx, org_idx;
1301      unsigned int constraint;
1302 {
1303   re_token_t dup;
1304   int dup_idx;
1305
1306   dup = dfa->nodes[org_idx];
1307   dup_idx = re_dfa_add_node (dfa, dup, 1);
1308   if (BE (dup_idx == -1, 0))
1309     return REG_ESPACE;
1310   dfa->nodes[dup_idx].constraint = constraint;
1311   if (dfa->nodes[org_idx].type == ANCHOR)
1312     dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1313   dfa->nodes[dup_idx].duplicated = 1;
1314   re_node_set_init_empty (dfa->edests + dup_idx);
1315   re_node_set_init_empty (dfa->eclosures + dup_idx);
1316   re_node_set_init_empty (dfa->inveclosures + dup_idx);
1317
1318   *new_idx = dup_idx;
1319   return REG_NOERROR;
1320 }
1321
1322 static void
1323 calc_inveclosure (dfa)
1324      re_dfa_t *dfa;
1325 {
1326   int src, idx, dest;
1327   for (src = 0; src < dfa->nodes_len; ++src)
1328     {
1329       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1330         {
1331           dest = dfa->eclosures[src].elems[idx];
1332           re_node_set_insert (dfa->inveclosures + dest, src);
1333         }
1334     }
1335 }
1336
1337 /* Calculate "eclosure" for all the node in DFA.  */
1338
1339 static reg_errcode_t
1340 calc_eclosure (dfa)
1341      re_dfa_t *dfa;
1342 {
1343   int node_idx, incomplete;
1344 #ifdef DEBUG
1345   assert (dfa->nodes_len > 0);
1346 #endif
1347   incomplete = 0;
1348   /* For each nodes, calculate epsilon closure.  */
1349   for (node_idx = 0; ; ++node_idx)
1350     {
1351       reg_errcode_t err;
1352       re_node_set eclosure_elem;
1353       if (node_idx == dfa->nodes_len)
1354         {
1355           if (!incomplete)
1356             break;
1357           incomplete = 0;
1358           node_idx = 0;
1359         }
1360
1361 #ifdef DEBUG
1362       assert (dfa->eclosures[node_idx].nelem != -1);
1363 #endif
1364       /* If we have already calculated, skip it.  */
1365       if (dfa->eclosures[node_idx].nelem != 0)
1366         continue;
1367       /* Calculate epsilon closure of `node_idx'.  */
1368       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1369       if (BE (err != REG_NOERROR, 0))
1370         return err;
1371
1372       if (dfa->eclosures[node_idx].nelem == 0)
1373         {
1374           incomplete = 1;
1375           re_node_set_free (&eclosure_elem);
1376         }
1377     }
1378   return REG_NOERROR;
1379 }
1380
1381 /* Calculate epsilon closure of NODE.  */
1382
1383 static reg_errcode_t
1384 calc_eclosure_iter (new_set, dfa, node, root)
1385      re_node_set *new_set;
1386      re_dfa_t *dfa;
1387      int node, root;
1388 {
1389   reg_errcode_t err;
1390   unsigned int constraint;
1391   int i, incomplete;
1392   re_node_set eclosure;
1393   incomplete = 0;
1394   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1395   if (BE (err != REG_NOERROR, 0))
1396     return err;
1397
1398   /* This indicates that we are calculating this node now.
1399      We reference this value to avoid infinite loop.  */
1400   dfa->eclosures[node].nelem = -1;
1401
1402   constraint = ((dfa->nodes[node].type == ANCHOR)
1403                 ? dfa->nodes[node].opr.ctx_type : 0);
1404   /* If the current node has constraints, duplicate all nodes.
1405      Since they must inherit the constraints.  */
1406   if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1407     {
1408       int org_node, cur_node;
1409       org_node = cur_node = node;
1410       err = duplicate_node_closure (dfa, node, node, node, constraint);
1411       if (BE (err != REG_NOERROR, 0))
1412         return err;
1413     }
1414
1415   /* Expand each epsilon destination nodes.  */
1416   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1417     for (i = 0; i < dfa->edests[node].nelem; ++i)
1418       {
1419         re_node_set eclosure_elem;
1420         int edest = dfa->edests[node].elems[i];
1421         /* If calculating the epsilon closure of `edest' is in progress,
1422            return intermediate result.  */
1423         if (dfa->eclosures[edest].nelem == -1)
1424           {
1425             incomplete = 1;
1426             continue;
1427           }
1428         /* If we haven't calculated the epsilon closure of `edest' yet,
1429            calculate now. Otherwise use calculated epsilon closure.  */
1430         if (dfa->eclosures[edest].nelem == 0)
1431           {
1432             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1433             if (BE (err != REG_NOERROR, 0))
1434               return err;
1435           }
1436         else
1437           eclosure_elem = dfa->eclosures[edest];
1438         /* Merge the epsilon closure of `edest'.  */
1439         re_node_set_merge (&eclosure, &eclosure_elem);
1440         /* If the epsilon closure of `edest' is incomplete,
1441            the epsilon closure of this node is also incomplete.  */
1442         if (dfa->eclosures[edest].nelem == 0)
1443           {
1444             incomplete = 1;
1445             re_node_set_free (&eclosure_elem);
1446           }
1447       }
1448
1449   /* Epsilon closures include itself.  */
1450   re_node_set_insert (&eclosure, node);
1451   if (incomplete && !root)
1452     dfa->eclosures[node].nelem = 0;
1453   else
1454     dfa->eclosures[node] = eclosure;
1455   *new_set = eclosure;
1456   return REG_NOERROR;
1457 }
1458 \f
1459 /* Functions for token which are used in the parser.  */
1460
1461 /* Fetch a token from INPUT.
1462    We must not use this function inside bracket expressions.  */
1463
1464 static re_token_t
1465 fetch_token (input, syntax)
1466      re_string_t *input;
1467      reg_syntax_t syntax;
1468 {
1469   re_token_t token;
1470   int consumed_byte;
1471   consumed_byte = peek_token (&token, input, syntax);
1472   re_string_skip_bytes (input, consumed_byte);
1473   return token;
1474 }
1475
1476 /* Peek a token from INPUT, and return the length of the token.
1477    We must not use this function inside bracket expressions.  */
1478
1479 static int
1480 peek_token (token, input, syntax)
1481      re_token_t *token;
1482      re_string_t *input;
1483      reg_syntax_t syntax;
1484 {
1485   unsigned char c;
1486
1487   if (re_string_eoi (input))
1488     {
1489       token->type = END_OF_RE;
1490       return 0;
1491     }
1492
1493   c = re_string_peek_byte (input, 0);
1494   token->opr.c = c;
1495
1496 #ifdef RE_ENABLE_I18N
1497   token->mb_partial = 0;
1498   if (MB_CUR_MAX > 1 &&
1499       !re_string_first_byte (input, re_string_cur_idx (input)))
1500     {
1501       token->type = CHARACTER;
1502       token->mb_partial = 1;
1503       return 1;
1504     }
1505 #endif
1506   if (c == '\\')
1507     {
1508       unsigned char c2;
1509       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1510         {
1511           token->type = BACK_SLASH;
1512           return 1;
1513         }
1514
1515       c2 = re_string_peek_byte_case (input, 1);
1516       token->opr.c = c2;
1517       token->type = CHARACTER;
1518       switch (c2)
1519         {
1520         case '|':
1521           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1522             token->type = OP_ALT;
1523           break;
1524         case '1': case '2': case '3': case '4': case '5':
1525         case '6': case '7': case '8': case '9':
1526           if (!(syntax & RE_NO_BK_REFS))
1527             {
1528               token->type = OP_BACK_REF;
1529               token->opr.idx = c2 - '0';
1530             }
1531           break;
1532         case '<':
1533           if (!(syntax & RE_NO_GNU_OPS))
1534             {
1535               token->type = ANCHOR;
1536               token->opr.idx = WORD_FIRST;
1537             }
1538           break;
1539         case '>':
1540           if (!(syntax & RE_NO_GNU_OPS))
1541             {
1542               token->type = ANCHOR;
1543               token->opr.idx = WORD_LAST;
1544             }
1545           break;
1546         case 'b':
1547           if (!(syntax & RE_NO_GNU_OPS))
1548             {
1549               token->type = ANCHOR;
1550               token->opr.idx = WORD_DELIM;
1551             }
1552           break;
1553         case 'B':
1554           if (!(syntax & RE_NO_GNU_OPS))
1555             {
1556               token->type = ANCHOR;
1557               token->opr.idx = INSIDE_WORD;
1558             }
1559           break;
1560         case 'w':
1561           if (!(syntax & RE_NO_GNU_OPS))
1562             token->type = OP_WORD;
1563           break;
1564         case 'W':
1565           if (!(syntax & RE_NO_GNU_OPS))
1566             token->type = OP_NOTWORD;
1567           break;
1568         case '`':
1569           if (!(syntax & RE_NO_GNU_OPS))
1570             {
1571               token->type = ANCHOR;
1572               token->opr.idx = BUF_FIRST;
1573             }
1574           break;
1575         case '\'':
1576           if (!(syntax & RE_NO_GNU_OPS))
1577             {
1578               token->type = ANCHOR;
1579               token->opr.idx = BUF_LAST;
1580             }
1581           break;
1582         case '(':
1583           if (!(syntax & RE_NO_BK_PARENS))
1584             token->type = OP_OPEN_SUBEXP;
1585           break;
1586         case ')':
1587           if (!(syntax & RE_NO_BK_PARENS))
1588             token->type = OP_CLOSE_SUBEXP;
1589           break;
1590         case '+':
1591           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1592             token->type = OP_DUP_PLUS;
1593           break;
1594         case '?':
1595           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1596             token->type = OP_DUP_QUESTION;
1597           break;
1598         case '{':
1599           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1600             token->type = OP_OPEN_DUP_NUM;
1601           break;
1602         case '}':
1603           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1604             token->type = OP_CLOSE_DUP_NUM;
1605           break;
1606         default:
1607           break;
1608         }
1609       return 2;
1610     }
1611
1612   token->type = CHARACTER;
1613   switch (c)
1614     {
1615     case '\n':
1616       if (syntax & RE_NEWLINE_ALT)
1617         token->type = OP_ALT;
1618       break;
1619     case '|':
1620       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1621         token->type = OP_ALT;
1622       break;
1623     case '*':
1624       token->type = OP_DUP_ASTERISK;
1625       break;
1626     case '+':
1627       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1628         token->type = OP_DUP_PLUS;
1629       break;
1630     case '?':
1631       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1632         token->type = OP_DUP_QUESTION;
1633       break;
1634     case '{':
1635       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1636         token->type = OP_OPEN_DUP_NUM;
1637       break;
1638     case '}':
1639       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1640         token->type = OP_CLOSE_DUP_NUM;
1641       break;
1642     case '(':
1643       if (syntax & RE_NO_BK_PARENS)
1644         token->type = OP_OPEN_SUBEXP;
1645       break;
1646     case ')':
1647       if (syntax & RE_NO_BK_PARENS)
1648         token->type = OP_CLOSE_SUBEXP;
1649       break;
1650     case '[':
1651       token->type = OP_OPEN_BRACKET;
1652       break;
1653     case '.':
1654       token->type = OP_PERIOD;
1655       break;
1656     case '^':
1657       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1658           re_string_cur_idx (input) != 0)
1659         {
1660           char prev = re_string_peek_byte (input, -1);
1661           if (prev != '|' && prev != '(' &&
1662               (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
1663             break;
1664         }
1665       token->type = ANCHOR;
1666       token->opr.idx = LINE_FIRST;
1667       break;
1668     case '$':
1669       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1670           re_string_cur_idx (input) + 1 != re_string_length (input))
1671         {
1672           re_token_t next;
1673           re_string_skip_bytes (input, 1);
1674           peek_token (&next, input, syntax);
1675           re_string_skip_bytes (input, -1);
1676           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1677             break;
1678         }
1679       token->type = ANCHOR;
1680       token->opr.idx = LINE_LAST;
1681       break;
1682     default:
1683       break;
1684     }
1685   return 1;
1686 }
1687
1688 /* Peek a token from INPUT, and return the length of the token.
1689    We must not use this function out of bracket expressions.  */
1690
1691 static int
1692 peek_token_bracket (token, input, syntax)
1693      re_token_t *token;
1694      re_string_t *input;
1695      reg_syntax_t syntax;
1696 {
1697   unsigned char c;
1698   if (re_string_eoi (input))
1699     {
1700       token->type = END_OF_RE;
1701       return 0;
1702     }
1703   c = re_string_peek_byte (input, 0);
1704   token->opr.c = c;
1705
1706 #ifdef RE_ENABLE_I18N
1707   if (MB_CUR_MAX > 1 &&
1708       !re_string_first_byte (input, re_string_cur_idx (input)))
1709     {
1710       token->type = CHARACTER;
1711       return 1;
1712     }
1713 #endif /* RE_ENABLE_I18N */
1714
1715   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1716     {
1717       /* In this case, '\' escape a character.  */
1718       unsigned char c2;
1719       re_string_skip_bytes (input, 1);
1720       c2 = re_string_peek_byte (input, 0);
1721       token->opr.c = c2;
1722       token->type = CHARACTER;
1723       return 1;
1724     }
1725   if (c == '[') /* '[' is a special char in a bracket exps.  */
1726     {
1727       unsigned char c2;
1728       int token_len;
1729       c2 = re_string_peek_byte (input, 1);
1730       token->opr.c = c2;
1731       token_len = 2;
1732       switch (c2)
1733         {
1734         case '.':
1735           token->type = OP_OPEN_COLL_ELEM;
1736           break;
1737         case '=':
1738           token->type = OP_OPEN_EQUIV_CLASS;
1739           break;
1740         case ':':
1741           if (syntax & RE_CHAR_CLASSES)
1742             {
1743               token->type = OP_OPEN_CHAR_CLASS;
1744               break;
1745             }
1746           /* else fall through.  */
1747         default:
1748           token->type = CHARACTER;
1749           token->opr.c = c;
1750           token_len = 1;
1751           break;
1752         }
1753       return token_len;
1754     }
1755   switch (c)
1756     {
1757     case '-':
1758       token->type = OP_CHARSET_RANGE;
1759       break;
1760     case ']':
1761       token->type = OP_CLOSE_BRACKET;
1762       break;
1763     case '^':
1764       token->type = OP_NON_MATCH_LIST;
1765       break;
1766     default:
1767       token->type = CHARACTER;
1768     }
1769   return 1;
1770 }
1771 \f
1772 /* Functions for parser.  */
1773
1774 /* Entry point of the parser.
1775    Parse the regular expression REGEXP and return the structure tree.
1776    If an error is occured, ERR is set by error code, and return NULL.
1777    This function build the following tree, from regular expression <reg_exp>:
1778            CAT
1779            / \
1780           /   \
1781    <reg_exp>  EOR
1782
1783    CAT means concatenation.
1784    EOR means end of regular expression.  */
1785
1786 static bin_tree_t *
1787 parse (regexp, preg, syntax, err)
1788      re_string_t *regexp;
1789      regex_t *preg;
1790      reg_syntax_t syntax;
1791      reg_errcode_t *err;
1792 {
1793   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1794   bin_tree_t *tree, *eor, *root;
1795   re_token_t current_token;
1796   int new_idx;
1797   current_token = fetch_token (regexp, syntax);
1798   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1799   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1800     return NULL;
1801   new_idx = re_dfa_add_node (dfa, current_token, 0);
1802   eor = create_tree (NULL, NULL, 0, new_idx);
1803   if (tree != NULL)
1804     root = create_tree (tree, eor, CONCAT, 0);
1805   else
1806     root = eor;
1807   if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1808     {
1809       *err = REG_ESPACE;
1810       return NULL;
1811     }
1812   return root;
1813 }
1814
1815 /* This function build the following tree, from regular expression
1816    <branch1>|<branch2>:
1817            ALT
1818            / \
1819           /   \
1820    <branch1> <branch2>
1821
1822    ALT means alternative, which represents the operator `|'.  */
1823
1824 static bin_tree_t *
1825 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1826      re_string_t *regexp;
1827      regex_t *preg;
1828      re_token_t *token;
1829      reg_syntax_t syntax;
1830      int nest;
1831      reg_errcode_t *err;
1832 {
1833   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1834   bin_tree_t *tree, *branch = NULL;
1835   int new_idx;
1836   tree = parse_branch (regexp, preg, token, syntax, nest, err);
1837   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1838     return NULL;
1839
1840   while (token->type == OP_ALT)
1841     {
1842       re_token_t alt_token = *token;
1843       new_idx = re_dfa_add_node (dfa, alt_token, 0);
1844       *token = fetch_token (regexp, syntax);
1845       if (token->type != OP_ALT && token->type != END_OF_RE
1846           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1847         {
1848           branch = parse_branch (regexp, preg, token, syntax, nest, err);
1849           if (BE (*err != REG_NOERROR && branch == NULL, 0))
1850             {
1851               free_bin_tree (tree);
1852               return NULL;
1853             }
1854         }
1855       else
1856         branch = NULL;
1857       tree = create_tree (tree, branch, 0, new_idx);
1858       if (BE (new_idx == -1 || tree == NULL, 0))
1859         {
1860           *err = REG_ESPACE;
1861           return NULL;
1862         }
1863       dfa->has_plural_match = 1;
1864     }
1865   return tree;
1866 }
1867
1868 /* This function build the following tree, from regular expression
1869    <exp1><exp2>:
1870         CAT
1871         / \
1872        /   \
1873    <exp1> <exp2>
1874
1875    CAT means concatenation.  */
1876
1877 static bin_tree_t *
1878 parse_branch (regexp, preg, token, syntax, nest, err)
1879      re_string_t *regexp;
1880      regex_t *preg;
1881      re_token_t *token;
1882      reg_syntax_t syntax;
1883      int nest;
1884      reg_errcode_t *err;
1885 {
1886   bin_tree_t *tree, *exp;
1887   tree = parse_expression (regexp, preg, token, syntax, nest, err);
1888   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1889     return NULL;
1890
1891   while (token->type != OP_ALT && token->type != END_OF_RE
1892          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1893     {
1894       exp = parse_expression (regexp, preg, token, syntax, nest, err);
1895       if (BE (*err != REG_NOERROR && exp == NULL, 0))
1896         {
1897           free_bin_tree (tree);
1898           return NULL;
1899         }
1900       if (tree != NULL && exp != NULL)
1901         {
1902           tree = create_tree (tree, exp, CONCAT, 0);
1903           if (tree == NULL)
1904             {
1905               *err = REG_ESPACE;
1906               return NULL;
1907             }
1908         }
1909       else if (tree == NULL)
1910         tree = exp;
1911       /* Otherwise exp == NULL, we don't need to create new tree.  */
1912     }
1913   return tree;
1914 }
1915
1916 /* This function build the following tree, from regular expression a*:
1917          *
1918          |
1919          a
1920 */
1921
1922 static bin_tree_t *
1923 parse_expression (regexp, preg, token, syntax, nest, err)
1924      re_string_t *regexp;
1925      regex_t *preg;
1926      re_token_t *token;
1927      reg_syntax_t syntax;
1928      int nest;
1929      reg_errcode_t *err;
1930 {
1931   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1932   bin_tree_t *tree;
1933   int new_idx;
1934   switch (token->type)
1935     {
1936     case CHARACTER:
1937       new_idx = re_dfa_add_node (dfa, *token, 0);
1938       tree = create_tree (NULL, NULL, 0, new_idx);
1939       if (BE (new_idx == -1 || tree == NULL, 0))
1940         {
1941           *err = REG_ESPACE;
1942           return NULL;
1943         }
1944 #ifdef RE_ENABLE_I18N
1945       if (MB_CUR_MAX > 1)
1946         {
1947           while (!re_string_eoi (regexp)
1948                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1949             {
1950               bin_tree_t *mbc_remain;
1951               *token = fetch_token (regexp, syntax);
1952               new_idx = re_dfa_add_node (dfa, *token, 0);
1953               mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1954               tree = create_tree (tree, mbc_remain, CONCAT, 0);
1955               if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
1956                 return *err = REG_ESPACE, NULL;
1957             }
1958         }
1959 #endif
1960       break;
1961     case OP_OPEN_SUBEXP:
1962       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
1963       if (BE (*err != REG_NOERROR && tree == NULL, 0))
1964         return NULL;
1965       break;
1966     case OP_OPEN_BRACKET:
1967       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
1968       if (BE (*err != REG_NOERROR && tree == NULL, 0))
1969         return NULL;
1970       break;
1971     case OP_BACK_REF:
1972       if (BE (preg->re_nsub < token->opr.idx
1973               || dfa->subexps[token->opr.idx - 1].end == -1, 0))
1974         {
1975           *err = REG_ESUBREG;
1976           return NULL;
1977         }
1978       new_idx = re_dfa_add_node (dfa, *token, 0);
1979       tree = create_tree (NULL, NULL, 0, new_idx);
1980       if (BE (new_idx == -1 || tree == NULL, 0))
1981         {
1982           *err = REG_ESPACE;
1983           return NULL;
1984         }
1985       ++dfa->nbackref;
1986       dfa->has_mb_node = 1;
1987       break;
1988     case OP_DUP_ASTERISK:
1989     case OP_DUP_PLUS:
1990     case OP_DUP_QUESTION:
1991     case OP_OPEN_DUP_NUM:
1992       if (syntax & RE_CONTEXT_INVALID_OPS)
1993         {
1994           *err = REG_BADRPT;
1995           return NULL;
1996         }
1997       else if (syntax & RE_CONTEXT_INDEP_OPS)
1998         {
1999           *token = fetch_token (regexp, syntax);
2000           return parse_expression (regexp, preg, token, syntax, nest, err);
2001         }
2002       /* else fall through  */
2003     case OP_CLOSE_SUBEXP:
2004       if ((token->type == OP_CLOSE_SUBEXP) &&
2005           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2006         {
2007           *err = REG_ERPAREN;
2008           return NULL;
2009         }
2010       /* else fall through  */
2011     case OP_CLOSE_DUP_NUM:
2012       /* We treat it as a normal character.  */
2013
2014       /* Then we can these characters as normal characters.  */
2015       token->type = CHARACTER;
2016       new_idx = re_dfa_add_node (dfa, *token, 0);
2017       tree = create_tree (NULL, NULL, 0, new_idx);
2018       if (BE (new_idx == -1 || tree == NULL, 0))
2019         {
2020           *err = REG_ESPACE;
2021           return NULL;
2022         }
2023       break;
2024     case ANCHOR:
2025       if (dfa->word_char == NULL)
2026         {
2027           *err = init_word_char (dfa);
2028           if (BE (*err != REG_NOERROR, 0))
2029             return NULL;
2030         }
2031       if (token->opr.ctx_type == WORD_DELIM)
2032         {
2033           bin_tree_t *tree_first, *tree_last;
2034           int idx_first, idx_last;
2035           token->opr.ctx_type = WORD_FIRST;
2036           idx_first = re_dfa_add_node (dfa, *token, 0);
2037           tree_first = create_tree (NULL, NULL, 0, idx_first);
2038           token->opr.ctx_type = WORD_LAST;
2039           idx_last = re_dfa_add_node (dfa, *token, 0);
2040           tree_last = create_tree (NULL, NULL, 0, idx_last);
2041           token->type = OP_ALT;
2042           new_idx = re_dfa_add_node (dfa, *token, 0);
2043           tree = create_tree (tree_first, tree_last, 0, new_idx);
2044           if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2045                   || tree_first == NULL || tree_last == NULL
2046                   || tree == NULL, 0))
2047             {
2048               *err = REG_ESPACE;
2049               return NULL;
2050             }
2051         }
2052       else
2053         {
2054           new_idx = re_dfa_add_node (dfa, *token, 0);
2055           tree = create_tree (NULL, NULL, 0, new_idx);
2056           if (BE (new_idx == -1 || tree == NULL, 0))
2057             return *err = REG_ESPACE, NULL;
2058         }
2059       /* We must return here, since ANCHORs can't be followed
2060          by repetition operators.
2061          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2062              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2063       *token = fetch_token (regexp, syntax);
2064       return tree;
2065     case OP_PERIOD:
2066       new_idx = re_dfa_add_node (dfa, *token, 0);
2067       tree = create_tree (NULL, NULL, 0, new_idx);
2068       if (BE (new_idx == -1 || tree == NULL, 0))
2069         {
2070           *err = REG_ESPACE;
2071           return NULL;
2072         }
2073       if (MB_CUR_MAX > 1)
2074         dfa->has_mb_node = 1;
2075       break;
2076     case OP_WORD:
2077       tree = build_word_op (dfa, 0, err);
2078       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2079         return NULL;
2080       break;
2081     case OP_NOTWORD:
2082       tree = build_word_op (dfa, 1, err);
2083       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2084         return NULL;
2085       break;
2086     case OP_ALT:
2087     case END_OF_RE:
2088       return NULL;
2089     case BACK_SLASH:
2090       *err = REG_EESCAPE;
2091       return NULL;
2092     default:
2093       /* Must not happen?  */
2094 #ifdef DEBUG
2095       assert (0);
2096 #endif
2097       return NULL;
2098     }
2099   *token = fetch_token (regexp, syntax);
2100
2101   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2102          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2103     {
2104       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2105       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2106         return NULL;
2107       dfa->has_plural_match = 1;
2108     }
2109
2110   return tree;
2111 }
2112
2113 /* This function build the following tree, from regular expression
2114    (<reg_exp>):
2115          SUBEXP
2116             |
2117         <reg_exp>
2118 */
2119
2120 static bin_tree_t *
2121 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2122      re_string_t *regexp;
2123      regex_t *preg;
2124      re_token_t *token;
2125      reg_syntax_t syntax;
2126      int nest;
2127      reg_errcode_t *err;
2128 {
2129   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2130   bin_tree_t *tree, *left_par, *right_par;
2131   size_t cur_nsub;
2132   int new_idx;
2133   cur_nsub = preg->re_nsub++;
2134   if (dfa->subexps_alloc < preg->re_nsub)
2135     {
2136       re_subexp_t *new_array;
2137       dfa->subexps_alloc *= 2;
2138       new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2139       if (BE (new_array == NULL, 0))
2140         {
2141           dfa->subexps_alloc /= 2;
2142           *err = REG_ESPACE;
2143           return NULL;
2144         }
2145       dfa->subexps = new_array;
2146     }
2147   dfa->subexps[cur_nsub].start = dfa->nodes_len;
2148   dfa->subexps[cur_nsub].end = -1;
2149
2150   new_idx = re_dfa_add_node (dfa, *token, 0);
2151   left_par = create_tree (NULL, NULL, 0, new_idx);
2152   if (BE (new_idx == -1 || left_par == NULL, 0))
2153     {
2154       *err = REG_ESPACE;
2155       return NULL;
2156     }
2157   dfa->nodes[new_idx].opr.idx = cur_nsub;
2158   *token = fetch_token (regexp, syntax);
2159
2160   /* The subexpression may be a null string.  */
2161   if (token->type == OP_CLOSE_SUBEXP)
2162     tree = NULL;
2163   else
2164     {
2165       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2166       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2167         return NULL;
2168     }
2169   if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2170     {
2171       free_bin_tree (tree);
2172       *err = REG_BADPAT;
2173       return NULL;
2174     }
2175   new_idx = re_dfa_add_node (dfa, *token, 0);
2176   dfa->subexps[cur_nsub].end = dfa->nodes_len;
2177   right_par = create_tree (NULL, NULL, 0, new_idx);
2178   tree = ((tree == NULL) ? right_par
2179           : create_tree (tree, right_par, CONCAT, 0));
2180   tree = create_tree (left_par, tree, CONCAT, 0);
2181   if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2182     {
2183       *err = REG_ESPACE;
2184       return NULL;
2185     }
2186   dfa->nodes[new_idx].opr.idx = cur_nsub;
2187
2188   return tree;
2189 }
2190
2191 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2192
2193 static bin_tree_t *
2194 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2195      bin_tree_t *dup_elem;
2196      re_string_t *regexp;
2197      re_dfa_t *dfa;
2198      re_token_t *token;
2199      reg_syntax_t syntax;
2200      reg_errcode_t *err;
2201 {
2202   re_token_t dup_token;
2203   bin_tree_t *tree = dup_elem, *work_tree;
2204   int new_idx, start_idx = re_string_cur_idx (regexp);
2205   re_token_t start_token = *token;
2206   if (token->type == OP_OPEN_DUP_NUM)
2207     {
2208       int i;
2209       int end = 0;
2210       int start = fetch_number (regexp, token, syntax);
2211       bin_tree_t *elem;
2212       if (start == -1)
2213         {
2214           if (token->type == CHARACTER && token->opr.c == ',')
2215             start = 0; /* We treat "{,m}" as "{0,m}".  */
2216           else
2217             {
2218               *err = REG_BADBR; /* <re>{} is invalid.  */
2219               return NULL;
2220             }
2221         }
2222       if (BE (start != -2, 1))
2223         {
2224           /* We treat "{n}" as "{n,n}".  */
2225           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2226                  : ((token->type == CHARACTER && token->opr.c == ',')
2227                     ? fetch_number (regexp, token, syntax) : -2));
2228         }
2229       if (BE (start == -2 || end == -2, 0))
2230         {
2231           /* Invalid sequence.  */
2232           if (token->type == OP_CLOSE_DUP_NUM)
2233             goto parse_dup_op_invalid_interval;
2234           else
2235             goto parse_dup_op_ebrace;
2236         }
2237       if (BE (start == 0 && end == 0, 0))
2238         {
2239           /* We treat "<re>{0}" and "<re>{0,0}" as null string.  */
2240           *token = fetch_token (regexp, syntax);
2241           free_bin_tree (dup_elem);
2242           return NULL;
2243         }
2244
2245       /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2246       elem = tree;
2247       for (i = 0; i < start; ++i)
2248         if (i != 0)
2249           {
2250             work_tree = duplicate_tree (elem, dfa);
2251             tree = create_tree (tree, work_tree, CONCAT, 0);
2252             if (BE (work_tree == NULL || tree == NULL, 0))
2253               goto parse_dup_op_espace;
2254           }
2255
2256       if (end == -1)
2257         {
2258           /* We treat "<re>{0,}" as "<re>*".  */
2259           dup_token.type = OP_DUP_ASTERISK;
2260           if (start > 0)
2261             {
2262               elem = duplicate_tree (elem, dfa);
2263               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2264               work_tree = create_tree (elem, NULL, 0, new_idx);
2265               tree = create_tree (tree, work_tree, CONCAT, 0);
2266               if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2267                       || tree == NULL, 0))
2268                 goto parse_dup_op_espace;
2269             }
2270           else
2271             {
2272               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2273               tree = create_tree (elem, NULL, 0, new_idx);
2274               if (BE (new_idx == -1 || tree == NULL, 0))
2275                 goto parse_dup_op_espace;
2276             }
2277         }
2278       else if (end - start > 0)
2279         {
2280           /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?".  */
2281           dup_token.type = OP_DUP_QUESTION;
2282           if (start > 0)
2283             {
2284               elem = duplicate_tree (elem, dfa);
2285               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2286               elem = create_tree (elem, NULL, 0, new_idx);
2287               tree = create_tree (tree, elem, CONCAT, 0);
2288               if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2289                 goto parse_dup_op_espace;
2290             }
2291           else
2292             {
2293               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2294               tree = elem = create_tree (elem, NULL, 0, new_idx);
2295               if (BE (new_idx == -1 || tree == NULL, 0))
2296                 goto parse_dup_op_espace;
2297             }
2298           for (i = 1; i < end - start; ++i)
2299             {
2300               work_tree = duplicate_tree (elem, dfa);
2301               tree = create_tree (tree, work_tree, CONCAT, 0);
2302               if (BE (work_tree == NULL || tree == NULL, 0))
2303                 {
2304                   *err = REG_ESPACE;
2305                   return NULL;
2306                 }
2307             }
2308         }
2309     }
2310   else
2311     {
2312       new_idx = re_dfa_add_node (dfa, *token, 0);
2313       tree = create_tree (tree, NULL, 0, new_idx);
2314       if (BE (new_idx == -1 || tree == NULL, 0))
2315         {
2316           *err = REG_ESPACE;
2317           return NULL;
2318         }
2319     }
2320   *token = fetch_token (regexp, syntax);
2321   return tree;
2322
2323  parse_dup_op_espace:
2324   free_bin_tree (tree);
2325   *err = REG_ESPACE;
2326   return NULL;
2327
2328  parse_dup_op_ebrace:
2329   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2330     {
2331       *err = REG_EBRACE;
2332       return NULL;
2333     }
2334   goto parse_dup_op_rollback;
2335  parse_dup_op_invalid_interval:
2336   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2337     {
2338       *err = REG_BADBR;
2339       return NULL;
2340     }
2341  parse_dup_op_rollback:
2342   re_string_set_index (regexp, start_idx);
2343   *token = start_token;
2344   token->type = CHARACTER;
2345   return dup_elem;
2346 }
2347
2348 /* Size of the names for collating symbol/equivalence_class/character_class.
2349    I'm not sure, but maybe enough.  */
2350 #define BRACKET_NAME_BUF_SIZE 32
2351
2352 #ifndef _LIBC
2353   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2354      Build the range expression which starts from START_ELEM, and ends
2355      at END_ELEM.  The result are written to MBCSET and SBCSET.
2356      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2357      mbcset->range_ends, is a pointer argument sinse we may
2358      update it.  */
2359
2360 static reg_errcode_t
2361 # ifdef RE_ENABLE_I18N
2362 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2363      re_charset_t *mbcset;
2364      int *range_alloc;
2365 # else /* not RE_ENABLE_I18N */
2366 build_range_exp (sbcset, start_elem, end_elem)
2367 # endif /* not RE_ENABLE_I18N */
2368      re_bitset_ptr_t sbcset;
2369      bracket_elem_t *start_elem, *end_elem;
2370 {
2371   unsigned int start_ch, end_ch;
2372   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2373   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2374           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2375           0))
2376     return REG_ERANGE;
2377
2378   /* We can handle no multi character collating elements without libc
2379      support.  */
2380   if (BE ((start_elem->type == COLL_SYM
2381            && strlen ((char *) start_elem->opr.name) > 1)
2382           || (end_elem->type == COLL_SYM
2383               && strlen ((char *) end_elem->opr.name) > 1), 0))
2384     return REG_ECOLLATE;
2385
2386 # ifdef RE_ENABLE_I18N
2387   {
2388     wchar_t wc, start_wc, end_wc;
2389     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2390
2391     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2392                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2393                    : 0));
2394     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2395               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2396                  : 0));
2397     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2398                 ? __btowc (start_ch) : start_elem->opr.wch);
2399     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2400               ? __btowc (end_ch) : end_elem->opr.wch);
2401     cmp_buf[0] = start_wc;
2402     cmp_buf[4] = end_wc;
2403     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2404       return REG_ERANGE;
2405
2406     /* Check the space of the arrays.  */
2407     if (*range_alloc == mbcset->nranges)
2408       {
2409         /* There are not enough space, need realloc.  */
2410         wchar_t *new_array_start, *new_array_end;
2411         int new_nranges;
2412
2413         /* +1 in case of mbcset->nranges is 0.  */
2414         new_nranges = 2 * mbcset->nranges + 1;
2415         /* Use realloc since mbcset->range_starts and mbcset->range_ends
2416            are NULL if *range_alloc == 0.  */
2417         new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2418                                       new_nranges);
2419         new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2420                                     new_nranges);
2421
2422         if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2423           return REG_ESPACE;
2424
2425         mbcset->range_starts = new_array_start;
2426         mbcset->range_ends = new_array_end;
2427         *range_alloc = new_nranges;
2428       }
2429
2430     mbcset->range_starts[mbcset->nranges] = start_wc;
2431     mbcset->range_ends[mbcset->nranges++] = end_wc;
2432
2433     /* Build the table for single byte characters.  */
2434     for (wc = 0; wc <= SBC_MAX; ++wc)
2435       {
2436         cmp_buf[2] = wc;
2437         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2438             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2439           bitset_set (sbcset, wc);
2440       }
2441   }
2442 # else /* not RE_ENABLE_I18N */
2443   {
2444     unsigned int ch;
2445     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2446                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2447                    : 0));
2448     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2449               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2450                  : 0));
2451     if (start_ch > end_ch)
2452       return REG_ERANGE;
2453     /* Build the table for single byte characters.  */
2454     for (ch = 0; ch <= SBC_MAX; ++ch)
2455       if (start_ch <= ch  && ch <= end_ch)
2456         bitset_set (sbcset, ch);
2457   }
2458 # endif /* not RE_ENABLE_I18N */
2459   return REG_NOERROR;
2460 }
2461 #endif /* not _LIBC */
2462
2463 #ifndef _LIBC
2464 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2465    Build the collating element which is represented by NAME.
2466    The result are written to MBCSET and SBCSET.
2467    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2468    pointer argument since we may update it.  */
2469
2470 static reg_errcode_t
2471 # ifdef RE_ENABLE_I18N
2472 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2473      re_charset_t *mbcset;
2474      int *coll_sym_alloc;
2475 # else /* not RE_ENABLE_I18N */
2476 build_collating_symbol (sbcset, name)
2477 # endif /* not RE_ENABLE_I18N */
2478      re_bitset_ptr_t sbcset;
2479      const unsigned char *name;
2480 {
2481   size_t name_len = strlen ((const char *) name);
2482   if (BE (name_len != 1, 0))
2483     return REG_ECOLLATE;
2484   else
2485     {
2486       bitset_set (sbcset, name[0]);
2487       return REG_NOERROR;
2488     }
2489 }
2490 #endif /* not _LIBC */
2491
2492 /* This function parse bracket expression like "[abc]", "[a-c]",
2493    "[[.a-a.]]" etc.  */
2494
2495 static bin_tree_t *
2496 parse_bracket_exp (regexp, dfa, token, syntax, err)
2497      re_string_t *regexp;
2498      re_dfa_t *dfa;
2499      re_token_t *token;
2500      reg_syntax_t syntax;
2501      reg_errcode_t *err;
2502 {
2503 #ifdef _LIBC
2504   const unsigned char *collseqmb;
2505   const char *collseqwc;
2506   uint32_t nrules;
2507   int32_t table_size;
2508   const int32_t *symb_table;
2509   const unsigned char *extra;
2510
2511   /* Local function for parse_bracket_exp used in _LIBC environement.
2512      Seek the collating symbol entry correspondings to NAME.
2513      Return the index of the symbol in the SYMB_TABLE.  */
2514
2515   static inline int32_t
2516   seek_collating_symbol_entry (name, name_len)
2517          const unsigned char *name;
2518          size_t name_len;
2519     {
2520       int32_t hash = elem_hash ((const char *) name, name_len);
2521       int32_t elem = hash % table_size;
2522       int32_t second = hash % (table_size - 2);
2523       while (symb_table[2 * elem] != 0)
2524         {
2525           /* First compare the hashing value.  */
2526           if (symb_table[2 * elem] == hash
2527               /* Compare the length of the name.  */
2528               && name_len == extra[symb_table[2 * elem + 1]]
2529               /* Compare the name.  */
2530               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2531                          name_len) == 0)
2532             {
2533               /* Yep, this is the entry.  */
2534               break;
2535             }
2536
2537           /* Next entry.  */
2538           elem += second;
2539         }
2540       return elem;
2541     }
2542
2543   /* Local function for parse_bracket_exp used in _LIBC environement.
2544      Look up the collation sequence value of BR_ELEM.
2545      Return the value if succeeded, UINT_MAX otherwise.  */
2546
2547   static inline unsigned int
2548   lookup_collation_sequence_value (br_elem)
2549          bracket_elem_t *br_elem;
2550     {
2551       if (br_elem->type == SB_CHAR)
2552         {
2553           /*
2554           if (MB_CUR_MAX == 1)
2555           */
2556           if (nrules == 0)
2557             return collseqmb[br_elem->opr.ch];
2558           else
2559             {
2560               wint_t wc = __btowc (br_elem->opr.ch);
2561               return collseq_table_lookup (collseqwc, wc);
2562             }
2563         }
2564       else if (br_elem->type == MB_CHAR)
2565         {
2566           return collseq_table_lookup (collseqwc, br_elem->opr.wch);
2567         }
2568       else if (br_elem->type == COLL_SYM)
2569         {
2570           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2571           if (nrules != 0)
2572             {
2573               int32_t elem, idx;
2574               elem = seek_collating_symbol_entry (br_elem->opr.name,
2575                                                   sym_name_len);
2576               if (symb_table[2 * elem] != 0)
2577                 {
2578                   /* We found the entry.  */
2579                   idx = symb_table[2 * elem + 1];
2580                   /* Skip the name of collating element name.  */
2581                   idx += 1 + extra[idx];
2582                   /* Skip the byte sequence of the collating element.  */
2583                   idx += 1 + extra[idx];
2584                   /* Adjust for the alignment.  */
2585                   idx = (idx + 3) & ~3;
2586                   /* Skip the multibyte collation sequence value.  */
2587                   idx += sizeof (unsigned int);
2588                   /* Skip the wide char sequence of the collating element.  */
2589                   idx += sizeof (unsigned int) *
2590                     (1 + *(unsigned int *) (extra + idx));
2591                   /* Return the collation sequence value.  */
2592                   return *(unsigned int *) (extra + idx);
2593                 }
2594               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2595                 {
2596                   /* No valid character.  Match it as a single byte
2597                      character.  */
2598                   return collseqmb[br_elem->opr.name[0]];
2599                 }
2600             }
2601           else if (sym_name_len == 1)
2602             return collseqmb[br_elem->opr.name[0]];
2603         }
2604       return UINT_MAX;
2605     }
2606
2607   /* Local function for parse_bracket_exp used in _LIBC environement.
2608      Build the range expression which starts from START_ELEM, and ends
2609      at END_ELEM.  The result are written to MBCSET and SBCSET.
2610      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2611      mbcset->range_ends, is a pointer argument sinse we may
2612      update it.  */
2613
2614   static inline reg_errcode_t
2615 # ifdef RE_ENABLE_I18N
2616   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2617          re_charset_t *mbcset;
2618          int *range_alloc;
2619 # else /* not RE_ENABLE_I18N */
2620   build_range_exp (sbcset, start_elem, end_elem)
2621 # endif /* not RE_ENABLE_I18N */
2622          re_bitset_ptr_t sbcset;
2623          bracket_elem_t *start_elem, *end_elem;
2624     {
2625       unsigned int ch;
2626       uint32_t start_collseq;
2627       uint32_t end_collseq;
2628
2629 # ifdef RE_ENABLE_I18N
2630       /* Check the space of the arrays.  */
2631       if (*range_alloc == mbcset->nranges)
2632         {
2633           /* There are not enough space, need realloc.  */
2634           uint32_t *new_array_start;
2635           uint32_t *new_array_end;
2636           int new_nranges;
2637
2638           /* +1 in case of mbcset->nranges is 0.  */
2639           new_nranges = 2 * mbcset->nranges + 1;
2640           /* Use realloc since mbcset->range_starts and mbcset->range_ends
2641              are NULL if *range_alloc == 0.  */
2642           new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2643                                         new_nranges);
2644           new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2645                                       new_nranges);
2646
2647           if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2648             return REG_ESPACE;
2649
2650           mbcset->range_starts = new_array_start;
2651           mbcset->range_ends = new_array_end;
2652           *range_alloc = new_nranges;
2653         }
2654 # endif /* RE_ENABLE_I18N */
2655
2656       /* Equivalence Classes and Character Classes can't be a range
2657          start/end.  */
2658       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2659               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2660               0))
2661         return REG_ERANGE;
2662
2663       start_collseq = lookup_collation_sequence_value (start_elem);
2664       end_collseq = lookup_collation_sequence_value (end_elem);
2665       /* Check start/end collation sequence values.  */
2666       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2667         return REG_ECOLLATE;
2668       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2669         return REG_ERANGE;
2670
2671 # ifdef RE_ENABLE_I18N
2672       /* Got valid collation sequence values, add them as a new entry.  */
2673       mbcset->range_starts[mbcset->nranges] = start_collseq;
2674       mbcset->range_ends[mbcset->nranges++] = end_collseq;
2675 # endif /* RE_ENABLE_I18N */
2676
2677       /* Build the table for single byte characters.  */
2678       for (ch = 0; ch <= SBC_MAX; ch++)
2679         {
2680           uint32_t ch_collseq;
2681           /*
2682           if (MB_CUR_MAX == 1)
2683           */
2684           if (nrules == 0)
2685             ch_collseq = collseqmb[ch];
2686           else
2687             ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
2688           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2689             bitset_set (sbcset, ch);
2690         }
2691       return REG_NOERROR;
2692     }
2693
2694   /* Local function for parse_bracket_exp used in _LIBC environement.
2695      Build the collating element which is represented by NAME.
2696      The result are written to MBCSET and SBCSET.
2697      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2698      pointer argument sinse we may update it.  */
2699
2700   static inline reg_errcode_t
2701 # ifdef RE_ENABLE_I18N
2702   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2703          re_charset_t *mbcset;
2704          int *coll_sym_alloc;
2705 # else /* not RE_ENABLE_I18N */
2706   build_collating_symbol (sbcset, name)
2707 # endif /* not RE_ENABLE_I18N */
2708          re_bitset_ptr_t sbcset;
2709          const unsigned char *name;
2710     {
2711       int32_t elem, idx;
2712       size_t name_len = strlen ((const char *) name);
2713       if (nrules != 0)
2714         {
2715           elem = seek_collating_symbol_entry (name, name_len);
2716           if (symb_table[2 * elem] != 0)
2717             {
2718               /* We found the entry.  */
2719               idx = symb_table[2 * elem + 1];
2720               /* Skip the name of collating element name.  */
2721               idx += 1 + extra[idx];
2722             }
2723           else if (symb_table[2 * elem] == 0 && name_len == 1)
2724             {
2725               /* No valid character, treat it as a normal
2726                  character.  */
2727               bitset_set (sbcset, name[0]);
2728               return REG_NOERROR;
2729             }
2730           else
2731             return REG_ECOLLATE;
2732
2733 # ifdef RE_ENABLE_I18N
2734           /* Got valid collation sequence, add it as a new entry.  */
2735           /* Check the space of the arrays.  */
2736           if (*coll_sym_alloc == mbcset->ncoll_syms)
2737             {
2738               /* Not enough, realloc it.  */
2739               /* +1 in case of mbcset->ncoll_syms is 0.  */
2740               *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2741               /* Use realloc since mbcset->coll_syms is NULL
2742                  if *alloc == 0.  */
2743               mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2744                                               *coll_sym_alloc);
2745               if (BE (mbcset->coll_syms == NULL, 0))
2746                 return REG_ESPACE;
2747             }
2748           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2749 # endif /* RE_ENABLE_I18N */
2750           return REG_NOERROR;
2751         }
2752       else
2753         {
2754           if (BE (name_len != 1, 0))
2755             return REG_ECOLLATE;
2756           else
2757             {
2758               bitset_set (sbcset, name[0]);
2759               return REG_NOERROR;
2760             }
2761         }
2762     }
2763 #endif
2764
2765   re_token_t br_token;
2766   re_bitset_ptr_t sbcset;
2767 #ifdef RE_ENABLE_I18N
2768   re_charset_t *mbcset;
2769   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2770   int equiv_class_alloc = 0, char_class_alloc = 0;
2771 #else /* not RE_ENABLE_I18N */
2772   int non_match = 0;
2773 #endif /* not RE_ENABLE_I18N */
2774   bin_tree_t *work_tree;
2775   int token_len, new_idx;
2776 #ifdef _LIBC
2777   collseqmb = (const unsigned char *)
2778     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2779   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2780   if (nrules)
2781     {
2782       /*
2783       if (MB_CUR_MAX > 1)
2784       */
2785         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2786       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2787       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2788                                                   _NL_COLLATE_SYMB_TABLEMB);
2789       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2790                                                    _NL_COLLATE_SYMB_EXTRAMB);
2791     }
2792 #endif
2793   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2794 #ifdef RE_ENABLE_I18N
2795   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2796 #endif /* RE_ENABLE_I18N */
2797 #ifdef RE_ENABLE_I18N
2798   if (BE (sbcset == NULL || mbcset == NULL, 0))
2799 #else
2800   if (BE (sbcset == NULL, 0))
2801 #endif /* RE_ENABLE_I18N */
2802     {
2803       *err = REG_ESPACE;
2804       return NULL;
2805     }
2806
2807   token_len = peek_token_bracket (token, regexp, syntax);
2808   if (BE (token->type == END_OF_RE, 0))
2809     {
2810       *err = REG_BADPAT;
2811       goto parse_bracket_exp_free_return;
2812     }
2813   if (token->type == OP_NON_MATCH_LIST)
2814     {
2815 #ifdef RE_ENABLE_I18N
2816       int i;
2817       mbcset->non_match = 1;
2818 #else /* not RE_ENABLE_I18N */
2819       non_match = 1;
2820 #endif /* not RE_ENABLE_I18N */
2821       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2822         bitset_set (sbcset, '\0');
2823       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2824       token_len = peek_token_bracket (token, regexp, syntax);
2825       if (BE (token->type == END_OF_RE, 0))
2826         {
2827           *err = REG_BADPAT;
2828           goto parse_bracket_exp_free_return;
2829         }
2830 #ifdef RE_ENABLE_I18N
2831       if (MB_CUR_MAX > 1)
2832         for (i = 0; i < SBC_MAX; ++i)
2833           if (__btowc (i) == WEOF)
2834             bitset_set (sbcset, i);
2835 #endif /* RE_ENABLE_I18N */
2836     }
2837
2838   /* We treat the first ']' as a normal character.  */
2839   if (token->type == OP_CLOSE_BRACKET)
2840     token->type = CHARACTER;
2841
2842   while (1)
2843     {
2844       bracket_elem_t start_elem, end_elem;
2845       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2846       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2847       reg_errcode_t ret;
2848       int token_len2 = 0, is_range_exp = 0;
2849       re_token_t token2;
2850
2851       start_elem.opr.name = start_name_buf;
2852       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2853                                    syntax);
2854       if (BE (ret != REG_NOERROR, 0))
2855         {
2856           *err = ret;
2857           goto parse_bracket_exp_free_return;
2858         }
2859
2860       token_len = peek_token_bracket (token, regexp, syntax);
2861       if (BE (token->type == END_OF_RE, 0))
2862         {
2863           *err = REG_BADPAT;
2864           goto parse_bracket_exp_free_return;
2865         }
2866       if (token->type == OP_CHARSET_RANGE)
2867         {
2868           re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
2869           token_len2 = peek_token_bracket (&token2, regexp, syntax);
2870           if (BE (token->type == END_OF_RE, 0))
2871             {
2872               *err = REG_BADPAT;
2873               goto parse_bracket_exp_free_return;
2874             }
2875           if (token2.type == OP_CLOSE_BRACKET)
2876             {
2877               /* We treat the last '-' as a normal character.  */
2878               re_string_skip_bytes (regexp, -token_len);
2879               token->type = CHARACTER;
2880             }
2881           else
2882             is_range_exp = 1;
2883         }
2884
2885       if (is_range_exp == 1)
2886         {
2887           end_elem.opr.name = end_name_buf;
2888           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2889                                        dfa, syntax);
2890           if (BE (ret != REG_NOERROR, 0))
2891             {
2892               *err = ret;
2893               goto parse_bracket_exp_free_return;
2894             }
2895
2896           token_len = peek_token_bracket (token, regexp, syntax);
2897           if (BE (token->type == END_OF_RE, 0))
2898             {
2899               *err = REG_BADPAT;
2900               goto parse_bracket_exp_free_return;
2901             }
2902           *err = build_range_exp (sbcset,
2903 #ifdef RE_ENABLE_I18N
2904                                   mbcset, &range_alloc,
2905 #endif /* RE_ENABLE_I18N */
2906                                   &start_elem, &end_elem);
2907           if (BE (*err != REG_NOERROR, 0))
2908             goto parse_bracket_exp_free_return;
2909         }
2910       else
2911         {
2912           switch (start_elem.type)
2913             {
2914             case SB_CHAR:
2915               bitset_set (sbcset, start_elem.opr.ch);
2916               break;
2917 #ifdef RE_ENABLE_I18N
2918             case MB_CHAR:
2919               /* Check whether the array has enough space.  */
2920               if (mbchar_alloc == mbcset->nmbchars)
2921                 {
2922                   /* Not enough, realloc it.  */
2923                   /* +1 in case of mbcset->nmbchars is 0.  */
2924                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
2925                   /* Use realloc since array is NULL if *alloc == 0.  */
2926                   mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2927                                                 mbchar_alloc);
2928                   if (BE (mbcset->mbchars == NULL, 0))
2929                     goto parse_bracket_exp_espace;
2930                 }
2931               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2932               break;
2933 #endif /* RE_ENABLE_I18N */
2934             case EQUIV_CLASS:
2935               *err = build_equiv_class (sbcset,
2936 #ifdef RE_ENABLE_I18N
2937                                         mbcset, &equiv_class_alloc,
2938 #endif /* RE_ENABLE_I18N */
2939                                         start_elem.opr.name);
2940               if (BE (*err != REG_NOERROR, 0))
2941                 goto parse_bracket_exp_free_return;
2942               break;
2943             case COLL_SYM:
2944               *err = build_collating_symbol (sbcset,
2945 #ifdef RE_ENABLE_I18N
2946                                              mbcset, &coll_sym_alloc,
2947 #endif /* RE_ENABLE_I18N */
2948                                              start_elem.opr.name);
2949               if (BE (*err != REG_NOERROR, 0))
2950                 goto parse_bracket_exp_free_return;
2951               break;
2952             case CHAR_CLASS:
2953               ret = build_charclass (sbcset,
2954 #ifdef RE_ENABLE_I18N
2955                                      mbcset, &char_class_alloc,
2956 #endif /* RE_ENABLE_I18N */
2957                                      start_elem.opr.name, syntax);
2958               if (BE (ret != REG_NOERROR, 0))
2959                goto parse_bracket_exp_espace;
2960               break;
2961             default:
2962               assert (0);
2963               break;
2964             }
2965         }
2966       if (token->type == OP_CLOSE_BRACKET)
2967         break;
2968     }
2969
2970   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2971
2972   /* If it is non-matching list.  */
2973 #ifdef RE_ENABLE_I18N
2974   if (mbcset->non_match)
2975 #else /* not RE_ENABLE_I18N */
2976   if (non_match)
2977 #endif /* not RE_ENABLE_I18N */
2978     bitset_not (sbcset);
2979
2980   /* Build a tree for simple bracket.  */
2981   br_token.type = SIMPLE_BRACKET;
2982   br_token.opr.sbcset = sbcset;
2983   new_idx = re_dfa_add_node (dfa, br_token, 0);
2984   work_tree = create_tree (NULL, NULL, 0, new_idx);
2985   if (BE (new_idx == -1 || work_tree == NULL, 0))
2986     goto parse_bracket_exp_espace;
2987
2988 #ifdef RE_ENABLE_I18N
2989   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
2990       || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
2991                                                 || mbcset->non_match)))
2992     {
2993       re_token_t alt_token;
2994       bin_tree_t *mbc_tree;
2995       /* Build a tree for complex bracket.  */
2996       br_token.type = COMPLEX_BRACKET;
2997       br_token.opr.mbcset = mbcset;
2998       dfa->has_mb_node = 1;
2999       new_idx = re_dfa_add_node (dfa, br_token, 0);
3000       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3001       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3002         goto parse_bracket_exp_espace;
3003       /* Then join them by ALT node.  */
3004       dfa->has_plural_match = 1;
3005       alt_token.type = OP_ALT;
3006       new_idx = re_dfa_add_node (dfa, alt_token, 0);
3007       work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
3008       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3009         return work_tree;
3010     }
3011   else
3012     {
3013       free_charset (mbcset);
3014       return work_tree;
3015     }
3016 #else /* not RE_ENABLE_I18N */
3017   return work_tree;
3018 #endif /* not RE_ENABLE_I18N */
3019
3020  parse_bracket_exp_espace:
3021   *err = REG_ESPACE;
3022  parse_bracket_exp_free_return:
3023   re_free (sbcset);
3024 #ifdef RE_ENABLE_I18N
3025   free_charset (mbcset);
3026 #endif /* RE_ENABLE_I18N */
3027   return NULL;
3028 }
3029
3030 /* Parse an element in the bracket expression.  */
3031
3032 static reg_errcode_t
3033 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
3034      bracket_elem_t *elem;
3035      re_string_t *regexp;
3036      re_token_t *token;
3037      int token_len;
3038      re_dfa_t *dfa;
3039      reg_syntax_t syntax;
3040 {
3041 #ifdef RE_ENABLE_I18N
3042   int cur_char_size;
3043   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3044   if (cur_char_size > 1)
3045     {
3046       elem->type = MB_CHAR;
3047       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3048       re_string_skip_bytes (regexp, cur_char_size);
3049       return REG_NOERROR;
3050     }
3051 #endif /* RE_ENABLE_I18N */
3052   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3053   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3054       || token->type == OP_OPEN_EQUIV_CLASS)
3055     return parse_bracket_symbol (elem, regexp, token);
3056   elem->type = SB_CHAR;
3057   elem->opr.ch = token->opr.c;
3058   return REG_NOERROR;
3059 }
3060
3061 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3062    such as [:<character_class>:], [.<collating_element>.], and
3063    [=<equivalent_class>=].  */
3064
3065 static reg_errcode_t
3066 parse_bracket_symbol (elem, regexp, token)
3067      bracket_elem_t *elem;
3068      re_string_t *regexp;
3069      re_token_t *token;
3070 {
3071   unsigned char ch, delim = token->opr.c;
3072   int i = 0;
3073   for (;; ++i)
3074     {
3075       if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3076         return REG_EBRACK;
3077       if (token->type == OP_OPEN_CHAR_CLASS)
3078         ch = re_string_fetch_byte_case (regexp);
3079       else
3080         ch = re_string_fetch_byte (regexp);
3081       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3082         break;
3083       elem->opr.name[i] = ch;
3084     }
3085   re_string_skip_bytes (regexp, 1);
3086   elem->opr.name[i] = '\0';
3087   switch (token->type)
3088     {
3089     case OP_OPEN_COLL_ELEM:
3090       elem->type = COLL_SYM;
3091       break;
3092     case OP_OPEN_EQUIV_CLASS:
3093       elem->type = EQUIV_CLASS;
3094       break;
3095     case OP_OPEN_CHAR_CLASS:
3096       elem->type = CHAR_CLASS;
3097       break;
3098     default:
3099       break;
3100     }
3101   return REG_NOERROR;
3102 }
3103
3104   /* Helper function for parse_bracket_exp.
3105      Build the equivalence class which is represented by NAME.
3106      The result are written to MBCSET and SBCSET.
3107      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3108      is a pointer argument sinse we may update it.  */
3109
3110 static reg_errcode_t
3111 #ifdef RE_ENABLE_I18N
3112 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3113      re_charset_t *mbcset;
3114      int *equiv_class_alloc;
3115 #else /* not RE_ENABLE_I18N */
3116 build_equiv_class (sbcset, name)
3117 #endif /* not RE_ENABLE_I18N */
3118      re_bitset_ptr_t sbcset;
3119      const unsigned char *name;
3120 {
3121 #if defined _LIBC && defined RE_ENABLE_I18N
3122   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3123   if (nrules != 0)
3124     {
3125       const int32_t *table, *indirect;
3126       const unsigned char *weights, *extra, *cp;
3127       unsigned char char_buf[2];
3128       int32_t idx1, idx2;
3129       unsigned int ch;
3130       size_t len;
3131       /* This #include defines a local function!  */
3132 # include <locale/weight.h>
3133       /* Calculate the index for equivalence class.  */
3134       cp = name;
3135       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3136       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3137                                                _NL_COLLATE_WEIGHTMB);
3138       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3139                                                    _NL_COLLATE_EXTRAMB);
3140       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3141                                                 _NL_COLLATE_INDIRECTMB);
3142       idx1 = findidx (&cp);
3143       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3144         /* This isn't a valid character.  */
3145         return REG_ECOLLATE;
3146
3147       /* Build single byte matcing table for this equivalence class.  */
3148       char_buf[1] = (unsigned char) '\0';
3149       len = weights[idx1];
3150       for (ch = 0; ch < SBC_MAX; ++ch)
3151         {
3152           char_buf[0] = ch;
3153           cp = char_buf;
3154           idx2 = findidx (&cp);
3155 /*
3156           idx2 = table[ch];
3157 */
3158           if (idx2 == 0)
3159             /* This isn't a valid character.  */
3160             continue;
3161           if (len == weights[idx2])
3162             {
3163               int cnt = 0;
3164               while (cnt <= len &&
3165                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3166                 ++cnt;
3167
3168               if (cnt > len)
3169                 bitset_set (sbcset, ch);
3170             }
3171         }
3172       /* Check whether the array has enough space.  */
3173       if (*equiv_class_alloc == mbcset->nequiv_classes)
3174         {
3175           /* Not enough, realloc it.  */
3176           /* +1 in case of mbcset->nequiv_classes is 0.  */
3177           *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3178           /* Use realloc since the array is NULL if *alloc == 0.  */
3179           mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3180                                               *equiv_class_alloc);
3181           if (BE (mbcset->equiv_classes == NULL, 0))
3182             return REG_ESPACE;
3183         }
3184       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3185     }
3186   else
3187 #endif /* _LIBC && RE_ENABLE_I18N */
3188     {
3189       if (BE (strlen ((const char *) name) != 1, 0))
3190         return REG_ECOLLATE;
3191       bitset_set (sbcset, *name);
3192     }
3193   return REG_NOERROR;
3194 }
3195
3196   /* Helper function for parse_bracket_exp.
3197      Build the character class which is represented by NAME.
3198      The result are written to MBCSET and SBCSET.
3199      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3200      is a pointer argument sinse we may update it.  */
3201
3202 static reg_errcode_t
3203 #ifdef RE_ENABLE_I18N
3204 build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
3205      re_charset_t *mbcset;
3206      int *char_class_alloc;
3207 #else /* not RE_ENABLE_I18N */
3208 build_charclass (sbcset, class_name, syntax)
3209 #endif /* not RE_ENABLE_I18N */
3210      re_bitset_ptr_t sbcset;
3211      const unsigned char *class_name;
3212      reg_syntax_t syntax;
3213 {
3214   int i;
3215   const char *name = (const char *) class_name;
3216
3217   /* In case of REG_ICASE "upper" and "lower" match the both of
3218      upper and lower cases.  */
3219   if ((syntax & RE_ICASE)
3220       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3221     name = "alpha";
3222
3223 #ifdef RE_ENABLE_I18N
3224   /* Check the space of the arrays.  */
3225   if (*char_class_alloc == mbcset->nchar_classes)
3226     {
3227       /* Not enough, realloc it.  */
3228       /* +1 in case of mbcset->nchar_classes is 0.  */
3229       *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3230       /* Use realloc since array is NULL if *alloc == 0.  */
3231       mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3232                                          *char_class_alloc);
3233       if (BE (mbcset->char_classes == NULL, 0))
3234         return REG_ESPACE;
3235     }
3236   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3237 #endif /* RE_ENABLE_I18N */
3238
3239 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3240     for (i = 0; i < SBC_MAX; ++i)       \
3241       {                                 \
3242         if (ctype_func (i))             \
3243           bitset_set (sbcset, i);       \
3244       }
3245
3246   if (strcmp (name, "alnum") == 0)
3247     BUILD_CHARCLASS_LOOP (isalnum)
3248   else if (strcmp (name, "cntrl") == 0)
3249     BUILD_CHARCLASS_LOOP (iscntrl)
3250   else if (strcmp (name, "lower") == 0)
3251     BUILD_CHARCLASS_LOOP (islower)
3252   else if (strcmp (name, "space") == 0)
3253     BUILD_CHARCLASS_LOOP (isspace)
3254   else if (strcmp (name, "alpha") == 0)
3255     BUILD_CHARCLASS_LOOP (isalpha)
3256   else if (strcmp (name, "digit") == 0)
3257     BUILD_CHARCLASS_LOOP (isdigit)
3258   else if (strcmp (name, "print") == 0)
3259     BUILD_CHARCLASS_LOOP (isprint)
3260   else if (strcmp (name, "upper") == 0)
3261     BUILD_CHARCLASS_LOOP (isupper)
3262   else if (strcmp (name, "blank") == 0)
3263     BUILD_CHARCLASS_LOOP (isblank)
3264   else if (strcmp (name, "graph") == 0)
3265     BUILD_CHARCLASS_LOOP (isgraph)
3266   else if (strcmp (name, "punct") == 0)
3267     BUILD_CHARCLASS_LOOP (ispunct)
3268   else if (strcmp (name, "xdigit") == 0)
3269     BUILD_CHARCLASS_LOOP (isxdigit)
3270   else
3271     return REG_ECTYPE;
3272
3273   return REG_NOERROR;
3274 }
3275
3276 static bin_tree_t *
3277 build_word_op (dfa, not, err)
3278      re_dfa_t *dfa;
3279      int not;
3280      reg_errcode_t *err;
3281 {
3282   re_bitset_ptr_t sbcset;
3283 #ifdef RE_ENABLE_I18N
3284   re_charset_t *mbcset;
3285   int alloc = 0;
3286 #else /* not RE_ENABLE_I18N */
3287   int non_match = 0;
3288 #endif /* not RE_ENABLE_I18N */
3289   reg_errcode_t ret;
3290   re_token_t br_token;
3291   bin_tree_t *tree;
3292   int new_idx;
3293
3294   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3295 #ifdef RE_ENABLE_I18N
3296   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3297 #endif /* RE_ENABLE_I18N */
3298
3299 #ifdef RE_ENABLE_I18N
3300   if (BE (sbcset == NULL || mbcset == NULL, 0))
3301 #else /* not RE_ENABLE_I18N */
3302   if (BE (sbcset == NULL, 0))
3303 #endif /* not RE_ENABLE_I18N */
3304     {
3305       *err = REG_ESPACE;
3306       return NULL;
3307     }
3308
3309   if (not)
3310     {
3311 #ifdef RE_ENABLE_I18N
3312       int i;
3313       /*
3314       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3315         bitset_set(cset->sbcset, '\0');
3316       */
3317       mbcset->non_match = 1;
3318       if (MB_CUR_MAX > 1)
3319         for (i = 0; i < SBC_MAX; ++i)
3320           if (__btowc (i) == WEOF)
3321             bitset_set (sbcset, i);
3322 #else /* not RE_ENABLE_I18N */
3323       non_match = 1;
3324 #endif /* not RE_ENABLE_I18N */
3325     }
3326
3327   /* We don't care the syntax in this case.  */
3328   ret = build_charclass (sbcset,
3329 #ifdef RE_ENABLE_I18N
3330                          mbcset, &alloc,
3331 #endif /* RE_ENABLE_I18N */
3332                          (const unsigned char *) "alpha", 0);
3333
3334   if (BE (ret != REG_NOERROR, 0))
3335     {
3336       re_free (sbcset);
3337 #ifdef RE_ENABLE_I18N
3338       free_charset (mbcset);
3339 #endif /* RE_ENABLE_I18N */
3340       *err = REG_ESPACE;
3341       return NULL;
3342     }
3343   /* \w match '_' also.  */
3344   bitset_set (sbcset, '_');
3345
3346   /* If it is non-matching list.  */
3347 #ifdef RE_ENABLE_I18N
3348   if (mbcset->non_match)
3349 #else /* not RE_ENABLE_I18N */
3350   if (non_match)
3351 #endif /* not RE_ENABLE_I18N */
3352     bitset_not (sbcset);
3353
3354   /* Build a tree for simple bracket.  */
3355   br_token.type = SIMPLE_BRACKET;
3356   br_token.opr.sbcset = sbcset;
3357   new_idx = re_dfa_add_node (dfa, br_token, 0);
3358   tree = create_tree (NULL, NULL, 0, new_idx);
3359   if (BE (new_idx == -1 || tree == NULL, 0))
3360     goto build_word_op_espace;
3361
3362 #ifdef RE_ENABLE_I18N
3363   if (MB_CUR_MAX > 1)
3364     {
3365       re_token_t alt_token;
3366       bin_tree_t *mbc_tree;
3367       /* Build a tree for complex bracket.  */
3368       br_token.type = COMPLEX_BRACKET;
3369       br_token.opr.mbcset = mbcset;
3370       dfa->has_mb_node = 1;
3371       new_idx = re_dfa_add_node (dfa, br_token, 0);
3372       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3373       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3374         goto build_word_op_espace;
3375       /* Then join them by ALT node.  */
3376       alt_token.type = OP_ALT;
3377       new_idx = re_dfa_add_node (dfa, alt_token, 0);
3378       tree = create_tree (tree, mbc_tree, 0, new_idx);
3379       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3380         return tree;
3381     }
3382   else
3383     {
3384       free_charset (mbcset);
3385       return tree;
3386     }
3387 #else /* not RE_ENABLE_I18N */
3388   return tree;
3389 #endif /* not RE_ENABLE_I18N */
3390
3391  build_word_op_espace:
3392   re_free (sbcset);
3393 #ifdef RE_ENABLE_I18N
3394   free_charset (mbcset);
3395 #endif /* RE_ENABLE_I18N */
3396   *err = REG_ESPACE;
3397   return NULL;
3398 }
3399
3400 /* This is intended for the expressions like "a{1,3}".
3401    Fetch a number from `input', and return the number.
3402    Return -1, if the number field is empty like "{,1}".
3403    Return -2, If an error is occured.  */
3404
3405 static int
3406 fetch_number (input, token, syntax)
3407      re_string_t *input;
3408      re_token_t *token;
3409      reg_syntax_t syntax;
3410 {
3411   int num = -1;
3412   unsigned char c;
3413   while (1)
3414     {
3415       *token = fetch_token (input, syntax);
3416       c = token->opr.c;
3417       if (BE (token->type == END_OF_RE, 0))
3418         return -2;
3419       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3420         break;
3421       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3422              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3423       num = (num > RE_DUP_MAX) ? -2 : num;
3424     }
3425   return num;
3426 }
3427 \f
3428 #ifdef RE_ENABLE_I18N
3429 static void
3430 free_charset (re_charset_t *cset)
3431 {
3432   re_free (cset->mbchars);
3433 # ifdef _LIBC
3434   re_free (cset->coll_syms);
3435   re_free (cset->equiv_classes);
3436   re_free (cset->range_starts);
3437   re_free (cset->range_ends);
3438 # endif
3439   re_free (cset->char_classes);
3440   re_free (cset);
3441 }
3442 #endif /* RE_ENABLE_I18N */
3443 \f
3444 /* Functions for binary tree operation.  */
3445
3446 /* Create a node of tree.
3447    Note: This function automatically free left and right if malloc fails.  */
3448
3449 static bin_tree_t *
3450 create_tree (left, right, type, index)
3451      bin_tree_t *left;
3452      bin_tree_t *right;
3453      re_token_type_t type;
3454      int index;
3455 {
3456   bin_tree_t *tree;
3457   tree = re_malloc (bin_tree_t, 1);
3458   if (BE (tree == NULL, 0))
3459     {
3460       free_bin_tree (left);
3461       free_bin_tree (right);
3462       return NULL;
3463     }
3464   tree->parent = NULL;
3465   tree->left = left;
3466   tree->right = right;
3467   tree->type = type;
3468   tree->node_idx = index;
3469   tree->first = -1;
3470   tree->next = -1;
3471   re_node_set_init_empty (&tree->eclosure);
3472
3473   if (left != NULL)
3474     left->parent = tree;
3475   if (right != NULL)
3476     right->parent = tree;
3477   return tree;
3478 }
3479
3480 /* Free the sub tree pointed by TREE.  */
3481
3482 static void
3483 free_bin_tree (tree)
3484      bin_tree_t *tree;
3485 {
3486   if (tree == NULL)
3487     return;
3488   /*re_node_set_free (&tree->eclosure);*/
3489   free_bin_tree (tree->left);
3490   free_bin_tree (tree->right);
3491   re_free (tree);
3492 }
3493
3494 /* Duplicate the node SRC, and return new node.  */
3495
3496 static bin_tree_t *
3497 duplicate_tree (src, dfa)
3498      const bin_tree_t *src;
3499      re_dfa_t *dfa;
3500 {
3501   bin_tree_t *left = NULL, *right = NULL, *new_tree;
3502   int new_node_idx;
3503   /* Since node indies must be according to Post-order of the tree,
3504      we must duplicate the left at first.  */
3505   if (src->left != NULL)
3506     {
3507       left = duplicate_tree (src->left, dfa);
3508       if (left == NULL)
3509         return NULL;
3510     }
3511
3512   /* Secondaly, duplicate the right.  */
3513   if (src->right != NULL)
3514     {
3515       right = duplicate_tree (src->right, dfa);
3516       if (right == NULL)
3517         {
3518           free_bin_tree (left);
3519           return NULL;
3520         }
3521     }
3522
3523   /* At last, duplicate itself.  */
3524   if (src->type == NON_TYPE)
3525     {
3526       new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3527       dfa->nodes[new_node_idx].duplicated = 1;
3528       if (BE (new_node_idx == -1, 0))
3529         {
3530           free_bin_tree (left);
3531           free_bin_tree (right);
3532           return NULL;
3533         }
3534     }
3535   else
3536     new_node_idx = src->type;
3537
3538   new_tree = create_tree (left, right, src->type, new_node_idx);
3539   if (BE (new_tree == NULL, 0))
3540     {
3541       free_bin_tree (left);
3542       free_bin_tree (right);
3543     }
3544   return new_tree;
3545 }