posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 #include <assert.h>
  22 #include <ctype.h>
  23 #include <limits.h>
  24 #include <locale.h>
  25 #include <stdio.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28
  29 #if defined HAVE_WCHAR_H || defined _LIBC
  30 # include <wchar.h>
  31 #endif /* HAVE_WCHAR_H || _LIBC */
  32 #if defined HAVE_WCTYPE_H || defined _LIBC
  33 # include <wctype.h>
  34 #endif /* HAVE_WCTYPE_H || _LIBC */
  35
  36 /* In case that the system doesn't have isblank().  */
  37 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
  38 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
  39 #endif
  40
  41 #ifdef _LIBC
  42 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
  43 #  define _RE_DEFINE_LOCALE_FUNCTIONS 1
  44 #   include <locale/localeinfo.h>
  45 #   include <locale/elem-hash.h>
  46 #   include <locale/coll-lookup.h>
  47 # endif
  48 #endif
  49
  50 /* This is for other GNU distributions with internationalized messages.  */
  51 #if HAVE_LIBINTL_H || defined _LIBC
  52 # include <libintl.h>
  53 # ifdef _LIBC
  54 #  undef gettext
  55 #  define gettext(msgid) \
  56   INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
  57 # endif
  58 #else
  59 # define gettext(msgid) (msgid)
  60 #endif
  61
  62 #ifndef gettext_noop
  63 /* This define is so xgettext can find the internationalizable
  64    strings.  */
  65 # define gettext_noop(String) String
  66 #endif
  67
  68 #include <regex.h>
  69 #include "regex_internal.h"
  70
  71 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  72                                           int length, reg_syntax_t syntax);
  73 static void re_compile_fastmap_iter (regex_t *bufp,
  74                                      const re_dfastate_t *init_state,
  75                                      char *fastmap);
  76 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  77 static reg_errcode_t init_word_char (re_dfa_t *dfa);
  78 #ifdef RE_ENABLE_I18N
  79 static void free_charset (re_charset_t *cset);
  80 #endif /* RE_ENABLE_I18N */
  81 static void free_workarea_compile (regex_t *preg);
  82 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  83 static reg_errcode_t analyze (re_dfa_t *dfa);
  84 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
  85 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
  86 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
  87 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
  88 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
  89                                      unsigned int constraint);
  90 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  91 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  92                                          int node, int root);
  93 static void calc_inveclosure (re_dfa_t *dfa);
  94 static int fetch_number (re_string_t *input, re_token_t *token,
  95                          reg_syntax_t syntax);
  96 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
  97 static int peek_token (re_token_t *token, re_string_t *input,
  98                         reg_syntax_t syntax);
  99 static int peek_token_bracket (re_token_t *token, re_string_t *input,
 100                                reg_syntax_t syntax);
 101 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
 102                           reg_syntax_t syntax, reg_errcode_t *err);
 103 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
 104                                   re_token_t *token, reg_syntax_t syntax,
 105                                   int nest, reg_errcode_t *err);
 106 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
 107                                  re_token_t *token, reg_syntax_t syntax,
 108                                  int nest, reg_errcode_t *err);
 109 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
 110                                      re_token_t *token, reg_syntax_t syntax,
 111                                      int nest, reg_errcode_t *err);
 112 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
 113                                   re_token_t *token, reg_syntax_t syntax,
 114                                   int nest, reg_errcode_t *err);
 115 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
 116                                  re_dfa_t *dfa, re_token_t *token,
 117                                  reg_syntax_t syntax, reg_errcode_t *err);
 118 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
 119                                       re_token_t *token, reg_syntax_t syntax,
 120                                       reg_errcode_t *err);
 121 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
 122                                             re_string_t *regexp,
 123                                             re_token_t *token, int token_len,
 124                                             re_dfa_t *dfa,
 125                                             reg_syntax_t syntax);
 126 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
 127                                           re_string_t *regexp,
 128                                           re_token_t *token);
 129 #ifndef _LIBC
 130 # ifdef RE_ENABLE_I18N
 131 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 132                                       re_charset_t *mbcset, int *range_alloc,
 133                                       bracket_elem_t *start_elem,
 134                                       bracket_elem_t *end_elem);
 135 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 136                                              re_charset_t *mbcset,
 137                                              int *coll_sym_alloc,
 138                                              const unsigned char *name);
 139 # else /* not RE_ENABLE_I18N */
 140 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 141                                       bracket_elem_t *start_elem,
 142                                       bracket_elem_t *end_elem);
 143 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 144                                              const unsigned char *name);
 145 # endif /* not RE_ENABLE_I18N */
 146 #endif /* not _LIBC */
 147 #ifdef RE_ENABLE_I18N
 148 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 149                                         re_charset_t *mbcset,
 150                                         int *equiv_class_alloc,
 151                                         const unsigned char *name);
 152 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
 153                                       re_charset_t *mbcset,
 154                                       int *char_class_alloc,
 155                                       const unsigned char *class_name,
 156                                       reg_syntax_t syntax);
 157 #else  /* not RE_ENABLE_I18N */
 158 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 159                                         const unsigned char *name);
 160 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
 161                                       const unsigned char *class_name,
 162                                       reg_syntax_t syntax);
 163 #endif /* not RE_ENABLE_I18N */
 164 static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
 165 static void free_bin_tree (bin_tree_t *tree);
 166 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
 167                                 re_token_type_t type, int index);
 168 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 169 \f
 170 /* This table gives an error message for each of the error codes listed
 171    in regex.h.  Obviously the order here has to be same as there.
 172    POSIX doesn't require that we do anything for REG_NOERROR,
 173    but why not be nice?  */
 174
 175 const char __re_error_msgid[] attribute_hidden =
 176   {
 177 #define REG_NOERROR_IDX 0
 178     gettext_noop ("Success")    /* REG_NOERROR */
 179     "\0"
 180 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 181     gettext_noop ("No match")   /* REG_NOMATCH */
 182     "\0"
 183 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 184     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 185     "\0"
 186 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 187     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 188     "\0"
 189 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 190     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 191     "\0"
 192 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 193     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 194     "\0"
 195 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 196     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 197     "\0"
 198 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 199     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 200     "\0"
 201 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 202     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 203     "\0"
 204 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 205     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 206     "\0"
 207 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 208     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 209     "\0"
 210 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 211     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 212     "\0"
 213 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 214     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 215     "\0"
 216 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 217     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 218     "\0"
 219 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 220     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 221     "\0"
 222 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 223     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 224     "\0"
 225 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 226     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 227   };
 228
 229 const size_t __re_error_msgid_idx[] attribute_hidden =
 230   {
 231     REG_NOERROR_IDX,
 232     REG_NOMATCH_IDX,
 233     REG_BADPAT_IDX,
 234     REG_ECOLLATE_IDX,
 235     REG_ECTYPE_IDX,
 236     REG_EESCAPE_IDX,
 237     REG_ESUBREG_IDX,
 238     REG_EBRACK_IDX,
 239     REG_EPAREN_IDX,
 240     REG_EBRACE_IDX,
 241     REG_BADBR_IDX,
 242     REG_ERANGE_IDX,
 243     REG_ESPACE_IDX,
 244     REG_BADRPT_IDX,
 245     REG_EEND_IDX,
 246     REG_ESIZE_IDX,
 247     REG_ERPAREN_IDX
 248   };
 249 \f
 250 /* Entry points for GNU code.  */
 251
 252 /* re_compile_pattern is the GNU regular expression compiler: it
 253    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 254    Returns 0 if the pattern was valid, otherwise an error string.
 255
 256    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 257    are set in BUFP on entry.  */
 258
 259 const char *
 260 re_compile_pattern (pattern, length, bufp)
 261     const char *pattern;
 262     size_t length;
 263     struct re_pattern_buffer *bufp;
 264 {
 265   reg_errcode_t ret;
 266
 267   /* GNU code is written to assume at least RE_NREGS registers will be set
 268      (and at least one extra will be -1).  */
 269   bufp->regs_allocated = REGS_UNALLOCATED;
 270
 271   /* And GNU code determines whether or not to get register information
 272      by passing null for the REGS argument to re_match, etc., not by
 273      setting no_sub.  */
 274   bufp->no_sub = 0;
 275
 276   /* Match anchors at newline.  */
 277   bufp->newline_anchor = 1;
 278
 279   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 280
 281   if (!ret)
 282     return NULL;
 283   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 284 }
 285 #ifdef _LIBC
 286 weak_alias (__re_compile_pattern, re_compile_pattern)
 287 #endif
 288
 289 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 290    also be assigned to arbitrarily: each pattern buffer stores its own
 291    syntax, so it can be changed between regex compilations.  */
 292 /* This has no initializer because initialized variables in Emacs
 293    become read-only after dumping.  */
 294 reg_syntax_t re_syntax_options;
 295
 296
 297 /* Specify the precise syntax of regexps for compilation.  This provides
 298    for compatibility for various utilities which historically have
 299    different, incompatible syntaxes.
 300
 301    The argument SYNTAX is a bit mask comprised of the various bits
 302    defined in regex.h.  We return the old syntax.  */
 303
 304 reg_syntax_t
 305 re_set_syntax (syntax)
 306     reg_syntax_t syntax;
 307 {
 308   reg_syntax_t ret = re_syntax_options;
 309
 310   re_syntax_options = syntax;
 311   return ret;
 312 }
 313 #ifdef _LIBC
 314 weak_alias (__re_set_syntax, re_set_syntax)
 315 #endif
 316
 317 int
 318 re_compile_fastmap (bufp)
 319     struct re_pattern_buffer *bufp;
 320 {
 321   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 322   char *fastmap = bufp->fastmap;
 323
 324   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 325   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 326   if (dfa->init_state != dfa->init_state_word)
 327     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 328   if (dfa->init_state != dfa->init_state_nl)
 329     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 330   if (dfa->init_state != dfa->init_state_begbuf)
 331     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 332   bufp->fastmap_accurate = 1;
 333   return 0;
 334 }
 335 #ifdef _LIBC
 336 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 337 #endif
 338
 339 /* Helper function for re_compile_fastmap.
 340    Compile fastmap for the initial_state INIT_STATE.  */
 341
 342 static void
 343 re_compile_fastmap_iter (bufp, init_state, fastmap)
 344      regex_t *bufp;
 345      const re_dfastate_t *init_state;
 346      char *fastmap;
 347 {
 348   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 349   int node_cnt;
 350   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 351     {
 352       int node = init_state->nodes.elems[node_cnt];
 353       re_token_type_t type = dfa->nodes[node].type;
 354       if (type == OP_CONTEXT_NODE)
 355         {
 356           node = dfa->nodes[node].opr.ctx_info->entity;
 357           type = dfa->nodes[node].type;
 358         }
 359
 360       if (type == CHARACTER)
 361         fastmap[dfa->nodes[node].opr.c] = 1;
 362       else if (type == SIMPLE_BRACKET)
 363         {
 364           int i, j, ch;
 365           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 366             for (j = 0; j < UINT_BITS; ++j, ++ch)
 367               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 368                 fastmap[ch] = 1;
 369         }
 370 #ifdef RE_ENABLE_I18N
 371       else if (type == COMPLEX_BRACKET)
 372         {
 373           int i;
 374           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 375           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 376               || cset->nranges || cset->nchar_classes)
 377             {
 378 # ifdef _LIBC
 379               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 380                 {
 381                   /* In this case we want to catch the bytes which are
 382                      the first byte of any collation elements.
 383                      e.g. In da_DK, we want to catch 'a' since "aa"
 384                           is a valid collation element, and don't catch
 385                           'b' since 'b' is the only collation element
 386                           which starts from 'b'.  */
 387                   int j, ch;
 388                   const int32_t *table = (const int32_t *)
 389                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 390                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 391                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 392                       if (table[ch] < 0)
 393                         fastmap[ch] = 1;
 394                 }
 395 # else
 396               if (MB_CUR_MAX > 1)
 397                 for (i = 0; i < SBC_MAX; ++i)
 398                   if (__btowc (i) == WEOF)
 399                     fastmap[i] = 1;
 400 # endif /* not _LIBC */
 401             }
 402           for (i = 0; i < cset->nmbchars; ++i)
 403             {
 404               char buf[256];
 405               wctomb (buf, cset->mbchars[i]);
 406               fastmap[*(unsigned char *) buf] = 1;
 407             }
 408         }
 409 #endif /* RE_ENABLE_I18N */
 410       else if (type == END_OF_RE || type == OP_PERIOD
 411 #ifdef RE_ENABLE_I18N
 412                || type == COMPLEX_BRACKET
 413 #endif /* RE_ENABLE_I18N */
 414               )
 415         {
 416           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 417           if (type == END_OF_RE)
 418             bufp->can_be_null = 1;
 419           return;
 420         }
 421     }
 422 }
 423 \f
 424 /* Entry point for POSIX code.  */
 425 /* regcomp takes a regular expression as a string and compiles it.
 426
 427    PREG is a regex_t *.  We do not expect any fields to be initialized,
 428    since POSIX says we shouldn't.  Thus, we set
 429
 430      `buffer' to the compiled pattern;
 431      `used' to the length of the compiled pattern;
 432      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 433        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 434        RE_SYNTAX_POSIX_BASIC;
 435      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 436      `fastmap' to an allocated space for the fastmap;
 437      `fastmap_accurate' to zero;
 438      `re_nsub' to the number of subexpressions in PATTERN.
 439
 440    PATTERN is the address of the pattern string.
 441
 442    CFLAGS is a series of bits which affect compilation.
 443
 444      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 445      use POSIX basic syntax.
 446
 447      If REG_NEWLINE is set, then . and [^...] don't match newline.
 448      Also, regexec will try a match beginning after every newline.
 449
 450      If REG_ICASE is set, then we considers upper- and lowercase
 451      versions of letters to be equivalent when matching.
 452
 453      If REG_NOSUB is set, then when PREG is passed to regexec, that
 454      routine will report only success or failure, and nothing about the
 455      registers.
 456
 457    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 458    the return codes and their meanings.)  */
 459
 460 int
 461 regcomp (preg, pattern, cflags)
 462     regex_t *__restrict preg;
 463     const char *__restrict pattern;
 464     int cflags;
 465 {
 466   reg_errcode_t ret;
 467   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 468                          : RE_SYNTAX_POSIX_BASIC);
 469
 470   preg->buffer = NULL;
 471   preg->allocated = 0;
 472   preg->used = 0;
 473
 474   /* Try to allocate space for the fastmap.  */
 475   preg->fastmap = re_malloc (char, SBC_MAX);
 476   if (BE (preg->fastmap == NULL, 0))
 477     return REG_ESPACE;
 478
 479   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 480
 481   /* If REG_NEWLINE is set, newlines are treated differently.  */
 482   if (cflags & REG_NEWLINE)
 483     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 484       syntax &= ~RE_DOT_NEWLINE;
 485       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 486       /* It also changes the matching behavior.  */
 487       preg->newline_anchor = 1;
 488     }
 489   else
 490     preg->newline_anchor = 0;
 491   preg->no_sub = !!(cflags & REG_NOSUB);
 492   preg->translate = NULL;
 493
 494   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 495
 496   /* POSIX doesn't distinguish between an unmatched open-group and an
 497      unmatched close-group: both are REG_EPAREN.  */
 498   if (ret == REG_ERPAREN)
 499     ret = REG_EPAREN;
 500
 501   /* We have already checked preg->fastmap != NULL.  */
 502   if (BE (ret == REG_NOERROR, 1))
 503     {
 504       /* Compute the fastmap now, since regexec cannot modify the pattern
 505          buffer.  */
 506       if (BE (re_compile_fastmap (preg) == -2, 0))
 507         {
 508           /* Some error occurred while computing the fastmap, just forget
 509              about it.  */
 510           re_free (preg->fastmap);
 511           preg->fastmap = NULL;
 512         }
 513     }
 514
 515   return (int) ret;
 516 }
 517 #ifdef _LIBC
 518 weak_alias (__regcomp, regcomp)
 519 #endif
 520
 521 /* Returns a message corresponding to an error code, ERRCODE, returned
 522    from either regcomp or regexec.   We don't use PREG here.  */
 523
 524 size_t
 525 regerror (errcode, preg, errbuf, errbuf_size)
 526     int errcode;
 527     const regex_t *preg;
 528     char *errbuf;
 529     size_t errbuf_size;
 530 {
 531   const char *msg;
 532   size_t msg_size;
 533
 534   if (BE (errcode < 0
 535           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 536                                / sizeof (__re_error_msgid_idx[0])), 0))
 537     /* Only error codes returned by the rest of the code should be passed
 538        to this routine.  If we are given anything else, or if other regex
 539        code generates an invalid error code, then the program has a bug.
 540        Dump core so we can fix it.  */
 541     abort ();
 542
 543   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 544
 545   msg_size = strlen (msg) + 1; /* Includes the null.  */
 546
 547   if (BE (errbuf_size != 0, 1))
 548     {
 549       if (BE (msg_size > errbuf_size, 0))
 550         {
 551 #if defined HAVE_MEMPCPY || defined _LIBC
 552           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 553 #else
 554           memcpy (errbuf, msg, errbuf_size - 1);
 555           errbuf[errbuf_size - 1] = 0;
 556 #endif
 557         }
 558       else
 559         memcpy (errbuf, msg, msg_size);
 560     }
 561
 562   return msg_size;
 563 }
 564 #ifdef _LIBC
 565 weak_alias (__regerror, regerror)
 566 #endif
 567
 568 /* Free dynamically allocated space used by PREG.  */
 569
 570 void
 571 regfree (preg)
 572     regex_t *preg;
 573 {
 574   int i, j;
 575   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 576   if (BE (dfa != NULL, 1))
 577     {
 578       re_free (dfa->subexps);
 579
 580       for (i = 0; i < dfa->nodes_len; ++i)
 581         {
 582           re_token_t *node = dfa->nodes + i;
 583 #ifdef RE_ENABLE_I18N
 584           if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
 585             free_charset (node->opr.mbcset);
 586           else
 587 #endif /* RE_ENABLE_I18N */
 588           if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
 589             re_free (node->opr.sbcset);
 590           else if (node->type == OP_CONTEXT_NODE)
 591             {
 592               if (dfa->nodes[node->opr.ctx_info->entity].type == OP_BACK_REF)
 593                 {
 594                   if (node->opr.ctx_info->bkref_eclosure != NULL)
 595                     re_node_set_free (node->opr.ctx_info->bkref_eclosure);
 596                   re_free (node->opr.ctx_info->bkref_eclosure);
 597                 }
 598               re_free (node->opr.ctx_info);
 599             }
 600         }
 601       re_free (dfa->firsts);
 602       re_free (dfa->nexts);
 603       for (i = 0; i < dfa->nodes_len; ++i)
 604         {
 605           if (dfa->eclosures != NULL)
 606             re_node_set_free (dfa->eclosures + i);
 607           if (dfa->inveclosures != NULL)
 608             re_node_set_free (dfa->inveclosures + i);
 609           if (dfa->edests != NULL)
 610             re_node_set_free (dfa->edests + i);
 611         }
 612       re_free (dfa->edests);
 613       re_free (dfa->eclosures);
 614       re_free (dfa->inveclosures);
 615       re_free (dfa->nodes);
 616
 617       for (i = 0; i <= dfa->state_hash_mask; ++i)
 618         {
 619           struct re_state_table_entry *entry = dfa->state_table + i;
 620           for (j = 0; j < entry->num; ++j)
 621             {
 622               re_dfastate_t *state = entry->array[j];
 623               if (state->entrance_nodes != &state->nodes)
 624                 {
 625                   re_node_set_free (state->entrance_nodes);
 626                   re_free (state->entrance_nodes);
 627                 }
 628               re_node_set_free (&state->nodes);
 629               re_free (state->trtable);
 630               re_free (state->trtable_search);
 631               re_free (state);
 632             }
 633           re_free (entry->array);
 634         }
 635       re_free (dfa->state_table);
 636
 637       if (dfa->word_char != NULL)
 638         re_free (dfa->word_char);
 639       re_free (dfa);
 640     }
 641   re_free (preg->fastmap);
 642 }
 643 #ifdef _LIBC
 644 weak_alias (__regfree, regfree)
 645 #endif
 646 \f
 647 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 648    them unless specifically requested.  */
 649
 650 #if defined _REGEX_RE_COMP || defined _LIBC
 651
 652 /* BSD has one and only one pattern buffer.  */
 653 static struct re_pattern_buffer re_comp_buf;
 654
 655 char *
 656 # ifdef _LIBC
 657 /* Make these definitions weak in libc, so POSIX programs can redefine
 658    these names if they don't use our functions, and still use
 659    regcomp/regexec above without link errors.  */
 660 weak_function
 661 # endif
 662 re_comp (s)
 663      const char *s;
 664 {
 665   reg_errcode_t ret;
 666
 667   if (!s)
 668     {
 669       if (!re_comp_buf.buffer)
 670         return gettext ("No previous regular expression");
 671       return 0;
 672     }
 673
 674   if (!re_comp_buf.buffer)
 675     {
 676       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 677       if (re_comp_buf.fastmap == NULL)
 678         return (char *) gettext (__re_error_msgid
 679                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 680     }
 681
 682   /* Since `re_exec' always passes NULL for the `regs' argument, we
 683      don't need to initialize the pattern buffer fields which affect it.  */
 684
 685   /* Match anchors at newlines.  */
 686   re_comp_buf.newline_anchor = 1;
 687
 688   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 689
 690   if (!ret)
 691     return NULL;
 692
 693   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 694   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 695 }
 696 #endif /* _REGEX_RE_COMP */
 697 \f
 698 /* Internal entry point.
 699    Compile the regular expression PATTERN, whose length is LENGTH.
 700    SYNTAX indicate regular expression's syntax.  */
 701
 702 static reg_errcode_t
 703 re_compile_internal (preg, pattern, length, syntax)
 704      regex_t *preg;
 705      const char * pattern;
 706      int length;
 707      reg_syntax_t syntax;
 708 {
 709   reg_errcode_t err = REG_NOERROR;
 710   re_dfa_t *dfa;
 711   re_string_t regexp;
 712
 713   /* Initialize the pattern buffer.  */
 714   preg->fastmap_accurate = 0;
 715   preg->syntax = syntax;
 716   preg->not_bol = preg->not_eol = 0;
 717   preg->used = 0;
 718   preg->re_nsub = 0;
 719
 720   /* Initialize the dfa.  */
 721   dfa = (re_dfa_t *) preg->buffer;
 722   if (preg->allocated < sizeof (re_dfa_t))
 723     {
 724       /* If zero allocated, but buffer is non-null, try to realloc
 725          enough space.  This loses if buffer's address is bogus, but
 726          that is the user's responsibility.  If ->buffer is NULL this
 727          is a simple allocation.  */
 728       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 729       if (dfa == NULL)
 730         return REG_ESPACE;
 731       preg->allocated = sizeof (re_dfa_t);
 732     }
 733   preg->buffer = (unsigned char *) dfa;
 734   preg->used = sizeof (re_dfa_t);
 735
 736   err = init_dfa (dfa, length);
 737   if (BE (err != REG_NOERROR, 0))
 738     {
 739       re_free (dfa);
 740       preg->buffer = NULL;
 741       return err;
 742     }
 743
 744   err = re_string_construct (&regexp, pattern, length, preg->translate,
 745                              syntax & RE_ICASE);
 746   if (BE (err != REG_NOERROR, 0))
 747     {
 748       re_free (dfa);
 749       preg->buffer = NULL;
 750       return err;
 751     }
 752
 753   /* Parse the regular expression, and build a structure tree.  */
 754   preg->re_nsub = 0;
 755   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 756   if (BE (dfa->str_tree == NULL, 0))
 757     goto re_compile_internal_free_return;
 758
 759   /* Analyze the tree and collect information which is necessary to
 760      create the dfa.  */
 761   err = analyze (dfa);
 762   if (BE (err != REG_NOERROR, 0))
 763     goto re_compile_internal_free_return;
 764
 765   /* Then create the initial state of the dfa.  */
 766   err = create_initial_state (dfa);
 767   if (BE (err != REG_NOERROR, 0))
 768     goto re_compile_internal_free_return;
 769
 770  re_compile_internal_free_return:
 771   /* Release work areas.  */
 772   free_workarea_compile (preg);
 773   re_string_destruct (&regexp);
 774
 775   return err;
 776 }
 777
 778 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 779    as the initial length of some arrays.  */
 780
 781 static reg_errcode_t
 782 init_dfa (dfa, pat_len)
 783      re_dfa_t *dfa;
 784      int pat_len;
 785 {
 786   int table_size;
 787
 788   memset (dfa, '\0', sizeof (re_dfa_t));
 789
 790   dfa->nodes_alloc = pat_len + 1;
 791   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 792
 793   dfa->states_alloc = pat_len + 1;
 794
 795   /*  table_size = 2 ^ ceil(log pat_len) */
 796   for (table_size = 1; table_size > 0; table_size <<= 1)
 797     if (table_size > pat_len)
 798       break;
 799
 800   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 801   dfa->state_hash_mask = table_size - 1;
 802
 803   dfa->subexps_alloc = 1;
 804   dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
 805   dfa->word_char = NULL;
 806
 807   if (BE (dfa->nodes == NULL || dfa->state_table == NULL
 808           || dfa->subexps == NULL, 0))
 809     {
 810       /* We don't bother to free anything which was allocated.  Very
 811          soon the process will go down anyway.  */
 812       dfa->subexps = NULL;
 813       dfa->state_table = NULL;
 814       dfa->nodes = NULL;
 815       return REG_ESPACE;
 816     }
 817   return REG_NOERROR;
 818 }
 819
 820 /* Initialize WORD_CHAR table, which indicate which character is
 821    "word".  In this case "word" means that it is the word construction
 822    character used by some operators like "\<", "\>", etc.  */
 823
 824 static reg_errcode_t
 825 init_word_char (dfa)
 826      re_dfa_t *dfa;
 827 {
 828   int i, j, ch;
 829   dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 830   if (BE (dfa->word_char == NULL, 0))
 831     return REG_ESPACE;
 832   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 833     for (j = 0; j < UINT_BITS; ++j, ++ch)
 834       if (isalnum (ch) || ch == '_')
 835         dfa->word_char[i] |= 1 << j;
 836   return REG_NOERROR;
 837 }
 838
 839 /* Free the work area which are only used while compiling.  */
 840
 841 static void
 842 free_workarea_compile (preg)
 843      regex_t *preg;
 844 {
 845   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 846   free_bin_tree (dfa->str_tree);
 847   dfa->str_tree = NULL;
 848 }
 849
 850 /* Create initial states for all contexts.  */
 851
 852 static reg_errcode_t
 853 create_initial_state (dfa)
 854      re_dfa_t *dfa;
 855 {
 856   int first, i;
 857   reg_errcode_t err;
 858   re_node_set init_nodes;
 859
 860   /* Initial states have the epsilon closure of the node which is
 861      the first node of the regular expression.  */
 862   first = dfa->str_tree->first;
 863   dfa->init_node = first;
 864   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 865   if (BE (err != REG_NOERROR, 0))
 866     return err;
 867
 868   /* The back-references which are in initial states can epsilon transit,
 869      since in this case all of the subexpressions can be null.
 870      Then we add epsilon closures of the nodes which are the next nodes of
 871      the back-references.  */
 872   if (dfa->nbackref > 0)
 873     for (i = 0; i < init_nodes.nelem; ++i)
 874       {
 875         int node_idx = init_nodes.elems[i];
 876         re_token_type_t type = dfa->nodes[node_idx].type;
 877         if (type == OP_CONTEXT_NODE
 878             && (dfa->nodes[dfa->nodes[node_idx].opr.ctx_info->entity].type
 879                 == OP_BACK_REF))
 880           {
 881             int prev_nelem = init_nodes.nelem;
 882             re_node_set_merge (&init_nodes,
 883                            dfa->nodes[node_idx].opr.ctx_info->bkref_eclosure);
 884             if (prev_nelem < init_nodes.nelem)
 885               i = 0;
 886           }
 887         else if (type == OP_BACK_REF)
 888           {
 889             int next_idx = dfa->nexts[node_idx];
 890             if (!re_node_set_contains (&init_nodes, next_idx))
 891               {
 892                 re_node_set_merge (&init_nodes, dfa->eclosures + next_idx);
 893                 i = 0;
 894               }
 895           }
 896       }
 897
 898   /* It must be the first time to invoke acquire_state.  */
 899   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
 900   /* We don't check ERR here, since the initial state must not be NULL.  */
 901   if (BE (dfa->init_state == NULL, 0))
 902     return err;
 903   if (dfa->init_state->has_constraint)
 904     {
 905       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
 906                                                        CONTEXT_WORD);
 907       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
 908                                                      CONTEXT_NEWLINE);
 909       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
 910                                                          &init_nodes,
 911                                                          CONTEXT_NEWLINE
 912                                                          | CONTEXT_BEGBUF);
 913       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 914               || dfa->init_state_begbuf == NULL, 0))
 915         return err;
 916     }
 917   else
 918     dfa->init_state_word = dfa->init_state_nl
 919       = dfa->init_state_begbuf = dfa->init_state;
 920
 921   re_node_set_free (&init_nodes);
 922   return REG_NOERROR;
 923 }
 924 \f
 925 /* Analyze the structure tree, and calculate "first", "next", "edest",
 926    "eclosure", and "inveclosure".  */
 927
 928 static reg_errcode_t
 929 analyze (dfa)
 930      re_dfa_t *dfa;
 931 {
 932   int i;
 933   reg_errcode_t ret;
 934
 935   /* Allocate arrays.  */
 936   dfa->firsts = re_malloc (int, dfa->nodes_alloc);
 937   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
 938   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
 939   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
 940   dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
 941   if (BE (dfa->firsts == NULL || dfa->nexts == NULL || dfa->edests == NULL
 942           || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
 943     return REG_ESPACE;
 944   /* Initialize them.  */
 945   for (i = 0; i < dfa->nodes_len; ++i)
 946     {
 947       dfa->firsts[i] = -1;
 948       dfa->nexts[i] = -1;
 949       re_node_set_init_empty (dfa->edests + i);
 950       re_node_set_init_empty (dfa->eclosures + i);
 951       re_node_set_init_empty (dfa->inveclosures + i);
 952     }
 953
 954   ret = analyze_tree (dfa, dfa->str_tree);
 955   if (BE (ret == REG_NOERROR, 1))
 956     {
 957       ret = calc_eclosure (dfa);
 958       if (ret == REG_NOERROR)
 959         calc_inveclosure (dfa);
 960     }
 961   return ret;
 962 }
 963
 964 /* Helper functions for analyze.
 965    This function calculate "first", "next", and "edest" for the subtree
 966    whose root is NODE.  */
 967
 968 static reg_errcode_t
 969 analyze_tree (dfa, node)
 970      re_dfa_t *dfa;
 971      bin_tree_t *node;
 972 {
 973   reg_errcode_t ret;
 974   if (node->first == -1)
 975     calc_first (dfa, node);
 976   if (node->next == -1)
 977     calc_next (dfa, node);
 978   if (node->eclosure.nelem == 0)
 979     calc_epsdest (dfa, node);
 980   /* Calculate "first" etc. for the left child.  */
 981   if (node->left != NULL)
 982     {
 983       ret = analyze_tree (dfa, node->left);
 984       if (BE (ret != REG_NOERROR, 0))
 985         return ret;
 986     }
 987   /* Calculate "first" etc. for the right child.  */
 988   if (node->right != NULL)
 989     {
 990       ret = analyze_tree (dfa, node->right);
 991       if (BE (ret != REG_NOERROR, 0))
 992         return ret;
 993     }
 994   return REG_NOERROR;
 995 }
 996
 997 /* Calculate "first" for the node NODE.  */
 998 static void
 999 calc_first (dfa, node)
1000      re_dfa_t *dfa;
1001      bin_tree_t *node;
1002 {
1003   int idx, type;
1004   idx = node->node_idx;
1005   type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1006
1007   switch (type)
1008     {
1009 #ifdef DEBUG
1010     case OP_OPEN_BRACKET:
1011     case OP_CLOSE_BRACKET:
1012     case OP_OPEN_DUP_NUM:
1013     case OP_CLOSE_DUP_NUM:
1014     case OP_NON_MATCH_LIST:
1015     case OP_OPEN_COLL_ELEM:
1016     case OP_CLOSE_COLL_ELEM:
1017     case OP_OPEN_EQUIV_CLASS:
1018     case OP_CLOSE_EQUIV_CLASS:
1019     case OP_OPEN_CHAR_CLASS:
1020     case OP_CLOSE_CHAR_CLASS:
1021       /* These must not be appeared here.  */
1022       assert (0);
1023 #endif
1024     case END_OF_RE:
1025     case CHARACTER:
1026     case OP_PERIOD:
1027     case OP_DUP_ASTERISK:
1028     case OP_DUP_QUESTION:
1029 #ifdef RE_ENABLE_I18N
1030     case COMPLEX_BRACKET:
1031 #endif /* RE_ENABLE_I18N */
1032     case SIMPLE_BRACKET:
1033     case OP_BACK_REF:
1034     case ANCHOR:
1035     case OP_OPEN_SUBEXP:
1036     case OP_CLOSE_SUBEXP:
1037       node->first = idx;
1038       break;
1039     case OP_DUP_PLUS:
1040 #ifdef DEBUG
1041       assert (node->left != NULL);
1042 #endif
1043       if (node->left->first == -1)
1044         calc_first (dfa, node->left);
1045       node->first = node->left->first;
1046       break;
1047     case OP_ALT:
1048       node->first = idx;
1049       break;
1050       /* else fall through */
1051     default:
1052 #ifdef DEBUG
1053       assert (node->left != NULL);
1054 #endif
1055       if (node->left->first == -1)
1056         calc_first (dfa, node->left);
1057       node->first = node->left->first;
1058       break;
1059     }
1060   if (node->type == 0)
1061     dfa->firsts[idx] = node->first;
1062 }
1063
1064 /* Calculate "next" for the node NODE.  */
1065
1066 static void
1067 calc_next (dfa, node)
1068      re_dfa_t *dfa;
1069      bin_tree_t *node;
1070 {
1071   int idx, type;
1072   bin_tree_t *parent = node->parent;
1073   if (parent == NULL)
1074     {
1075       node->next = -1;
1076       idx = node->node_idx;
1077       if (node->type == 0)
1078         dfa->nexts[idx] = node->next;
1079       return;
1080     }
1081
1082   idx = parent->node_idx;
1083   type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1084
1085   switch (type)
1086     {
1087     case OP_DUP_ASTERISK:
1088     case OP_DUP_PLUS:
1089       node->next = idx;
1090       break;
1091     case CONCAT:
1092       if (parent->left == node)
1093         {
1094           if (parent->right->first == -1)
1095             calc_first (dfa, parent->right);
1096           node->next = parent->right->first;
1097           break;
1098         }
1099       /* else fall through */
1100     default:
1101       if (parent->next == -1)
1102         calc_next (dfa, parent);
1103       node->next = parent->next;
1104       break;
1105     }
1106   idx = node->node_idx;
1107   if (node->type == 0)
1108     dfa->nexts[idx] = node->next;
1109 }
1110
1111 /* Calculate "edest" for the node NODE.  */
1112
1113 static void
1114 calc_epsdest (dfa, node)
1115      re_dfa_t *dfa;
1116      bin_tree_t *node;
1117 {
1118   int idx;
1119   idx = node->node_idx;
1120   if (node->type == 0)
1121     {
1122       if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1123           || dfa->nodes[idx].type == OP_DUP_PLUS
1124           || dfa->nodes[idx].type == OP_DUP_QUESTION)
1125         {
1126           if (node->left->first == -1)
1127             calc_first (dfa, node->left);
1128           if (node->next == -1)
1129             calc_next (dfa, node);
1130           re_node_set_init_2 (dfa->edests + idx, node->left->first,
1131                               node->next);
1132         }
1133       else if (dfa->nodes[idx].type == OP_ALT)
1134         {
1135           int left, right;
1136           if (node->left != NULL)
1137             {
1138               if (node->left->first == -1)
1139                 calc_first (dfa, node->left);
1140               left = node->left->first;
1141             }
1142           else
1143             {
1144               if (node->next == -1)
1145                 calc_next (dfa, node);
1146               left = node->next;
1147             }
1148           if (node->right != NULL)
1149             {
1150               if (node->right->first == -1)
1151                 calc_first (dfa, node->right);
1152               right = node->right->first;
1153             }
1154           else
1155             {
1156               if (node->next == -1)
1157                 calc_next (dfa, node);
1158               right = node->next;
1159             }
1160           re_node_set_init_2 (dfa->edests + idx, left, right);
1161         }
1162       else if (dfa->nodes[idx].type == ANCHOR
1163                || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1164                || dfa->nodes[idx].type == OP_CLOSE_SUBEXP)
1165         re_node_set_init_1 (dfa->edests + idx, node->next);
1166     }
1167 }
1168
1169 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1170    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1171    otherwise return the error code.  */
1172
1173 static reg_errcode_t
1174 duplicate_node (new_idx, dfa, org_idx, constraint)
1175      re_dfa_t *dfa;
1176      int *new_idx, org_idx;
1177      unsigned int constraint;
1178 {
1179   re_token_t dup;
1180   int dup_idx;
1181   reg_errcode_t err;
1182
1183   dup.type = OP_CONTEXT_NODE;
1184   if (dfa->nodes[org_idx].type == OP_CONTEXT_NODE)
1185     {
1186       /* If the node whose index is ORG_IDX is the same as the intended
1187          node, use it.  */
1188       if (dfa->nodes[org_idx].constraint == constraint)
1189         {
1190           *new_idx = org_idx;
1191           return REG_NOERROR;
1192         }
1193       dup.constraint = constraint |
1194         dfa->nodes[org_idx].constraint;
1195     }
1196   else
1197     dup.constraint = constraint;
1198
1199   /* In case that `entity' points OP_CONTEXT_NODE,
1200      we correct `entity' to real entity in calc_inveclosures(). */
1201   dup.opr.ctx_info = malloc (sizeof (*dup.opr.ctx_info));
1202   dup_idx = re_dfa_add_node (dfa, dup, 1);
1203   if (BE (dup.opr.ctx_info == NULL || dup_idx == -1, 0))
1204     return REG_ESPACE;
1205   dup.opr.ctx_info->entity = org_idx;
1206   dup.opr.ctx_info->bkref_eclosure = NULL;
1207
1208   dfa->nodes[dup_idx].duplicated = 1;
1209   dfa->firsts[dup_idx] = dfa->firsts[org_idx];
1210   dfa->nexts[dup_idx] = dfa->nexts[org_idx];
1211   err = re_node_set_init_copy (dfa->edests + dup_idx, dfa->edests + org_idx);
1212   if (BE (err != REG_NOERROR, 0))
1213     return err;
1214   /* Since we don't duplicate epsilon nodes, epsilon closure have
1215      only itself.  */
1216   err = re_node_set_init_1 (dfa->eclosures + dup_idx, dup_idx);
1217   if (BE (err != REG_NOERROR, 0))
1218     return err;
1219   err = re_node_set_init_1 (dfa->inveclosures + dup_idx, dup_idx);
1220   if (BE (err != REG_NOERROR, 0))
1221     return err;
1222   /* Then we must update inveclosure for this node.
1223      We process them at last part of calc_eclosure(),
1224      since we don't complete to calculate them here.  */
1225
1226   *new_idx = dup_idx;
1227   return REG_NOERROR;
1228 }
1229
1230 static void
1231 calc_inveclosure (dfa)
1232      re_dfa_t *dfa;
1233 {
1234   int src, idx, dest, entity;
1235   for (src = 0; src < dfa->nodes_len; ++src)
1236     {
1237       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1238         {
1239           dest = dfa->eclosures[src].elems[idx];
1240           re_node_set_insert (dfa->inveclosures + dest, src);
1241         }
1242
1243       entity = src;
1244       while (dfa->nodes[entity].type == OP_CONTEXT_NODE)
1245         {
1246           entity = dfa->nodes[entity].opr.ctx_info->entity;
1247           re_node_set_merge (dfa->inveclosures + src,
1248                              dfa->inveclosures + entity);
1249           dfa->nodes[src].opr.ctx_info->entity = entity;
1250         }
1251     }
1252 }
1253
1254 /* Calculate "eclosure" for all the node in DFA.  */
1255
1256 static reg_errcode_t
1257 calc_eclosure (dfa)
1258      re_dfa_t *dfa;
1259 {
1260   int idx, node_idx, max, incomplete = 0;
1261 #ifdef DEBUG
1262   assert (dfa->nodes_len > 0);
1263 #endif
1264   /* For each nodes, calculate epsilon closure.  */
1265   for (node_idx = 0, max = dfa->nodes_len; ; ++node_idx)
1266     {
1267       reg_errcode_t err;
1268       re_node_set eclosure_elem;
1269       if (node_idx == max)
1270         {
1271           if (!incomplete)
1272             break;
1273           incomplete = 0;
1274           node_idx = 0;
1275         }
1276
1277 #ifdef DEBUG
1278       assert (dfa->nodes[node_idx].type != OP_CONTEXT_NODE);
1279       assert (dfa->eclosures[node_idx].nelem != -1);
1280 #endif
1281       /* If we have already calculated, skip it.  */
1282       if (dfa->eclosures[node_idx].nelem != 0)
1283         continue;
1284       /* Calculate epsilon closure of `node_idx'.  */
1285       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1286       if (BE (err != REG_NOERROR, 0))
1287         return err;
1288
1289       if (dfa->eclosures[node_idx].nelem == 0)
1290         {
1291           incomplete = 1;
1292           re_node_set_free (&eclosure_elem);
1293         }
1294     }
1295
1296   /* for duplicated nodes.  */
1297   for (idx = max; idx < dfa->nodes_len; ++idx)
1298     {
1299       int entity, i, constraint;
1300       re_node_set *bkref_eclosure;
1301       entity = dfa->nodes[idx].opr.ctx_info->entity;
1302       re_node_set_merge (dfa->inveclosures + idx, dfa->inveclosures + entity);
1303       if (dfa->nodes[entity].type != OP_BACK_REF)
1304         continue;
1305
1306       /* If the node is backreference, duplicate the epsilon closure of
1307          the next node. Since it may epsilon transit.  */
1308       /* Note: duplicate_node() may realloc dfa->eclosures, etc.  */
1309       bkref_eclosure = re_malloc (re_node_set, 1);
1310       if (BE (bkref_eclosure == NULL, 0))
1311         return REG_ESPACE;
1312       re_node_set_init_empty (bkref_eclosure);
1313       constraint = dfa->nodes[idx].constraint;
1314       for (i = 0; i < dfa->eclosures[dfa->nexts[idx]].nelem; ++i)
1315         {
1316           int dest_node_idx = dfa->eclosures[dfa->nexts[idx]].elems[i];
1317           if (!IS_EPSILON_NODE (dfa->nodes[dest_node_idx].type))
1318             {
1319               reg_errcode_t err;
1320               err = duplicate_node (&dest_node_idx, dfa, dest_node_idx,
1321                                     constraint);
1322               if (BE (err != REG_NOERROR, 0))
1323                 return err;
1324             }
1325           re_node_set_insert (bkref_eclosure, dest_node_idx);
1326         }
1327       dfa->nodes[idx].opr.ctx_info->bkref_eclosure = bkref_eclosure;
1328     }
1329
1330   return REG_NOERROR;
1331 }
1332
1333 /* Calculate epsilon closure of NODE.  */
1334
1335 static reg_errcode_t
1336 calc_eclosure_iter (new_set, dfa, node, root)
1337      re_node_set *new_set;
1338      re_dfa_t *dfa;
1339      int node, root;
1340 {
1341   reg_errcode_t err;
1342   unsigned int constraint;
1343   int i, max, incomplete = 0;
1344   re_node_set eclosure;
1345   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1346   if (BE (err != REG_NOERROR, 0))
1347     return err;
1348
1349   /* This indicates that we are calculating this node now.
1350      We reference this value to avoid infinite loop.  */
1351   dfa->eclosures[node].nelem = -1;
1352
1353   constraint = ((dfa->nodes[node].type == ANCHOR)
1354                 ? dfa->nodes[node].opr.ctx_type : 0);
1355
1356   /* Expand each epsilon destination nodes.  */
1357   if (dfa->edests[node].nelem != 0)
1358     for (i = 0; i < dfa->edests[node].nelem; ++i)
1359       {
1360         re_node_set eclosure_elem;
1361         int edest = dfa->edests[node].elems[i];
1362         /* If calculating the epsilon closure of `edest' is in progress,
1363            return intermediate result.  */
1364         if (dfa->eclosures[edest].nelem == -1)
1365           {
1366             incomplete = 1;
1367             continue;
1368           }
1369         /* If we haven't calculated the epsilon closure of `edest' yet,
1370            calculate now. Otherwise use calculated epsilon closure.  */
1371         if (dfa->eclosures[edest].nelem == 0)
1372           {
1373             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1374             if (BE (err != REG_NOERROR, 0))
1375               return err;
1376           }
1377         else
1378           eclosure_elem = dfa->eclosures[edest];
1379         /* Merge the epsilon closure of `edest'.  */
1380         re_node_set_merge (&eclosure, &eclosure_elem);
1381         /* If the epsilon closure of `edest' is incomplete,
1382            the epsilon closure of this node is also incomplete.  */
1383         if (dfa->eclosures[edest].nelem == 0)
1384           {
1385             incomplete = 1;
1386             re_node_set_free (&eclosure_elem);
1387           }
1388       }
1389
1390   /* If the current node has constraints, duplicate all non-epsilon nodes.
1391      Since they must inherit the constraints.  */
1392   if (constraint)
1393     for (i = 0, max = eclosure.nelem; i < max; ++i)
1394       {
1395         int dest = eclosure.elems[i];
1396         if (!IS_EPSILON_NODE (dfa->nodes[dest].type))
1397           {
1398             int dup_dest;
1399             reg_errcode_t err;
1400             err = duplicate_node (&dup_dest, dfa, dest, constraint);
1401             if (BE (err != REG_NOERROR, 0))
1402               return err;
1403             if (dest != dup_dest)
1404               {
1405                 re_node_set_remove_at (&eclosure, i--);
1406                 re_node_set_insert (&eclosure, dup_dest);
1407                 --max;
1408               }
1409           }
1410       }
1411
1412   /* Epsilon closures include itself.  */
1413   re_node_set_insert (&eclosure, node);
1414   if (incomplete && !root)
1415     dfa->eclosures[node].nelem = 0;
1416   else
1417     dfa->eclosures[node] = eclosure;
1418   *new_set = eclosure;
1419   return REG_NOERROR;
1420 }
1421 \f
1422 /* Functions for token which are used in the parser.  */
1423
1424 /* Fetch a token from INPUT.
1425    We must not use this function inside bracket expressions.  */
1426
1427 static re_token_t
1428 fetch_token (input, syntax)
1429      re_string_t *input;
1430      reg_syntax_t syntax;
1431 {
1432   re_token_t token;
1433   int consumed_byte;
1434   consumed_byte = peek_token (&token, input, syntax);
1435   re_string_skip_bytes (input, consumed_byte);
1436   return token;
1437 }
1438
1439 /* Peek a token from INPUT, and return the length of the token.
1440    We must not use this function inside bracket expressions.  */
1441
1442 static int
1443 peek_token (token, input, syntax)
1444      re_token_t *token;
1445      re_string_t *input;
1446      reg_syntax_t syntax;
1447 {
1448   unsigned char c;
1449
1450   if (re_string_eoi (input))
1451     {
1452       token->type = END_OF_RE;
1453       return 0;
1454     }
1455
1456   c = re_string_peek_byte (input, 0);
1457   token->opr.c = c;
1458
1459 #ifdef RE_ENABLE_I18N
1460   token->mb_partial = 0;
1461   if (MB_CUR_MAX > 1 &&
1462       !re_string_first_byte (input, re_string_cur_idx (input)))
1463     {
1464       token->type = CHARACTER;
1465       token->mb_partial = 1;
1466       return 1;
1467     }
1468 #endif
1469   if (c == '\\')
1470     {
1471       unsigned char c2;
1472       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1473         {
1474           token->type = BACK_SLASH;
1475           return 1;
1476         }
1477
1478       c2 = re_string_peek_byte_case (input, 1);
1479       token->opr.c = c2;
1480       token->type = CHARACTER;
1481       switch (c2)
1482         {
1483         case '|':
1484           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1485             token->type = OP_ALT;
1486           break;
1487         case '1': case '2': case '3': case '4': case '5':
1488         case '6': case '7': case '8': case '9':
1489           if (!(syntax & RE_NO_BK_REFS))
1490             {
1491               token->type = OP_BACK_REF;
1492               token->opr.idx = c2 - '0';
1493             }
1494           break;
1495         case '<':
1496           if (!(syntax & RE_NO_GNU_OPS))
1497             {
1498               token->type = ANCHOR;
1499               token->opr.idx = WORD_FIRST;
1500             }
1501           break;
1502         case '>':
1503           if (!(syntax & RE_NO_GNU_OPS))
1504             {
1505               token->type = ANCHOR;
1506               token->opr.idx = WORD_LAST;
1507             }
1508           break;
1509         case 'b':
1510           if (!(syntax & RE_NO_GNU_OPS))
1511             {
1512               token->type = ANCHOR;
1513               token->opr.idx = WORD_DELIM;
1514             }
1515           break;
1516         case 'B':
1517           if (!(syntax & RE_NO_GNU_OPS))
1518             {
1519               token->type = ANCHOR;
1520               token->opr.idx = INSIDE_WORD;
1521             }
1522           break;
1523         case 'w':
1524           if (!(syntax & RE_NO_GNU_OPS))
1525             token->type = OP_WORD;
1526           break;
1527         case 'W':
1528           if (!(syntax & RE_NO_GNU_OPS))
1529             token->type = OP_NOTWORD;
1530           break;
1531         case '`':
1532           if (!(syntax & RE_NO_GNU_OPS))
1533             {
1534               token->type = ANCHOR;
1535               token->opr.idx = BUF_FIRST;
1536             }
1537           break;
1538         case '\'':
1539           if (!(syntax & RE_NO_GNU_OPS))
1540             {
1541               token->type = ANCHOR;
1542               token->opr.idx = BUF_LAST;
1543             }
1544           break;
1545         case '(':
1546           if (!(syntax & RE_NO_BK_PARENS))
1547             token->type = OP_OPEN_SUBEXP;
1548           break;
1549         case ')':
1550           if (!(syntax & RE_NO_BK_PARENS))
1551             token->type = OP_CLOSE_SUBEXP;
1552           break;
1553         case '+':
1554           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1555             token->type = OP_DUP_PLUS;
1556           break;
1557         case '?':
1558           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1559             token->type = OP_DUP_QUESTION;
1560           break;
1561         case '{':
1562           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1563             token->type = OP_OPEN_DUP_NUM;
1564           break;
1565         case '}':
1566           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1567             token->type = OP_CLOSE_DUP_NUM;
1568           break;
1569         default:
1570           break;
1571         }
1572       return 2;
1573     }
1574
1575   token->type = CHARACTER;
1576   switch (c)
1577     {
1578     case '\n':
1579       if (syntax & RE_NEWLINE_ALT)
1580         token->type = OP_ALT;
1581       break;
1582     case '|':
1583       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1584         token->type = OP_ALT;
1585       break;
1586     case '*':
1587       token->type = OP_DUP_ASTERISK;
1588       break;
1589     case '+':
1590       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1591         token->type = OP_DUP_PLUS;
1592       break;
1593     case '?':
1594       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1595         token->type = OP_DUP_QUESTION;
1596       break;
1597     case '{':
1598       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1599         token->type = OP_OPEN_DUP_NUM;
1600       break;
1601     case '}':
1602       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1603         token->type = OP_CLOSE_DUP_NUM;
1604       break;
1605     case '(':
1606       if (syntax & RE_NO_BK_PARENS)
1607         token->type = OP_OPEN_SUBEXP;
1608       break;
1609     case ')':
1610       if (syntax & RE_NO_BK_PARENS)
1611         token->type = OP_CLOSE_SUBEXP;
1612       break;
1613     case '[':
1614       token->type = OP_OPEN_BRACKET;
1615       break;
1616     case '.':
1617       token->type = OP_PERIOD;
1618       break;
1619     case '^':
1620       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1621           re_string_cur_idx (input) != 0)
1622         {
1623           char prev = re_string_peek_byte (input, -1);
1624           if (prev != '|' && prev != '(' &&
1625               (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
1626             break;
1627         }
1628       token->type = ANCHOR;
1629       token->opr.idx = LINE_FIRST;
1630       break;
1631     case '$':
1632       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1633           re_string_cur_idx (input) + 1 != re_string_length (input))
1634         {
1635           re_token_t next;
1636           re_string_skip_bytes (input, 1);
1637           peek_token (&next, input, syntax);
1638           re_string_skip_bytes (input, -1);
1639           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1640             break;
1641         }
1642       token->type = ANCHOR;
1643       token->opr.idx = LINE_LAST;
1644       break;
1645     default:
1646       break;
1647     }
1648   return 1;
1649 }
1650
1651 /* Peek a token from INPUT, and return the length of the token.
1652    We must not use this function out of bracket expressions.  */
1653
1654 static int
1655 peek_token_bracket (token, input, syntax)
1656      re_token_t *token;
1657      re_string_t *input;
1658      reg_syntax_t syntax;
1659 {
1660   unsigned char c;
1661   if (re_string_eoi (input))
1662     {
1663       token->type = END_OF_RE;
1664       return 0;
1665     }
1666   c = re_string_peek_byte (input, 0);
1667   token->opr.c = c;
1668
1669 #ifdef RE_ENABLE_I18N
1670   if (MB_CUR_MAX > 1 &&
1671       !re_string_first_byte (input, re_string_cur_idx (input)))
1672     {
1673       token->type = CHARACTER;
1674       return 1;
1675     }
1676 #endif /* RE_ENABLE_I18N */
1677
1678   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1679     {
1680       /* In this case, '\' escape a character.  */
1681       unsigned char c2;
1682       c2 = re_string_peek_byte (input, 1);
1683       token->opr.c = c2;
1684       token->type = CHARACTER;
1685       return 1;
1686     }
1687   if (c == '[') /* '[' is a special char in a bracket exps.  */
1688     {
1689       unsigned char c2;
1690       int token_len;
1691       c2 = re_string_peek_byte (input, 1);
1692       token->opr.c = c2;
1693       token_len = 2;
1694       switch (c2)
1695         {
1696         case '.':
1697           token->type = OP_OPEN_COLL_ELEM;
1698           break;
1699         case '=':
1700           token->type = OP_OPEN_EQUIV_CLASS;
1701           break;
1702         case ':':
1703           if (syntax & RE_CHAR_CLASSES)
1704             {
1705               token->type = OP_OPEN_CHAR_CLASS;
1706               break;
1707             }
1708           /* else fall through.  */
1709         default:
1710           token->type = CHARACTER;
1711           token->opr.c = c;
1712           token_len = 1;
1713           break;
1714         }
1715       return token_len;
1716     }
1717   switch (c)
1718     {
1719     case '-':
1720       token->type = OP_CHARSET_RANGE;
1721       break;
1722     case ']':
1723       token->type = OP_CLOSE_BRACKET;
1724       break;
1725     case '^':
1726       token->type = OP_NON_MATCH_LIST;
1727       break;
1728     default:
1729       token->type = CHARACTER;
1730     }
1731   return 1;
1732 }
1733 \f
1734 /* Functions for parser.  */
1735
1736 /* Entry point of the parser.
1737    Parse the regular expression REGEXP and return the structure tree.
1738    If an error is occured, ERR is set by error code, and return NULL.
1739    This function build the following tree, from regular expression <reg_exp>:
1740            CAT
1741            / \
1742           /   \
1743    <reg_exp>  EOR
1744
1745    CAT means concatenation.
1746    EOR means end of regular expression.  */
1747
1748 static bin_tree_t *
1749 parse (regexp, preg, syntax, err)
1750      re_string_t *regexp;
1751      regex_t *preg;
1752      reg_syntax_t syntax;
1753      reg_errcode_t *err;
1754 {
1755   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1756   bin_tree_t *tree, *eor, *root;
1757   re_token_t current_token;
1758   int new_idx;
1759   current_token = fetch_token (regexp, syntax);
1760   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1761   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1762     return NULL;
1763   new_idx = re_dfa_add_node (dfa, current_token, 0);
1764   eor = create_tree (NULL, NULL, 0, new_idx);
1765   if (tree != NULL)
1766     root = create_tree (tree, eor, CONCAT, 0);
1767   else
1768     root = eor;
1769   if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1770     return *err = REG_ESPACE, NULL;
1771   return root;
1772 }
1773
1774 /* This function build the following tree, from regular expression
1775    <branch1>|<branch2>:
1776            ALT
1777            / \
1778           /   \
1779    <branch1> <branch2>
1780
1781    ALT means alternative, which represents the operator `|'.  */
1782
1783 static bin_tree_t *
1784 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1785      re_string_t *regexp;
1786      regex_t *preg;
1787      re_token_t *token;
1788      reg_syntax_t syntax;
1789      int nest;
1790      reg_errcode_t *err;
1791 {
1792   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1793   bin_tree_t *tree, *branch = NULL;
1794   int new_idx;
1795   tree = parse_branch (regexp, preg, token, syntax, nest, err);
1796   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1797     return NULL;
1798
1799   while (token->type == OP_ALT)
1800     {
1801       re_token_t alt_token = *token;
1802       new_idx = re_dfa_add_node (dfa, alt_token, 0);
1803       *token = fetch_token (regexp, syntax);
1804       if (token->type != OP_ALT && token->type != END_OF_RE
1805           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1806         {
1807           branch = parse_branch (regexp, preg, token, syntax, nest, err);
1808           if (BE (*err != REG_NOERROR && branch == NULL, 0))
1809             {
1810               free_bin_tree (tree);
1811               return NULL;
1812             }
1813         }
1814       else
1815         branch = NULL;
1816       tree = create_tree (tree, branch, 0, new_idx);
1817       if (BE (new_idx == -1 || tree == NULL, 0))
1818         return *err = REG_ESPACE, NULL;
1819     }
1820   return tree;
1821 }
1822
1823 /* This function build the following tree, from regular expression
1824    <exp1><exp2>:
1825         CAT
1826         / \
1827        /   \
1828    <exp1> <exp2>
1829
1830    CAT means concatenation.  */
1831
1832 static bin_tree_t *
1833 parse_branch (regexp, preg, token, syntax, nest, err)
1834      re_string_t *regexp;
1835      regex_t *preg;
1836      re_token_t *token;
1837      reg_syntax_t syntax;
1838      int nest;
1839      reg_errcode_t *err;
1840 {
1841   bin_tree_t *tree, *exp;
1842   tree = parse_expression (regexp, preg, token, syntax, nest, err);
1843   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1844     return NULL;
1845
1846   while (token->type != OP_ALT && token->type != END_OF_RE
1847          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1848     {
1849       exp = parse_expression (regexp, preg, token, syntax, nest, err);
1850       if (BE (*err != REG_NOERROR && exp == NULL, 0))
1851         {
1852           free_bin_tree (tree);
1853           return NULL;
1854         }
1855       if (tree != NULL && exp != NULL)
1856         {
1857           tree = create_tree (tree, exp, CONCAT, 0);
1858           if (tree == NULL)
1859             return *err = REG_ESPACE, NULL;
1860         }
1861       else if (tree == NULL)
1862         tree = exp;
1863       /* Otherwise exp == NULL, we don't need to create new tree.  */
1864     }
1865   return tree;
1866 }
1867
1868 /* This function build the following tree, from regular expression a*:
1869          *
1870          |
1871          a
1872 */
1873
1874 static bin_tree_t *
1875 parse_expression (regexp, preg, token, syntax, nest, err)
1876      re_string_t *regexp;
1877      regex_t *preg;
1878      re_token_t *token;
1879      reg_syntax_t syntax;
1880      int nest;
1881      reg_errcode_t *err;
1882 {
1883   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1884   bin_tree_t *tree;
1885   int new_idx;
1886   switch (token->type)
1887     {
1888     case CHARACTER:
1889       new_idx = re_dfa_add_node (dfa, *token, 0);
1890       tree = create_tree (NULL, NULL, 0, new_idx);
1891       if (BE (new_idx == -1 || tree == NULL, 0))
1892         return *err = REG_ESPACE, NULL;
1893 #ifdef RE_ENABLE_I18N
1894       if (MB_CUR_MAX > 1)
1895         {
1896           while (!re_string_eoi (regexp)
1897                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1898             {
1899               bin_tree_t *mbc_remain;
1900               *token = fetch_token (regexp, syntax);
1901               new_idx = re_dfa_add_node (dfa, *token, 0);
1902               mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1903               tree = create_tree (tree, mbc_remain, CONCAT, 0);
1904               if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
1905                 return *err = REG_ESPACE, NULL;
1906             }
1907         }
1908 #endif
1909       break;
1910     case OP_OPEN_SUBEXP:
1911       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
1912       if (BE (*err != REG_NOERROR && tree == NULL, 0))
1913         return NULL;
1914       break;
1915     case OP_OPEN_BRACKET:
1916       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
1917       if (BE (*err != REG_NOERROR && tree == NULL, 0))
1918         return NULL;
1919       break;
1920     case OP_BACK_REF:
1921       if (BE (preg->re_nsub < token->opr.idx
1922               || dfa->subexps[token->opr.idx - 1].end == -1, 0))
1923         {
1924           *err = REG_ESUBREG;
1925           return NULL;
1926         }
1927       new_idx = re_dfa_add_node (dfa, *token, 0);
1928       tree = create_tree (NULL, NULL, 0, new_idx);
1929       if (BE (new_idx == -1 || tree == NULL, 0))
1930         return *err = REG_ESPACE, NULL;
1931       ++dfa->nbackref;
1932       dfa->has_mb_node = 1;
1933       break;
1934     case OP_DUP_ASTERISK:
1935     case OP_DUP_PLUS:
1936     case OP_DUP_QUESTION:
1937     case OP_OPEN_DUP_NUM:
1938       if (syntax & RE_CONTEXT_INVALID_OPS)
1939         return *err = REG_BADRPT, NULL;
1940       else if (syntax & RE_CONTEXT_INDEP_OPS)
1941         {
1942           *token = fetch_token (regexp, syntax);
1943           return parse_expression (regexp, preg, token, syntax, nest, err);
1944         }
1945       /* else fall through  */
1946     case OP_CLOSE_SUBEXP:
1947       if ((token->type == OP_CLOSE_SUBEXP) &&
1948           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
1949         return *err = REG_ERPAREN, NULL;
1950       /* else fall through  */
1951     case OP_CLOSE_DUP_NUM:
1952       /* We treat it as a normal character.  */
1953
1954       /* Then we can these characters as normal characters.  */
1955       token->type = CHARACTER;
1956       new_idx = re_dfa_add_node (dfa, *token, 0);
1957       tree = create_tree (NULL, NULL, 0, new_idx);
1958       if (BE (new_idx == -1 || tree == NULL, 0))
1959         return *err = REG_ESPACE, NULL;
1960       break;
1961     case ANCHOR:
1962       if (dfa->word_char == NULL)
1963         {
1964           *err = init_word_char (dfa);
1965           if (BE (*err != REG_NOERROR, 0))
1966             return NULL;
1967         }
1968       if (token->opr.ctx_type == WORD_DELIM)
1969         {
1970           bin_tree_t *tree_first, *tree_last;
1971           int idx_first, idx_last;
1972           token->opr.ctx_type = WORD_FIRST;
1973           idx_first = re_dfa_add_node (dfa, *token, 0);
1974           tree_first = create_tree (NULL, NULL, 0, idx_first);
1975           token->opr.ctx_type = WORD_LAST;
1976           idx_last = re_dfa_add_node (dfa, *token, 0);
1977           tree_last = create_tree (NULL, NULL, 0, idx_last);
1978           token->type = OP_ALT;
1979           new_idx = re_dfa_add_node (dfa, *token, 0);
1980           tree = create_tree (tree_first, tree_last, 0, new_idx);
1981           if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
1982                   || tree_first == NULL || tree_last == NULL
1983                   || tree == NULL, 0))
1984             return *err = REG_ESPACE, NULL;
1985         }
1986       else
1987         {
1988           new_idx = re_dfa_add_node (dfa, *token, 0);
1989           tree = create_tree (NULL, NULL, 0, new_idx);
1990           if (BE (new_idx == -1 || tree == NULL, 0))
1991             return *err = REG_ESPACE, NULL;
1992         }
1993       /* We must return here, since ANCHORs can't be followed
1994          by repetition operators.
1995          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
1996              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
1997       *token = fetch_token (regexp, syntax);
1998       return tree;
1999     case OP_PERIOD:
2000       new_idx = re_dfa_add_node (dfa, *token, 0);
2001       tree = create_tree (NULL, NULL, 0, new_idx);
2002       if (BE (new_idx == -1 || tree == NULL, 0))
2003         return *err = REG_ESPACE, NULL;
2004       if (MB_CUR_MAX > 1)
2005         dfa->has_mb_node = 1;
2006       break;
2007     case OP_WORD:
2008       tree = build_word_op (dfa, 0, err);
2009       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2010         return NULL;
2011       break;
2012     case OP_NOTWORD:
2013       tree = build_word_op (dfa, 1, err);
2014       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2015         return NULL;
2016       break;
2017     case OP_ALT:
2018     case END_OF_RE:
2019       return NULL;
2020     case BACK_SLASH:
2021       *err = REG_EESCAPE;
2022       return NULL;
2023     default:
2024       /* Must not happen?  */
2025 #ifdef DEBUG
2026       assert (0);
2027 #endif
2028       return NULL;
2029     }
2030   *token = fetch_token (regexp, syntax);
2031
2032   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2033          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2034     {
2035       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2036       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2037         return NULL;
2038     }
2039
2040   return tree;
2041 }
2042
2043 /* This function build the following tree, from regular expression
2044    (<reg_exp>):
2045          SUBEXP
2046             |
2047         <reg_exp>
2048 */
2049
2050 static bin_tree_t *
2051 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2052      re_string_t *regexp;
2053      regex_t *preg;
2054      re_token_t *token;
2055      reg_syntax_t syntax;
2056      int nest;
2057      reg_errcode_t *err;
2058 {
2059   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2060   bin_tree_t *tree, *left_par, *right_par;
2061   size_t cur_nsub;
2062   int new_idx;
2063   cur_nsub = preg->re_nsub++;
2064   if (dfa->subexps_alloc < preg->re_nsub)
2065     {
2066       re_subexp_t *new_array;
2067       dfa->subexps_alloc *= 2;
2068       new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2069       if (BE (new_array == NULL, 0))
2070         {
2071           dfa->subexps_alloc /= 2;
2072           *err = REG_ESPACE;
2073           return NULL;
2074         }
2075       dfa->subexps = new_array;
2076     }
2077   dfa->subexps[cur_nsub].start = dfa->nodes_len;
2078   dfa->subexps[cur_nsub].end = -1;
2079
2080   new_idx = re_dfa_add_node (dfa, *token, 0);
2081   left_par = create_tree (NULL, NULL, 0, new_idx);
2082   if (BE (new_idx == -1 || left_par == NULL, 0))
2083     return *err = REG_ESPACE, NULL;
2084   dfa->nodes[new_idx].opr.idx = cur_nsub;
2085   *token = fetch_token (regexp, syntax);
2086
2087   /* The subexpression may be a null string.  */
2088   if (token->type == OP_CLOSE_SUBEXP)
2089     tree = NULL;
2090   else
2091     {
2092       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2093       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2094         return NULL;
2095     }
2096   if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2097     {
2098       free_bin_tree (tree);
2099       *err = REG_BADPAT;
2100       return NULL;
2101     }
2102   new_idx = re_dfa_add_node (dfa, *token, 0);
2103   dfa->subexps[cur_nsub].end = dfa->nodes_len;
2104   right_par = create_tree (NULL, NULL, 0, new_idx);
2105   tree = ((tree == NULL) ? right_par
2106           : create_tree (tree, right_par, CONCAT, 0));
2107   tree = create_tree (left_par, tree, CONCAT, 0);
2108   if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2109     return *err = REG_ESPACE, NULL;
2110   dfa->nodes[new_idx].opr.idx = cur_nsub;
2111
2112   return tree;
2113 }
2114
2115 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2116
2117 static bin_tree_t *
2118 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2119      bin_tree_t *dup_elem;
2120      re_string_t *regexp;
2121      re_dfa_t *dfa;
2122      re_token_t *token;
2123      reg_syntax_t syntax;
2124      reg_errcode_t *err;
2125 {
2126   re_token_t dup_token;
2127   bin_tree_t *tree = dup_elem, *work_tree;
2128   int new_idx, start_idx = re_string_cur_idx (regexp);
2129   re_token_t start_token = *token;
2130   if (token->type == OP_OPEN_DUP_NUM)
2131     {
2132       int i;
2133       int end = 0;
2134       int start = fetch_number (regexp, token, syntax);
2135       bin_tree_t *elem;
2136       if (start == -1)
2137         {
2138           if (token->type == CHARACTER && token->opr.c == ',')
2139             start = 0; /* We treat "{,m}" as "{0,m}".  */
2140           else
2141             {
2142               *err = REG_BADBR; /* <re>{} is invalid.  */
2143               return NULL;
2144             }
2145         }
2146       if (BE (start != -2, 1))
2147         {
2148           /* We treat "{n}" as "{n,n}".  */
2149           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2150                  : ((token->type == CHARACTER && token->opr.c == ',')
2151                     ? fetch_number (regexp, token, syntax) : -2));
2152         }
2153       if (BE (start == -2 || end == -2, 0))
2154         {
2155           /* Invalid sequence.  */
2156           if (token->type == OP_CLOSE_DUP_NUM)
2157             goto parse_dup_op_invalid_interval;
2158           else
2159             goto parse_dup_op_ebrace;
2160         }
2161       if (BE (start == 0 && end == 0, 0))
2162         {
2163           /* We treat "<re>{0}" and "<re>{0,0}" as null string.  */
2164           *token = fetch_token (regexp, syntax);
2165           free_bin_tree (dup_elem);
2166           return NULL;
2167         }
2168
2169       /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2170       elem = tree;
2171       for (i = 0; i < start; ++i)
2172         if (i != 0)
2173           {
2174             work_tree = duplicate_tree (elem, dfa);
2175             tree = create_tree (tree, work_tree, CONCAT, 0);
2176             if (BE (work_tree == NULL || tree == NULL, 0))
2177               goto parse_dup_op_espace;
2178           }
2179
2180       if (end == -1)
2181         {
2182           /* We treat "<re>{0,}" as "<re>*".  */
2183           dup_token.type = OP_DUP_ASTERISK;
2184           if (start > 0)
2185             {
2186               elem = duplicate_tree (elem, dfa);
2187               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2188               work_tree = create_tree (elem, NULL, 0, new_idx);
2189               tree = create_tree (tree, work_tree, CONCAT, 0);
2190               if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2191                       || tree == NULL, 0))
2192                 goto parse_dup_op_espace;
2193             }
2194           else
2195             {
2196               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2197               tree = create_tree (elem, NULL, 0, new_idx);
2198               if (BE (new_idx == -1 || tree == NULL, 0))
2199                 goto parse_dup_op_espace;
2200             }
2201         }
2202       else if (end - start > 0)
2203         {
2204           /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?".  */
2205           dup_token.type = OP_DUP_QUESTION;
2206           if (start > 0)
2207             {
2208               elem = duplicate_tree (elem, dfa);
2209               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2210               elem = create_tree (elem, NULL, 0, new_idx);
2211               tree = create_tree (tree, elem, CONCAT, 0);
2212               if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2213                 goto parse_dup_op_espace;
2214             }
2215           else
2216             {
2217               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2218               tree = elem = create_tree (elem, NULL, 0, new_idx);
2219               if (BE (new_idx == -1 || tree == NULL, 0))
2220                 goto parse_dup_op_espace;
2221             }
2222           for (i = 1; i < end - start; ++i)
2223             {
2224               work_tree = duplicate_tree (elem, dfa);
2225               tree = create_tree (tree, work_tree, CONCAT, 0);
2226               if (BE (work_tree == NULL || tree == NULL, 0))
2227                 return *err = REG_ESPACE, NULL;
2228             }
2229         }
2230     }
2231   else
2232     {
2233       new_idx = re_dfa_add_node (dfa, *token, 0);
2234       tree = create_tree (tree, NULL, 0, new_idx);
2235       if (BE (new_idx == -1 || tree == NULL, 0))
2236         return *err = REG_ESPACE, NULL;
2237     }
2238   *token = fetch_token (regexp, syntax);
2239   return tree;
2240
2241  parse_dup_op_espace:
2242   free_bin_tree (tree);
2243   *err = REG_ESPACE;
2244   return NULL;
2245
2246  parse_dup_op_ebrace:
2247   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2248     {
2249       *err = REG_EBRACE;
2250       return NULL;
2251     }
2252   goto parse_dup_op_rollback;
2253  parse_dup_op_invalid_interval:
2254   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2255     {
2256       *err = REG_BADBR;
2257       return NULL;
2258     }
2259  parse_dup_op_rollback:
2260   re_string_set_index (regexp, start_idx);
2261   *token = start_token;
2262   token->type = CHARACTER;
2263   return dup_elem;
2264 }
2265
2266 /* Size of the names for collating symbol/equivalence_class/character_class.
2267    I'm not sure, but maybe enough.  */
2268 #define BRACKET_NAME_BUF_SIZE 32
2269
2270 #ifndef _LIBC
2271   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2272      Build the range expression which starts from START_ELEM, and ends
2273      at END_ELEM.  The result are written to MBCSET and SBCSET.
2274      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2275      mbcset->range_ends, is a pointer argument sinse we may
2276      update it.  */
2277
2278 static reg_errcode_t
2279 # ifdef RE_ENABLE_I18N
2280 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2281      re_charset_t *mbcset;
2282      int *range_alloc;
2283 # else /* not RE_ENABLE_I18N */
2284 build_range_exp (sbcset, start_elem, end_elem)
2285 # endif /* not RE_ENABLE_I18N */
2286      re_bitset_ptr_t sbcset;
2287      bracket_elem_t *start_elem, *end_elem;
2288 {
2289   unsigned int start_ch, end_ch;
2290   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2291   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2292           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2293           0))
2294     return REG_ERANGE;
2295
2296   /* We can handle no multi character collating elements without libc
2297      support.  */
2298   if (BE ((start_elem->type == COLL_SYM
2299            && strlen ((char *) start_elem->opr.name) > 1)
2300           || (end_elem->type == COLL_SYM
2301               && strlen ((char *) end_elem->opr.name) > 1), 0))
2302     return REG_ECOLLATE;
2303
2304 # ifdef RE_ENABLE_I18N
2305   {
2306     wchar_t wc, start_wc, end_wc;
2307     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2308
2309     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2310                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2311                    : 0));
2312     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2313               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2314                  : 0));
2315     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2316                 ? __btowc (start_ch) : start_elem->opr.wch);
2317     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2318               ? __btowc (end_ch) : end_elem->opr.wch);
2319     cmp_buf[0] = start_wc;
2320     cmp_buf[4] = end_wc;
2321     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2322       return REG_ERANGE;
2323
2324     /* Check the space of the arrays.  */
2325     if (*range_alloc == mbcset->nranges)
2326       {
2327         /* There are not enough space, need realloc.  */
2328         wchar_t *new_array_start, *new_array_end;
2329         int new_nranges;
2330
2331         /* +1 in case of mbcset->nranges is 0.  */
2332         new_nranges = 2 * mbcset->nranges + 1;
2333         /* Use realloc since mbcset->range_starts and mbcset->range_ends
2334            are NULL if *range_alloc == 0.  */
2335         new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2336                                       new_nranges);
2337         new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2338                                     new_nranges);
2339
2340         if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2341           return REG_ESPACE;
2342
2343         mbcset->range_starts = new_array_start;
2344         mbcset->range_ends = new_array_end;
2345         *range_alloc = new_nranges;
2346       }
2347
2348     mbcset->range_starts[mbcset->nranges] = start_wc;
2349     mbcset->range_ends[mbcset->nranges++] = end_wc;
2350
2351     /* Build the table for single byte characters.  */
2352     for (wc = 0; wc <= SBC_MAX; ++wc)
2353       {
2354         cmp_buf[2] = wc;
2355         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2356             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2357           bitset_set (sbcset, wc);
2358       }
2359   }
2360 # else /* not RE_ENABLE_I18N */
2361   {
2362     unsigned int ch;
2363     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2364                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2365                    : 0));
2366     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2367               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2368                  : 0));
2369     if (start_ch > end_ch)
2370       return REG_ERANGE;
2371     /* Build the table for single byte characters.  */
2372     for (ch = 0; ch <= SBC_MAX; ++ch)
2373       if (start_ch <= ch  && ch <= end_ch)
2374         bitset_set (sbcset, ch);
2375   }
2376 # endif /* not RE_ENABLE_I18N */
2377   return REG_NOERROR;
2378 }
2379 #endif /* not _LIBC */
2380
2381 #ifndef _LIBC
2382 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2383    Build the collating element which is represented by NAME.
2384    The result are written to MBCSET and SBCSET.
2385    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2386    pointer argument since we may update it.  */
2387
2388 static reg_errcode_t
2389 # ifdef RE_ENABLE_I18N
2390 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2391      re_charset_t *mbcset;
2392      int *coll_sym_alloc;
2393 # else /* not RE_ENABLE_I18N */
2394 build_collating_symbol (sbcset, name)
2395 # endif /* not RE_ENABLE_I18N */
2396      re_bitset_ptr_t sbcset;
2397      const unsigned char *name;
2398 {
2399   size_t name_len = strlen ((const char *) name);
2400   if (BE (name_len != 1, 0))
2401     return REG_ECOLLATE;
2402   else
2403     {
2404       bitset_set (sbcset, name[0]);
2405       return REG_NOERROR;
2406     }
2407 }
2408 #endif /* not _LIBC */
2409
2410 /* This function parse bracket expression like "[abc]", "[a-c]",
2411    "[[.a-a.]]" etc.  */
2412
2413 static bin_tree_t *
2414 parse_bracket_exp (regexp, dfa, token, syntax, err)
2415      re_string_t *regexp;
2416      re_dfa_t *dfa;
2417      re_token_t *token;
2418      reg_syntax_t syntax;
2419      reg_errcode_t *err;
2420 {
2421 #ifdef _LIBC
2422   const unsigned char *collseqmb;
2423   const char *collseqwc;
2424   uint32_t nrules;
2425   int32_t table_size;
2426   const int32_t *symb_table;
2427   const unsigned char *extra;
2428
2429   /* Local function for parse_bracket_exp used in _LIBC environement.
2430      Seek the collating symbol entry correspondings to NAME.
2431      Return the index of the symbol in the SYMB_TABLE.  */
2432
2433   static inline int32_t
2434   seek_collating_symbol_entry (name, name_len)
2435          const unsigned char *name;
2436          size_t name_len;
2437     {
2438       int32_t hash = elem_hash ((const char *) name, name_len);
2439       int32_t elem = hash % table_size;
2440       int32_t second = hash % (table_size - 2);
2441       while (symb_table[2 * elem] != 0)
2442         {
2443           /* First compare the hashing value.  */
2444           if (symb_table[2 * elem] == hash
2445               /* Compare the length of the name.  */
2446               && name_len == extra[symb_table[2 * elem + 1]]
2447               /* Compare the name.  */
2448               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2449                          name_len) == 0)
2450             {
2451               /* Yep, this is the entry.  */
2452               break;
2453             }
2454
2455           /* Next entry.  */
2456           elem += second;
2457         }
2458       return elem;
2459     }
2460
2461   /* Local function for parse_bracket_exp used in _LIBC environement.
2462      Look up the collation sequence value of BR_ELEM.
2463      Return the value if succeeded, UINT_MAX otherwise.  */
2464
2465   static inline unsigned int
2466   lookup_collation_sequence_value (br_elem)
2467          bracket_elem_t *br_elem;
2468     {
2469       if (br_elem->type == SB_CHAR)
2470         {
2471           /*
2472           if (MB_CUR_MAX == 1)
2473           */
2474           if (nrules == 0)
2475             return collseqmb[br_elem->opr.ch];
2476           else
2477             {
2478               wint_t wc = __btowc (br_elem->opr.ch);
2479               return collseq_table_lookup (collseqwc, wc);
2480             }
2481         }
2482       else if (br_elem->type == MB_CHAR)
2483         {
2484           return collseq_table_lookup (collseqwc, br_elem->opr.wch);
2485         }
2486       else if (br_elem->type == COLL_SYM)
2487         {
2488           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2489           if (nrules != 0)
2490             {
2491               int32_t elem, idx;
2492               elem = seek_collating_symbol_entry (br_elem->opr.name,
2493                                                   sym_name_len);
2494               if (symb_table[2 * elem] != 0)
2495                 {
2496                   /* We found the entry.  */
2497                   idx = symb_table[2 * elem + 1];
2498                   /* Skip the name of collating element name.  */
2499                   idx += 1 + extra[idx];
2500                   /* Skip the byte sequence of the collating element.  */
2501                   idx += 1 + extra[idx];
2502                   /* Adjust for the alignment.  */
2503                   idx = (idx + 3) & ~3;
2504                   /* Skip the multibyte collation sequence value.  */
2505                   idx += sizeof (unsigned int);
2506                   /* Skip the wide char sequence of the collating element.  */
2507                   idx += sizeof (unsigned int) *
2508                     (1 + *(unsigned int *) (extra + idx));
2509                   /* Return the collation sequence value.  */
2510                   return *(unsigned int *) (extra + idx);
2511                 }
2512               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2513                 {
2514                   /* No valid character.  Match it as a single byte
2515                      character.  */
2516                   return collseqmb[br_elem->opr.name[0]];
2517                 }
2518             }
2519           else if (sym_name_len == 1)
2520             return collseqmb[br_elem->opr.name[0]];
2521         }
2522       return UINT_MAX;
2523     }
2524
2525   /* Local function for parse_bracket_exp used in _LIBC environement.
2526      Build the range expression which starts from START_ELEM, and ends
2527      at END_ELEM.  The result are written to MBCSET and SBCSET.
2528      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2529      mbcset->range_ends, is a pointer argument sinse we may
2530      update it.  */
2531
2532   static inline reg_errcode_t
2533 # ifdef RE_ENABLE_I18N
2534   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2535          re_charset_t *mbcset;
2536          int *range_alloc;
2537 # else /* not RE_ENABLE_I18N */
2538   build_range_exp (sbcset, start_elem, end_elem)
2539 # endif /* not RE_ENABLE_I18N */
2540          re_bitset_ptr_t sbcset;
2541          bracket_elem_t *start_elem, *end_elem;
2542     {
2543       unsigned int ch;
2544       uint32_t start_collseq;
2545       uint32_t end_collseq;
2546
2547 # ifdef RE_ENABLE_I18N
2548       /* Check the space of the arrays.  */
2549       if (*range_alloc == mbcset->nranges)
2550         {
2551           /* There are not enough space, need realloc.  */
2552           uint32_t *new_array_start;
2553           uint32_t *new_array_end;
2554           int new_nranges;
2555
2556           /* +1 in case of mbcset->nranges is 0.  */
2557           new_nranges = 2 * mbcset->nranges + 1;
2558           /* Use realloc since mbcset->range_starts and mbcset->range_ends
2559              are NULL if *range_alloc == 0.  */
2560           new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2561                                         new_nranges);
2562           new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2563                                       new_nranges);
2564
2565           if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2566             return REG_ESPACE;
2567
2568           mbcset->range_starts = new_array_start;
2569           mbcset->range_ends = new_array_end;
2570           *range_alloc = new_nranges;
2571         }
2572 # endif /* RE_ENABLE_I18N */
2573
2574       /* Equivalence Classes and Character Classes can't be a range
2575          start/end.  */
2576       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2577               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2578               0))
2579         return REG_ERANGE;
2580
2581       start_collseq = lookup_collation_sequence_value (start_elem);
2582       end_collseq = lookup_collation_sequence_value (end_elem);
2583       /* Check start/end collation sequence values.  */
2584       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2585         return REG_ECOLLATE;
2586       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2587         return REG_ERANGE;
2588
2589 # ifdef RE_ENABLE_I18N
2590       /* Got valid collation sequence values, add them as a new entry.  */
2591       mbcset->range_starts[mbcset->nranges] = start_collseq;
2592       mbcset->range_ends[mbcset->nranges++] = end_collseq;
2593 # endif /* RE_ENABLE_I18N */
2594
2595       /* Build the table for single byte characters.  */
2596       for (ch = 0; ch <= SBC_MAX; ch++)
2597         {
2598           uint32_t ch_collseq;
2599           /*
2600           if (MB_CUR_MAX == 1)
2601           */
2602           if (nrules == 0)
2603             ch_collseq = collseqmb[ch];
2604           else
2605             ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
2606           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2607             bitset_set (sbcset, ch);
2608         }
2609       return REG_NOERROR;
2610     }
2611
2612   /* Local function for parse_bracket_exp used in _LIBC environement.
2613      Build the collating element which is represented by NAME.
2614      The result are written to MBCSET and SBCSET.
2615      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2616      pointer argument sinse we may update it.  */
2617
2618   static inline reg_errcode_t
2619 # ifdef RE_ENABLE_I18N
2620   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2621          re_charset_t *mbcset;
2622          int *coll_sym_alloc;
2623 # else /* not RE_ENABLE_I18N */
2624   build_collating_symbol (sbcset, name)
2625 # endif /* not RE_ENABLE_I18N */
2626          re_bitset_ptr_t sbcset;
2627          const unsigned char *name;
2628     {
2629       int32_t elem, idx;
2630       size_t name_len = strlen ((const char *) name);
2631       if (nrules != 0)
2632         {
2633           elem = seek_collating_symbol_entry (name, name_len);
2634           if (symb_table[2 * elem] != 0)
2635             {
2636               /* We found the entry.  */
2637               idx = symb_table[2 * elem + 1];
2638               /* Skip the name of collating element name.  */
2639               idx += 1 + extra[idx];
2640             }
2641           else if (symb_table[2 * elem] == 0 && name_len == 1)
2642             {
2643               /* No valid character, treat it as a normal
2644                  character.  */
2645               bitset_set (sbcset, name[0]);
2646               return REG_NOERROR;
2647             }
2648           else
2649             return REG_ECOLLATE;
2650
2651 # ifdef RE_ENABLE_I18N
2652           /* Got valid collation sequence, add it as a new entry.  */
2653           /* Check the space of the arrays.  */
2654           if (*coll_sym_alloc == mbcset->ncoll_syms)
2655             {
2656               /* Not enough, realloc it.  */
2657               /* +1 in case of mbcset->ncoll_syms is 0.  */
2658               *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2659               /* Use realloc since mbcset->coll_syms is NULL
2660                  if *alloc == 0.  */
2661               mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2662                                               *coll_sym_alloc);
2663               if (BE (mbcset->coll_syms == NULL, 0))
2664                 return REG_ESPACE;
2665             }
2666           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2667 # endif /* RE_ENABLE_I18N */
2668           return REG_NOERROR;
2669         }
2670       else
2671         {
2672           if (BE (name_len != 1, 0))
2673             return REG_ECOLLATE;
2674           else
2675             {
2676               bitset_set (sbcset, name[0]);
2677               return REG_NOERROR;
2678             }
2679         }
2680     }
2681 #endif
2682
2683   re_token_t br_token;
2684   re_bitset_ptr_t sbcset;
2685 #ifdef RE_ENABLE_I18N
2686   re_charset_t *mbcset;
2687   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2688   int equiv_class_alloc = 0, char_class_alloc = 0;
2689 #else /* not RE_ENABLE_I18N */
2690   int non_match = 0;
2691 #endif /* not RE_ENABLE_I18N */
2692   bin_tree_t *work_tree;
2693   int token_len, new_idx;
2694 #ifdef _LIBC
2695   collseqmb = (const unsigned char *)
2696     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2697   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2698   if (nrules)
2699     {
2700       /*
2701       if (MB_CUR_MAX > 1)
2702       */
2703         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2704       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2705       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2706                                                   _NL_COLLATE_SYMB_TABLEMB);
2707       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2708                                                    _NL_COLLATE_SYMB_EXTRAMB);
2709     }
2710 #endif
2711   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2712 #ifdef RE_ENABLE_I18N
2713   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2714 #endif /* RE_ENABLE_I18N */
2715 #ifdef RE_ENABLE_I18N
2716   if (BE (sbcset == NULL || mbcset == NULL, 0))
2717 #else
2718   if (BE (sbcset == NULL, 0))
2719 #endif /* RE_ENABLE_I18N */
2720     {
2721       *err = REG_ESPACE;
2722       return NULL;
2723     }
2724
2725   token_len = peek_token_bracket (token, regexp, syntax);
2726   if (BE (token->type == END_OF_RE, 0))
2727     {
2728       *err = REG_BADPAT;
2729       goto parse_bracket_exp_free_return;
2730     }
2731   if (token->type == OP_NON_MATCH_LIST)
2732     {
2733 #ifdef RE_ENABLE_I18N
2734       int i;
2735       mbcset->non_match = 1;
2736 #else /* not RE_ENABLE_I18N */
2737       non_match = 1;
2738 #endif /* not RE_ENABLE_I18N */
2739       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2740         bitset_set (sbcset, '\0');
2741       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2742       token_len = peek_token_bracket (token, regexp, syntax);
2743       if (BE (token->type == END_OF_RE, 0))
2744         {
2745           *err = REG_BADPAT;
2746           goto parse_bracket_exp_free_return;
2747         }
2748 #ifdef RE_ENABLE_I18N
2749       if (MB_CUR_MAX > 1)
2750         for (i = 0; i < SBC_MAX; ++i)
2751           if (__btowc (i) == WEOF)
2752             bitset_set (sbcset, i);
2753 #endif /* RE_ENABLE_I18N */
2754     }
2755
2756   /* We treat the first ']' as a normal character.  */
2757   if (token->type == OP_CLOSE_BRACKET)
2758     token->type = CHARACTER;
2759
2760   while (1)
2761     {
2762       bracket_elem_t start_elem, end_elem;
2763       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2764       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2765       reg_errcode_t ret;
2766       int token_len2 = 0, is_range_exp = 0;
2767       re_token_t token2;
2768
2769       start_elem.opr.name = start_name_buf;
2770       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2771                                    syntax);
2772       if (BE (ret != REG_NOERROR, 0))
2773         {
2774           *err = ret;
2775           goto parse_bracket_exp_free_return;
2776         }
2777
2778       token_len = peek_token_bracket (token, regexp, syntax);
2779       if (BE (token->type == END_OF_RE, 0))
2780         {
2781           *err = REG_BADPAT;
2782           goto parse_bracket_exp_free_return;
2783         }
2784       if (token->type == OP_CHARSET_RANGE)
2785         {
2786           re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
2787           token_len2 = peek_token_bracket (&token2, regexp, syntax);
2788           if (BE (token->type == END_OF_RE, 0))
2789             {
2790               *err = REG_BADPAT;
2791               goto parse_bracket_exp_free_return;
2792             }
2793           if (token2.type == OP_CLOSE_BRACKET)
2794             {
2795               /* We treat the last '-' as a normal character.  */
2796               re_string_skip_bytes (regexp, -token_len);
2797               token->type = CHARACTER;
2798             }
2799           else
2800             is_range_exp = 1;
2801         }
2802
2803       if (is_range_exp == 1)
2804         {
2805           end_elem.opr.name = end_name_buf;
2806           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2807                                        dfa, syntax);
2808           if (BE (ret != REG_NOERROR, 0))
2809             {
2810               *err = ret;
2811               goto parse_bracket_exp_free_return;
2812             }
2813
2814           token_len = peek_token_bracket (token, regexp, syntax);
2815           if (BE (token->type == END_OF_RE, 0))
2816             {
2817               *err = REG_BADPAT;
2818               goto parse_bracket_exp_free_return;
2819             }
2820           *err = build_range_exp (sbcset,
2821 #ifdef RE_ENABLE_I18N
2822                                   mbcset, &range_alloc,
2823 #endif /* RE_ENABLE_I18N */
2824                                   &start_elem, &end_elem);
2825           if (BE (*err != REG_NOERROR, 0))
2826             goto parse_bracket_exp_free_return;
2827         }
2828       else
2829         {
2830           switch (start_elem.type)
2831             {
2832             case SB_CHAR:
2833               bitset_set (sbcset, start_elem.opr.ch);
2834               break;
2835 #ifdef RE_ENABLE_I18N
2836             case MB_CHAR:
2837               /* Check whether the array has enough space.  */
2838               if (mbchar_alloc == mbcset->nmbchars)
2839                 {
2840                   /* Not enough, realloc it.  */
2841                   /* +1 in case of mbcset->nmbchars is 0.  */
2842                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
2843                   /* Use realloc since array is NULL if *alloc == 0.  */
2844                   mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2845                                                 mbchar_alloc);
2846                   if (BE (mbcset->mbchars == NULL, 0))
2847                     goto parse_bracket_exp_espace;
2848                 }
2849               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2850               break;
2851 #endif /* RE_ENABLE_I18N */
2852             case EQUIV_CLASS:
2853               *err = build_equiv_class (sbcset,
2854 #ifdef RE_ENABLE_I18N
2855                                         mbcset, &equiv_class_alloc,
2856 #endif /* RE_ENABLE_I18N */
2857                                         start_elem.opr.name);
2858               if (BE (*err != REG_NOERROR, 0))
2859                 goto parse_bracket_exp_free_return;
2860               break;
2861             case COLL_SYM:
2862               *err = build_collating_symbol (sbcset,
2863 #ifdef RE_ENABLE_I18N
2864                                              mbcset, &coll_sym_alloc,
2865 #endif /* RE_ENABLE_I18N */
2866                                              start_elem.opr.name);
2867               if (BE (*err != REG_NOERROR, 0))
2868                 goto parse_bracket_exp_free_return;
2869               break;
2870             case CHAR_CLASS:
2871               ret = build_charclass (sbcset,
2872 #ifdef RE_ENABLE_I18N
2873                                      mbcset, &char_class_alloc,
2874 #endif /* RE_ENABLE_I18N */
2875                                      start_elem.opr.name, syntax);
2876               if (BE (ret != REG_NOERROR, 0))
2877                goto parse_bracket_exp_espace;
2878               break;
2879             default:
2880               assert (0);
2881               break;
2882             }
2883         }
2884       if (token->type == OP_CLOSE_BRACKET)
2885         break;
2886     }
2887
2888   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2889
2890   /* If it is non-matching list.  */
2891 #ifdef RE_ENABLE_I18N
2892   if (mbcset->non_match)
2893 #else /* not RE_ENABLE_I18N */
2894   if (non_match)
2895 #endif /* not RE_ENABLE_I18N */
2896     bitset_not (sbcset);
2897
2898   /* Build a tree for simple bracket.  */
2899   br_token.type = SIMPLE_BRACKET;
2900   br_token.opr.sbcset = sbcset;
2901   new_idx = re_dfa_add_node (dfa, br_token, 0);
2902   work_tree = create_tree (NULL, NULL, 0, new_idx);
2903   if (BE (new_idx == -1 || work_tree == NULL, 0))
2904     goto parse_bracket_exp_espace;
2905
2906 #ifdef RE_ENABLE_I18N
2907   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
2908       || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
2909                                                 || mbcset->non_match)))
2910     {
2911       re_token_t alt_token;
2912       bin_tree_t *mbc_tree;
2913       /* Build a tree for complex bracket.  */
2914       br_token.type = COMPLEX_BRACKET;
2915       br_token.opr.mbcset = mbcset;
2916       dfa->has_mb_node = 1;
2917       new_idx = re_dfa_add_node (dfa, br_token, 0);
2918       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
2919       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
2920         goto parse_bracket_exp_espace;
2921       /* Then join them by ALT node.  */
2922       alt_token.type = OP_ALT;
2923       new_idx = re_dfa_add_node (dfa, alt_token, 0);
2924       work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
2925       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
2926         return work_tree;
2927     }
2928   else
2929     {
2930       free_charset (mbcset);
2931       return work_tree;
2932     }
2933 #else /* not RE_ENABLE_I18N */
2934   return work_tree;
2935 #endif /* not RE_ENABLE_I18N */
2936
2937  parse_bracket_exp_espace:
2938   *err = REG_ESPACE;
2939  parse_bracket_exp_free_return:
2940   re_free (sbcset);
2941 #ifdef RE_ENABLE_I18N
2942   free_charset (mbcset);
2943 #endif /* RE_ENABLE_I18N */
2944   return NULL;
2945 }
2946
2947 /* Parse an element in the bracket expression.  */
2948
2949 static reg_errcode_t
2950 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
2951      bracket_elem_t *elem;
2952      re_string_t *regexp;
2953      re_token_t *token;
2954      int token_len;
2955      re_dfa_t *dfa;
2956      reg_syntax_t syntax;
2957 {
2958 #ifdef RE_ENABLE_I18N
2959   int cur_char_size;
2960   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
2961   if (cur_char_size > 1)
2962     {
2963       elem->type = MB_CHAR;
2964       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
2965       re_string_skip_bytes (regexp, cur_char_size);
2966       return REG_NOERROR;
2967     }
2968 #endif /* RE_ENABLE_I18N */
2969   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2970   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
2971       || token->type == OP_OPEN_EQUIV_CLASS)
2972     return parse_bracket_symbol (elem, regexp, token);
2973   elem->type = SB_CHAR;
2974   elem->opr.ch = token->opr.c;
2975   return REG_NOERROR;
2976 }
2977
2978 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
2979    such as [:<character_class>:], [.<collating_element>.], and
2980    [=<equivalent_class>=].  */
2981
2982 static reg_errcode_t
2983 parse_bracket_symbol (elem, regexp, token)
2984      bracket_elem_t *elem;
2985      re_string_t *regexp;
2986      re_token_t *token;
2987 {
2988   unsigned char ch, delim = token->opr.c;
2989   int i = 0;
2990   for (;; ++i)
2991     {
2992       if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
2993         return REG_EBRACK;
2994       if (token->type == OP_OPEN_CHAR_CLASS)
2995         ch = re_string_fetch_byte_case (regexp);
2996       else
2997         ch = re_string_fetch_byte (regexp);
2998       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
2999         break;
3000       elem->opr.name[i] = ch;
3001     }
3002   re_string_skip_bytes (regexp, 1);
3003   elem->opr.name[i] = '\0';
3004   switch (token->type)
3005     {
3006     case OP_OPEN_COLL_ELEM:
3007       elem->type = COLL_SYM;
3008       break;
3009     case OP_OPEN_EQUIV_CLASS:
3010       elem->type = EQUIV_CLASS;
3011       break;
3012     case OP_OPEN_CHAR_CLASS:
3013       elem->type = CHAR_CLASS;
3014       break;
3015     default:
3016       break;
3017     }
3018   return REG_NOERROR;
3019 }
3020
3021   /* Helper function for parse_bracket_exp.
3022      Build the equivalence class which is represented by NAME.
3023      The result are written to MBCSET and SBCSET.
3024      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3025      is a pointer argument sinse we may update it.  */
3026
3027 static reg_errcode_t
3028 #ifdef RE_ENABLE_I18N
3029 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3030      re_charset_t *mbcset;
3031      int *equiv_class_alloc;
3032 #else /* not RE_ENABLE_I18N */
3033 build_equiv_class (sbcset, name)
3034 #endif /* not RE_ENABLE_I18N */
3035      re_bitset_ptr_t sbcset;
3036      const unsigned char *name;
3037 {
3038 #if defined _LIBC && defined RE_ENABLE_I18N
3039   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3040   if (nrules != 0)
3041     {
3042       const int32_t *table, *indirect;
3043       const unsigned char *weights, *extra, *cp;
3044       unsigned char char_buf[2];
3045       int32_t idx1, idx2;
3046       unsigned int ch;
3047       size_t len;
3048       /* This #include defines a local function!  */
3049 # include <locale/weight.h>
3050       /* Calculate the index for equivalence class.  */
3051       cp = name;
3052       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3053       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3054                                                _NL_COLLATE_WEIGHTMB);
3055       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3056                                                    _NL_COLLATE_EXTRAMB);
3057       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3058                                                 _NL_COLLATE_INDIRECTMB);
3059       idx1 = findidx (&cp);
3060       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3061         /* This isn't a valid character.  */
3062         return REG_ECOLLATE;
3063
3064       /* Build single byte matcing table for this equivalence class.  */
3065       char_buf[1] = (unsigned char) '\0';
3066       len = weights[idx1];
3067       for (ch = 0; ch < SBC_MAX; ++ch)
3068         {
3069           char_buf[0] = ch;
3070           cp = char_buf;
3071           idx2 = findidx (&cp);
3072 /*
3073           idx2 = table[ch];
3074 */
3075           if (idx2 == 0)
3076             /* This isn't a valid character.  */
3077             continue;
3078           if (len == weights[idx2])
3079             {
3080               int cnt = 0;
3081               while (cnt <= len &&
3082                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3083                 ++cnt;
3084
3085               if (cnt > len)
3086                 bitset_set (sbcset, ch);
3087             }
3088         }
3089       /* Check whether the array has enough space.  */
3090       if (*equiv_class_alloc == mbcset->nequiv_classes)
3091         {
3092           /* Not enough, realloc it.  */
3093           /* +1 in case of mbcset->nequiv_classes is 0.  */
3094           *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3095           /* Use realloc since the array is NULL if *alloc == 0.  */
3096           mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3097                                               *equiv_class_alloc);
3098           if (BE (mbcset->equiv_classes == NULL, 0))
3099             return REG_ESPACE;
3100         }
3101       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3102     }
3103   else
3104 #endif /* _LIBC && RE_ENABLE_I18N */
3105     {
3106       if (BE (strlen ((const char *) name) != 1, 0))
3107         return REG_ECOLLATE;
3108       bitset_set (sbcset, *name);
3109     }
3110   return REG_NOERROR;
3111 }
3112
3113   /* Helper function for parse_bracket_exp.
3114      Build the character class which is represented by NAME.
3115      The result are written to MBCSET and SBCSET.
3116      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3117      is a pointer argument sinse we may update it.  */
3118
3119 static reg_errcode_t
3120 #ifdef RE_ENABLE_I18N
3121 build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
3122      re_charset_t *mbcset;
3123      int *char_class_alloc;
3124 #else /* not RE_ENABLE_I18N */
3125 build_charclass (sbcset, class_name, syntax)
3126 #endif /* not RE_ENABLE_I18N */
3127      re_bitset_ptr_t sbcset;
3128      const unsigned char *class_name;
3129      reg_syntax_t syntax;
3130 {
3131   int i;
3132   const char *name = (const char *) class_name;
3133
3134   /* In case of REG_ICASE "upper" and "lower" match the both of
3135      upper and lower cases.  */
3136   if ((syntax & RE_ICASE)
3137       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3138     name = "alpha";
3139
3140 #ifdef RE_ENABLE_I18N
3141   /* Check the space of the arrays.  */
3142   if (*char_class_alloc == mbcset->nchar_classes)
3143     {
3144       /* Not enough, realloc it.  */
3145       /* +1 in case of mbcset->nchar_classes is 0.  */
3146       *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3147       /* Use realloc since array is NULL if *alloc == 0.  */
3148       mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3149                                          *char_class_alloc);
3150       if (BE (mbcset->char_classes == NULL, 0))
3151         return REG_ESPACE;
3152     }
3153   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3154 #endif /* RE_ENABLE_I18N */
3155
3156 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3157     for (i = 0; i < SBC_MAX; ++i)       \
3158       {                                 \
3159         if (ctype_func (i))             \
3160           bitset_set (sbcset, i);       \
3161       }
3162
3163   if (strcmp (name, "alnum") == 0)
3164     BUILD_CHARCLASS_LOOP (isalnum)
3165   else if (strcmp (name, "cntrl") == 0)
3166     BUILD_CHARCLASS_LOOP (iscntrl)
3167   else if (strcmp (name, "lower") == 0)
3168     BUILD_CHARCLASS_LOOP (islower)
3169   else if (strcmp (name, "space") == 0)
3170     BUILD_CHARCLASS_LOOP (isspace)
3171   else if (strcmp (name, "alpha") == 0)
3172     BUILD_CHARCLASS_LOOP (isalpha)
3173   else if (strcmp (name, "digit") == 0)
3174     BUILD_CHARCLASS_LOOP (isdigit)
3175   else if (strcmp (name, "print") == 0)
3176     BUILD_CHARCLASS_LOOP (isprint)
3177   else if (strcmp (name, "upper") == 0)
3178     BUILD_CHARCLASS_LOOP (isupper)
3179   else if (strcmp (name, "blank") == 0)
3180     BUILD_CHARCLASS_LOOP (isblank)
3181   else if (strcmp (name, "graph") == 0)
3182     BUILD_CHARCLASS_LOOP (isgraph)
3183   else if (strcmp (name, "punct") == 0)
3184     BUILD_CHARCLASS_LOOP (ispunct)
3185   else if (strcmp (name, "xdigit") == 0)
3186     BUILD_CHARCLASS_LOOP (isxdigit)
3187   else
3188     return REG_ECTYPE;
3189
3190   return REG_NOERROR;
3191 }
3192
3193 static bin_tree_t *
3194 build_word_op (dfa, not, err)
3195      re_dfa_t *dfa;
3196      int not;
3197      reg_errcode_t *err;
3198 {
3199   re_bitset_ptr_t sbcset;
3200 #ifdef RE_ENABLE_I18N
3201   re_charset_t *mbcset;
3202   int alloc = 0;
3203 #else /* not RE_ENABLE_I18N */
3204   int non_match = 0;
3205 #endif /* not RE_ENABLE_I18N */
3206   reg_errcode_t ret;
3207   re_token_t br_token;
3208   bin_tree_t *tree;
3209   int new_idx;
3210
3211   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3212 #ifdef RE_ENABLE_I18N
3213   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3214 #endif /* RE_ENABLE_I18N */
3215
3216 #ifdef RE_ENABLE_I18N
3217   if (BE (sbcset == NULL || mbcset == NULL, 0))
3218 #else /* not RE_ENABLE_I18N */
3219   if (BE (sbcset == NULL, 0))
3220 #endif /* not RE_ENABLE_I18N */
3221     {
3222       *err = REG_ESPACE;
3223       return NULL;
3224     }
3225
3226   if (not)
3227     {
3228 #ifdef RE_ENABLE_I18N
3229       int i;
3230       /*
3231       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3232         bitset_set(cset->sbcset, '\0');
3233       */
3234       mbcset->non_match = 1;
3235       if (MB_CUR_MAX > 1)
3236         for (i = 0; i < SBC_MAX; ++i)
3237           if (__btowc (i) == WEOF)
3238             bitset_set (sbcset, i);
3239 #else /* not RE_ENABLE_I18N */
3240       non_match = 1;
3241 #endif /* not RE_ENABLE_I18N */
3242     }
3243
3244   /* We don't care the syntax in this case.  */
3245   ret = build_charclass (sbcset,
3246 #ifdef RE_ENABLE_I18N
3247                          mbcset, &alloc,
3248 #endif /* RE_ENABLE_I18N */
3249                          (const unsigned char *) "alpha", 0);
3250
3251   if (BE (ret != REG_NOERROR, 0))
3252     {
3253       re_free (sbcset);
3254 #ifdef RE_ENABLE_I18N
3255       free_charset (mbcset);
3256 #endif /* RE_ENABLE_I18N */
3257       *err = REG_ESPACE;
3258       return NULL;
3259     }
3260   /* \w match '_' also.  */
3261   bitset_set (sbcset, '_');
3262
3263   /* If it is non-matching list.  */
3264 #ifdef RE_ENABLE_I18N
3265   if (mbcset->non_match)
3266 #else /* not RE_ENABLE_I18N */
3267   if (non_match)
3268 #endif /* not RE_ENABLE_I18N */
3269     bitset_not (sbcset);
3270
3271   /* Build a tree for simple bracket.  */
3272   br_token.type = SIMPLE_BRACKET;
3273   br_token.opr.sbcset = sbcset;
3274   new_idx = re_dfa_add_node (dfa, br_token, 0);
3275   tree = create_tree (NULL, NULL, 0, new_idx);
3276   if (BE (new_idx == -1 || tree == NULL, 0))
3277     goto build_word_op_espace;
3278
3279 #ifdef RE_ENABLE_I18N
3280   if (MB_CUR_MAX > 1)
3281     {
3282       re_token_t alt_token;
3283       bin_tree_t *mbc_tree;
3284       /* Build a tree for complex bracket.  */
3285       br_token.type = COMPLEX_BRACKET;
3286       br_token.opr.mbcset = mbcset;
3287       dfa->has_mb_node = 1;
3288       new_idx = re_dfa_add_node (dfa, br_token, 0);
3289       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3290       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3291         goto build_word_op_espace;
3292       /* Then join them by ALT node.  */
3293       alt_token.type = OP_ALT;
3294       new_idx = re_dfa_add_node (dfa, alt_token, 0);
3295       tree = create_tree (tree, mbc_tree, 0, new_idx);
3296       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3297         return tree;
3298     }
3299   else
3300     {
3301       free_charset (mbcset);
3302       return tree;
3303     }
3304 #else /* not RE_ENABLE_I18N */
3305   return tree;
3306 #endif /* not RE_ENABLE_I18N */
3307
3308  build_word_op_espace:
3309   re_free (sbcset);
3310 #ifdef RE_ENABLE_I18N
3311   free_charset (mbcset);
3312 #endif /* RE_ENABLE_I18N */
3313   *err = REG_ESPACE;
3314   return NULL;
3315 }
3316
3317 /* This is intended for the expressions like "a{1,3}".
3318    Fetch a number from `input', and return the number.
3319    Return -1, if the number field is empty like "{,1}".
3320    Return -2, If an error is occured.  */
3321
3322 static int
3323 fetch_number (input, token, syntax)
3324      re_string_t *input;
3325      re_token_t *token;
3326      reg_syntax_t syntax;
3327 {
3328   int num = -1;
3329   unsigned char c;
3330   while (1)
3331     {
3332       *token = fetch_token (input, syntax);
3333       c = token->opr.c;
3334       if (BE (token->type == END_OF_RE, 0))
3335         return -2;
3336       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3337         break;
3338       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3339              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3340       num = (num > RE_DUP_MAX) ? -2 : num;
3341     }
3342   return num;
3343 }
3344 \f
3345 #ifdef RE_ENABLE_I18N
3346 static void
3347 free_charset (re_charset_t *cset)
3348 {
3349   re_free (cset->mbchars);
3350 # ifdef _LIBC
3351   re_free (cset->coll_syms);
3352   re_free (cset->equiv_classes);
3353   re_free (cset->range_starts);
3354   re_free (cset->range_ends);
3355 # endif
3356   re_free (cset->char_classes);
3357   re_free (cset);
3358 }
3359 #endif /* RE_ENABLE_I18N */
3360 \f
3361 /* Functions for binary tree operation.  */
3362
3363 /* Create a node of tree.
3364    Note: This function automatically free left and right if malloc fails.  */
3365
3366 static bin_tree_t *
3367 create_tree (left, right, type, index)
3368      bin_tree_t *left;
3369      bin_tree_t *right;
3370      re_token_type_t type;
3371      int index;
3372 {
3373   bin_tree_t *tree;
3374   tree = re_malloc (bin_tree_t, 1);
3375   if (BE (tree == NULL, 0))
3376     {
3377       free_bin_tree (left);
3378       free_bin_tree (right);
3379       return NULL;
3380     }
3381   tree->parent = NULL;
3382   tree->left = left;
3383   tree->right = right;
3384   tree->type = type;
3385   tree->node_idx = index;
3386   tree->first = -1;
3387   tree->next = -1;
3388   re_node_set_init_empty (&tree->eclosure);
3389
3390   if (left != NULL)
3391     left->parent = tree;
3392   if (right != NULL)
3393     right->parent = tree;
3394   return tree;
3395 }
3396
3397 /* Free the sub tree pointed by TREE.  */
3398
3399 static void
3400 free_bin_tree (tree)
3401      bin_tree_t *tree;
3402 {
3403   if (tree == NULL)
3404     return;
3405   /*re_node_set_free (&tree->eclosure);*/
3406   free_bin_tree (tree->left);
3407   free_bin_tree (tree->right);
3408   re_free (tree);
3409 }
3410
3411 /* Duplicate the node SRC, and return new node.  */
3412
3413 static bin_tree_t *
3414 duplicate_tree (src, dfa)
3415      const bin_tree_t *src;
3416      re_dfa_t *dfa;
3417 {
3418   bin_tree_t *left = NULL, *right = NULL, *new_tree;
3419   int new_node_idx;
3420   /* Since node indies must be according to Post-order of the tree,
3421      we must duplicate the left at first.  */
3422   if (src->left != NULL)
3423     {
3424       left = duplicate_tree (src->left, dfa);
3425       if (left == NULL)
3426         return NULL;
3427     }
3428
3429   /* Secondaly, duplicate the right.  */
3430   if (src->right != NULL)
3431     {
3432       right = duplicate_tree (src->right, dfa);
3433       if (right == NULL)
3434         {
3435           free_bin_tree (left);
3436           return NULL;
3437         }
3438     }
3439
3440   /* At last, duplicate itself.  */
3441   if (src->type == NON_TYPE)
3442     {
3443       new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3444       dfa->nodes[new_node_idx].duplicated = 1;
3445       if (BE (new_node_idx == -1, 0))
3446         {
3447           free_bin_tree (left);
3448           free_bin_tree (right);
3449           return NULL;
3450         }
3451     }
3452   else
3453     new_node_idx = src->type;
3454
3455   new_tree = create_tree (left, right, src->type, new_node_idx);
3456   if (BE (new_tree == NULL, 0))
3457     {
3458       free_bin_tree (left);
3459       free_bin_tree (right);
3460     }
3461   return new_tree;
3462 }