posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 #include <assert.h>
  22 #include <ctype.h>
  23 #include <limits.h>
  24 #include <locale.h>
  25 #include <stdio.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28
  29 #if defined HAVE_WCHAR_H || defined _LIBC
  30 # include <wchar.h>
  31 #endif /* HAVE_WCHAR_H || _LIBC */
  32 #if defined HAVE_WCTYPE_H || defined _LIBC
  33 # include <wctype.h>
  34 #endif /* HAVE_WCTYPE_H || _LIBC */
  35
  36 /* In case that the system doesn't have isblank().  */
  37 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
  38 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
  39 #endif
  40
  41 #ifdef _LIBC
  42 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
  43 #  define _RE_DEFINE_LOCALE_FUNCTIONS 1
  44 #   include <locale/localeinfo.h>
  45 #   include <locale/elem-hash.h>
  46 #   include <locale/coll-lookup.h>
  47 # endif
  48 #endif
  49
  50 /* This is for other GNU distributions with internationalized messages.  */
  51 #if HAVE_LIBINTL_H || defined _LIBC
  52 # include <libintl.h>
  53 # ifdef _LIBC
  54 #  undef gettext
  55 #  define gettext(msgid) \
  56   INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
  57 # endif
  58 #else
  59 # define gettext(msgid) (msgid)
  60 #endif
  61
  62 #ifndef gettext_noop
  63 /* This define is so xgettext can find the internationalizable
  64    strings.  */
  65 # define gettext_noop(String) String
  66 #endif
  67
  68 #include <regex.h>
  69 #include "regex_internal.h"
  70
  71 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  72                                           int length, reg_syntax_t syntax);
  73 static void re_compile_fastmap_iter (regex_t *bufp,
  74                                      const re_dfastate_t *init_state,
  75                                      char *fastmap);
  76 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  77 static reg_errcode_t init_word_char (re_dfa_t *dfa);
  78 #ifdef RE_ENABLE_I18N
  79 static void free_charset (re_charset_t *cset);
  80 #endif /* RE_ENABLE_I18N */
  81 static void free_workarea_compile (regex_t *preg);
  82 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  83 static reg_errcode_t analyze (re_dfa_t *dfa);
  84 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
  85 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
  86 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
  87 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
  88 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
  89                                              int top_clone_node, int root_node,
  90                                              unsigned int constraint);
  91 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
  92                                      unsigned int constraint);
  93 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  94 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  95                                          int node, int root);
  96 static void calc_inveclosure (re_dfa_t *dfa);
  97 static int fetch_number (re_string_t *input, re_token_t *token,
  98                          reg_syntax_t syntax);
  99 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
 100 static int peek_token (re_token_t *token, re_string_t *input,
 101                         reg_syntax_t syntax);
 102 static int peek_token_bracket (re_token_t *token, re_string_t *input,
 103                                reg_syntax_t syntax);
 104 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
 105                           reg_syntax_t syntax, reg_errcode_t *err);
 106 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
 107                                   re_token_t *token, reg_syntax_t syntax,
 108                                   int nest, reg_errcode_t *err);
 109 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
 110                                  re_token_t *token, reg_syntax_t syntax,
 111                                  int nest, reg_errcode_t *err);
 112 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
 113                                      re_token_t *token, reg_syntax_t syntax,
 114                                      int nest, reg_errcode_t *err);
 115 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
 116                                   re_token_t *token, reg_syntax_t syntax,
 117                                   int nest, reg_errcode_t *err);
 118 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
 119                                  re_dfa_t *dfa, re_token_t *token,
 120                                  reg_syntax_t syntax, reg_errcode_t *err);
 121 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
 122                                       re_token_t *token, reg_syntax_t syntax,
 123                                       reg_errcode_t *err);
 124 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
 125                                             re_string_t *regexp,
 126                                             re_token_t *token, int token_len,
 127                                             re_dfa_t *dfa,
 128                                             reg_syntax_t syntax);
 129 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
 130                                           re_string_t *regexp,
 131                                           re_token_t *token);
 132 #ifndef _LIBC
 133 # ifdef RE_ENABLE_I18N
 134 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 135                                       re_charset_t *mbcset, int *range_alloc,
 136                                       bracket_elem_t *start_elem,
 137                                       bracket_elem_t *end_elem);
 138 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 139                                              re_charset_t *mbcset,
 140                                              int *coll_sym_alloc,
 141                                              const unsigned char *name);
 142 # else /* not RE_ENABLE_I18N */
 143 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 144                                       bracket_elem_t *start_elem,
 145                                       bracket_elem_t *end_elem);
 146 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 147                                              const unsigned char *name);
 148 # endif /* not RE_ENABLE_I18N */
 149 #endif /* not _LIBC */
 150 #ifdef RE_ENABLE_I18N
 151 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 152                                         re_charset_t *mbcset,
 153                                         int *equiv_class_alloc,
 154                                         const unsigned char *name);
 155 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
 156                                       re_charset_t *mbcset,
 157                                       int *char_class_alloc,
 158                                       const unsigned char *class_name,
 159                                       reg_syntax_t syntax);
 160 #else  /* not RE_ENABLE_I18N */
 161 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 162                                         const unsigned char *name);
 163 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
 164                                       const unsigned char *class_name,
 165                                       reg_syntax_t syntax);
 166 #endif /* not RE_ENABLE_I18N */
 167 static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
 168 static void free_bin_tree (bin_tree_t *tree);
 169 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
 170                                 re_token_type_t type, int index);
 171 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 172 \f
 173 /* This table gives an error message for each of the error codes listed
 174    in regex.h.  Obviously the order here has to be same as there.
 175    POSIX doesn't require that we do anything for REG_NOERROR,
 176    but why not be nice?  */
 177
 178 const char __re_error_msgid[] attribute_hidden =
 179   {
 180 #define REG_NOERROR_IDX 0
 181     gettext_noop ("Success")    /* REG_NOERROR */
 182     "\0"
 183 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 184     gettext_noop ("No match")   /* REG_NOMATCH */
 185     "\0"
 186 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 187     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 188     "\0"
 189 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 190     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 191     "\0"
 192 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 193     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 194     "\0"
 195 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 196     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 197     "\0"
 198 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 199     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 200     "\0"
 201 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 202     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 203     "\0"
 204 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 205     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 206     "\0"
 207 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 208     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 209     "\0"
 210 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 211     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 212     "\0"
 213 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 214     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 215     "\0"
 216 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 217     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 218     "\0"
 219 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 220     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 221     "\0"
 222 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 223     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 224     "\0"
 225 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 226     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 227     "\0"
 228 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 229     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 230   };
 231
 232 const size_t __re_error_msgid_idx[] attribute_hidden =
 233   {
 234     REG_NOERROR_IDX,
 235     REG_NOMATCH_IDX,
 236     REG_BADPAT_IDX,
 237     REG_ECOLLATE_IDX,
 238     REG_ECTYPE_IDX,
 239     REG_EESCAPE_IDX,
 240     REG_ESUBREG_IDX,
 241     REG_EBRACK_IDX,
 242     REG_EPAREN_IDX,
 243     REG_EBRACE_IDX,
 244     REG_BADBR_IDX,
 245     REG_ERANGE_IDX,
 246     REG_ESPACE_IDX,
 247     REG_BADRPT_IDX,
 248     REG_EEND_IDX,
 249     REG_ESIZE_IDX,
 250     REG_ERPAREN_IDX
 251   };
 252 \f
 253 /* Entry points for GNU code.  */
 254
 255 /* re_compile_pattern is the GNU regular expression compiler: it
 256    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 257    Returns 0 if the pattern was valid, otherwise an error string.
 258
 259    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 260    are set in BUFP on entry.  */
 261
 262 const char *
 263 re_compile_pattern (pattern, length, bufp)
 264     const char *pattern;
 265     size_t length;
 266     struct re_pattern_buffer *bufp;
 267 {
 268   reg_errcode_t ret;
 269
 270   /* GNU code is written to assume at least RE_NREGS registers will be set
 271      (and at least one extra will be -1).  */
 272   bufp->regs_allocated = REGS_UNALLOCATED;
 273
 274   /* And GNU code determines whether or not to get register information
 275      by passing null for the REGS argument to re_match, etc., not by
 276      setting no_sub.  */
 277   bufp->no_sub = 0;
 278
 279   /* Match anchors at newline.  */
 280   bufp->newline_anchor = 1;
 281
 282   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 283
 284   if (!ret)
 285     return NULL;
 286   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 287 }
 288 #ifdef _LIBC
 289 weak_alias (__re_compile_pattern, re_compile_pattern)
 290 #endif
 291
 292 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 293    also be assigned to arbitrarily: each pattern buffer stores its own
 294    syntax, so it can be changed between regex compilations.  */
 295 /* This has no initializer because initialized variables in Emacs
 296    become read-only after dumping.  */
 297 reg_syntax_t re_syntax_options;
 298
 299
 300 /* Specify the precise syntax of regexps for compilation.  This provides
 301    for compatibility for various utilities which historically have
 302    different, incompatible syntaxes.
 303
 304    The argument SYNTAX is a bit mask comprised of the various bits
 305    defined in regex.h.  We return the old syntax.  */
 306
 307 reg_syntax_t
 308 re_set_syntax (syntax)
 309     reg_syntax_t syntax;
 310 {
 311   reg_syntax_t ret = re_syntax_options;
 312
 313   re_syntax_options = syntax;
 314   return ret;
 315 }
 316 #ifdef _LIBC
 317 weak_alias (__re_set_syntax, re_set_syntax)
 318 #endif
 319
 320 int
 321 re_compile_fastmap (bufp)
 322     struct re_pattern_buffer *bufp;
 323 {
 324   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 325   char *fastmap = bufp->fastmap;
 326
 327   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 328   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 329   if (dfa->init_state != dfa->init_state_word)
 330     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 331   if (dfa->init_state != dfa->init_state_nl)
 332     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 333   if (dfa->init_state != dfa->init_state_begbuf)
 334     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 335   bufp->fastmap_accurate = 1;
 336   return 0;
 337 }
 338 #ifdef _LIBC
 339 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 340 #endif
 341
 342 /* Helper function for re_compile_fastmap.
 343    Compile fastmap for the initial_state INIT_STATE.  */
 344
 345 static void
 346 re_compile_fastmap_iter (bufp, init_state, fastmap)
 347      regex_t *bufp;
 348      const re_dfastate_t *init_state;
 349      char *fastmap;
 350 {
 351   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 352   int node_cnt;
 353   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 354     {
 355       int node = init_state->nodes.elems[node_cnt];
 356       re_token_type_t type = dfa->nodes[node].type;
 357
 358       if (type == CHARACTER)
 359         fastmap[dfa->nodes[node].opr.c] = 1;
 360       else if (type == SIMPLE_BRACKET)
 361         {
 362           int i, j, ch;
 363           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 364             for (j = 0; j < UINT_BITS; ++j, ++ch)
 365               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 366                 fastmap[ch] = 1;
 367         }
 368 #ifdef RE_ENABLE_I18N
 369       else if (type == COMPLEX_BRACKET)
 370         {
 371           int i;
 372           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 373           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 374               || cset->nranges || cset->nchar_classes)
 375             {
 376 # ifdef _LIBC
 377               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 378                 {
 379                   /* In this case we want to catch the bytes which are
 380                      the first byte of any collation elements.
 381                      e.g. In da_DK, we want to catch 'a' since "aa"
 382                           is a valid collation element, and don't catch
 383                           'b' since 'b' is the only collation element
 384                           which starts from 'b'.  */
 385                   int j, ch;
 386                   const int32_t *table = (const int32_t *)
 387                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 388                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 389                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 390                       if (table[ch] < 0)
 391                         fastmap[ch] = 1;
 392                 }
 393 # else
 394               if (MB_CUR_MAX > 1)
 395                 for (i = 0; i < SBC_MAX; ++i)
 396                   if (__btowc (i) == WEOF)
 397                     fastmap[i] = 1;
 398 # endif /* not _LIBC */
 399             }
 400           for (i = 0; i < cset->nmbchars; ++i)
 401             {
 402               char buf[256];
 403               wctomb (buf, cset->mbchars[i]);
 404               fastmap[*(unsigned char *) buf] = 1;
 405             }
 406         }
 407 #endif /* RE_ENABLE_I18N */
 408       else if (type == END_OF_RE || type == OP_PERIOD
 409 #ifdef RE_ENABLE_I18N
 410                || type == COMPLEX_BRACKET
 411 #endif /* RE_ENABLE_I18N */
 412               )
 413         {
 414           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 415           if (type == END_OF_RE)
 416             bufp->can_be_null = 1;
 417           return;
 418         }
 419     }
 420 }
 421 \f
 422 /* Entry point for POSIX code.  */
 423 /* regcomp takes a regular expression as a string and compiles it.
 424
 425    PREG is a regex_t *.  We do not expect any fields to be initialized,
 426    since POSIX says we shouldn't.  Thus, we set
 427
 428      `buffer' to the compiled pattern;
 429      `used' to the length of the compiled pattern;
 430      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 431        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 432        RE_SYNTAX_POSIX_BASIC;
 433      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 434      `fastmap' to an allocated space for the fastmap;
 435      `fastmap_accurate' to zero;
 436      `re_nsub' to the number of subexpressions in PATTERN.
 437
 438    PATTERN is the address of the pattern string.
 439
 440    CFLAGS is a series of bits which affect compilation.
 441
 442      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 443      use POSIX basic syntax.
 444
 445      If REG_NEWLINE is set, then . and [^...] don't match newline.
 446      Also, regexec will try a match beginning after every newline.
 447
 448      If REG_ICASE is set, then we considers upper- and lowercase
 449      versions of letters to be equivalent when matching.
 450
 451      If REG_NOSUB is set, then when PREG is passed to regexec, that
 452      routine will report only success or failure, and nothing about the
 453      registers.
 454
 455    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 456    the return codes and their meanings.)  */
 457
 458 int
 459 regcomp (preg, pattern, cflags)
 460     regex_t *__restrict preg;
 461     const char *__restrict pattern;
 462     int cflags;
 463 {
 464   reg_errcode_t ret;
 465   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 466                          : RE_SYNTAX_POSIX_BASIC);
 467
 468   preg->buffer = NULL;
 469   preg->allocated = 0;
 470   preg->used = 0;
 471
 472   /* Try to allocate space for the fastmap.  */
 473   preg->fastmap = re_malloc (char, SBC_MAX);
 474   if (BE (preg->fastmap == NULL, 0))
 475     return REG_ESPACE;
 476
 477   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 478
 479   /* If REG_NEWLINE is set, newlines are treated differently.  */
 480   if (cflags & REG_NEWLINE)
 481     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 482       syntax &= ~RE_DOT_NEWLINE;
 483       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 484       /* It also changes the matching behavior.  */
 485       preg->newline_anchor = 1;
 486     }
 487   else
 488     preg->newline_anchor = 0;
 489   preg->no_sub = !!(cflags & REG_NOSUB);
 490   preg->translate = NULL;
 491
 492   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 493
 494   /* POSIX doesn't distinguish between an unmatched open-group and an
 495      unmatched close-group: both are REG_EPAREN.  */
 496   if (ret == REG_ERPAREN)
 497     ret = REG_EPAREN;
 498
 499   /* We have already checked preg->fastmap != NULL.  */
 500   if (BE (ret == REG_NOERROR, 1))
 501     {
 502       /* Compute the fastmap now, since regexec cannot modify the pattern
 503          buffer.  */
 504       if (BE (re_compile_fastmap (preg) == -2, 0))
 505         {
 506           /* Some error occurred while computing the fastmap, just forget
 507              about it.  */
 508           re_free (preg->fastmap);
 509           preg->fastmap = NULL;
 510         }
 511     }
 512
 513   return (int) ret;
 514 }
 515 #ifdef _LIBC
 516 weak_alias (__regcomp, regcomp)
 517 #endif
 518
 519 /* Returns a message corresponding to an error code, ERRCODE, returned
 520    from either regcomp or regexec.   We don't use PREG here.  */
 521
 522 size_t
 523 regerror (errcode, preg, errbuf, errbuf_size)
 524     int errcode;
 525     const regex_t *preg;
 526     char *errbuf;
 527     size_t errbuf_size;
 528 {
 529   const char *msg;
 530   size_t msg_size;
 531
 532   if (BE (errcode < 0
 533           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 534                                / sizeof (__re_error_msgid_idx[0])), 0))
 535     /* Only error codes returned by the rest of the code should be passed
 536        to this routine.  If we are given anything else, or if other regex
 537        code generates an invalid error code, then the program has a bug.
 538        Dump core so we can fix it.  */
 539     abort ();
 540
 541   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 542
 543   msg_size = strlen (msg) + 1; /* Includes the null.  */
 544
 545   if (BE (errbuf_size != 0, 1))
 546     {
 547       if (BE (msg_size > errbuf_size, 0))
 548         {
 549 #if defined HAVE_MEMPCPY || defined _LIBC
 550           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 551 #else
 552           memcpy (errbuf, msg, errbuf_size - 1);
 553           errbuf[errbuf_size - 1] = 0;
 554 #endif
 555         }
 556       else
 557         memcpy (errbuf, msg, msg_size);
 558     }
 559
 560   return msg_size;
 561 }
 562 #ifdef _LIBC
 563 weak_alias (__regerror, regerror)
 564 #endif
 565
 566 /* Free dynamically allocated space used by PREG.  */
 567
 568 void
 569 regfree (preg)
 570     regex_t *preg;
 571 {
 572   int i, j;
 573   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 574   if (BE (dfa != NULL, 1))
 575     {
 576       re_free (dfa->subexps);
 577
 578       for (i = 0; i < dfa->nodes_len; ++i)
 579         {
 580           re_token_t *node = dfa->nodes + i;
 581 #ifdef RE_ENABLE_I18N
 582           if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
 583             free_charset (node->opr.mbcset);
 584           else
 585 #endif /* RE_ENABLE_I18N */
 586           if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
 587             re_free (node->opr.sbcset);
 588         }
 589       re_free (dfa->nexts);
 590       for (i = 0; i < dfa->nodes_len; ++i)
 591         {
 592           if (dfa->eclosures != NULL)
 593             re_node_set_free (dfa->eclosures + i);
 594           if (dfa->inveclosures != NULL)
 595             re_node_set_free (dfa->inveclosures + i);
 596           if (dfa->edests != NULL)
 597             re_node_set_free (dfa->edests + i);
 598         }
 599       re_free (dfa->edests);
 600       re_free (dfa->eclosures);
 601       re_free (dfa->inveclosures);
 602       re_free (dfa->nodes);
 603
 604       for (i = 0; i <= dfa->state_hash_mask; ++i)
 605         {
 606           struct re_state_table_entry *entry = dfa->state_table + i;
 607           for (j = 0; j < entry->num; ++j)
 608             {
 609               re_dfastate_t *state = entry->array[j];
 610               if (state->entrance_nodes != &state->nodes)
 611                 {
 612                   re_node_set_free (state->entrance_nodes);
 613                   re_free (state->entrance_nodes);
 614                 }
 615               re_node_set_free (&state->nodes);
 616               re_free (state->trtable);
 617               re_free (state->trtable_search);
 618               re_free (state);
 619             }
 620           re_free (entry->array);
 621         }
 622       re_free (dfa->state_table);
 623
 624       if (dfa->word_char != NULL)
 625         re_free (dfa->word_char);
 626 #ifdef DEBUG
 627       re_free (dfa->re_str);
 628 #endif
 629       re_free (dfa);
 630     }
 631   re_free (preg->fastmap);
 632 }
 633 #ifdef _LIBC
 634 weak_alias (__regfree, regfree)
 635 #endif
 636 \f
 637 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 638    them unless specifically requested.  */
 639
 640 #if defined _REGEX_RE_COMP || defined _LIBC
 641
 642 /* BSD has one and only one pattern buffer.  */
 643 static struct re_pattern_buffer re_comp_buf;
 644
 645 char *
 646 # ifdef _LIBC
 647 /* Make these definitions weak in libc, so POSIX programs can redefine
 648    these names if they don't use our functions, and still use
 649    regcomp/regexec above without link errors.  */
 650 weak_function
 651 # endif
 652 re_comp (s)
 653      const char *s;
 654 {
 655   reg_errcode_t ret;
 656   char *fastmap;
 657
 658   if (!s)
 659     {
 660       if (!re_comp_buf.buffer)
 661         return gettext ("No previous regular expression");
 662       return 0;
 663     }
 664
 665   if (re_comp_buf.buffer)
 666     {
 667       fastmap = re_comp_buf.fastmap;
 668       re_comp_buf.fastmap = NULL;
 669       __regfree (&re_comp_buf);
 670       re_comp_buf.buffer = NULL;
 671       re_comp_buf.allocated = 0;
 672       re_comp_buf.fastmap = fastmap;
 673     }
 674
 675   if (re_comp_buf.fastmap == NULL)
 676     {
 677       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 678       if (re_comp_buf.fastmap == NULL)
 679         return (char *) gettext (__re_error_msgid
 680                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 681     }
 682
 683   /* Since `re_exec' always passes NULL for the `regs' argument, we
 684      don't need to initialize the pattern buffer fields which affect it.  */
 685
 686   /* Match anchors at newlines.  */
 687   re_comp_buf.newline_anchor = 1;
 688
 689   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 690
 691   if (!ret)
 692     return NULL;
 693
 694   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 695   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 696 }
 697
 698 #ifdef _LIBC
 699 static void __attribute__ ((unused))
 700 free_mem (void)
 701 {
 702   __regfree (&re_comp_buf);
 703 }
 704 text_set_element (__libc_subfreeres, free_mem);
 705 #endif
 706
 707 #endif /* _REGEX_RE_COMP */
 708 \f
 709 /* Internal entry point.
 710    Compile the regular expression PATTERN, whose length is LENGTH.
 711    SYNTAX indicate regular expression's syntax.  */
 712
 713 static reg_errcode_t
 714 re_compile_internal (preg, pattern, length, syntax)
 715      regex_t *preg;
 716      const char * pattern;
 717      int length;
 718      reg_syntax_t syntax;
 719 {
 720   reg_errcode_t err = REG_NOERROR;
 721   re_dfa_t *dfa;
 722   re_string_t regexp;
 723
 724   /* Initialize the pattern buffer.  */
 725   preg->fastmap_accurate = 0;
 726   preg->syntax = syntax;
 727   preg->not_bol = preg->not_eol = 0;
 728   preg->used = 0;
 729   preg->re_nsub = 0;
 730
 731   /* Initialize the dfa.  */
 732   dfa = (re_dfa_t *) preg->buffer;
 733   if (preg->allocated < sizeof (re_dfa_t))
 734     {
 735       /* If zero allocated, but buffer is non-null, try to realloc
 736          enough space.  This loses if buffer's address is bogus, but
 737          that is the user's responsibility.  If ->buffer is NULL this
 738          is a simple allocation.  */
 739       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 740       if (dfa == NULL)
 741         return REG_ESPACE;
 742       preg->allocated = sizeof (re_dfa_t);
 743     }
 744   preg->buffer = (unsigned char *) dfa;
 745   preg->used = sizeof (re_dfa_t);
 746
 747   err = init_dfa (dfa, length);
 748   if (BE (err != REG_NOERROR, 0))
 749     {
 750       re_free (dfa);
 751       preg->buffer = NULL;
 752       return err;
 753     }
 754 #ifdef DEBUG
 755   dfa->re_str = re_malloc (char, length + 1);
 756   strncpy (dfa->re_str, pattern, length + 1);
 757 #endif
 758
 759   err = re_string_construct (&regexp, pattern, length, preg->translate,
 760                              syntax & RE_ICASE);
 761   if (BE (err != REG_NOERROR, 0))
 762     {
 763       re_free (dfa);
 764       preg->buffer = NULL;
 765       return err;
 766     }
 767
 768   /* Parse the regular expression, and build a structure tree.  */
 769   preg->re_nsub = 0;
 770   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 771   if (BE (dfa->str_tree == NULL, 0))
 772     goto re_compile_internal_free_return;
 773
 774   /* Analyze the tree and collect information which is necessary to
 775      create the dfa.  */
 776   err = analyze (dfa);
 777   if (BE (err != REG_NOERROR, 0))
 778     goto re_compile_internal_free_return;
 779
 780   /* Then create the initial state of the dfa.  */
 781   err = create_initial_state (dfa);
 782   if (BE (err != REG_NOERROR, 0))
 783     goto re_compile_internal_free_return;
 784
 785  re_compile_internal_free_return:
 786   /* Release work areas.  */
 787   free_workarea_compile (preg);
 788   re_string_destruct (&regexp);
 789
 790   return err;
 791 }
 792
 793 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 794    as the initial length of some arrays.  */
 795
 796 static reg_errcode_t
 797 init_dfa (dfa, pat_len)
 798      re_dfa_t *dfa;
 799      int pat_len;
 800 {
 801   int table_size;
 802
 803   memset (dfa, '\0', sizeof (re_dfa_t));
 804
 805   dfa->nodes_alloc = pat_len + 1;
 806   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 807
 808   dfa->states_alloc = pat_len + 1;
 809
 810   /*  table_size = 2 ^ ceil(log pat_len) */
 811   for (table_size = 1; table_size > 0; table_size <<= 1)
 812     if (table_size > pat_len)
 813       break;
 814
 815   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 816   dfa->state_hash_mask = table_size - 1;
 817
 818   dfa->subexps_alloc = 1;
 819   dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
 820   dfa->word_char = NULL;
 821
 822   if (BE (dfa->nodes == NULL || dfa->state_table == NULL
 823           || dfa->subexps == NULL, 0))
 824     {
 825       /* We don't bother to free anything which was allocated.  Very
 826          soon the process will go down anyway.  */
 827       dfa->subexps = NULL;
 828       dfa->state_table = NULL;
 829       dfa->nodes = NULL;
 830       return REG_ESPACE;
 831     }
 832   return REG_NOERROR;
 833 }
 834
 835 /* Initialize WORD_CHAR table, which indicate which character is
 836    "word".  In this case "word" means that it is the word construction
 837    character used by some operators like "\<", "\>", etc.  */
 838
 839 static reg_errcode_t
 840 init_word_char (dfa)
 841      re_dfa_t *dfa;
 842 {
 843   int i, j, ch;
 844   dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 845   if (BE (dfa->word_char == NULL, 0))
 846     return REG_ESPACE;
 847   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 848     for (j = 0; j < UINT_BITS; ++j, ++ch)
 849       if (isalnum (ch) || ch == '_')
 850         dfa->word_char[i] |= 1 << j;
 851   return REG_NOERROR;
 852 }
 853
 854 /* Free the work area which are only used while compiling.  */
 855
 856 static void
 857 free_workarea_compile (preg)
 858      regex_t *preg;
 859 {
 860   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 861   free_bin_tree (dfa->str_tree);
 862   dfa->str_tree = NULL;
 863 }
 864
 865 /* Create initial states for all contexts.  */
 866
 867 static reg_errcode_t
 868 create_initial_state (dfa)
 869      re_dfa_t *dfa;
 870 {
 871   int first, i;
 872   reg_errcode_t err;
 873   re_node_set init_nodes;
 874
 875   /* Initial states have the epsilon closure of the node which is
 876      the first node of the regular expression.  */
 877   first = dfa->str_tree->first;
 878   dfa->init_node = first;
 879   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 880   if (BE (err != REG_NOERROR, 0))
 881     return err;
 882
 883   /* The back-references which are in initial states can epsilon transit,
 884      since in this case all of the subexpressions can be null.
 885      Then we add epsilon closures of the nodes which are the next nodes of
 886      the back-references.  */
 887   if (dfa->nbackref > 0)
 888     for (i = 0; i < init_nodes.nelem; ++i)
 889       {
 890         int node_idx = init_nodes.elems[i];
 891         re_token_type_t type = dfa->nodes[node_idx].type;
 892
 893         int clexp_idx;
 894         if (type != OP_BACK_REF)
 895           continue;
 896         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 897           {
 898             re_token_t *clexp_node;
 899             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 900             if (clexp_node->type == OP_CLOSE_SUBEXP
 901                 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
 902               break;
 903           }
 904         if (clexp_idx == init_nodes.nelem)
 905           continue;
 906
 907         if (type == OP_BACK_REF)
 908           {
 909             int dest_idx = dfa->edests[node_idx].elems[0];
 910             if (!re_node_set_contains (&init_nodes, dest_idx))
 911               {
 912                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
 913                 i = 0;
 914               }
 915           }
 916       }
 917
 918   /* It must be the first time to invoke acquire_state.  */
 919   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
 920   /* We don't check ERR here, since the initial state must not be NULL.  */
 921   if (BE (dfa->init_state == NULL, 0))
 922     return err;
 923   if (dfa->init_state->has_constraint)
 924     {
 925       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
 926                                                        CONTEXT_WORD);
 927       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
 928                                                      CONTEXT_NEWLINE);
 929       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
 930                                                          &init_nodes,
 931                                                          CONTEXT_NEWLINE
 932                                                          | CONTEXT_BEGBUF);
 933       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 934               || dfa->init_state_begbuf == NULL, 0))
 935         return err;
 936     }
 937   else
 938     dfa->init_state_word = dfa->init_state_nl
 939       = dfa->init_state_begbuf = dfa->init_state;
 940
 941   re_node_set_free (&init_nodes);
 942   return REG_NOERROR;
 943 }
 944 \f
 945 /* Analyze the structure tree, and calculate "first", "next", "edest",
 946    "eclosure", and "inveclosure".  */
 947
 948 static reg_errcode_t
 949 analyze (dfa)
 950      re_dfa_t *dfa;
 951 {
 952   int i;
 953   reg_errcode_t ret;
 954
 955   /* Allocate arrays.  */
 956   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
 957   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
 958   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
 959   dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
 960   if (BE (dfa->nexts == NULL || dfa->edests == NULL
 961           || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
 962     return REG_ESPACE;
 963   /* Initialize them.  */
 964   for (i = 0; i < dfa->nodes_len; ++i)
 965     {
 966       dfa->nexts[i] = -1;
 967       re_node_set_init_empty (dfa->edests + i);
 968       re_node_set_init_empty (dfa->eclosures + i);
 969       re_node_set_init_empty (dfa->inveclosures + i);
 970     }
 971
 972   ret = analyze_tree (dfa, dfa->str_tree);
 973   if (BE (ret == REG_NOERROR, 1))
 974     {
 975       ret = calc_eclosure (dfa);
 976       if (ret == REG_NOERROR)
 977         calc_inveclosure (dfa);
 978     }
 979   return ret;
 980 }
 981
 982 /* Helper functions for analyze.
 983    This function calculate "first", "next", and "edest" for the subtree
 984    whose root is NODE.  */
 985
 986 static reg_errcode_t
 987 analyze_tree (dfa, node)
 988      re_dfa_t *dfa;
 989      bin_tree_t *node;
 990 {
 991   reg_errcode_t ret;
 992   if (node->first == -1)
 993     calc_first (dfa, node);
 994   if (node->next == -1)
 995     calc_next (dfa, node);
 996   if (node->eclosure.nelem == 0)
 997     calc_epsdest (dfa, node);
 998   /* Calculate "first" etc. for the left child.  */
 999   if (node->left != NULL)
1000     {
1001       ret = analyze_tree (dfa, node->left);
1002       if (BE (ret != REG_NOERROR, 0))
1003         return ret;
1004     }
1005   /* Calculate "first" etc. for the right child.  */
1006   if (node->right != NULL)
1007     {
1008       ret = analyze_tree (dfa, node->right);
1009       if (BE (ret != REG_NOERROR, 0))
1010         return ret;
1011     }
1012   return REG_NOERROR;
1013 }
1014
1015 /* Calculate "first" for the node NODE.  */
1016 static void
1017 calc_first (dfa, node)
1018      re_dfa_t *dfa;
1019      bin_tree_t *node;
1020 {
1021   int idx, type;
1022   idx = node->node_idx;
1023   type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1024
1025   switch (type)
1026     {
1027 #ifdef DEBUG
1028     case OP_OPEN_BRACKET:
1029     case OP_CLOSE_BRACKET:
1030     case OP_OPEN_DUP_NUM:
1031     case OP_CLOSE_DUP_NUM:
1032     case OP_NON_MATCH_LIST:
1033     case OP_OPEN_COLL_ELEM:
1034     case OP_CLOSE_COLL_ELEM:
1035     case OP_OPEN_EQUIV_CLASS:
1036     case OP_CLOSE_EQUIV_CLASS:
1037     case OP_OPEN_CHAR_CLASS:
1038     case OP_CLOSE_CHAR_CLASS:
1039       /* These must not be appeared here.  */
1040       assert (0);
1041 #endif
1042     case END_OF_RE:
1043     case CHARACTER:
1044     case OP_PERIOD:
1045     case OP_DUP_ASTERISK:
1046     case OP_DUP_QUESTION:
1047 #ifdef RE_ENABLE_I18N
1048     case COMPLEX_BRACKET:
1049 #endif /* RE_ENABLE_I18N */
1050     case SIMPLE_BRACKET:
1051     case OP_BACK_REF:
1052     case ANCHOR:
1053     case OP_OPEN_SUBEXP:
1054     case OP_CLOSE_SUBEXP:
1055       node->first = idx;
1056       break;
1057     case OP_DUP_PLUS:
1058 #ifdef DEBUG
1059       assert (node->left != NULL);
1060 #endif
1061       if (node->left->first == -1)
1062         calc_first (dfa, node->left);
1063       node->first = node->left->first;
1064       break;
1065     case OP_ALT:
1066       node->first = idx;
1067       break;
1068       /* else fall through */
1069     default:
1070 #ifdef DEBUG
1071       assert (node->left != NULL);
1072 #endif
1073       if (node->left->first == -1)
1074         calc_first (dfa, node->left);
1075       node->first = node->left->first;
1076       break;
1077     }
1078 }
1079
1080 /* Calculate "next" for the node NODE.  */
1081
1082 static void
1083 calc_next (dfa, node)
1084      re_dfa_t *dfa;
1085      bin_tree_t *node;
1086 {
1087   int idx, type;
1088   bin_tree_t *parent = node->parent;
1089   if (parent == NULL)
1090     {
1091       node->next = -1;
1092       idx = node->node_idx;
1093       if (node->type == 0)
1094         dfa->nexts[idx] = node->next;
1095       return;
1096     }
1097
1098   idx = parent->node_idx;
1099   type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1100
1101   switch (type)
1102     {
1103     case OP_DUP_ASTERISK:
1104     case OP_DUP_PLUS:
1105       node->next = idx;
1106       break;
1107     case CONCAT:
1108       if (parent->left == node)
1109         {
1110           if (parent->right->first == -1)
1111             calc_first (dfa, parent->right);
1112           node->next = parent->right->first;
1113           break;
1114         }
1115       /* else fall through */
1116     default:
1117       if (parent->next == -1)
1118         calc_next (dfa, parent);
1119       node->next = parent->next;
1120       break;
1121     }
1122   idx = node->node_idx;
1123   if (node->type == 0)
1124     dfa->nexts[idx] = node->next;
1125 }
1126
1127 /* Calculate "edest" for the node NODE.  */
1128
1129 static void
1130 calc_epsdest (dfa, node)
1131      re_dfa_t *dfa;
1132      bin_tree_t *node;
1133 {
1134   int idx;
1135   idx = node->node_idx;
1136   if (node->type == 0)
1137     {
1138       if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1139           || dfa->nodes[idx].type == OP_DUP_PLUS
1140           || dfa->nodes[idx].type == OP_DUP_QUESTION)
1141         {
1142           if (node->left->first == -1)
1143             calc_first (dfa, node->left);
1144           if (node->next == -1)
1145             calc_next (dfa, node);
1146           re_node_set_init_2 (dfa->edests + idx, node->left->first,
1147                               node->next);
1148         }
1149       else if (dfa->nodes[idx].type == OP_ALT)
1150         {
1151           int left, right;
1152           if (node->left != NULL)
1153             {
1154               if (node->left->first == -1)
1155                 calc_first (dfa, node->left);
1156               left = node->left->first;
1157             }
1158           else
1159             {
1160               if (node->next == -1)
1161                 calc_next (dfa, node);
1162               left = node->next;
1163             }
1164           if (node->right != NULL)
1165             {
1166               if (node->right->first == -1)
1167                 calc_first (dfa, node->right);
1168               right = node->right->first;
1169             }
1170           else
1171             {
1172               if (node->next == -1)
1173                 calc_next (dfa, node);
1174               right = node->next;
1175             }
1176           re_node_set_init_2 (dfa->edests + idx, left, right);
1177         }
1178       else if (dfa->nodes[idx].type == ANCHOR
1179                || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1180                || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1181                || dfa->nodes[idx].type == OP_BACK_REF)
1182         re_node_set_init_1 (dfa->edests + idx, node->next);
1183     }
1184 }
1185
1186 /* Duplicate the epsilon closure of the node ROOT_NODE.
1187    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1188    to their own constraint.  */
1189
1190 static reg_errcode_t
1191 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1192                         init_constraint)
1193      re_dfa_t *dfa;
1194      int top_org_node, top_clone_node, root_node;
1195      unsigned int init_constraint;
1196 {
1197   reg_errcode_t err;
1198   int org_node, clone_node, ret;
1199   unsigned int constraint = init_constraint;
1200   for (org_node = top_org_node, clone_node = top_clone_node;;)
1201     {
1202       int org_dest, clone_dest;
1203       if (dfa->nodes[org_node].type == OP_BACK_REF)
1204         {
1205           /* If the back reference epsilon-transit, its destination must
1206              also have the constraint.  Then duplicate the epsilon closure
1207              of the destination of the back reference, and store it in
1208              edests of the back reference.  */
1209           org_dest = dfa->nexts[org_node];
1210           re_node_set_empty (dfa->edests + clone_node);
1211           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1212           if (BE (err != REG_NOERROR, 0))
1213             return err;
1214           dfa->nexts[clone_node] = dfa->nexts[org_node];
1215           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1216           if (BE (ret < 0, 0))
1217             return REG_ESPACE;
1218         }
1219       else if (dfa->edests[org_node].nelem == 0)
1220         {
1221           /* In case of the node can't epsilon-transit, don't duplicate the
1222              destination and store the original destination as the
1223              destination of the node.  */
1224           dfa->nexts[clone_node] = dfa->nexts[org_node];
1225           break;
1226         }
1227       else if (dfa->edests[org_node].nelem == 1)
1228         {
1229           /* In case of the node can epsilon-transit, and it has only one
1230              destination.  */
1231           org_dest = dfa->edests[org_node].elems[0];
1232           re_node_set_empty (dfa->edests + clone_node);
1233           if (dfa->nodes[org_node].type == ANCHOR)
1234             {
1235               /* In case of the node has another constraint, append it.  */
1236               if (org_node == root_node && clone_node != org_node)
1237                 {
1238                   /* ...but if the node is root_node itself, it means the
1239                      epsilon closure have a loop, then tie it to the
1240                      destination of the root_node.  */
1241                   ret = re_node_set_insert (dfa->edests + clone_node,
1242                                             org_dest);
1243                   if (BE (ret < 0, 0))
1244                     return REG_ESPACE;
1245                   break;
1246                 }
1247               constraint |= dfa->nodes[org_node].opr.ctx_type;
1248             }
1249           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1250           if (BE (err != REG_NOERROR, 0))
1251             return err;
1252           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1253           if (BE (ret < 0, 0))
1254             return REG_ESPACE;
1255         }
1256       else /* dfa->edests[org_node].nelem == 2 */
1257         {
1258           /* In case of the node can epsilon-transit, and it has two
1259              destinations.  */
1260           org_dest = dfa->edests[org_node].elems[0];
1261           re_node_set_empty (dfa->edests + clone_node);
1262           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1263           if (BE (err != REG_NOERROR, 0))
1264             return err;
1265           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1266           if (BE (ret < 0, 0))
1267             return REG_ESPACE;
1268
1269           err = duplicate_node_closure (dfa, org_dest, clone_dest, root_node,
1270                                         constraint);
1271           if (BE (err != REG_NOERROR, 0))
1272             return err;
1273
1274           org_dest = dfa->edests[org_node].elems[1];
1275           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1276           if (BE (err != REG_NOERROR, 0))
1277             return err;
1278           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1279           if (BE (ret < 0, 0))
1280             return REG_ESPACE;
1281         }
1282       org_node = org_dest;
1283       clone_node = clone_dest;
1284     }
1285   return REG_NOERROR;
1286 }
1287
1288 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1289    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1290    otherwise return the error code.  */
1291
1292 static reg_errcode_t
1293 duplicate_node (new_idx, dfa, org_idx, constraint)
1294      re_dfa_t *dfa;
1295      int *new_idx, org_idx;
1296      unsigned int constraint;
1297 {
1298   re_token_t dup;
1299   int dup_idx;
1300
1301   dup = dfa->nodes[org_idx];
1302   dup_idx = re_dfa_add_node (dfa, dup, 1);
1303   if (BE (dup_idx == -1, 0))
1304     return REG_ESPACE;
1305   dfa->nodes[dup_idx].constraint = constraint;
1306   if (dfa->nodes[org_idx].type == ANCHOR)
1307     dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1308   dfa->nodes[dup_idx].duplicated = 1;
1309   re_node_set_init_empty (dfa->edests + dup_idx);
1310   re_node_set_init_empty (dfa->eclosures + dup_idx);
1311   re_node_set_init_empty (dfa->inveclosures + dup_idx);
1312
1313   *new_idx = dup_idx;
1314   return REG_NOERROR;
1315 }
1316
1317 static void
1318 calc_inveclosure (dfa)
1319      re_dfa_t *dfa;
1320 {
1321   int src, idx, dest;
1322   for (src = 0; src < dfa->nodes_len; ++src)
1323     {
1324       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1325         {
1326           dest = dfa->eclosures[src].elems[idx];
1327           re_node_set_insert (dfa->inveclosures + dest, src);
1328         }
1329     }
1330 }
1331
1332 /* Calculate "eclosure" for all the node in DFA.  */
1333
1334 static reg_errcode_t
1335 calc_eclosure (dfa)
1336      re_dfa_t *dfa;
1337 {
1338   int node_idx, incomplete;
1339 #ifdef DEBUG
1340   assert (dfa->nodes_len > 0);
1341 #endif
1342   incomplete = 0;
1343   /* For each nodes, calculate epsilon closure.  */
1344   for (node_idx = 0; ; ++node_idx)
1345     {
1346       reg_errcode_t err;
1347       re_node_set eclosure_elem;
1348       if (node_idx == dfa->nodes_len)
1349         {
1350           if (!incomplete)
1351             break;
1352           incomplete = 0;
1353           node_idx = 0;
1354         }
1355
1356 #ifdef DEBUG
1357       assert (dfa->eclosures[node_idx].nelem != -1);
1358 #endif
1359       /* If we have already calculated, skip it.  */
1360       if (dfa->eclosures[node_idx].nelem != 0)
1361         continue;
1362       /* Calculate epsilon closure of `node_idx'.  */
1363       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1364       if (BE (err != REG_NOERROR, 0))
1365         return err;
1366
1367       if (dfa->eclosures[node_idx].nelem == 0)
1368         {
1369           incomplete = 1;
1370           re_node_set_free (&eclosure_elem);
1371         }
1372     }
1373   return REG_NOERROR;
1374 }
1375
1376 /* Calculate epsilon closure of NODE.  */
1377
1378 static reg_errcode_t
1379 calc_eclosure_iter (new_set, dfa, node, root)
1380      re_node_set *new_set;
1381      re_dfa_t *dfa;
1382      int node, root;
1383 {
1384   reg_errcode_t err;
1385   unsigned int constraint;
1386   int i, incomplete;
1387   re_node_set eclosure;
1388   incomplete = 0;
1389   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1390   if (BE (err != REG_NOERROR, 0))
1391     return err;
1392
1393   /* This indicates that we are calculating this node now.
1394      We reference this value to avoid infinite loop.  */
1395   dfa->eclosures[node].nelem = -1;
1396
1397   constraint = ((dfa->nodes[node].type == ANCHOR)
1398                 ? dfa->nodes[node].opr.ctx_type : 0);
1399   /* If the current node has constraints, duplicate all nodes.
1400      Since they must inherit the constraints.  */
1401   if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1402     {
1403       int org_node, cur_node;
1404       org_node = cur_node = node;
1405       err = duplicate_node_closure (dfa, node, node, node, constraint);
1406       if (BE (err != REG_NOERROR, 0))
1407         return err;
1408     }
1409
1410   /* Expand each epsilon destination nodes.  */
1411   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1412     for (i = 0; i < dfa->edests[node].nelem; ++i)
1413       {
1414         re_node_set eclosure_elem;
1415         int edest = dfa->edests[node].elems[i];
1416         /* If calculating the epsilon closure of `edest' is in progress,
1417            return intermediate result.  */
1418         if (dfa->eclosures[edest].nelem == -1)
1419           {
1420             incomplete = 1;
1421             continue;
1422           }
1423         /* If we haven't calculated the epsilon closure of `edest' yet,
1424            calculate now. Otherwise use calculated epsilon closure.  */
1425         if (dfa->eclosures[edest].nelem == 0)
1426           {
1427             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1428             if (BE (err != REG_NOERROR, 0))
1429               return err;
1430           }
1431         else
1432           eclosure_elem = dfa->eclosures[edest];
1433         /* Merge the epsilon closure of `edest'.  */
1434         re_node_set_merge (&eclosure, &eclosure_elem);
1435         /* If the epsilon closure of `edest' is incomplete,
1436            the epsilon closure of this node is also incomplete.  */
1437         if (dfa->eclosures[edest].nelem == 0)
1438           {
1439             incomplete = 1;
1440             re_node_set_free (&eclosure_elem);
1441           }
1442       }
1443
1444   /* Epsilon closures include itself.  */
1445   re_node_set_insert (&eclosure, node);
1446   if (incomplete && !root)
1447     dfa->eclosures[node].nelem = 0;
1448   else
1449     dfa->eclosures[node] = eclosure;
1450   *new_set = eclosure;
1451   return REG_NOERROR;
1452 }
1453 \f
1454 /* Functions for token which are used in the parser.  */
1455
1456 /* Fetch a token from INPUT.
1457    We must not use this function inside bracket expressions.  */
1458
1459 static re_token_t
1460 fetch_token (input, syntax)
1461      re_string_t *input;
1462      reg_syntax_t syntax;
1463 {
1464   re_token_t token;
1465   int consumed_byte;
1466   consumed_byte = peek_token (&token, input, syntax);
1467   re_string_skip_bytes (input, consumed_byte);
1468   return token;
1469 }
1470
1471 /* Peek a token from INPUT, and return the length of the token.
1472    We must not use this function inside bracket expressions.  */
1473
1474 static int
1475 peek_token (token, input, syntax)
1476      re_token_t *token;
1477      re_string_t *input;
1478      reg_syntax_t syntax;
1479 {
1480   unsigned char c;
1481
1482   if (re_string_eoi (input))
1483     {
1484       token->type = END_OF_RE;
1485       return 0;
1486     }
1487
1488   c = re_string_peek_byte (input, 0);
1489   token->opr.c = c;
1490
1491 #ifdef RE_ENABLE_I18N
1492   token->mb_partial = 0;
1493   if (MB_CUR_MAX > 1 &&
1494       !re_string_first_byte (input, re_string_cur_idx (input)))
1495     {
1496       token->type = CHARACTER;
1497       token->mb_partial = 1;
1498       return 1;
1499     }
1500 #endif
1501   if (c == '\\')
1502     {
1503       unsigned char c2;
1504       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1505         {
1506           token->type = BACK_SLASH;
1507           return 1;
1508         }
1509
1510       c2 = re_string_peek_byte_case (input, 1);
1511       token->opr.c = c2;
1512       token->type = CHARACTER;
1513       switch (c2)
1514         {
1515         case '|':
1516           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1517             token->type = OP_ALT;
1518           break;
1519         case '1': case '2': case '3': case '4': case '5':
1520         case '6': case '7': case '8': case '9':
1521           if (!(syntax & RE_NO_BK_REFS))
1522             {
1523               token->type = OP_BACK_REF;
1524               token->opr.idx = c2 - '0';
1525             }
1526           break;
1527         case '<':
1528           if (!(syntax & RE_NO_GNU_OPS))
1529             {
1530               token->type = ANCHOR;
1531               token->opr.idx = WORD_FIRST;
1532             }
1533           break;
1534         case '>':
1535           if (!(syntax & RE_NO_GNU_OPS))
1536             {
1537               token->type = ANCHOR;
1538               token->opr.idx = WORD_LAST;
1539             }
1540           break;
1541         case 'b':
1542           if (!(syntax & RE_NO_GNU_OPS))
1543             {
1544               token->type = ANCHOR;
1545               token->opr.idx = WORD_DELIM;
1546             }
1547           break;
1548         case 'B':
1549           if (!(syntax & RE_NO_GNU_OPS))
1550             {
1551               token->type = ANCHOR;
1552               token->opr.idx = INSIDE_WORD;
1553             }
1554           break;
1555         case 'w':
1556           if (!(syntax & RE_NO_GNU_OPS))
1557             token->type = OP_WORD;
1558           break;
1559         case 'W':
1560           if (!(syntax & RE_NO_GNU_OPS))
1561             token->type = OP_NOTWORD;
1562           break;
1563         case '`':
1564           if (!(syntax & RE_NO_GNU_OPS))
1565             {
1566               token->type = ANCHOR;
1567               token->opr.idx = BUF_FIRST;
1568             }
1569           break;
1570         case '\'':
1571           if (!(syntax & RE_NO_GNU_OPS))
1572             {
1573               token->type = ANCHOR;
1574               token->opr.idx = BUF_LAST;
1575             }
1576           break;
1577         case '(':
1578           if (!(syntax & RE_NO_BK_PARENS))
1579             token->type = OP_OPEN_SUBEXP;
1580           break;
1581         case ')':
1582           if (!(syntax & RE_NO_BK_PARENS))
1583             token->type = OP_CLOSE_SUBEXP;
1584           break;
1585         case '+':
1586           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1587             token->type = OP_DUP_PLUS;
1588           break;
1589         case '?':
1590           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1591             token->type = OP_DUP_QUESTION;
1592           break;
1593         case '{':
1594           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1595             token->type = OP_OPEN_DUP_NUM;
1596           break;
1597         case '}':
1598           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1599             token->type = OP_CLOSE_DUP_NUM;
1600           break;
1601         default:
1602           break;
1603         }
1604       return 2;
1605     }
1606
1607   token->type = CHARACTER;
1608   switch (c)
1609     {
1610     case '\n':
1611       if (syntax & RE_NEWLINE_ALT)
1612         token->type = OP_ALT;
1613       break;
1614     case '|':
1615       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1616         token->type = OP_ALT;
1617       break;
1618     case '*':
1619       token->type = OP_DUP_ASTERISK;
1620       break;
1621     case '+':
1622       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1623         token->type = OP_DUP_PLUS;
1624       break;
1625     case '?':
1626       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1627         token->type = OP_DUP_QUESTION;
1628       break;
1629     case '{':
1630       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1631         token->type = OP_OPEN_DUP_NUM;
1632       break;
1633     case '}':
1634       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1635         token->type = OP_CLOSE_DUP_NUM;
1636       break;
1637     case '(':
1638       if (syntax & RE_NO_BK_PARENS)
1639         token->type = OP_OPEN_SUBEXP;
1640       break;
1641     case ')':
1642       if (syntax & RE_NO_BK_PARENS)
1643         token->type = OP_CLOSE_SUBEXP;
1644       break;
1645     case '[':
1646       token->type = OP_OPEN_BRACKET;
1647       break;
1648     case '.':
1649       token->type = OP_PERIOD;
1650       break;
1651     case '^':
1652       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1653           re_string_cur_idx (input) != 0)
1654         {
1655           char prev = re_string_peek_byte (input, -1);
1656           if (prev != '|' && prev != '(' &&
1657               (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
1658             break;
1659         }
1660       token->type = ANCHOR;
1661       token->opr.idx = LINE_FIRST;
1662       break;
1663     case '$':
1664       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1665           re_string_cur_idx (input) + 1 != re_string_length (input))
1666         {
1667           re_token_t next;
1668           re_string_skip_bytes (input, 1);
1669           peek_token (&next, input, syntax);
1670           re_string_skip_bytes (input, -1);
1671           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1672             break;
1673         }
1674       token->type = ANCHOR;
1675       token->opr.idx = LINE_LAST;
1676       break;
1677     default:
1678       break;
1679     }
1680   return 1;
1681 }
1682
1683 /* Peek a token from INPUT, and return the length of the token.
1684    We must not use this function out of bracket expressions.  */
1685
1686 static int
1687 peek_token_bracket (token, input, syntax)
1688      re_token_t *token;
1689      re_string_t *input;
1690      reg_syntax_t syntax;
1691 {
1692   unsigned char c;
1693   if (re_string_eoi (input))
1694     {
1695       token->type = END_OF_RE;
1696       return 0;
1697     }
1698   c = re_string_peek_byte (input, 0);
1699   token->opr.c = c;
1700
1701 #ifdef RE_ENABLE_I18N
1702   if (MB_CUR_MAX > 1 &&
1703       !re_string_first_byte (input, re_string_cur_idx (input)))
1704     {
1705       token->type = CHARACTER;
1706       return 1;
1707     }
1708 #endif /* RE_ENABLE_I18N */
1709
1710   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1711     {
1712       /* In this case, '\' escape a character.  */
1713       unsigned char c2;
1714       re_string_skip_bytes (input, 1);
1715       c2 = re_string_peek_byte (input, 0);
1716       token->opr.c = c2;
1717       token->type = CHARACTER;
1718       return 1;
1719     }
1720   if (c == '[') /* '[' is a special char in a bracket exps.  */
1721     {
1722       unsigned char c2;
1723       int token_len;
1724       c2 = re_string_peek_byte (input, 1);
1725       token->opr.c = c2;
1726       token_len = 2;
1727       switch (c2)
1728         {
1729         case '.':
1730           token->type = OP_OPEN_COLL_ELEM;
1731           break;
1732         case '=':
1733           token->type = OP_OPEN_EQUIV_CLASS;
1734           break;
1735         case ':':
1736           if (syntax & RE_CHAR_CLASSES)
1737             {
1738               token->type = OP_OPEN_CHAR_CLASS;
1739               break;
1740             }
1741           /* else fall through.  */
1742         default:
1743           token->type = CHARACTER;
1744           token->opr.c = c;
1745           token_len = 1;
1746           break;
1747         }
1748       return token_len;
1749     }
1750   switch (c)
1751     {
1752     case '-':
1753       token->type = OP_CHARSET_RANGE;
1754       break;
1755     case ']':
1756       token->type = OP_CLOSE_BRACKET;
1757       break;
1758     case '^':
1759       token->type = OP_NON_MATCH_LIST;
1760       break;
1761     default:
1762       token->type = CHARACTER;
1763     }
1764   return 1;
1765 }
1766 \f
1767 /* Functions for parser.  */
1768
1769 /* Entry point of the parser.
1770    Parse the regular expression REGEXP and return the structure tree.
1771    If an error is occured, ERR is set by error code, and return NULL.
1772    This function build the following tree, from regular expression <reg_exp>:
1773            CAT
1774            / \
1775           /   \
1776    <reg_exp>  EOR
1777
1778    CAT means concatenation.
1779    EOR means end of regular expression.  */
1780
1781 static bin_tree_t *
1782 parse (regexp, preg, syntax, err)
1783      re_string_t *regexp;
1784      regex_t *preg;
1785      reg_syntax_t syntax;
1786      reg_errcode_t *err;
1787 {
1788   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1789   bin_tree_t *tree, *eor, *root;
1790   re_token_t current_token;
1791   int new_idx;
1792   current_token = fetch_token (regexp, syntax);
1793   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1794   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1795     return NULL;
1796   new_idx = re_dfa_add_node (dfa, current_token, 0);
1797   eor = create_tree (NULL, NULL, 0, new_idx);
1798   if (tree != NULL)
1799     root = create_tree (tree, eor, CONCAT, 0);
1800   else
1801     root = eor;
1802   if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1803     {
1804       *err = REG_ESPACE;
1805       return NULL;
1806     }
1807   return root;
1808 }
1809
1810 /* This function build the following tree, from regular expression
1811    <branch1>|<branch2>:
1812            ALT
1813            / \
1814           /   \
1815    <branch1> <branch2>
1816
1817    ALT means alternative, which represents the operator `|'.  */
1818
1819 static bin_tree_t *
1820 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1821      re_string_t *regexp;
1822      regex_t *preg;
1823      re_token_t *token;
1824      reg_syntax_t syntax;
1825      int nest;
1826      reg_errcode_t *err;
1827 {
1828   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1829   bin_tree_t *tree, *branch = NULL;
1830   int new_idx;
1831   tree = parse_branch (regexp, preg, token, syntax, nest, err);
1832   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1833     return NULL;
1834
1835   while (token->type == OP_ALT)
1836     {
1837       re_token_t alt_token = *token;
1838       new_idx = re_dfa_add_node (dfa, alt_token, 0);
1839       *token = fetch_token (regexp, syntax);
1840       if (token->type != OP_ALT && token->type != END_OF_RE
1841           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1842         {
1843           branch = parse_branch (regexp, preg, token, syntax, nest, err);
1844           if (BE (*err != REG_NOERROR && branch == NULL, 0))
1845             {
1846               free_bin_tree (tree);
1847               return NULL;
1848             }
1849         }
1850       else
1851         branch = NULL;
1852       tree = create_tree (tree, branch, 0, new_idx);
1853       if (BE (new_idx == -1 || tree == NULL, 0))
1854         {
1855           *err = REG_ESPACE;
1856           return NULL;
1857         }
1858       dfa->has_plural_match = 1;
1859     }
1860   return tree;
1861 }
1862
1863 /* This function build the following tree, from regular expression
1864    <exp1><exp2>:
1865         CAT
1866         / \
1867        /   \
1868    <exp1> <exp2>
1869
1870    CAT means concatenation.  */
1871
1872 static bin_tree_t *
1873 parse_branch (regexp, preg, token, syntax, nest, err)
1874      re_string_t *regexp;
1875      regex_t *preg;
1876      re_token_t *token;
1877      reg_syntax_t syntax;
1878      int nest;
1879      reg_errcode_t *err;
1880 {
1881   bin_tree_t *tree, *exp;
1882   tree = parse_expression (regexp, preg, token, syntax, nest, err);
1883   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1884     return NULL;
1885
1886   while (token->type != OP_ALT && token->type != END_OF_RE
1887          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1888     {
1889       exp = parse_expression (regexp, preg, token, syntax, nest, err);
1890       if (BE (*err != REG_NOERROR && exp == NULL, 0))
1891         {
1892           free_bin_tree (tree);
1893           return NULL;
1894         }
1895       if (tree != NULL && exp != NULL)
1896         {
1897           tree = create_tree (tree, exp, CONCAT, 0);
1898           if (tree == NULL)
1899             {
1900               *err = REG_ESPACE;
1901               return NULL;
1902             }
1903         }
1904       else if (tree == NULL)
1905         tree = exp;
1906       /* Otherwise exp == NULL, we don't need to create new tree.  */
1907     }
1908   return tree;
1909 }
1910
1911 /* This function build the following tree, from regular expression a*:
1912          *
1913          |
1914          a
1915 */
1916
1917 static bin_tree_t *
1918 parse_expression (regexp, preg, token, syntax, nest, err)
1919      re_string_t *regexp;
1920      regex_t *preg;
1921      re_token_t *token;
1922      reg_syntax_t syntax;
1923      int nest;
1924      reg_errcode_t *err;
1925 {
1926   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1927   bin_tree_t *tree;
1928   int new_idx;
1929   switch (token->type)
1930     {
1931     case CHARACTER:
1932       new_idx = re_dfa_add_node (dfa, *token, 0);
1933       tree = create_tree (NULL, NULL, 0, new_idx);
1934       if (BE (new_idx == -1 || tree == NULL, 0))
1935         {
1936           *err = REG_ESPACE;
1937           return NULL;
1938         }
1939 #ifdef RE_ENABLE_I18N
1940       if (MB_CUR_MAX > 1)
1941         {
1942           while (!re_string_eoi (regexp)
1943                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1944             {
1945               bin_tree_t *mbc_remain;
1946               *token = fetch_token (regexp, syntax);
1947               new_idx = re_dfa_add_node (dfa, *token, 0);
1948               mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1949               tree = create_tree (tree, mbc_remain, CONCAT, 0);
1950               if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
1951                 return *err = REG_ESPACE, NULL;
1952             }
1953         }
1954 #endif
1955       break;
1956     case OP_OPEN_SUBEXP:
1957       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
1958       if (BE (*err != REG_NOERROR && tree == NULL, 0))
1959         return NULL;
1960       break;
1961     case OP_OPEN_BRACKET:
1962       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
1963       if (BE (*err != REG_NOERROR && tree == NULL, 0))
1964         return NULL;
1965       break;
1966     case OP_BACK_REF:
1967       if (BE (preg->re_nsub < token->opr.idx
1968               || dfa->subexps[token->opr.idx - 1].end == -1, 0))
1969         {
1970           *err = REG_ESUBREG;
1971           return NULL;
1972         }
1973       new_idx = re_dfa_add_node (dfa, *token, 0);
1974       tree = create_tree (NULL, NULL, 0, new_idx);
1975       if (BE (new_idx == -1 || tree == NULL, 0))
1976         {
1977           *err = REG_ESPACE;
1978           return NULL;
1979         }
1980       ++dfa->nbackref;
1981       dfa->has_mb_node = 1;
1982       break;
1983     case OP_DUP_ASTERISK:
1984     case OP_DUP_PLUS:
1985     case OP_DUP_QUESTION:
1986     case OP_OPEN_DUP_NUM:
1987       if (syntax & RE_CONTEXT_INVALID_OPS)
1988         {
1989           *err = REG_BADRPT;
1990           return NULL;
1991         }
1992       else if (syntax & RE_CONTEXT_INDEP_OPS)
1993         {
1994           *token = fetch_token (regexp, syntax);
1995           return parse_expression (regexp, preg, token, syntax, nest, err);
1996         }
1997       /* else fall through  */
1998     case OP_CLOSE_SUBEXP:
1999       if ((token->type == OP_CLOSE_SUBEXP) &&
2000           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2001         {
2002           *err = REG_ERPAREN;
2003           return NULL;
2004         }
2005       /* else fall through  */
2006     case OP_CLOSE_DUP_NUM:
2007       /* We treat it as a normal character.  */
2008
2009       /* Then we can these characters as normal characters.  */
2010       token->type = CHARACTER;
2011       new_idx = re_dfa_add_node (dfa, *token, 0);
2012       tree = create_tree (NULL, NULL, 0, new_idx);
2013       if (BE (new_idx == -1 || tree == NULL, 0))
2014         {
2015           *err = REG_ESPACE;
2016           return NULL;
2017         }
2018       break;
2019     case ANCHOR:
2020       if (dfa->word_char == NULL)
2021         {
2022           *err = init_word_char (dfa);
2023           if (BE (*err != REG_NOERROR, 0))
2024             return NULL;
2025         }
2026       if (token->opr.ctx_type == WORD_DELIM)
2027         {
2028           bin_tree_t *tree_first, *tree_last;
2029           int idx_first, idx_last;
2030           token->opr.ctx_type = WORD_FIRST;
2031           idx_first = re_dfa_add_node (dfa, *token, 0);
2032           tree_first = create_tree (NULL, NULL, 0, idx_first);
2033           token->opr.ctx_type = WORD_LAST;
2034           idx_last = re_dfa_add_node (dfa, *token, 0);
2035           tree_last = create_tree (NULL, NULL, 0, idx_last);
2036           token->type = OP_ALT;
2037           new_idx = re_dfa_add_node (dfa, *token, 0);
2038           tree = create_tree (tree_first, tree_last, 0, new_idx);
2039           if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2040                   || tree_first == NULL || tree_last == NULL
2041                   || tree == NULL, 0))
2042             {
2043               *err = REG_ESPACE;
2044               return NULL;
2045             }
2046         }
2047       else
2048         {
2049           new_idx = re_dfa_add_node (dfa, *token, 0);
2050           tree = create_tree (NULL, NULL, 0, new_idx);
2051           if (BE (new_idx == -1 || tree == NULL, 0))
2052             return *err = REG_ESPACE, NULL;
2053         }
2054       /* We must return here, since ANCHORs can't be followed
2055          by repetition operators.
2056          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2057              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2058       *token = fetch_token (regexp, syntax);
2059       return tree;
2060     case OP_PERIOD:
2061       new_idx = re_dfa_add_node (dfa, *token, 0);
2062       tree = create_tree (NULL, NULL, 0, new_idx);
2063       if (BE (new_idx == -1 || tree == NULL, 0))
2064         {
2065           *err = REG_ESPACE;
2066           return NULL;
2067         }
2068       if (MB_CUR_MAX > 1)
2069         dfa->has_mb_node = 1;
2070       break;
2071     case OP_WORD:
2072       tree = build_word_op (dfa, 0, err);
2073       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2074         return NULL;
2075       break;
2076     case OP_NOTWORD:
2077       tree = build_word_op (dfa, 1, err);
2078       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2079         return NULL;
2080       break;
2081     case OP_ALT:
2082     case END_OF_RE:
2083       return NULL;
2084     case BACK_SLASH:
2085       *err = REG_EESCAPE;
2086       return NULL;
2087     default:
2088       /* Must not happen?  */
2089 #ifdef DEBUG
2090       assert (0);
2091 #endif
2092       return NULL;
2093     }
2094   *token = fetch_token (regexp, syntax);
2095
2096   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2097          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2098     {
2099       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2100       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2101         return NULL;
2102       dfa->has_plural_match = 1;
2103     }
2104
2105   return tree;
2106 }
2107
2108 /* This function build the following tree, from regular expression
2109    (<reg_exp>):
2110          SUBEXP
2111             |
2112         <reg_exp>
2113 */
2114
2115 static bin_tree_t *
2116 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2117      re_string_t *regexp;
2118      regex_t *preg;
2119      re_token_t *token;
2120      reg_syntax_t syntax;
2121      int nest;
2122      reg_errcode_t *err;
2123 {
2124   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2125   bin_tree_t *tree, *left_par, *right_par;
2126   size_t cur_nsub;
2127   int new_idx;
2128   cur_nsub = preg->re_nsub++;
2129   if (dfa->subexps_alloc < preg->re_nsub)
2130     {
2131       re_subexp_t *new_array;
2132       dfa->subexps_alloc *= 2;
2133       new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2134       if (BE (new_array == NULL, 0))
2135         {
2136           dfa->subexps_alloc /= 2;
2137           *err = REG_ESPACE;
2138           return NULL;
2139         }
2140       dfa->subexps = new_array;
2141     }
2142   dfa->subexps[cur_nsub].start = dfa->nodes_len;
2143   dfa->subexps[cur_nsub].end = -1;
2144
2145   new_idx = re_dfa_add_node (dfa, *token, 0);
2146   left_par = create_tree (NULL, NULL, 0, new_idx);
2147   if (BE (new_idx == -1 || left_par == NULL, 0))
2148     {
2149       *err = REG_ESPACE;
2150       return NULL;
2151     }
2152   dfa->nodes[new_idx].opr.idx = cur_nsub;
2153   *token = fetch_token (regexp, syntax);
2154
2155   /* The subexpression may be a null string.  */
2156   if (token->type == OP_CLOSE_SUBEXP)
2157     tree = NULL;
2158   else
2159     {
2160       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2161       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2162         return NULL;
2163     }
2164   if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2165     {
2166       free_bin_tree (tree);
2167       *err = REG_BADPAT;
2168       return NULL;
2169     }
2170   new_idx = re_dfa_add_node (dfa, *token, 0);
2171   dfa->subexps[cur_nsub].end = dfa->nodes_len;
2172   right_par = create_tree (NULL, NULL, 0, new_idx);
2173   tree = ((tree == NULL) ? right_par
2174           : create_tree (tree, right_par, CONCAT, 0));
2175   tree = create_tree (left_par, tree, CONCAT, 0);
2176   if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2177     {
2178       *err = REG_ESPACE;
2179       return NULL;
2180     }
2181   dfa->nodes[new_idx].opr.idx = cur_nsub;
2182
2183   return tree;
2184 }
2185
2186 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2187
2188 static bin_tree_t *
2189 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2190      bin_tree_t *dup_elem;
2191      re_string_t *regexp;
2192      re_dfa_t *dfa;
2193      re_token_t *token;
2194      reg_syntax_t syntax;
2195      reg_errcode_t *err;
2196 {
2197   re_token_t dup_token;
2198   bin_tree_t *tree = dup_elem, *work_tree;
2199   int new_idx, start_idx = re_string_cur_idx (regexp);
2200   re_token_t start_token = *token;
2201   if (token->type == OP_OPEN_DUP_NUM)
2202     {
2203       int i;
2204       int end = 0;
2205       int start = fetch_number (regexp, token, syntax);
2206       bin_tree_t *elem;
2207       if (start == -1)
2208         {
2209           if (token->type == CHARACTER && token->opr.c == ',')
2210             start = 0; /* We treat "{,m}" as "{0,m}".  */
2211           else
2212             {
2213               *err = REG_BADBR; /* <re>{} is invalid.  */
2214               return NULL;
2215             }
2216         }
2217       if (BE (start != -2, 1))
2218         {
2219           /* We treat "{n}" as "{n,n}".  */
2220           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2221                  : ((token->type == CHARACTER && token->opr.c == ',')
2222                     ? fetch_number (regexp, token, syntax) : -2));
2223         }
2224       if (BE (start == -2 || end == -2, 0))
2225         {
2226           /* Invalid sequence.  */
2227           if (token->type == OP_CLOSE_DUP_NUM)
2228             goto parse_dup_op_invalid_interval;
2229           else
2230             goto parse_dup_op_ebrace;
2231         }
2232       if (BE (start == 0 && end == 0, 0))
2233         {
2234           /* We treat "<re>{0}" and "<re>{0,0}" as null string.  */
2235           *token = fetch_token (regexp, syntax);
2236           free_bin_tree (dup_elem);
2237           return NULL;
2238         }
2239
2240       /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2241       elem = tree;
2242       for (i = 0; i < start; ++i)
2243         if (i != 0)
2244           {
2245             work_tree = duplicate_tree (elem, dfa);
2246             tree = create_tree (tree, work_tree, CONCAT, 0);
2247             if (BE (work_tree == NULL || tree == NULL, 0))
2248               goto parse_dup_op_espace;
2249           }
2250
2251       if (end == -1)
2252         {
2253           /* We treat "<re>{0,}" as "<re>*".  */
2254           dup_token.type = OP_DUP_ASTERISK;
2255           if (start > 0)
2256             {
2257               elem = duplicate_tree (elem, dfa);
2258               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2259               work_tree = create_tree (elem, NULL, 0, new_idx);
2260               tree = create_tree (tree, work_tree, CONCAT, 0);
2261               if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2262                       || tree == NULL, 0))
2263                 goto parse_dup_op_espace;
2264             }
2265           else
2266             {
2267               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2268               tree = create_tree (elem, NULL, 0, new_idx);
2269               if (BE (new_idx == -1 || tree == NULL, 0))
2270                 goto parse_dup_op_espace;
2271             }
2272         }
2273       else if (end - start > 0)
2274         {
2275           /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?".  */
2276           dup_token.type = OP_DUP_QUESTION;
2277           if (start > 0)
2278             {
2279               elem = duplicate_tree (elem, dfa);
2280               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2281               elem = create_tree (elem, NULL, 0, new_idx);
2282               tree = create_tree (tree, elem, CONCAT, 0);
2283               if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2284                 goto parse_dup_op_espace;
2285             }
2286           else
2287             {
2288               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2289               tree = elem = create_tree (elem, NULL, 0, new_idx);
2290               if (BE (new_idx == -1 || tree == NULL, 0))
2291                 goto parse_dup_op_espace;
2292             }
2293           for (i = 1; i < end - start; ++i)
2294             {
2295               work_tree = duplicate_tree (elem, dfa);
2296               tree = create_tree (tree, work_tree, CONCAT, 0);
2297               if (BE (work_tree == NULL || tree == NULL, 0))
2298                 {
2299                   *err = REG_ESPACE;
2300                   return NULL;
2301                 }
2302             }
2303         }
2304     }
2305   else
2306     {
2307       new_idx = re_dfa_add_node (dfa, *token, 0);
2308       tree = create_tree (tree, NULL, 0, new_idx);
2309       if (BE (new_idx == -1 || tree == NULL, 0))
2310         {
2311           *err = REG_ESPACE;
2312           return NULL;
2313         }
2314     }
2315   *token = fetch_token (regexp, syntax);
2316   return tree;
2317
2318  parse_dup_op_espace:
2319   free_bin_tree (tree);
2320   *err = REG_ESPACE;
2321   return NULL;
2322
2323  parse_dup_op_ebrace:
2324   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2325     {
2326       *err = REG_EBRACE;
2327       return NULL;
2328     }
2329   goto parse_dup_op_rollback;
2330  parse_dup_op_invalid_interval:
2331   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2332     {
2333       *err = REG_BADBR;
2334       return NULL;
2335     }
2336  parse_dup_op_rollback:
2337   re_string_set_index (regexp, start_idx);
2338   *token = start_token;
2339   token->type = CHARACTER;
2340   return dup_elem;
2341 }
2342
2343 /* Size of the names for collating symbol/equivalence_class/character_class.
2344    I'm not sure, but maybe enough.  */
2345 #define BRACKET_NAME_BUF_SIZE 32
2346
2347 #ifndef _LIBC
2348   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2349      Build the range expression which starts from START_ELEM, and ends
2350      at END_ELEM.  The result are written to MBCSET and SBCSET.
2351      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2352      mbcset->range_ends, is a pointer argument sinse we may
2353      update it.  */
2354
2355 static reg_errcode_t
2356 # ifdef RE_ENABLE_I18N
2357 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2358      re_charset_t *mbcset;
2359      int *range_alloc;
2360 # else /* not RE_ENABLE_I18N */
2361 build_range_exp (sbcset, start_elem, end_elem)
2362 # endif /* not RE_ENABLE_I18N */
2363      re_bitset_ptr_t sbcset;
2364      bracket_elem_t *start_elem, *end_elem;
2365 {
2366   unsigned int start_ch, end_ch;
2367   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2368   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2369           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2370           0))
2371     return REG_ERANGE;
2372
2373   /* We can handle no multi character collating elements without libc
2374      support.  */
2375   if (BE ((start_elem->type == COLL_SYM
2376            && strlen ((char *) start_elem->opr.name) > 1)
2377           || (end_elem->type == COLL_SYM
2378               && strlen ((char *) end_elem->opr.name) > 1), 0))
2379     return REG_ECOLLATE;
2380
2381 # ifdef RE_ENABLE_I18N
2382   {
2383     wchar_t wc, start_wc, end_wc;
2384     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2385
2386     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2387                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2388                    : 0));
2389     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2390               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2391                  : 0));
2392     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2393                 ? __btowc (start_ch) : start_elem->opr.wch);
2394     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2395               ? __btowc (end_ch) : end_elem->opr.wch);
2396     cmp_buf[0] = start_wc;
2397     cmp_buf[4] = end_wc;
2398     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2399       return REG_ERANGE;
2400
2401     /* Check the space of the arrays.  */
2402     if (*range_alloc == mbcset->nranges)
2403       {
2404         /* There are not enough space, need realloc.  */
2405         wchar_t *new_array_start, *new_array_end;
2406         int new_nranges;
2407
2408         /* +1 in case of mbcset->nranges is 0.  */
2409         new_nranges = 2 * mbcset->nranges + 1;
2410         /* Use realloc since mbcset->range_starts and mbcset->range_ends
2411            are NULL if *range_alloc == 0.  */
2412         new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2413                                       new_nranges);
2414         new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2415                                     new_nranges);
2416
2417         if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2418           return REG_ESPACE;
2419
2420         mbcset->range_starts = new_array_start;
2421         mbcset->range_ends = new_array_end;
2422         *range_alloc = new_nranges;
2423       }
2424
2425     mbcset->range_starts[mbcset->nranges] = start_wc;
2426     mbcset->range_ends[mbcset->nranges++] = end_wc;
2427
2428     /* Build the table for single byte characters.  */
2429     for (wc = 0; wc <= SBC_MAX; ++wc)
2430       {
2431         cmp_buf[2] = wc;
2432         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2433             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2434           bitset_set (sbcset, wc);
2435       }
2436   }
2437 # else /* not RE_ENABLE_I18N */
2438   {
2439     unsigned int ch;
2440     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2441                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2442                    : 0));
2443     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2444               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2445                  : 0));
2446     if (start_ch > end_ch)
2447       return REG_ERANGE;
2448     /* Build the table for single byte characters.  */
2449     for (ch = 0; ch <= SBC_MAX; ++ch)
2450       if (start_ch <= ch  && ch <= end_ch)
2451         bitset_set (sbcset, ch);
2452   }
2453 # endif /* not RE_ENABLE_I18N */
2454   return REG_NOERROR;
2455 }
2456 #endif /* not _LIBC */
2457
2458 #ifndef _LIBC
2459 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2460    Build the collating element which is represented by NAME.
2461    The result are written to MBCSET and SBCSET.
2462    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2463    pointer argument since we may update it.  */
2464
2465 static reg_errcode_t
2466 # ifdef RE_ENABLE_I18N
2467 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2468      re_charset_t *mbcset;
2469      int *coll_sym_alloc;
2470 # else /* not RE_ENABLE_I18N */
2471 build_collating_symbol (sbcset, name)
2472 # endif /* not RE_ENABLE_I18N */
2473      re_bitset_ptr_t sbcset;
2474      const unsigned char *name;
2475 {
2476   size_t name_len = strlen ((const char *) name);
2477   if (BE (name_len != 1, 0))
2478     return REG_ECOLLATE;
2479   else
2480     {
2481       bitset_set (sbcset, name[0]);
2482       return REG_NOERROR;
2483     }
2484 }
2485 #endif /* not _LIBC */
2486
2487 /* This function parse bracket expression like "[abc]", "[a-c]",
2488    "[[.a-a.]]" etc.  */
2489
2490 static bin_tree_t *
2491 parse_bracket_exp (regexp, dfa, token, syntax, err)
2492      re_string_t *regexp;
2493      re_dfa_t *dfa;
2494      re_token_t *token;
2495      reg_syntax_t syntax;
2496      reg_errcode_t *err;
2497 {
2498 #ifdef _LIBC
2499   const unsigned char *collseqmb;
2500   const char *collseqwc;
2501   uint32_t nrules;
2502   int32_t table_size;
2503   const int32_t *symb_table;
2504   const unsigned char *extra;
2505
2506   /* Local function for parse_bracket_exp used in _LIBC environement.
2507      Seek the collating symbol entry correspondings to NAME.
2508      Return the index of the symbol in the SYMB_TABLE.  */
2509
2510   static inline int32_t
2511   seek_collating_symbol_entry (name, name_len)
2512          const unsigned char *name;
2513          size_t name_len;
2514     {
2515       int32_t hash = elem_hash ((const char *) name, name_len);
2516       int32_t elem = hash % table_size;
2517       int32_t second = hash % (table_size - 2);
2518       while (symb_table[2 * elem] != 0)
2519         {
2520           /* First compare the hashing value.  */
2521           if (symb_table[2 * elem] == hash
2522               /* Compare the length of the name.  */
2523               && name_len == extra[symb_table[2 * elem + 1]]
2524               /* Compare the name.  */
2525               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2526                          name_len) == 0)
2527             {
2528               /* Yep, this is the entry.  */
2529               break;
2530             }
2531
2532           /* Next entry.  */
2533           elem += second;
2534         }
2535       return elem;
2536     }
2537
2538   /* Local function for parse_bracket_exp used in _LIBC environement.
2539      Look up the collation sequence value of BR_ELEM.
2540      Return the value if succeeded, UINT_MAX otherwise.  */
2541
2542   static inline unsigned int
2543   lookup_collation_sequence_value (br_elem)
2544          bracket_elem_t *br_elem;
2545     {
2546       if (br_elem->type == SB_CHAR)
2547         {
2548           /*
2549           if (MB_CUR_MAX == 1)
2550           */
2551           if (nrules == 0)
2552             return collseqmb[br_elem->opr.ch];
2553           else
2554             {
2555               wint_t wc = __btowc (br_elem->opr.ch);
2556               return collseq_table_lookup (collseqwc, wc);
2557             }
2558         }
2559       else if (br_elem->type == MB_CHAR)
2560         {
2561           return collseq_table_lookup (collseqwc, br_elem->opr.wch);
2562         }
2563       else if (br_elem->type == COLL_SYM)
2564         {
2565           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2566           if (nrules != 0)
2567             {
2568               int32_t elem, idx;
2569               elem = seek_collating_symbol_entry (br_elem->opr.name,
2570                                                   sym_name_len);
2571               if (symb_table[2 * elem] != 0)
2572                 {
2573                   /* We found the entry.  */
2574                   idx = symb_table[2 * elem + 1];
2575                   /* Skip the name of collating element name.  */
2576                   idx += 1 + extra[idx];
2577                   /* Skip the byte sequence of the collating element.  */
2578                   idx += 1 + extra[idx];
2579                   /* Adjust for the alignment.  */
2580                   idx = (idx + 3) & ~3;
2581                   /* Skip the multibyte collation sequence value.  */
2582                   idx += sizeof (unsigned int);
2583                   /* Skip the wide char sequence of the collating element.  */
2584                   idx += sizeof (unsigned int) *
2585                     (1 + *(unsigned int *) (extra + idx));
2586                   /* Return the collation sequence value.  */
2587                   return *(unsigned int *) (extra + idx);
2588                 }
2589               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2590                 {
2591                   /* No valid character.  Match it as a single byte
2592                      character.  */
2593                   return collseqmb[br_elem->opr.name[0]];
2594                 }
2595             }
2596           else if (sym_name_len == 1)
2597             return collseqmb[br_elem->opr.name[0]];
2598         }
2599       return UINT_MAX;
2600     }
2601
2602   /* Local function for parse_bracket_exp used in _LIBC environement.
2603      Build the range expression which starts from START_ELEM, and ends
2604      at END_ELEM.  The result are written to MBCSET and SBCSET.
2605      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2606      mbcset->range_ends, is a pointer argument sinse we may
2607      update it.  */
2608
2609   static inline reg_errcode_t
2610 # ifdef RE_ENABLE_I18N
2611   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2612          re_charset_t *mbcset;
2613          int *range_alloc;
2614 # else /* not RE_ENABLE_I18N */
2615   build_range_exp (sbcset, start_elem, end_elem)
2616 # endif /* not RE_ENABLE_I18N */
2617          re_bitset_ptr_t sbcset;
2618          bracket_elem_t *start_elem, *end_elem;
2619     {
2620       unsigned int ch;
2621       uint32_t start_collseq;
2622       uint32_t end_collseq;
2623
2624 # ifdef RE_ENABLE_I18N
2625       /* Check the space of the arrays.  */
2626       if (*range_alloc == mbcset->nranges)
2627         {
2628           /* There are not enough space, need realloc.  */
2629           uint32_t *new_array_start;
2630           uint32_t *new_array_end;
2631           int new_nranges;
2632
2633           /* +1 in case of mbcset->nranges is 0.  */
2634           new_nranges = 2 * mbcset->nranges + 1;
2635           /* Use realloc since mbcset->range_starts and mbcset->range_ends
2636              are NULL if *range_alloc == 0.  */
2637           new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2638                                         new_nranges);
2639           new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2640                                       new_nranges);
2641
2642           if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2643             return REG_ESPACE;
2644
2645           mbcset->range_starts = new_array_start;
2646           mbcset->range_ends = new_array_end;
2647           *range_alloc = new_nranges;
2648         }
2649 # endif /* RE_ENABLE_I18N */
2650
2651       /* Equivalence Classes and Character Classes can't be a range
2652          start/end.  */
2653       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2654               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2655               0))
2656         return REG_ERANGE;
2657
2658       start_collseq = lookup_collation_sequence_value (start_elem);
2659       end_collseq = lookup_collation_sequence_value (end_elem);
2660       /* Check start/end collation sequence values.  */
2661       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2662         return REG_ECOLLATE;
2663       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2664         return REG_ERANGE;
2665
2666 # ifdef RE_ENABLE_I18N
2667       /* Got valid collation sequence values, add them as a new entry.  */
2668       mbcset->range_starts[mbcset->nranges] = start_collseq;
2669       mbcset->range_ends[mbcset->nranges++] = end_collseq;
2670 # endif /* RE_ENABLE_I18N */
2671
2672       /* Build the table for single byte characters.  */
2673       for (ch = 0; ch <= SBC_MAX; ch++)
2674         {
2675           uint32_t ch_collseq;
2676           /*
2677           if (MB_CUR_MAX == 1)
2678           */
2679           if (nrules == 0)
2680             ch_collseq = collseqmb[ch];
2681           else
2682             ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
2683           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2684             bitset_set (sbcset, ch);
2685         }
2686       return REG_NOERROR;
2687     }
2688
2689   /* Local function for parse_bracket_exp used in _LIBC environement.
2690      Build the collating element which is represented by NAME.
2691      The result are written to MBCSET and SBCSET.
2692      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2693      pointer argument sinse we may update it.  */
2694
2695   static inline reg_errcode_t
2696 # ifdef RE_ENABLE_I18N
2697   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2698          re_charset_t *mbcset;
2699          int *coll_sym_alloc;
2700 # else /* not RE_ENABLE_I18N */
2701   build_collating_symbol (sbcset, name)
2702 # endif /* not RE_ENABLE_I18N */
2703          re_bitset_ptr_t sbcset;
2704          const unsigned char *name;
2705     {
2706       int32_t elem, idx;
2707       size_t name_len = strlen ((const char *) name);
2708       if (nrules != 0)
2709         {
2710           elem = seek_collating_symbol_entry (name, name_len);
2711           if (symb_table[2 * elem] != 0)
2712             {
2713               /* We found the entry.  */
2714               idx = symb_table[2 * elem + 1];
2715               /* Skip the name of collating element name.  */
2716               idx += 1 + extra[idx];
2717             }
2718           else if (symb_table[2 * elem] == 0 && name_len == 1)
2719             {
2720               /* No valid character, treat it as a normal
2721                  character.  */
2722               bitset_set (sbcset, name[0]);
2723               return REG_NOERROR;
2724             }
2725           else
2726             return REG_ECOLLATE;
2727
2728 # ifdef RE_ENABLE_I18N
2729           /* Got valid collation sequence, add it as a new entry.  */
2730           /* Check the space of the arrays.  */
2731           if (*coll_sym_alloc == mbcset->ncoll_syms)
2732             {
2733               /* Not enough, realloc it.  */
2734               /* +1 in case of mbcset->ncoll_syms is 0.  */
2735               *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2736               /* Use realloc since mbcset->coll_syms is NULL
2737                  if *alloc == 0.  */
2738               mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2739                                               *coll_sym_alloc);
2740               if (BE (mbcset->coll_syms == NULL, 0))
2741                 return REG_ESPACE;
2742             }
2743           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2744 # endif /* RE_ENABLE_I18N */
2745           return REG_NOERROR;
2746         }
2747       else
2748         {
2749           if (BE (name_len != 1, 0))
2750             return REG_ECOLLATE;
2751           else
2752             {
2753               bitset_set (sbcset, name[0]);
2754               return REG_NOERROR;
2755             }
2756         }
2757     }
2758 #endif
2759
2760   re_token_t br_token;
2761   re_bitset_ptr_t sbcset;
2762 #ifdef RE_ENABLE_I18N
2763   re_charset_t *mbcset;
2764   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2765   int equiv_class_alloc = 0, char_class_alloc = 0;
2766 #else /* not RE_ENABLE_I18N */
2767   int non_match = 0;
2768 #endif /* not RE_ENABLE_I18N */
2769   bin_tree_t *work_tree;
2770   int token_len, new_idx;
2771 #ifdef _LIBC
2772   collseqmb = (const unsigned char *)
2773     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2774   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2775   if (nrules)
2776     {
2777       /*
2778       if (MB_CUR_MAX > 1)
2779       */
2780         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2781       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2782       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2783                                                   _NL_COLLATE_SYMB_TABLEMB);
2784       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2785                                                    _NL_COLLATE_SYMB_EXTRAMB);
2786     }
2787 #endif
2788   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2789 #ifdef RE_ENABLE_I18N
2790   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2791 #endif /* RE_ENABLE_I18N */
2792 #ifdef RE_ENABLE_I18N
2793   if (BE (sbcset == NULL || mbcset == NULL, 0))
2794 #else
2795   if (BE (sbcset == NULL, 0))
2796 #endif /* RE_ENABLE_I18N */
2797     {
2798       *err = REG_ESPACE;
2799       return NULL;
2800     }
2801
2802   token_len = peek_token_bracket (token, regexp, syntax);
2803   if (BE (token->type == END_OF_RE, 0))
2804     {
2805       *err = REG_BADPAT;
2806       goto parse_bracket_exp_free_return;
2807     }
2808   if (token->type == OP_NON_MATCH_LIST)
2809     {
2810 #ifdef RE_ENABLE_I18N
2811       int i;
2812       mbcset->non_match = 1;
2813 #else /* not RE_ENABLE_I18N */
2814       non_match = 1;
2815 #endif /* not RE_ENABLE_I18N */
2816       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2817         bitset_set (sbcset, '\0');
2818       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2819       token_len = peek_token_bracket (token, regexp, syntax);
2820       if (BE (token->type == END_OF_RE, 0))
2821         {
2822           *err = REG_BADPAT;
2823           goto parse_bracket_exp_free_return;
2824         }
2825 #ifdef RE_ENABLE_I18N
2826       if (MB_CUR_MAX > 1)
2827         for (i = 0; i < SBC_MAX; ++i)
2828           if (__btowc (i) == WEOF)
2829             bitset_set (sbcset, i);
2830 #endif /* RE_ENABLE_I18N */
2831     }
2832
2833   /* We treat the first ']' as a normal character.  */
2834   if (token->type == OP_CLOSE_BRACKET)
2835     token->type = CHARACTER;
2836
2837   while (1)
2838     {
2839       bracket_elem_t start_elem, end_elem;
2840       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2841       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2842       reg_errcode_t ret;
2843       int token_len2 = 0, is_range_exp = 0;
2844       re_token_t token2;
2845
2846       start_elem.opr.name = start_name_buf;
2847       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2848                                    syntax);
2849       if (BE (ret != REG_NOERROR, 0))
2850         {
2851           *err = ret;
2852           goto parse_bracket_exp_free_return;
2853         }
2854
2855       token_len = peek_token_bracket (token, regexp, syntax);
2856       if (BE (token->type == END_OF_RE, 0))
2857         {
2858           *err = REG_BADPAT;
2859           goto parse_bracket_exp_free_return;
2860         }
2861       if (token->type == OP_CHARSET_RANGE)
2862         {
2863           re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
2864           token_len2 = peek_token_bracket (&token2, regexp, syntax);
2865           if (BE (token->type == END_OF_RE, 0))
2866             {
2867               *err = REG_BADPAT;
2868               goto parse_bracket_exp_free_return;
2869             }
2870           if (token2.type == OP_CLOSE_BRACKET)
2871             {
2872               /* We treat the last '-' as a normal character.  */
2873               re_string_skip_bytes (regexp, -token_len);
2874               token->type = CHARACTER;
2875             }
2876           else
2877             is_range_exp = 1;
2878         }
2879
2880       if (is_range_exp == 1)
2881         {
2882           end_elem.opr.name = end_name_buf;
2883           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2884                                        dfa, syntax);
2885           if (BE (ret != REG_NOERROR, 0))
2886             {
2887               *err = ret;
2888               goto parse_bracket_exp_free_return;
2889             }
2890
2891           token_len = peek_token_bracket (token, regexp, syntax);
2892           if (BE (token->type == END_OF_RE, 0))
2893             {
2894               *err = REG_BADPAT;
2895               goto parse_bracket_exp_free_return;
2896             }
2897           *err = build_range_exp (sbcset,
2898 #ifdef RE_ENABLE_I18N
2899                                   mbcset, &range_alloc,
2900 #endif /* RE_ENABLE_I18N */
2901                                   &start_elem, &end_elem);
2902           if (BE (*err != REG_NOERROR, 0))
2903             goto parse_bracket_exp_free_return;
2904         }
2905       else
2906         {
2907           switch (start_elem.type)
2908             {
2909             case SB_CHAR:
2910               bitset_set (sbcset, start_elem.opr.ch);
2911               break;
2912 #ifdef RE_ENABLE_I18N
2913             case MB_CHAR:
2914               /* Check whether the array has enough space.  */
2915               if (mbchar_alloc == mbcset->nmbchars)
2916                 {
2917                   /* Not enough, realloc it.  */
2918                   /* +1 in case of mbcset->nmbchars is 0.  */
2919                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
2920                   /* Use realloc since array is NULL if *alloc == 0.  */
2921                   mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2922                                                 mbchar_alloc);
2923                   if (BE (mbcset->mbchars == NULL, 0))
2924                     goto parse_bracket_exp_espace;
2925                 }
2926               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2927               break;
2928 #endif /* RE_ENABLE_I18N */
2929             case EQUIV_CLASS:
2930               *err = build_equiv_class (sbcset,
2931 #ifdef RE_ENABLE_I18N
2932                                         mbcset, &equiv_class_alloc,
2933 #endif /* RE_ENABLE_I18N */
2934                                         start_elem.opr.name);
2935               if (BE (*err != REG_NOERROR, 0))
2936                 goto parse_bracket_exp_free_return;
2937               break;
2938             case COLL_SYM:
2939               *err = build_collating_symbol (sbcset,
2940 #ifdef RE_ENABLE_I18N
2941                                              mbcset, &coll_sym_alloc,
2942 #endif /* RE_ENABLE_I18N */
2943                                              start_elem.opr.name);
2944               if (BE (*err != REG_NOERROR, 0))
2945                 goto parse_bracket_exp_free_return;
2946               break;
2947             case CHAR_CLASS:
2948               ret = build_charclass (sbcset,
2949 #ifdef RE_ENABLE_I18N
2950                                      mbcset, &char_class_alloc,
2951 #endif /* RE_ENABLE_I18N */
2952                                      start_elem.opr.name, syntax);
2953               if (BE (ret != REG_NOERROR, 0))
2954                goto parse_bracket_exp_espace;
2955               break;
2956             default:
2957               assert (0);
2958               break;
2959             }
2960         }
2961       if (token->type == OP_CLOSE_BRACKET)
2962         break;
2963     }
2964
2965   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2966
2967   /* If it is non-matching list.  */
2968 #ifdef RE_ENABLE_I18N
2969   if (mbcset->non_match)
2970 #else /* not RE_ENABLE_I18N */
2971   if (non_match)
2972 #endif /* not RE_ENABLE_I18N */
2973     bitset_not (sbcset);
2974
2975   /* Build a tree for simple bracket.  */
2976   br_token.type = SIMPLE_BRACKET;
2977   br_token.opr.sbcset = sbcset;
2978   new_idx = re_dfa_add_node (dfa, br_token, 0);
2979   work_tree = create_tree (NULL, NULL, 0, new_idx);
2980   if (BE (new_idx == -1 || work_tree == NULL, 0))
2981     goto parse_bracket_exp_espace;
2982
2983 #ifdef RE_ENABLE_I18N
2984   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
2985       || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
2986                                                 || mbcset->non_match)))
2987     {
2988       re_token_t alt_token;
2989       bin_tree_t *mbc_tree;
2990       /* Build a tree for complex bracket.  */
2991       br_token.type = COMPLEX_BRACKET;
2992       br_token.opr.mbcset = mbcset;
2993       dfa->has_mb_node = 1;
2994       new_idx = re_dfa_add_node (dfa, br_token, 0);
2995       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
2996       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
2997         goto parse_bracket_exp_espace;
2998       /* Then join them by ALT node.  */
2999       dfa->has_plural_match = 1;
3000       alt_token.type = OP_ALT;
3001       new_idx = re_dfa_add_node (dfa, alt_token, 0);
3002       work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
3003       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3004         return work_tree;
3005     }
3006   else
3007     {
3008       free_charset (mbcset);
3009       return work_tree;
3010     }
3011 #else /* not RE_ENABLE_I18N */
3012   return work_tree;
3013 #endif /* not RE_ENABLE_I18N */
3014
3015  parse_bracket_exp_espace:
3016   *err = REG_ESPACE;
3017  parse_bracket_exp_free_return:
3018   re_free (sbcset);
3019 #ifdef RE_ENABLE_I18N
3020   free_charset (mbcset);
3021 #endif /* RE_ENABLE_I18N */
3022   return NULL;
3023 }
3024
3025 /* Parse an element in the bracket expression.  */
3026
3027 static reg_errcode_t
3028 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
3029      bracket_elem_t *elem;
3030      re_string_t *regexp;
3031      re_token_t *token;
3032      int token_len;
3033      re_dfa_t *dfa;
3034      reg_syntax_t syntax;
3035 {
3036 #ifdef RE_ENABLE_I18N
3037   int cur_char_size;
3038   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3039   if (cur_char_size > 1)
3040     {
3041       elem->type = MB_CHAR;
3042       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3043       re_string_skip_bytes (regexp, cur_char_size);
3044       return REG_NOERROR;
3045     }
3046 #endif /* RE_ENABLE_I18N */
3047   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3048   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3049       || token->type == OP_OPEN_EQUIV_CLASS)
3050     return parse_bracket_symbol (elem, regexp, token);
3051   elem->type = SB_CHAR;
3052   elem->opr.ch = token->opr.c;
3053   return REG_NOERROR;
3054 }
3055
3056 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3057    such as [:<character_class>:], [.<collating_element>.], and
3058    [=<equivalent_class>=].  */
3059
3060 static reg_errcode_t
3061 parse_bracket_symbol (elem, regexp, token)
3062      bracket_elem_t *elem;
3063      re_string_t *regexp;
3064      re_token_t *token;
3065 {
3066   unsigned char ch, delim = token->opr.c;
3067   int i = 0;
3068   for (;; ++i)
3069     {
3070       if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3071         return REG_EBRACK;
3072       if (token->type == OP_OPEN_CHAR_CLASS)
3073         ch = re_string_fetch_byte_case (regexp);
3074       else
3075         ch = re_string_fetch_byte (regexp);
3076       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3077         break;
3078       elem->opr.name[i] = ch;
3079     }
3080   re_string_skip_bytes (regexp, 1);
3081   elem->opr.name[i] = '\0';
3082   switch (token->type)
3083     {
3084     case OP_OPEN_COLL_ELEM:
3085       elem->type = COLL_SYM;
3086       break;
3087     case OP_OPEN_EQUIV_CLASS:
3088       elem->type = EQUIV_CLASS;
3089       break;
3090     case OP_OPEN_CHAR_CLASS:
3091       elem->type = CHAR_CLASS;
3092       break;
3093     default:
3094       break;
3095     }
3096   return REG_NOERROR;
3097 }
3098
3099   /* Helper function for parse_bracket_exp.
3100      Build the equivalence class which is represented by NAME.
3101      The result are written to MBCSET and SBCSET.
3102      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3103      is a pointer argument sinse we may update it.  */
3104
3105 static reg_errcode_t
3106 #ifdef RE_ENABLE_I18N
3107 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3108      re_charset_t *mbcset;
3109      int *equiv_class_alloc;
3110 #else /* not RE_ENABLE_I18N */
3111 build_equiv_class (sbcset, name)
3112 #endif /* not RE_ENABLE_I18N */
3113      re_bitset_ptr_t sbcset;
3114      const unsigned char *name;
3115 {
3116 #if defined _LIBC && defined RE_ENABLE_I18N
3117   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3118   if (nrules != 0)
3119     {
3120       const int32_t *table, *indirect;
3121       const unsigned char *weights, *extra, *cp;
3122       unsigned char char_buf[2];
3123       int32_t idx1, idx2;
3124       unsigned int ch;
3125       size_t len;
3126       /* This #include defines a local function!  */
3127 # include <locale/weight.h>
3128       /* Calculate the index for equivalence class.  */
3129       cp = name;
3130       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3131       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3132                                                _NL_COLLATE_WEIGHTMB);
3133       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3134                                                    _NL_COLLATE_EXTRAMB);
3135       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3136                                                 _NL_COLLATE_INDIRECTMB);
3137       idx1 = findidx (&cp);
3138       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3139         /* This isn't a valid character.  */
3140         return REG_ECOLLATE;
3141
3142       /* Build single byte matcing table for this equivalence class.  */
3143       char_buf[1] = (unsigned char) '\0';
3144       len = weights[idx1];
3145       for (ch = 0; ch < SBC_MAX; ++ch)
3146         {
3147           char_buf[0] = ch;
3148           cp = char_buf;
3149           idx2 = findidx (&cp);
3150 /*
3151           idx2 = table[ch];
3152 */
3153           if (idx2 == 0)
3154             /* This isn't a valid character.  */
3155             continue;
3156           if (len == weights[idx2])
3157             {
3158               int cnt = 0;
3159               while (cnt <= len &&
3160                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3161                 ++cnt;
3162
3163               if (cnt > len)
3164                 bitset_set (sbcset, ch);
3165             }
3166         }
3167       /* Check whether the array has enough space.  */
3168       if (*equiv_class_alloc == mbcset->nequiv_classes)
3169         {
3170           /* Not enough, realloc it.  */
3171           /* +1 in case of mbcset->nequiv_classes is 0.  */
3172           *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3173           /* Use realloc since the array is NULL if *alloc == 0.  */
3174           mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3175                                               *equiv_class_alloc);
3176           if (BE (mbcset->equiv_classes == NULL, 0))
3177             return REG_ESPACE;
3178         }
3179       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3180     }
3181   else
3182 #endif /* _LIBC && RE_ENABLE_I18N */
3183     {
3184       if (BE (strlen ((const char *) name) != 1, 0))
3185         return REG_ECOLLATE;
3186       bitset_set (sbcset, *name);
3187     }
3188   return REG_NOERROR;
3189 }
3190
3191   /* Helper function for parse_bracket_exp.
3192      Build the character class which is represented by NAME.
3193      The result are written to MBCSET and SBCSET.
3194      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3195      is a pointer argument sinse we may update it.  */
3196
3197 static reg_errcode_t
3198 #ifdef RE_ENABLE_I18N
3199 build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
3200      re_charset_t *mbcset;
3201      int *char_class_alloc;
3202 #else /* not RE_ENABLE_I18N */
3203 build_charclass (sbcset, class_name, syntax)
3204 #endif /* not RE_ENABLE_I18N */
3205      re_bitset_ptr_t sbcset;
3206      const unsigned char *class_name;
3207      reg_syntax_t syntax;
3208 {
3209   int i;
3210   const char *name = (const char *) class_name;
3211
3212   /* In case of REG_ICASE "upper" and "lower" match the both of
3213      upper and lower cases.  */
3214   if ((syntax & RE_ICASE)
3215       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3216     name = "alpha";
3217
3218 #ifdef RE_ENABLE_I18N
3219   /* Check the space of the arrays.  */
3220   if (*char_class_alloc == mbcset->nchar_classes)
3221     {
3222       /* Not enough, realloc it.  */
3223       /* +1 in case of mbcset->nchar_classes is 0.  */
3224       *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3225       /* Use realloc since array is NULL if *alloc == 0.  */
3226       mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3227                                          *char_class_alloc);
3228       if (BE (mbcset->char_classes == NULL, 0))
3229         return REG_ESPACE;
3230     }
3231   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3232 #endif /* RE_ENABLE_I18N */
3233
3234 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3235     for (i = 0; i < SBC_MAX; ++i)       \
3236       {                                 \
3237         if (ctype_func (i))             \
3238           bitset_set (sbcset, i);       \
3239       }
3240
3241   if (strcmp (name, "alnum") == 0)
3242     BUILD_CHARCLASS_LOOP (isalnum)
3243   else if (strcmp (name, "cntrl") == 0)
3244     BUILD_CHARCLASS_LOOP (iscntrl)
3245   else if (strcmp (name, "lower") == 0)
3246     BUILD_CHARCLASS_LOOP (islower)
3247   else if (strcmp (name, "space") == 0)
3248     BUILD_CHARCLASS_LOOP (isspace)
3249   else if (strcmp (name, "alpha") == 0)
3250     BUILD_CHARCLASS_LOOP (isalpha)
3251   else if (strcmp (name, "digit") == 0)
3252     BUILD_CHARCLASS_LOOP (isdigit)
3253   else if (strcmp (name, "print") == 0)
3254     BUILD_CHARCLASS_LOOP (isprint)
3255   else if (strcmp (name, "upper") == 0)
3256     BUILD_CHARCLASS_LOOP (isupper)
3257   else if (strcmp (name, "blank") == 0)
3258     BUILD_CHARCLASS_LOOP (isblank)
3259   else if (strcmp (name, "graph") == 0)
3260     BUILD_CHARCLASS_LOOP (isgraph)
3261   else if (strcmp (name, "punct") == 0)
3262     BUILD_CHARCLASS_LOOP (ispunct)
3263   else if (strcmp (name, "xdigit") == 0)
3264     BUILD_CHARCLASS_LOOP (isxdigit)
3265   else
3266     return REG_ECTYPE;
3267
3268   return REG_NOERROR;
3269 }
3270
3271 static bin_tree_t *
3272 build_word_op (dfa, not, err)
3273      re_dfa_t *dfa;
3274      int not;
3275      reg_errcode_t *err;
3276 {
3277   re_bitset_ptr_t sbcset;
3278 #ifdef RE_ENABLE_I18N
3279   re_charset_t *mbcset;
3280   int alloc = 0;
3281 #else /* not RE_ENABLE_I18N */
3282   int non_match = 0;
3283 #endif /* not RE_ENABLE_I18N */
3284   reg_errcode_t ret;
3285   re_token_t br_token;
3286   bin_tree_t *tree;
3287   int new_idx;
3288
3289   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3290 #ifdef RE_ENABLE_I18N
3291   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3292 #endif /* RE_ENABLE_I18N */
3293
3294 #ifdef RE_ENABLE_I18N
3295   if (BE (sbcset == NULL || mbcset == NULL, 0))
3296 #else /* not RE_ENABLE_I18N */
3297   if (BE (sbcset == NULL, 0))
3298 #endif /* not RE_ENABLE_I18N */
3299     {
3300       *err = REG_ESPACE;
3301       return NULL;
3302     }
3303
3304   if (not)
3305     {
3306 #ifdef RE_ENABLE_I18N
3307       int i;
3308       /*
3309       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3310         bitset_set(cset->sbcset, '\0');
3311       */
3312       mbcset->non_match = 1;
3313       if (MB_CUR_MAX > 1)
3314         for (i = 0; i < SBC_MAX; ++i)
3315           if (__btowc (i) == WEOF)
3316             bitset_set (sbcset, i);
3317 #else /* not RE_ENABLE_I18N */
3318       non_match = 1;
3319 #endif /* not RE_ENABLE_I18N */
3320     }
3321
3322   /* We don't care the syntax in this case.  */
3323   ret = build_charclass (sbcset,
3324 #ifdef RE_ENABLE_I18N
3325                          mbcset, &alloc,
3326 #endif /* RE_ENABLE_I18N */
3327                          (const unsigned char *) "alpha", 0);
3328
3329   if (BE (ret != REG_NOERROR, 0))
3330     {
3331       re_free (sbcset);
3332 #ifdef RE_ENABLE_I18N
3333       free_charset (mbcset);
3334 #endif /* RE_ENABLE_I18N */
3335       *err = REG_ESPACE;
3336       return NULL;
3337     }
3338   /* \w match '_' also.  */
3339   bitset_set (sbcset, '_');
3340
3341   /* If it is non-matching list.  */
3342 #ifdef RE_ENABLE_I18N
3343   if (mbcset->non_match)
3344 #else /* not RE_ENABLE_I18N */
3345   if (non_match)
3346 #endif /* not RE_ENABLE_I18N */
3347     bitset_not (sbcset);
3348
3349   /* Build a tree for simple bracket.  */
3350   br_token.type = SIMPLE_BRACKET;
3351   br_token.opr.sbcset = sbcset;
3352   new_idx = re_dfa_add_node (dfa, br_token, 0);
3353   tree = create_tree (NULL, NULL, 0, new_idx);
3354   if (BE (new_idx == -1 || tree == NULL, 0))
3355     goto build_word_op_espace;
3356
3357 #ifdef RE_ENABLE_I18N
3358   if (MB_CUR_MAX > 1)
3359     {
3360       re_token_t alt_token;
3361       bin_tree_t *mbc_tree;
3362       /* Build a tree for complex bracket.  */
3363       br_token.type = COMPLEX_BRACKET;
3364       br_token.opr.mbcset = mbcset;
3365       dfa->has_mb_node = 1;
3366       new_idx = re_dfa_add_node (dfa, br_token, 0);
3367       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3368       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3369         goto build_word_op_espace;
3370       /* Then join them by ALT node.  */
3371       alt_token.type = OP_ALT;
3372       new_idx = re_dfa_add_node (dfa, alt_token, 0);
3373       tree = create_tree (tree, mbc_tree, 0, new_idx);
3374       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3375         return tree;
3376     }
3377   else
3378     {
3379       free_charset (mbcset);
3380       return tree;
3381     }
3382 #else /* not RE_ENABLE_I18N */
3383   return tree;
3384 #endif /* not RE_ENABLE_I18N */
3385
3386  build_word_op_espace:
3387   re_free (sbcset);
3388 #ifdef RE_ENABLE_I18N
3389   free_charset (mbcset);
3390 #endif /* RE_ENABLE_I18N */
3391   *err = REG_ESPACE;
3392   return NULL;
3393 }
3394
3395 /* This is intended for the expressions like "a{1,3}".
3396    Fetch a number from `input', and return the number.
3397    Return -1, if the number field is empty like "{,1}".
3398    Return -2, If an error is occured.  */
3399
3400 static int
3401 fetch_number (input, token, syntax)
3402      re_string_t *input;
3403      re_token_t *token;
3404      reg_syntax_t syntax;
3405 {
3406   int num = -1;
3407   unsigned char c;
3408   while (1)
3409     {
3410       *token = fetch_token (input, syntax);
3411       c = token->opr.c;
3412       if (BE (token->type == END_OF_RE, 0))
3413         return -2;
3414       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3415         break;
3416       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3417              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3418       num = (num > RE_DUP_MAX) ? -2 : num;
3419     }
3420   return num;
3421 }
3422 \f
3423 #ifdef RE_ENABLE_I18N
3424 static void
3425 free_charset (re_charset_t *cset)
3426 {
3427   re_free (cset->mbchars);
3428 # ifdef _LIBC
3429   re_free (cset->coll_syms);
3430   re_free (cset->equiv_classes);
3431   re_free (cset->range_starts);
3432   re_free (cset->range_ends);
3433 # endif
3434   re_free (cset->char_classes);
3435   re_free (cset);
3436 }
3437 #endif /* RE_ENABLE_I18N */
3438 \f
3439 /* Functions for binary tree operation.  */
3440
3441 /* Create a node of tree.
3442    Note: This function automatically free left and right if malloc fails.  */
3443
3444 static bin_tree_t *
3445 create_tree (left, right, type, index)
3446      bin_tree_t *left;
3447      bin_tree_t *right;
3448      re_token_type_t type;
3449      int index;
3450 {
3451   bin_tree_t *tree;
3452   tree = re_malloc (bin_tree_t, 1);
3453   if (BE (tree == NULL, 0))
3454     {
3455       free_bin_tree (left);
3456       free_bin_tree (right);
3457       return NULL;
3458     }
3459   tree->parent = NULL;
3460   tree->left = left;
3461   tree->right = right;
3462   tree->type = type;
3463   tree->node_idx = index;
3464   tree->first = -1;
3465   tree->next = -1;
3466   re_node_set_init_empty (&tree->eclosure);
3467
3468   if (left != NULL)
3469     left->parent = tree;
3470   if (right != NULL)
3471     right->parent = tree;
3472   return tree;
3473 }
3474
3475 /* Free the sub tree pointed by TREE.  */
3476
3477 static void
3478 free_bin_tree (tree)
3479      bin_tree_t *tree;
3480 {
3481   if (tree == NULL)
3482     return;
3483   /*re_node_set_free (&tree->eclosure);*/
3484   free_bin_tree (tree->left);
3485   free_bin_tree (tree->right);
3486   re_free (tree);
3487 }
3488
3489 /* Duplicate the node SRC, and return new node.  */
3490
3491 static bin_tree_t *
3492 duplicate_tree (src, dfa)
3493      const bin_tree_t *src;
3494      re_dfa_t *dfa;
3495 {
3496   bin_tree_t *left = NULL, *right = NULL, *new_tree;
3497   int new_node_idx;
3498   /* Since node indies must be according to Post-order of the tree,
3499      we must duplicate the left at first.  */
3500   if (src->left != NULL)
3501     {
3502       left = duplicate_tree (src->left, dfa);
3503       if (left == NULL)
3504         return NULL;
3505     }
3506
3507   /* Secondaly, duplicate the right.  */
3508   if (src->right != NULL)
3509     {
3510       right = duplicate_tree (src->right, dfa);
3511       if (right == NULL)
3512         {
3513           free_bin_tree (left);
3514           return NULL;
3515         }
3516     }
3517
3518   /* At last, duplicate itself.  */
3519   if (src->type == NON_TYPE)
3520     {
3521       new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3522       dfa->nodes[new_node_idx].duplicated = 1;
3523       if (BE (new_node_idx == -1, 0))
3524         {
3525           free_bin_tree (left);
3526           free_bin_tree (right);
3527           return NULL;
3528         }
3529     }
3530   else
3531     new_node_idx = src->type;
3532
3533   new_tree = create_tree (left, right, src->type, new_node_idx);
3534   if (BE (new_tree == NULL, 0))
3535     {
3536       free_bin_tree (left);
3537       free_bin_tree (right);
3538     }
3539   return new_tree;
3540 }