posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 #include <assert.h>
  22 #include <ctype.h>
  23 #include <limits.h>
  24 #include <locale.h>
  25 #include <stdio.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28
  29 #if defined HAVE_WCHAR_H || defined _LIBC
  30 # include <wchar.h>
  31 #endif /* HAVE_WCHAR_H || _LIBC */
  32 #if defined HAVE_WCTYPE_H || defined _LIBC
  33 # include <wctype.h>
  34 #endif /* HAVE_WCTYPE_H || _LIBC */
  35
  36 /* In case that the system doesn't have isblank().  */
  37 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
  38 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
  39 #endif
  40
  41 #ifdef _LIBC
  42 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
  43 #  define _RE_DEFINE_LOCALE_FUNCTIONS 1
  44 #   include <locale/localeinfo.h>
  45 #   include <locale/elem-hash.h>
  46 #   include <locale/coll-lookup.h>
  47 # endif
  48 #endif
  49
  50 /* This is for other GNU distributions with internationalized messages.  */
  51 #if HAVE_LIBINTL_H || defined _LIBC
  52 # include <libintl.h>
  53 # ifdef _LIBC
  54 #  undef gettext
  55 #  define gettext(msgid) \
  56   INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
  57 # endif
  58 #else
  59 # define gettext(msgid) (msgid)
  60 #endif
  61
  62 #ifndef gettext_noop
  63 /* This define is so xgettext can find the internationalizable
  64    strings.  */
  65 # define gettext_noop(String) String
  66 #endif
  67
  68 #include <regex.h>
  69 #include "regex_internal.h"
  70
  71 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  72                                           int length, reg_syntax_t syntax);
  73 static void re_compile_fastmap_iter (regex_t *bufp,
  74                                      const re_dfastate_t *init_state,
  75                                      char *fastmap);
  76 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  77 static reg_errcode_t init_word_char (re_dfa_t *dfa);
  78 #ifdef RE_ENABLE_I18N
  79 static void free_charset (re_charset_t *cset);
  80 #endif /* RE_ENABLE_I18N */
  81 static void free_workarea_compile (regex_t *preg);
  82 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  83 static reg_errcode_t analyze (re_dfa_t *dfa);
  84 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
  85 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
  86 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
  87 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
  88 static reg_errcode_t duplicate_node_closure (re_dfa_t *dfa, int top_org_node,
  89                                              int top_clone_node, int root_node,
  90                                              unsigned int constraint);
  91 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
  92                                      unsigned int constraint);
  93 static int search_duplicated_node (re_dfa_t *dfa, int org_node,
  94                                    unsigned int constraint);
  95 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  96 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  97                                          int node, int root);
  98 static void calc_inveclosure (re_dfa_t *dfa);
  99 static int fetch_number (re_string_t *input, re_token_t *token,
 100                          reg_syntax_t syntax);
 101 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
 102 static int peek_token (re_token_t *token, re_string_t *input,
 103                         reg_syntax_t syntax);
 104 static int peek_token_bracket (re_token_t *token, re_string_t *input,
 105                                reg_syntax_t syntax);
 106 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
 107                           reg_syntax_t syntax, reg_errcode_t *err);
 108 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
 109                                   re_token_t *token, reg_syntax_t syntax,
 110                                   int nest, reg_errcode_t *err);
 111 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
 112                                  re_token_t *token, reg_syntax_t syntax,
 113                                  int nest, reg_errcode_t *err);
 114 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
 115                                      re_token_t *token, reg_syntax_t syntax,
 116                                      int nest, reg_errcode_t *err);
 117 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
 118                                   re_token_t *token, reg_syntax_t syntax,
 119                                   int nest, reg_errcode_t *err);
 120 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
 121                                  re_dfa_t *dfa, re_token_t *token,
 122                                  reg_syntax_t syntax, reg_errcode_t *err);
 123 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
 124                                       re_token_t *token, reg_syntax_t syntax,
 125                                       reg_errcode_t *err);
 126 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
 127                                             re_string_t *regexp,
 128                                             re_token_t *token, int token_len,
 129                                             re_dfa_t *dfa,
 130                                             reg_syntax_t syntax);
 131 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
 132                                           re_string_t *regexp,
 133                                           re_token_t *token);
 134 #ifndef _LIBC
 135 # ifdef RE_ENABLE_I18N
 136 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 137                                       re_charset_t *mbcset, int *range_alloc,
 138                                       bracket_elem_t *start_elem,
 139                                       bracket_elem_t *end_elem);
 140 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 141                                              re_charset_t *mbcset,
 142                                              int *coll_sym_alloc,
 143                                              const unsigned char *name);
 144 # else /* not RE_ENABLE_I18N */
 145 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 146                                       bracket_elem_t *start_elem,
 147                                       bracket_elem_t *end_elem);
 148 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 149                                              const unsigned char *name);
 150 # endif /* not RE_ENABLE_I18N */
 151 #endif /* not _LIBC */
 152 #ifdef RE_ENABLE_I18N
 153 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 154                                         re_charset_t *mbcset,
 155                                         int *equiv_class_alloc,
 156                                         const unsigned char *name);
 157 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
 158                                       re_charset_t *mbcset,
 159                                       int *char_class_alloc,
 160                                       const unsigned char *class_name,
 161                                       reg_syntax_t syntax);
 162 #else  /* not RE_ENABLE_I18N */
 163 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 164                                         const unsigned char *name);
 165 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
 166                                       const unsigned char *class_name,
 167                                       reg_syntax_t syntax);
 168 #endif /* not RE_ENABLE_I18N */
 169 static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
 170 static void free_bin_tree (bin_tree_t *tree);
 171 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
 172                                 re_token_type_t type, int index);
 173 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 174 \f
 175 /* This table gives an error message for each of the error codes listed
 176    in regex.h.  Obviously the order here has to be same as there.
 177    POSIX doesn't require that we do anything for REG_NOERROR,
 178    but why not be nice?  */
 179
 180 const char __re_error_msgid[] attribute_hidden =
 181   {
 182 #define REG_NOERROR_IDX 0
 183     gettext_noop ("Success")    /* REG_NOERROR */
 184     "\0"
 185 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 186     gettext_noop ("No match")   /* REG_NOMATCH */
 187     "\0"
 188 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 189     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 190     "\0"
 191 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 192     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 193     "\0"
 194 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 195     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 196     "\0"
 197 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 198     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 199     "\0"
 200 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 201     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 202     "\0"
 203 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 204     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 205     "\0"
 206 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 207     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 208     "\0"
 209 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 210     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 211     "\0"
 212 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 213     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 214     "\0"
 215 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 216     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 217     "\0"
 218 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 219     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 220     "\0"
 221 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 222     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 223     "\0"
 224 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 225     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 226     "\0"
 227 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 228     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 229     "\0"
 230 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 231     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 232   };
 233
 234 const size_t __re_error_msgid_idx[] attribute_hidden =
 235   {
 236     REG_NOERROR_IDX,
 237     REG_NOMATCH_IDX,
 238     REG_BADPAT_IDX,
 239     REG_ECOLLATE_IDX,
 240     REG_ECTYPE_IDX,
 241     REG_EESCAPE_IDX,
 242     REG_ESUBREG_IDX,
 243     REG_EBRACK_IDX,
 244     REG_EPAREN_IDX,
 245     REG_EBRACE_IDX,
 246     REG_BADBR_IDX,
 247     REG_ERANGE_IDX,
 248     REG_ESPACE_IDX,
 249     REG_BADRPT_IDX,
 250     REG_EEND_IDX,
 251     REG_ESIZE_IDX,
 252     REG_ERPAREN_IDX
 253   };
 254 \f
 255 /* Entry points for GNU code.  */
 256
 257 /* re_compile_pattern is the GNU regular expression compiler: it
 258    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 259    Returns 0 if the pattern was valid, otherwise an error string.
 260
 261    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 262    are set in BUFP on entry.  */
 263
 264 const char *
 265 re_compile_pattern (pattern, length, bufp)
 266     const char *pattern;
 267     size_t length;
 268     struct re_pattern_buffer *bufp;
 269 {
 270   reg_errcode_t ret;
 271
 272   /* And GNU code determines whether or not to get register information
 273      by passing null for the REGS argument to re_match, etc., not by
 274      setting no_sub.  */
 275   bufp->no_sub = 0;
 276
 277   /* Match anchors at newline.  */
 278   bufp->newline_anchor = 1;
 279
 280   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 281
 282   if (!ret)
 283     return NULL;
 284   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 285 }
 286 #ifdef _LIBC
 287 weak_alias (__re_compile_pattern, re_compile_pattern)
 288 #endif
 289
 290 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 291    also be assigned to arbitrarily: each pattern buffer stores its own
 292    syntax, so it can be changed between regex compilations.  */
 293 /* This has no initializer because initialized variables in Emacs
 294    become read-only after dumping.  */
 295 reg_syntax_t re_syntax_options;
 296
 297
 298 /* Specify the precise syntax of regexps for compilation.  This provides
 299    for compatibility for various utilities which historically have
 300    different, incompatible syntaxes.
 301
 302    The argument SYNTAX is a bit mask comprised of the various bits
 303    defined in regex.h.  We return the old syntax.  */
 304
 305 reg_syntax_t
 306 re_set_syntax (syntax)
 307     reg_syntax_t syntax;
 308 {
 309   reg_syntax_t ret = re_syntax_options;
 310
 311   re_syntax_options = syntax;
 312   return ret;
 313 }
 314 #ifdef _LIBC
 315 weak_alias (__re_set_syntax, re_set_syntax)
 316 #endif
 317
 318 int
 319 re_compile_fastmap (bufp)
 320     struct re_pattern_buffer *bufp;
 321 {
 322   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 323   char *fastmap = bufp->fastmap;
 324
 325   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 326   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 327   if (dfa->init_state != dfa->init_state_word)
 328     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 329   if (dfa->init_state != dfa->init_state_nl)
 330     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 331   if (dfa->init_state != dfa->init_state_begbuf)
 332     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 333   bufp->fastmap_accurate = 1;
 334   return 0;
 335 }
 336 #ifdef _LIBC
 337 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 338 #endif
 339
 340 static inline void
 341 re_set_fastmap (char *fastmap, int icase, int ch)
 342 {
 343   fastmap[ch] = 1;
 344   if (icase)
 345     fastmap[tolower (ch)] = 1;
 346 }
 347
 348 /* Helper function for re_compile_fastmap.
 349    Compile fastmap for the initial_state INIT_STATE.  */
 350
 351 static void
 352 re_compile_fastmap_iter (bufp, init_state, fastmap)
 353      regex_t *bufp;
 354      const re_dfastate_t *init_state;
 355      char *fastmap;
 356 {
 357   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 358   int node_cnt;
 359   int icase = (MB_CUR_MAX == 1 && (bufp->syntax & RE_ICASE));
 360   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 361     {
 362       int node = init_state->nodes.elems[node_cnt];
 363       re_token_type_t type = dfa->nodes[node].type;
 364
 365       if (type == CHARACTER)
 366         re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 367       else if (type == SIMPLE_BRACKET)
 368         {
 369           int i, j, ch;
 370           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 371             for (j = 0; j < UINT_BITS; ++j, ++ch)
 372               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 373                 re_set_fastmap (fastmap, icase, ch);
 374         }
 375 #ifdef RE_ENABLE_I18N
 376       else if (type == COMPLEX_BRACKET)
 377         {
 378           int i;
 379           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 380           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 381               || cset->nranges || cset->nchar_classes)
 382             {
 383 # ifdef _LIBC
 384               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 385                 {
 386                   /* In this case we want to catch the bytes which are
 387                      the first byte of any collation elements.
 388                      e.g. In da_DK, we want to catch 'a' since "aa"
 389                           is a valid collation element, and don't catch
 390                           'b' since 'b' is the only collation element
 391                           which starts from 'b'.  */
 392                   int j, ch;
 393                   const int32_t *table = (const int32_t *)
 394                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 395                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 396                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 397                       if (table[ch] < 0)
 398                         re_set_fastmap (fastmap, icase, ch);
 399                 }
 400 # else
 401               if (MB_CUR_MAX > 1)
 402                 for (i = 0; i < SBC_MAX; ++i)
 403                   if (__btowc (i) == WEOF)
 404                     re_set_fastmap (fastmap, icase, i);
 405 # endif /* not _LIBC */
 406             }
 407           for (i = 0; i < cset->nmbchars; ++i)
 408             {
 409               char buf[256];
 410               mbstate_t state;
 411               memset (&state, '\0', sizeof (state));
 412               __wcrtomb (buf, cset->mbchars[i], &state);
 413               re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 414             }
 415         }
 416 #endif /* RE_ENABLE_I18N */
 417       else if (type == END_OF_RE || type == OP_PERIOD)
 418         {
 419           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 420           if (type == END_OF_RE)
 421             bufp->can_be_null = 1;
 422           return;
 423         }
 424     }
 425 }
 426 \f
 427 /* Entry point for POSIX code.  */
 428 /* regcomp takes a regular expression as a string and compiles it.
 429
 430    PREG is a regex_t *.  We do not expect any fields to be initialized,
 431    since POSIX says we shouldn't.  Thus, we set
 432
 433      `buffer' to the compiled pattern;
 434      `used' to the length of the compiled pattern;
 435      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 436        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 437        RE_SYNTAX_POSIX_BASIC;
 438      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 439      `fastmap' to an allocated space for the fastmap;
 440      `fastmap_accurate' to zero;
 441      `re_nsub' to the number of subexpressions in PATTERN.
 442
 443    PATTERN is the address of the pattern string.
 444
 445    CFLAGS is a series of bits which affect compilation.
 446
 447      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 448      use POSIX basic syntax.
 449
 450      If REG_NEWLINE is set, then . and [^...] don't match newline.
 451      Also, regexec will try a match beginning after every newline.
 452
 453      If REG_ICASE is set, then we considers upper- and lowercase
 454      versions of letters to be equivalent when matching.
 455
 456      If REG_NOSUB is set, then when PREG is passed to regexec, that
 457      routine will report only success or failure, and nothing about the
 458      registers.
 459
 460    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 461    the return codes and their meanings.)  */
 462
 463 int
 464 regcomp (preg, pattern, cflags)
 465     regex_t *__restrict preg;
 466     const char *__restrict pattern;
 467     int cflags;
 468 {
 469   reg_errcode_t ret;
 470   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 471                          : RE_SYNTAX_POSIX_BASIC);
 472
 473   preg->buffer = NULL;
 474   preg->allocated = 0;
 475   preg->used = 0;
 476
 477   /* Try to allocate space for the fastmap.  */
 478   preg->fastmap = re_malloc (char, SBC_MAX);
 479   if (BE (preg->fastmap == NULL, 0))
 480     return REG_ESPACE;
 481
 482   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 483
 484   /* If REG_NEWLINE is set, newlines are treated differently.  */
 485   if (cflags & REG_NEWLINE)
 486     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 487       syntax &= ~RE_DOT_NEWLINE;
 488       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 489       /* It also changes the matching behavior.  */
 490       preg->newline_anchor = 1;
 491     }
 492   else
 493     preg->newline_anchor = 0;
 494   preg->no_sub = !!(cflags & REG_NOSUB);
 495   preg->translate = NULL;
 496
 497   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 498
 499   /* POSIX doesn't distinguish between an unmatched open-group and an
 500      unmatched close-group: both are REG_EPAREN.  */
 501   if (ret == REG_ERPAREN)
 502     ret = REG_EPAREN;
 503
 504   /* We have already checked preg->fastmap != NULL.  */
 505   if (BE (ret == REG_NOERROR, 1))
 506     /* Compute the fastmap now, since regexec cannot modify the pattern
 507        buffer.  This function nevers fails in this implementation.  */
 508     (void) re_compile_fastmap (preg);
 509   else
 510     {
 511       /* Some error occurred while compiling the expression.  */
 512       re_free (preg->fastmap);
 513       preg->fastmap = NULL;
 514     }
 515
 516   return (int) ret;
 517 }
 518 #ifdef _LIBC
 519 weak_alias (__regcomp, regcomp)
 520 #endif
 521
 522 /* Returns a message corresponding to an error code, ERRCODE, returned
 523    from either regcomp or regexec.   We don't use PREG here.  */
 524
 525 size_t
 526 regerror (errcode, preg, errbuf, errbuf_size)
 527     int errcode;
 528     const regex_t *preg;
 529     char *errbuf;
 530     size_t errbuf_size;
 531 {
 532   const char *msg;
 533   size_t msg_size;
 534
 535   if (BE (errcode < 0
 536           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 537                                / sizeof (__re_error_msgid_idx[0])), 0))
 538     /* Only error codes returned by the rest of the code should be passed
 539        to this routine.  If we are given anything else, or if other regex
 540        code generates an invalid error code, then the program has a bug.
 541        Dump core so we can fix it.  */
 542     abort ();
 543
 544   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 545
 546   msg_size = strlen (msg) + 1; /* Includes the null.  */
 547
 548   if (BE (errbuf_size != 0, 1))
 549     {
 550       if (BE (msg_size > errbuf_size, 0))
 551         {
 552 #if defined HAVE_MEMPCPY || defined _LIBC
 553           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 554 #else
 555           memcpy (errbuf, msg, errbuf_size - 1);
 556           errbuf[errbuf_size - 1] = 0;
 557 #endif
 558         }
 559       else
 560         memcpy (errbuf, msg, msg_size);
 561     }
 562
 563   return msg_size;
 564 }
 565 #ifdef _LIBC
 566 weak_alias (__regerror, regerror)
 567 #endif
 568
 569
 570 static void
 571 free_dfa_content (re_dfa_t *dfa)
 572 {
 573   int i, j;
 574
 575   re_free (dfa->subexps);
 576
 577   for (i = 0; i < dfa->nodes_len; ++i)
 578     {
 579       re_token_t *node = dfa->nodes + i;
 580 #ifdef RE_ENABLE_I18N
 581       if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
 582         free_charset (node->opr.mbcset);
 583       else
 584 #endif /* RE_ENABLE_I18N */
 585         if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
 586           re_free (node->opr.sbcset);
 587     }
 588   re_free (dfa->nexts);
 589   for (i = 0; i < dfa->nodes_len; ++i)
 590     {
 591       if (dfa->eclosures != NULL)
 592         re_node_set_free (dfa->eclosures + i);
 593       if (dfa->inveclosures != NULL)
 594         re_node_set_free (dfa->inveclosures + i);
 595       if (dfa->edests != NULL)
 596         re_node_set_free (dfa->edests + i);
 597     }
 598   re_free (dfa->edests);
 599   re_free (dfa->eclosures);
 600   re_free (dfa->inveclosures);
 601   re_free (dfa->nodes);
 602
 603   for (i = 0; i <= dfa->state_hash_mask; ++i)
 604     {
 605       struct re_state_table_entry *entry = dfa->state_table + i;
 606       for (j = 0; j < entry->num; ++j)
 607         {
 608           re_dfastate_t *state = entry->array[j];
 609           free_state (state);
 610         }
 611       re_free (entry->array);
 612     }
 613   re_free (dfa->state_table);
 614
 615   if (dfa->word_char != NULL)
 616     re_free (dfa->word_char);
 617 #ifdef DEBUG
 618   re_free (dfa->re_str);
 619 #endif
 620
 621   re_free (dfa);
 622 }
 623
 624
 625 /* Free dynamically allocated space used by PREG.  */
 626
 627 void
 628 regfree (preg)
 629     regex_t *preg;
 630 {
 631   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 632   if (BE (dfa != NULL, 1))
 633     free_dfa_content (dfa);
 634
 635   re_free (preg->fastmap);
 636 }
 637 #ifdef _LIBC
 638 weak_alias (__regfree, regfree)
 639 #endif
 640 \f
 641 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 642    them unless specifically requested.  */
 643
 644 #if defined _REGEX_RE_COMP || defined _LIBC
 645
 646 /* BSD has one and only one pattern buffer.  */
 647 static struct re_pattern_buffer re_comp_buf;
 648
 649 char *
 650 # ifdef _LIBC
 651 /* Make these definitions weak in libc, so POSIX programs can redefine
 652    these names if they don't use our functions, and still use
 653    regcomp/regexec above without link errors.  */
 654 weak_function
 655 # endif
 656 re_comp (s)
 657      const char *s;
 658 {
 659   reg_errcode_t ret;
 660   char *fastmap;
 661
 662   if (!s)
 663     {
 664       if (!re_comp_buf.buffer)
 665         return gettext ("No previous regular expression");
 666       return 0;
 667     }
 668
 669   if (re_comp_buf.buffer)
 670     {
 671       fastmap = re_comp_buf.fastmap;
 672       re_comp_buf.fastmap = NULL;
 673       __regfree (&re_comp_buf);
 674       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 675       re_comp_buf.fastmap = fastmap;
 676     }
 677
 678   if (re_comp_buf.fastmap == NULL)
 679     {
 680       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 681       if (re_comp_buf.fastmap == NULL)
 682         return (char *) gettext (__re_error_msgid
 683                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 684     }
 685
 686   /* Since `re_exec' always passes NULL for the `regs' argument, we
 687      don't need to initialize the pattern buffer fields which affect it.  */
 688
 689   /* Match anchors at newlines.  */
 690   re_comp_buf.newline_anchor = 1;
 691
 692   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 693
 694   if (!ret)
 695     return NULL;
 696
 697   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 698   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 699 }
 700
 701 #ifdef _LIBC
 702 libc_freeres_fn (free_mem)
 703 {
 704   __regfree (&re_comp_buf);
 705 }
 706 #endif
 707
 708 #endif /* _REGEX_RE_COMP */
 709 \f
 710 /* Internal entry point.
 711    Compile the regular expression PATTERN, whose length is LENGTH.
 712    SYNTAX indicate regular expression's syntax.  */
 713
 714 static reg_errcode_t
 715 re_compile_internal (preg, pattern, length, syntax)
 716      regex_t *preg;
 717      const char * pattern;
 718      int length;
 719      reg_syntax_t syntax;
 720 {
 721   reg_errcode_t err = REG_NOERROR;
 722   re_dfa_t *dfa;
 723   re_string_t regexp;
 724
 725   /* Initialize the pattern buffer.  */
 726   preg->fastmap_accurate = 0;
 727   preg->syntax = syntax;
 728   preg->not_bol = preg->not_eol = 0;
 729   preg->used = 0;
 730   preg->re_nsub = 0;
 731   preg->can_be_null = 0;
 732   preg->regs_allocated = REGS_UNALLOCATED;
 733
 734   /* Initialize the dfa.  */
 735   dfa = (re_dfa_t *) preg->buffer;
 736   if (preg->allocated < sizeof (re_dfa_t))
 737     {
 738       /* If zero allocated, but buffer is non-null, try to realloc
 739          enough space.  This loses if buffer's address is bogus, but
 740          that is the user's responsibility.  If ->buffer is NULL this
 741          is a simple allocation.  */
 742       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 743       if (dfa == NULL)
 744         return REG_ESPACE;
 745       preg->allocated = sizeof (re_dfa_t);
 746     }
 747   preg->buffer = (unsigned char *) dfa;
 748   preg->used = sizeof (re_dfa_t);
 749
 750   err = init_dfa (dfa, length);
 751   if (BE (err != REG_NOERROR, 0))
 752     {
 753       re_free (dfa);
 754       preg->buffer = NULL;
 755       return err;
 756     }
 757 #ifdef DEBUG
 758   dfa->re_str = re_malloc (char, length + 1);
 759   strncpy (dfa->re_str, pattern, length + 1);
 760 #endif
 761
 762   err = re_string_construct (&regexp, pattern, length, preg->translate,
 763                              syntax & RE_ICASE);
 764   if (BE (err != REG_NOERROR, 0))
 765     {
 766       re_free (dfa);
 767       preg->buffer = NULL;
 768       return err;
 769     }
 770
 771   /* Parse the regular expression, and build a structure tree.  */
 772   preg->re_nsub = 0;
 773   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 774   if (BE (dfa->str_tree == NULL, 0))
 775     goto re_compile_internal_free_return;
 776
 777   /* Analyze the tree and collect information which is necessary to
 778      create the dfa.  */
 779   err = analyze (dfa);
 780   if (BE (err != REG_NOERROR, 0))
 781     goto re_compile_internal_free_return;
 782
 783   /* Then create the initial state of the dfa.  */
 784   err = create_initial_state (dfa);
 785
 786   /* Release work areas.  */
 787   free_workarea_compile (preg);
 788   re_string_destruct (&regexp);
 789
 790   if (BE (err != REG_NOERROR, 0))
 791     {
 792     re_compile_internal_free_return:
 793       free_dfa_content (dfa);
 794       preg->buffer = NULL;
 795     }
 796
 797   return err;
 798 }
 799
 800 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 801    as the initial length of some arrays.  */
 802
 803 static reg_errcode_t
 804 init_dfa (dfa, pat_len)
 805      re_dfa_t *dfa;
 806      int pat_len;
 807 {
 808   int table_size;
 809
 810   memset (dfa, '\0', sizeof (re_dfa_t));
 811
 812   dfa->nodes_alloc = pat_len + 1;
 813   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 814
 815   dfa->states_alloc = pat_len + 1;
 816
 817   /*  table_size = 2 ^ ceil(log pat_len) */
 818   for (table_size = 1; table_size > 0; table_size <<= 1)
 819     if (table_size > pat_len)
 820       break;
 821
 822   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 823   dfa->state_hash_mask = table_size - 1;
 824
 825   dfa->subexps_alloc = 1;
 826   dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
 827   dfa->word_char = NULL;
 828
 829   if (BE (dfa->nodes == NULL || dfa->state_table == NULL
 830           || dfa->subexps == NULL, 0))
 831     {
 832       /* We don't bother to free anything which was allocated.  Very
 833          soon the process will go down anyway.  */
 834       dfa->subexps = NULL;
 835       dfa->state_table = NULL;
 836       dfa->nodes = NULL;
 837       return REG_ESPACE;
 838     }
 839   return REG_NOERROR;
 840 }
 841
 842 /* Initialize WORD_CHAR table, which indicate which character is
 843    "word".  In this case "word" means that it is the word construction
 844    character used by some operators like "\<", "\>", etc.  */
 845
 846 static reg_errcode_t
 847 init_word_char (dfa)
 848      re_dfa_t *dfa;
 849 {
 850   int i, j, ch;
 851   dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 852   if (BE (dfa->word_char == NULL, 0))
 853     return REG_ESPACE;
 854   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 855     for (j = 0; j < UINT_BITS; ++j, ++ch)
 856       if (isalnum (ch) || ch == '_')
 857         dfa->word_char[i] |= 1 << j;
 858   return REG_NOERROR;
 859 }
 860
 861 /* Free the work area which are only used while compiling.  */
 862
 863 static void
 864 free_workarea_compile (preg)
 865      regex_t *preg;
 866 {
 867   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 868   free_bin_tree (dfa->str_tree);
 869   dfa->str_tree = NULL;
 870   re_free (dfa->org_indices);
 871   dfa->org_indices = NULL;
 872 }
 873
 874 /* Create initial states for all contexts.  */
 875
 876 static reg_errcode_t
 877 create_initial_state (dfa)
 878      re_dfa_t *dfa;
 879 {
 880   int first, i;
 881   reg_errcode_t err;
 882   re_node_set init_nodes;
 883
 884   /* Initial states have the epsilon closure of the node which is
 885      the first node of the regular expression.  */
 886   first = dfa->str_tree->first;
 887   dfa->init_node = first;
 888   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 889   if (BE (err != REG_NOERROR, 0))
 890     return err;
 891
 892   /* The back-references which are in initial states can epsilon transit,
 893      since in this case all of the subexpressions can be null.
 894      Then we add epsilon closures of the nodes which are the next nodes of
 895      the back-references.  */
 896   if (dfa->nbackref > 0)
 897     for (i = 0; i < init_nodes.nelem; ++i)
 898       {
 899         int node_idx = init_nodes.elems[i];
 900         re_token_type_t type = dfa->nodes[node_idx].type;
 901
 902         int clexp_idx;
 903         if (type != OP_BACK_REF)
 904           continue;
 905         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 906           {
 907             re_token_t *clexp_node;
 908             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 909             if (clexp_node->type == OP_CLOSE_SUBEXP
 910                 && clexp_node->opr.idx + 1 == dfa->nodes[node_idx].opr.idx)
 911               break;
 912           }
 913         if (clexp_idx == init_nodes.nelem)
 914           continue;
 915
 916         if (type == OP_BACK_REF)
 917           {
 918             int dest_idx = dfa->edests[node_idx].elems[0];
 919             if (!re_node_set_contains (&init_nodes, dest_idx))
 920               {
 921                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
 922                 i = 0;
 923               }
 924           }
 925       }
 926
 927   /* It must be the first time to invoke acquire_state.  */
 928   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
 929   /* We don't check ERR here, since the initial state must not be NULL.  */
 930   if (BE (dfa->init_state == NULL, 0))
 931     return err;
 932   if (dfa->init_state->has_constraint)
 933     {
 934       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
 935                                                        CONTEXT_WORD);
 936       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
 937                                                      CONTEXT_NEWLINE);
 938       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
 939                                                          &init_nodes,
 940                                                          CONTEXT_NEWLINE
 941                                                          | CONTEXT_BEGBUF);
 942       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 943               || dfa->init_state_begbuf == NULL, 0))
 944         return err;
 945     }
 946   else
 947     dfa->init_state_word = dfa->init_state_nl
 948       = dfa->init_state_begbuf = dfa->init_state;
 949
 950   re_node_set_free (&init_nodes);
 951   return REG_NOERROR;
 952 }
 953 \f
 954 /* Analyze the structure tree, and calculate "first", "next", "edest",
 955    "eclosure", and "inveclosure".  */
 956
 957 static reg_errcode_t
 958 analyze (dfa)
 959      re_dfa_t *dfa;
 960 {
 961   int i;
 962   reg_errcode_t ret;
 963
 964   /* Allocate arrays.  */
 965   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
 966   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
 967   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
 968   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
 969   dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
 970   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
 971           || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
 972     return REG_ESPACE;
 973   /* Initialize them.  */
 974   for (i = 0; i < dfa->nodes_len; ++i)
 975     {
 976       dfa->nexts[i] = -1;
 977       re_node_set_init_empty (dfa->edests + i);
 978       re_node_set_init_empty (dfa->eclosures + i);
 979       re_node_set_init_empty (dfa->inveclosures + i);
 980     }
 981
 982   ret = analyze_tree (dfa, dfa->str_tree);
 983   if (BE (ret == REG_NOERROR, 1))
 984     {
 985       ret = calc_eclosure (dfa);
 986       if (ret == REG_NOERROR)
 987         calc_inveclosure (dfa);
 988     }
 989   return ret;
 990 }
 991
 992 /* Helper functions for analyze.
 993    This function calculate "first", "next", and "edest" for the subtree
 994    whose root is NODE.  */
 995
 996 static reg_errcode_t
 997 analyze_tree (dfa, node)
 998      re_dfa_t *dfa;
 999      bin_tree_t *node;
1000 {
1001   reg_errcode_t ret;
1002   if (node->first == -1)
1003     calc_first (dfa, node);
1004   if (node->next == -1)
1005     calc_next (dfa, node);
1006   if (node->eclosure.nelem == 0)
1007     calc_epsdest (dfa, node);
1008   /* Calculate "first" etc. for the left child.  */
1009   if (node->left != NULL)
1010     {
1011       ret = analyze_tree (dfa, node->left);
1012       if (BE (ret != REG_NOERROR, 0))
1013         return ret;
1014     }
1015   /* Calculate "first" etc. for the right child.  */
1016   if (node->right != NULL)
1017     {
1018       ret = analyze_tree (dfa, node->right);
1019       if (BE (ret != REG_NOERROR, 0))
1020         return ret;
1021     }
1022   return REG_NOERROR;
1023 }
1024
1025 /* Calculate "first" for the node NODE.  */
1026 static void
1027 calc_first (dfa, node)
1028      re_dfa_t *dfa;
1029      bin_tree_t *node;
1030 {
1031   int idx, type;
1032   idx = node->node_idx;
1033   type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1034
1035   switch (type)
1036     {
1037 #ifdef DEBUG
1038     case OP_OPEN_BRACKET:
1039     case OP_CLOSE_BRACKET:
1040     case OP_OPEN_DUP_NUM:
1041     case OP_CLOSE_DUP_NUM:
1042     case OP_NON_MATCH_LIST:
1043     case OP_OPEN_COLL_ELEM:
1044     case OP_CLOSE_COLL_ELEM:
1045     case OP_OPEN_EQUIV_CLASS:
1046     case OP_CLOSE_EQUIV_CLASS:
1047     case OP_OPEN_CHAR_CLASS:
1048     case OP_CLOSE_CHAR_CLASS:
1049       /* These must not be appeared here.  */
1050       assert (0);
1051 #endif
1052     case END_OF_RE:
1053     case CHARACTER:
1054     case OP_PERIOD:
1055     case OP_DUP_ASTERISK:
1056     case OP_DUP_QUESTION:
1057 #ifdef RE_ENABLE_I18N
1058     case COMPLEX_BRACKET:
1059 #endif /* RE_ENABLE_I18N */
1060     case SIMPLE_BRACKET:
1061     case OP_BACK_REF:
1062     case ANCHOR:
1063     case OP_OPEN_SUBEXP:
1064     case OP_CLOSE_SUBEXP:
1065       node->first = idx;
1066       break;
1067     case OP_DUP_PLUS:
1068 #ifdef DEBUG
1069       assert (node->left != NULL);
1070 #endif
1071       if (node->left->first == -1)
1072         calc_first (dfa, node->left);
1073       node->first = node->left->first;
1074       break;
1075     case OP_ALT:
1076       node->first = idx;
1077       break;
1078       /* else fall through */
1079     default:
1080 #ifdef DEBUG
1081       assert (node->left != NULL);
1082 #endif
1083       if (node->left->first == -1)
1084         calc_first (dfa, node->left);
1085       node->first = node->left->first;
1086       break;
1087     }
1088 }
1089
1090 /* Calculate "next" for the node NODE.  */
1091
1092 static void
1093 calc_next (dfa, node)
1094      re_dfa_t *dfa;
1095      bin_tree_t *node;
1096 {
1097   int idx, type;
1098   bin_tree_t *parent = node->parent;
1099   if (parent == NULL)
1100     {
1101       node->next = -1;
1102       idx = node->node_idx;
1103       if (node->type == 0)
1104         dfa->nexts[idx] = node->next;
1105       return;
1106     }
1107
1108   idx = parent->node_idx;
1109   type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1110
1111   switch (type)
1112     {
1113     case OP_DUP_ASTERISK:
1114     case OP_DUP_PLUS:
1115       node->next = idx;
1116       break;
1117     case CONCAT:
1118       if (parent->left == node)
1119         {
1120           if (parent->right->first == -1)
1121             calc_first (dfa, parent->right);
1122           node->next = parent->right->first;
1123           break;
1124         }
1125       /* else fall through */
1126     default:
1127       if (parent->next == -1)
1128         calc_next (dfa, parent);
1129       node->next = parent->next;
1130       break;
1131     }
1132   idx = node->node_idx;
1133   if (node->type == 0)
1134     dfa->nexts[idx] = node->next;
1135 }
1136
1137 /* Calculate "edest" for the node NODE.  */
1138
1139 static void
1140 calc_epsdest (dfa, node)
1141      re_dfa_t *dfa;
1142      bin_tree_t *node;
1143 {
1144   int idx;
1145   idx = node->node_idx;
1146   if (node->type == 0)
1147     {
1148       if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1149           || dfa->nodes[idx].type == OP_DUP_PLUS
1150           || dfa->nodes[idx].type == OP_DUP_QUESTION)
1151         {
1152           if (node->left->first == -1)
1153             calc_first (dfa, node->left);
1154           if (node->next == -1)
1155             calc_next (dfa, node);
1156           re_node_set_init_2 (dfa->edests + idx, node->left->first,
1157                               node->next);
1158         }
1159       else if (dfa->nodes[idx].type == OP_ALT)
1160         {
1161           int left, right;
1162           if (node->left != NULL)
1163             {
1164               if (node->left->first == -1)
1165                 calc_first (dfa, node->left);
1166               left = node->left->first;
1167             }
1168           else
1169             {
1170               if (node->next == -1)
1171                 calc_next (dfa, node);
1172               left = node->next;
1173             }
1174           if (node->right != NULL)
1175             {
1176               if (node->right->first == -1)
1177                 calc_first (dfa, node->right);
1178               right = node->right->first;
1179             }
1180           else
1181             {
1182               if (node->next == -1)
1183                 calc_next (dfa, node);
1184               right = node->next;
1185             }
1186           re_node_set_init_2 (dfa->edests + idx, left, right);
1187         }
1188       else if (dfa->nodes[idx].type == ANCHOR
1189                || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1190                || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
1191                || dfa->nodes[idx].type == OP_BACK_REF)
1192         re_node_set_init_1 (dfa->edests + idx, node->next);
1193     }
1194 }
1195
1196 /* Duplicate the epsilon closure of the node ROOT_NODE.
1197    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1198    to their own constraint.  */
1199
1200 static reg_errcode_t
1201 duplicate_node_closure (dfa, top_org_node, top_clone_node, root_node,
1202                         init_constraint)
1203      re_dfa_t *dfa;
1204      int top_org_node, top_clone_node, root_node;
1205      unsigned int init_constraint;
1206 {
1207   reg_errcode_t err;
1208   int org_node, clone_node, ret;
1209   unsigned int constraint = init_constraint;
1210   for (org_node = top_org_node, clone_node = top_clone_node;;)
1211     {
1212       int org_dest, clone_dest;
1213       if (dfa->nodes[org_node].type == OP_BACK_REF)
1214         {
1215           /* If the back reference epsilon-transit, its destination must
1216              also have the constraint.  Then duplicate the epsilon closure
1217              of the destination of the back reference, and store it in
1218              edests of the back reference.  */
1219           org_dest = dfa->nexts[org_node];
1220           re_node_set_empty (dfa->edests + clone_node);
1221           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1222           if (BE (err != REG_NOERROR, 0))
1223             return err;
1224           dfa->nexts[clone_node] = dfa->nexts[org_node];
1225           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1226           if (BE (ret < 0, 0))
1227             return REG_ESPACE;
1228         }
1229       else if (dfa->edests[org_node].nelem == 0)
1230         {
1231           /* In case of the node can't epsilon-transit, don't duplicate the
1232              destination and store the original destination as the
1233              destination of the node.  */
1234           dfa->nexts[clone_node] = dfa->nexts[org_node];
1235           break;
1236         }
1237       else if (dfa->edests[org_node].nelem == 1)
1238         {
1239           /* In case of the node can epsilon-transit, and it has only one
1240              destination.  */
1241           org_dest = dfa->edests[org_node].elems[0];
1242           re_node_set_empty (dfa->edests + clone_node);
1243           if (dfa->nodes[org_node].type == ANCHOR)
1244             {
1245               /* In case of the node has another constraint, append it.  */
1246               if (org_node == root_node && clone_node != org_node)
1247                 {
1248                   /* ...but if the node is root_node itself, it means the
1249                      epsilon closure have a loop, then tie it to the
1250                      destination of the root_node.  */
1251                   ret = re_node_set_insert (dfa->edests + clone_node,
1252                                             org_dest);
1253                   if (BE (ret < 0, 0))
1254                     return REG_ESPACE;
1255                   break;
1256                 }
1257               constraint |= dfa->nodes[org_node].opr.ctx_type;
1258             }
1259           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1260           if (BE (err != REG_NOERROR, 0))
1261             return err;
1262           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1263           if (BE (ret < 0, 0))
1264             return REG_ESPACE;
1265         }
1266       else /* dfa->edests[org_node].nelem == 2 */
1267         {
1268           /* In case of the node can epsilon-transit, and it has two
1269              destinations. E.g. '|', '*', '+', '?'.   */
1270           org_dest = dfa->edests[org_node].elems[0];
1271           re_node_set_empty (dfa->edests + clone_node);
1272           /* Search for a duplicated node which satisfies the constraint.  */
1273           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1274           if (clone_dest == -1)
1275             {
1276               /* There are no such a duplicated node, create a new one.  */
1277               err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1278               if (BE (err != REG_NOERROR, 0))
1279                 return err;
1280               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1281               if (BE (ret < 0, 0))
1282                 return REG_ESPACE;
1283               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1284                                             root_node, constraint);
1285               if (BE (err != REG_NOERROR, 0))
1286                 return err;
1287             }
1288           else
1289             {
1290               /* There are a duplicated node which satisfy the constraint,
1291                  use it to avoid infinite loop.  */
1292               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1293               if (BE (ret < 0, 0))
1294                 return REG_ESPACE;
1295             }
1296
1297           org_dest = dfa->edests[org_node].elems[1];
1298           err = duplicate_node (&clone_dest, dfa, org_dest, constraint);
1299           if (BE (err != REG_NOERROR, 0))
1300             return err;
1301           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1302           if (BE (ret < 0, 0))
1303             return REG_ESPACE;
1304         }
1305       org_node = org_dest;
1306       clone_node = clone_dest;
1307     }
1308   return REG_NOERROR;
1309 }
1310
1311 /* Search for a node which is duplicated from the node ORG_NODE, and
1312    satisfies the constraint CONSTRAINT.  */
1313
1314 static int
1315 search_duplicated_node (dfa, org_node, constraint)
1316      re_dfa_t *dfa;
1317      int org_node;
1318      unsigned int constraint;
1319 {
1320   int idx;
1321   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1322     {
1323       if (org_node == dfa->org_indices[idx]
1324           && constraint == dfa->nodes[idx].constraint)
1325         return idx; /* Found.  */
1326     }
1327   return -1; /* Not found.  */
1328 }
1329
1330 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1331    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1332    otherwise return the error code.  */
1333
1334 static reg_errcode_t
1335 duplicate_node (new_idx, dfa, org_idx, constraint)
1336      re_dfa_t *dfa;
1337      int *new_idx, org_idx;
1338      unsigned int constraint;
1339 {
1340   re_token_t dup;
1341   int dup_idx;
1342
1343   dup = dfa->nodes[org_idx];
1344   dup_idx = re_dfa_add_node (dfa, dup, 1);
1345   if (BE (dup_idx == -1, 0))
1346     return REG_ESPACE;
1347   dfa->nodes[dup_idx].constraint = constraint;
1348   if (dfa->nodes[org_idx].type == ANCHOR)
1349     dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1350   dfa->nodes[dup_idx].duplicated = 1;
1351   re_node_set_init_empty (dfa->edests + dup_idx);
1352   re_node_set_init_empty (dfa->eclosures + dup_idx);
1353   re_node_set_init_empty (dfa->inveclosures + dup_idx);
1354
1355   /* Store the index of the original node.  */
1356   dfa->org_indices[dup_idx] = org_idx;
1357   *new_idx = dup_idx;
1358   return REG_NOERROR;
1359 }
1360
1361 static void
1362 calc_inveclosure (dfa)
1363      re_dfa_t *dfa;
1364 {
1365   int src, idx, dest;
1366   for (src = 0; src < dfa->nodes_len; ++src)
1367     {
1368       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1369         {
1370           dest = dfa->eclosures[src].elems[idx];
1371           re_node_set_insert (dfa->inveclosures + dest, src);
1372         }
1373     }
1374 }
1375
1376 /* Calculate "eclosure" for all the node in DFA.  */
1377
1378 static reg_errcode_t
1379 calc_eclosure (dfa)
1380      re_dfa_t *dfa;
1381 {
1382   int node_idx, incomplete;
1383 #ifdef DEBUG
1384   assert (dfa->nodes_len > 0);
1385 #endif
1386   incomplete = 0;
1387   /* For each nodes, calculate epsilon closure.  */
1388   for (node_idx = 0; ; ++node_idx)
1389     {
1390       reg_errcode_t err;
1391       re_node_set eclosure_elem;
1392       if (node_idx == dfa->nodes_len)
1393         {
1394           if (!incomplete)
1395             break;
1396           incomplete = 0;
1397           node_idx = 0;
1398         }
1399
1400 #ifdef DEBUG
1401       assert (dfa->eclosures[node_idx].nelem != -1);
1402 #endif
1403       /* If we have already calculated, skip it.  */
1404       if (dfa->eclosures[node_idx].nelem != 0)
1405         continue;
1406       /* Calculate epsilon closure of `node_idx'.  */
1407       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1408       if (BE (err != REG_NOERROR, 0))
1409         return err;
1410
1411       if (dfa->eclosures[node_idx].nelem == 0)
1412         {
1413           incomplete = 1;
1414           re_node_set_free (&eclosure_elem);
1415         }
1416     }
1417   return REG_NOERROR;
1418 }
1419
1420 /* Calculate epsilon closure of NODE.  */
1421
1422 static reg_errcode_t
1423 calc_eclosure_iter (new_set, dfa, node, root)
1424      re_node_set *new_set;
1425      re_dfa_t *dfa;
1426      int node, root;
1427 {
1428   reg_errcode_t err;
1429   unsigned int constraint;
1430   int i, incomplete;
1431   re_node_set eclosure;
1432   incomplete = 0;
1433   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1434   if (BE (err != REG_NOERROR, 0))
1435     return err;
1436
1437   /* This indicates that we are calculating this node now.
1438      We reference this value to avoid infinite loop.  */
1439   dfa->eclosures[node].nelem = -1;
1440
1441   constraint = ((dfa->nodes[node].type == ANCHOR)
1442                 ? dfa->nodes[node].opr.ctx_type : 0);
1443   /* If the current node has constraints, duplicate all nodes.
1444      Since they must inherit the constraints.  */
1445   if (constraint && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1446     {
1447       int org_node, cur_node;
1448       org_node = cur_node = node;
1449       err = duplicate_node_closure (dfa, node, node, node, constraint);
1450       if (BE (err != REG_NOERROR, 0))
1451         return err;
1452     }
1453
1454   /* Expand each epsilon destination nodes.  */
1455   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1456     for (i = 0; i < dfa->edests[node].nelem; ++i)
1457       {
1458         re_node_set eclosure_elem;
1459         int edest = dfa->edests[node].elems[i];
1460         /* If calculating the epsilon closure of `edest' is in progress,
1461            return intermediate result.  */
1462         if (dfa->eclosures[edest].nelem == -1)
1463           {
1464             incomplete = 1;
1465             continue;
1466           }
1467         /* If we haven't calculated the epsilon closure of `edest' yet,
1468            calculate now. Otherwise use calculated epsilon closure.  */
1469         if (dfa->eclosures[edest].nelem == 0)
1470           {
1471             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1472             if (BE (err != REG_NOERROR, 0))
1473               return err;
1474           }
1475         else
1476           eclosure_elem = dfa->eclosures[edest];
1477         /* Merge the epsilon closure of `edest'.  */
1478         re_node_set_merge (&eclosure, &eclosure_elem);
1479         /* If the epsilon closure of `edest' is incomplete,
1480            the epsilon closure of this node is also incomplete.  */
1481         if (dfa->eclosures[edest].nelem == 0)
1482           {
1483             incomplete = 1;
1484             re_node_set_free (&eclosure_elem);
1485           }
1486       }
1487
1488   /* Epsilon closures include itself.  */
1489   re_node_set_insert (&eclosure, node);
1490   if (incomplete && !root)
1491     dfa->eclosures[node].nelem = 0;
1492   else
1493     dfa->eclosures[node] = eclosure;
1494   *new_set = eclosure;
1495   return REG_NOERROR;
1496 }
1497 \f
1498 /* Functions for token which are used in the parser.  */
1499
1500 /* Fetch a token from INPUT.
1501    We must not use this function inside bracket expressions.  */
1502
1503 static re_token_t
1504 fetch_token (input, syntax)
1505      re_string_t *input;
1506      reg_syntax_t syntax;
1507 {
1508   re_token_t token;
1509   int consumed_byte;
1510   consumed_byte = peek_token (&token, input, syntax);
1511   re_string_skip_bytes (input, consumed_byte);
1512   return token;
1513 }
1514
1515 /* Peek a token from INPUT, and return the length of the token.
1516    We must not use this function inside bracket expressions.  */
1517
1518 static int
1519 peek_token (token, input, syntax)
1520      re_token_t *token;
1521      re_string_t *input;
1522      reg_syntax_t syntax;
1523 {
1524   unsigned char c;
1525
1526   if (re_string_eoi (input))
1527     {
1528       token->type = END_OF_RE;
1529       return 0;
1530     }
1531
1532   c = re_string_peek_byte (input, 0);
1533   token->opr.c = c;
1534
1535 #ifdef RE_ENABLE_I18N
1536   token->mb_partial = 0;
1537   if (MB_CUR_MAX > 1 &&
1538       !re_string_first_byte (input, re_string_cur_idx (input)))
1539     {
1540       token->type = CHARACTER;
1541       token->mb_partial = 1;
1542       return 1;
1543     }
1544 #endif
1545   if (c == '\\')
1546     {
1547       unsigned char c2;
1548       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1549         {
1550           token->type = BACK_SLASH;
1551           return 1;
1552         }
1553
1554       c2 = re_string_peek_byte_case (input, 1);
1555       token->opr.c = c2;
1556       token->type = CHARACTER;
1557       switch (c2)
1558         {
1559         case '|':
1560           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1561             token->type = OP_ALT;
1562           break;
1563         case '1': case '2': case '3': case '4': case '5':
1564         case '6': case '7': case '8': case '9':
1565           if (!(syntax & RE_NO_BK_REFS))
1566             {
1567               token->type = OP_BACK_REF;
1568               token->opr.idx = c2 - '0';
1569             }
1570           break;
1571         case '<':
1572           if (!(syntax & RE_NO_GNU_OPS))
1573             {
1574               token->type = ANCHOR;
1575               token->opr.idx = WORD_FIRST;
1576             }
1577           break;
1578         case '>':
1579           if (!(syntax & RE_NO_GNU_OPS))
1580             {
1581               token->type = ANCHOR;
1582               token->opr.idx = WORD_LAST;
1583             }
1584           break;
1585         case 'b':
1586           if (!(syntax & RE_NO_GNU_OPS))
1587             {
1588               token->type = ANCHOR;
1589               token->opr.idx = WORD_DELIM;
1590             }
1591           break;
1592         case 'B':
1593           if (!(syntax & RE_NO_GNU_OPS))
1594             {
1595               token->type = ANCHOR;
1596               token->opr.idx = INSIDE_WORD;
1597             }
1598           break;
1599         case 'w':
1600           if (!(syntax & RE_NO_GNU_OPS))
1601             token->type = OP_WORD;
1602           break;
1603         case 'W':
1604           if (!(syntax & RE_NO_GNU_OPS))
1605             token->type = OP_NOTWORD;
1606           break;
1607         case '`':
1608           if (!(syntax & RE_NO_GNU_OPS))
1609             {
1610               token->type = ANCHOR;
1611               token->opr.idx = BUF_FIRST;
1612             }
1613           break;
1614         case '\'':
1615           if (!(syntax & RE_NO_GNU_OPS))
1616             {
1617               token->type = ANCHOR;
1618               token->opr.idx = BUF_LAST;
1619             }
1620           break;
1621         case '(':
1622           if (!(syntax & RE_NO_BK_PARENS))
1623             token->type = OP_OPEN_SUBEXP;
1624           break;
1625         case ')':
1626           if (!(syntax & RE_NO_BK_PARENS))
1627             token->type = OP_CLOSE_SUBEXP;
1628           break;
1629         case '+':
1630           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1631             token->type = OP_DUP_PLUS;
1632           break;
1633         case '?':
1634           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1635             token->type = OP_DUP_QUESTION;
1636           break;
1637         case '{':
1638           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1639             token->type = OP_OPEN_DUP_NUM;
1640           break;
1641         case '}':
1642           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1643             token->type = OP_CLOSE_DUP_NUM;
1644           break;
1645         default:
1646           break;
1647         }
1648       return 2;
1649     }
1650
1651   token->type = CHARACTER;
1652   switch (c)
1653     {
1654     case '\n':
1655       if (syntax & RE_NEWLINE_ALT)
1656         token->type = OP_ALT;
1657       break;
1658     case '|':
1659       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1660         token->type = OP_ALT;
1661       break;
1662     case '*':
1663       token->type = OP_DUP_ASTERISK;
1664       break;
1665     case '+':
1666       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1667         token->type = OP_DUP_PLUS;
1668       break;
1669     case '?':
1670       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1671         token->type = OP_DUP_QUESTION;
1672       break;
1673     case '{':
1674       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1675         token->type = OP_OPEN_DUP_NUM;
1676       break;
1677     case '}':
1678       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1679         token->type = OP_CLOSE_DUP_NUM;
1680       break;
1681     case '(':
1682       if (syntax & RE_NO_BK_PARENS)
1683         token->type = OP_OPEN_SUBEXP;
1684       break;
1685     case ')':
1686       if (syntax & RE_NO_BK_PARENS)
1687         token->type = OP_CLOSE_SUBEXP;
1688       break;
1689     case '[':
1690       token->type = OP_OPEN_BRACKET;
1691       break;
1692     case '.':
1693       token->type = OP_PERIOD;
1694       break;
1695     case '^':
1696       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1697           re_string_cur_idx (input) != 0)
1698         {
1699           char prev = re_string_peek_byte (input, -1);
1700           if (prev != '|' && prev != '(' &&
1701               (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
1702             break;
1703         }
1704       token->type = ANCHOR;
1705       token->opr.idx = LINE_FIRST;
1706       break;
1707     case '$':
1708       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1709           re_string_cur_idx (input) + 1 != re_string_length (input))
1710         {
1711           re_token_t next;
1712           re_string_skip_bytes (input, 1);
1713           peek_token (&next, input, syntax);
1714           re_string_skip_bytes (input, -1);
1715           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1716             break;
1717         }
1718       token->type = ANCHOR;
1719       token->opr.idx = LINE_LAST;
1720       break;
1721     default:
1722       break;
1723     }
1724   return 1;
1725 }
1726
1727 /* Peek a token from INPUT, and return the length of the token.
1728    We must not use this function out of bracket expressions.  */
1729
1730 static int
1731 peek_token_bracket (token, input, syntax)
1732      re_token_t *token;
1733      re_string_t *input;
1734      reg_syntax_t syntax;
1735 {
1736   unsigned char c;
1737   if (re_string_eoi (input))
1738     {
1739       token->type = END_OF_RE;
1740       return 0;
1741     }
1742   c = re_string_peek_byte (input, 0);
1743   token->opr.c = c;
1744
1745 #ifdef RE_ENABLE_I18N
1746   if (MB_CUR_MAX > 1 &&
1747       !re_string_first_byte (input, re_string_cur_idx (input)))
1748     {
1749       token->type = CHARACTER;
1750       return 1;
1751     }
1752 #endif /* RE_ENABLE_I18N */
1753
1754   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1755     {
1756       /* In this case, '\' escape a character.  */
1757       unsigned char c2;
1758       re_string_skip_bytes (input, 1);
1759       c2 = re_string_peek_byte (input, 0);
1760       token->opr.c = c2;
1761       token->type = CHARACTER;
1762       return 1;
1763     }
1764   if (c == '[') /* '[' is a special char in a bracket exps.  */
1765     {
1766       unsigned char c2;
1767       int token_len;
1768       c2 = re_string_peek_byte (input, 1);
1769       token->opr.c = c2;
1770       token_len = 2;
1771       switch (c2)
1772         {
1773         case '.':
1774           token->type = OP_OPEN_COLL_ELEM;
1775           break;
1776         case '=':
1777           token->type = OP_OPEN_EQUIV_CLASS;
1778           break;
1779         case ':':
1780           if (syntax & RE_CHAR_CLASSES)
1781             {
1782               token->type = OP_OPEN_CHAR_CLASS;
1783               break;
1784             }
1785           /* else fall through.  */
1786         default:
1787           token->type = CHARACTER;
1788           token->opr.c = c;
1789           token_len = 1;
1790           break;
1791         }
1792       return token_len;
1793     }
1794   switch (c)
1795     {
1796     case '-':
1797       token->type = OP_CHARSET_RANGE;
1798       break;
1799     case ']':
1800       token->type = OP_CLOSE_BRACKET;
1801       break;
1802     case '^':
1803       token->type = OP_NON_MATCH_LIST;
1804       break;
1805     default:
1806       token->type = CHARACTER;
1807     }
1808   return 1;
1809 }
1810 \f
1811 /* Functions for parser.  */
1812
1813 /* Entry point of the parser.
1814    Parse the regular expression REGEXP and return the structure tree.
1815    If an error is occured, ERR is set by error code, and return NULL.
1816    This function build the following tree, from regular expression <reg_exp>:
1817            CAT
1818            / \
1819           /   \
1820    <reg_exp>  EOR
1821
1822    CAT means concatenation.
1823    EOR means end of regular expression.  */
1824
1825 static bin_tree_t *
1826 parse (regexp, preg, syntax, err)
1827      re_string_t *regexp;
1828      regex_t *preg;
1829      reg_syntax_t syntax;
1830      reg_errcode_t *err;
1831 {
1832   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1833   bin_tree_t *tree, *eor, *root;
1834   re_token_t current_token;
1835   int new_idx;
1836   current_token = fetch_token (regexp, syntax);
1837   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1838   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1839     return NULL;
1840   new_idx = re_dfa_add_node (dfa, current_token, 0);
1841   eor = create_tree (NULL, NULL, 0, new_idx);
1842   if (tree != NULL)
1843     root = create_tree (tree, eor, CONCAT, 0);
1844   else
1845     root = eor;
1846   if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1847     {
1848       *err = REG_ESPACE;
1849       return NULL;
1850     }
1851   return root;
1852 }
1853
1854 /* This function build the following tree, from regular expression
1855    <branch1>|<branch2>:
1856            ALT
1857            / \
1858           /   \
1859    <branch1> <branch2>
1860
1861    ALT means alternative, which represents the operator `|'.  */
1862
1863 static bin_tree_t *
1864 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1865      re_string_t *regexp;
1866      regex_t *preg;
1867      re_token_t *token;
1868      reg_syntax_t syntax;
1869      int nest;
1870      reg_errcode_t *err;
1871 {
1872   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1873   bin_tree_t *tree, *branch = NULL;
1874   int new_idx;
1875   tree = parse_branch (regexp, preg, token, syntax, nest, err);
1876   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1877     return NULL;
1878
1879   while (token->type == OP_ALT)
1880     {
1881       re_token_t alt_token = *token;
1882       new_idx = re_dfa_add_node (dfa, alt_token, 0);
1883       *token = fetch_token (regexp, syntax);
1884       if (token->type != OP_ALT && token->type != END_OF_RE
1885           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1886         {
1887           branch = parse_branch (regexp, preg, token, syntax, nest, err);
1888           if (BE (*err != REG_NOERROR && branch == NULL, 0))
1889             {
1890               free_bin_tree (tree);
1891               return NULL;
1892             }
1893         }
1894       else
1895         branch = NULL;
1896       tree = create_tree (tree, branch, 0, new_idx);
1897       if (BE (new_idx == -1 || tree == NULL, 0))
1898         {
1899           *err = REG_ESPACE;
1900           return NULL;
1901         }
1902       dfa->has_plural_match = 1;
1903     }
1904   return tree;
1905 }
1906
1907 /* This function build the following tree, from regular expression
1908    <exp1><exp2>:
1909         CAT
1910         / \
1911        /   \
1912    <exp1> <exp2>
1913
1914    CAT means concatenation.  */
1915
1916 static bin_tree_t *
1917 parse_branch (regexp, preg, token, syntax, nest, err)
1918      re_string_t *regexp;
1919      regex_t *preg;
1920      re_token_t *token;
1921      reg_syntax_t syntax;
1922      int nest;
1923      reg_errcode_t *err;
1924 {
1925   bin_tree_t *tree, *exp;
1926   tree = parse_expression (regexp, preg, token, syntax, nest, err);
1927   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1928     return NULL;
1929
1930   while (token->type != OP_ALT && token->type != END_OF_RE
1931          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1932     {
1933       exp = parse_expression (regexp, preg, token, syntax, nest, err);
1934       if (BE (*err != REG_NOERROR && exp == NULL, 0))
1935         {
1936           free_bin_tree (tree);
1937           return NULL;
1938         }
1939       if (tree != NULL && exp != NULL)
1940         {
1941           tree = create_tree (tree, exp, CONCAT, 0);
1942           if (tree == NULL)
1943             {
1944               *err = REG_ESPACE;
1945               return NULL;
1946             }
1947         }
1948       else if (tree == NULL)
1949         tree = exp;
1950       /* Otherwise exp == NULL, we don't need to create new tree.  */
1951     }
1952   return tree;
1953 }
1954
1955 /* This function build the following tree, from regular expression a*:
1956          *
1957          |
1958          a
1959 */
1960
1961 static bin_tree_t *
1962 parse_expression (regexp, preg, token, syntax, nest, err)
1963      re_string_t *regexp;
1964      regex_t *preg;
1965      re_token_t *token;
1966      reg_syntax_t syntax;
1967      int nest;
1968      reg_errcode_t *err;
1969 {
1970   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1971   bin_tree_t *tree;
1972   int new_idx;
1973   switch (token->type)
1974     {
1975     case CHARACTER:
1976       new_idx = re_dfa_add_node (dfa, *token, 0);
1977       tree = create_tree (NULL, NULL, 0, new_idx);
1978       if (BE (new_idx == -1 || tree == NULL, 0))
1979         {
1980           *err = REG_ESPACE;
1981           return NULL;
1982         }
1983 #ifdef RE_ENABLE_I18N
1984       if (MB_CUR_MAX > 1)
1985         {
1986           while (!re_string_eoi (regexp)
1987                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1988             {
1989               bin_tree_t *mbc_remain;
1990               *token = fetch_token (regexp, syntax);
1991               new_idx = re_dfa_add_node (dfa, *token, 0);
1992               mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1993               tree = create_tree (tree, mbc_remain, CONCAT, 0);
1994               if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
1995                 return *err = REG_ESPACE, NULL;
1996             }
1997         }
1998 #endif
1999       break;
2000     case OP_OPEN_SUBEXP:
2001       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2002       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2003         return NULL;
2004       break;
2005     case OP_OPEN_BRACKET:
2006       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2007       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2008         return NULL;
2009       break;
2010     case OP_BACK_REF:
2011       if (BE (preg->re_nsub < token->opr.idx
2012               || dfa->subexps[token->opr.idx - 1].end == -1, 0))
2013         {
2014           *err = REG_ESUBREG;
2015           return NULL;
2016         }
2017       dfa->used_bkref_map |= 1 << (token->opr.idx - 1);
2018       new_idx = re_dfa_add_node (dfa, *token, 0);
2019       tree = create_tree (NULL, NULL, 0, new_idx);
2020       if (BE (new_idx == -1 || tree == NULL, 0))
2021         {
2022           *err = REG_ESPACE;
2023           return NULL;
2024         }
2025       ++dfa->nbackref;
2026       dfa->has_mb_node = 1;
2027       break;
2028     case OP_DUP_ASTERISK:
2029     case OP_DUP_PLUS:
2030     case OP_DUP_QUESTION:
2031     case OP_OPEN_DUP_NUM:
2032       if (syntax & RE_CONTEXT_INVALID_OPS)
2033         {
2034           *err = REG_BADRPT;
2035           return NULL;
2036         }
2037       else if (syntax & RE_CONTEXT_INDEP_OPS)
2038         {
2039           *token = fetch_token (regexp, syntax);
2040           return parse_expression (regexp, preg, token, syntax, nest, err);
2041         }
2042       /* else fall through  */
2043     case OP_CLOSE_SUBEXP:
2044       if ((token->type == OP_CLOSE_SUBEXP) &&
2045           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2046         {
2047           *err = REG_ERPAREN;
2048           return NULL;
2049         }
2050       /* else fall through  */
2051     case OP_CLOSE_DUP_NUM:
2052       /* We treat it as a normal character.  */
2053
2054       /* Then we can these characters as normal characters.  */
2055       token->type = CHARACTER;
2056       new_idx = re_dfa_add_node (dfa, *token, 0);
2057       tree = create_tree (NULL, NULL, 0, new_idx);
2058       if (BE (new_idx == -1 || tree == NULL, 0))
2059         {
2060           *err = REG_ESPACE;
2061           return NULL;
2062         }
2063       break;
2064     case ANCHOR:
2065       if (dfa->word_char == NULL)
2066         {
2067           *err = init_word_char (dfa);
2068           if (BE (*err != REG_NOERROR, 0))
2069             return NULL;
2070         }
2071       if (token->opr.ctx_type == WORD_DELIM)
2072         {
2073           bin_tree_t *tree_first, *tree_last;
2074           int idx_first, idx_last;
2075           token->opr.ctx_type = WORD_FIRST;
2076           idx_first = re_dfa_add_node (dfa, *token, 0);
2077           tree_first = create_tree (NULL, NULL, 0, idx_first);
2078           token->opr.ctx_type = WORD_LAST;
2079           idx_last = re_dfa_add_node (dfa, *token, 0);
2080           tree_last = create_tree (NULL, NULL, 0, idx_last);
2081           token->type = OP_ALT;
2082           new_idx = re_dfa_add_node (dfa, *token, 0);
2083           tree = create_tree (tree_first, tree_last, 0, new_idx);
2084           if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2085                   || tree_first == NULL || tree_last == NULL
2086                   || tree == NULL, 0))
2087             {
2088               *err = REG_ESPACE;
2089               return NULL;
2090             }
2091         }
2092       else
2093         {
2094           new_idx = re_dfa_add_node (dfa, *token, 0);
2095           tree = create_tree (NULL, NULL, 0, new_idx);
2096           if (BE (new_idx == -1 || tree == NULL, 0))
2097             return *err = REG_ESPACE, NULL;
2098         }
2099       /* We must return here, since ANCHORs can't be followed
2100          by repetition operators.
2101          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2102              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2103       *token = fetch_token (regexp, syntax);
2104       return tree;
2105     case OP_PERIOD:
2106       new_idx = re_dfa_add_node (dfa, *token, 0);
2107       tree = create_tree (NULL, NULL, 0, new_idx);
2108       if (BE (new_idx == -1 || tree == NULL, 0))
2109         {
2110           *err = REG_ESPACE;
2111           return NULL;
2112         }
2113       if (MB_CUR_MAX > 1)
2114         dfa->has_mb_node = 1;
2115       break;
2116     case OP_WORD:
2117       tree = build_word_op (dfa, 0, err);
2118       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2119         return NULL;
2120       break;
2121     case OP_NOTWORD:
2122       tree = build_word_op (dfa, 1, err);
2123       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2124         return NULL;
2125       break;
2126     case OP_ALT:
2127     case END_OF_RE:
2128       return NULL;
2129     case BACK_SLASH:
2130       *err = REG_EESCAPE;
2131       return NULL;
2132     default:
2133       /* Must not happen?  */
2134 #ifdef DEBUG
2135       assert (0);
2136 #endif
2137       return NULL;
2138     }
2139   *token = fetch_token (regexp, syntax);
2140
2141   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2142          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2143     {
2144       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2145       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2146         return NULL;
2147       dfa->has_plural_match = 1;
2148     }
2149
2150   return tree;
2151 }
2152
2153 /* This function build the following tree, from regular expression
2154    (<reg_exp>):
2155          SUBEXP
2156             |
2157         <reg_exp>
2158 */
2159
2160 static bin_tree_t *
2161 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2162      re_string_t *regexp;
2163      regex_t *preg;
2164      re_token_t *token;
2165      reg_syntax_t syntax;
2166      int nest;
2167      reg_errcode_t *err;
2168 {
2169   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2170   bin_tree_t *tree, *left_par, *right_par;
2171   size_t cur_nsub;
2172   int new_idx;
2173   cur_nsub = preg->re_nsub++;
2174   if (dfa->subexps_alloc < preg->re_nsub)
2175     {
2176       re_subexp_t *new_array;
2177       dfa->subexps_alloc *= 2;
2178       new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2179       if (BE (new_array == NULL, 0))
2180         {
2181           dfa->subexps_alloc /= 2;
2182           *err = REG_ESPACE;
2183           return NULL;
2184         }
2185       dfa->subexps = new_array;
2186     }
2187   dfa->subexps[cur_nsub].start = dfa->nodes_len;
2188   dfa->subexps[cur_nsub].end = -1;
2189
2190   new_idx = re_dfa_add_node (dfa, *token, 0);
2191   left_par = create_tree (NULL, NULL, 0, new_idx);
2192   if (BE (new_idx == -1 || left_par == NULL, 0))
2193     {
2194       *err = REG_ESPACE;
2195       return NULL;
2196     }
2197   dfa->nodes[new_idx].opr.idx = cur_nsub;
2198   *token = fetch_token (regexp, syntax);
2199
2200   /* The subexpression may be a null string.  */
2201   if (token->type == OP_CLOSE_SUBEXP)
2202     tree = NULL;
2203   else
2204     {
2205       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2206       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2207         return NULL;
2208     }
2209   if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2210     {
2211       free_bin_tree (tree);
2212       *err = REG_BADPAT;
2213       return NULL;
2214     }
2215   new_idx = re_dfa_add_node (dfa, *token, 0);
2216   dfa->subexps[cur_nsub].end = dfa->nodes_len;
2217   right_par = create_tree (NULL, NULL, 0, new_idx);
2218   tree = ((tree == NULL) ? right_par
2219           : create_tree (tree, right_par, CONCAT, 0));
2220   tree = create_tree (left_par, tree, CONCAT, 0);
2221   if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2222     {
2223       *err = REG_ESPACE;
2224       return NULL;
2225     }
2226   dfa->nodes[new_idx].opr.idx = cur_nsub;
2227
2228   return tree;
2229 }
2230
2231 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2232
2233 static bin_tree_t *
2234 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2235      bin_tree_t *dup_elem;
2236      re_string_t *regexp;
2237      re_dfa_t *dfa;
2238      re_token_t *token;
2239      reg_syntax_t syntax;
2240      reg_errcode_t *err;
2241 {
2242   re_token_t dup_token;
2243   bin_tree_t *tree = dup_elem, *work_tree;
2244   int new_idx, start_idx = re_string_cur_idx (regexp);
2245   re_token_t start_token = *token;
2246   if (token->type == OP_OPEN_DUP_NUM)
2247     {
2248       int i;
2249       int end = 0;
2250       int start = fetch_number (regexp, token, syntax);
2251       bin_tree_t *elem;
2252       if (start == -1)
2253         {
2254           if (token->type == CHARACTER && token->opr.c == ',')
2255             start = 0; /* We treat "{,m}" as "{0,m}".  */
2256           else
2257             {
2258               *err = REG_BADBR; /* <re>{} is invalid.  */
2259               return NULL;
2260             }
2261         }
2262       if (BE (start != -2, 1))
2263         {
2264           /* We treat "{n}" as "{n,n}".  */
2265           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2266                  : ((token->type == CHARACTER && token->opr.c == ',')
2267                     ? fetch_number (regexp, token, syntax) : -2));
2268         }
2269       if (BE (start == -2 || end == -2, 0))
2270         {
2271           /* Invalid sequence.  */
2272           if (token->type == OP_CLOSE_DUP_NUM)
2273             goto parse_dup_op_invalid_interval;
2274           else
2275             goto parse_dup_op_ebrace;
2276         }
2277       if (BE (start == 0 && end == 0, 0))
2278         {
2279           /* We treat "<re>{0}" and "<re>{0,0}" as null string.  */
2280           *token = fetch_token (regexp, syntax);
2281           free_bin_tree (dup_elem);
2282           return NULL;
2283         }
2284
2285       /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2286       elem = tree;
2287       for (i = 0; i < start; ++i)
2288         if (i != 0)
2289           {
2290             work_tree = duplicate_tree (elem, dfa);
2291             tree = create_tree (tree, work_tree, CONCAT, 0);
2292             if (BE (work_tree == NULL || tree == NULL, 0))
2293               goto parse_dup_op_espace;
2294           }
2295
2296       if (end == -1)
2297         {
2298           /* We treat "<re>{0,}" as "<re>*".  */
2299           dup_token.type = OP_DUP_ASTERISK;
2300           if (start > 0)
2301             {
2302               elem = duplicate_tree (elem, dfa);
2303               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2304               work_tree = create_tree (elem, NULL, 0, new_idx);
2305               tree = create_tree (tree, work_tree, CONCAT, 0);
2306               if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2307                       || tree == NULL, 0))
2308                 goto parse_dup_op_espace;
2309             }
2310           else
2311             {
2312               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2313               tree = create_tree (elem, NULL, 0, new_idx);
2314               if (BE (new_idx == -1 || tree == NULL, 0))
2315                 goto parse_dup_op_espace;
2316             }
2317         }
2318       else if (end - start > 0)
2319         {
2320           /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?".  */
2321           dup_token.type = OP_DUP_QUESTION;
2322           if (start > 0)
2323             {
2324               elem = duplicate_tree (elem, dfa);
2325               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2326               elem = create_tree (elem, NULL, 0, new_idx);
2327               tree = create_tree (tree, elem, CONCAT, 0);
2328               if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2329                 goto parse_dup_op_espace;
2330             }
2331           else
2332             {
2333               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2334               tree = elem = create_tree (elem, NULL, 0, new_idx);
2335               if (BE (new_idx == -1 || tree == NULL, 0))
2336                 goto parse_dup_op_espace;
2337             }
2338           for (i = 1; i < end - start; ++i)
2339             {
2340               work_tree = duplicate_tree (elem, dfa);
2341               tree = create_tree (tree, work_tree, CONCAT, 0);
2342               if (BE (work_tree == NULL || tree == NULL, 0))
2343                 {
2344                   *err = REG_ESPACE;
2345                   return NULL;
2346                 }
2347             }
2348         }
2349     }
2350   else
2351     {
2352       new_idx = re_dfa_add_node (dfa, *token, 0);
2353       tree = create_tree (tree, NULL, 0, new_idx);
2354       if (BE (new_idx == -1 || tree == NULL, 0))
2355         {
2356           *err = REG_ESPACE;
2357           return NULL;
2358         }
2359     }
2360   *token = fetch_token (regexp, syntax);
2361   return tree;
2362
2363  parse_dup_op_espace:
2364   free_bin_tree (tree);
2365   *err = REG_ESPACE;
2366   return NULL;
2367
2368  parse_dup_op_ebrace:
2369   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2370     {
2371       *err = REG_EBRACE;
2372       return NULL;
2373     }
2374   goto parse_dup_op_rollback;
2375  parse_dup_op_invalid_interval:
2376   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2377     {
2378       *err = REG_BADBR;
2379       return NULL;
2380     }
2381  parse_dup_op_rollback:
2382   re_string_set_index (regexp, start_idx);
2383   *token = start_token;
2384   token->type = CHARACTER;
2385   return dup_elem;
2386 }
2387
2388 /* Size of the names for collating symbol/equivalence_class/character_class.
2389    I'm not sure, but maybe enough.  */
2390 #define BRACKET_NAME_BUF_SIZE 32
2391
2392 #ifndef _LIBC
2393   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2394      Build the range expression which starts from START_ELEM, and ends
2395      at END_ELEM.  The result are written to MBCSET and SBCSET.
2396      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2397      mbcset->range_ends, is a pointer argument sinse we may
2398      update it.  */
2399
2400 static reg_errcode_t
2401 # ifdef RE_ENABLE_I18N
2402 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2403      re_charset_t *mbcset;
2404      int *range_alloc;
2405 # else /* not RE_ENABLE_I18N */
2406 build_range_exp (sbcset, start_elem, end_elem)
2407 # endif /* not RE_ENABLE_I18N */
2408      re_bitset_ptr_t sbcset;
2409      bracket_elem_t *start_elem, *end_elem;
2410 {
2411   unsigned int start_ch, end_ch;
2412   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2413   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2414           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2415           0))
2416     return REG_ERANGE;
2417
2418   /* We can handle no multi character collating elements without libc
2419      support.  */
2420   if (BE ((start_elem->type == COLL_SYM
2421            && strlen ((char *) start_elem->opr.name) > 1)
2422           || (end_elem->type == COLL_SYM
2423               && strlen ((char *) end_elem->opr.name) > 1), 0))
2424     return REG_ECOLLATE;
2425
2426 # ifdef RE_ENABLE_I18N
2427   {
2428     wchar_t wc, start_wc, end_wc;
2429     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2430
2431     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2432                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2433                    : 0));
2434     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2435               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2436                  : 0));
2437     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2438                 ? __btowc (start_ch) : start_elem->opr.wch);
2439     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2440               ? __btowc (end_ch) : end_elem->opr.wch);
2441     cmp_buf[0] = start_wc;
2442     cmp_buf[4] = end_wc;
2443     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2444       return REG_ERANGE;
2445
2446     /* Check the space of the arrays.  */
2447     if (*range_alloc == mbcset->nranges)
2448       {
2449         /* There are not enough space, need realloc.  */
2450         wchar_t *new_array_start, *new_array_end;
2451         int new_nranges;
2452
2453         /* +1 in case of mbcset->nranges is 0.  */
2454         new_nranges = 2 * mbcset->nranges + 1;
2455         /* Use realloc since mbcset->range_starts and mbcset->range_ends
2456            are NULL if *range_alloc == 0.  */
2457         new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2458                                       new_nranges);
2459         new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2460                                     new_nranges);
2461
2462         if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2463           return REG_ESPACE;
2464
2465         mbcset->range_starts = new_array_start;
2466         mbcset->range_ends = new_array_end;
2467         *range_alloc = new_nranges;
2468       }
2469
2470     mbcset->range_starts[mbcset->nranges] = start_wc;
2471     mbcset->range_ends[mbcset->nranges++] = end_wc;
2472
2473     /* Build the table for single byte characters.  */
2474     for (wc = 0; wc <= SBC_MAX; ++wc)
2475       {
2476         cmp_buf[2] = wc;
2477         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2478             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2479           bitset_set (sbcset, wc);
2480       }
2481   }
2482 # else /* not RE_ENABLE_I18N */
2483   {
2484     unsigned int ch;
2485     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2486                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2487                    : 0));
2488     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2489               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2490                  : 0));
2491     if (start_ch > end_ch)
2492       return REG_ERANGE;
2493     /* Build the table for single byte characters.  */
2494     for (ch = 0; ch <= SBC_MAX; ++ch)
2495       if (start_ch <= ch  && ch <= end_ch)
2496         bitset_set (sbcset, ch);
2497   }
2498 # endif /* not RE_ENABLE_I18N */
2499   return REG_NOERROR;
2500 }
2501 #endif /* not _LIBC */
2502
2503 #ifndef _LIBC
2504 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2505    Build the collating element which is represented by NAME.
2506    The result are written to MBCSET and SBCSET.
2507    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2508    pointer argument since we may update it.  */
2509
2510 static reg_errcode_t
2511 # ifdef RE_ENABLE_I18N
2512 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2513      re_charset_t *mbcset;
2514      int *coll_sym_alloc;
2515 # else /* not RE_ENABLE_I18N */
2516 build_collating_symbol (sbcset, name)
2517 # endif /* not RE_ENABLE_I18N */
2518      re_bitset_ptr_t sbcset;
2519      const unsigned char *name;
2520 {
2521   size_t name_len = strlen ((const char *) name);
2522   if (BE (name_len != 1, 0))
2523     return REG_ECOLLATE;
2524   else
2525     {
2526       bitset_set (sbcset, name[0]);
2527       return REG_NOERROR;
2528     }
2529 }
2530 #endif /* not _LIBC */
2531
2532 /* This function parse bracket expression like "[abc]", "[a-c]",
2533    "[[.a-a.]]" etc.  */
2534
2535 static bin_tree_t *
2536 parse_bracket_exp (regexp, dfa, token, syntax, err)
2537      re_string_t *regexp;
2538      re_dfa_t *dfa;
2539      re_token_t *token;
2540      reg_syntax_t syntax;
2541      reg_errcode_t *err;
2542 {
2543 #ifdef _LIBC
2544   const unsigned char *collseqmb;
2545   const char *collseqwc;
2546   uint32_t nrules;
2547   int32_t table_size;
2548   const int32_t *symb_table;
2549   const unsigned char *extra;
2550
2551   /* Local function for parse_bracket_exp used in _LIBC environement.
2552      Seek the collating symbol entry correspondings to NAME.
2553      Return the index of the symbol in the SYMB_TABLE.  */
2554
2555   static inline int32_t
2556   seek_collating_symbol_entry (name, name_len)
2557          const unsigned char *name;
2558          size_t name_len;
2559     {
2560       int32_t hash = elem_hash ((const char *) name, name_len);
2561       int32_t elem = hash % table_size;
2562       int32_t second = hash % (table_size - 2);
2563       while (symb_table[2 * elem] != 0)
2564         {
2565           /* First compare the hashing value.  */
2566           if (symb_table[2 * elem] == hash
2567               /* Compare the length of the name.  */
2568               && name_len == extra[symb_table[2 * elem + 1]]
2569               /* Compare the name.  */
2570               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2571                          name_len) == 0)
2572             {
2573               /* Yep, this is the entry.  */
2574               break;
2575             }
2576
2577           /* Next entry.  */
2578           elem += second;
2579         }
2580       return elem;
2581     }
2582
2583   /* Local function for parse_bracket_exp used in _LIBC environement.
2584      Look up the collation sequence value of BR_ELEM.
2585      Return the value if succeeded, UINT_MAX otherwise.  */
2586
2587   static inline unsigned int
2588   lookup_collation_sequence_value (br_elem)
2589          bracket_elem_t *br_elem;
2590     {
2591       if (br_elem->type == SB_CHAR)
2592         {
2593           /*
2594           if (MB_CUR_MAX == 1)
2595           */
2596           if (nrules == 0)
2597             return collseqmb[br_elem->opr.ch];
2598           else
2599             {
2600               wint_t wc = __btowc (br_elem->opr.ch);
2601               return collseq_table_lookup (collseqwc, wc);
2602             }
2603         }
2604       else if (br_elem->type == MB_CHAR)
2605         {
2606           return collseq_table_lookup (collseqwc, br_elem->opr.wch);
2607         }
2608       else if (br_elem->type == COLL_SYM)
2609         {
2610           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2611           if (nrules != 0)
2612             {
2613               int32_t elem, idx;
2614               elem = seek_collating_symbol_entry (br_elem->opr.name,
2615                                                   sym_name_len);
2616               if (symb_table[2 * elem] != 0)
2617                 {
2618                   /* We found the entry.  */
2619                   idx = symb_table[2 * elem + 1];
2620                   /* Skip the name of collating element name.  */
2621                   idx += 1 + extra[idx];
2622                   /* Skip the byte sequence of the collating element.  */
2623                   idx += 1 + extra[idx];
2624                   /* Adjust for the alignment.  */
2625                   idx = (idx + 3) & ~3;
2626                   /* Skip the multibyte collation sequence value.  */
2627                   idx += sizeof (unsigned int);
2628                   /* Skip the wide char sequence of the collating element.  */
2629                   idx += sizeof (unsigned int) *
2630                     (1 + *(unsigned int *) (extra + idx));
2631                   /* Return the collation sequence value.  */
2632                   return *(unsigned int *) (extra + idx);
2633                 }
2634               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2635                 {
2636                   /* No valid character.  Match it as a single byte
2637                      character.  */
2638                   return collseqmb[br_elem->opr.name[0]];
2639                 }
2640             }
2641           else if (sym_name_len == 1)
2642             return collseqmb[br_elem->opr.name[0]];
2643         }
2644       return UINT_MAX;
2645     }
2646
2647   /* Local function for parse_bracket_exp used in _LIBC environement.
2648      Build the range expression which starts from START_ELEM, and ends
2649      at END_ELEM.  The result are written to MBCSET and SBCSET.
2650      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2651      mbcset->range_ends, is a pointer argument sinse we may
2652      update it.  */
2653
2654   static inline reg_errcode_t
2655 # ifdef RE_ENABLE_I18N
2656   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2657          re_charset_t *mbcset;
2658          int *range_alloc;
2659 # else /* not RE_ENABLE_I18N */
2660   build_range_exp (sbcset, start_elem, end_elem)
2661 # endif /* not RE_ENABLE_I18N */
2662          re_bitset_ptr_t sbcset;
2663          bracket_elem_t *start_elem, *end_elem;
2664     {
2665       unsigned int ch;
2666       uint32_t start_collseq;
2667       uint32_t end_collseq;
2668
2669 # ifdef RE_ENABLE_I18N
2670       /* Check the space of the arrays.  */
2671       if (*range_alloc == mbcset->nranges)
2672         {
2673           /* There are not enough space, need realloc.  */
2674           uint32_t *new_array_start;
2675           uint32_t *new_array_end;
2676           int new_nranges;
2677
2678           /* +1 in case of mbcset->nranges is 0.  */
2679           new_nranges = 2 * mbcset->nranges + 1;
2680           /* Use realloc since mbcset->range_starts and mbcset->range_ends
2681              are NULL if *range_alloc == 0.  */
2682           new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2683                                         new_nranges);
2684           new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2685                                       new_nranges);
2686
2687           if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2688             return REG_ESPACE;
2689
2690           mbcset->range_starts = new_array_start;
2691           mbcset->range_ends = new_array_end;
2692           *range_alloc = new_nranges;
2693         }
2694 # endif /* RE_ENABLE_I18N */
2695
2696       /* Equivalence Classes and Character Classes can't be a range
2697          start/end.  */
2698       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2699               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2700               0))
2701         return REG_ERANGE;
2702
2703       start_collseq = lookup_collation_sequence_value (start_elem);
2704       end_collseq = lookup_collation_sequence_value (end_elem);
2705       /* Check start/end collation sequence values.  */
2706       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2707         return REG_ECOLLATE;
2708       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2709         return REG_ERANGE;
2710
2711 # ifdef RE_ENABLE_I18N
2712       /* Got valid collation sequence values, add them as a new entry.  */
2713       mbcset->range_starts[mbcset->nranges] = start_collseq;
2714       mbcset->range_ends[mbcset->nranges++] = end_collseq;
2715 # endif /* RE_ENABLE_I18N */
2716
2717       /* Build the table for single byte characters.  */
2718       for (ch = 0; ch <= SBC_MAX; ch++)
2719         {
2720           uint32_t ch_collseq;
2721           /*
2722           if (MB_CUR_MAX == 1)
2723           */
2724           if (nrules == 0)
2725             ch_collseq = collseqmb[ch];
2726           else
2727             ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
2728           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2729             bitset_set (sbcset, ch);
2730         }
2731       return REG_NOERROR;
2732     }
2733
2734   /* Local function for parse_bracket_exp used in _LIBC environement.
2735      Build the collating element which is represented by NAME.
2736      The result are written to MBCSET and SBCSET.
2737      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2738      pointer argument sinse we may update it.  */
2739
2740   static inline reg_errcode_t
2741 # ifdef RE_ENABLE_I18N
2742   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2743          re_charset_t *mbcset;
2744          int *coll_sym_alloc;
2745 # else /* not RE_ENABLE_I18N */
2746   build_collating_symbol (sbcset, name)
2747 # endif /* not RE_ENABLE_I18N */
2748          re_bitset_ptr_t sbcset;
2749          const unsigned char *name;
2750     {
2751       int32_t elem, idx;
2752       size_t name_len = strlen ((const char *) name);
2753       if (nrules != 0)
2754         {
2755           elem = seek_collating_symbol_entry (name, name_len);
2756           if (symb_table[2 * elem] != 0)
2757             {
2758               /* We found the entry.  */
2759               idx = symb_table[2 * elem + 1];
2760               /* Skip the name of collating element name.  */
2761               idx += 1 + extra[idx];
2762             }
2763           else if (symb_table[2 * elem] == 0 && name_len == 1)
2764             {
2765               /* No valid character, treat it as a normal
2766                  character.  */
2767               bitset_set (sbcset, name[0]);
2768               return REG_NOERROR;
2769             }
2770           else
2771             return REG_ECOLLATE;
2772
2773 # ifdef RE_ENABLE_I18N
2774           /* Got valid collation sequence, add it as a new entry.  */
2775           /* Check the space of the arrays.  */
2776           if (*coll_sym_alloc == mbcset->ncoll_syms)
2777             {
2778               /* Not enough, realloc it.  */
2779               /* +1 in case of mbcset->ncoll_syms is 0.  */
2780               *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2781               /* Use realloc since mbcset->coll_syms is NULL
2782                  if *alloc == 0.  */
2783               mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2784                                               *coll_sym_alloc);
2785               if (BE (mbcset->coll_syms == NULL, 0))
2786                 return REG_ESPACE;
2787             }
2788           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2789 # endif /* RE_ENABLE_I18N */
2790           return REG_NOERROR;
2791         }
2792       else
2793         {
2794           if (BE (name_len != 1, 0))
2795             return REG_ECOLLATE;
2796           else
2797             {
2798               bitset_set (sbcset, name[0]);
2799               return REG_NOERROR;
2800             }
2801         }
2802     }
2803 #endif
2804
2805   re_token_t br_token;
2806   re_bitset_ptr_t sbcset;
2807 #ifdef RE_ENABLE_I18N
2808   re_charset_t *mbcset;
2809   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2810   int equiv_class_alloc = 0, char_class_alloc = 0;
2811 #else /* not RE_ENABLE_I18N */
2812   int non_match = 0;
2813 #endif /* not RE_ENABLE_I18N */
2814   bin_tree_t *work_tree;
2815   int token_len, new_idx;
2816 #ifdef _LIBC
2817   collseqmb = (const unsigned char *)
2818     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2819   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2820   if (nrules)
2821     {
2822       /*
2823       if (MB_CUR_MAX > 1)
2824       */
2825         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2826       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2827       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2828                                                   _NL_COLLATE_SYMB_TABLEMB);
2829       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2830                                                    _NL_COLLATE_SYMB_EXTRAMB);
2831     }
2832 #endif
2833   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2834 #ifdef RE_ENABLE_I18N
2835   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2836 #endif /* RE_ENABLE_I18N */
2837 #ifdef RE_ENABLE_I18N
2838   if (BE (sbcset == NULL || mbcset == NULL, 0))
2839 #else
2840   if (BE (sbcset == NULL, 0))
2841 #endif /* RE_ENABLE_I18N */
2842     {
2843       *err = REG_ESPACE;
2844       return NULL;
2845     }
2846
2847   token_len = peek_token_bracket (token, regexp, syntax);
2848   if (BE (token->type == END_OF_RE, 0))
2849     {
2850       *err = REG_BADPAT;
2851       goto parse_bracket_exp_free_return;
2852     }
2853   if (token->type == OP_NON_MATCH_LIST)
2854     {
2855 #ifdef RE_ENABLE_I18N
2856       int i;
2857       mbcset->non_match = 1;
2858 #else /* not RE_ENABLE_I18N */
2859       non_match = 1;
2860 #endif /* not RE_ENABLE_I18N */
2861       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2862         bitset_set (sbcset, '\0');
2863       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2864       token_len = peek_token_bracket (token, regexp, syntax);
2865       if (BE (token->type == END_OF_RE, 0))
2866         {
2867           *err = REG_BADPAT;
2868           goto parse_bracket_exp_free_return;
2869         }
2870 #ifdef RE_ENABLE_I18N
2871       if (MB_CUR_MAX > 1)
2872         for (i = 0; i < SBC_MAX; ++i)
2873           if (__btowc (i) == WEOF)
2874             bitset_set (sbcset, i);
2875 #endif /* RE_ENABLE_I18N */
2876     }
2877
2878   /* We treat the first ']' as a normal character.  */
2879   if (token->type == OP_CLOSE_BRACKET)
2880     token->type = CHARACTER;
2881
2882   while (1)
2883     {
2884       bracket_elem_t start_elem, end_elem;
2885       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2886       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2887       reg_errcode_t ret;
2888       int token_len2 = 0, is_range_exp = 0;
2889       re_token_t token2;
2890
2891       start_elem.opr.name = start_name_buf;
2892       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2893                                    syntax);
2894       if (BE (ret != REG_NOERROR, 0))
2895         {
2896           *err = ret;
2897           goto parse_bracket_exp_free_return;
2898         }
2899
2900       token_len = peek_token_bracket (token, regexp, syntax);
2901       if (BE (token->type == END_OF_RE, 0))
2902         {
2903           *err = REG_BADPAT;
2904           goto parse_bracket_exp_free_return;
2905         }
2906       if (token->type == OP_CHARSET_RANGE)
2907         {
2908           re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
2909           token_len2 = peek_token_bracket (&token2, regexp, syntax);
2910           if (BE (token->type == END_OF_RE, 0))
2911             {
2912               *err = REG_BADPAT;
2913               goto parse_bracket_exp_free_return;
2914             }
2915           if (token2.type == OP_CLOSE_BRACKET)
2916             {
2917               /* We treat the last '-' as a normal character.  */
2918               re_string_skip_bytes (regexp, -token_len);
2919               token->type = CHARACTER;
2920             }
2921           else
2922             is_range_exp = 1;
2923         }
2924
2925       if (is_range_exp == 1)
2926         {
2927           end_elem.opr.name = end_name_buf;
2928           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2929                                        dfa, syntax);
2930           if (BE (ret != REG_NOERROR, 0))
2931             {
2932               *err = ret;
2933               goto parse_bracket_exp_free_return;
2934             }
2935
2936           token_len = peek_token_bracket (token, regexp, syntax);
2937           if (BE (token->type == END_OF_RE, 0))
2938             {
2939               *err = REG_BADPAT;
2940               goto parse_bracket_exp_free_return;
2941             }
2942           *err = build_range_exp (sbcset,
2943 #ifdef RE_ENABLE_I18N
2944                                   mbcset, &range_alloc,
2945 #endif /* RE_ENABLE_I18N */
2946                                   &start_elem, &end_elem);
2947           if (BE (*err != REG_NOERROR, 0))
2948             goto parse_bracket_exp_free_return;
2949         }
2950       else
2951         {
2952           switch (start_elem.type)
2953             {
2954             case SB_CHAR:
2955               bitset_set (sbcset, start_elem.opr.ch);
2956               break;
2957 #ifdef RE_ENABLE_I18N
2958             case MB_CHAR:
2959               /* Check whether the array has enough space.  */
2960               if (mbchar_alloc == mbcset->nmbchars)
2961                 {
2962                   /* Not enough, realloc it.  */
2963                   /* +1 in case of mbcset->nmbchars is 0.  */
2964                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
2965                   /* Use realloc since array is NULL if *alloc == 0.  */
2966                   mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2967                                                 mbchar_alloc);
2968                   if (BE (mbcset->mbchars == NULL, 0))
2969                     goto parse_bracket_exp_espace;
2970                 }
2971               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2972               break;
2973 #endif /* RE_ENABLE_I18N */
2974             case EQUIV_CLASS:
2975               *err = build_equiv_class (sbcset,
2976 #ifdef RE_ENABLE_I18N
2977                                         mbcset, &equiv_class_alloc,
2978 #endif /* RE_ENABLE_I18N */
2979                                         start_elem.opr.name);
2980               if (BE (*err != REG_NOERROR, 0))
2981                 goto parse_bracket_exp_free_return;
2982               break;
2983             case COLL_SYM:
2984               *err = build_collating_symbol (sbcset,
2985 #ifdef RE_ENABLE_I18N
2986                                              mbcset, &coll_sym_alloc,
2987 #endif /* RE_ENABLE_I18N */
2988                                              start_elem.opr.name);
2989               if (BE (*err != REG_NOERROR, 0))
2990                 goto parse_bracket_exp_free_return;
2991               break;
2992             case CHAR_CLASS:
2993               ret = build_charclass (sbcset,
2994 #ifdef RE_ENABLE_I18N
2995                                      mbcset, &char_class_alloc,
2996 #endif /* RE_ENABLE_I18N */
2997                                      start_elem.opr.name, syntax);
2998               if (BE (ret != REG_NOERROR, 0))
2999                goto parse_bracket_exp_espace;
3000               break;
3001             default:
3002               assert (0);
3003               break;
3004             }
3005         }
3006       if (token->type == OP_CLOSE_BRACKET)
3007         break;
3008     }
3009
3010   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3011
3012   /* If it is non-matching list.  */
3013 #ifdef RE_ENABLE_I18N
3014   if (mbcset->non_match)
3015 #else /* not RE_ENABLE_I18N */
3016   if (non_match)
3017 #endif /* not RE_ENABLE_I18N */
3018     bitset_not (sbcset);
3019
3020   /* Build a tree for simple bracket.  */
3021   br_token.type = SIMPLE_BRACKET;
3022   br_token.opr.sbcset = sbcset;
3023   new_idx = re_dfa_add_node (dfa, br_token, 0);
3024   work_tree = create_tree (NULL, NULL, 0, new_idx);
3025   if (BE (new_idx == -1 || work_tree == NULL, 0))
3026     goto parse_bracket_exp_espace;
3027
3028 #ifdef RE_ENABLE_I18N
3029   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3030       || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
3031                                                 || mbcset->non_match)))
3032     {
3033       re_token_t alt_token;
3034       bin_tree_t *mbc_tree;
3035       /* Build a tree for complex bracket.  */
3036       br_token.type = COMPLEX_BRACKET;
3037       br_token.opr.mbcset = mbcset;
3038       dfa->has_mb_node = 1;
3039       new_idx = re_dfa_add_node (dfa, br_token, 0);
3040       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3041       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3042         goto parse_bracket_exp_espace;
3043       /* Then join them by ALT node.  */
3044       dfa->has_plural_match = 1;
3045       alt_token.type = OP_ALT;
3046       new_idx = re_dfa_add_node (dfa, alt_token, 0);
3047       work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
3048       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3049         return work_tree;
3050     }
3051   else
3052     {
3053       free_charset (mbcset);
3054       return work_tree;
3055     }
3056 #else /* not RE_ENABLE_I18N */
3057   return work_tree;
3058 #endif /* not RE_ENABLE_I18N */
3059
3060  parse_bracket_exp_espace:
3061   *err = REG_ESPACE;
3062  parse_bracket_exp_free_return:
3063   re_free (sbcset);
3064 #ifdef RE_ENABLE_I18N
3065   free_charset (mbcset);
3066 #endif /* RE_ENABLE_I18N */
3067   return NULL;
3068 }
3069
3070 /* Parse an element in the bracket expression.  */
3071
3072 static reg_errcode_t
3073 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
3074      bracket_elem_t *elem;
3075      re_string_t *regexp;
3076      re_token_t *token;
3077      int token_len;
3078      re_dfa_t *dfa;
3079      reg_syntax_t syntax;
3080 {
3081 #ifdef RE_ENABLE_I18N
3082   int cur_char_size;
3083   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3084   if (cur_char_size > 1)
3085     {
3086       elem->type = MB_CHAR;
3087       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3088       re_string_skip_bytes (regexp, cur_char_size);
3089       return REG_NOERROR;
3090     }
3091 #endif /* RE_ENABLE_I18N */
3092   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3093   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3094       || token->type == OP_OPEN_EQUIV_CLASS)
3095     return parse_bracket_symbol (elem, regexp, token);
3096   elem->type = SB_CHAR;
3097   elem->opr.ch = token->opr.c;
3098   return REG_NOERROR;
3099 }
3100
3101 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3102    such as [:<character_class>:], [.<collating_element>.], and
3103    [=<equivalent_class>=].  */
3104
3105 static reg_errcode_t
3106 parse_bracket_symbol (elem, regexp, token)
3107      bracket_elem_t *elem;
3108      re_string_t *regexp;
3109      re_token_t *token;
3110 {
3111   unsigned char ch, delim = token->opr.c;
3112   int i = 0;
3113   for (;; ++i)
3114     {
3115       if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3116         return REG_EBRACK;
3117       if (token->type == OP_OPEN_CHAR_CLASS)
3118         ch = re_string_fetch_byte_case (regexp);
3119       else
3120         ch = re_string_fetch_byte (regexp);
3121       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3122         break;
3123       elem->opr.name[i] = ch;
3124     }
3125   re_string_skip_bytes (regexp, 1);
3126   elem->opr.name[i] = '\0';
3127   switch (token->type)
3128     {
3129     case OP_OPEN_COLL_ELEM:
3130       elem->type = COLL_SYM;
3131       break;
3132     case OP_OPEN_EQUIV_CLASS:
3133       elem->type = EQUIV_CLASS;
3134       break;
3135     case OP_OPEN_CHAR_CLASS:
3136       elem->type = CHAR_CLASS;
3137       break;
3138     default:
3139       break;
3140     }
3141   return REG_NOERROR;
3142 }
3143
3144   /* Helper function for parse_bracket_exp.
3145      Build the equivalence class which is represented by NAME.
3146      The result are written to MBCSET and SBCSET.
3147      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3148      is a pointer argument sinse we may update it.  */
3149
3150 static reg_errcode_t
3151 #ifdef RE_ENABLE_I18N
3152 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3153      re_charset_t *mbcset;
3154      int *equiv_class_alloc;
3155 #else /* not RE_ENABLE_I18N */
3156 build_equiv_class (sbcset, name)
3157 #endif /* not RE_ENABLE_I18N */
3158      re_bitset_ptr_t sbcset;
3159      const unsigned char *name;
3160 {
3161 #if defined _LIBC && defined RE_ENABLE_I18N
3162   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3163   if (nrules != 0)
3164     {
3165       const int32_t *table, *indirect;
3166       const unsigned char *weights, *extra, *cp;
3167       unsigned char char_buf[2];
3168       int32_t idx1, idx2;
3169       unsigned int ch;
3170       size_t len;
3171       /* This #include defines a local function!  */
3172 # include <locale/weight.h>
3173       /* Calculate the index for equivalence class.  */
3174       cp = name;
3175       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3176       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3177                                                _NL_COLLATE_WEIGHTMB);
3178       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3179                                                    _NL_COLLATE_EXTRAMB);
3180       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3181                                                 _NL_COLLATE_INDIRECTMB);
3182       idx1 = findidx (&cp);
3183       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3184         /* This isn't a valid character.  */
3185         return REG_ECOLLATE;
3186
3187       /* Build single byte matcing table for this equivalence class.  */
3188       char_buf[1] = (unsigned char) '\0';
3189       len = weights[idx1];
3190       for (ch = 0; ch < SBC_MAX; ++ch)
3191         {
3192           char_buf[0] = ch;
3193           cp = char_buf;
3194           idx2 = findidx (&cp);
3195 /*
3196           idx2 = table[ch];
3197 */
3198           if (idx2 == 0)
3199             /* This isn't a valid character.  */
3200             continue;
3201           if (len == weights[idx2])
3202             {
3203               int cnt = 0;
3204               while (cnt <= len &&
3205                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3206                 ++cnt;
3207
3208               if (cnt > len)
3209                 bitset_set (sbcset, ch);
3210             }
3211         }
3212       /* Check whether the array has enough space.  */
3213       if (*equiv_class_alloc == mbcset->nequiv_classes)
3214         {
3215           /* Not enough, realloc it.  */
3216           /* +1 in case of mbcset->nequiv_classes is 0.  */
3217           *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3218           /* Use realloc since the array is NULL if *alloc == 0.  */
3219           mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3220                                               *equiv_class_alloc);
3221           if (BE (mbcset->equiv_classes == NULL, 0))
3222             return REG_ESPACE;
3223         }
3224       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3225     }
3226   else
3227 #endif /* _LIBC && RE_ENABLE_I18N */
3228     {
3229       if (BE (strlen ((const char *) name) != 1, 0))
3230         return REG_ECOLLATE;
3231       bitset_set (sbcset, *name);
3232     }
3233   return REG_NOERROR;
3234 }
3235
3236   /* Helper function for parse_bracket_exp.
3237      Build the character class which is represented by NAME.
3238      The result are written to MBCSET and SBCSET.
3239      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3240      is a pointer argument sinse we may update it.  */
3241
3242 static reg_errcode_t
3243 #ifdef RE_ENABLE_I18N
3244 build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
3245      re_charset_t *mbcset;
3246      int *char_class_alloc;
3247 #else /* not RE_ENABLE_I18N */
3248 build_charclass (sbcset, class_name, syntax)
3249 #endif /* not RE_ENABLE_I18N */
3250      re_bitset_ptr_t sbcset;
3251      const unsigned char *class_name;
3252      reg_syntax_t syntax;
3253 {
3254   int i;
3255   const char *name = (const char *) class_name;
3256
3257   /* In case of REG_ICASE "upper" and "lower" match the both of
3258      upper and lower cases.  */
3259   if ((syntax & RE_ICASE)
3260       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3261     name = "alpha";
3262
3263 #ifdef RE_ENABLE_I18N
3264   /* Check the space of the arrays.  */
3265   if (*char_class_alloc == mbcset->nchar_classes)
3266     {
3267       /* Not enough, realloc it.  */
3268       /* +1 in case of mbcset->nchar_classes is 0.  */
3269       *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3270       /* Use realloc since array is NULL if *alloc == 0.  */
3271       mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3272                                          *char_class_alloc);
3273       if (BE (mbcset->char_classes == NULL, 0))
3274         return REG_ESPACE;
3275     }
3276   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3277 #endif /* RE_ENABLE_I18N */
3278
3279 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3280     for (i = 0; i < SBC_MAX; ++i)       \
3281       {                                 \
3282         if (ctype_func (i))             \
3283           bitset_set (sbcset, i);       \
3284       }
3285
3286   if (strcmp (name, "alnum") == 0)
3287     BUILD_CHARCLASS_LOOP (isalnum)
3288   else if (strcmp (name, "cntrl") == 0)
3289     BUILD_CHARCLASS_LOOP (iscntrl)
3290   else if (strcmp (name, "lower") == 0)
3291     BUILD_CHARCLASS_LOOP (islower)
3292   else if (strcmp (name, "space") == 0)
3293     BUILD_CHARCLASS_LOOP (isspace)
3294   else if (strcmp (name, "alpha") == 0)
3295     BUILD_CHARCLASS_LOOP (isalpha)
3296   else if (strcmp (name, "digit") == 0)
3297     BUILD_CHARCLASS_LOOP (isdigit)
3298   else if (strcmp (name, "print") == 0)
3299     BUILD_CHARCLASS_LOOP (isprint)
3300   else if (strcmp (name, "upper") == 0)
3301     BUILD_CHARCLASS_LOOP (isupper)
3302   else if (strcmp (name, "blank") == 0)
3303     BUILD_CHARCLASS_LOOP (isblank)
3304   else if (strcmp (name, "graph") == 0)
3305     BUILD_CHARCLASS_LOOP (isgraph)
3306   else if (strcmp (name, "punct") == 0)
3307     BUILD_CHARCLASS_LOOP (ispunct)
3308   else if (strcmp (name, "xdigit") == 0)
3309     BUILD_CHARCLASS_LOOP (isxdigit)
3310   else
3311     return REG_ECTYPE;
3312
3313   return REG_NOERROR;
3314 }
3315
3316 static bin_tree_t *
3317 build_word_op (dfa, not, err)
3318      re_dfa_t *dfa;
3319      int not;
3320      reg_errcode_t *err;
3321 {
3322   re_bitset_ptr_t sbcset;
3323 #ifdef RE_ENABLE_I18N
3324   re_charset_t *mbcset;
3325   int alloc = 0;
3326 #else /* not RE_ENABLE_I18N */
3327   int non_match = 0;
3328 #endif /* not RE_ENABLE_I18N */
3329   reg_errcode_t ret;
3330   re_token_t br_token;
3331   bin_tree_t *tree;
3332   int new_idx;
3333
3334   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3335 #ifdef RE_ENABLE_I18N
3336   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3337 #endif /* RE_ENABLE_I18N */
3338
3339 #ifdef RE_ENABLE_I18N
3340   if (BE (sbcset == NULL || mbcset == NULL, 0))
3341 #else /* not RE_ENABLE_I18N */
3342   if (BE (sbcset == NULL, 0))
3343 #endif /* not RE_ENABLE_I18N */
3344     {
3345       *err = REG_ESPACE;
3346       return NULL;
3347     }
3348
3349   if (not)
3350     {
3351 #ifdef RE_ENABLE_I18N
3352       int i;
3353       /*
3354       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3355         bitset_set(cset->sbcset, '\0');
3356       */
3357       mbcset->non_match = 1;
3358       if (MB_CUR_MAX > 1)
3359         for (i = 0; i < SBC_MAX; ++i)
3360           if (__btowc (i) == WEOF)
3361             bitset_set (sbcset, i);
3362 #else /* not RE_ENABLE_I18N */
3363       non_match = 1;
3364 #endif /* not RE_ENABLE_I18N */
3365     }
3366
3367   /* We don't care the syntax in this case.  */
3368   ret = build_charclass (sbcset,
3369 #ifdef RE_ENABLE_I18N
3370                          mbcset, &alloc,
3371 #endif /* RE_ENABLE_I18N */
3372                          (const unsigned char *) "alpha", 0);
3373
3374   if (BE (ret != REG_NOERROR, 0))
3375     {
3376       re_free (sbcset);
3377 #ifdef RE_ENABLE_I18N
3378       free_charset (mbcset);
3379 #endif /* RE_ENABLE_I18N */
3380       *err = REG_ESPACE;
3381       return NULL;
3382     }
3383   /* \w match '_' also.  */
3384   bitset_set (sbcset, '_');
3385
3386   /* If it is non-matching list.  */
3387 #ifdef RE_ENABLE_I18N
3388   if (mbcset->non_match)
3389 #else /* not RE_ENABLE_I18N */
3390   if (non_match)
3391 #endif /* not RE_ENABLE_I18N */
3392     bitset_not (sbcset);
3393
3394   /* Build a tree for simple bracket.  */
3395   br_token.type = SIMPLE_BRACKET;
3396   br_token.opr.sbcset = sbcset;
3397   new_idx = re_dfa_add_node (dfa, br_token, 0);
3398   tree = create_tree (NULL, NULL, 0, new_idx);
3399   if (BE (new_idx == -1 || tree == NULL, 0))
3400     goto build_word_op_espace;
3401
3402 #ifdef RE_ENABLE_I18N
3403   if (MB_CUR_MAX > 1)
3404     {
3405       re_token_t alt_token;
3406       bin_tree_t *mbc_tree;
3407       /* Build a tree for complex bracket.  */
3408       br_token.type = COMPLEX_BRACKET;
3409       br_token.opr.mbcset = mbcset;
3410       dfa->has_mb_node = 1;
3411       new_idx = re_dfa_add_node (dfa, br_token, 0);
3412       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3413       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3414         goto build_word_op_espace;
3415       /* Then join them by ALT node.  */
3416       alt_token.type = OP_ALT;
3417       new_idx = re_dfa_add_node (dfa, alt_token, 0);
3418       tree = create_tree (tree, mbc_tree, 0, new_idx);
3419       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3420         return tree;
3421     }
3422   else
3423     {
3424       free_charset (mbcset);
3425       return tree;
3426     }
3427 #else /* not RE_ENABLE_I18N */
3428   return tree;
3429 #endif /* not RE_ENABLE_I18N */
3430
3431  build_word_op_espace:
3432   re_free (sbcset);
3433 #ifdef RE_ENABLE_I18N
3434   free_charset (mbcset);
3435 #endif /* RE_ENABLE_I18N */
3436   *err = REG_ESPACE;
3437   return NULL;
3438 }
3439
3440 /* This is intended for the expressions like "a{1,3}".
3441    Fetch a number from `input', and return the number.
3442    Return -1, if the number field is empty like "{,1}".
3443    Return -2, If an error is occured.  */
3444
3445 static int
3446 fetch_number (input, token, syntax)
3447      re_string_t *input;
3448      re_token_t *token;
3449      reg_syntax_t syntax;
3450 {
3451   int num = -1;
3452   unsigned char c;
3453   while (1)
3454     {
3455       *token = fetch_token (input, syntax);
3456       c = token->opr.c;
3457       if (BE (token->type == END_OF_RE, 0))
3458         return -2;
3459       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3460         break;
3461       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3462              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3463       num = (num > RE_DUP_MAX) ? -2 : num;
3464     }
3465   return num;
3466 }
3467 \f
3468 #ifdef RE_ENABLE_I18N
3469 static void
3470 free_charset (re_charset_t *cset)
3471 {
3472   re_free (cset->mbchars);
3473 # ifdef _LIBC
3474   re_free (cset->coll_syms);
3475   re_free (cset->equiv_classes);
3476   re_free (cset->range_starts);
3477   re_free (cset->range_ends);
3478 # endif
3479   re_free (cset->char_classes);
3480   re_free (cset);
3481 }
3482 #endif /* RE_ENABLE_I18N */
3483 \f
3484 /* Functions for binary tree operation.  */
3485
3486 /* Create a node of tree.
3487    Note: This function automatically free left and right if malloc fails.  */
3488
3489 static bin_tree_t *
3490 create_tree (left, right, type, index)
3491      bin_tree_t *left;
3492      bin_tree_t *right;
3493      re_token_type_t type;
3494      int index;
3495 {
3496   bin_tree_t *tree;
3497   tree = re_malloc (bin_tree_t, 1);
3498   if (BE (tree == NULL, 0))
3499     {
3500       free_bin_tree (left);
3501       free_bin_tree (right);
3502       return NULL;
3503     }
3504   tree->parent = NULL;
3505   tree->left = left;
3506   tree->right = right;
3507   tree->type = type;
3508   tree->node_idx = index;
3509   tree->first = -1;
3510   tree->next = -1;
3511   re_node_set_init_empty (&tree->eclosure);
3512
3513   if (left != NULL)
3514     left->parent = tree;
3515   if (right != NULL)
3516     right->parent = tree;
3517   return tree;
3518 }
3519
3520 /* Free the sub tree pointed by TREE.  */
3521
3522 static void
3523 free_bin_tree (tree)
3524      bin_tree_t *tree;
3525 {
3526   if (tree == NULL)
3527     return;
3528   /*re_node_set_free (&tree->eclosure);*/
3529   free_bin_tree (tree->left);
3530   free_bin_tree (tree->right);
3531   re_free (tree);
3532 }
3533
3534 /* Duplicate the node SRC, and return new node.  */
3535
3536 static bin_tree_t *
3537 duplicate_tree (src, dfa)
3538      const bin_tree_t *src;
3539      re_dfa_t *dfa;
3540 {
3541   bin_tree_t *left = NULL, *right = NULL, *new_tree;
3542   int new_node_idx;
3543   /* Since node indies must be according to Post-order of the tree,
3544      we must duplicate the left at first.  */
3545   if (src->left != NULL)
3546     {
3547       left = duplicate_tree (src->left, dfa);
3548       if (left == NULL)
3549         return NULL;
3550     }
3551
3552   /* Secondaly, duplicate the right.  */
3553   if (src->right != NULL)
3554     {
3555       right = duplicate_tree (src->right, dfa);
3556       if (right == NULL)
3557         {
3558           free_bin_tree (left);
3559           return NULL;
3560         }
3561     }
3562
3563   /* At last, duplicate itself.  */
3564   if (src->type == NON_TYPE)
3565     {
3566       new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3567       dfa->nodes[new_node_idx].duplicated = 1;
3568       if (BE (new_node_idx == -1, 0))
3569         {
3570           free_bin_tree (left);
3571           free_bin_tree (right);
3572           return NULL;
3573         }
3574     }
3575   else
3576     new_node_idx = src->type;
3577
3578   new_tree = create_tree (left, right, src->type, new_node_idx);
3579   if (BE (new_tree == NULL, 0))
3580     {
3581       free_bin_tree (left);
3582       free_bin_tree (right);
3583     }
3584   return new_tree;
3585 }