posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 #include <assert.h>
  22 #include <ctype.h>
  23 #include <limits.h>
  24 #include <locale.h>
  25 #include <stdio.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28
  29 #if defined HAVE_WCHAR_H || defined _LIBC
  30 # include <wchar.h>
  31 #endif /* HAVE_WCHAR_H || _LIBC */
  32 #if defined HAVE_WCTYPE_H || defined _LIBC
  33 # include <wctype.h>
  34 #endif /* HAVE_WCTYPE_H || _LIBC */
  35
  36 /* In case that the system doesn't have isblank().  */
  37 #if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank
  38 # define isblank(ch) ((ch) == ' ' || (ch) == '\t')
  39 #endif
  40
  41 #ifdef _LIBC
  42 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
  43 #  define _RE_DEFINE_LOCALE_FUNCTIONS 1
  44 #   include <locale/localeinfo.h>
  45 #   include <locale/elem-hash.h>
  46 #   include <locale/coll-lookup.h>
  47 # endif
  48 #endif
  49
  50 /* This is for other GNU distributions with internationalized messages.  */
  51 #if HAVE_LIBINTL_H || defined _LIBC
  52 # include <libintl.h>
  53 # ifdef _LIBC
  54 #  undef gettext
  55 #  define gettext(msgid) \
  56   INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)
  57 # endif
  58 #else
  59 # define gettext(msgid) (msgid)
  60 #endif
  61
  62 #ifndef gettext_noop
  63 /* This define is so xgettext can find the internationalizable
  64    strings.  */
  65 # define gettext_noop(String) String
  66 #endif
  67
  68 #include <regex.h>
  69 #include "regex_internal.h"
  70
  71 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  72                                           int length, reg_syntax_t syntax);
  73 static void re_compile_fastmap_iter (regex_t *bufp,
  74                                      const re_dfastate_t *init_state,
  75                                      char *fastmap);
  76 static reg_errcode_t init_dfa (re_dfa_t *dfa, int pat_len);
  77 static reg_errcode_t init_word_char (re_dfa_t *dfa);
  78 #ifdef RE_ENABLE_I18N
  79 static void free_charset (re_charset_t *cset);
  80 #endif /* RE_ENABLE_I18N */
  81 static void free_workarea_compile (regex_t *preg);
  82 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  83 static reg_errcode_t analyze (re_dfa_t *dfa);
  84 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
  85 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
  86 static void calc_next (re_dfa_t *dfa, bin_tree_t *node);
  87 static void calc_epsdest (re_dfa_t *dfa, bin_tree_t *node);
  88 static reg_errcode_t duplicate_node (int *new_idx, re_dfa_t *dfa, int org_idx,
  89                                      unsigned int constraint);
  90 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  91 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  92                                          int node, int root);
  93 static void calc_inveclosure (re_dfa_t *dfa);
  94 static int fetch_number (re_string_t *input, re_token_t *token,
  95                          reg_syntax_t syntax);
  96 static re_token_t fetch_token (re_string_t *input, reg_syntax_t syntax);
  97 static int peek_token (re_token_t *token, re_string_t *input,
  98                         reg_syntax_t syntax);
  99 static int peek_token_bracket (re_token_t *token, re_string_t *input,
 100                                reg_syntax_t syntax);
 101 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
 102                           reg_syntax_t syntax, reg_errcode_t *err);
 103 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
 104                                   re_token_t *token, reg_syntax_t syntax,
 105                                   int nest, reg_errcode_t *err);
 106 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
 107                                  re_token_t *token, reg_syntax_t syntax,
 108                                  int nest, reg_errcode_t *err);
 109 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
 110                                      re_token_t *token, reg_syntax_t syntax,
 111                                      int nest, reg_errcode_t *err);
 112 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
 113                                   re_token_t *token, reg_syntax_t syntax,
 114                                   int nest, reg_errcode_t *err);
 115 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
 116                                  re_dfa_t *dfa, re_token_t *token,
 117                                  reg_syntax_t syntax, reg_errcode_t *err);
 118 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
 119                                       re_token_t *token, reg_syntax_t syntax,
 120                                       reg_errcode_t *err);
 121 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
 122                                             re_string_t *regexp,
 123                                             re_token_t *token, int token_len,
 124                                             re_dfa_t *dfa,
 125                                             reg_syntax_t syntax);
 126 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
 127                                           re_string_t *regexp,
 128                                           re_token_t *token);
 129 #ifndef _LIBC
 130 # ifdef RE_ENABLE_I18N
 131 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 132                                       re_charset_t *mbcset, int *range_alloc,
 133                                       bracket_elem_t *start_elem,
 134                                       bracket_elem_t *end_elem);
 135 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 136                                              re_charset_t *mbcset,
 137                                              int *coll_sym_alloc,
 138                                              const unsigned char *name);
 139 # else /* not RE_ENABLE_I18N */
 140 static reg_errcode_t build_range_exp (re_bitset_ptr_t sbcset,
 141                                       bracket_elem_t *start_elem,
 142                                       bracket_elem_t *end_elem);
 143 static reg_errcode_t build_collating_symbol (re_bitset_ptr_t sbcset,
 144                                              const unsigned char *name);
 145 # endif /* not RE_ENABLE_I18N */
 146 #endif /* not _LIBC */
 147 #ifdef RE_ENABLE_I18N
 148 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 149                                         re_charset_t *mbcset,
 150                                         int *equiv_class_alloc,
 151                                         const unsigned char *name);
 152 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
 153                                       re_charset_t *mbcset,
 154                                       int *char_class_alloc,
 155                                       const unsigned char *class_name,
 156                                       reg_syntax_t syntax);
 157 #else  /* not RE_ENABLE_I18N */
 158 static reg_errcode_t build_equiv_class (re_bitset_ptr_t sbcset,
 159                                         const unsigned char *name);
 160 static reg_errcode_t build_charclass (re_bitset_ptr_t sbcset,
 161                                       const unsigned char *class_name,
 162                                       reg_syntax_t syntax);
 163 #endif /* not RE_ENABLE_I18N */
 164 static bin_tree_t *build_word_op (re_dfa_t *dfa, int not, reg_errcode_t *err);
 165 static void free_bin_tree (bin_tree_t *tree);
 166 static bin_tree_t *create_tree (bin_tree_t *left, bin_tree_t *right,
 167                                 re_token_type_t type, int index);
 168 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 169 \f
 170 /* This table gives an error message for each of the error codes listed
 171    in regex.h.  Obviously the order here has to be same as there.
 172    POSIX doesn't require that we do anything for REG_NOERROR,
 173    but why not be nice?  */
 174
 175 const char __re_error_msgid[] attribute_hidden =
 176   {
 177 #define REG_NOERROR_IDX 0
 178     gettext_noop ("Success")    /* REG_NOERROR */
 179     "\0"
 180 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 181     gettext_noop ("No match")   /* REG_NOMATCH */
 182     "\0"
 183 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 184     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 185     "\0"
 186 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 187     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 188     "\0"
 189 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 190     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 191     "\0"
 192 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 193     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 194     "\0"
 195 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 196     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 197     "\0"
 198 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 199     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 200     "\0"
 201 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 202     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 203     "\0"
 204 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 205     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 206     "\0"
 207 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 208     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 209     "\0"
 210 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 211     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 212     "\0"
 213 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 214     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 215     "\0"
 216 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 217     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 218     "\0"
 219 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 220     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 221     "\0"
 222 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 223     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 224     "\0"
 225 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 226     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 227   };
 228
 229 const size_t __re_error_msgid_idx[] attribute_hidden =
 230   {
 231     REG_NOERROR_IDX,
 232     REG_NOMATCH_IDX,
 233     REG_BADPAT_IDX,
 234     REG_ECOLLATE_IDX,
 235     REG_ECTYPE_IDX,
 236     REG_EESCAPE_IDX,
 237     REG_ESUBREG_IDX,
 238     REG_EBRACK_IDX,
 239     REG_EPAREN_IDX,
 240     REG_EBRACE_IDX,
 241     REG_BADBR_IDX,
 242     REG_ERANGE_IDX,
 243     REG_ESPACE_IDX,
 244     REG_BADRPT_IDX,
 245     REG_EEND_IDX,
 246     REG_ESIZE_IDX,
 247     REG_ERPAREN_IDX
 248   };
 249 \f
 250 /* Entry points for GNU code.  */
 251
 252 /* re_compile_pattern is the GNU regular expression compiler: it
 253    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 254    Returns 0 if the pattern was valid, otherwise an error string.
 255
 256    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 257    are set in BUFP on entry.  */
 258
 259 const char *
 260 re_compile_pattern (pattern, length, bufp)
 261     const char *pattern;
 262     size_t length;
 263     struct re_pattern_buffer *bufp;
 264 {
 265   reg_errcode_t ret;
 266
 267   /* GNU code is written to assume at least RE_NREGS registers will be set
 268      (and at least one extra will be -1).  */
 269   bufp->regs_allocated = REGS_UNALLOCATED;
 270
 271   /* And GNU code determines whether or not to get register information
 272      by passing null for the REGS argument to re_match, etc., not by
 273      setting no_sub.  */
 274   bufp->no_sub = 0;
 275
 276   /* Match anchors at newline.  */
 277   bufp->newline_anchor = 1;
 278
 279   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 280
 281   if (!ret)
 282     return NULL;
 283   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 284 }
 285 #ifdef _LIBC
 286 weak_alias (__re_compile_pattern, re_compile_pattern)
 287 #endif
 288
 289 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 290    also be assigned to arbitrarily: each pattern buffer stores its own
 291    syntax, so it can be changed between regex compilations.  */
 292 /* This has no initializer because initialized variables in Emacs
 293    become read-only after dumping.  */
 294 reg_syntax_t re_syntax_options;
 295
 296
 297 /* Specify the precise syntax of regexps for compilation.  This provides
 298    for compatibility for various utilities which historically have
 299    different, incompatible syntaxes.
 300
 301    The argument SYNTAX is a bit mask comprised of the various bits
 302    defined in regex.h.  We return the old syntax.  */
 303
 304 reg_syntax_t
 305 re_set_syntax (syntax)
 306     reg_syntax_t syntax;
 307 {
 308   reg_syntax_t ret = re_syntax_options;
 309
 310   re_syntax_options = syntax;
 311   return ret;
 312 }
 313 #ifdef _LIBC
 314 weak_alias (__re_set_syntax, re_set_syntax)
 315 #endif
 316
 317 int
 318 re_compile_fastmap (bufp)
 319     struct re_pattern_buffer *bufp;
 320 {
 321   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 322   char *fastmap = bufp->fastmap;
 323
 324   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 325   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 326   if (dfa->init_state != dfa->init_state_word)
 327     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 328   if (dfa->init_state != dfa->init_state_nl)
 329     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 330   if (dfa->init_state != dfa->init_state_begbuf)
 331     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 332   bufp->fastmap_accurate = 1;
 333   return 0;
 334 }
 335 #ifdef _LIBC
 336 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 337 #endif
 338
 339 /* Helper function for re_compile_fastmap.
 340    Compile fastmap for the initial_state INIT_STATE.  */
 341
 342 static void
 343 re_compile_fastmap_iter (bufp, init_state, fastmap)
 344      regex_t *bufp;
 345      const re_dfastate_t *init_state;
 346      char *fastmap;
 347 {
 348   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 349   int node_cnt;
 350   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 351     {
 352       int node = init_state->nodes.elems[node_cnt];
 353       re_token_type_t type = dfa->nodes[node].type;
 354       if (type == OP_CONTEXT_NODE)
 355         {
 356           node = dfa->nodes[node].opr.ctx_info->entity;
 357           type = dfa->nodes[node].type;
 358         }
 359
 360       if (type == CHARACTER)
 361         fastmap[dfa->nodes[node].opr.c] = 1;
 362       else if (type == SIMPLE_BRACKET)
 363         {
 364           int i, j, ch;
 365           for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 366             for (j = 0; j < UINT_BITS; ++j, ++ch)
 367               if (dfa->nodes[node].opr.sbcset[i] & (1 << j))
 368                 fastmap[ch] = 1;
 369         }
 370 #ifdef RE_ENABLE_I18N
 371       else if (type == COMPLEX_BRACKET)
 372         {
 373           int i;
 374           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 375           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 376               || cset->nranges || cset->nchar_classes)
 377             {
 378 # ifdef _LIBC
 379               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 380                 {
 381                   /* In this case we want to catch the bytes which are
 382                      the first byte of any collation elements.
 383                      e.g. In da_DK, we want to catch 'a' since "aa"
 384                           is a valid collation element, and don't catch
 385                           'b' since 'b' is the only collation element
 386                           which starts from 'b'.  */
 387                   int j, ch;
 388                   const int32_t *table = (const int32_t *)
 389                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 390                   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 391                     for (j = 0; j < UINT_BITS; ++j, ++ch)
 392                       if (table[ch] < 0)
 393                         fastmap[ch] = 1;
 394                 }
 395 # else
 396               if (MB_CUR_MAX > 1)
 397                 for (i = 0; i < SBC_MAX; ++i)
 398                   if (__btowc (i) == WEOF)
 399                     fastmap[i] = 1;
 400 # endif /* not _LIBC */
 401             }
 402           for (i = 0; i < cset->nmbchars; ++i)
 403             {
 404               char buf[256];
 405               wctomb (buf, cset->mbchars[i]);
 406               fastmap[*(unsigned char *) buf] = 1;
 407             }
 408         }
 409 #endif /* RE_ENABLE_I18N */
 410       else if (type == END_OF_RE || type == OP_PERIOD
 411 #ifdef RE_ENABLE_I18N
 412                || type == COMPLEX_BRACKET
 413 #endif /* RE_ENABLE_I18N */
 414               )
 415         {
 416           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 417           if (type == END_OF_RE)
 418             bufp->can_be_null = 1;
 419           return;
 420         }
 421     }
 422 }
 423 \f
 424 /* Entry point for POSIX code.  */
 425 /* regcomp takes a regular expression as a string and compiles it.
 426
 427    PREG is a regex_t *.  We do not expect any fields to be initialized,
 428    since POSIX says we shouldn't.  Thus, we set
 429
 430      `buffer' to the compiled pattern;
 431      `used' to the length of the compiled pattern;
 432      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 433        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 434        RE_SYNTAX_POSIX_BASIC;
 435      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 436      `fastmap' to an allocated space for the fastmap;
 437      `fastmap_accurate' to zero;
 438      `re_nsub' to the number of subexpressions in PATTERN.
 439
 440    PATTERN is the address of the pattern string.
 441
 442    CFLAGS is a series of bits which affect compilation.
 443
 444      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 445      use POSIX basic syntax.
 446
 447      If REG_NEWLINE is set, then . and [^...] don't match newline.
 448      Also, regexec will try a match beginning after every newline.
 449
 450      If REG_ICASE is set, then we considers upper- and lowercase
 451      versions of letters to be equivalent when matching.
 452
 453      If REG_NOSUB is set, then when PREG is passed to regexec, that
 454      routine will report only success or failure, and nothing about the
 455      registers.
 456
 457    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 458    the return codes and their meanings.)  */
 459
 460 int
 461 regcomp (preg, pattern, cflags)
 462     regex_t *__restrict preg;
 463     const char *__restrict pattern;
 464     int cflags;
 465 {
 466   reg_errcode_t ret;
 467   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 468                          : RE_SYNTAX_POSIX_BASIC);
 469
 470   preg->buffer = NULL;
 471   preg->allocated = 0;
 472   preg->used = 0;
 473
 474   /* Try to allocate space for the fastmap.  */
 475   preg->fastmap = re_malloc (char, SBC_MAX);
 476   if (BE (preg->fastmap == NULL, 0))
 477     return REG_ESPACE;
 478
 479   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 480
 481   /* If REG_NEWLINE is set, newlines are treated differently.  */
 482   if (cflags & REG_NEWLINE)
 483     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 484       syntax &= ~RE_DOT_NEWLINE;
 485       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 486       /* It also changes the matching behavior.  */
 487       preg->newline_anchor = 1;
 488     }
 489   else
 490     preg->newline_anchor = 0;
 491   preg->no_sub = !!(cflags & REG_NOSUB);
 492   preg->translate = NULL;
 493
 494   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 495
 496   /* POSIX doesn't distinguish between an unmatched open-group and an
 497      unmatched close-group: both are REG_EPAREN.  */
 498   if (ret == REG_ERPAREN)
 499     ret = REG_EPAREN;
 500
 501   /* We have already checked preg->fastmap != NULL.  */
 502   if (BE (ret == REG_NOERROR, 1))
 503     {
 504       /* Compute the fastmap now, since regexec cannot modify the pattern
 505          buffer.  */
 506       if (BE (re_compile_fastmap (preg) == -2, 0))
 507         {
 508           /* Some error occurred while computing the fastmap, just forget
 509              about it.  */
 510           re_free (preg->fastmap);
 511           preg->fastmap = NULL;
 512         }
 513     }
 514
 515   return (int) ret;
 516 }
 517 #ifdef _LIBC
 518 weak_alias (__regcomp, regcomp)
 519 #endif
 520
 521 /* Returns a message corresponding to an error code, ERRCODE, returned
 522    from either regcomp or regexec.   We don't use PREG here.  */
 523
 524 size_t
 525 regerror (errcode, preg, errbuf, errbuf_size)
 526     int errcode;
 527     const regex_t *preg;
 528     char *errbuf;
 529     size_t errbuf_size;
 530 {
 531   const char *msg;
 532   size_t msg_size;
 533
 534   if (BE (errcode < 0
 535           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 536                                / sizeof (__re_error_msgid_idx[0])), 0))
 537     /* Only error codes returned by the rest of the code should be passed
 538        to this routine.  If we are given anything else, or if other regex
 539        code generates an invalid error code, then the program has a bug.
 540        Dump core so we can fix it.  */
 541     abort ();
 542
 543   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 544
 545   msg_size = strlen (msg) + 1; /* Includes the null.  */
 546
 547   if (BE (errbuf_size != 0, 1))
 548     {
 549       if (BE (msg_size > errbuf_size, 0))
 550         {
 551 #if defined HAVE_MEMPCPY || defined _LIBC
 552           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 553 #else
 554           memcpy (errbuf, msg, errbuf_size - 1);
 555           errbuf[errbuf_size - 1] = 0;
 556 #endif
 557         }
 558       else
 559         memcpy (errbuf, msg, msg_size);
 560     }
 561
 562   return msg_size;
 563 }
 564 #ifdef _LIBC
 565 weak_alias (__regerror, regerror)
 566 #endif
 567
 568 /* Free dynamically allocated space used by PREG.  */
 569
 570 void
 571 regfree (preg)
 572     regex_t *preg;
 573 {
 574   int i, j;
 575   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 576   if (BE (dfa != NULL, 1))
 577     {
 578       re_free (dfa->subexps);
 579
 580       for (i = 0; i < dfa->nodes_len; ++i)
 581         {
 582           re_token_t *node = dfa->nodes + i;
 583 #ifdef RE_ENABLE_I18N
 584           if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
 585             free_charset (node->opr.mbcset);
 586           else
 587 #endif /* RE_ENABLE_I18N */
 588           if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
 589             re_free (node->opr.sbcset);
 590           else if (node->type == OP_CONTEXT_NODE)
 591             {
 592               if (dfa->nodes[node->opr.ctx_info->entity].type == OP_BACK_REF)
 593                 {
 594                   if (node->opr.ctx_info->bkref_eclosure != NULL)
 595                     re_node_set_free (node->opr.ctx_info->bkref_eclosure);
 596                   re_free (node->opr.ctx_info->bkref_eclosure);
 597                 }
 598               re_free (node->opr.ctx_info);
 599             }
 600         }
 601       re_free (dfa->firsts);
 602       re_free (dfa->nexts);
 603       for (i = 0; i < dfa->nodes_len; ++i)
 604         {
 605           if (dfa->eclosures != NULL)
 606             re_node_set_free (dfa->eclosures + i);
 607           if (dfa->inveclosures != NULL)
 608             re_node_set_free (dfa->inveclosures + i);
 609           if (dfa->edests != NULL)
 610             re_node_set_free (dfa->edests + i);
 611         }
 612       re_free (dfa->edests);
 613       re_free (dfa->eclosures);
 614       re_free (dfa->inveclosures);
 615       re_free (dfa->nodes);
 616
 617       for (i = 0; i <= dfa->state_hash_mask; ++i)
 618         {
 619           struct re_state_table_entry *entry = dfa->state_table + i;
 620           for (j = 0; j < entry->num; ++j)
 621             {
 622               re_dfastate_t *state = entry->array[j];
 623               if (state->entrance_nodes != &state->nodes)
 624                 {
 625                   re_node_set_free (state->entrance_nodes);
 626                   re_free (state->entrance_nodes);
 627                 }
 628               re_node_set_free (&state->nodes);
 629               re_free (state->trtable);
 630               re_free (state->trtable_search);
 631               re_free (state);
 632             }
 633           re_free (entry->array);
 634         }
 635       re_free (dfa->state_table);
 636
 637       if (dfa->word_char != NULL)
 638         re_free (dfa->word_char);
 639 #ifdef DEBUG
 640       re_free (dfa->re_str);
 641 #endif
 642       re_free (dfa);
 643     }
 644   re_free (preg->fastmap);
 645 }
 646 #ifdef _LIBC
 647 weak_alias (__regfree, regfree)
 648 #endif
 649 \f
 650 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 651    them unless specifically requested.  */
 652
 653 #if defined _REGEX_RE_COMP || defined _LIBC
 654
 655 /* BSD has one and only one pattern buffer.  */
 656 static struct re_pattern_buffer re_comp_buf;
 657
 658 char *
 659 # ifdef _LIBC
 660 /* Make these definitions weak in libc, so POSIX programs can redefine
 661    these names if they don't use our functions, and still use
 662    regcomp/regexec above without link errors.  */
 663 weak_function
 664 # endif
 665 re_comp (s)
 666      const char *s;
 667 {
 668   reg_errcode_t ret;
 669
 670   if (!s)
 671     {
 672       if (!re_comp_buf.buffer)
 673         return gettext ("No previous regular expression");
 674       return 0;
 675     }
 676
 677   if (!re_comp_buf.buffer)
 678     {
 679       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 680       if (re_comp_buf.fastmap == NULL)
 681         return (char *) gettext (__re_error_msgid
 682                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 683     }
 684
 685   /* Since `re_exec' always passes NULL for the `regs' argument, we
 686      don't need to initialize the pattern buffer fields which affect it.  */
 687
 688   /* Match anchors at newlines.  */
 689   re_comp_buf.newline_anchor = 1;
 690
 691   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 692
 693   if (!ret)
 694     return NULL;
 695
 696   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 697   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 698 }
 699 #endif /* _REGEX_RE_COMP */
 700 \f
 701 /* Internal entry point.
 702    Compile the regular expression PATTERN, whose length is LENGTH.
 703    SYNTAX indicate regular expression's syntax.  */
 704
 705 static reg_errcode_t
 706 re_compile_internal (preg, pattern, length, syntax)
 707      regex_t *preg;
 708      const char * pattern;
 709      int length;
 710      reg_syntax_t syntax;
 711 {
 712   reg_errcode_t err = REG_NOERROR;
 713   re_dfa_t *dfa;
 714   re_string_t regexp;
 715
 716   /* Initialize the pattern buffer.  */
 717   preg->fastmap_accurate = 0;
 718   preg->syntax = syntax;
 719   preg->not_bol = preg->not_eol = 0;
 720   preg->used = 0;
 721   preg->re_nsub = 0;
 722
 723   /* Initialize the dfa.  */
 724   dfa = (re_dfa_t *) preg->buffer;
 725   if (preg->allocated < sizeof (re_dfa_t))
 726     {
 727       /* If zero allocated, but buffer is non-null, try to realloc
 728          enough space.  This loses if buffer's address is bogus, but
 729          that is the user's responsibility.  If ->buffer is NULL this
 730          is a simple allocation.  */
 731       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 732       if (dfa == NULL)
 733         return REG_ESPACE;
 734       preg->allocated = sizeof (re_dfa_t);
 735     }
 736   preg->buffer = (unsigned char *) dfa;
 737   preg->used = sizeof (re_dfa_t);
 738
 739   err = init_dfa (dfa, length);
 740   if (BE (err != REG_NOERROR, 0))
 741     {
 742       re_free (dfa);
 743       preg->buffer = NULL;
 744       return err;
 745     }
 746 #ifdef DEBUG
 747   dfa->re_str = re_malloc (char, length + 1);
 748   strncpy (dfa->re_str, pattern, length + 1);
 749 #endif
 750
 751   err = re_string_construct (&regexp, pattern, length, preg->translate,
 752                              syntax & RE_ICASE);
 753   if (BE (err != REG_NOERROR, 0))
 754     {
 755       re_free (dfa);
 756       preg->buffer = NULL;
 757       return err;
 758     }
 759
 760   /* Parse the regular expression, and build a structure tree.  */
 761   preg->re_nsub = 0;
 762   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 763   if (BE (dfa->str_tree == NULL, 0))
 764     goto re_compile_internal_free_return;
 765
 766   /* Analyze the tree and collect information which is necessary to
 767      create the dfa.  */
 768   err = analyze (dfa);
 769   if (BE (err != REG_NOERROR, 0))
 770     goto re_compile_internal_free_return;
 771
 772   /* Then create the initial state of the dfa.  */
 773   err = create_initial_state (dfa);
 774   if (BE (err != REG_NOERROR, 0))
 775     goto re_compile_internal_free_return;
 776
 777  re_compile_internal_free_return:
 778   /* Release work areas.  */
 779   free_workarea_compile (preg);
 780   re_string_destruct (&regexp);
 781
 782   return err;
 783 }
 784
 785 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 786    as the initial length of some arrays.  */
 787
 788 static reg_errcode_t
 789 init_dfa (dfa, pat_len)
 790      re_dfa_t *dfa;
 791      int pat_len;
 792 {
 793   int table_size;
 794
 795   memset (dfa, '\0', sizeof (re_dfa_t));
 796
 797   dfa->nodes_alloc = pat_len + 1;
 798   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 799
 800   dfa->states_alloc = pat_len + 1;
 801
 802   /*  table_size = 2 ^ ceil(log pat_len) */
 803   for (table_size = 1; table_size > 0; table_size <<= 1)
 804     if (table_size > pat_len)
 805       break;
 806
 807   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 808   dfa->state_hash_mask = table_size - 1;
 809
 810   dfa->subexps_alloc = 1;
 811   dfa->subexps = re_malloc (re_subexp_t, dfa->subexps_alloc);
 812   dfa->word_char = NULL;
 813
 814   if (BE (dfa->nodes == NULL || dfa->state_table == NULL
 815           || dfa->subexps == NULL, 0))
 816     {
 817       /* We don't bother to free anything which was allocated.  Very
 818          soon the process will go down anyway.  */
 819       dfa->subexps = NULL;
 820       dfa->state_table = NULL;
 821       dfa->nodes = NULL;
 822       return REG_ESPACE;
 823     }
 824   return REG_NOERROR;
 825 }
 826
 827 /* Initialize WORD_CHAR table, which indicate which character is
 828    "word".  In this case "word" means that it is the word construction
 829    character used by some operators like "\<", "\>", etc.  */
 830
 831 static reg_errcode_t
 832 init_word_char (dfa)
 833      re_dfa_t *dfa;
 834 {
 835   int i, j, ch;
 836   dfa->word_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
 837   if (BE (dfa->word_char == NULL, 0))
 838     return REG_ESPACE;
 839   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 840     for (j = 0; j < UINT_BITS; ++j, ++ch)
 841       if (isalnum (ch) || ch == '_')
 842         dfa->word_char[i] |= 1 << j;
 843   return REG_NOERROR;
 844 }
 845
 846 /* Free the work area which are only used while compiling.  */
 847
 848 static void
 849 free_workarea_compile (preg)
 850      regex_t *preg;
 851 {
 852   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 853   free_bin_tree (dfa->str_tree);
 854   dfa->str_tree = NULL;
 855 }
 856
 857 /* Create initial states for all contexts.  */
 858
 859 static reg_errcode_t
 860 create_initial_state (dfa)
 861      re_dfa_t *dfa;
 862 {
 863   int first, i;
 864   reg_errcode_t err;
 865   re_node_set init_nodes;
 866
 867   /* Initial states have the epsilon closure of the node which is
 868      the first node of the regular expression.  */
 869   first = dfa->str_tree->first;
 870   dfa->init_node = first;
 871   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 872   if (BE (err != REG_NOERROR, 0))
 873     return err;
 874
 875   /* The back-references which are in initial states can epsilon transit,
 876      since in this case all of the subexpressions can be null.
 877      Then we add epsilon closures of the nodes which are the next nodes of
 878      the back-references.  */
 879   if (dfa->nbackref > 0)
 880     for (i = 0; i < init_nodes.nelem; ++i)
 881       {
 882         int node_idx = init_nodes.elems[i];
 883         re_token_type_t type = dfa->nodes[node_idx].type;
 884
 885         int clexp_idx;
 886         int entity = (type != OP_CONTEXT_NODE ? node_idx
 887                       : dfa->nodes[node_idx].opr.ctx_info->entity);
 888         if ((type != OP_CONTEXT_NODE
 889              || (dfa->nodes[entity].type != OP_BACK_REF))
 890             && (type != OP_BACK_REF))
 891           continue;
 892         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
 893           {
 894             re_token_t *clexp_node;
 895             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
 896             if (clexp_node->type == OP_CLOSE_SUBEXP
 897                 && clexp_node->opr.idx + 1 == dfa->nodes[entity].opr.idx)
 898               break;
 899           }
 900         if (clexp_idx == init_nodes.nelem)
 901           continue;
 902
 903         if (type == OP_CONTEXT_NODE
 904             && (dfa->nodes[dfa->nodes[node_idx].opr.ctx_info->entity].type
 905                 == OP_BACK_REF))
 906           {
 907             int prev_nelem = init_nodes.nelem;
 908             re_node_set_merge (&init_nodes,
 909                            dfa->nodes[node_idx].opr.ctx_info->bkref_eclosure);
 910             if (prev_nelem < init_nodes.nelem)
 911               i = 0;
 912           }
 913         else if (type == OP_BACK_REF)
 914           {
 915             int next_idx = dfa->nexts[node_idx];
 916             if (!re_node_set_contains (&init_nodes, next_idx))
 917               {
 918                 re_node_set_merge (&init_nodes, dfa->eclosures + next_idx);
 919                 i = 0;
 920               }
 921           }
 922       }
 923
 924   /* It must be the first time to invoke acquire_state.  */
 925   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
 926   /* We don't check ERR here, since the initial state must not be NULL.  */
 927   if (BE (dfa->init_state == NULL, 0))
 928     return err;
 929   if (dfa->init_state->has_constraint)
 930     {
 931       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
 932                                                        CONTEXT_WORD);
 933       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
 934                                                      CONTEXT_NEWLINE);
 935       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
 936                                                          &init_nodes,
 937                                                          CONTEXT_NEWLINE
 938                                                          | CONTEXT_BEGBUF);
 939       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 940               || dfa->init_state_begbuf == NULL, 0))
 941         return err;
 942     }
 943   else
 944     dfa->init_state_word = dfa->init_state_nl
 945       = dfa->init_state_begbuf = dfa->init_state;
 946
 947   re_node_set_free (&init_nodes);
 948   return REG_NOERROR;
 949 }
 950 \f
 951 /* Analyze the structure tree, and calculate "first", "next", "edest",
 952    "eclosure", and "inveclosure".  */
 953
 954 static reg_errcode_t
 955 analyze (dfa)
 956      re_dfa_t *dfa;
 957 {
 958   int i;
 959   reg_errcode_t ret;
 960
 961   /* Allocate arrays.  */
 962   dfa->firsts = re_malloc (int, dfa->nodes_alloc);
 963   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
 964   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
 965   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
 966   dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_alloc);
 967   if (BE (dfa->firsts == NULL || dfa->nexts == NULL || dfa->edests == NULL
 968           || dfa->eclosures == NULL || dfa->inveclosures == NULL, 0))
 969     return REG_ESPACE;
 970   /* Initialize them.  */
 971   for (i = 0; i < dfa->nodes_len; ++i)
 972     {
 973       dfa->firsts[i] = -1;
 974       dfa->nexts[i] = -1;
 975       re_node_set_init_empty (dfa->edests + i);
 976       re_node_set_init_empty (dfa->eclosures + i);
 977       re_node_set_init_empty (dfa->inveclosures + i);
 978     }
 979
 980   ret = analyze_tree (dfa, dfa->str_tree);
 981   if (BE (ret == REG_NOERROR, 1))
 982     {
 983       ret = calc_eclosure (dfa);
 984       if (ret == REG_NOERROR)
 985         calc_inveclosure (dfa);
 986     }
 987   return ret;
 988 }
 989
 990 /* Helper functions for analyze.
 991    This function calculate "first", "next", and "edest" for the subtree
 992    whose root is NODE.  */
 993
 994 static reg_errcode_t
 995 analyze_tree (dfa, node)
 996      re_dfa_t *dfa;
 997      bin_tree_t *node;
 998 {
 999   reg_errcode_t ret;
1000   if (node->first == -1)
1001     calc_first (dfa, node);
1002   if (node->next == -1)
1003     calc_next (dfa, node);
1004   if (node->eclosure.nelem == 0)
1005     calc_epsdest (dfa, node);
1006   /* Calculate "first" etc. for the left child.  */
1007   if (node->left != NULL)
1008     {
1009       ret = analyze_tree (dfa, node->left);
1010       if (BE (ret != REG_NOERROR, 0))
1011         return ret;
1012     }
1013   /* Calculate "first" etc. for the right child.  */
1014   if (node->right != NULL)
1015     {
1016       ret = analyze_tree (dfa, node->right);
1017       if (BE (ret != REG_NOERROR, 0))
1018         return ret;
1019     }
1020   return REG_NOERROR;
1021 }
1022
1023 /* Calculate "first" for the node NODE.  */
1024 static void
1025 calc_first (dfa, node)
1026      re_dfa_t *dfa;
1027      bin_tree_t *node;
1028 {
1029   int idx, type;
1030   idx = node->node_idx;
1031   type = (node->type == 0) ? dfa->nodes[idx].type : node->type;
1032
1033   switch (type)
1034     {
1035 #ifdef DEBUG
1036     case OP_OPEN_BRACKET:
1037     case OP_CLOSE_BRACKET:
1038     case OP_OPEN_DUP_NUM:
1039     case OP_CLOSE_DUP_NUM:
1040     case OP_NON_MATCH_LIST:
1041     case OP_OPEN_COLL_ELEM:
1042     case OP_CLOSE_COLL_ELEM:
1043     case OP_OPEN_EQUIV_CLASS:
1044     case OP_CLOSE_EQUIV_CLASS:
1045     case OP_OPEN_CHAR_CLASS:
1046     case OP_CLOSE_CHAR_CLASS:
1047       /* These must not be appeared here.  */
1048       assert (0);
1049 #endif
1050     case END_OF_RE:
1051     case CHARACTER:
1052     case OP_PERIOD:
1053     case OP_DUP_ASTERISK:
1054     case OP_DUP_QUESTION:
1055 #ifdef RE_ENABLE_I18N
1056     case COMPLEX_BRACKET:
1057 #endif /* RE_ENABLE_I18N */
1058     case SIMPLE_BRACKET:
1059     case OP_BACK_REF:
1060     case ANCHOR:
1061     case OP_OPEN_SUBEXP:
1062     case OP_CLOSE_SUBEXP:
1063       node->first = idx;
1064       break;
1065     case OP_DUP_PLUS:
1066 #ifdef DEBUG
1067       assert (node->left != NULL);
1068 #endif
1069       if (node->left->first == -1)
1070         calc_first (dfa, node->left);
1071       node->first = node->left->first;
1072       break;
1073     case OP_ALT:
1074       node->first = idx;
1075       break;
1076       /* else fall through */
1077     default:
1078 #ifdef DEBUG
1079       assert (node->left != NULL);
1080 #endif
1081       if (node->left->first == -1)
1082         calc_first (dfa, node->left);
1083       node->first = node->left->first;
1084       break;
1085     }
1086   if (node->type == 0)
1087     dfa->firsts[idx] = node->first;
1088 }
1089
1090 /* Calculate "next" for the node NODE.  */
1091
1092 static void
1093 calc_next (dfa, node)
1094      re_dfa_t *dfa;
1095      bin_tree_t *node;
1096 {
1097   int idx, type;
1098   bin_tree_t *parent = node->parent;
1099   if (parent == NULL)
1100     {
1101       node->next = -1;
1102       idx = node->node_idx;
1103       if (node->type == 0)
1104         dfa->nexts[idx] = node->next;
1105       return;
1106     }
1107
1108   idx = parent->node_idx;
1109   type = (parent->type == 0) ? dfa->nodes[idx].type : parent->type;
1110
1111   switch (type)
1112     {
1113     case OP_DUP_ASTERISK:
1114     case OP_DUP_PLUS:
1115       node->next = idx;
1116       break;
1117     case CONCAT:
1118       if (parent->left == node)
1119         {
1120           if (parent->right->first == -1)
1121             calc_first (dfa, parent->right);
1122           node->next = parent->right->first;
1123           break;
1124         }
1125       /* else fall through */
1126     default:
1127       if (parent->next == -1)
1128         calc_next (dfa, parent);
1129       node->next = parent->next;
1130       break;
1131     }
1132   idx = node->node_idx;
1133   if (node->type == 0)
1134     dfa->nexts[idx] = node->next;
1135 }
1136
1137 /* Calculate "edest" for the node NODE.  */
1138
1139 static void
1140 calc_epsdest (dfa, node)
1141      re_dfa_t *dfa;
1142      bin_tree_t *node;
1143 {
1144   int idx;
1145   idx = node->node_idx;
1146   if (node->type == 0)
1147     {
1148       if (dfa->nodes[idx].type == OP_DUP_ASTERISK
1149           || dfa->nodes[idx].type == OP_DUP_PLUS
1150           || dfa->nodes[idx].type == OP_DUP_QUESTION)
1151         {
1152           if (node->left->first == -1)
1153             calc_first (dfa, node->left);
1154           if (node->next == -1)
1155             calc_next (dfa, node);
1156           re_node_set_init_2 (dfa->edests + idx, node->left->first,
1157                               node->next);
1158         }
1159       else if (dfa->nodes[idx].type == OP_ALT)
1160         {
1161           int left, right;
1162           if (node->left != NULL)
1163             {
1164               if (node->left->first == -1)
1165                 calc_first (dfa, node->left);
1166               left = node->left->first;
1167             }
1168           else
1169             {
1170               if (node->next == -1)
1171                 calc_next (dfa, node);
1172               left = node->next;
1173             }
1174           if (node->right != NULL)
1175             {
1176               if (node->right->first == -1)
1177                 calc_first (dfa, node->right);
1178               right = node->right->first;
1179             }
1180           else
1181             {
1182               if (node->next == -1)
1183                 calc_next (dfa, node);
1184               right = node->next;
1185             }
1186           re_node_set_init_2 (dfa->edests + idx, left, right);
1187         }
1188       else if (dfa->nodes[idx].type == ANCHOR
1189                || dfa->nodes[idx].type == OP_OPEN_SUBEXP
1190                || dfa->nodes[idx].type == OP_CLOSE_SUBEXP)
1191         re_node_set_init_1 (dfa->edests + idx, node->next);
1192     }
1193 }
1194
1195 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1196    The new index will be stored in NEW_IDX and return REG_NOERROR if succeeded,
1197    otherwise return the error code.  */
1198
1199 static reg_errcode_t
1200 duplicate_node (new_idx, dfa, org_idx, constraint)
1201      re_dfa_t *dfa;
1202      int *new_idx, org_idx;
1203      unsigned int constraint;
1204 {
1205   re_token_t dup;
1206   int dup_idx;
1207   reg_errcode_t err;
1208
1209   dup.type = OP_CONTEXT_NODE;
1210   if (dfa->nodes[org_idx].type == OP_CONTEXT_NODE)
1211     {
1212       /* If the node whose index is ORG_IDX is the same as the intended
1213          node, use it.  */
1214       if (dfa->nodes[org_idx].constraint == constraint)
1215         {
1216           *new_idx = org_idx;
1217           return REG_NOERROR;
1218         }
1219       dup.constraint = constraint |
1220         dfa->nodes[org_idx].constraint;
1221     }
1222   else
1223     dup.constraint = constraint;
1224
1225   /* In case that `entity' points OP_CONTEXT_NODE,
1226      we correct `entity' to real entity in calc_inveclosures(). */
1227   dup.opr.ctx_info = malloc (sizeof (*dup.opr.ctx_info));
1228   dup_idx = re_dfa_add_node (dfa, dup, 1);
1229   if (BE (dup.opr.ctx_info == NULL || dup_idx == -1, 0))
1230     return REG_ESPACE;
1231   dup.opr.ctx_info->entity = org_idx;
1232   dup.opr.ctx_info->bkref_eclosure = NULL;
1233
1234   dfa->nodes[dup_idx].duplicated = 1;
1235   dfa->firsts[dup_idx] = dfa->firsts[org_idx];
1236   dfa->nexts[dup_idx] = dfa->nexts[org_idx];
1237   err = re_node_set_init_copy (dfa->edests + dup_idx, dfa->edests + org_idx);
1238   if (BE (err != REG_NOERROR, 0))
1239     return err;
1240   /* Since we don't duplicate epsilon nodes, epsilon closure have
1241      only itself.  */
1242   err = re_node_set_init_1 (dfa->eclosures + dup_idx, dup_idx);
1243   if (BE (err != REG_NOERROR, 0))
1244     return err;
1245   err = re_node_set_init_1 (dfa->inveclosures + dup_idx, dup_idx);
1246   if (BE (err != REG_NOERROR, 0))
1247     return err;
1248   /* Then we must update inveclosure for this node.
1249      We process them at last part of calc_eclosure(),
1250      since we don't complete to calculate them here.  */
1251
1252   *new_idx = dup_idx;
1253   return REG_NOERROR;
1254 }
1255
1256 static void
1257 calc_inveclosure (dfa)
1258      re_dfa_t *dfa;
1259 {
1260   int src, idx, dest, entity;
1261   for (src = 0; src < dfa->nodes_len; ++src)
1262     {
1263       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1264         {
1265           dest = dfa->eclosures[src].elems[idx];
1266           re_node_set_insert (dfa->inveclosures + dest, src);
1267         }
1268
1269       entity = src;
1270       while (dfa->nodes[entity].type == OP_CONTEXT_NODE)
1271         {
1272           entity = dfa->nodes[entity].opr.ctx_info->entity;
1273           re_node_set_merge (dfa->inveclosures + src,
1274                              dfa->inveclosures + entity);
1275           dfa->nodes[src].opr.ctx_info->entity = entity;
1276         }
1277     }
1278 }
1279
1280 /* Calculate "eclosure" for all the node in DFA.  */
1281
1282 static reg_errcode_t
1283 calc_eclosure (dfa)
1284      re_dfa_t *dfa;
1285 {
1286   int idx, node_idx, max, incomplete = 0;
1287 #ifdef DEBUG
1288   assert (dfa->nodes_len > 0);
1289 #endif
1290   /* For each nodes, calculate epsilon closure.  */
1291   for (node_idx = 0, max = dfa->nodes_len; ; ++node_idx)
1292     {
1293       reg_errcode_t err;
1294       re_node_set eclosure_elem;
1295       if (node_idx == max)
1296         {
1297           if (!incomplete)
1298             break;
1299           incomplete = 0;
1300           node_idx = 0;
1301         }
1302
1303 #ifdef DEBUG
1304       assert (dfa->nodes[node_idx].type != OP_CONTEXT_NODE);
1305       assert (dfa->eclosures[node_idx].nelem != -1);
1306 #endif
1307       /* If we have already calculated, skip it.  */
1308       if (dfa->eclosures[node_idx].nelem != 0)
1309         continue;
1310       /* Calculate epsilon closure of `node_idx'.  */
1311       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1312       if (BE (err != REG_NOERROR, 0))
1313         return err;
1314
1315       if (dfa->eclosures[node_idx].nelem == 0)
1316         {
1317           incomplete = 1;
1318           re_node_set_free (&eclosure_elem);
1319         }
1320     }
1321
1322   /* for duplicated nodes.  */
1323   for (idx = max; idx < dfa->nodes_len; ++idx)
1324     {
1325       int entity, i, constraint;
1326       re_node_set *bkref_eclosure;
1327       entity = dfa->nodes[idx].opr.ctx_info->entity;
1328       re_node_set_merge (dfa->inveclosures + idx, dfa->inveclosures + entity);
1329       if (dfa->nodes[entity].type != OP_BACK_REF)
1330         continue;
1331
1332       /* If the node is backreference, duplicate the epsilon closure of
1333          the next node. Since it may epsilon transit.  */
1334       /* Note: duplicate_node() may realloc dfa->eclosures, etc.  */
1335       bkref_eclosure = re_malloc (re_node_set, 1);
1336       if (BE (bkref_eclosure == NULL, 0))
1337         return REG_ESPACE;
1338       re_node_set_init_empty (bkref_eclosure);
1339       constraint = dfa->nodes[idx].constraint;
1340       for (i = 0; i < dfa->eclosures[dfa->nexts[idx]].nelem; ++i)
1341         {
1342           int dest_node_idx = dfa->eclosures[dfa->nexts[idx]].elems[i];
1343           if (!IS_EPSILON_NODE (dfa->nodes[dest_node_idx].type))
1344             {
1345               reg_errcode_t err;
1346               err = duplicate_node (&dest_node_idx, dfa, dest_node_idx,
1347                                     constraint);
1348               if (BE (err != REG_NOERROR, 0))
1349                 return err;
1350             }
1351           re_node_set_insert (bkref_eclosure, dest_node_idx);
1352         }
1353       dfa->nodes[idx].opr.ctx_info->bkref_eclosure = bkref_eclosure;
1354     }
1355
1356   return REG_NOERROR;
1357 }
1358
1359 /* Calculate epsilon closure of NODE.  */
1360
1361 static reg_errcode_t
1362 calc_eclosure_iter (new_set, dfa, node, root)
1363      re_node_set *new_set;
1364      re_dfa_t *dfa;
1365      int node, root;
1366 {
1367   reg_errcode_t err;
1368   unsigned int constraint;
1369   int i, max, incomplete = 0;
1370   re_node_set eclosure;
1371   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1372   if (BE (err != REG_NOERROR, 0))
1373     return err;
1374
1375   /* This indicates that we are calculating this node now.
1376      We reference this value to avoid infinite loop.  */
1377   dfa->eclosures[node].nelem = -1;
1378
1379   constraint = ((dfa->nodes[node].type == ANCHOR)
1380                 ? dfa->nodes[node].opr.ctx_type : 0);
1381
1382   /* Expand each epsilon destination nodes.  */
1383   if (dfa->edests[node].nelem != 0)
1384     for (i = 0; i < dfa->edests[node].nelem; ++i)
1385       {
1386         re_node_set eclosure_elem;
1387         int edest = dfa->edests[node].elems[i];
1388         /* If calculating the epsilon closure of `edest' is in progress,
1389            return intermediate result.  */
1390         if (dfa->eclosures[edest].nelem == -1)
1391           {
1392             incomplete = 1;
1393             continue;
1394           }
1395         /* If we haven't calculated the epsilon closure of `edest' yet,
1396            calculate now. Otherwise use calculated epsilon closure.  */
1397         if (dfa->eclosures[edest].nelem == 0)
1398           {
1399             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1400             if (BE (err != REG_NOERROR, 0))
1401               return err;
1402           }
1403         else
1404           eclosure_elem = dfa->eclosures[edest];
1405         /* Merge the epsilon closure of `edest'.  */
1406         re_node_set_merge (&eclosure, &eclosure_elem);
1407         /* If the epsilon closure of `edest' is incomplete,
1408            the epsilon closure of this node is also incomplete.  */
1409         if (dfa->eclosures[edest].nelem == 0)
1410           {
1411             incomplete = 1;
1412             re_node_set_free (&eclosure_elem);
1413           }
1414       }
1415
1416   /* If the current node has constraints, duplicate all non-epsilon nodes.
1417      Since they must inherit the constraints.  */
1418   if (constraint)
1419     for (i = 0, max = eclosure.nelem; i < max; ++i)
1420       {
1421         int dest = eclosure.elems[i];
1422         if (!IS_EPSILON_NODE (dfa->nodes[dest].type))
1423           {
1424             int dup_dest;
1425             reg_errcode_t err;
1426             err = duplicate_node (&dup_dest, dfa, dest, constraint);
1427             if (BE (err != REG_NOERROR, 0))
1428               return err;
1429             if (dest != dup_dest)
1430               {
1431                 re_node_set_remove_at (&eclosure, i--);
1432                 re_node_set_insert (&eclosure, dup_dest);
1433                 --max;
1434               }
1435           }
1436       }
1437
1438   /* Epsilon closures include itself.  */
1439   re_node_set_insert (&eclosure, node);
1440   if (incomplete && !root)
1441     dfa->eclosures[node].nelem = 0;
1442   else
1443     dfa->eclosures[node] = eclosure;
1444   *new_set = eclosure;
1445   return REG_NOERROR;
1446 }
1447 \f
1448 /* Functions for token which are used in the parser.  */
1449
1450 /* Fetch a token from INPUT.
1451    We must not use this function inside bracket expressions.  */
1452
1453 static re_token_t
1454 fetch_token (input, syntax)
1455      re_string_t *input;
1456      reg_syntax_t syntax;
1457 {
1458   re_token_t token;
1459   int consumed_byte;
1460   consumed_byte = peek_token (&token, input, syntax);
1461   re_string_skip_bytes (input, consumed_byte);
1462   return token;
1463 }
1464
1465 /* Peek a token from INPUT, and return the length of the token.
1466    We must not use this function inside bracket expressions.  */
1467
1468 static int
1469 peek_token (token, input, syntax)
1470      re_token_t *token;
1471      re_string_t *input;
1472      reg_syntax_t syntax;
1473 {
1474   unsigned char c;
1475
1476   if (re_string_eoi (input))
1477     {
1478       token->type = END_OF_RE;
1479       return 0;
1480     }
1481
1482   c = re_string_peek_byte (input, 0);
1483   token->opr.c = c;
1484
1485 #ifdef RE_ENABLE_I18N
1486   token->mb_partial = 0;
1487   if (MB_CUR_MAX > 1 &&
1488       !re_string_first_byte (input, re_string_cur_idx (input)))
1489     {
1490       token->type = CHARACTER;
1491       token->mb_partial = 1;
1492       return 1;
1493     }
1494 #endif
1495   if (c == '\\')
1496     {
1497       unsigned char c2;
1498       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1499         {
1500           token->type = BACK_SLASH;
1501           return 1;
1502         }
1503
1504       c2 = re_string_peek_byte_case (input, 1);
1505       token->opr.c = c2;
1506       token->type = CHARACTER;
1507       switch (c2)
1508         {
1509         case '|':
1510           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1511             token->type = OP_ALT;
1512           break;
1513         case '1': case '2': case '3': case '4': case '5':
1514         case '6': case '7': case '8': case '9':
1515           if (!(syntax & RE_NO_BK_REFS))
1516             {
1517               token->type = OP_BACK_REF;
1518               token->opr.idx = c2 - '0';
1519             }
1520           break;
1521         case '<':
1522           if (!(syntax & RE_NO_GNU_OPS))
1523             {
1524               token->type = ANCHOR;
1525               token->opr.idx = WORD_FIRST;
1526             }
1527           break;
1528         case '>':
1529           if (!(syntax & RE_NO_GNU_OPS))
1530             {
1531               token->type = ANCHOR;
1532               token->opr.idx = WORD_LAST;
1533             }
1534           break;
1535         case 'b':
1536           if (!(syntax & RE_NO_GNU_OPS))
1537             {
1538               token->type = ANCHOR;
1539               token->opr.idx = WORD_DELIM;
1540             }
1541           break;
1542         case 'B':
1543           if (!(syntax & RE_NO_GNU_OPS))
1544             {
1545               token->type = ANCHOR;
1546               token->opr.idx = INSIDE_WORD;
1547             }
1548           break;
1549         case 'w':
1550           if (!(syntax & RE_NO_GNU_OPS))
1551             token->type = OP_WORD;
1552           break;
1553         case 'W':
1554           if (!(syntax & RE_NO_GNU_OPS))
1555             token->type = OP_NOTWORD;
1556           break;
1557         case '`':
1558           if (!(syntax & RE_NO_GNU_OPS))
1559             {
1560               token->type = ANCHOR;
1561               token->opr.idx = BUF_FIRST;
1562             }
1563           break;
1564         case '\'':
1565           if (!(syntax & RE_NO_GNU_OPS))
1566             {
1567               token->type = ANCHOR;
1568               token->opr.idx = BUF_LAST;
1569             }
1570           break;
1571         case '(':
1572           if (!(syntax & RE_NO_BK_PARENS))
1573             token->type = OP_OPEN_SUBEXP;
1574           break;
1575         case ')':
1576           if (!(syntax & RE_NO_BK_PARENS))
1577             token->type = OP_CLOSE_SUBEXP;
1578           break;
1579         case '+':
1580           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1581             token->type = OP_DUP_PLUS;
1582           break;
1583         case '?':
1584           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1585             token->type = OP_DUP_QUESTION;
1586           break;
1587         case '{':
1588           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1589             token->type = OP_OPEN_DUP_NUM;
1590           break;
1591         case '}':
1592           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1593             token->type = OP_CLOSE_DUP_NUM;
1594           break;
1595         default:
1596           break;
1597         }
1598       return 2;
1599     }
1600
1601   token->type = CHARACTER;
1602   switch (c)
1603     {
1604     case '\n':
1605       if (syntax & RE_NEWLINE_ALT)
1606         token->type = OP_ALT;
1607       break;
1608     case '|':
1609       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1610         token->type = OP_ALT;
1611       break;
1612     case '*':
1613       token->type = OP_DUP_ASTERISK;
1614       break;
1615     case '+':
1616       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1617         token->type = OP_DUP_PLUS;
1618       break;
1619     case '?':
1620       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1621         token->type = OP_DUP_QUESTION;
1622       break;
1623     case '{':
1624       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1625         token->type = OP_OPEN_DUP_NUM;
1626       break;
1627     case '}':
1628       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1629         token->type = OP_CLOSE_DUP_NUM;
1630       break;
1631     case '(':
1632       if (syntax & RE_NO_BK_PARENS)
1633         token->type = OP_OPEN_SUBEXP;
1634       break;
1635     case ')':
1636       if (syntax & RE_NO_BK_PARENS)
1637         token->type = OP_CLOSE_SUBEXP;
1638       break;
1639     case '[':
1640       token->type = OP_OPEN_BRACKET;
1641       break;
1642     case '.':
1643       token->type = OP_PERIOD;
1644       break;
1645     case '^':
1646       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1647           re_string_cur_idx (input) != 0)
1648         {
1649           char prev = re_string_peek_byte (input, -1);
1650           if (prev != '|' && prev != '(' &&
1651               (!(syntax & RE_NEWLINE_ALT) || prev != '\n'))
1652             break;
1653         }
1654       token->type = ANCHOR;
1655       token->opr.idx = LINE_FIRST;
1656       break;
1657     case '$':
1658       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1659           re_string_cur_idx (input) + 1 != re_string_length (input))
1660         {
1661           re_token_t next;
1662           re_string_skip_bytes (input, 1);
1663           peek_token (&next, input, syntax);
1664           re_string_skip_bytes (input, -1);
1665           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1666             break;
1667         }
1668       token->type = ANCHOR;
1669       token->opr.idx = LINE_LAST;
1670       break;
1671     default:
1672       break;
1673     }
1674   return 1;
1675 }
1676
1677 /* Peek a token from INPUT, and return the length of the token.
1678    We must not use this function out of bracket expressions.  */
1679
1680 static int
1681 peek_token_bracket (token, input, syntax)
1682      re_token_t *token;
1683      re_string_t *input;
1684      reg_syntax_t syntax;
1685 {
1686   unsigned char c;
1687   if (re_string_eoi (input))
1688     {
1689       token->type = END_OF_RE;
1690       return 0;
1691     }
1692   c = re_string_peek_byte (input, 0);
1693   token->opr.c = c;
1694
1695 #ifdef RE_ENABLE_I18N
1696   if (MB_CUR_MAX > 1 &&
1697       !re_string_first_byte (input, re_string_cur_idx (input)))
1698     {
1699       token->type = CHARACTER;
1700       return 1;
1701     }
1702 #endif /* RE_ENABLE_I18N */
1703
1704   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS))
1705     {
1706       /* In this case, '\' escape a character.  */
1707       unsigned char c2;
1708       c2 = re_string_peek_byte (input, 1);
1709       token->opr.c = c2;
1710       token->type = CHARACTER;
1711       return 1;
1712     }
1713   if (c == '[') /* '[' is a special char in a bracket exps.  */
1714     {
1715       unsigned char c2;
1716       int token_len;
1717       c2 = re_string_peek_byte (input, 1);
1718       token->opr.c = c2;
1719       token_len = 2;
1720       switch (c2)
1721         {
1722         case '.':
1723           token->type = OP_OPEN_COLL_ELEM;
1724           break;
1725         case '=':
1726           token->type = OP_OPEN_EQUIV_CLASS;
1727           break;
1728         case ':':
1729           if (syntax & RE_CHAR_CLASSES)
1730             {
1731               token->type = OP_OPEN_CHAR_CLASS;
1732               break;
1733             }
1734           /* else fall through.  */
1735         default:
1736           token->type = CHARACTER;
1737           token->opr.c = c;
1738           token_len = 1;
1739           break;
1740         }
1741       return token_len;
1742     }
1743   switch (c)
1744     {
1745     case '-':
1746       token->type = OP_CHARSET_RANGE;
1747       break;
1748     case ']':
1749       token->type = OP_CLOSE_BRACKET;
1750       break;
1751     case '^':
1752       token->type = OP_NON_MATCH_LIST;
1753       break;
1754     default:
1755       token->type = CHARACTER;
1756     }
1757   return 1;
1758 }
1759 \f
1760 /* Functions for parser.  */
1761
1762 /* Entry point of the parser.
1763    Parse the regular expression REGEXP and return the structure tree.
1764    If an error is occured, ERR is set by error code, and return NULL.
1765    This function build the following tree, from regular expression <reg_exp>:
1766            CAT
1767            / \
1768           /   \
1769    <reg_exp>  EOR
1770
1771    CAT means concatenation.
1772    EOR means end of regular expression.  */
1773
1774 static bin_tree_t *
1775 parse (regexp, preg, syntax, err)
1776      re_string_t *regexp;
1777      regex_t *preg;
1778      reg_syntax_t syntax;
1779      reg_errcode_t *err;
1780 {
1781   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1782   bin_tree_t *tree, *eor, *root;
1783   re_token_t current_token;
1784   int new_idx;
1785   current_token = fetch_token (regexp, syntax);
1786   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
1787   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1788     return NULL;
1789   new_idx = re_dfa_add_node (dfa, current_token, 0);
1790   eor = create_tree (NULL, NULL, 0, new_idx);
1791   if (tree != NULL)
1792     root = create_tree (tree, eor, CONCAT, 0);
1793   else
1794     root = eor;
1795   if (BE (new_idx == -1 || eor == NULL || root == NULL, 0))
1796     return *err = REG_ESPACE, NULL;
1797   return root;
1798 }
1799
1800 /* This function build the following tree, from regular expression
1801    <branch1>|<branch2>:
1802            ALT
1803            / \
1804           /   \
1805    <branch1> <branch2>
1806
1807    ALT means alternative, which represents the operator `|'.  */
1808
1809 static bin_tree_t *
1810 parse_reg_exp (regexp, preg, token, syntax, nest, err)
1811      re_string_t *regexp;
1812      regex_t *preg;
1813      re_token_t *token;
1814      reg_syntax_t syntax;
1815      int nest;
1816      reg_errcode_t *err;
1817 {
1818   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1819   bin_tree_t *tree, *branch = NULL;
1820   int new_idx;
1821   tree = parse_branch (regexp, preg, token, syntax, nest, err);
1822   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1823     return NULL;
1824
1825   while (token->type == OP_ALT)
1826     {
1827       re_token_t alt_token = *token;
1828       new_idx = re_dfa_add_node (dfa, alt_token, 0);
1829       *token = fetch_token (regexp, syntax);
1830       if (token->type != OP_ALT && token->type != END_OF_RE
1831           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1832         {
1833           branch = parse_branch (regexp, preg, token, syntax, nest, err);
1834           if (BE (*err != REG_NOERROR && branch == NULL, 0))
1835             {
1836               free_bin_tree (tree);
1837               return NULL;
1838             }
1839         }
1840       else
1841         branch = NULL;
1842       tree = create_tree (tree, branch, 0, new_idx);
1843       if (BE (new_idx == -1 || tree == NULL, 0))
1844         return *err = REG_ESPACE, NULL;
1845       dfa->has_plural_match = 1;
1846     }
1847   return tree;
1848 }
1849
1850 /* This function build the following tree, from regular expression
1851    <exp1><exp2>:
1852         CAT
1853         / \
1854        /   \
1855    <exp1> <exp2>
1856
1857    CAT means concatenation.  */
1858
1859 static bin_tree_t *
1860 parse_branch (regexp, preg, token, syntax, nest, err)
1861      re_string_t *regexp;
1862      regex_t *preg;
1863      re_token_t *token;
1864      reg_syntax_t syntax;
1865      int nest;
1866      reg_errcode_t *err;
1867 {
1868   bin_tree_t *tree, *exp;
1869   tree = parse_expression (regexp, preg, token, syntax, nest, err);
1870   if (BE (*err != REG_NOERROR && tree == NULL, 0))
1871     return NULL;
1872
1873   while (token->type != OP_ALT && token->type != END_OF_RE
1874          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
1875     {
1876       exp = parse_expression (regexp, preg, token, syntax, nest, err);
1877       if (BE (*err != REG_NOERROR && exp == NULL, 0))
1878         {
1879           free_bin_tree (tree);
1880           return NULL;
1881         }
1882       if (tree != NULL && exp != NULL)
1883         {
1884           tree = create_tree (tree, exp, CONCAT, 0);
1885           if (tree == NULL)
1886             return *err = REG_ESPACE, NULL;
1887         }
1888       else if (tree == NULL)
1889         tree = exp;
1890       /* Otherwise exp == NULL, we don't need to create new tree.  */
1891     }
1892   return tree;
1893 }
1894
1895 /* This function build the following tree, from regular expression a*:
1896          *
1897          |
1898          a
1899 */
1900
1901 static bin_tree_t *
1902 parse_expression (regexp, preg, token, syntax, nest, err)
1903      re_string_t *regexp;
1904      regex_t *preg;
1905      re_token_t *token;
1906      reg_syntax_t syntax;
1907      int nest;
1908      reg_errcode_t *err;
1909 {
1910   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1911   bin_tree_t *tree;
1912   int new_idx;
1913   switch (token->type)
1914     {
1915     case CHARACTER:
1916       new_idx = re_dfa_add_node (dfa, *token, 0);
1917       tree = create_tree (NULL, NULL, 0, new_idx);
1918       if (BE (new_idx == -1 || tree == NULL, 0))
1919         return *err = REG_ESPACE, NULL;
1920 #ifdef RE_ENABLE_I18N
1921       if (MB_CUR_MAX > 1)
1922         {
1923           while (!re_string_eoi (regexp)
1924                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
1925             {
1926               bin_tree_t *mbc_remain;
1927               *token = fetch_token (regexp, syntax);
1928               new_idx = re_dfa_add_node (dfa, *token, 0);
1929               mbc_remain = create_tree (NULL, NULL, 0, new_idx);
1930               tree = create_tree (tree, mbc_remain, CONCAT, 0);
1931               if (BE (new_idx == -1 || mbc_remain == NULL || tree == NULL, 0))
1932                 return *err = REG_ESPACE, NULL;
1933             }
1934         }
1935 #endif
1936       break;
1937     case OP_OPEN_SUBEXP:
1938       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
1939       if (BE (*err != REG_NOERROR && tree == NULL, 0))
1940         return NULL;
1941       break;
1942     case OP_OPEN_BRACKET:
1943       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
1944       if (BE (*err != REG_NOERROR && tree == NULL, 0))
1945         return NULL;
1946       break;
1947     case OP_BACK_REF:
1948       if (BE (preg->re_nsub < token->opr.idx
1949               || dfa->subexps[token->opr.idx - 1].end == -1, 0))
1950         {
1951           *err = REG_ESUBREG;
1952           return NULL;
1953         }
1954       new_idx = re_dfa_add_node (dfa, *token, 0);
1955       tree = create_tree (NULL, NULL, 0, new_idx);
1956       if (BE (new_idx == -1 || tree == NULL, 0))
1957         return *err = REG_ESPACE, NULL;
1958       ++dfa->nbackref;
1959       dfa->has_mb_node = 1;
1960       break;
1961     case OP_DUP_ASTERISK:
1962     case OP_DUP_PLUS:
1963     case OP_DUP_QUESTION:
1964     case OP_OPEN_DUP_NUM:
1965       if (syntax & RE_CONTEXT_INVALID_OPS)
1966         return *err = REG_BADRPT, NULL;
1967       else if (syntax & RE_CONTEXT_INDEP_OPS)
1968         {
1969           *token = fetch_token (regexp, syntax);
1970           return parse_expression (regexp, preg, token, syntax, nest, err);
1971         }
1972       /* else fall through  */
1973     case OP_CLOSE_SUBEXP:
1974       if ((token->type == OP_CLOSE_SUBEXP) &&
1975           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
1976         return *err = REG_ERPAREN, NULL;
1977       /* else fall through  */
1978     case OP_CLOSE_DUP_NUM:
1979       /* We treat it as a normal character.  */
1980
1981       /* Then we can these characters as normal characters.  */
1982       token->type = CHARACTER;
1983       new_idx = re_dfa_add_node (dfa, *token, 0);
1984       tree = create_tree (NULL, NULL, 0, new_idx);
1985       if (BE (new_idx == -1 || tree == NULL, 0))
1986         return *err = REG_ESPACE, NULL;
1987       break;
1988     case ANCHOR:
1989       if (dfa->word_char == NULL)
1990         {
1991           *err = init_word_char (dfa);
1992           if (BE (*err != REG_NOERROR, 0))
1993             return NULL;
1994         }
1995       if (token->opr.ctx_type == WORD_DELIM)
1996         {
1997           bin_tree_t *tree_first, *tree_last;
1998           int idx_first, idx_last;
1999           token->opr.ctx_type = WORD_FIRST;
2000           idx_first = re_dfa_add_node (dfa, *token, 0);
2001           tree_first = create_tree (NULL, NULL, 0, idx_first);
2002           token->opr.ctx_type = WORD_LAST;
2003           idx_last = re_dfa_add_node (dfa, *token, 0);
2004           tree_last = create_tree (NULL, NULL, 0, idx_last);
2005           token->type = OP_ALT;
2006           new_idx = re_dfa_add_node (dfa, *token, 0);
2007           tree = create_tree (tree_first, tree_last, 0, new_idx);
2008           if (BE (idx_first == -1 || idx_last == -1 || new_idx == -1
2009                   || tree_first == NULL || tree_last == NULL
2010                   || tree == NULL, 0))
2011             return *err = REG_ESPACE, NULL;
2012         }
2013       else
2014         {
2015           new_idx = re_dfa_add_node (dfa, *token, 0);
2016           tree = create_tree (NULL, NULL, 0, new_idx);
2017           if (BE (new_idx == -1 || tree == NULL, 0))
2018             return *err = REG_ESPACE, NULL;
2019         }
2020       /* We must return here, since ANCHORs can't be followed
2021          by repetition operators.
2022          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2023              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2024       *token = fetch_token (regexp, syntax);
2025       return tree;
2026     case OP_PERIOD:
2027       new_idx = re_dfa_add_node (dfa, *token, 0);
2028       tree = create_tree (NULL, NULL, 0, new_idx);
2029       if (BE (new_idx == -1 || tree == NULL, 0))
2030         return *err = REG_ESPACE, NULL;
2031       if (MB_CUR_MAX > 1)
2032         dfa->has_mb_node = 1;
2033       break;
2034     case OP_WORD:
2035       tree = build_word_op (dfa, 0, err);
2036       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2037         return NULL;
2038       break;
2039     case OP_NOTWORD:
2040       tree = build_word_op (dfa, 1, err);
2041       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2042         return NULL;
2043       break;
2044     case OP_ALT:
2045     case END_OF_RE:
2046       return NULL;
2047     case BACK_SLASH:
2048       *err = REG_EESCAPE;
2049       return NULL;
2050     default:
2051       /* Must not happen?  */
2052 #ifdef DEBUG
2053       assert (0);
2054 #endif
2055       return NULL;
2056     }
2057   *token = fetch_token (regexp, syntax);
2058
2059   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2060          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2061     {
2062       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2063       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2064         return NULL;
2065       dfa->has_plural_match = 1;
2066     }
2067
2068   return tree;
2069 }
2070
2071 /* This function build the following tree, from regular expression
2072    (<reg_exp>):
2073          SUBEXP
2074             |
2075         <reg_exp>
2076 */
2077
2078 static bin_tree_t *
2079 parse_sub_exp (regexp, preg, token, syntax, nest, err)
2080      re_string_t *regexp;
2081      regex_t *preg;
2082      re_token_t *token;
2083      reg_syntax_t syntax;
2084      int nest;
2085      reg_errcode_t *err;
2086 {
2087   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2088   bin_tree_t *tree, *left_par, *right_par;
2089   size_t cur_nsub;
2090   int new_idx;
2091   cur_nsub = preg->re_nsub++;
2092   if (dfa->subexps_alloc < preg->re_nsub)
2093     {
2094       re_subexp_t *new_array;
2095       dfa->subexps_alloc *= 2;
2096       new_array = re_realloc (dfa->subexps, re_subexp_t, dfa->subexps_alloc);
2097       if (BE (new_array == NULL, 0))
2098         {
2099           dfa->subexps_alloc /= 2;
2100           *err = REG_ESPACE;
2101           return NULL;
2102         }
2103       dfa->subexps = new_array;
2104     }
2105   dfa->subexps[cur_nsub].start = dfa->nodes_len;
2106   dfa->subexps[cur_nsub].end = -1;
2107
2108   new_idx = re_dfa_add_node (dfa, *token, 0);
2109   left_par = create_tree (NULL, NULL, 0, new_idx);
2110   if (BE (new_idx == -1 || left_par == NULL, 0))
2111     return *err = REG_ESPACE, NULL;
2112   dfa->nodes[new_idx].opr.idx = cur_nsub;
2113   *token = fetch_token (regexp, syntax);
2114
2115   /* The subexpression may be a null string.  */
2116   if (token->type == OP_CLOSE_SUBEXP)
2117     tree = NULL;
2118   else
2119     {
2120       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2121       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2122         return NULL;
2123     }
2124   if (BE (token->type != OP_CLOSE_SUBEXP, 0))
2125     {
2126       free_bin_tree (tree);
2127       *err = REG_BADPAT;
2128       return NULL;
2129     }
2130   new_idx = re_dfa_add_node (dfa, *token, 0);
2131   dfa->subexps[cur_nsub].end = dfa->nodes_len;
2132   right_par = create_tree (NULL, NULL, 0, new_idx);
2133   tree = ((tree == NULL) ? right_par
2134           : create_tree (tree, right_par, CONCAT, 0));
2135   tree = create_tree (left_par, tree, CONCAT, 0);
2136   if (BE (new_idx == -1 || right_par == NULL || tree == NULL, 0))
2137     return *err = REG_ESPACE, NULL;
2138   dfa->nodes[new_idx].opr.idx = cur_nsub;
2139
2140   return tree;
2141 }
2142
2143 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2144
2145 static bin_tree_t *
2146 parse_dup_op (dup_elem, regexp, dfa, token, syntax, err)
2147      bin_tree_t *dup_elem;
2148      re_string_t *regexp;
2149      re_dfa_t *dfa;
2150      re_token_t *token;
2151      reg_syntax_t syntax;
2152      reg_errcode_t *err;
2153 {
2154   re_token_t dup_token;
2155   bin_tree_t *tree = dup_elem, *work_tree;
2156   int new_idx, start_idx = re_string_cur_idx (regexp);
2157   re_token_t start_token = *token;
2158   if (token->type == OP_OPEN_DUP_NUM)
2159     {
2160       int i;
2161       int end = 0;
2162       int start = fetch_number (regexp, token, syntax);
2163       bin_tree_t *elem;
2164       if (start == -1)
2165         {
2166           if (token->type == CHARACTER && token->opr.c == ',')
2167             start = 0; /* We treat "{,m}" as "{0,m}".  */
2168           else
2169             {
2170               *err = REG_BADBR; /* <re>{} is invalid.  */
2171               return NULL;
2172             }
2173         }
2174       if (BE (start != -2, 1))
2175         {
2176           /* We treat "{n}" as "{n,n}".  */
2177           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2178                  : ((token->type == CHARACTER && token->opr.c == ',')
2179                     ? fetch_number (regexp, token, syntax) : -2));
2180         }
2181       if (BE (start == -2 || end == -2, 0))
2182         {
2183           /* Invalid sequence.  */
2184           if (token->type == OP_CLOSE_DUP_NUM)
2185             goto parse_dup_op_invalid_interval;
2186           else
2187             goto parse_dup_op_ebrace;
2188         }
2189       if (BE (start == 0 && end == 0, 0))
2190         {
2191           /* We treat "<re>{0}" and "<re>{0,0}" as null string.  */
2192           *token = fetch_token (regexp, syntax);
2193           free_bin_tree (dup_elem);
2194           return NULL;
2195         }
2196
2197       /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2198       elem = tree;
2199       for (i = 0; i < start; ++i)
2200         if (i != 0)
2201           {
2202             work_tree = duplicate_tree (elem, dfa);
2203             tree = create_tree (tree, work_tree, CONCAT, 0);
2204             if (BE (work_tree == NULL || tree == NULL, 0))
2205               goto parse_dup_op_espace;
2206           }
2207
2208       if (end == -1)
2209         {
2210           /* We treat "<re>{0,}" as "<re>*".  */
2211           dup_token.type = OP_DUP_ASTERISK;
2212           if (start > 0)
2213             {
2214               elem = duplicate_tree (elem, dfa);
2215               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2216               work_tree = create_tree (elem, NULL, 0, new_idx);
2217               tree = create_tree (tree, work_tree, CONCAT, 0);
2218               if (BE (elem == NULL || new_idx == -1 || work_tree == NULL
2219                       || tree == NULL, 0))
2220                 goto parse_dup_op_espace;
2221             }
2222           else
2223             {
2224               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2225               tree = create_tree (elem, NULL, 0, new_idx);
2226               if (BE (new_idx == -1 || tree == NULL, 0))
2227                 goto parse_dup_op_espace;
2228             }
2229         }
2230       else if (end - start > 0)
2231         {
2232           /* Then extract "<re>{0,m}" to "<re>?<re>?...<re>?".  */
2233           dup_token.type = OP_DUP_QUESTION;
2234           if (start > 0)
2235             {
2236               elem = duplicate_tree (elem, dfa);
2237               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2238               elem = create_tree (elem, NULL, 0, new_idx);
2239               tree = create_tree (tree, elem, CONCAT, 0);
2240               if (BE (elem == NULL || new_idx == -1 || tree == NULL, 0))
2241                 goto parse_dup_op_espace;
2242             }
2243           else
2244             {
2245               new_idx = re_dfa_add_node (dfa, dup_token, 0);
2246               tree = elem = create_tree (elem, NULL, 0, new_idx);
2247               if (BE (new_idx == -1 || tree == NULL, 0))
2248                 goto parse_dup_op_espace;
2249             }
2250           for (i = 1; i < end - start; ++i)
2251             {
2252               work_tree = duplicate_tree (elem, dfa);
2253               tree = create_tree (tree, work_tree, CONCAT, 0);
2254               if (BE (work_tree == NULL || tree == NULL, 0))
2255                 return *err = REG_ESPACE, NULL;
2256             }
2257         }
2258     }
2259   else
2260     {
2261       new_idx = re_dfa_add_node (dfa, *token, 0);
2262       tree = create_tree (tree, NULL, 0, new_idx);
2263       if (BE (new_idx == -1 || tree == NULL, 0))
2264         return *err = REG_ESPACE, NULL;
2265     }
2266   *token = fetch_token (regexp, syntax);
2267   return tree;
2268
2269  parse_dup_op_espace:
2270   free_bin_tree (tree);
2271   *err = REG_ESPACE;
2272   return NULL;
2273
2274  parse_dup_op_ebrace:
2275   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2276     {
2277       *err = REG_EBRACE;
2278       return NULL;
2279     }
2280   goto parse_dup_op_rollback;
2281  parse_dup_op_invalid_interval:
2282   if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2283     {
2284       *err = REG_BADBR;
2285       return NULL;
2286     }
2287  parse_dup_op_rollback:
2288   re_string_set_index (regexp, start_idx);
2289   *token = start_token;
2290   token->type = CHARACTER;
2291   return dup_elem;
2292 }
2293
2294 /* Size of the names for collating symbol/equivalence_class/character_class.
2295    I'm not sure, but maybe enough.  */
2296 #define BRACKET_NAME_BUF_SIZE 32
2297
2298 #ifndef _LIBC
2299   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2300      Build the range expression which starts from START_ELEM, and ends
2301      at END_ELEM.  The result are written to MBCSET and SBCSET.
2302      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2303      mbcset->range_ends, is a pointer argument sinse we may
2304      update it.  */
2305
2306 static reg_errcode_t
2307 # ifdef RE_ENABLE_I18N
2308 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2309      re_charset_t *mbcset;
2310      int *range_alloc;
2311 # else /* not RE_ENABLE_I18N */
2312 build_range_exp (sbcset, start_elem, end_elem)
2313 # endif /* not RE_ENABLE_I18N */
2314      re_bitset_ptr_t sbcset;
2315      bracket_elem_t *start_elem, *end_elem;
2316 {
2317   unsigned int start_ch, end_ch;
2318   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2319   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2320           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2321           0))
2322     return REG_ERANGE;
2323
2324   /* We can handle no multi character collating elements without libc
2325      support.  */
2326   if (BE ((start_elem->type == COLL_SYM
2327            && strlen ((char *) start_elem->opr.name) > 1)
2328           || (end_elem->type == COLL_SYM
2329               && strlen ((char *) end_elem->opr.name) > 1), 0))
2330     return REG_ECOLLATE;
2331
2332 # ifdef RE_ENABLE_I18N
2333   {
2334     wchar_t wc, start_wc, end_wc;
2335     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2336
2337     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2338                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2339                    : 0));
2340     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2341               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2342                  : 0));
2343     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2344                 ? __btowc (start_ch) : start_elem->opr.wch);
2345     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2346               ? __btowc (end_ch) : end_elem->opr.wch);
2347     cmp_buf[0] = start_wc;
2348     cmp_buf[4] = end_wc;
2349     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2350       return REG_ERANGE;
2351
2352     /* Check the space of the arrays.  */
2353     if (*range_alloc == mbcset->nranges)
2354       {
2355         /* There are not enough space, need realloc.  */
2356         wchar_t *new_array_start, *new_array_end;
2357         int new_nranges;
2358
2359         /* +1 in case of mbcset->nranges is 0.  */
2360         new_nranges = 2 * mbcset->nranges + 1;
2361         /* Use realloc since mbcset->range_starts and mbcset->range_ends
2362            are NULL if *range_alloc == 0.  */
2363         new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2364                                       new_nranges);
2365         new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2366                                     new_nranges);
2367
2368         if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2369           return REG_ESPACE;
2370
2371         mbcset->range_starts = new_array_start;
2372         mbcset->range_ends = new_array_end;
2373         *range_alloc = new_nranges;
2374       }
2375
2376     mbcset->range_starts[mbcset->nranges] = start_wc;
2377     mbcset->range_ends[mbcset->nranges++] = end_wc;
2378
2379     /* Build the table for single byte characters.  */
2380     for (wc = 0; wc <= SBC_MAX; ++wc)
2381       {
2382         cmp_buf[2] = wc;
2383         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2384             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2385           bitset_set (sbcset, wc);
2386       }
2387   }
2388 # else /* not RE_ENABLE_I18N */
2389   {
2390     unsigned int ch;
2391     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2392                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2393                    : 0));
2394     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2395               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2396                  : 0));
2397     if (start_ch > end_ch)
2398       return REG_ERANGE;
2399     /* Build the table for single byte characters.  */
2400     for (ch = 0; ch <= SBC_MAX; ++ch)
2401       if (start_ch <= ch  && ch <= end_ch)
2402         bitset_set (sbcset, ch);
2403   }
2404 # endif /* not RE_ENABLE_I18N */
2405   return REG_NOERROR;
2406 }
2407 #endif /* not _LIBC */
2408
2409 #ifndef _LIBC
2410 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2411    Build the collating element which is represented by NAME.
2412    The result are written to MBCSET and SBCSET.
2413    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2414    pointer argument since we may update it.  */
2415
2416 static reg_errcode_t
2417 # ifdef RE_ENABLE_I18N
2418 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2419      re_charset_t *mbcset;
2420      int *coll_sym_alloc;
2421 # else /* not RE_ENABLE_I18N */
2422 build_collating_symbol (sbcset, name)
2423 # endif /* not RE_ENABLE_I18N */
2424      re_bitset_ptr_t sbcset;
2425      const unsigned char *name;
2426 {
2427   size_t name_len = strlen ((const char *) name);
2428   if (BE (name_len != 1, 0))
2429     return REG_ECOLLATE;
2430   else
2431     {
2432       bitset_set (sbcset, name[0]);
2433       return REG_NOERROR;
2434     }
2435 }
2436 #endif /* not _LIBC */
2437
2438 /* This function parse bracket expression like "[abc]", "[a-c]",
2439    "[[.a-a.]]" etc.  */
2440
2441 static bin_tree_t *
2442 parse_bracket_exp (regexp, dfa, token, syntax, err)
2443      re_string_t *regexp;
2444      re_dfa_t *dfa;
2445      re_token_t *token;
2446      reg_syntax_t syntax;
2447      reg_errcode_t *err;
2448 {
2449 #ifdef _LIBC
2450   const unsigned char *collseqmb;
2451   const char *collseqwc;
2452   uint32_t nrules;
2453   int32_t table_size;
2454   const int32_t *symb_table;
2455   const unsigned char *extra;
2456
2457   /* Local function for parse_bracket_exp used in _LIBC environement.
2458      Seek the collating symbol entry correspondings to NAME.
2459      Return the index of the symbol in the SYMB_TABLE.  */
2460
2461   static inline int32_t
2462   seek_collating_symbol_entry (name, name_len)
2463          const unsigned char *name;
2464          size_t name_len;
2465     {
2466       int32_t hash = elem_hash ((const char *) name, name_len);
2467       int32_t elem = hash % table_size;
2468       int32_t second = hash % (table_size - 2);
2469       while (symb_table[2 * elem] != 0)
2470         {
2471           /* First compare the hashing value.  */
2472           if (symb_table[2 * elem] == hash
2473               /* Compare the length of the name.  */
2474               && name_len == extra[symb_table[2 * elem + 1]]
2475               /* Compare the name.  */
2476               && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2477                          name_len) == 0)
2478             {
2479               /* Yep, this is the entry.  */
2480               break;
2481             }
2482
2483           /* Next entry.  */
2484           elem += second;
2485         }
2486       return elem;
2487     }
2488
2489   /* Local function for parse_bracket_exp used in _LIBC environement.
2490      Look up the collation sequence value of BR_ELEM.
2491      Return the value if succeeded, UINT_MAX otherwise.  */
2492
2493   static inline unsigned int
2494   lookup_collation_sequence_value (br_elem)
2495          bracket_elem_t *br_elem;
2496     {
2497       if (br_elem->type == SB_CHAR)
2498         {
2499           /*
2500           if (MB_CUR_MAX == 1)
2501           */
2502           if (nrules == 0)
2503             return collseqmb[br_elem->opr.ch];
2504           else
2505             {
2506               wint_t wc = __btowc (br_elem->opr.ch);
2507               return collseq_table_lookup (collseqwc, wc);
2508             }
2509         }
2510       else if (br_elem->type == MB_CHAR)
2511         {
2512           return collseq_table_lookup (collseqwc, br_elem->opr.wch);
2513         }
2514       else if (br_elem->type == COLL_SYM)
2515         {
2516           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2517           if (nrules != 0)
2518             {
2519               int32_t elem, idx;
2520               elem = seek_collating_symbol_entry (br_elem->opr.name,
2521                                                   sym_name_len);
2522               if (symb_table[2 * elem] != 0)
2523                 {
2524                   /* We found the entry.  */
2525                   idx = symb_table[2 * elem + 1];
2526                   /* Skip the name of collating element name.  */
2527                   idx += 1 + extra[idx];
2528                   /* Skip the byte sequence of the collating element.  */
2529                   idx += 1 + extra[idx];
2530                   /* Adjust for the alignment.  */
2531                   idx = (idx + 3) & ~3;
2532                   /* Skip the multibyte collation sequence value.  */
2533                   idx += sizeof (unsigned int);
2534                   /* Skip the wide char sequence of the collating element.  */
2535                   idx += sizeof (unsigned int) *
2536                     (1 + *(unsigned int *) (extra + idx));
2537                   /* Return the collation sequence value.  */
2538                   return *(unsigned int *) (extra + idx);
2539                 }
2540               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2541                 {
2542                   /* No valid character.  Match it as a single byte
2543                      character.  */
2544                   return collseqmb[br_elem->opr.name[0]];
2545                 }
2546             }
2547           else if (sym_name_len == 1)
2548             return collseqmb[br_elem->opr.name[0]];
2549         }
2550       return UINT_MAX;
2551     }
2552
2553   /* Local function for parse_bracket_exp used in _LIBC environement.
2554      Build the range expression which starts from START_ELEM, and ends
2555      at END_ELEM.  The result are written to MBCSET and SBCSET.
2556      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2557      mbcset->range_ends, is a pointer argument sinse we may
2558      update it.  */
2559
2560   static inline reg_errcode_t
2561 # ifdef RE_ENABLE_I18N
2562   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2563          re_charset_t *mbcset;
2564          int *range_alloc;
2565 # else /* not RE_ENABLE_I18N */
2566   build_range_exp (sbcset, start_elem, end_elem)
2567 # endif /* not RE_ENABLE_I18N */
2568          re_bitset_ptr_t sbcset;
2569          bracket_elem_t *start_elem, *end_elem;
2570     {
2571       unsigned int ch;
2572       uint32_t start_collseq;
2573       uint32_t end_collseq;
2574
2575 # ifdef RE_ENABLE_I18N
2576       /* Check the space of the arrays.  */
2577       if (*range_alloc == mbcset->nranges)
2578         {
2579           /* There are not enough space, need realloc.  */
2580           uint32_t *new_array_start;
2581           uint32_t *new_array_end;
2582           int new_nranges;
2583
2584           /* +1 in case of mbcset->nranges is 0.  */
2585           new_nranges = 2 * mbcset->nranges + 1;
2586           /* Use realloc since mbcset->range_starts and mbcset->range_ends
2587              are NULL if *range_alloc == 0.  */
2588           new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2589                                         new_nranges);
2590           new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2591                                       new_nranges);
2592
2593           if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2594             return REG_ESPACE;
2595
2596           mbcset->range_starts = new_array_start;
2597           mbcset->range_ends = new_array_end;
2598           *range_alloc = new_nranges;
2599         }
2600 # endif /* RE_ENABLE_I18N */
2601
2602       /* Equivalence Classes and Character Classes can't be a range
2603          start/end.  */
2604       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2605               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2606               0))
2607         return REG_ERANGE;
2608
2609       start_collseq = lookup_collation_sequence_value (start_elem);
2610       end_collseq = lookup_collation_sequence_value (end_elem);
2611       /* Check start/end collation sequence values.  */
2612       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2613         return REG_ECOLLATE;
2614       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2615         return REG_ERANGE;
2616
2617 # ifdef RE_ENABLE_I18N
2618       /* Got valid collation sequence values, add them as a new entry.  */
2619       mbcset->range_starts[mbcset->nranges] = start_collseq;
2620       mbcset->range_ends[mbcset->nranges++] = end_collseq;
2621 # endif /* RE_ENABLE_I18N */
2622
2623       /* Build the table for single byte characters.  */
2624       for (ch = 0; ch <= SBC_MAX; ch++)
2625         {
2626           uint32_t ch_collseq;
2627           /*
2628           if (MB_CUR_MAX == 1)
2629           */
2630           if (nrules == 0)
2631             ch_collseq = collseqmb[ch];
2632           else
2633             ch_collseq = collseq_table_lookup (collseqwc, __btowc (ch));
2634           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2635             bitset_set (sbcset, ch);
2636         }
2637       return REG_NOERROR;
2638     }
2639
2640   /* Local function for parse_bracket_exp used in _LIBC environement.
2641      Build the collating element which is represented by NAME.
2642      The result are written to MBCSET and SBCSET.
2643      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2644      pointer argument sinse we may update it.  */
2645
2646   static inline reg_errcode_t
2647 # ifdef RE_ENABLE_I18N
2648   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2649          re_charset_t *mbcset;
2650          int *coll_sym_alloc;
2651 # else /* not RE_ENABLE_I18N */
2652   build_collating_symbol (sbcset, name)
2653 # endif /* not RE_ENABLE_I18N */
2654          re_bitset_ptr_t sbcset;
2655          const unsigned char *name;
2656     {
2657       int32_t elem, idx;
2658       size_t name_len = strlen ((const char *) name);
2659       if (nrules != 0)
2660         {
2661           elem = seek_collating_symbol_entry (name, name_len);
2662           if (symb_table[2 * elem] != 0)
2663             {
2664               /* We found the entry.  */
2665               idx = symb_table[2 * elem + 1];
2666               /* Skip the name of collating element name.  */
2667               idx += 1 + extra[idx];
2668             }
2669           else if (symb_table[2 * elem] == 0 && name_len == 1)
2670             {
2671               /* No valid character, treat it as a normal
2672                  character.  */
2673               bitset_set (sbcset, name[0]);
2674               return REG_NOERROR;
2675             }
2676           else
2677             return REG_ECOLLATE;
2678
2679 # ifdef RE_ENABLE_I18N
2680           /* Got valid collation sequence, add it as a new entry.  */
2681           /* Check the space of the arrays.  */
2682           if (*coll_sym_alloc == mbcset->ncoll_syms)
2683             {
2684               /* Not enough, realloc it.  */
2685               /* +1 in case of mbcset->ncoll_syms is 0.  */
2686               *coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2687               /* Use realloc since mbcset->coll_syms is NULL
2688                  if *alloc == 0.  */
2689               mbcset->coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2690                                               *coll_sym_alloc);
2691               if (BE (mbcset->coll_syms == NULL, 0))
2692                 return REG_ESPACE;
2693             }
2694           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2695 # endif /* RE_ENABLE_I18N */
2696           return REG_NOERROR;
2697         }
2698       else
2699         {
2700           if (BE (name_len != 1, 0))
2701             return REG_ECOLLATE;
2702           else
2703             {
2704               bitset_set (sbcset, name[0]);
2705               return REG_NOERROR;
2706             }
2707         }
2708     }
2709 #endif
2710
2711   re_token_t br_token;
2712   re_bitset_ptr_t sbcset;
2713 #ifdef RE_ENABLE_I18N
2714   re_charset_t *mbcset;
2715   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2716   int equiv_class_alloc = 0, char_class_alloc = 0;
2717 #else /* not RE_ENABLE_I18N */
2718   int non_match = 0;
2719 #endif /* not RE_ENABLE_I18N */
2720   bin_tree_t *work_tree;
2721   int token_len, new_idx;
2722 #ifdef _LIBC
2723   collseqmb = (const unsigned char *)
2724     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2725   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2726   if (nrules)
2727     {
2728       /*
2729       if (MB_CUR_MAX > 1)
2730       */
2731         collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2732       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
2733       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
2734                                                   _NL_COLLATE_SYMB_TABLEMB);
2735       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
2736                                                    _NL_COLLATE_SYMB_EXTRAMB);
2737     }
2738 #endif
2739   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
2740 #ifdef RE_ENABLE_I18N
2741   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
2742 #endif /* RE_ENABLE_I18N */
2743 #ifdef RE_ENABLE_I18N
2744   if (BE (sbcset == NULL || mbcset == NULL, 0))
2745 #else
2746   if (BE (sbcset == NULL, 0))
2747 #endif /* RE_ENABLE_I18N */
2748     {
2749       *err = REG_ESPACE;
2750       return NULL;
2751     }
2752
2753   token_len = peek_token_bracket (token, regexp, syntax);
2754   if (BE (token->type == END_OF_RE, 0))
2755     {
2756       *err = REG_BADPAT;
2757       goto parse_bracket_exp_free_return;
2758     }
2759   if (token->type == OP_NON_MATCH_LIST)
2760     {
2761 #ifdef RE_ENABLE_I18N
2762       int i;
2763       mbcset->non_match = 1;
2764 #else /* not RE_ENABLE_I18N */
2765       non_match = 1;
2766 #endif /* not RE_ENABLE_I18N */
2767       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
2768         bitset_set (sbcset, '\0');
2769       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2770       token_len = peek_token_bracket (token, regexp, syntax);
2771       if (BE (token->type == END_OF_RE, 0))
2772         {
2773           *err = REG_BADPAT;
2774           goto parse_bracket_exp_free_return;
2775         }
2776 #ifdef RE_ENABLE_I18N
2777       if (MB_CUR_MAX > 1)
2778         for (i = 0; i < SBC_MAX; ++i)
2779           if (__btowc (i) == WEOF)
2780             bitset_set (sbcset, i);
2781 #endif /* RE_ENABLE_I18N */
2782     }
2783
2784   /* We treat the first ']' as a normal character.  */
2785   if (token->type == OP_CLOSE_BRACKET)
2786     token->type = CHARACTER;
2787
2788   while (1)
2789     {
2790       bracket_elem_t start_elem, end_elem;
2791       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
2792       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
2793       reg_errcode_t ret;
2794       int token_len2 = 0, is_range_exp = 0;
2795       re_token_t token2;
2796
2797       start_elem.opr.name = start_name_buf;
2798       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
2799                                    syntax);
2800       if (BE (ret != REG_NOERROR, 0))
2801         {
2802           *err = ret;
2803           goto parse_bracket_exp_free_return;
2804         }
2805
2806       token_len = peek_token_bracket (token, regexp, syntax);
2807       if (BE (token->type == END_OF_RE, 0))
2808         {
2809           *err = REG_BADPAT;
2810           goto parse_bracket_exp_free_return;
2811         }
2812       if (token->type == OP_CHARSET_RANGE)
2813         {
2814           re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
2815           token_len2 = peek_token_bracket (&token2, regexp, syntax);
2816           if (BE (token->type == END_OF_RE, 0))
2817             {
2818               *err = REG_BADPAT;
2819               goto parse_bracket_exp_free_return;
2820             }
2821           if (token2.type == OP_CLOSE_BRACKET)
2822             {
2823               /* We treat the last '-' as a normal character.  */
2824               re_string_skip_bytes (regexp, -token_len);
2825               token->type = CHARACTER;
2826             }
2827           else
2828             is_range_exp = 1;
2829         }
2830
2831       if (is_range_exp == 1)
2832         {
2833           end_elem.opr.name = end_name_buf;
2834           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
2835                                        dfa, syntax);
2836           if (BE (ret != REG_NOERROR, 0))
2837             {
2838               *err = ret;
2839               goto parse_bracket_exp_free_return;
2840             }
2841
2842           token_len = peek_token_bracket (token, regexp, syntax);
2843           if (BE (token->type == END_OF_RE, 0))
2844             {
2845               *err = REG_BADPAT;
2846               goto parse_bracket_exp_free_return;
2847             }
2848           *err = build_range_exp (sbcset,
2849 #ifdef RE_ENABLE_I18N
2850                                   mbcset, &range_alloc,
2851 #endif /* RE_ENABLE_I18N */
2852                                   &start_elem, &end_elem);
2853           if (BE (*err != REG_NOERROR, 0))
2854             goto parse_bracket_exp_free_return;
2855         }
2856       else
2857         {
2858           switch (start_elem.type)
2859             {
2860             case SB_CHAR:
2861               bitset_set (sbcset, start_elem.opr.ch);
2862               break;
2863 #ifdef RE_ENABLE_I18N
2864             case MB_CHAR:
2865               /* Check whether the array has enough space.  */
2866               if (mbchar_alloc == mbcset->nmbchars)
2867                 {
2868                   /* Not enough, realloc it.  */
2869                   /* +1 in case of mbcset->nmbchars is 0.  */
2870                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
2871                   /* Use realloc since array is NULL if *alloc == 0.  */
2872                   mbcset->mbchars = re_realloc (mbcset->mbchars, wchar_t,
2873                                                 mbchar_alloc);
2874                   if (BE (mbcset->mbchars == NULL, 0))
2875                     goto parse_bracket_exp_espace;
2876                 }
2877               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
2878               break;
2879 #endif /* RE_ENABLE_I18N */
2880             case EQUIV_CLASS:
2881               *err = build_equiv_class (sbcset,
2882 #ifdef RE_ENABLE_I18N
2883                                         mbcset, &equiv_class_alloc,
2884 #endif /* RE_ENABLE_I18N */
2885                                         start_elem.opr.name);
2886               if (BE (*err != REG_NOERROR, 0))
2887                 goto parse_bracket_exp_free_return;
2888               break;
2889             case COLL_SYM:
2890               *err = build_collating_symbol (sbcset,
2891 #ifdef RE_ENABLE_I18N
2892                                              mbcset, &coll_sym_alloc,
2893 #endif /* RE_ENABLE_I18N */
2894                                              start_elem.opr.name);
2895               if (BE (*err != REG_NOERROR, 0))
2896                 goto parse_bracket_exp_free_return;
2897               break;
2898             case CHAR_CLASS:
2899               ret = build_charclass (sbcset,
2900 #ifdef RE_ENABLE_I18N
2901                                      mbcset, &char_class_alloc,
2902 #endif /* RE_ENABLE_I18N */
2903                                      start_elem.opr.name, syntax);
2904               if (BE (ret != REG_NOERROR, 0))
2905                goto parse_bracket_exp_espace;
2906               break;
2907             default:
2908               assert (0);
2909               break;
2910             }
2911         }
2912       if (token->type == OP_CLOSE_BRACKET)
2913         break;
2914     }
2915
2916   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2917
2918   /* If it is non-matching list.  */
2919 #ifdef RE_ENABLE_I18N
2920   if (mbcset->non_match)
2921 #else /* not RE_ENABLE_I18N */
2922   if (non_match)
2923 #endif /* not RE_ENABLE_I18N */
2924     bitset_not (sbcset);
2925
2926   /* Build a tree for simple bracket.  */
2927   br_token.type = SIMPLE_BRACKET;
2928   br_token.opr.sbcset = sbcset;
2929   new_idx = re_dfa_add_node (dfa, br_token, 0);
2930   work_tree = create_tree (NULL, NULL, 0, new_idx);
2931   if (BE (new_idx == -1 || work_tree == NULL, 0))
2932     goto parse_bracket_exp_espace;
2933
2934 #ifdef RE_ENABLE_I18N
2935   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
2936       || mbcset->nranges || (MB_CUR_MAX > 1 && (mbcset->nchar_classes
2937                                                 || mbcset->non_match)))
2938     {
2939       re_token_t alt_token;
2940       bin_tree_t *mbc_tree;
2941       /* Build a tree for complex bracket.  */
2942       br_token.type = COMPLEX_BRACKET;
2943       br_token.opr.mbcset = mbcset;
2944       dfa->has_mb_node = 1;
2945       new_idx = re_dfa_add_node (dfa, br_token, 0);
2946       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
2947       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
2948         goto parse_bracket_exp_espace;
2949       /* Then join them by ALT node.  */
2950       dfa->has_plural_match = 1;
2951       alt_token.type = OP_ALT;
2952       new_idx = re_dfa_add_node (dfa, alt_token, 0);
2953       work_tree = create_tree (work_tree, mbc_tree, 0, new_idx);
2954       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
2955         return work_tree;
2956     }
2957   else
2958     {
2959       free_charset (mbcset);
2960       return work_tree;
2961     }
2962 #else /* not RE_ENABLE_I18N */
2963   return work_tree;
2964 #endif /* not RE_ENABLE_I18N */
2965
2966  parse_bracket_exp_espace:
2967   *err = REG_ESPACE;
2968  parse_bracket_exp_free_return:
2969   re_free (sbcset);
2970 #ifdef RE_ENABLE_I18N
2971   free_charset (mbcset);
2972 #endif /* RE_ENABLE_I18N */
2973   return NULL;
2974 }
2975
2976 /* Parse an element in the bracket expression.  */
2977
2978 static reg_errcode_t
2979 parse_bracket_element (elem, regexp, token, token_len, dfa, syntax)
2980      bracket_elem_t *elem;
2981      re_string_t *regexp;
2982      re_token_t *token;
2983      int token_len;
2984      re_dfa_t *dfa;
2985      reg_syntax_t syntax;
2986 {
2987 #ifdef RE_ENABLE_I18N
2988   int cur_char_size;
2989   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
2990   if (cur_char_size > 1)
2991     {
2992       elem->type = MB_CHAR;
2993       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
2994       re_string_skip_bytes (regexp, cur_char_size);
2995       return REG_NOERROR;
2996     }
2997 #endif /* RE_ENABLE_I18N */
2998   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
2999   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3000       || token->type == OP_OPEN_EQUIV_CLASS)
3001     return parse_bracket_symbol (elem, regexp, token);
3002   elem->type = SB_CHAR;
3003   elem->opr.ch = token->opr.c;
3004   return REG_NOERROR;
3005 }
3006
3007 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3008    such as [:<character_class>:], [.<collating_element>.], and
3009    [=<equivalent_class>=].  */
3010
3011 static reg_errcode_t
3012 parse_bracket_symbol (elem, regexp, token)
3013      bracket_elem_t *elem;
3014      re_string_t *regexp;
3015      re_token_t *token;
3016 {
3017   unsigned char ch, delim = token->opr.c;
3018   int i = 0;
3019   for (;; ++i)
3020     {
3021       if (re_string_eoi(regexp) || i >= BRACKET_NAME_BUF_SIZE)
3022         return REG_EBRACK;
3023       if (token->type == OP_OPEN_CHAR_CLASS)
3024         ch = re_string_fetch_byte_case (regexp);
3025       else
3026         ch = re_string_fetch_byte (regexp);
3027       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3028         break;
3029       elem->opr.name[i] = ch;
3030     }
3031   re_string_skip_bytes (regexp, 1);
3032   elem->opr.name[i] = '\0';
3033   switch (token->type)
3034     {
3035     case OP_OPEN_COLL_ELEM:
3036       elem->type = COLL_SYM;
3037       break;
3038     case OP_OPEN_EQUIV_CLASS:
3039       elem->type = EQUIV_CLASS;
3040       break;
3041     case OP_OPEN_CHAR_CLASS:
3042       elem->type = CHAR_CLASS;
3043       break;
3044     default:
3045       break;
3046     }
3047   return REG_NOERROR;
3048 }
3049
3050   /* Helper function for parse_bracket_exp.
3051      Build the equivalence class which is represented by NAME.
3052      The result are written to MBCSET and SBCSET.
3053      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3054      is a pointer argument sinse we may update it.  */
3055
3056 static reg_errcode_t
3057 #ifdef RE_ENABLE_I18N
3058 build_equiv_class (sbcset, mbcset, equiv_class_alloc, name)
3059      re_charset_t *mbcset;
3060      int *equiv_class_alloc;
3061 #else /* not RE_ENABLE_I18N */
3062 build_equiv_class (sbcset, name)
3063 #endif /* not RE_ENABLE_I18N */
3064      re_bitset_ptr_t sbcset;
3065      const unsigned char *name;
3066 {
3067 #if defined _LIBC && defined RE_ENABLE_I18N
3068   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3069   if (nrules != 0)
3070     {
3071       const int32_t *table, *indirect;
3072       const unsigned char *weights, *extra, *cp;
3073       unsigned char char_buf[2];
3074       int32_t idx1, idx2;
3075       unsigned int ch;
3076       size_t len;
3077       /* This #include defines a local function!  */
3078 # include <locale/weight.h>
3079       /* Calculate the index for equivalence class.  */
3080       cp = name;
3081       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3082       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3083                                                _NL_COLLATE_WEIGHTMB);
3084       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3085                                                    _NL_COLLATE_EXTRAMB);
3086       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3087                                                 _NL_COLLATE_INDIRECTMB);
3088       idx1 = findidx (&cp);
3089       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3090         /* This isn't a valid character.  */
3091         return REG_ECOLLATE;
3092
3093       /* Build single byte matcing table for this equivalence class.  */
3094       char_buf[1] = (unsigned char) '\0';
3095       len = weights[idx1];
3096       for (ch = 0; ch < SBC_MAX; ++ch)
3097         {
3098           char_buf[0] = ch;
3099           cp = char_buf;
3100           idx2 = findidx (&cp);
3101 /*
3102           idx2 = table[ch];
3103 */
3104           if (idx2 == 0)
3105             /* This isn't a valid character.  */
3106             continue;
3107           if (len == weights[idx2])
3108             {
3109               int cnt = 0;
3110               while (cnt <= len &&
3111                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3112                 ++cnt;
3113
3114               if (cnt > len)
3115                 bitset_set (sbcset, ch);
3116             }
3117         }
3118       /* Check whether the array has enough space.  */
3119       if (*equiv_class_alloc == mbcset->nequiv_classes)
3120         {
3121           /* Not enough, realloc it.  */
3122           /* +1 in case of mbcset->nequiv_classes is 0.  */
3123           *equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3124           /* Use realloc since the array is NULL if *alloc == 0.  */
3125           mbcset->equiv_classes = re_realloc (mbcset->equiv_classes, int32_t,
3126                                               *equiv_class_alloc);
3127           if (BE (mbcset->equiv_classes == NULL, 0))
3128             return REG_ESPACE;
3129         }
3130       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3131     }
3132   else
3133 #endif /* _LIBC && RE_ENABLE_I18N */
3134     {
3135       if (BE (strlen ((const char *) name) != 1, 0))
3136         return REG_ECOLLATE;
3137       bitset_set (sbcset, *name);
3138     }
3139   return REG_NOERROR;
3140 }
3141
3142   /* Helper function for parse_bracket_exp.
3143      Build the character class which is represented by NAME.
3144      The result are written to MBCSET and SBCSET.
3145      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3146      is a pointer argument sinse we may update it.  */
3147
3148 static reg_errcode_t
3149 #ifdef RE_ENABLE_I18N
3150 build_charclass (sbcset, mbcset, char_class_alloc, class_name, syntax)
3151      re_charset_t *mbcset;
3152      int *char_class_alloc;
3153 #else /* not RE_ENABLE_I18N */
3154 build_charclass (sbcset, class_name, syntax)
3155 #endif /* not RE_ENABLE_I18N */
3156      re_bitset_ptr_t sbcset;
3157      const unsigned char *class_name;
3158      reg_syntax_t syntax;
3159 {
3160   int i;
3161   const char *name = (const char *) class_name;
3162
3163   /* In case of REG_ICASE "upper" and "lower" match the both of
3164      upper and lower cases.  */
3165   if ((syntax & RE_ICASE)
3166       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3167     name = "alpha";
3168
3169 #ifdef RE_ENABLE_I18N
3170   /* Check the space of the arrays.  */
3171   if (*char_class_alloc == mbcset->nchar_classes)
3172     {
3173       /* Not enough, realloc it.  */
3174       /* +1 in case of mbcset->nchar_classes is 0.  */
3175       *char_class_alloc = 2 * mbcset->nchar_classes + 1;
3176       /* Use realloc since array is NULL if *alloc == 0.  */
3177       mbcset->char_classes = re_realloc (mbcset->char_classes, wctype_t,
3178                                          *char_class_alloc);
3179       if (BE (mbcset->char_classes == NULL, 0))
3180         return REG_ESPACE;
3181     }
3182   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3183 #endif /* RE_ENABLE_I18N */
3184
3185 #define BUILD_CHARCLASS_LOOP(ctype_func)\
3186     for (i = 0; i < SBC_MAX; ++i)       \
3187       {                                 \
3188         if (ctype_func (i))             \
3189           bitset_set (sbcset, i);       \
3190       }
3191
3192   if (strcmp (name, "alnum") == 0)
3193     BUILD_CHARCLASS_LOOP (isalnum)
3194   else if (strcmp (name, "cntrl") == 0)
3195     BUILD_CHARCLASS_LOOP (iscntrl)
3196   else if (strcmp (name, "lower") == 0)
3197     BUILD_CHARCLASS_LOOP (islower)
3198   else if (strcmp (name, "space") == 0)
3199     BUILD_CHARCLASS_LOOP (isspace)
3200   else if (strcmp (name, "alpha") == 0)
3201     BUILD_CHARCLASS_LOOP (isalpha)
3202   else if (strcmp (name, "digit") == 0)
3203     BUILD_CHARCLASS_LOOP (isdigit)
3204   else if (strcmp (name, "print") == 0)
3205     BUILD_CHARCLASS_LOOP (isprint)
3206   else if (strcmp (name, "upper") == 0)
3207     BUILD_CHARCLASS_LOOP (isupper)
3208   else if (strcmp (name, "blank") == 0)
3209     BUILD_CHARCLASS_LOOP (isblank)
3210   else if (strcmp (name, "graph") == 0)
3211     BUILD_CHARCLASS_LOOP (isgraph)
3212   else if (strcmp (name, "punct") == 0)
3213     BUILD_CHARCLASS_LOOP (ispunct)
3214   else if (strcmp (name, "xdigit") == 0)
3215     BUILD_CHARCLASS_LOOP (isxdigit)
3216   else
3217     return REG_ECTYPE;
3218
3219   return REG_NOERROR;
3220 }
3221
3222 static bin_tree_t *
3223 build_word_op (dfa, not, err)
3224      re_dfa_t *dfa;
3225      int not;
3226      reg_errcode_t *err;
3227 {
3228   re_bitset_ptr_t sbcset;
3229 #ifdef RE_ENABLE_I18N
3230   re_charset_t *mbcset;
3231   int alloc = 0;
3232 #else /* not RE_ENABLE_I18N */
3233   int non_match = 0;
3234 #endif /* not RE_ENABLE_I18N */
3235   reg_errcode_t ret;
3236   re_token_t br_token;
3237   bin_tree_t *tree;
3238   int new_idx;
3239
3240   sbcset = (re_bitset_ptr_t) calloc (sizeof (unsigned int), BITSET_UINTS);
3241 #ifdef RE_ENABLE_I18N
3242   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3243 #endif /* RE_ENABLE_I18N */
3244
3245 #ifdef RE_ENABLE_I18N
3246   if (BE (sbcset == NULL || mbcset == NULL, 0))
3247 #else /* not RE_ENABLE_I18N */
3248   if (BE (sbcset == NULL, 0))
3249 #endif /* not RE_ENABLE_I18N */
3250     {
3251       *err = REG_ESPACE;
3252       return NULL;
3253     }
3254
3255   if (not)
3256     {
3257 #ifdef RE_ENABLE_I18N
3258       int i;
3259       /*
3260       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3261         bitset_set(cset->sbcset, '\0');
3262       */
3263       mbcset->non_match = 1;
3264       if (MB_CUR_MAX > 1)
3265         for (i = 0; i < SBC_MAX; ++i)
3266           if (__btowc (i) == WEOF)
3267             bitset_set (sbcset, i);
3268 #else /* not RE_ENABLE_I18N */
3269       non_match = 1;
3270 #endif /* not RE_ENABLE_I18N */
3271     }
3272
3273   /* We don't care the syntax in this case.  */
3274   ret = build_charclass (sbcset,
3275 #ifdef RE_ENABLE_I18N
3276                          mbcset, &alloc,
3277 #endif /* RE_ENABLE_I18N */
3278                          (const unsigned char *) "alpha", 0);
3279
3280   if (BE (ret != REG_NOERROR, 0))
3281     {
3282       re_free (sbcset);
3283 #ifdef RE_ENABLE_I18N
3284       free_charset (mbcset);
3285 #endif /* RE_ENABLE_I18N */
3286       *err = REG_ESPACE;
3287       return NULL;
3288     }
3289   /* \w match '_' also.  */
3290   bitset_set (sbcset, '_');
3291
3292   /* If it is non-matching list.  */
3293 #ifdef RE_ENABLE_I18N
3294   if (mbcset->non_match)
3295 #else /* not RE_ENABLE_I18N */
3296   if (non_match)
3297 #endif /* not RE_ENABLE_I18N */
3298     bitset_not (sbcset);
3299
3300   /* Build a tree for simple bracket.  */
3301   br_token.type = SIMPLE_BRACKET;
3302   br_token.opr.sbcset = sbcset;
3303   new_idx = re_dfa_add_node (dfa, br_token, 0);
3304   tree = create_tree (NULL, NULL, 0, new_idx);
3305   if (BE (new_idx == -1 || tree == NULL, 0))
3306     goto build_word_op_espace;
3307
3308 #ifdef RE_ENABLE_I18N
3309   if (MB_CUR_MAX > 1)
3310     {
3311       re_token_t alt_token;
3312       bin_tree_t *mbc_tree;
3313       /* Build a tree for complex bracket.  */
3314       br_token.type = COMPLEX_BRACKET;
3315       br_token.opr.mbcset = mbcset;
3316       dfa->has_mb_node = 1;
3317       new_idx = re_dfa_add_node (dfa, br_token, 0);
3318       mbc_tree = create_tree (NULL, NULL, 0, new_idx);
3319       if (BE (new_idx == -1 || mbc_tree == NULL, 0))
3320         goto build_word_op_espace;
3321       /* Then join them by ALT node.  */
3322       alt_token.type = OP_ALT;
3323       new_idx = re_dfa_add_node (dfa, alt_token, 0);
3324       tree = create_tree (tree, mbc_tree, 0, new_idx);
3325       if (BE (new_idx != -1 && mbc_tree != NULL, 1))
3326         return tree;
3327     }
3328   else
3329     {
3330       free_charset (mbcset);
3331       return tree;
3332     }
3333 #else /* not RE_ENABLE_I18N */
3334   return tree;
3335 #endif /* not RE_ENABLE_I18N */
3336
3337  build_word_op_espace:
3338   re_free (sbcset);
3339 #ifdef RE_ENABLE_I18N
3340   free_charset (mbcset);
3341 #endif /* RE_ENABLE_I18N */
3342   *err = REG_ESPACE;
3343   return NULL;
3344 }
3345
3346 /* This is intended for the expressions like "a{1,3}".
3347    Fetch a number from `input', and return the number.
3348    Return -1, if the number field is empty like "{,1}".
3349    Return -2, If an error is occured.  */
3350
3351 static int
3352 fetch_number (input, token, syntax)
3353      re_string_t *input;
3354      re_token_t *token;
3355      reg_syntax_t syntax;
3356 {
3357   int num = -1;
3358   unsigned char c;
3359   while (1)
3360     {
3361       *token = fetch_token (input, syntax);
3362       c = token->opr.c;
3363       if (BE (token->type == END_OF_RE, 0))
3364         return -2;
3365       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3366         break;
3367       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3368              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3369       num = (num > RE_DUP_MAX) ? -2 : num;
3370     }
3371   return num;
3372 }
3373 \f
3374 #ifdef RE_ENABLE_I18N
3375 static void
3376 free_charset (re_charset_t *cset)
3377 {
3378   re_free (cset->mbchars);
3379 # ifdef _LIBC
3380   re_free (cset->coll_syms);
3381   re_free (cset->equiv_classes);
3382   re_free (cset->range_starts);
3383   re_free (cset->range_ends);
3384 # endif
3385   re_free (cset->char_classes);
3386   re_free (cset);
3387 }
3388 #endif /* RE_ENABLE_I18N */
3389 \f
3390 /* Functions for binary tree operation.  */
3391
3392 /* Create a node of tree.
3393    Note: This function automatically free left and right if malloc fails.  */
3394
3395 static bin_tree_t *
3396 create_tree (left, right, type, index)
3397      bin_tree_t *left;
3398      bin_tree_t *right;
3399      re_token_type_t type;
3400      int index;
3401 {
3402   bin_tree_t *tree;
3403   tree = re_malloc (bin_tree_t, 1);
3404   if (BE (tree == NULL, 0))
3405     {
3406       free_bin_tree (left);
3407       free_bin_tree (right);
3408       return NULL;
3409     }
3410   tree->parent = NULL;
3411   tree->left = left;
3412   tree->right = right;
3413   tree->type = type;
3414   tree->node_idx = index;
3415   tree->first = -1;
3416   tree->next = -1;
3417   re_node_set_init_empty (&tree->eclosure);
3418
3419   if (left != NULL)
3420     left->parent = tree;
3421   if (right != NULL)
3422     right->parent = tree;
3423   return tree;
3424 }
3425
3426 /* Free the sub tree pointed by TREE.  */
3427
3428 static void
3429 free_bin_tree (tree)
3430      bin_tree_t *tree;
3431 {
3432   if (tree == NULL)
3433     return;
3434   /*re_node_set_free (&tree->eclosure);*/
3435   free_bin_tree (tree->left);
3436   free_bin_tree (tree->right);
3437   re_free (tree);
3438 }
3439
3440 /* Duplicate the node SRC, and return new node.  */
3441
3442 static bin_tree_t *
3443 duplicate_tree (src, dfa)
3444      const bin_tree_t *src;
3445      re_dfa_t *dfa;
3446 {
3447   bin_tree_t *left = NULL, *right = NULL, *new_tree;
3448   int new_node_idx;
3449   /* Since node indies must be according to Post-order of the tree,
3450      we must duplicate the left at first.  */
3451   if (src->left != NULL)
3452     {
3453       left = duplicate_tree (src->left, dfa);
3454       if (left == NULL)
3455         return NULL;
3456     }
3457
3458   /* Secondaly, duplicate the right.  */
3459   if (src->right != NULL)
3460     {
3461       right = duplicate_tree (src->right, dfa);
3462       if (right == NULL)
3463         {
3464           free_bin_tree (left);
3465           return NULL;
3466         }
3467     }
3468
3469   /* At last, duplicate itself.  */
3470   if (src->type == NON_TYPE)
3471     {
3472       new_node_idx = re_dfa_add_node (dfa, dfa->nodes[src->node_idx], 0);
3473       dfa->nodes[new_node_idx].duplicated = 1;
3474       if (BE (new_node_idx == -1, 0))
3475         {
3476           free_bin_tree (left);
3477           free_bin_tree (right);
3478           return NULL;
3479         }
3480     }
3481   else
3482     new_node_idx = src->type;
3483
3484   new_tree = create_tree (left, right, src->type, new_node_idx);
3485   if (BE (new_tree == NULL, 0))
3486     {
3487       free_bin_tree (left);
3488       free_bin_tree (right);
3489     }
3490   return new_tree;
3491 }