posix/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2017 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 #include <stdint.h>
  21
  22 #ifdef _LIBC
  23 # include <locale/weight.h>
  24 #endif
  25
  26 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  27                                           size_t length, reg_syntax_t syntax);
  28 static void re_compile_fastmap_iter (regex_t *bufp,
  29                                      const re_dfastate_t *init_state,
  30                                      char *fastmap);
  31 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  32 #ifdef RE_ENABLE_I18N
  33 static void free_charset (re_charset_t *cset);
  34 #endif /* RE_ENABLE_I18N */
  35 static void free_workarea_compile (regex_t *preg);
  36 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  37 #ifdef RE_ENABLE_I18N
  38 static void optimize_utf8 (re_dfa_t *dfa);
  39 #endif
  40 static reg_errcode_t analyze (regex_t *preg);
  41 static reg_errcode_t preorder (bin_tree_t *root,
  42                                reg_errcode_t (fn (void *, bin_tree_t *)),
  43                                void *extra);
  44 static reg_errcode_t postorder (bin_tree_t *root,
  45                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  46                                 void *extra);
  47 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  48 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  49 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  50                                  bin_tree_t *node);
  51 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  52 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  53 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  54 static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
  55 static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
  56                                    unsigned int constraint);
  57 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  58 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  59                                          int node, int root);
  60 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  61 static int fetch_number (re_string_t *input, re_token_t *token,
  62                          reg_syntax_t syntax);
  63 static int peek_token (re_token_t *token, re_string_t *input,
  64                         reg_syntax_t syntax);
  65 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  66                           reg_syntax_t syntax, reg_errcode_t *err);
  67 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  68                                   re_token_t *token, reg_syntax_t syntax,
  69                                   int nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  71                                  re_token_t *token, reg_syntax_t syntax,
  72                                  int nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  74                                      re_token_t *token, reg_syntax_t syntax,
  75                                      int nest, reg_errcode_t *err);
  76 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  77                                   re_token_t *token, reg_syntax_t syntax,
  78                                   int nest, reg_errcode_t *err);
  79 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  80                                  re_dfa_t *dfa, re_token_t *token,
  81                                  reg_syntax_t syntax, reg_errcode_t *err);
  82 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  83                                       re_token_t *token, reg_syntax_t syntax,
  84                                       reg_errcode_t *err);
  85 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  86                                             re_string_t *regexp,
  87                                             re_token_t *token, int token_len,
  88                                             re_dfa_t *dfa,
  89                                             reg_syntax_t syntax,
  90                                             int accept_hyphen);
  91 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  92                                           re_string_t *regexp,
  93                                           re_token_t *token);
  94 #ifdef RE_ENABLE_I18N
  95 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  96                                         re_charset_t *mbcset,
  97                                         int *equiv_class_alloc,
  98                                         const unsigned char *name);
  99 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 100                                       bitset_t sbcset,
 101                                       re_charset_t *mbcset,
 102                                       int *char_class_alloc,
 103                                       const unsigned char *class_name,
 104                                       reg_syntax_t syntax);
 105 #else  /* not RE_ENABLE_I18N */
 106 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 107                                         const unsigned char *name);
 108 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 109                                       bitset_t sbcset,
 110                                       const unsigned char *class_name,
 111                                       reg_syntax_t syntax);
 112 #endif /* not RE_ENABLE_I18N */
 113 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 114                                        RE_TRANSLATE_TYPE trans,
 115                                        const unsigned char *class_name,
 116                                        const unsigned char *extra,
 117                                        int non_match, reg_errcode_t *err);
 118 static bin_tree_t *create_tree (re_dfa_t *dfa,
 119                                 bin_tree_t *left, bin_tree_t *right,
 120                                 re_token_type_t type);
 121 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 122                                       bin_tree_t *left, bin_tree_t *right,
 123                                       const re_token_t *token);
 124 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 125 static void free_token (re_token_t *node);
 126 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 127 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 128 \f
 129 /* This table gives an error message for each of the error codes listed
 130    in regex.h.  Obviously the order here has to be same as there.
 131    POSIX doesn't require that we do anything for REG_NOERROR,
 132    but why not be nice?  */
 133
 134 const char __re_error_msgid[] attribute_hidden =
 135   {
 136 #define REG_NOERROR_IDX 0
 137     gettext_noop ("Success")    /* REG_NOERROR */
 138     "\0"
 139 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 140     gettext_noop ("No match")   /* REG_NOMATCH */
 141     "\0"
 142 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 143     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 144     "\0"
 145 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 146     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 147     "\0"
 148 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 149     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 150     "\0"
 151 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 152     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 153     "\0"
 154 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 155     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 156     "\0"
 157 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 158     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 159     "\0"
 160 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 161     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 162     "\0"
 163 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 164     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 165     "\0"
 166 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 167     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 168     "\0"
 169 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 170     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 171     "\0"
 172 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 173     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 174     "\0"
 175 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 176     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 177     "\0"
 178 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 179     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 180     "\0"
 181 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 182     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 183     "\0"
 184 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 185     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 186   };
 187
 188 const size_t __re_error_msgid_idx[] attribute_hidden =
 189   {
 190     REG_NOERROR_IDX,
 191     REG_NOMATCH_IDX,
 192     REG_BADPAT_IDX,
 193     REG_ECOLLATE_IDX,
 194     REG_ECTYPE_IDX,
 195     REG_EESCAPE_IDX,
 196     REG_ESUBREG_IDX,
 197     REG_EBRACK_IDX,
 198     REG_EPAREN_IDX,
 199     REG_EBRACE_IDX,
 200     REG_BADBR_IDX,
 201     REG_ERANGE_IDX,
 202     REG_ESPACE_IDX,
 203     REG_BADRPT_IDX,
 204     REG_EEND_IDX,
 205     REG_ESIZE_IDX,
 206     REG_ERPAREN_IDX
 207   };
 208 \f
 209 /* Entry points for GNU code.  */
 210
 211 /* re_compile_pattern is the GNU regular expression compiler: it
 212    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 213    Returns 0 if the pattern was valid, otherwise an error string.
 214
 215    Assumes the 'allocated' (and perhaps 'buffer') and 'translate' fields
 216    are set in BUFP on entry.  */
 217
 218 const char *
 219 re_compile_pattern (const char *pattern, size_t length,
 220                     struct re_pattern_buffer *bufp)
 221 {
 222   reg_errcode_t ret;
 223
 224   /* And GNU code determines whether or not to get register information
 225      by passing null for the REGS argument to re_match, etc., not by
 226      setting no_sub, unless RE_NO_SUB is set.  */
 227   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 228
 229   /* Match anchors at newline.  */
 230   bufp->newline_anchor = 1;
 231
 232   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 233
 234   if (!ret)
 235     return NULL;
 236   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 237 }
 238 #ifdef _LIBC
 239 weak_alias (__re_compile_pattern, re_compile_pattern)
 240 #endif
 241
 242 /* Set by 're_set_syntax' to the current regexp syntax to recognize.  Can
 243    also be assigned to arbitrarily: each pattern buffer stores its own
 244    syntax, so it can be changed between regex compilations.  */
 245 /* This has no initializer because initialized variables in Emacs
 246    become read-only after dumping.  */
 247 reg_syntax_t re_syntax_options;
 248
 249
 250 /* Specify the precise syntax of regexps for compilation.  This provides
 251    for compatibility for various utilities which historically have
 252    different, incompatible syntaxes.
 253
 254    The argument SYNTAX is a bit mask comprised of the various bits
 255    defined in regex.h.  We return the old syntax.  */
 256
 257 reg_syntax_t
 258 re_set_syntax (reg_syntax_t syntax)
 259 {
 260   reg_syntax_t ret = re_syntax_options;
 261
 262   re_syntax_options = syntax;
 263   return ret;
 264 }
 265 #ifdef _LIBC
 266 weak_alias (__re_set_syntax, re_set_syntax)
 267 #endif
 268
 269 int
 270 re_compile_fastmap (struct re_pattern_buffer *bufp)
 271 {
 272   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 273   char *fastmap = bufp->fastmap;
 274
 275   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 276   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 277   if (dfa->init_state != dfa->init_state_word)
 278     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 279   if (dfa->init_state != dfa->init_state_nl)
 280     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 281   if (dfa->init_state != dfa->init_state_begbuf)
 282     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 283   bufp->fastmap_accurate = 1;
 284   return 0;
 285 }
 286 #ifdef _LIBC
 287 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 288 #endif
 289
 290 static inline void
 291 __attribute__ ((always_inline))
 292 re_set_fastmap (char *fastmap, bool icase, int ch)
 293 {
 294   fastmap[ch] = 1;
 295   if (icase)
 296     fastmap[tolower (ch)] = 1;
 297 }
 298
 299 /* Helper function for re_compile_fastmap.
 300    Compile fastmap for the initial_state INIT_STATE.  */
 301
 302 static void
 303 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 304                          char *fastmap)
 305 {
 306   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 307   int node_cnt;
 308   int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 309   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 310     {
 311       int node = init_state->nodes.elems[node_cnt];
 312       re_token_type_t type = dfa->nodes[node].type;
 313
 314       if (type == CHARACTER)
 315         {
 316           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 317 #ifdef RE_ENABLE_I18N
 318           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 319             {
 320               unsigned char *buf = alloca (dfa->mb_cur_max), *p;
 321               wchar_t wc;
 322               mbstate_t state;
 323
 324               p = buf;
 325               *p++ = dfa->nodes[node].opr.c;
 326               while (++node < dfa->nodes_len
 327                      && dfa->nodes[node].type == CHARACTER
 328                      && dfa->nodes[node].mb_partial)
 329                 *p++ = dfa->nodes[node].opr.c;
 330               memset (&state, '\0', sizeof (state));
 331               if (__mbrtowc (&wc, (const char *) buf, p - buf,
 332                              &state) == p - buf
 333                   && (__wcrtomb ((char *) buf, __towlower (wc), &state)
 334                       != (size_t) -1))
 335                 re_set_fastmap (fastmap, 0, buf[0]);
 336             }
 337 #endif
 338         }
 339       else if (type == SIMPLE_BRACKET)
 340         {
 341           int i, ch;
 342           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 343             {
 344               int j;
 345               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 346               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 347                 if (w & ((bitset_word_t) 1 << j))
 348                   re_set_fastmap (fastmap, icase, ch);
 349             }
 350         }
 351 #ifdef RE_ENABLE_I18N
 352       else if (type == COMPLEX_BRACKET)
 353         {
 354           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 355           int i;
 356
 357 # ifdef _LIBC
 358           /* See if we have to try all bytes which start multiple collation
 359              elements.
 360              e.g. In da_DK, we want to catch 'a' since "aa" is a valid
 361                   collation element, and don't catch 'b' since 'b' is
 362                   the only collation element which starts from 'b' (and
 363                   it is caught by SIMPLE_BRACKET).  */
 364               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0
 365                   && (cset->ncoll_syms || cset->nranges))
 366                 {
 367                   const int32_t *table = (const int32_t *)
 368                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 369                   for (i = 0; i < SBC_MAX; ++i)
 370                     if (table[i] < 0)
 371                       re_set_fastmap (fastmap, icase, i);
 372                 }
 373 # endif /* _LIBC */
 374
 375           /* See if we have to start the match at all multibyte characters,
 376              i.e. where we would not find an invalid sequence.  This only
 377              applies to multibyte character sets; for single byte character
 378              sets, the SIMPLE_BRACKET again suffices.  */
 379           if (dfa->mb_cur_max > 1
 380               && (cset->nchar_classes || cset->non_match || cset->nranges
 381 # ifdef _LIBC
 382                   || cset->nequiv_classes
 383 # endif /* _LIBC */
 384                  ))
 385             {
 386               unsigned char c = 0;
 387               do
 388                 {
 389                   mbstate_t mbs;
 390                   memset (&mbs, 0, sizeof (mbs));
 391                   if (__mbrtowc (NULL, (char *) &c, 1, &mbs) == (size_t) -2)
 392                     re_set_fastmap (fastmap, false, (int) c);
 393                 }
 394               while (++c != 0);
 395             }
 396
 397           else
 398             {
 399               /* ... Else catch all bytes which can start the mbchars.  */
 400               for (i = 0; i < cset->nmbchars; ++i)
 401                 {
 402                   char buf[256];
 403                   mbstate_t state;
 404                   memset (&state, '\0', sizeof (state));
 405                   if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 406                     re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 407                   if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 408                     {
 409                       if (__wcrtomb (buf, __towlower (cset->mbchars[i]), &state)
 410                           != (size_t) -1)
 411                         re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 412                     }
 413                 }
 414             }
 415         }
 416 #endif /* RE_ENABLE_I18N */
 417       else if (type == OP_PERIOD
 418 #ifdef RE_ENABLE_I18N
 419                || type == OP_UTF8_PERIOD
 420 #endif /* RE_ENABLE_I18N */
 421                || type == END_OF_RE)
 422         {
 423           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 424           if (type == END_OF_RE)
 425             bufp->can_be_null = 1;
 426           return;
 427         }
 428     }
 429 }
 430 \f
 431 /* Entry point for POSIX code.  */
 432 /* regcomp takes a regular expression as a string and compiles it.
 433
 434    PREG is a regex_t *.  We do not expect any fields to be initialized,
 435    since POSIX says we shouldn't.  Thus, we set
 436
 437      'buffer' to the compiled pattern;
 438      'used' to the length of the compiled pattern;
 439      'syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 440        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 441        RE_SYNTAX_POSIX_BASIC;
 442      'newline_anchor' to REG_NEWLINE being set in CFLAGS;
 443      'fastmap' to an allocated space for the fastmap;
 444      'fastmap_accurate' to zero;
 445      're_nsub' to the number of subexpressions in PATTERN.
 446
 447    PATTERN is the address of the pattern string.
 448
 449    CFLAGS is a series of bits which affect compilation.
 450
 451      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 452      use POSIX basic syntax.
 453
 454      If REG_NEWLINE is set, then . and [^...] don't match newline.
 455      Also, regexec will try a match beginning after every newline.
 456
 457      If REG_ICASE is set, then we considers upper- and lowercase
 458      versions of letters to be equivalent when matching.
 459
 460      If REG_NOSUB is set, then when PREG is passed to regexec, that
 461      routine will report only success or failure, and nothing about the
 462      registers.
 463
 464    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 465    the return codes and their meanings.)  */
 466
 467 int
 468 regcomp (regex_t *__restrict preg, const char *__restrict pattern, int cflags)
 469 {
 470   reg_errcode_t ret;
 471   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 472                          : RE_SYNTAX_POSIX_BASIC);
 473
 474   preg->buffer = NULL;
 475   preg->allocated = 0;
 476   preg->used = 0;
 477
 478   /* Try to allocate space for the fastmap.  */
 479   preg->fastmap = re_malloc (char, SBC_MAX);
 480   if (BE (preg->fastmap == NULL, 0))
 481     return REG_ESPACE;
 482
 483   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 484
 485   /* If REG_NEWLINE is set, newlines are treated differently.  */
 486   if (cflags & REG_NEWLINE)
 487     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 488       syntax &= ~RE_DOT_NEWLINE;
 489       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 490       /* It also changes the matching behavior.  */
 491       preg->newline_anchor = 1;
 492     }
 493   else
 494     preg->newline_anchor = 0;
 495   preg->no_sub = !!(cflags & REG_NOSUB);
 496   preg->translate = NULL;
 497
 498   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 499
 500   /* POSIX doesn't distinguish between an unmatched open-group and an
 501      unmatched close-group: both are REG_EPAREN.  */
 502   if (ret == REG_ERPAREN)
 503     ret = REG_EPAREN;
 504
 505   /* We have already checked preg->fastmap != NULL.  */
 506   if (BE (ret == REG_NOERROR, 1))
 507     /* Compute the fastmap now, since regexec cannot modify the pattern
 508        buffer.  This function never fails in this implementation.  */
 509     (void) re_compile_fastmap (preg);
 510   else
 511     {
 512       /* Some error occurred while compiling the expression.  */
 513       re_free (preg->fastmap);
 514       preg->fastmap = NULL;
 515     }
 516
 517   return (int) ret;
 518 }
 519 #ifdef _LIBC
 520 libc_hidden_def (__regcomp)
 521 weak_alias (__regcomp, regcomp)
 522 #endif
 523
 524 /* Returns a message corresponding to an error code, ERRCODE, returned
 525    from either regcomp or regexec.   We don't use PREG here.  */
 526
 527 size_t
 528 regerror (int errcode, const regex_t *__restrict preg, char *__restrict errbuf,
 529           size_t errbuf_size)
 530 {
 531   const char *msg;
 532   size_t msg_size;
 533
 534   if (BE (errcode < 0
 535           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 536                                / sizeof (__re_error_msgid_idx[0])), 0))
 537     /* Only error codes returned by the rest of the code should be passed
 538        to this routine.  If we are given anything else, or if other regex
 539        code generates an invalid error code, then the program has a bug.
 540        Dump core so we can fix it.  */
 541     abort ();
 542
 543   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 544
 545   msg_size = strlen (msg) + 1; /* Includes the null.  */
 546
 547   if (BE (errbuf_size != 0, 1))
 548     {
 549       if (BE (msg_size > errbuf_size, 0))
 550         {
 551 #if defined HAVE_MEMPCPY || defined _LIBC
 552           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 553 #else
 554           memcpy (errbuf, msg, errbuf_size - 1);
 555           errbuf[errbuf_size - 1] = 0;
 556 #endif
 557         }
 558       else
 559         memcpy (errbuf, msg, msg_size);
 560     }
 561
 562   return msg_size;
 563 }
 564 #ifdef _LIBC
 565 weak_alias (__regerror, regerror)
 566 #endif
 567
 568
 569 #ifdef RE_ENABLE_I18N
 570 /* This static array is used for the map to single-byte characters when
 571    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 572    it the same all the time.  UTF-8 is the preferred encoding so this is
 573    a worthwhile optimization.  */
 574 static const bitset_t utf8_sb_map =
 575 {
 576   /* Set the first 128 bits.  */
 577   [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
 578 };
 579 #endif
 580
 581
 582 static void
 583 free_dfa_content (re_dfa_t *dfa)
 584 {
 585   int i, j;
 586
 587   if (dfa->nodes)
 588     for (i = 0; i < dfa->nodes_len; ++i)
 589       free_token (dfa->nodes + i);
 590   re_free (dfa->nexts);
 591   for (i = 0; i < dfa->nodes_len; ++i)
 592     {
 593       if (dfa->eclosures != NULL)
 594         re_node_set_free (dfa->eclosures + i);
 595       if (dfa->inveclosures != NULL)
 596         re_node_set_free (dfa->inveclosures + i);
 597       if (dfa->edests != NULL)
 598         re_node_set_free (dfa->edests + i);
 599     }
 600   re_free (dfa->edests);
 601   re_free (dfa->eclosures);
 602   re_free (dfa->inveclosures);
 603   re_free (dfa->nodes);
 604
 605   if (dfa->state_table)
 606     for (i = 0; i <= dfa->state_hash_mask; ++i)
 607       {
 608         struct re_state_table_entry *entry = dfa->state_table + i;
 609         for (j = 0; j < entry->num; ++j)
 610           {
 611             re_dfastate_t *state = entry->array[j];
 612             free_state (state);
 613           }
 614         re_free (entry->array);
 615       }
 616   re_free (dfa->state_table);
 617 #ifdef RE_ENABLE_I18N
 618   if (dfa->sb_char != utf8_sb_map)
 619     re_free (dfa->sb_char);
 620 #endif
 621   re_free (dfa->subexp_map);
 622 #ifdef DEBUG
 623   re_free (dfa->re_str);
 624 #endif
 625
 626   re_free (dfa);
 627 }
 628
 629
 630 /* Free dynamically allocated space used by PREG.  */
 631
 632 void
 633 regfree (regex_t *preg)
 634 {
 635   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 636   if (BE (dfa != NULL, 1))
 637     free_dfa_content (dfa);
 638   preg->buffer = NULL;
 639   preg->allocated = 0;
 640
 641   re_free (preg->fastmap);
 642   preg->fastmap = NULL;
 643
 644   re_free (preg->translate);
 645   preg->translate = NULL;
 646 }
 647 #ifdef _LIBC
 648 libc_hidden_def (__regfree)
 649 weak_alias (__regfree, regfree)
 650 #endif
 651 \f
 652 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 653    them unless specifically requested.  */
 654
 655 #if defined _REGEX_RE_COMP || defined _LIBC
 656
 657 /* BSD has one and only one pattern buffer.  */
 658 static struct re_pattern_buffer re_comp_buf;
 659
 660 char *
 661 # ifdef _LIBC
 662 /* Make these definitions weak in libc, so POSIX programs can redefine
 663    these names if they don't use our functions, and still use
 664    regcomp/regexec above without link errors.  */
 665 weak_function
 666 # endif
 667 re_comp (const char *s)
 668 {
 669   reg_errcode_t ret;
 670   char *fastmap;
 671
 672   if (!s)
 673     {
 674       if (!re_comp_buf.buffer)
 675         return gettext ("No previous regular expression");
 676       return 0;
 677     }
 678
 679   if (re_comp_buf.buffer)
 680     {
 681       fastmap = re_comp_buf.fastmap;
 682       re_comp_buf.fastmap = NULL;
 683       __regfree (&re_comp_buf);
 684       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 685       re_comp_buf.fastmap = fastmap;
 686     }
 687
 688   if (re_comp_buf.fastmap == NULL)
 689     {
 690       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 691       if (re_comp_buf.fastmap == NULL)
 692         return (char *) gettext (__re_error_msgid
 693                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 694     }
 695
 696   /* Since 're_exec' always passes NULL for the 'regs' argument, we
 697      don't need to initialize the pattern buffer fields which affect it.  */
 698
 699   /* Match anchors at newlines.  */
 700   re_comp_buf.newline_anchor = 1;
 701
 702   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 703
 704   if (!ret)
 705     return NULL;
 706
 707   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 708   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 709 }
 710
 711 #ifdef _LIBC
 712 libc_freeres_fn (free_mem)
 713 {
 714   __regfree (&re_comp_buf);
 715 }
 716 #endif
 717
 718 #endif /* _REGEX_RE_COMP */
 719 \f
 720 /* Internal entry point.
 721    Compile the regular expression PATTERN, whose length is LENGTH.
 722    SYNTAX indicate regular expression's syntax.  */
 723
 724 static reg_errcode_t
 725 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 726                      reg_syntax_t syntax)
 727 {
 728   reg_errcode_t err = REG_NOERROR;
 729   re_dfa_t *dfa;
 730   re_string_t regexp;
 731
 732   /* Initialize the pattern buffer.  */
 733   preg->fastmap_accurate = 0;
 734   preg->syntax = syntax;
 735   preg->not_bol = preg->not_eol = 0;
 736   preg->used = 0;
 737   preg->re_nsub = 0;
 738   preg->can_be_null = 0;
 739   preg->regs_allocated = REGS_UNALLOCATED;
 740
 741   /* Initialize the dfa.  */
 742   dfa = (re_dfa_t *) preg->buffer;
 743   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 744     {
 745       /* If zero allocated, but buffer is non-null, try to realloc
 746          enough space.  This loses if buffer's address is bogus, but
 747          that is the user's responsibility.  If ->buffer is NULL this
 748          is a simple allocation.  */
 749       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 750       if (dfa == NULL)
 751         return REG_ESPACE;
 752       preg->allocated = sizeof (re_dfa_t);
 753       preg->buffer = (unsigned char *) dfa;
 754     }
 755   preg->used = sizeof (re_dfa_t);
 756
 757   err = init_dfa (dfa, length);
 758   if (BE (err != REG_NOERROR, 0))
 759     {
 760       free_dfa_content (dfa);
 761       preg->buffer = NULL;
 762       preg->allocated = 0;
 763       return err;
 764     }
 765 #ifdef DEBUG
 766   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 767   dfa->re_str = re_malloc (char, length + 1);
 768   strncpy (dfa->re_str, pattern, length + 1);
 769 #endif
 770
 771   __libc_lock_init (dfa->lock);
 772
 773   err = re_string_construct (&regexp, pattern, length, preg->translate,
 774                              syntax & RE_ICASE, dfa);
 775   if (BE (err != REG_NOERROR, 0))
 776     {
 777     re_compile_internal_free_return:
 778       free_workarea_compile (preg);
 779       re_string_destruct (&regexp);
 780       free_dfa_content (dfa);
 781       preg->buffer = NULL;
 782       preg->allocated = 0;
 783       return err;
 784     }
 785
 786   /* Parse the regular expression, and build a structure tree.  */
 787   preg->re_nsub = 0;
 788   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 789   if (BE (dfa->str_tree == NULL, 0))
 790     goto re_compile_internal_free_return;
 791
 792   /* Analyze the tree and create the nfa.  */
 793   err = analyze (preg);
 794   if (BE (err != REG_NOERROR, 0))
 795     goto re_compile_internal_free_return;
 796
 797 #ifdef RE_ENABLE_I18N
 798   /* If possible, do searching in single byte encoding to speed things up.  */
 799   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 800     optimize_utf8 (dfa);
 801 #endif
 802
 803   /* Then create the initial state of the dfa.  */
 804   err = create_initial_state (dfa);
 805
 806   /* Release work areas.  */
 807   free_workarea_compile (preg);
 808   re_string_destruct (&regexp);
 809
 810   if (BE (err != REG_NOERROR, 0))
 811     {
 812       free_dfa_content (dfa);
 813       preg->buffer = NULL;
 814       preg->allocated = 0;
 815     }
 816
 817   return err;
 818 }
 819
 820 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 821    as the initial length of some arrays.  */
 822
 823 static reg_errcode_t
 824 init_dfa (re_dfa_t *dfa, size_t pat_len)
 825 {
 826   unsigned int table_size;
 827 #ifndef _LIBC
 828   char *codeset_name;
 829 #endif
 830
 831   memset (dfa, '\0', sizeof (re_dfa_t));
 832
 833   /* Force allocation of str_tree_storage the first time.  */
 834   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 835
 836   /* Avoid overflows.  */
 837   if (pat_len == SIZE_MAX)
 838     return REG_ESPACE;
 839
 840   dfa->nodes_alloc = pat_len + 1;
 841   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 842
 843   /*  table_size = 2 ^ ceil(log pat_len) */
 844   for (table_size = 1; ; table_size <<= 1)
 845     if (table_size > pat_len)
 846       break;
 847
 848   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 849   dfa->state_hash_mask = table_size - 1;
 850
 851   dfa->mb_cur_max = MB_CUR_MAX;
 852 #ifdef _LIBC
 853   if (dfa->mb_cur_max == 6
 854       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 855     dfa->is_utf8 = 1;
 856   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 857                        != 0);
 858 #else
 859 # ifdef HAVE_LANGINFO_CODESET
 860   codeset_name = nl_langinfo (CODESET);
 861 # else
 862   codeset_name = getenv ("LC_ALL");
 863   if (codeset_name == NULL || codeset_name[0] == '\0')
 864     codeset_name = getenv ("LC_CTYPE");
 865   if (codeset_name == NULL || codeset_name[0] == '\0')
 866     codeset_name = getenv ("LANG");
 867   if (codeset_name == NULL)
 868     codeset_name = "";
 869   else if (strchr (codeset_name, '.') !=  NULL)
 870     codeset_name = strchr (codeset_name, '.') + 1;
 871 # endif
 872
 873   if (strcasecmp (codeset_name, "UTF-8") == 0
 874       || strcasecmp (codeset_name, "UTF8") == 0)
 875     dfa->is_utf8 = 1;
 876
 877   /* We check exhaustively in the loop below if this charset is a
 878      superset of ASCII.  */
 879   dfa->map_notascii = 0;
 880 #endif
 881
 882 #ifdef RE_ENABLE_I18N
 883   if (dfa->mb_cur_max > 1)
 884     {
 885       if (dfa->is_utf8)
 886         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 887       else
 888         {
 889           int i, j, ch;
 890
 891           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 892           if (BE (dfa->sb_char == NULL, 0))
 893             return REG_ESPACE;
 894
 895           /* Set the bits corresponding to single byte chars.  */
 896           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 897             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 898               {
 899                 wint_t wch = __btowc (ch);
 900                 if (wch != WEOF)
 901                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 902 # ifndef _LIBC
 903                 if (isascii (ch) && wch != ch)
 904                   dfa->map_notascii = 1;
 905 # endif
 906               }
 907         }
 908     }
 909 #endif
 910
 911   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 912     return REG_ESPACE;
 913   return REG_NOERROR;
 914 }
 915
 916 /* Initialize WORD_CHAR table, which indicate which character is
 917    "word".  In this case "word" means that it is the word construction
 918    character used by some operators like "\<", "\>", etc.  */
 919
 920 static void
 921 init_word_char (re_dfa_t *dfa)
 922 {
 923   dfa->word_ops_used = 1;
 924   int i = 0;
 925   int ch = 0;
 926   if (BE (dfa->map_notascii == 0, 1))
 927     {
 928       if (sizeof (dfa->word_char[0]) == 8)
 929         {
 930           /* The extra temporaries here avoid "implicitly truncated"
 931              warnings in the case when this is dead code, i.e. 32-bit.  */
 932           const uint64_t wc0 = UINT64_C (0x03ff000000000000);
 933           const uint64_t wc1 = UINT64_C (0x07fffffe87fffffe);
 934           dfa->word_char[0] = wc0;
 935           dfa->word_char[1] = wc1;
 936           i = 2;
 937         }
 938       else if (sizeof (dfa->word_char[0]) == 4)
 939         {
 940           dfa->word_char[0] = UINT32_C (0x00000000);
 941           dfa->word_char[1] = UINT32_C (0x03ff0000);
 942           dfa->word_char[2] = UINT32_C (0x87fffffe);
 943           dfa->word_char[3] = UINT32_C (0x07fffffe);
 944           i = 4;
 945         }
 946       else
 947         abort ();
 948       ch = 128;
 949
 950       if (BE (dfa->is_utf8, 1))
 951         {
 952           memset (&dfa->word_char[i], '\0', (SBC_MAX - ch) / 8);
 953           return;
 954         }
 955     }
 956
 957   for (; i < BITSET_WORDS; ++i)
 958     for (int j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 959       if (isalnum (ch) || ch == '_')
 960         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 961 }
 962
 963 /* Free the work area which are only used while compiling.  */
 964
 965 static void
 966 free_workarea_compile (regex_t *preg)
 967 {
 968   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 969   bin_tree_storage_t *storage, *next;
 970   for (storage = dfa->str_tree_storage; storage; storage = next)
 971     {
 972       next = storage->next;
 973       re_free (storage);
 974     }
 975   dfa->str_tree_storage = NULL;
 976   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 977   dfa->str_tree = NULL;
 978   re_free (dfa->org_indices);
 979   dfa->org_indices = NULL;
 980 }
 981
 982 /* Create initial states for all contexts.  */
 983
 984 static reg_errcode_t
 985 create_initial_state (re_dfa_t *dfa)
 986 {
 987   int first, i;
 988   reg_errcode_t err;
 989   re_node_set init_nodes;
 990
 991   /* Initial states have the epsilon closure of the node which is
 992      the first node of the regular expression.  */
 993   first = dfa->str_tree->first->node_idx;
 994   dfa->init_node = first;
 995   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 996   if (BE (err != REG_NOERROR, 0))
 997     return err;
 998
 999   /* The back-references which are in initial states can epsilon transit,
1000      since in this case all of the subexpressions can be null.
1001      Then we add epsilon closures of the nodes which are the next nodes of
1002      the back-references.  */
1003   if (dfa->nbackref > 0)
1004     for (i = 0; i < init_nodes.nelem; ++i)
1005       {
1006         int node_idx = init_nodes.elems[i];
1007         re_token_type_t type = dfa->nodes[node_idx].type;
1008
1009         int clexp_idx;
1010         if (type != OP_BACK_REF)
1011           continue;
1012         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1013           {
1014             re_token_t *clexp_node;
1015             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1016             if (clexp_node->type == OP_CLOSE_SUBEXP
1017                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1018               break;
1019           }
1020         if (clexp_idx == init_nodes.nelem)
1021           continue;
1022
1023         if (type == OP_BACK_REF)
1024           {
1025             int dest_idx = dfa->edests[node_idx].elems[0];
1026             if (!re_node_set_contains (&init_nodes, dest_idx))
1027               {
1028                 reg_errcode_t err = re_node_set_merge (&init_nodes,
1029                                                        dfa->eclosures
1030                                                        + dest_idx);
1031                 if (err != REG_NOERROR)
1032                   return err;
1033                 i = 0;
1034               }
1035           }
1036       }
1037
1038   /* It must be the first time to invoke acquire_state.  */
1039   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1040   /* We don't check ERR here, since the initial state must not be NULL.  */
1041   if (BE (dfa->init_state == NULL, 0))
1042     return err;
1043   if (dfa->init_state->has_constraint)
1044     {
1045       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1046                                                        CONTEXT_WORD);
1047       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1048                                                      CONTEXT_NEWLINE);
1049       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1050                                                          &init_nodes,
1051                                                          CONTEXT_NEWLINE
1052                                                          | CONTEXT_BEGBUF);
1053       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1054               || dfa->init_state_begbuf == NULL, 0))
1055         return err;
1056     }
1057   else
1058     dfa->init_state_word = dfa->init_state_nl
1059       = dfa->init_state_begbuf = dfa->init_state;
1060
1061   re_node_set_free (&init_nodes);
1062   return REG_NOERROR;
1063 }
1064 \f
1065 #ifdef RE_ENABLE_I18N
1066 /* If it is possible to do searching in single byte encoding instead of UTF-8
1067    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1068    DFA nodes where needed.  */
1069
1070 static void
1071 optimize_utf8 (re_dfa_t *dfa)
1072 {
1073   int node, i, mb_chars = 0, has_period = 0;
1074
1075   for (node = 0; node < dfa->nodes_len; ++node)
1076     switch (dfa->nodes[node].type)
1077       {
1078       case CHARACTER:
1079         if (dfa->nodes[node].opr.c >= 0x80)
1080           mb_chars = 1;
1081         break;
1082       case ANCHOR:
1083         switch (dfa->nodes[node].opr.ctx_type)
1084           {
1085           case LINE_FIRST:
1086           case LINE_LAST:
1087           case BUF_FIRST:
1088           case BUF_LAST:
1089             break;
1090           default:
1091             /* Word anchors etc. cannot be handled.  It's okay to test
1092                opr.ctx_type since constraints (for all DFA nodes) are
1093                created by ORing one or more opr.ctx_type values.  */
1094             return;
1095           }
1096         break;
1097       case OP_PERIOD:
1098         has_period = 1;
1099         break;
1100       case OP_BACK_REF:
1101       case OP_ALT:
1102       case END_OF_RE:
1103       case OP_DUP_ASTERISK:
1104       case OP_OPEN_SUBEXP:
1105       case OP_CLOSE_SUBEXP:
1106         break;
1107       case COMPLEX_BRACKET:
1108         return;
1109       case SIMPLE_BRACKET:
1110         /* Just double check.  The non-ASCII range starts at 0x80.  */
1111         assert (0x80 % BITSET_WORD_BITS == 0);
1112         for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1113           if (dfa->nodes[node].opr.sbcset[i])
1114             return;
1115         break;
1116       default:
1117         abort ();
1118       }
1119
1120   if (mb_chars || has_period)
1121     for (node = 0; node < dfa->nodes_len; ++node)
1122       {
1123         if (dfa->nodes[node].type == CHARACTER
1124             && dfa->nodes[node].opr.c >= 0x80)
1125           dfa->nodes[node].mb_partial = 0;
1126         else if (dfa->nodes[node].type == OP_PERIOD)
1127           dfa->nodes[node].type = OP_UTF8_PERIOD;
1128       }
1129
1130   /* The search can be in single byte locale.  */
1131   dfa->mb_cur_max = 1;
1132   dfa->is_utf8 = 0;
1133   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1134 }
1135 #endif
1136 \f
1137 /* Analyze the structure tree, and calculate "first", "next", "edest",
1138    "eclosure", and "inveclosure".  */
1139
1140 static reg_errcode_t
1141 analyze (regex_t *preg)
1142 {
1143   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1144   reg_errcode_t ret;
1145
1146   /* Allocate arrays.  */
1147   dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1148   dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1149   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1150   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1151   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1152           || dfa->eclosures == NULL, 0))
1153     return REG_ESPACE;
1154
1155   dfa->subexp_map = re_malloc (int, preg->re_nsub);
1156   if (dfa->subexp_map != NULL)
1157     {
1158       int i;
1159       for (i = 0; i < preg->re_nsub; i++)
1160         dfa->subexp_map[i] = i;
1161       preorder (dfa->str_tree, optimize_subexps, dfa);
1162       for (i = 0; i < preg->re_nsub; i++)
1163         if (dfa->subexp_map[i] != i)
1164           break;
1165       if (i == preg->re_nsub)
1166         {
1167           free (dfa->subexp_map);
1168           dfa->subexp_map = NULL;
1169         }
1170     }
1171
1172   ret = postorder (dfa->str_tree, lower_subexps, preg);
1173   if (BE (ret != REG_NOERROR, 0))
1174     return ret;
1175   ret = postorder (dfa->str_tree, calc_first, dfa);
1176   if (BE (ret != REG_NOERROR, 0))
1177     return ret;
1178   preorder (dfa->str_tree, calc_next, dfa);
1179   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1180   if (BE (ret != REG_NOERROR, 0))
1181     return ret;
1182   ret = calc_eclosure (dfa);
1183   if (BE (ret != REG_NOERROR, 0))
1184     return ret;
1185
1186   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1187      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1188   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1189       || dfa->nbackref)
1190     {
1191       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1192       if (BE (dfa->inveclosures == NULL, 0))
1193         return REG_ESPACE;
1194       ret = calc_inveclosure (dfa);
1195     }
1196
1197   return ret;
1198 }
1199
1200 /* Our parse trees are very unbalanced, so we cannot use a stack to
1201    implement parse tree visits.  Instead, we use parent pointers and
1202    some hairy code in these two functions.  */
1203 static reg_errcode_t
1204 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1205            void *extra)
1206 {
1207   bin_tree_t *node, *prev;
1208
1209   for (node = root; ; )
1210     {
1211       /* Descend down the tree, preferably to the left (or to the right
1212          if that's the only child).  */
1213       while (node->left || node->right)
1214         if (node->left)
1215           node = node->left;
1216         else
1217           node = node->right;
1218
1219       do
1220         {
1221           reg_errcode_t err = fn (extra, node);
1222           if (BE (err != REG_NOERROR, 0))
1223             return err;
1224           if (node->parent == NULL)
1225             return REG_NOERROR;
1226           prev = node;
1227           node = node->parent;
1228         }
1229       /* Go up while we have a node that is reached from the right.  */
1230       while (node->right == prev || node->right == NULL);
1231       node = node->right;
1232     }
1233 }
1234
1235 static reg_errcode_t
1236 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1237           void *extra)
1238 {
1239   bin_tree_t *node;
1240
1241   for (node = root; ; )
1242     {
1243       reg_errcode_t err = fn (extra, node);
1244       if (BE (err != REG_NOERROR, 0))
1245         return err;
1246
1247       /* Go to the left node, or up and to the right.  */
1248       if (node->left)
1249         node = node->left;
1250       else
1251         {
1252           bin_tree_t *prev = NULL;
1253           while (node->right == prev || node->right == NULL)
1254             {
1255               prev = node;
1256               node = node->parent;
1257               if (!node)
1258                 return REG_NOERROR;
1259             }
1260           node = node->right;
1261         }
1262     }
1263 }
1264
1265 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1266    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1267    backreferences as well.  Requires a preorder visit.  */
1268 static reg_errcode_t
1269 optimize_subexps (void *extra, bin_tree_t *node)
1270 {
1271   re_dfa_t *dfa = (re_dfa_t *) extra;
1272
1273   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1274     {
1275       int idx = node->token.opr.idx;
1276       node->token.opr.idx = dfa->subexp_map[idx];
1277       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1278     }
1279
1280   else if (node->token.type == SUBEXP
1281            && node->left && node->left->token.type == SUBEXP)
1282     {
1283       int other_idx = node->left->token.opr.idx;
1284
1285       node->left = node->left->left;
1286       if (node->left)
1287         node->left->parent = node;
1288
1289       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1290       if (other_idx < BITSET_WORD_BITS)
1291           dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1292     }
1293
1294   return REG_NOERROR;
1295 }
1296
1297 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1298    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1299 static reg_errcode_t
1300 lower_subexps (void *extra, bin_tree_t *node)
1301 {
1302   regex_t *preg = (regex_t *) extra;
1303   reg_errcode_t err = REG_NOERROR;
1304
1305   if (node->left && node->left->token.type == SUBEXP)
1306     {
1307       node->left = lower_subexp (&err, preg, node->left);
1308       if (node->left)
1309         node->left->parent = node;
1310     }
1311   if (node->right && node->right->token.type == SUBEXP)
1312     {
1313       node->right = lower_subexp (&err, preg, node->right);
1314       if (node->right)
1315         node->right->parent = node;
1316     }
1317
1318   return err;
1319 }
1320
1321 static bin_tree_t *
1322 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1323 {
1324   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1325   bin_tree_t *body = node->left;
1326   bin_tree_t *op, *cls, *tree1, *tree;
1327
1328   if (preg->no_sub
1329       /* We do not optimize empty subexpressions, because otherwise we may
1330          have bad CONCAT nodes with NULL children.  This is obviously not
1331          very common, so we do not lose much.  An example that triggers
1332          this case is the sed "script" /\(\)/x.  */
1333       && node->left != NULL
1334       && (node->token.opr.idx >= BITSET_WORD_BITS
1335           || !(dfa->used_bkref_map
1336                & ((bitset_word_t) 1 << node->token.opr.idx))))
1337     return node->left;
1338
1339   /* Convert the SUBEXP node to the concatenation of an
1340      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1341   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1342   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1343   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1344   tree = create_tree (dfa, op, tree1, CONCAT);
1345   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1346     {
1347       *err = REG_ESPACE;
1348       return NULL;
1349     }
1350
1351   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1352   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1353   return tree;
1354 }
1355
1356 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1357    nodes.  Requires a postorder visit.  */
1358 static reg_errcode_t
1359 calc_first (void *extra, bin_tree_t *node)
1360 {
1361   re_dfa_t *dfa = (re_dfa_t *) extra;
1362   if (node->token.type == CONCAT)
1363     {
1364       node->first = node->left->first;
1365       node->node_idx = node->left->node_idx;
1366     }
1367   else
1368     {
1369       node->first = node;
1370       node->node_idx = re_dfa_add_node (dfa, node->token);
1371       if (BE (node->node_idx == -1, 0))
1372         return REG_ESPACE;
1373       if (node->token.type == ANCHOR)
1374         dfa->nodes[node->node_idx].constraint = node->token.opr.ctx_type;
1375     }
1376   return REG_NOERROR;
1377 }
1378
1379 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1380 static reg_errcode_t
1381 calc_next (void *extra, bin_tree_t *node)
1382 {
1383   switch (node->token.type)
1384     {
1385     case OP_DUP_ASTERISK:
1386       node->left->next = node;
1387       break;
1388     case CONCAT:
1389       node->left->next = node->right->first;
1390       node->right->next = node->next;
1391       break;
1392     default:
1393       if (node->left)
1394         node->left->next = node->next;
1395       if (node->right)
1396         node->right->next = node->next;
1397       break;
1398     }
1399   return REG_NOERROR;
1400 }
1401
1402 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1403 static reg_errcode_t
1404 link_nfa_nodes (void *extra, bin_tree_t *node)
1405 {
1406   re_dfa_t *dfa = (re_dfa_t *) extra;
1407   int idx = node->node_idx;
1408   reg_errcode_t err = REG_NOERROR;
1409
1410   switch (node->token.type)
1411     {
1412     case CONCAT:
1413       break;
1414
1415     case END_OF_RE:
1416       assert (node->next == NULL);
1417       break;
1418
1419     case OP_DUP_ASTERISK:
1420     case OP_ALT:
1421       {
1422         int left, right;
1423         dfa->has_plural_match = 1;
1424         if (node->left != NULL)
1425           left = node->left->first->node_idx;
1426         else
1427           left = node->next->node_idx;
1428         if (node->right != NULL)
1429           right = node->right->first->node_idx;
1430         else
1431           right = node->next->node_idx;
1432         assert (left > -1);
1433         assert (right > -1);
1434         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1435       }
1436       break;
1437
1438     case ANCHOR:
1439     case OP_OPEN_SUBEXP:
1440     case OP_CLOSE_SUBEXP:
1441       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1442       break;
1443
1444     case OP_BACK_REF:
1445       dfa->nexts[idx] = node->next->node_idx;
1446       if (node->token.type == OP_BACK_REF)
1447         err = re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1448       break;
1449
1450     default:
1451       assert (!IS_EPSILON_NODE (node->token.type));
1452       dfa->nexts[idx] = node->next->node_idx;
1453       break;
1454     }
1455
1456   return err;
1457 }
1458
1459 /* Duplicate the epsilon closure of the node ROOT_NODE.
1460    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1461    to their own constraint.  */
1462
1463 static reg_errcode_t
1464 duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1465                         int root_node, unsigned int init_constraint)
1466 {
1467   int org_node, clone_node, ret;
1468   unsigned int constraint = init_constraint;
1469   for (org_node = top_org_node, clone_node = top_clone_node;;)
1470     {
1471       int org_dest, clone_dest;
1472       if (dfa->nodes[org_node].type == OP_BACK_REF)
1473         {
1474           /* If the back reference epsilon-transit, its destination must
1475              also have the constraint.  Then duplicate the epsilon closure
1476              of the destination of the back reference, and store it in
1477              edests of the back reference.  */
1478           org_dest = dfa->nexts[org_node];
1479           re_node_set_empty (dfa->edests + clone_node);
1480           clone_dest = duplicate_node (dfa, org_dest, constraint);
1481           if (BE (clone_dest == -1, 0))
1482             return REG_ESPACE;
1483           dfa->nexts[clone_node] = dfa->nexts[org_node];
1484           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1485           if (BE (ret < 0, 0))
1486             return REG_ESPACE;
1487         }
1488       else if (dfa->edests[org_node].nelem == 0)
1489         {
1490           /* In case of the node can't epsilon-transit, don't duplicate the
1491              destination and store the original destination as the
1492              destination of the node.  */
1493           dfa->nexts[clone_node] = dfa->nexts[org_node];
1494           break;
1495         }
1496       else if (dfa->edests[org_node].nelem == 1)
1497         {
1498           /* In case of the node can epsilon-transit, and it has only one
1499              destination.  */
1500           org_dest = dfa->edests[org_node].elems[0];
1501           re_node_set_empty (dfa->edests + clone_node);
1502           /* If the node is root_node itself, it means the epsilon closure
1503              has a loop.   Then tie it to the destination of the root_node.  */
1504           if (org_node == root_node && clone_node != org_node)
1505             {
1506               ret = re_node_set_insert (dfa->edests + clone_node, org_dest);
1507               if (BE (ret < 0, 0))
1508                 return REG_ESPACE;
1509               break;
1510             }
1511           /* In case the node has another constraint, append it.  */
1512           constraint |= dfa->nodes[org_node].constraint;
1513           clone_dest = duplicate_node (dfa, org_dest, constraint);
1514           if (BE (clone_dest == -1, 0))
1515             return REG_ESPACE;
1516           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1517           if (BE (ret < 0, 0))
1518             return REG_ESPACE;
1519         }
1520       else /* dfa->edests[org_node].nelem == 2 */
1521         {
1522           /* In case of the node can epsilon-transit, and it has two
1523              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1524           org_dest = dfa->edests[org_node].elems[0];
1525           re_node_set_empty (dfa->edests + clone_node);
1526           /* Search for a duplicated node which satisfies the constraint.  */
1527           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1528           if (clone_dest == -1)
1529             {
1530               /* There is no such duplicated node, create a new one.  */
1531               reg_errcode_t err;
1532               clone_dest = duplicate_node (dfa, org_dest, constraint);
1533               if (BE (clone_dest == -1, 0))
1534                 return REG_ESPACE;
1535               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1536               if (BE (ret < 0, 0))
1537                 return REG_ESPACE;
1538               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1539                                             root_node, constraint);
1540               if (BE (err != REG_NOERROR, 0))
1541                 return err;
1542             }
1543           else
1544             {
1545               /* There is a duplicated node which satisfies the constraint,
1546                  use it to avoid infinite loop.  */
1547               ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1548               if (BE (ret < 0, 0))
1549                 return REG_ESPACE;
1550             }
1551
1552           org_dest = dfa->edests[org_node].elems[1];
1553           clone_dest = duplicate_node (dfa, org_dest, constraint);
1554           if (BE (clone_dest == -1, 0))
1555             return REG_ESPACE;
1556           ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1557           if (BE (ret < 0, 0))
1558             return REG_ESPACE;
1559         }
1560       org_node = org_dest;
1561       clone_node = clone_dest;
1562     }
1563   return REG_NOERROR;
1564 }
1565
1566 /* Search for a node which is duplicated from the node ORG_NODE, and
1567    satisfies the constraint CONSTRAINT.  */
1568
1569 static int
1570 search_duplicated_node (const re_dfa_t *dfa, int org_node,
1571                         unsigned int constraint)
1572 {
1573   int idx;
1574   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1575     {
1576       if (org_node == dfa->org_indices[idx]
1577           && constraint == dfa->nodes[idx].constraint)
1578         return idx; /* Found.  */
1579     }
1580   return -1; /* Not found.  */
1581 }
1582
1583 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1584    Return the index of the new node, or -1 if insufficient storage is
1585    available.  */
1586
1587 static int
1588 duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1589 {
1590   int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1591   if (BE (dup_idx != -1, 1))
1592     {
1593       dfa->nodes[dup_idx].constraint = constraint;
1594       dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].constraint;
1595       dfa->nodes[dup_idx].duplicated = 1;
1596
1597       /* Store the index of the original node.  */
1598       dfa->org_indices[dup_idx] = org_idx;
1599     }
1600   return dup_idx;
1601 }
1602
1603 static reg_errcode_t
1604 calc_inveclosure (re_dfa_t *dfa)
1605 {
1606   int src, idx, ret;
1607   for (idx = 0; idx < dfa->nodes_len; ++idx)
1608     re_node_set_init_empty (dfa->inveclosures + idx);
1609
1610   for (src = 0; src < dfa->nodes_len; ++src)
1611     {
1612       int *elems = dfa->eclosures[src].elems;
1613       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1614         {
1615           ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1616           if (BE (ret == -1, 0))
1617             return REG_ESPACE;
1618         }
1619     }
1620
1621   return REG_NOERROR;
1622 }
1623
1624 /* Calculate "eclosure" for all the node in DFA.  */
1625
1626 static reg_errcode_t
1627 calc_eclosure (re_dfa_t *dfa)
1628 {
1629   int node_idx, incomplete;
1630 #ifdef DEBUG
1631   assert (dfa->nodes_len > 0);
1632 #endif
1633   incomplete = 0;
1634   /* For each nodes, calculate epsilon closure.  */
1635   for (node_idx = 0; ; ++node_idx)
1636     {
1637       reg_errcode_t err;
1638       re_node_set eclosure_elem;
1639       if (node_idx == dfa->nodes_len)
1640         {
1641           if (!incomplete)
1642             break;
1643           incomplete = 0;
1644           node_idx = 0;
1645         }
1646
1647 #ifdef DEBUG
1648       assert (dfa->eclosures[node_idx].nelem != -1);
1649 #endif
1650
1651       /* If we have already calculated, skip it.  */
1652       if (dfa->eclosures[node_idx].nelem != 0)
1653         continue;
1654       /* Calculate epsilon closure of 'node_idx'.  */
1655       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1656       if (BE (err != REG_NOERROR, 0))
1657         return err;
1658
1659       if (dfa->eclosures[node_idx].nelem == 0)
1660         {
1661           incomplete = 1;
1662           re_node_set_free (&eclosure_elem);
1663         }
1664     }
1665   return REG_NOERROR;
1666 }
1667
1668 /* Calculate epsilon closure of NODE.  */
1669
1670 static reg_errcode_t
1671 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1672 {
1673   reg_errcode_t err;
1674   int i;
1675   re_node_set eclosure;
1676   int ret;
1677   int incomplete = 0;
1678   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1679   if (BE (err != REG_NOERROR, 0))
1680     return err;
1681
1682   /* This indicates that we are calculating this node now.
1683      We reference this value to avoid infinite loop.  */
1684   dfa->eclosures[node].nelem = -1;
1685
1686   /* If the current node has constraints, duplicate all nodes
1687      since they must inherit the constraints.  */
1688   if (dfa->nodes[node].constraint
1689       && dfa->edests[node].nelem
1690       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1691     {
1692       err = duplicate_node_closure (dfa, node, node, node,
1693                                     dfa->nodes[node].constraint);
1694       if (BE (err != REG_NOERROR, 0))
1695         return err;
1696     }
1697
1698   /* Expand each epsilon destination nodes.  */
1699   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1700     for (i = 0; i < dfa->edests[node].nelem; ++i)
1701       {
1702         re_node_set eclosure_elem;
1703         int edest = dfa->edests[node].elems[i];
1704         /* If calculating the epsilon closure of `edest' is in progress,
1705            return intermediate result.  */
1706         if (dfa->eclosures[edest].nelem == -1)
1707           {
1708             incomplete = 1;
1709             continue;
1710           }
1711         /* If we haven't calculated the epsilon closure of `edest' yet,
1712            calculate now. Otherwise use calculated epsilon closure.  */
1713         if (dfa->eclosures[edest].nelem == 0)
1714           {
1715             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1716             if (BE (err != REG_NOERROR, 0))
1717               return err;
1718           }
1719         else
1720           eclosure_elem = dfa->eclosures[edest];
1721         /* Merge the epsilon closure of 'edest'.  */
1722         err = re_node_set_merge (&eclosure, &eclosure_elem);
1723         if (BE (err != REG_NOERROR, 0))
1724           return err;
1725         /* If the epsilon closure of 'edest' is incomplete,
1726            the epsilon closure of this node is also incomplete.  */
1727         if (dfa->eclosures[edest].nelem == 0)
1728           {
1729             incomplete = 1;
1730             re_node_set_free (&eclosure_elem);
1731           }
1732       }
1733
1734   /* An epsilon closure includes itself.  */
1735   ret = re_node_set_insert (&eclosure, node);
1736   if (BE (ret < 0, 0))
1737     return REG_ESPACE;
1738   if (incomplete && !root)
1739     dfa->eclosures[node].nelem = 0;
1740   else
1741     dfa->eclosures[node] = eclosure;
1742   *new_set = eclosure;
1743   return REG_NOERROR;
1744 }
1745 \f
1746 /* Functions for token which are used in the parser.  */
1747
1748 /* Fetch a token from INPUT.
1749    We must not use this function inside bracket expressions.  */
1750
1751 static void
1752 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1753 {
1754   re_string_skip_bytes (input, peek_token (result, input, syntax));
1755 }
1756
1757 /* Peek a token from INPUT, and return the length of the token.
1758    We must not use this function inside bracket expressions.  */
1759
1760 static int
1761 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1762 {
1763   unsigned char c;
1764
1765   if (re_string_eoi (input))
1766     {
1767       token->type = END_OF_RE;
1768       return 0;
1769     }
1770
1771   c = re_string_peek_byte (input, 0);
1772   token->opr.c = c;
1773
1774   token->word_char = 0;
1775 #ifdef RE_ENABLE_I18N
1776   token->mb_partial = 0;
1777   if (input->mb_cur_max > 1 &&
1778       !re_string_first_byte (input, re_string_cur_idx (input)))
1779     {
1780       token->type = CHARACTER;
1781       token->mb_partial = 1;
1782       return 1;
1783     }
1784 #endif
1785   if (c == '\\')
1786     {
1787       unsigned char c2;
1788       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1789         {
1790           token->type = BACK_SLASH;
1791           return 1;
1792         }
1793
1794       c2 = re_string_peek_byte_case (input, 1);
1795       token->opr.c = c2;
1796       token->type = CHARACTER;
1797 #ifdef RE_ENABLE_I18N
1798       if (input->mb_cur_max > 1)
1799         {
1800           wint_t wc = re_string_wchar_at (input,
1801                                           re_string_cur_idx (input) + 1);
1802           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1803         }
1804       else
1805 #endif
1806         token->word_char = IS_WORD_CHAR (c2) != 0;
1807
1808       switch (c2)
1809         {
1810         case '|':
1811           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1812             token->type = OP_ALT;
1813           break;
1814         case '1': case '2': case '3': case '4': case '5':
1815         case '6': case '7': case '8': case '9':
1816           if (!(syntax & RE_NO_BK_REFS))
1817             {
1818               token->type = OP_BACK_REF;
1819               token->opr.idx = c2 - '1';
1820             }
1821           break;
1822         case '<':
1823           if (!(syntax & RE_NO_GNU_OPS))
1824             {
1825               token->type = ANCHOR;
1826               token->opr.ctx_type = WORD_FIRST;
1827             }
1828           break;
1829         case '>':
1830           if (!(syntax & RE_NO_GNU_OPS))
1831             {
1832               token->type = ANCHOR;
1833               token->opr.ctx_type = WORD_LAST;
1834             }
1835           break;
1836         case 'b':
1837           if (!(syntax & RE_NO_GNU_OPS))
1838             {
1839               token->type = ANCHOR;
1840               token->opr.ctx_type = WORD_DELIM;
1841             }
1842           break;
1843         case 'B':
1844           if (!(syntax & RE_NO_GNU_OPS))
1845             {
1846               token->type = ANCHOR;
1847               token->opr.ctx_type = NOT_WORD_DELIM;
1848             }
1849           break;
1850         case 'w':
1851           if (!(syntax & RE_NO_GNU_OPS))
1852             token->type = OP_WORD;
1853           break;
1854         case 'W':
1855           if (!(syntax & RE_NO_GNU_OPS))
1856             token->type = OP_NOTWORD;
1857           break;
1858         case 's':
1859           if (!(syntax & RE_NO_GNU_OPS))
1860             token->type = OP_SPACE;
1861           break;
1862         case 'S':
1863           if (!(syntax & RE_NO_GNU_OPS))
1864             token->type = OP_NOTSPACE;
1865           break;
1866         case '`':
1867           if (!(syntax & RE_NO_GNU_OPS))
1868             {
1869               token->type = ANCHOR;
1870               token->opr.ctx_type = BUF_FIRST;
1871             }
1872           break;
1873         case '\'':
1874           if (!(syntax & RE_NO_GNU_OPS))
1875             {
1876               token->type = ANCHOR;
1877               token->opr.ctx_type = BUF_LAST;
1878             }
1879           break;
1880         case '(':
1881           if (!(syntax & RE_NO_BK_PARENS))
1882             token->type = OP_OPEN_SUBEXP;
1883           break;
1884         case ')':
1885           if (!(syntax & RE_NO_BK_PARENS))
1886             token->type = OP_CLOSE_SUBEXP;
1887           break;
1888         case '+':
1889           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1890             token->type = OP_DUP_PLUS;
1891           break;
1892         case '?':
1893           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1894             token->type = OP_DUP_QUESTION;
1895           break;
1896         case '{':
1897           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1898             token->type = OP_OPEN_DUP_NUM;
1899           break;
1900         case '}':
1901           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1902             token->type = OP_CLOSE_DUP_NUM;
1903           break;
1904         default:
1905           break;
1906         }
1907       return 2;
1908     }
1909
1910   token->type = CHARACTER;
1911 #ifdef RE_ENABLE_I18N
1912   if (input->mb_cur_max > 1)
1913     {
1914       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1915       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1916     }
1917   else
1918 #endif
1919     token->word_char = IS_WORD_CHAR (token->opr.c);
1920
1921   switch (c)
1922     {
1923     case '\n':
1924       if (syntax & RE_NEWLINE_ALT)
1925         token->type = OP_ALT;
1926       break;
1927     case '|':
1928       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1929         token->type = OP_ALT;
1930       break;
1931     case '*':
1932       token->type = OP_DUP_ASTERISK;
1933       break;
1934     case '+':
1935       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1936         token->type = OP_DUP_PLUS;
1937       break;
1938     case '?':
1939       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1940         token->type = OP_DUP_QUESTION;
1941       break;
1942     case '{':
1943       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1944         token->type = OP_OPEN_DUP_NUM;
1945       break;
1946     case '}':
1947       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1948         token->type = OP_CLOSE_DUP_NUM;
1949       break;
1950     case '(':
1951       if (syntax & RE_NO_BK_PARENS)
1952         token->type = OP_OPEN_SUBEXP;
1953       break;
1954     case ')':
1955       if (syntax & RE_NO_BK_PARENS)
1956         token->type = OP_CLOSE_SUBEXP;
1957       break;
1958     case '[':
1959       token->type = OP_OPEN_BRACKET;
1960       break;
1961     case '.':
1962       token->type = OP_PERIOD;
1963       break;
1964     case '^':
1965       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1966           re_string_cur_idx (input) != 0)
1967         {
1968           char prev = re_string_peek_byte (input, -1);
1969           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1970             break;
1971         }
1972       token->type = ANCHOR;
1973       token->opr.ctx_type = LINE_FIRST;
1974       break;
1975     case '$':
1976       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1977           re_string_cur_idx (input) + 1 != re_string_length (input))
1978         {
1979           re_token_t next;
1980           re_string_skip_bytes (input, 1);
1981           peek_token (&next, input, syntax);
1982           re_string_skip_bytes (input, -1);
1983           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1984             break;
1985         }
1986       token->type = ANCHOR;
1987       token->opr.ctx_type = LINE_LAST;
1988       break;
1989     default:
1990       break;
1991     }
1992   return 1;
1993 }
1994
1995 /* Peek a token from INPUT, and return the length of the token.
1996    We must not use this function out of bracket expressions.  */
1997
1998 static int
1999 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2000 {
2001   unsigned char c;
2002   if (re_string_eoi (input))
2003     {
2004       token->type = END_OF_RE;
2005       return 0;
2006     }
2007   c = re_string_peek_byte (input, 0);
2008   token->opr.c = c;
2009
2010 #ifdef RE_ENABLE_I18N
2011   if (input->mb_cur_max > 1 &&
2012       !re_string_first_byte (input, re_string_cur_idx (input)))
2013     {
2014       token->type = CHARACTER;
2015       return 1;
2016     }
2017 #endif /* RE_ENABLE_I18N */
2018
2019   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2020       && re_string_cur_idx (input) + 1 < re_string_length (input))
2021     {
2022       /* In this case, '\' escape a character.  */
2023       unsigned char c2;
2024       re_string_skip_bytes (input, 1);
2025       c2 = re_string_peek_byte (input, 0);
2026       token->opr.c = c2;
2027       token->type = CHARACTER;
2028       return 1;
2029     }
2030   if (c == '[') /* '[' is a special char in a bracket exps.  */
2031     {
2032       unsigned char c2;
2033       int token_len;
2034       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2035         c2 = re_string_peek_byte (input, 1);
2036       else
2037         c2 = 0;
2038       token->opr.c = c2;
2039       token_len = 2;
2040       switch (c2)
2041         {
2042         case '.':
2043           token->type = OP_OPEN_COLL_ELEM;
2044           break;
2045         case '=':
2046           token->type = OP_OPEN_EQUIV_CLASS;
2047           break;
2048         case ':':
2049           if (syntax & RE_CHAR_CLASSES)
2050             {
2051               token->type = OP_OPEN_CHAR_CLASS;
2052               break;
2053             }
2054           /* else fall through.  */
2055         default:
2056           token->type = CHARACTER;
2057           token->opr.c = c;
2058           token_len = 1;
2059           break;
2060         }
2061       return token_len;
2062     }
2063   switch (c)
2064     {
2065     case '-':
2066       token->type = OP_CHARSET_RANGE;
2067       break;
2068     case ']':
2069       token->type = OP_CLOSE_BRACKET;
2070       break;
2071     case '^':
2072       token->type = OP_NON_MATCH_LIST;
2073       break;
2074     default:
2075       token->type = CHARACTER;
2076     }
2077   return 1;
2078 }
2079 \f
2080 /* Functions for parser.  */
2081
2082 /* Entry point of the parser.
2083    Parse the regular expression REGEXP and return the structure tree.
2084    If an error occurs, ERR is set by error code, and return NULL.
2085    This function build the following tree, from regular expression <reg_exp>:
2086            CAT
2087            / \
2088           /   \
2089    <reg_exp>  EOR
2090
2091    CAT means concatenation.
2092    EOR means end of regular expression.  */
2093
2094 static bin_tree_t *
2095 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2096        reg_errcode_t *err)
2097 {
2098   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2099   bin_tree_t *tree, *eor, *root;
2100   re_token_t current_token;
2101   dfa->syntax = syntax;
2102   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2103   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2104   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2105     return NULL;
2106   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2107   if (tree != NULL)
2108     root = create_tree (dfa, tree, eor, CONCAT);
2109   else
2110     root = eor;
2111   if (BE (eor == NULL || root == NULL, 0))
2112     {
2113       *err = REG_ESPACE;
2114       return NULL;
2115     }
2116   return root;
2117 }
2118
2119 /* This function build the following tree, from regular expression
2120    <branch1>|<branch2>:
2121            ALT
2122            / \
2123           /   \
2124    <branch1> <branch2>
2125
2126    ALT means alternative, which represents the operator '|'.  */
2127
2128 static bin_tree_t *
2129 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2130                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2131 {
2132   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2133   bin_tree_t *tree, *branch = NULL;
2134   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2135   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2136     return NULL;
2137
2138   while (token->type == OP_ALT)
2139     {
2140       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2141       if (token->type != OP_ALT && token->type != END_OF_RE
2142           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2143         {
2144           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2145           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2146             {
2147               if (tree != NULL)
2148                 postorder (tree, free_tree, NULL);
2149               return NULL;
2150             }
2151         }
2152       else
2153         branch = NULL;
2154       tree = create_tree (dfa, tree, branch, OP_ALT);
2155       if (BE (tree == NULL, 0))
2156         {
2157           *err = REG_ESPACE;
2158           return NULL;
2159         }
2160     }
2161   return tree;
2162 }
2163
2164 /* This function build the following tree, from regular expression
2165    <exp1><exp2>:
2166         CAT
2167         / \
2168        /   \
2169    <exp1> <exp2>
2170
2171    CAT means concatenation.  */
2172
2173 static bin_tree_t *
2174 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2175               reg_syntax_t syntax, int nest, reg_errcode_t *err)
2176 {
2177   bin_tree_t *tree, *exp;
2178   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2179   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2180   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2181     return NULL;
2182
2183   while (token->type != OP_ALT && token->type != END_OF_RE
2184          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2185     {
2186       exp = parse_expression (regexp, preg, token, syntax, nest, err);
2187       if (BE (*err != REG_NOERROR && exp == NULL, 0))
2188         {
2189           if (tree != NULL)
2190             postorder (tree, free_tree, NULL);
2191           return NULL;
2192         }
2193       if (tree != NULL && exp != NULL)
2194         {
2195           bin_tree_t *newtree = create_tree (dfa, tree, exp, CONCAT);
2196           if (newtree == NULL)
2197             {
2198               postorder (exp, free_tree, NULL);
2199               postorder (tree, free_tree, NULL);
2200               *err = REG_ESPACE;
2201               return NULL;
2202             }
2203           tree = newtree;
2204         }
2205       else if (tree == NULL)
2206         tree = exp;
2207       /* Otherwise exp == NULL, we don't need to create new tree.  */
2208     }
2209   return tree;
2210 }
2211
2212 /* This function build the following tree, from regular expression a*:
2213          *
2214          |
2215          a
2216 */
2217
2218 static bin_tree_t *
2219 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2220                   reg_syntax_t syntax, int nest, reg_errcode_t *err)
2221 {
2222   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2223   bin_tree_t *tree;
2224   switch (token->type)
2225     {
2226     case CHARACTER:
2227       tree = create_token_tree (dfa, NULL, NULL, token);
2228       if (BE (tree == NULL, 0))
2229         {
2230           *err = REG_ESPACE;
2231           return NULL;
2232         }
2233 #ifdef RE_ENABLE_I18N
2234       if (dfa->mb_cur_max > 1)
2235         {
2236           while (!re_string_eoi (regexp)
2237                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2238             {
2239               bin_tree_t *mbc_remain;
2240               fetch_token (token, regexp, syntax);
2241               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2242               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2243               if (BE (mbc_remain == NULL || tree == NULL, 0))
2244                 {
2245                   *err = REG_ESPACE;
2246                   return NULL;
2247                 }
2248             }
2249         }
2250 #endif
2251       break;
2252     case OP_OPEN_SUBEXP:
2253       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2254       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2255         return NULL;
2256       break;
2257     case OP_OPEN_BRACKET:
2258       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2259       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2260         return NULL;
2261       break;
2262     case OP_BACK_REF:
2263       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2264         {
2265           *err = REG_ESUBREG;
2266           return NULL;
2267         }
2268       dfa->used_bkref_map |= 1 << token->opr.idx;
2269       tree = create_token_tree (dfa, NULL, NULL, token);
2270       if (BE (tree == NULL, 0))
2271         {
2272           *err = REG_ESPACE;
2273           return NULL;
2274         }
2275       ++dfa->nbackref;
2276       dfa->has_mb_node = 1;
2277       break;
2278     case OP_OPEN_DUP_NUM:
2279       if (syntax & RE_CONTEXT_INVALID_DUP)
2280         {
2281           *err = REG_BADRPT;
2282           return NULL;
2283         }
2284       /* FALLTHROUGH */
2285     case OP_DUP_ASTERISK:
2286     case OP_DUP_PLUS:
2287     case OP_DUP_QUESTION:
2288       if (syntax & RE_CONTEXT_INVALID_OPS)
2289         {
2290           *err = REG_BADRPT;
2291           return NULL;
2292         }
2293       else if (syntax & RE_CONTEXT_INDEP_OPS)
2294         {
2295           fetch_token (token, regexp, syntax);
2296           return parse_expression (regexp, preg, token, syntax, nest, err);
2297         }
2298       /* else fall through  */
2299     case OP_CLOSE_SUBEXP:
2300       if ((token->type == OP_CLOSE_SUBEXP) &&
2301           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2302         {
2303           *err = REG_ERPAREN;
2304           return NULL;
2305         }
2306       /* else fall through  */
2307     case OP_CLOSE_DUP_NUM:
2308       /* We treat it as a normal character.  */
2309
2310       /* Then we can these characters as normal characters.  */
2311       token->type = CHARACTER;
2312       /* mb_partial and word_char bits should be initialized already
2313          by peek_token.  */
2314       tree = create_token_tree (dfa, NULL, NULL, token);
2315       if (BE (tree == NULL, 0))
2316         {
2317           *err = REG_ESPACE;
2318           return NULL;
2319         }
2320       break;
2321     case ANCHOR:
2322       if ((token->opr.ctx_type
2323            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2324           && dfa->word_ops_used == 0)
2325         init_word_char (dfa);
2326       if (token->opr.ctx_type == WORD_DELIM
2327           || token->opr.ctx_type == NOT_WORD_DELIM)
2328         {
2329           bin_tree_t *tree_first, *tree_last;
2330           if (token->opr.ctx_type == WORD_DELIM)
2331             {
2332               token->opr.ctx_type = WORD_FIRST;
2333               tree_first = create_token_tree (dfa, NULL, NULL, token);
2334               token->opr.ctx_type = WORD_LAST;
2335             }
2336           else
2337             {
2338               token->opr.ctx_type = INSIDE_WORD;
2339               tree_first = create_token_tree (dfa, NULL, NULL, token);
2340               token->opr.ctx_type = INSIDE_NOTWORD;
2341             }
2342           tree_last = create_token_tree (dfa, NULL, NULL, token);
2343           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2344           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2345             {
2346               *err = REG_ESPACE;
2347               return NULL;
2348             }
2349         }
2350       else
2351         {
2352           tree = create_token_tree (dfa, NULL, NULL, token);
2353           if (BE (tree == NULL, 0))
2354             {
2355               *err = REG_ESPACE;
2356               return NULL;
2357             }
2358         }
2359       /* We must return here, since ANCHORs can't be followed
2360          by repetition operators.
2361          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2362              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2363       fetch_token (token, regexp, syntax);
2364       return tree;
2365     case OP_PERIOD:
2366       tree = create_token_tree (dfa, NULL, NULL, token);
2367       if (BE (tree == NULL, 0))
2368         {
2369           *err = REG_ESPACE;
2370           return NULL;
2371         }
2372       if (dfa->mb_cur_max > 1)
2373         dfa->has_mb_node = 1;
2374       break;
2375     case OP_WORD:
2376     case OP_NOTWORD:
2377       tree = build_charclass_op (dfa, regexp->trans,
2378                                  (const unsigned char *) "alnum",
2379                                  (const unsigned char *) "_",
2380                                  token->type == OP_NOTWORD, err);
2381       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2382         return NULL;
2383       break;
2384     case OP_SPACE:
2385     case OP_NOTSPACE:
2386       tree = build_charclass_op (dfa, regexp->trans,
2387                                  (const unsigned char *) "space",
2388                                  (const unsigned char *) "",
2389                                  token->type == OP_NOTSPACE, err);
2390       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2391         return NULL;
2392       break;
2393     case OP_ALT:
2394     case END_OF_RE:
2395       return NULL;
2396     case BACK_SLASH:
2397       *err = REG_EESCAPE;
2398       return NULL;
2399     default:
2400       /* Must not happen?  */
2401 #ifdef DEBUG
2402       assert (0);
2403 #endif
2404       return NULL;
2405     }
2406   fetch_token (token, regexp, syntax);
2407
2408   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2409          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2410     {
2411       bin_tree_t *dup_tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2412       if (BE (*err != REG_NOERROR && dup_tree == NULL, 0))
2413         {
2414           if (tree != NULL)
2415             postorder (tree, free_tree, NULL);
2416           return NULL;
2417         }
2418       tree = dup_tree;
2419       /* In BRE consecutive duplications are not allowed.  */
2420       if ((syntax & RE_CONTEXT_INVALID_DUP)
2421           && (token->type == OP_DUP_ASTERISK
2422               || token->type == OP_OPEN_DUP_NUM))
2423         {
2424           if (tree != NULL)
2425             postorder (tree, free_tree, NULL);
2426           *err = REG_BADRPT;
2427           return NULL;
2428         }
2429     }
2430
2431   return tree;
2432 }
2433
2434 /* This function build the following tree, from regular expression
2435    (<reg_exp>):
2436          SUBEXP
2437             |
2438         <reg_exp>
2439 */
2440
2441 static bin_tree_t *
2442 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2443                reg_syntax_t syntax, int nest, reg_errcode_t *err)
2444 {
2445   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2446   bin_tree_t *tree;
2447   size_t cur_nsub;
2448   cur_nsub = preg->re_nsub++;
2449
2450   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2451
2452   /* The subexpression may be a null string.  */
2453   if (token->type == OP_CLOSE_SUBEXP)
2454     tree = NULL;
2455   else
2456     {
2457       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2458       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2459         {
2460           if (tree != NULL)
2461             postorder (tree, free_tree, NULL);
2462           *err = REG_EPAREN;
2463         }
2464       if (BE (*err != REG_NOERROR, 0))
2465         return NULL;
2466     }
2467
2468   if (cur_nsub <= '9' - '1')
2469     dfa->completed_bkref_map |= 1 << cur_nsub;
2470
2471   tree = create_tree (dfa, tree, NULL, SUBEXP);
2472   if (BE (tree == NULL, 0))
2473     {
2474       *err = REG_ESPACE;
2475       return NULL;
2476     }
2477   tree->token.opr.idx = cur_nsub;
2478   return tree;
2479 }
2480
2481 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2482
2483 static bin_tree_t *
2484 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2485               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2486 {
2487   bin_tree_t *tree = NULL, *old_tree = NULL;
2488   int i, start, end, start_idx = re_string_cur_idx (regexp);
2489   re_token_t start_token = *token;
2490
2491   if (token->type == OP_OPEN_DUP_NUM)
2492     {
2493       end = 0;
2494       start = fetch_number (regexp, token, syntax);
2495       if (start == -1)
2496         {
2497           if (token->type == CHARACTER && token->opr.c == ',')
2498             start = 0; /* We treat "{,m}" as "{0,m}".  */
2499           else
2500             {
2501               *err = REG_BADBR; /* <re>{} is invalid.  */
2502               return NULL;
2503             }
2504         }
2505       if (BE (start != -2, 1))
2506         {
2507           /* We treat "{n}" as "{n,n}".  */
2508           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2509                  : ((token->type == CHARACTER && token->opr.c == ',')
2510                     ? fetch_number (regexp, token, syntax) : -2));
2511         }
2512       if (BE (start == -2 || end == -2, 0))
2513         {
2514           /* Invalid sequence.  */
2515           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2516             {
2517               if (token->type == END_OF_RE)
2518                 *err = REG_EBRACE;
2519               else
2520                 *err = REG_BADBR;
2521
2522               return NULL;
2523             }
2524
2525           /* If the syntax bit is set, rollback.  */
2526           re_string_set_index (regexp, start_idx);
2527           *token = start_token;
2528           token->type = CHARACTER;
2529           /* mb_partial and word_char bits should be already initialized by
2530              peek_token.  */
2531           return elem;
2532         }
2533
2534       if (BE ((end != -1 && start > end) || token->type != OP_CLOSE_DUP_NUM, 0))
2535         {
2536           /* First number greater than second.  */
2537           *err = REG_BADBR;
2538           return NULL;
2539         }
2540     }
2541   else
2542     {
2543       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2544       end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2545     }
2546
2547   fetch_token (token, regexp, syntax);
2548
2549   if (BE (elem == NULL, 0))
2550     return NULL;
2551   if (BE (start == 0 && end == 0, 0))
2552     {
2553       postorder (elem, free_tree, NULL);
2554       return NULL;
2555     }
2556
2557   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2558   if (BE (start > 0, 0))
2559     {
2560       tree = elem;
2561       for (i = 2; i <= start; ++i)
2562         {
2563           elem = duplicate_tree (elem, dfa);
2564           tree = create_tree (dfa, tree, elem, CONCAT);
2565           if (BE (elem == NULL || tree == NULL, 0))
2566             goto parse_dup_op_espace;
2567         }
2568
2569       if (start == end)
2570         return tree;
2571
2572       /* Duplicate ELEM before it is marked optional.  */
2573       elem = duplicate_tree (elem, dfa);
2574       if (BE (elem == NULL, 0))
2575         goto parse_dup_op_espace;
2576       old_tree = tree;
2577     }
2578   else
2579     old_tree = NULL;
2580
2581   if (elem->token.type == SUBEXP)
2582     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2583
2584   tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2585   if (BE (tree == NULL, 0))
2586     goto parse_dup_op_espace;
2587
2588   /* This loop is actually executed only when end != -1,
2589      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2590      already created the start+1-th copy.  */
2591   for (i = start + 2; i <= end; ++i)
2592     {
2593       elem = duplicate_tree (elem, dfa);
2594       tree = create_tree (dfa, tree, elem, CONCAT);
2595       if (BE (elem == NULL || tree == NULL, 0))
2596         goto parse_dup_op_espace;
2597
2598       tree = create_tree (dfa, tree, NULL, OP_ALT);
2599       if (BE (tree == NULL, 0))
2600         goto parse_dup_op_espace;
2601     }
2602
2603   if (old_tree)
2604     tree = create_tree (dfa, old_tree, tree, CONCAT);
2605
2606   return tree;
2607
2608  parse_dup_op_espace:
2609   *err = REG_ESPACE;
2610   return NULL;
2611 }
2612
2613 /* Size of the names for collating symbol/equivalence_class/character_class.
2614    I'm not sure, but maybe enough.  */
2615 #define BRACKET_NAME_BUF_SIZE 32
2616
2617 #ifndef _LIBC
2618   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2619      Build the range expression which starts from START_ELEM, and ends
2620      at END_ELEM.  The result are written to MBCSET and SBCSET.
2621      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2622      mbcset->range_ends, is a pointer argument since we may
2623      update it.  */
2624
2625 static reg_errcode_t
2626 # ifdef RE_ENABLE_I18N
2627 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2628                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2629 # else /* not RE_ENABLE_I18N */
2630 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2631                  bracket_elem_t *end_elem)
2632 # endif /* not RE_ENABLE_I18N */
2633 {
2634   unsigned int start_ch, end_ch;
2635   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2636   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2637           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2638           0))
2639     return REG_ERANGE;
2640
2641   /* We can handle no multi character collating elements without libc
2642      support.  */
2643   if (BE ((start_elem->type == COLL_SYM
2644            && strlen ((char *) start_elem->opr.name) > 1)
2645           || (end_elem->type == COLL_SYM
2646               && strlen ((char *) end_elem->opr.name) > 1), 0))
2647     return REG_ECOLLATE;
2648
2649 # ifdef RE_ENABLE_I18N
2650   {
2651     wchar_t wc;
2652     wint_t start_wc;
2653     wint_t end_wc;
2654     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2655
2656     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2657                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2658                    : 0));
2659     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2660               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2661                  : 0));
2662     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2663                 ? __btowc (start_ch) : start_elem->opr.wch);
2664     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2665               ? __btowc (end_ch) : end_elem->opr.wch);
2666     if (start_wc == WEOF || end_wc == WEOF)
2667       return REG_ECOLLATE;
2668     cmp_buf[0] = start_wc;
2669     cmp_buf[4] = end_wc;
2670     if (__wcscoll (cmp_buf, cmp_buf + 4) > 0)
2671       return REG_ERANGE;
2672
2673     /* Got valid collation sequence values, add them as a new entry.
2674        However, for !_LIBC we have no collation elements: if the
2675        character set is single byte, the single byte character set
2676        that we build below suffices.  parse_bracket_exp passes
2677        no MBCSET if dfa->mb_cur_max == 1.  */
2678     if (mbcset)
2679       {
2680         /* Check the space of the arrays.  */
2681         if (BE (*range_alloc == mbcset->nranges, 0))
2682           {
2683             /* There is not enough space, need realloc.  */
2684             wchar_t *new_array_start, *new_array_end;
2685             int new_nranges;
2686
2687             /* +1 in case of mbcset->nranges is 0.  */
2688             new_nranges = 2 * mbcset->nranges + 1;
2689             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2690                are NULL if *range_alloc == 0.  */
2691             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2692                                           new_nranges);
2693             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2694                                         new_nranges);
2695
2696             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2697               return REG_ESPACE;
2698
2699             mbcset->range_starts = new_array_start;
2700             mbcset->range_ends = new_array_end;
2701             *range_alloc = new_nranges;
2702           }
2703
2704         mbcset->range_starts[mbcset->nranges] = start_wc;
2705         mbcset->range_ends[mbcset->nranges++] = end_wc;
2706       }
2707
2708     /* Build the table for single byte characters.  */
2709     for (wc = 0; wc < SBC_MAX; ++wc)
2710       {
2711         cmp_buf[2] = wc;
2712         if (__wcscoll (cmp_buf, cmp_buf + 2) <= 0
2713             && __wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2714           bitset_set (sbcset, wc);
2715       }
2716   }
2717 # else /* not RE_ENABLE_I18N */
2718   {
2719     unsigned int ch;
2720     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2721                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2722                    : 0));
2723     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2724               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2725                  : 0));
2726     if (start_ch > end_ch)
2727       return REG_ERANGE;
2728     /* Build the table for single byte characters.  */
2729     for (ch = 0; ch < SBC_MAX; ++ch)
2730       if (start_ch <= ch  && ch <= end_ch)
2731         bitset_set (sbcset, ch);
2732   }
2733 # endif /* not RE_ENABLE_I18N */
2734   return REG_NOERROR;
2735 }
2736 #endif /* not _LIBC */
2737
2738 #ifndef _LIBC
2739 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2740    Build the collating element which is represented by NAME.
2741    The result are written to MBCSET and SBCSET.
2742    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2743    pointer argument since we may update it.  */
2744
2745 static reg_errcode_t
2746 # ifdef RE_ENABLE_I18N
2747 build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2748                         int *coll_sym_alloc, const unsigned char *name)
2749 # else /* not RE_ENABLE_I18N */
2750 build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2751 # endif /* not RE_ENABLE_I18N */
2752 {
2753   size_t name_len = strlen ((const char *) name);
2754   if (BE (name_len != 1, 0))
2755     return REG_ECOLLATE;
2756   else
2757     {
2758       bitset_set (sbcset, name[0]);
2759       return REG_NOERROR;
2760     }
2761 }
2762 #endif /* not _LIBC */
2763
2764 /* This function parse bracket expression like "[abc]", "[a-c]",
2765    "[[.a-a.]]" etc.  */
2766
2767 static bin_tree_t *
2768 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2769                    reg_syntax_t syntax, reg_errcode_t *err)
2770 {
2771 #ifdef _LIBC
2772   const unsigned char *collseqmb;
2773   const char *collseqwc;
2774   uint32_t nrules;
2775   int32_t table_size;
2776   const int32_t *symb_table;
2777   const unsigned char *extra;
2778
2779   /* Local function for parse_bracket_exp used in _LIBC environment.
2780      Seek the collating symbol entry corresponding to NAME.
2781      Return the index of the symbol in the SYMB_TABLE,
2782      or -1 if not found.  */
2783
2784   auto inline int32_t
2785   __attribute__ ((always_inline))
2786   seek_collating_symbol_entry (const unsigned char *name, size_t name_len)
2787     {
2788       int32_t elem;
2789
2790       for (elem = 0; elem < table_size; elem++)
2791         if (symb_table[2 * elem] != 0)
2792           {
2793             int32_t idx = symb_table[2 * elem + 1];
2794             /* Skip the name of collating element name.  */
2795             idx += 1 + extra[idx];
2796             if (/* Compare the length of the name.  */
2797                 name_len == extra[idx]
2798                 /* Compare the name.  */
2799                 && memcmp (name, &extra[idx + 1], name_len) == 0)
2800               /* Yep, this is the entry.  */
2801               return elem;
2802           }
2803       return -1;
2804     }
2805
2806   /* Local function for parse_bracket_exp used in _LIBC environment.
2807      Look up the collation sequence value of BR_ELEM.
2808      Return the value if succeeded, UINT_MAX otherwise.  */
2809
2810   auto inline unsigned int
2811   __attribute__ ((always_inline))
2812   lookup_collation_sequence_value (bracket_elem_t *br_elem)
2813     {
2814       if (br_elem->type == SB_CHAR)
2815         {
2816           /*
2817           if (MB_CUR_MAX == 1)
2818           */
2819           if (nrules == 0)
2820             return collseqmb[br_elem->opr.ch];
2821           else
2822             {
2823               wint_t wc = __btowc (br_elem->opr.ch);
2824               return __collseq_table_lookup (collseqwc, wc);
2825             }
2826         }
2827       else if (br_elem->type == MB_CHAR)
2828         {
2829           if (nrules != 0)
2830             return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2831         }
2832       else if (br_elem->type == COLL_SYM)
2833         {
2834           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2835           if (nrules != 0)
2836             {
2837               int32_t elem, idx;
2838               elem = seek_collating_symbol_entry (br_elem->opr.name,
2839                                                   sym_name_len);
2840               if (elem != -1)
2841                 {
2842                   /* We found the entry.  */
2843                   idx = symb_table[2 * elem + 1];
2844                   /* Skip the name of collating element name.  */
2845                   idx += 1 + extra[idx];
2846                   /* Skip the byte sequence of the collating element.  */
2847                   idx += 1 + extra[idx];
2848                   /* Adjust for the alignment.  */
2849                   idx = (idx + 3) & ~3;
2850                   /* Skip the multibyte collation sequence value.  */
2851                   idx += sizeof (unsigned int);
2852                   /* Skip the wide char sequence of the collating element.  */
2853                   idx += sizeof (unsigned int) *
2854                     (1 + *(unsigned int *) (extra + idx));
2855                   /* Return the collation sequence value.  */
2856                   return *(unsigned int *) (extra + idx);
2857                 }
2858               else if (sym_name_len == 1)
2859                 {
2860                   /* No valid character.  Match it as a single byte
2861                      character.  */
2862                   return collseqmb[br_elem->opr.name[0]];
2863                 }
2864             }
2865           else if (sym_name_len == 1)
2866             return collseqmb[br_elem->opr.name[0]];
2867         }
2868       return UINT_MAX;
2869     }
2870
2871   /* Local function for parse_bracket_exp used in _LIBC environment.
2872      Build the range expression which starts from START_ELEM, and ends
2873      at END_ELEM.  The result are written to MBCSET and SBCSET.
2874      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2875      mbcset->range_ends, is a pointer argument since we may
2876      update it.  */
2877
2878   auto inline reg_errcode_t
2879   __attribute__ ((always_inline))
2880   build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2881                    bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2882     {
2883       unsigned int ch;
2884       uint32_t start_collseq;
2885       uint32_t end_collseq;
2886
2887       /* Equivalence Classes and Character Classes can't be a range
2888          start/end.  */
2889       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2890               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2891               0))
2892         return REG_ERANGE;
2893
2894       start_collseq = lookup_collation_sequence_value (start_elem);
2895       end_collseq = lookup_collation_sequence_value (end_elem);
2896       /* Check start/end collation sequence values.  */
2897       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2898         return REG_ECOLLATE;
2899       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2900         return REG_ERANGE;
2901
2902       /* Got valid collation sequence values, add them as a new entry.
2903          However, if we have no collation elements, and the character set
2904          is single byte, the single byte character set that we
2905          build below suffices. */
2906       if (nrules > 0 || dfa->mb_cur_max > 1)
2907         {
2908           /* Check the space of the arrays.  */
2909           if (BE (*range_alloc == mbcset->nranges, 0))
2910             {
2911               /* There is not enough space, need realloc.  */
2912               uint32_t *new_array_start;
2913               uint32_t *new_array_end;
2914               int new_nranges;
2915
2916               /* +1 in case of mbcset->nranges is 0.  */
2917               new_nranges = 2 * mbcset->nranges + 1;
2918               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2919                                             new_nranges);
2920               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2921                                           new_nranges);
2922
2923               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2924                 return REG_ESPACE;
2925
2926               mbcset->range_starts = new_array_start;
2927               mbcset->range_ends = new_array_end;
2928               *range_alloc = new_nranges;
2929             }
2930
2931           mbcset->range_starts[mbcset->nranges] = start_collseq;
2932           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2933         }
2934
2935       /* Build the table for single byte characters.  */
2936       for (ch = 0; ch < SBC_MAX; ch++)
2937         {
2938           uint32_t ch_collseq;
2939           /*
2940           if (MB_CUR_MAX == 1)
2941           */
2942           if (nrules == 0)
2943             ch_collseq = collseqmb[ch];
2944           else
2945             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2946           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2947             bitset_set (sbcset, ch);
2948         }
2949       return REG_NOERROR;
2950     }
2951
2952   /* Local function for parse_bracket_exp used in _LIBC environment.
2953      Build the collating element which is represented by NAME.
2954      The result are written to MBCSET and SBCSET.
2955      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2956      pointer argument since we may update it.  */
2957
2958   auto inline reg_errcode_t
2959   __attribute__ ((always_inline))
2960   build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2961                           int *coll_sym_alloc, const unsigned char *name)
2962     {
2963       int32_t elem, idx;
2964       size_t name_len = strlen ((const char *) name);
2965       if (nrules != 0)
2966         {
2967           elem = seek_collating_symbol_entry (name, name_len);
2968           if (elem != -1)
2969             {
2970               /* We found the entry.  */
2971               idx = symb_table[2 * elem + 1];
2972               /* Skip the name of collating element name.  */
2973               idx += 1 + extra[idx];
2974             }
2975           else if (name_len == 1)
2976             {
2977               /* No valid character, treat it as a normal
2978                  character.  */
2979               bitset_set (sbcset, name[0]);
2980               return REG_NOERROR;
2981             }
2982           else
2983             return REG_ECOLLATE;
2984
2985           /* Got valid collation sequence, add it as a new entry.  */
2986           /* Check the space of the arrays.  */
2987           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2988             {
2989               /* Not enough, realloc it.  */
2990               /* +1 in case of mbcset->ncoll_syms is 0.  */
2991               int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2992               /* Use realloc since mbcset->coll_syms is NULL
2993                  if *alloc == 0.  */
2994               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2995                                                    new_coll_sym_alloc);
2996               if (BE (new_coll_syms == NULL, 0))
2997                 return REG_ESPACE;
2998               mbcset->coll_syms = new_coll_syms;
2999               *coll_sym_alloc = new_coll_sym_alloc;
3000             }
3001           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3002           return REG_NOERROR;
3003         }
3004       else
3005         {
3006           if (BE (name_len != 1, 0))
3007             return REG_ECOLLATE;
3008           else
3009             {
3010               bitset_set (sbcset, name[0]);
3011               return REG_NOERROR;
3012             }
3013         }
3014     }
3015 #endif
3016
3017   re_token_t br_token;
3018   re_bitset_ptr_t sbcset;
3019 #ifdef RE_ENABLE_I18N
3020   re_charset_t *mbcset;
3021   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3022   int equiv_class_alloc = 0, char_class_alloc = 0;
3023 #endif /* not RE_ENABLE_I18N */
3024   int non_match = 0;
3025   bin_tree_t *work_tree;
3026   int token_len;
3027   int first_round = 1;
3028 #ifdef _LIBC
3029   collseqmb = (const unsigned char *)
3030     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3031   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3032   if (nrules)
3033     {
3034       /*
3035       if (MB_CUR_MAX > 1)
3036       */
3037       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3038       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3039       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3040                                                   _NL_COLLATE_SYMB_TABLEMB);
3041       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3042                                                    _NL_COLLATE_SYMB_EXTRAMB);
3043     }
3044 #endif
3045   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3046 #ifdef RE_ENABLE_I18N
3047   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3048 #endif /* RE_ENABLE_I18N */
3049 #ifdef RE_ENABLE_I18N
3050   if (BE (sbcset == NULL || mbcset == NULL, 0))
3051 #else
3052   if (BE (sbcset == NULL, 0))
3053 #endif /* RE_ENABLE_I18N */
3054     {
3055       re_free (sbcset);
3056 #ifdef RE_ENABLE_I18N
3057       re_free (mbcset);
3058 #endif
3059       *err = REG_ESPACE;
3060       return NULL;
3061     }
3062
3063   token_len = peek_token_bracket (token, regexp, syntax);
3064   if (BE (token->type == END_OF_RE, 0))
3065     {
3066       *err = REG_BADPAT;
3067       goto parse_bracket_exp_free_return;
3068     }
3069   if (token->type == OP_NON_MATCH_LIST)
3070     {
3071 #ifdef RE_ENABLE_I18N
3072       mbcset->non_match = 1;
3073 #endif /* not RE_ENABLE_I18N */
3074       non_match = 1;
3075       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3076         bitset_set (sbcset, '\n');
3077       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3078       token_len = peek_token_bracket (token, regexp, syntax);
3079       if (BE (token->type == END_OF_RE, 0))
3080         {
3081           *err = REG_BADPAT;
3082           goto parse_bracket_exp_free_return;
3083         }
3084     }
3085
3086   /* We treat the first ']' as a normal character.  */
3087   if (token->type == OP_CLOSE_BRACKET)
3088     token->type = CHARACTER;
3089
3090   while (1)
3091     {
3092       bracket_elem_t start_elem, end_elem;
3093       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3094       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3095       reg_errcode_t ret;
3096       int token_len2 = 0, is_range_exp = 0;
3097       re_token_t token2;
3098
3099       start_elem.opr.name = start_name_buf;
3100       start_elem.type = COLL_SYM;
3101       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3102                                    syntax, first_round);
3103       if (BE (ret != REG_NOERROR, 0))
3104         {
3105           *err = ret;
3106           goto parse_bracket_exp_free_return;
3107         }
3108       first_round = 0;
3109
3110       /* Get information about the next token.  We need it in any case.  */
3111       token_len = peek_token_bracket (token, regexp, syntax);
3112
3113       /* Do not check for ranges if we know they are not allowed.  */
3114       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3115         {
3116           if (BE (token->type == END_OF_RE, 0))
3117             {
3118               *err = REG_EBRACK;
3119               goto parse_bracket_exp_free_return;
3120             }
3121           if (token->type == OP_CHARSET_RANGE)
3122             {
3123               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3124               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3125               if (BE (token2.type == END_OF_RE, 0))
3126                 {
3127                   *err = REG_EBRACK;
3128                   goto parse_bracket_exp_free_return;
3129                 }
3130               if (token2.type == OP_CLOSE_BRACKET)
3131                 {
3132                   /* We treat the last '-' as a normal character.  */
3133                   re_string_skip_bytes (regexp, -token_len);
3134                   token->type = CHARACTER;
3135                 }
3136               else
3137                 is_range_exp = 1;
3138             }
3139         }
3140
3141       if (is_range_exp == 1)
3142         {
3143           end_elem.opr.name = end_name_buf;
3144           end_elem.type = COLL_SYM;
3145           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3146                                        dfa, syntax, 1);
3147           if (BE (ret != REG_NOERROR, 0))
3148             {
3149               *err = ret;
3150               goto parse_bracket_exp_free_return;
3151             }
3152
3153           token_len = peek_token_bracket (token, regexp, syntax);
3154
3155 #ifdef _LIBC
3156           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3157                                   &start_elem, &end_elem);
3158 #else
3159 # ifdef RE_ENABLE_I18N
3160           *err = build_range_exp (sbcset,
3161                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3162                                   &range_alloc, &start_elem, &end_elem);
3163 # else
3164           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3165 # endif
3166 #endif /* RE_ENABLE_I18N */
3167           if (BE (*err != REG_NOERROR, 0))
3168             goto parse_bracket_exp_free_return;
3169         }
3170       else
3171         {
3172           switch (start_elem.type)
3173             {
3174             case SB_CHAR:
3175               bitset_set (sbcset, start_elem.opr.ch);
3176               break;
3177 #ifdef RE_ENABLE_I18N
3178             case MB_CHAR:
3179               /* Check whether the array has enough space.  */
3180               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3181                 {
3182                   wchar_t *new_mbchars;
3183                   /* Not enough, realloc it.  */
3184                   /* +1 in case of mbcset->nmbchars is 0.  */
3185                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3186                   /* Use realloc since array is NULL if *alloc == 0.  */
3187                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3188                                             mbchar_alloc);
3189                   if (BE (new_mbchars == NULL, 0))
3190                     goto parse_bracket_exp_espace;
3191                   mbcset->mbchars = new_mbchars;
3192                 }
3193               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3194               break;
3195 #endif /* RE_ENABLE_I18N */
3196             case EQUIV_CLASS:
3197               *err = build_equiv_class (sbcset,
3198 #ifdef RE_ENABLE_I18N
3199                                         mbcset, &equiv_class_alloc,
3200 #endif /* RE_ENABLE_I18N */
3201                                         start_elem.opr.name);
3202               if (BE (*err != REG_NOERROR, 0))
3203                 goto parse_bracket_exp_free_return;
3204               break;
3205             case COLL_SYM:
3206               *err = build_collating_symbol (sbcset,
3207 #ifdef RE_ENABLE_I18N
3208                                              mbcset, &coll_sym_alloc,
3209 #endif /* RE_ENABLE_I18N */
3210                                              start_elem.opr.name);
3211               if (BE (*err != REG_NOERROR, 0))
3212                 goto parse_bracket_exp_free_return;
3213               break;
3214             case CHAR_CLASS:
3215               *err = build_charclass (regexp->trans, sbcset,
3216 #ifdef RE_ENABLE_I18N
3217                                       mbcset, &char_class_alloc,
3218 #endif /* RE_ENABLE_I18N */
3219                                       start_elem.opr.name, syntax);
3220               if (BE (*err != REG_NOERROR, 0))
3221                goto parse_bracket_exp_free_return;
3222               break;
3223             default:
3224               assert (0);
3225               break;
3226             }
3227         }
3228       if (BE (token->type == END_OF_RE, 0))
3229         {
3230           *err = REG_EBRACK;
3231           goto parse_bracket_exp_free_return;
3232         }
3233       if (token->type == OP_CLOSE_BRACKET)
3234         break;
3235     }
3236
3237   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3238
3239   /* If it is non-matching list.  */
3240   if (non_match)
3241     bitset_not (sbcset);
3242
3243 #ifdef RE_ENABLE_I18N
3244   /* Ensure only single byte characters are set.  */
3245   if (dfa->mb_cur_max > 1)
3246     bitset_mask (sbcset, dfa->sb_char);
3247
3248   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3249       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3250                                                      || mbcset->non_match)))
3251     {
3252       bin_tree_t *mbc_tree;
3253       int sbc_idx;
3254       /* Build a tree for complex bracket.  */
3255       dfa->has_mb_node = 1;
3256       br_token.type = COMPLEX_BRACKET;
3257       br_token.opr.mbcset = mbcset;
3258       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3259       if (BE (mbc_tree == NULL, 0))
3260         goto parse_bracket_exp_espace;
3261       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3262         if (sbcset[sbc_idx])
3263           break;
3264       /* If there are no bits set in sbcset, there is no point
3265          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3266       if (sbc_idx < BITSET_WORDS)
3267         {
3268           /* Build a tree for simple bracket.  */
3269           br_token.type = SIMPLE_BRACKET;
3270           br_token.opr.sbcset = sbcset;
3271           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3272           if (BE (work_tree == NULL, 0))
3273             goto parse_bracket_exp_espace;
3274
3275           /* Then join them by ALT node.  */
3276           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3277           if (BE (work_tree == NULL, 0))
3278             goto parse_bracket_exp_espace;
3279         }
3280       else
3281         {
3282           re_free (sbcset);
3283           work_tree = mbc_tree;
3284         }
3285     }
3286   else
3287 #endif /* not RE_ENABLE_I18N */
3288     {
3289 #ifdef RE_ENABLE_I18N
3290       free_charset (mbcset);
3291 #endif
3292       /* Build a tree for simple bracket.  */
3293       br_token.type = SIMPLE_BRACKET;
3294       br_token.opr.sbcset = sbcset;
3295       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3296       if (BE (work_tree == NULL, 0))
3297         goto parse_bracket_exp_espace;
3298     }
3299   return work_tree;
3300
3301  parse_bracket_exp_espace:
3302   *err = REG_ESPACE;
3303  parse_bracket_exp_free_return:
3304   re_free (sbcset);
3305 #ifdef RE_ENABLE_I18N
3306   free_charset (mbcset);
3307 #endif /* RE_ENABLE_I18N */
3308   return NULL;
3309 }
3310
3311 /* Parse an element in the bracket expression.  */
3312
3313 static reg_errcode_t
3314 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3315                        re_token_t *token, int token_len, re_dfa_t *dfa,
3316                        reg_syntax_t syntax, int accept_hyphen)
3317 {
3318 #ifdef RE_ENABLE_I18N
3319   int cur_char_size;
3320   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3321   if (cur_char_size > 1)
3322     {
3323       elem->type = MB_CHAR;
3324       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3325       re_string_skip_bytes (regexp, cur_char_size);
3326       return REG_NOERROR;
3327     }
3328 #endif /* RE_ENABLE_I18N */
3329   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3330   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3331       || token->type == OP_OPEN_EQUIV_CLASS)
3332     return parse_bracket_symbol (elem, regexp, token);
3333   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3334     {
3335       /* A '-' must only appear as anything but a range indicator before
3336          the closing bracket.  Everything else is an error.  */
3337       re_token_t token2;
3338       (void) peek_token_bracket (&token2, regexp, syntax);
3339       if (token2.type != OP_CLOSE_BRACKET)
3340         /* The actual error value is not standardized since this whole
3341            case is undefined.  But ERANGE makes good sense.  */
3342         return REG_ERANGE;
3343     }
3344   elem->type = SB_CHAR;
3345   elem->opr.ch = token->opr.c;
3346   return REG_NOERROR;
3347 }
3348
3349 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3350    such as [:<character_class>:], [.<collating_element>.], and
3351    [=<equivalent_class>=].  */
3352
3353 static reg_errcode_t
3354 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3355                       re_token_t *token)
3356 {
3357   unsigned char ch, delim = token->opr.c;
3358   int i = 0;
3359   if (re_string_eoi(regexp))
3360     return REG_EBRACK;
3361   for (;; ++i)
3362     {
3363       if (i >= BRACKET_NAME_BUF_SIZE)
3364         return REG_EBRACK;
3365       if (token->type == OP_OPEN_CHAR_CLASS)
3366         ch = re_string_fetch_byte_case (regexp);
3367       else
3368         ch = re_string_fetch_byte (regexp);
3369       if (re_string_eoi(regexp))
3370         return REG_EBRACK;
3371       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3372         break;
3373       elem->opr.name[i] = ch;
3374     }
3375   re_string_skip_bytes (regexp, 1);
3376   elem->opr.name[i] = '\0';
3377   switch (token->type)
3378     {
3379     case OP_OPEN_COLL_ELEM:
3380       elem->type = COLL_SYM;
3381       break;
3382     case OP_OPEN_EQUIV_CLASS:
3383       elem->type = EQUIV_CLASS;
3384       break;
3385     case OP_OPEN_CHAR_CLASS:
3386       elem->type = CHAR_CLASS;
3387       break;
3388     default:
3389       break;
3390     }
3391   return REG_NOERROR;
3392 }
3393
3394   /* Helper function for parse_bracket_exp.
3395      Build the equivalence class which is represented by NAME.
3396      The result are written to MBCSET and SBCSET.
3397      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3398      is a pointer argument since we may update it.  */
3399
3400 static reg_errcode_t
3401 #ifdef RE_ENABLE_I18N
3402 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3403                    int *equiv_class_alloc, const unsigned char *name)
3404 #else /* not RE_ENABLE_I18N */
3405 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3406 #endif /* not RE_ENABLE_I18N */
3407 {
3408 #ifdef _LIBC
3409   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3410   if (nrules != 0)
3411     {
3412       const int32_t *table, *indirect;
3413       const unsigned char *weights, *extra, *cp;
3414       unsigned char char_buf[2];
3415       int32_t idx1, idx2;
3416       unsigned int ch;
3417       size_t len;
3418       /* Calculate the index for equivalence class.  */
3419       cp = name;
3420       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3421       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3422                                                _NL_COLLATE_WEIGHTMB);
3423       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3424                                                    _NL_COLLATE_EXTRAMB);
3425       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3426                                                 _NL_COLLATE_INDIRECTMB);
3427       idx1 = findidx (table, indirect, extra, &cp, -1);
3428       if (BE (idx1 == 0 || *cp != '\0', 0))
3429         /* This isn't a valid character.  */
3430         return REG_ECOLLATE;
3431
3432       /* Build single byte matcing table for this equivalence class.  */
3433       len = weights[idx1 & 0xffffff];
3434       for (ch = 0; ch < SBC_MAX; ++ch)
3435         {
3436           char_buf[0] = ch;
3437           cp = char_buf;
3438           idx2 = findidx (table, indirect, extra, &cp, 1);
3439 /*
3440           idx2 = table[ch];
3441 */
3442           if (idx2 == 0)
3443             /* This isn't a valid character.  */
3444             continue;
3445           /* Compare only if the length matches and the collation rule
3446              index is the same.  */
3447           if (len == weights[idx2 & 0xffffff] && (idx1 >> 24) == (idx2 >> 24))
3448             {
3449               int cnt = 0;
3450
3451               while (cnt <= len &&
3452                      weights[(idx1 & 0xffffff) + 1 + cnt]
3453                      == weights[(idx2 & 0xffffff) + 1 + cnt])
3454                 ++cnt;
3455
3456               if (cnt > len)
3457                 bitset_set (sbcset, ch);
3458             }
3459         }
3460       /* Check whether the array has enough space.  */
3461       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3462         {
3463           /* Not enough, realloc it.  */
3464           /* +1 in case of mbcset->nequiv_classes is 0.  */
3465           int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3466           /* Use realloc since the array is NULL if *alloc == 0.  */
3467           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3468                                                    int32_t,
3469                                                    new_equiv_class_alloc);
3470           if (BE (new_equiv_classes == NULL, 0))
3471             return REG_ESPACE;
3472           mbcset->equiv_classes = new_equiv_classes;
3473           *equiv_class_alloc = new_equiv_class_alloc;
3474         }
3475       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3476     }
3477   else
3478 #endif /* _LIBC */
3479     {
3480       if (BE (strlen ((const char *) name) != 1, 0))
3481         return REG_ECOLLATE;
3482       bitset_set (sbcset, *name);
3483     }
3484   return REG_NOERROR;
3485 }
3486
3487   /* Helper function for parse_bracket_exp.
3488      Build the character class which is represented by NAME.
3489      The result are written to MBCSET and SBCSET.
3490      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3491      is a pointer argument since we may update it.  */
3492
3493 static reg_errcode_t
3494 #ifdef RE_ENABLE_I18N
3495 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3496                  re_charset_t *mbcset, int *char_class_alloc,
3497                  const unsigned char *class_name, reg_syntax_t syntax)
3498 #else /* not RE_ENABLE_I18N */
3499 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3500                  const unsigned char *class_name, reg_syntax_t syntax)
3501 #endif /* not RE_ENABLE_I18N */
3502 {
3503   int i;
3504   const char *name = (const char *) class_name;
3505
3506   /* In case of REG_ICASE "upper" and "lower" match the both of
3507      upper and lower cases.  */
3508   if ((syntax & RE_ICASE)
3509       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3510     name = "alpha";
3511
3512 #ifdef RE_ENABLE_I18N
3513   /* Check the space of the arrays.  */
3514   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3515     {
3516       /* Not enough, realloc it.  */
3517       /* +1 in case of mbcset->nchar_classes is 0.  */
3518       int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3519       /* Use realloc since array is NULL if *alloc == 0.  */
3520       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3521                                                new_char_class_alloc);
3522       if (BE (new_char_classes == NULL, 0))
3523         return REG_ESPACE;
3524       mbcset->char_classes = new_char_classes;
3525       *char_class_alloc = new_char_class_alloc;
3526     }
3527   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3528 #endif /* RE_ENABLE_I18N */
3529
3530 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3531   do {                                          \
3532     if (BE (trans != NULL, 0))                  \
3533       {                                         \
3534         for (i = 0; i < SBC_MAX; ++i)           \
3535           if (ctype_func (i))                   \
3536             bitset_set (sbcset, trans[i]);      \
3537       }                                         \
3538     else                                        \
3539       {                                         \
3540         for (i = 0; i < SBC_MAX; ++i)           \
3541           if (ctype_func (i))                   \
3542             bitset_set (sbcset, i);             \
3543       }                                         \
3544   } while (0)
3545
3546   if (strcmp (name, "alnum") == 0)
3547     BUILD_CHARCLASS_LOOP (isalnum);
3548   else if (strcmp (name, "cntrl") == 0)
3549     BUILD_CHARCLASS_LOOP (iscntrl);
3550   else if (strcmp (name, "lower") == 0)
3551     BUILD_CHARCLASS_LOOP (islower);
3552   else if (strcmp (name, "space") == 0)
3553     BUILD_CHARCLASS_LOOP (isspace);
3554   else if (strcmp (name, "alpha") == 0)
3555     BUILD_CHARCLASS_LOOP (isalpha);
3556   else if (strcmp (name, "digit") == 0)
3557     BUILD_CHARCLASS_LOOP (isdigit);
3558   else if (strcmp (name, "print") == 0)
3559     BUILD_CHARCLASS_LOOP (isprint);
3560   else if (strcmp (name, "upper") == 0)
3561     BUILD_CHARCLASS_LOOP (isupper);
3562   else if (strcmp (name, "blank") == 0)
3563     BUILD_CHARCLASS_LOOP (isblank);
3564   else if (strcmp (name, "graph") == 0)
3565     BUILD_CHARCLASS_LOOP (isgraph);
3566   else if (strcmp (name, "punct") == 0)
3567     BUILD_CHARCLASS_LOOP (ispunct);
3568   else if (strcmp (name, "xdigit") == 0)
3569     BUILD_CHARCLASS_LOOP (isxdigit);
3570   else
3571     return REG_ECTYPE;
3572
3573   return REG_NOERROR;
3574 }
3575
3576 static bin_tree_t *
3577 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3578                     const unsigned char *class_name,
3579                     const unsigned char *extra, int non_match,
3580                     reg_errcode_t *err)
3581 {
3582   re_bitset_ptr_t sbcset;
3583 #ifdef RE_ENABLE_I18N
3584   re_charset_t *mbcset;
3585   int alloc = 0;
3586 #endif /* not RE_ENABLE_I18N */
3587   reg_errcode_t ret;
3588   re_token_t br_token;
3589   bin_tree_t *tree;
3590
3591   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3592 #ifdef RE_ENABLE_I18N
3593   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3594 #endif /* RE_ENABLE_I18N */
3595
3596 #ifdef RE_ENABLE_I18N
3597   if (BE (sbcset == NULL || mbcset == NULL, 0))
3598 #else /* not RE_ENABLE_I18N */
3599   if (BE (sbcset == NULL, 0))
3600 #endif /* not RE_ENABLE_I18N */
3601     {
3602       *err = REG_ESPACE;
3603       return NULL;
3604     }
3605
3606   if (non_match)
3607     {
3608 #ifdef RE_ENABLE_I18N
3609       mbcset->non_match = 1;
3610 #endif /* not RE_ENABLE_I18N */
3611     }
3612
3613   /* We don't care the syntax in this case.  */
3614   ret = build_charclass (trans, sbcset,
3615 #ifdef RE_ENABLE_I18N
3616                          mbcset, &alloc,
3617 #endif /* RE_ENABLE_I18N */
3618                          class_name, 0);
3619
3620   if (BE (ret != REG_NOERROR, 0))
3621     {
3622       re_free (sbcset);
3623 #ifdef RE_ENABLE_I18N
3624       free_charset (mbcset);
3625 #endif /* RE_ENABLE_I18N */
3626       *err = ret;
3627       return NULL;
3628     }
3629   /* \w match '_' also.  */
3630   for (; *extra; extra++)
3631     bitset_set (sbcset, *extra);
3632
3633   /* If it is non-matching list.  */
3634   if (non_match)
3635     bitset_not (sbcset);
3636
3637 #ifdef RE_ENABLE_I18N
3638   /* Ensure only single byte characters are set.  */
3639   if (dfa->mb_cur_max > 1)
3640     bitset_mask (sbcset, dfa->sb_char);
3641 #endif
3642
3643   /* Build a tree for simple bracket.  */
3644   br_token.type = SIMPLE_BRACKET;
3645   br_token.opr.sbcset = sbcset;
3646   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3647   if (BE (tree == NULL, 0))
3648     goto build_word_op_espace;
3649
3650 #ifdef RE_ENABLE_I18N
3651   if (dfa->mb_cur_max > 1)
3652     {
3653       bin_tree_t *mbc_tree;
3654       /* Build a tree for complex bracket.  */
3655       br_token.type = COMPLEX_BRACKET;
3656       br_token.opr.mbcset = mbcset;
3657       dfa->has_mb_node = 1;
3658       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3659       if (BE (mbc_tree == NULL, 0))
3660         goto build_word_op_espace;
3661       /* Then join them by ALT node.  */
3662       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3663       if (BE (mbc_tree != NULL, 1))
3664         return tree;
3665     }
3666   else
3667     {
3668       free_charset (mbcset);
3669       return tree;
3670     }
3671 #else /* not RE_ENABLE_I18N */
3672   return tree;
3673 #endif /* not RE_ENABLE_I18N */
3674
3675  build_word_op_espace:
3676   re_free (sbcset);
3677 #ifdef RE_ENABLE_I18N
3678   free_charset (mbcset);
3679 #endif /* RE_ENABLE_I18N */
3680   *err = REG_ESPACE;
3681   return NULL;
3682 }
3683
3684 /* This is intended for the expressions like "a{1,3}".
3685    Fetch a number from `input', and return the number.
3686    Return -1, if the number field is empty like "{,1}".
3687    Return -2, If an error is occured.  */
3688
3689 static int
3690 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3691 {
3692   int num = -1;
3693   unsigned char c;
3694   while (1)
3695     {
3696       fetch_token (token, input, syntax);
3697       c = token->opr.c;
3698       if (BE (token->type == END_OF_RE, 0))
3699         return -2;
3700       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3701         break;
3702       num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3703              ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3704       num = (num > RE_DUP_MAX) ? -2 : num;
3705     }
3706   return num;
3707 }
3708 \f
3709 #ifdef RE_ENABLE_I18N
3710 static void
3711 free_charset (re_charset_t *cset)
3712 {
3713   re_free (cset->mbchars);
3714 # ifdef _LIBC
3715   re_free (cset->coll_syms);
3716   re_free (cset->equiv_classes);
3717   re_free (cset->range_starts);
3718   re_free (cset->range_ends);
3719 # endif
3720   re_free (cset->char_classes);
3721   re_free (cset);
3722 }
3723 #endif /* RE_ENABLE_I18N */
3724 \f
3725 /* Functions for binary tree operation.  */
3726
3727 /* Create a tree node.  */
3728
3729 static bin_tree_t *
3730 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3731              re_token_type_t type)
3732 {
3733   re_token_t t;
3734   t.type = type;
3735   return create_token_tree (dfa, left, right, &t);
3736 }
3737
3738 static bin_tree_t *
3739 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3740                    const re_token_t *token)
3741 {
3742   bin_tree_t *tree;
3743   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3744     {
3745       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3746
3747       if (storage == NULL)
3748         return NULL;
3749       storage->next = dfa->str_tree_storage;
3750       dfa->str_tree_storage = storage;
3751       dfa->str_tree_storage_idx = 0;
3752     }
3753   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3754
3755   tree->parent = NULL;
3756   tree->left = left;
3757   tree->right = right;
3758   tree->token = *token;
3759   tree->token.duplicated = 0;
3760   tree->token.opt_subexp = 0;
3761   tree->first = NULL;
3762   tree->next = NULL;
3763   tree->node_idx = -1;
3764
3765   if (left != NULL)
3766     left->parent = tree;
3767   if (right != NULL)
3768     right->parent = tree;
3769   return tree;
3770 }
3771
3772 /* Mark the tree SRC as an optional subexpression.
3773    To be called from preorder or postorder.  */
3774
3775 static reg_errcode_t
3776 mark_opt_subexp (void *extra, bin_tree_t *node)
3777 {
3778   int idx = (int) (long) extra;
3779   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3780     node->token.opt_subexp = 1;
3781
3782   return REG_NOERROR;
3783 }
3784
3785 /* Free the allocated memory inside NODE. */
3786
3787 static void
3788 free_token (re_token_t *node)
3789 {
3790 #ifdef RE_ENABLE_I18N
3791   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3792     free_charset (node->opr.mbcset);
3793   else
3794 #endif /* RE_ENABLE_I18N */
3795     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3796       re_free (node->opr.sbcset);
3797 }
3798
3799 /* Worker function for tree walking.  Free the allocated memory inside NODE
3800    and its children. */
3801
3802 static reg_errcode_t
3803 free_tree (void *extra, bin_tree_t *node)
3804 {
3805   free_token (&node->token);
3806   return REG_NOERROR;
3807 }
3808
3809
3810 /* Duplicate the node SRC, and return new node.  This is a preorder
3811    visit similar to the one implemented by the generic visitor, but
3812    we need more infrastructure to maintain two parallel trees --- so,
3813    it's easier to duplicate.  */
3814
3815 static bin_tree_t *
3816 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3817 {
3818   const bin_tree_t *node;
3819   bin_tree_t *dup_root;
3820   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3821
3822   for (node = root; ; )
3823     {
3824       /* Create a new tree and link it back to the current parent.  */
3825       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3826       if (*p_new == NULL)
3827         return NULL;
3828       (*p_new)->parent = dup_node;
3829       (*p_new)->token.duplicated = 1;
3830       dup_node = *p_new;
3831
3832       /* Go to the left node, or up and to the right.  */
3833       if (node->left)
3834         {
3835           node = node->left;
3836           p_new = &dup_node->left;
3837         }
3838       else
3839         {
3840           const bin_tree_t *prev = NULL;
3841           while (node->right == prev || node->right == NULL)
3842             {
3843               prev = node;
3844               node = node->parent;
3845               dup_node = dup_node->parent;
3846               if (!node)
3847                 return dup_root;
3848             }
3849           node = node->right;
3850           p_new = &dup_node->right;
3851         }
3852     }
3853 }