gl/regcomp.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002,2003,2004,2005,2006 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2, or (at your option)
   9    any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License along
  17    with this program; if not, write to the Free Software Foundation,
  18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  19
  20 static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
  21                                           size_t length, reg_syntax_t syntax);
  22 static void re_compile_fastmap_iter (regex_t *bufp,
  23                                      const re_dfastate_t *init_state,
  24                                      char *fastmap);
  25 static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
  26 #ifdef RE_ENABLE_I18N
  27 static void free_charset (re_charset_t *cset);
  28 #endif /* RE_ENABLE_I18N */
  29 static void free_workarea_compile (regex_t *preg);
  30 static reg_errcode_t create_initial_state (re_dfa_t *dfa);
  31 #ifdef RE_ENABLE_I18N
  32 static void optimize_utf8 (re_dfa_t *dfa);
  33 #endif
  34 static reg_errcode_t analyze (regex_t *preg);
  35 static reg_errcode_t preorder (bin_tree_t *root,
  36                                reg_errcode_t (fn (void *, bin_tree_t *)),
  37                                void *extra);
  38 static reg_errcode_t postorder (bin_tree_t *root,
  39                                 reg_errcode_t (fn (void *, bin_tree_t *)),
  40                                 void *extra);
  41 static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
  42 static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
  43 static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
  44                                  bin_tree_t *node);
  45 static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
  46 static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
  47 static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
  48 static Idx duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint);
  49 static Idx search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
  50                                    unsigned int constraint);
  51 static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
  52 static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
  53                                          Idx node, bool root);
  54 static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
  55 static Idx fetch_number (re_string_t *input, re_token_t *token,
  56                          reg_syntax_t syntax);
  57 static int peek_token (re_token_t *token, re_string_t *input,
  58                         reg_syntax_t syntax) internal_function;
  59 static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
  60                           reg_syntax_t syntax, reg_errcode_t *err);
  61 static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
  62                                   re_token_t *token, reg_syntax_t syntax,
  63                                   Idx nest, reg_errcode_t *err);
  64 static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
  65                                  re_token_t *token, reg_syntax_t syntax,
  66                                  Idx nest, reg_errcode_t *err);
  67 static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
  68                                      re_token_t *token, reg_syntax_t syntax,
  69                                      Idx nest, reg_errcode_t *err);
  70 static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
  71                                   re_token_t *token, reg_syntax_t syntax,
  72                                   Idx nest, reg_errcode_t *err);
  73 static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
  74                                  re_dfa_t *dfa, re_token_t *token,
  75                                  reg_syntax_t syntax, reg_errcode_t *err);
  76 static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
  77                                       re_token_t *token, reg_syntax_t syntax,
  78                                       reg_errcode_t *err);
  79 static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
  80                                             re_string_t *regexp,
  81                                             re_token_t *token, int token_len,
  82                                             re_dfa_t *dfa,
  83                                             reg_syntax_t syntax,
  84                                             bool accept_hyphen);
  85 static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
  86                                           re_string_t *regexp,
  87                                           re_token_t *token);
  88 #ifdef RE_ENABLE_I18N
  89 static reg_errcode_t build_equiv_class (bitset_t sbcset,
  90                                         re_charset_t *mbcset,
  91                                         Idx *equiv_class_alloc,
  92                                         const unsigned char *name);
  93 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
  94                                       bitset_t sbcset,
  95                                       re_charset_t *mbcset,
  96                                       Idx *char_class_alloc,
  97                                       const unsigned char *class_name,
  98                                       reg_syntax_t syntax);
  99 #else  /* not RE_ENABLE_I18N */
 100 static reg_errcode_t build_equiv_class (bitset_t sbcset,
 101                                         const unsigned char *name);
 102 static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
 103                                       bitset_t sbcset,
 104                                       const unsigned char *class_name,
 105                                       reg_syntax_t syntax);
 106 #endif /* not RE_ENABLE_I18N */
 107 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 108                                        RE_TRANSLATE_TYPE trans,
 109                                        const unsigned char *class_name,
 110                                        const unsigned char *extra,
 111                                        bool non_match, reg_errcode_t *err);
 112 static bin_tree_t *create_tree (re_dfa_t *dfa,
 113                                 bin_tree_t *left, bin_tree_t *right,
 114                                 re_token_type_t type);
 115 static bin_tree_t *create_token_tree (re_dfa_t *dfa,
 116                                       bin_tree_t *left, bin_tree_t *right,
 117                                       const re_token_t *token);
 118 static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
 119 static void free_token (re_token_t *node);
 120 static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
 121 static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
 122 \f
 123 /* This table gives an error message for each of the error codes listed
 124    in regex.h.  Obviously the order here has to be same as there.
 125    POSIX doesn't require that we do anything for REG_NOERROR,
 126    but why not be nice?  */
 127
 128 static const char __re_error_msgid[] =
 129   {
 130 #define REG_NOERROR_IDX 0
 131     gettext_noop ("Success")    /* REG_NOERROR */
 132     "\0"
 133 #define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
 134     gettext_noop ("No match")   /* REG_NOMATCH */
 135     "\0"
 136 #define REG_BADPAT_IDX  (REG_NOMATCH_IDX + sizeof "No match")
 137     gettext_noop ("Invalid regular expression") /* REG_BADPAT */
 138     "\0"
 139 #define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
 140     gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
 141     "\0"
 142 #define REG_ECTYPE_IDX  (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
 143     gettext_noop ("Invalid character class name") /* REG_ECTYPE */
 144     "\0"
 145 #define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
 146     gettext_noop ("Trailing backslash") /* REG_EESCAPE */
 147     "\0"
 148 #define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
 149     gettext_noop ("Invalid back reference") /* REG_ESUBREG */
 150     "\0"
 151 #define REG_EBRACK_IDX  (REG_ESUBREG_IDX + sizeof "Invalid back reference")
 152     gettext_noop ("Unmatched [ or [^")  /* REG_EBRACK */
 153     "\0"
 154 #define REG_EPAREN_IDX  (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
 155     gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
 156     "\0"
 157 #define REG_EBRACE_IDX  (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
 158     gettext_noop ("Unmatched \\{") /* REG_EBRACE */
 159     "\0"
 160 #define REG_BADBR_IDX   (REG_EBRACE_IDX + sizeof "Unmatched \\{")
 161     gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
 162     "\0"
 163 #define REG_ERANGE_IDX  (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
 164     gettext_noop ("Invalid range end")  /* REG_ERANGE */
 165     "\0"
 166 #define REG_ESPACE_IDX  (REG_ERANGE_IDX + sizeof "Invalid range end")
 167     gettext_noop ("Memory exhausted") /* REG_ESPACE */
 168     "\0"
 169 #define REG_BADRPT_IDX  (REG_ESPACE_IDX + sizeof "Memory exhausted")
 170     gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
 171     "\0"
 172 #define REG_EEND_IDX    (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
 173     gettext_noop ("Premature end of regular expression") /* REG_EEND */
 174     "\0"
 175 #define REG_ESIZE_IDX   (REG_EEND_IDX + sizeof "Premature end of regular expression")
 176     gettext_noop ("Regular expression too big") /* REG_ESIZE */
 177     "\0"
 178 #define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
 179     gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
 180   };
 181
 182 static const size_t __re_error_msgid_idx[] =
 183   {
 184     REG_NOERROR_IDX,
 185     REG_NOMATCH_IDX,
 186     REG_BADPAT_IDX,
 187     REG_ECOLLATE_IDX,
 188     REG_ECTYPE_IDX,
 189     REG_EESCAPE_IDX,
 190     REG_ESUBREG_IDX,
 191     REG_EBRACK_IDX,
 192     REG_EPAREN_IDX,
 193     REG_EBRACE_IDX,
 194     REG_BADBR_IDX,
 195     REG_ERANGE_IDX,
 196     REG_ESPACE_IDX,
 197     REG_BADRPT_IDX,
 198     REG_EEND_IDX,
 199     REG_ESIZE_IDX,
 200     REG_ERPAREN_IDX
 201   };
 202 \f
 203 /* Entry points for GNU code.  */
 204
 205 /* re_compile_pattern is the GNU regular expression compiler: it
 206    compiles PATTERN (of length LENGTH) and puts the result in BUFP.
 207    Returns 0 if the pattern was valid, otherwise an error string.
 208
 209    Assumes the `allocated' (and perhaps `buffer') and `translate' fields
 210    are set in BUFP on entry.  */
 211
 212 #ifdef _LIBC
 213 const char *
 214 re_compile_pattern (pattern, length, bufp)
 215     const char *pattern;
 216     size_t length;
 217     struct re_pattern_buffer *bufp;
 218 #else /* size_t might promote */
 219 const char *
 220 re_compile_pattern (const char *pattern, size_t length,
 221                     struct re_pattern_buffer *bufp)
 222 #endif
 223 {
 224   reg_errcode_t ret;
 225
 226   /* And GNU code determines whether or not to get register information
 227      by passing null for the REGS argument to re_match, etc., not by
 228      setting no_sub, unless RE_NO_SUB is set.  */
 229   bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 230
 231   /* Match anchors at newline.  */
 232   bufp->newline_anchor = 1;
 233
 234   ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
 235
 236   if (!ret)
 237     return NULL;
 238   return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 239 }
 240 #ifdef _LIBC
 241 weak_alias (__re_compile_pattern, re_compile_pattern)
 242 #endif
 243
 244 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
 245    also be assigned to arbitrarily: each pattern buffer stores its own
 246    syntax, so it can be changed between regex compilations.  */
 247 /* This has no initializer because initialized variables in Emacs
 248    become read-only after dumping.  */
 249 reg_syntax_t re_syntax_options;
 250
 251
 252 /* Specify the precise syntax of regexps for compilation.  This provides
 253    for compatibility for various utilities which historically have
 254    different, incompatible syntaxes.
 255
 256    The argument SYNTAX is a bit mask comprised of the various bits
 257    defined in regex.h.  We return the old syntax.  */
 258
 259 reg_syntax_t
 260 re_set_syntax (syntax)
 261     reg_syntax_t syntax;
 262 {
 263   reg_syntax_t ret = re_syntax_options;
 264
 265   re_syntax_options = syntax;
 266   return ret;
 267 }
 268 #ifdef _LIBC
 269 weak_alias (__re_set_syntax, re_set_syntax)
 270 #endif
 271
 272 int
 273 re_compile_fastmap (bufp)
 274     struct re_pattern_buffer *bufp;
 275 {
 276   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 277   char *fastmap = bufp->fastmap;
 278
 279   memset (fastmap, '\0', sizeof (char) * SBC_MAX);
 280   re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
 281   if (dfa->init_state != dfa->init_state_word)
 282     re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
 283   if (dfa->init_state != dfa->init_state_nl)
 284     re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
 285   if (dfa->init_state != dfa->init_state_begbuf)
 286     re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
 287   bufp->fastmap_accurate = 1;
 288   return 0;
 289 }
 290 #ifdef _LIBC
 291 weak_alias (__re_compile_fastmap, re_compile_fastmap)
 292 #endif
 293
 294 static inline void
 295 __attribute ((always_inline))
 296 re_set_fastmap (char *fastmap, bool icase, int ch)
 297 {
 298   fastmap[ch] = 1;
 299   if (icase)
 300     fastmap[tolower (ch)] = 1;
 301 }
 302
 303 /* Helper function for re_compile_fastmap.
 304    Compile fastmap for the initial_state INIT_STATE.  */
 305
 306 static void
 307 re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
 308                          char *fastmap)
 309 {
 310   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 311   Idx node_cnt;
 312   bool icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
 313   for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
 314     {
 315       Idx node = init_state->nodes.elems[node_cnt];
 316       re_token_type_t type = dfa->nodes[node].type;
 317
 318       if (type == CHARACTER)
 319         {
 320           re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
 321 #ifdef RE_ENABLE_I18N
 322           if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 323             {
 324               unsigned char buf[MB_LEN_MAX];
 325               unsigned char *p;
 326               wchar_t wc;
 327               mbstate_t state;
 328
 329               p = buf;
 330               *p++ = dfa->nodes[node].opr.c;
 331               while (++node < dfa->nodes_len
 332                      && dfa->nodes[node].type == CHARACTER
 333                      && dfa->nodes[node].mb_partial)
 334                 *p++ = dfa->nodes[node].opr.c;
 335               memset (&state, '\0', sizeof (state));
 336               if (mbrtowc (&wc, (const char *) buf, p - buf,
 337                            &state) == p - buf
 338                   && (__wcrtomb ((char *) buf, towlower (wc), &state)
 339                       != (size_t) -1))
 340                 re_set_fastmap (fastmap, false, buf[0]);
 341             }
 342 #endif
 343         }
 344       else if (type == SIMPLE_BRACKET)
 345         {
 346           int i, ch;
 347           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 348             {
 349               int j;
 350               bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
 351               for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 352                 if (w & ((bitset_word_t) 1 << j))
 353                   re_set_fastmap (fastmap, icase, ch);
 354             }
 355         }
 356 #ifdef RE_ENABLE_I18N
 357       else if (type == COMPLEX_BRACKET)
 358         {
 359           Idx i;
 360           re_charset_t *cset = dfa->nodes[node].opr.mbcset;
 361           if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
 362               || cset->nranges || cset->nchar_classes)
 363             {
 364 # ifdef _LIBC
 365               if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
 366                 {
 367                   /* In this case we want to catch the bytes which are
 368                      the first byte of any collation elements.
 369                      e.g. In da_DK, we want to catch 'a' since "aa"
 370                           is a valid collation element, and don't catch
 371                           'b' since 'b' is the only collation element
 372                           which starts from 'b'.  */
 373                   const int32_t *table = (const int32_t *)
 374                     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
 375                   for (i = 0; i < SBC_MAX; ++i)
 376                     if (table[i] < 0)
 377                       re_set_fastmap (fastmap, icase, i);
 378                 }
 379 # else
 380               if (dfa->mb_cur_max > 1)
 381                 for (i = 0; i < SBC_MAX; ++i)
 382                   if (__btowc (i) == WEOF)
 383                     re_set_fastmap (fastmap, icase, i);
 384 # endif /* not _LIBC */
 385             }
 386           for (i = 0; i < cset->nmbchars; ++i)
 387             {
 388               char buf[256];
 389               mbstate_t state;
 390               memset (&state, '\0', sizeof (state));
 391               if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
 392                 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
 393               if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
 394                 {
 395                   if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
 396                       != (size_t) -1)
 397                     re_set_fastmap (fastmap, false, *(unsigned char *) buf);
 398                 }
 399             }
 400         }
 401 #endif /* RE_ENABLE_I18N */
 402       else if (type == OP_PERIOD
 403 #ifdef RE_ENABLE_I18N
 404                || type == OP_UTF8_PERIOD
 405 #endif /* RE_ENABLE_I18N */
 406                || type == END_OF_RE)
 407         {
 408           memset (fastmap, '\1', sizeof (char) * SBC_MAX);
 409           if (type == END_OF_RE)
 410             bufp->can_be_null = 1;
 411           return;
 412         }
 413     }
 414 }
 415 \f
 416 /* Entry point for POSIX code.  */
 417 /* regcomp takes a regular expression as a string and compiles it.
 418
 419    PREG is a regex_t *.  We do not expect any fields to be initialized,
 420    since POSIX says we shouldn't.  Thus, we set
 421
 422      `buffer' to the compiled pattern;
 423      `used' to the length of the compiled pattern;
 424      `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
 425        REG_EXTENDED bit in CFLAGS is set; otherwise, to
 426        RE_SYNTAX_POSIX_BASIC;
 427      `newline_anchor' to REG_NEWLINE being set in CFLAGS;
 428      `fastmap' to an allocated space for the fastmap;
 429      `fastmap_accurate' to zero;
 430      `re_nsub' to the number of subexpressions in PATTERN.
 431
 432    PATTERN is the address of the pattern string.
 433
 434    CFLAGS is a series of bits which affect compilation.
 435
 436      If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 437      use POSIX basic syntax.
 438
 439      If REG_NEWLINE is set, then . and [^...] don't match newline.
 440      Also, regexec will try a match beginning after every newline.
 441
 442      If REG_ICASE is set, then we considers upper- and lowercase
 443      versions of letters to be equivalent when matching.
 444
 445      If REG_NOSUB is set, then when PREG is passed to regexec, that
 446      routine will report only success or failure, and nothing about the
 447      registers.
 448
 449    It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 450    the return codes and their meanings.)  */
 451
 452 int
 453 regcomp (preg, pattern, cflags)
 454     regex_t *__restrict preg;
 455     const char *__restrict pattern;
 456     int cflags;
 457 {
 458   reg_errcode_t ret;
 459   reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
 460                          : RE_SYNTAX_POSIX_BASIC);
 461
 462   preg->buffer = NULL;
 463   preg->allocated = 0;
 464   preg->used = 0;
 465
 466   /* Try to allocate space for the fastmap.  */
 467   preg->fastmap = re_malloc (char, SBC_MAX);
 468   if (BE (preg->fastmap == NULL, 0))
 469     return REG_ESPACE;
 470
 471   syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
 472
 473   /* If REG_NEWLINE is set, newlines are treated differently.  */
 474   if (cflags & REG_NEWLINE)
 475     { /* REG_NEWLINE implies neither . nor [^...] match newline.  */
 476       syntax &= ~RE_DOT_NEWLINE;
 477       syntax |= RE_HAT_LISTS_NOT_NEWLINE;
 478       /* It also changes the matching behavior.  */
 479       preg->newline_anchor = 1;
 480     }
 481   else
 482     preg->newline_anchor = 0;
 483   preg->no_sub = !!(cflags & REG_NOSUB);
 484   preg->translate = NULL;
 485
 486   ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
 487
 488   /* POSIX doesn't distinguish between an unmatched open-group and an
 489      unmatched close-group: both are REG_EPAREN.  */
 490   if (ret == REG_ERPAREN)
 491     ret = REG_EPAREN;
 492
 493   /* We have already checked preg->fastmap != NULL.  */
 494   if (BE (ret == REG_NOERROR, 1))
 495     /* Compute the fastmap now, since regexec cannot modify the pattern
 496        buffer.  This function never fails in this implementation.  */
 497     (void) re_compile_fastmap (preg);
 498   else
 499     {
 500       /* Some error occurred while compiling the expression.  */
 501       re_free (preg->fastmap);
 502       preg->fastmap = NULL;
 503     }
 504
 505   return (int) ret;
 506 }
 507 #ifdef _LIBC
 508 weak_alias (__regcomp, regcomp)
 509 #endif
 510
 511 /* Returns a message corresponding to an error code, ERRCODE, returned
 512    from either regcomp or regexec.   We don't use PREG here.  */
 513
 514 #ifdef _LIBC
 515 size_t
 516 regerror (errcode, preg, errbuf, errbuf_size)
 517     int errcode;
 518     const regex_t *__restrict preg;
 519     char *__restrict errbuf;
 520     size_t errbuf_size;
 521 #else /* size_t might promote */
 522 size_t
 523 regerror (int errcode, const regex_t *__restrict preg,
 524           char *__restrict errbuf, size_t errbuf_size)
 525 #endif
 526 {
 527   const char *msg;
 528   size_t msg_size;
 529
 530   if (BE (errcode < 0
 531           || errcode >= (int) (sizeof (__re_error_msgid_idx)
 532                                / sizeof (__re_error_msgid_idx[0])), 0))
 533     /* Only error codes returned by the rest of the code should be passed
 534        to this routine.  If we are given anything else, or if other regex
 535        code generates an invalid error code, then the program has a bug.
 536        Dump core so we can fix it.  */
 537     abort ();
 538
 539   msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
 540
 541   msg_size = strlen (msg) + 1; /* Includes the null.  */
 542
 543   if (BE (errbuf_size != 0, 1))
 544     {
 545       if (BE (msg_size > errbuf_size, 0))
 546         {
 547 #if defined HAVE_MEMPCPY || defined _LIBC
 548           *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
 549 #else
 550           memcpy (errbuf, msg, errbuf_size - 1);
 551           errbuf[errbuf_size - 1] = 0;
 552 #endif
 553         }
 554       else
 555         memcpy (errbuf, msg, msg_size);
 556     }
 557
 558   return msg_size;
 559 }
 560 #ifdef _LIBC
 561 weak_alias (__regerror, regerror)
 562 #endif
 563
 564
 565 #ifdef RE_ENABLE_I18N
 566 /* This static array is used for the map to single-byte characters when
 567    UTF-8 is used.  Otherwise we would allocate memory just to initialize
 568    it the same all the time.  UTF-8 is the preferred encoding so this is
 569    a worthwhile optimization.  */
 570 static const bitset_t utf8_sb_map =
 571 {
 572   /* Set the first 128 bits.  */
 573 # if 4 * BITSET_WORD_BITS < ASCII_CHARS
 574 #  error "bitset_word_t is narrower than 32 bits"
 575 # elif 3 * BITSET_WORD_BITS < ASCII_CHARS
 576   BITSET_WORD_MAX, BITSET_WORD_MAX, BITSET_WORD_MAX,
 577 # elif 2 * BITSET_WORD_BITS < ASCII_CHARS
 578   BITSET_WORD_MAX, BITSET_WORD_MAX,
 579 # elif 1 * BITSET_WORD_BITS < ASCII_CHARS
 580   BITSET_WORD_MAX,
 581 # endif
 582   (BITSET_WORD_MAX
 583    >> (SBC_MAX % BITSET_WORD_BITS == 0
 584        ? 0
 585        : BITSET_WORD_BITS - SBC_MAX % BITSET_WORD_BITS))
 586 };
 587 #endif
 588
 589
 590 static void
 591 free_dfa_content (re_dfa_t *dfa)
 592 {
 593   Idx i, j;
 594
 595   if (dfa->nodes)
 596     for (i = 0; i < dfa->nodes_len; ++i)
 597       free_token (dfa->nodes + i);
 598   re_free (dfa->nexts);
 599   for (i = 0; i < dfa->nodes_len; ++i)
 600     {
 601       if (dfa->eclosures != NULL)
 602         re_node_set_free (dfa->eclosures + i);
 603       if (dfa->inveclosures != NULL)
 604         re_node_set_free (dfa->inveclosures + i);
 605       if (dfa->edests != NULL)
 606         re_node_set_free (dfa->edests + i);
 607     }
 608   re_free (dfa->edests);
 609   re_free (dfa->eclosures);
 610   re_free (dfa->inveclosures);
 611   re_free (dfa->nodes);
 612
 613   if (dfa->state_table)
 614     for (i = 0; i <= dfa->state_hash_mask; ++i)
 615       {
 616         struct re_state_table_entry *entry = dfa->state_table + i;
 617         for (j = 0; j < entry->num; ++j)
 618           {
 619             re_dfastate_t *state = entry->array[j];
 620             free_state (state);
 621           }
 622         re_free (entry->array);
 623       }
 624   re_free (dfa->state_table);
 625 #ifdef RE_ENABLE_I18N
 626   if (dfa->sb_char != utf8_sb_map)
 627     re_free (dfa->sb_char);
 628 #endif
 629   re_free (dfa->subexp_map);
 630 #ifdef DEBUG
 631   re_free (dfa->re_str);
 632 #endif
 633
 634   re_free (dfa);
 635 }
 636
 637
 638 /* Free dynamically allocated space used by PREG.  */
 639
 640 void
 641 regfree (preg)
 642     regex_t *preg;
 643 {
 644   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 645   if (BE (dfa != NULL, 1))
 646     free_dfa_content (dfa);
 647   preg->buffer = NULL;
 648   preg->allocated = 0;
 649
 650   re_free (preg->fastmap);
 651   preg->fastmap = NULL;
 652
 653   re_free (preg->translate);
 654   preg->translate = NULL;
 655 }
 656 #ifdef _LIBC
 657 weak_alias (__regfree, regfree)
 658 #endif
 659 \f
 660 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 661    them unless specifically requested.  */
 662
 663 #if defined _REGEX_RE_COMP || defined _LIBC
 664
 665 /* BSD has one and only one pattern buffer.  */
 666 static struct re_pattern_buffer re_comp_buf;
 667
 668 char *
 669 # ifdef _LIBC
 670 /* Make these definitions weak in libc, so POSIX programs can redefine
 671    these names if they don't use our functions, and still use
 672    regcomp/regexec above without link errors.  */
 673 weak_function
 674 # endif
 675 re_comp (s)
 676      const char *s;
 677 {
 678   reg_errcode_t ret;
 679   char *fastmap;
 680
 681   if (!s)
 682     {
 683       if (!re_comp_buf.buffer)
 684         return gettext ("No previous regular expression");
 685       return 0;
 686     }
 687
 688   if (re_comp_buf.buffer)
 689     {
 690       fastmap = re_comp_buf.fastmap;
 691       re_comp_buf.fastmap = NULL;
 692       __regfree (&re_comp_buf);
 693       memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
 694       re_comp_buf.fastmap = fastmap;
 695     }
 696
 697   if (re_comp_buf.fastmap == NULL)
 698     {
 699       re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
 700       if (re_comp_buf.fastmap == NULL)
 701         return (char *) gettext (__re_error_msgid
 702                                  + __re_error_msgid_idx[(int) REG_ESPACE]);
 703     }
 704
 705   /* Since `re_exec' always passes NULL for the `regs' argument, we
 706      don't need to initialize the pattern buffer fields which affect it.  */
 707
 708   /* Match anchors at newlines.  */
 709   re_comp_buf.newline_anchor = 1;
 710
 711   ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
 712
 713   if (!ret)
 714     return NULL;
 715
 716   /* Yes, we're discarding `const' here if !HAVE_LIBINTL.  */
 717   return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
 718 }
 719
 720 #ifdef _LIBC
 721 libc_freeres_fn (free_mem)
 722 {
 723   __regfree (&re_comp_buf);
 724 }
 725 #endif
 726
 727 #endif /* _REGEX_RE_COMP */
 728 \f
 729 /* Internal entry point.
 730    Compile the regular expression PATTERN, whose length is LENGTH.
 731    SYNTAX indicate regular expression's syntax.  */
 732
 733 static reg_errcode_t
 734 re_compile_internal (regex_t *preg, const char * pattern, size_t length,
 735                      reg_syntax_t syntax)
 736 {
 737   reg_errcode_t err = REG_NOERROR;
 738   re_dfa_t *dfa;
 739   re_string_t regexp;
 740
 741   /* Initialize the pattern buffer.  */
 742   preg->fastmap_accurate = 0;
 743   preg->syntax = syntax;
 744   preg->not_bol = preg->not_eol = 0;
 745   preg->used = 0;
 746   preg->re_nsub = 0;
 747   preg->can_be_null = 0;
 748   preg->regs_allocated = REGS_UNALLOCATED;
 749
 750   /* Initialize the dfa.  */
 751   dfa = (re_dfa_t *) preg->buffer;
 752   if (BE (preg->allocated < sizeof (re_dfa_t), 0))
 753     {
 754       /* If zero allocated, but buffer is non-null, try to realloc
 755          enough space.  This loses if buffer's address is bogus, but
 756          that is the user's responsibility.  If ->buffer is NULL this
 757          is a simple allocation.  */
 758       dfa = re_realloc (preg->buffer, re_dfa_t, 1);
 759       if (dfa == NULL)
 760         return REG_ESPACE;
 761       preg->allocated = sizeof (re_dfa_t);
 762       preg->buffer = (unsigned char *) dfa;
 763     }
 764   preg->used = sizeof (re_dfa_t);
 765
 766   err = init_dfa (dfa, length);
 767   if (BE (err != REG_NOERROR, 0))
 768     {
 769       free_dfa_content (dfa);
 770       preg->buffer = NULL;
 771       preg->allocated = 0;
 772       return err;
 773     }
 774 #ifdef DEBUG
 775   /* Note: length+1 will not overflow since it is checked in init_dfa.  */
 776   dfa->re_str = re_malloc (char, length + 1);
 777   strncpy (dfa->re_str, pattern, length + 1);
 778 #endif
 779
 780   __libc_lock_init (dfa->lock);
 781
 782   err = re_string_construct (&regexp, pattern, length, preg->translate,
 783                              syntax & RE_ICASE, dfa);
 784   if (BE (err != REG_NOERROR, 0))
 785     {
 786     re_compile_internal_free_return:
 787       free_workarea_compile (preg);
 788       re_string_destruct (&regexp);
 789       free_dfa_content (dfa);
 790       preg->buffer = NULL;
 791       preg->allocated = 0;
 792       return err;
 793     }
 794
 795   /* Parse the regular expression, and build a structure tree.  */
 796   preg->re_nsub = 0;
 797   dfa->str_tree = parse (&regexp, preg, syntax, &err);
 798   if (BE (dfa->str_tree == NULL, 0))
 799     goto re_compile_internal_free_return;
 800
 801   /* Analyze the tree and create the nfa.  */
 802   err = analyze (preg);
 803   if (BE (err != REG_NOERROR, 0))
 804     goto re_compile_internal_free_return;
 805
 806 #ifdef RE_ENABLE_I18N
 807   /* If possible, do searching in single byte encoding to speed things up.  */
 808   if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
 809     optimize_utf8 (dfa);
 810 #endif
 811
 812   /* Then create the initial state of the dfa.  */
 813   err = create_initial_state (dfa);
 814
 815   /* Release work areas.  */
 816   free_workarea_compile (preg);
 817   re_string_destruct (&regexp);
 818
 819   if (BE (err != REG_NOERROR, 0))
 820     {
 821       free_dfa_content (dfa);
 822       preg->buffer = NULL;
 823       preg->allocated = 0;
 824     }
 825
 826   return err;
 827 }
 828
 829 /* Initialize DFA.  We use the length of the regular expression PAT_LEN
 830    as the initial length of some arrays.  */
 831
 832 static reg_errcode_t
 833 init_dfa (re_dfa_t *dfa, size_t pat_len)
 834 {
 835   __re_size_t table_size;
 836 #ifndef _LIBC
 837   char *codeset_name;
 838 #endif
 839 #ifdef RE_ENABLE_I18N
 840   size_t max_i18n_object_size = MAX (sizeof (wchar_t), sizeof (wctype_t));
 841 #else
 842   size_t max_i18n_object_size = 0;
 843 #endif
 844   size_t max_object_size =
 845     MAX (sizeof (struct re_state_table_entry),
 846          MAX (sizeof (re_token_t),
 847               MAX (sizeof (re_node_set),
 848                    MAX (sizeof (regmatch_t),
 849                         max_i18n_object_size))));
 850
 851   memset (dfa, '\0', sizeof (re_dfa_t));
 852
 853   /* Force allocation of str_tree_storage the first time.  */
 854   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 855
 856   /* Avoid overflows.  The extra "/ 2" is for the table_size doubling
 857      calculation below, and for similar doubling calculations
 858      elsewhere.  And it's <= rather than <, because some of the
 859      doubling calculations add 1 afterwards.  */
 860   if (BE (SIZE_MAX / max_object_size / 2 <= pat_len, 0))
 861     return REG_ESPACE;
 862
 863   dfa->nodes_alloc = pat_len + 1;
 864   dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
 865
 866   /*  table_size = 2 ^ ceil(log pat_len) */
 867   for (table_size = 1; ; table_size <<= 1)
 868     if (table_size > pat_len)
 869       break;
 870
 871   dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
 872   dfa->state_hash_mask = table_size - 1;
 873
 874   dfa->mb_cur_max = MB_CUR_MAX;
 875 #ifdef _LIBC
 876   if (dfa->mb_cur_max == 6
 877       && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
 878     dfa->is_utf8 = 1;
 879   dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
 880                        != 0);
 881 #else
 882 # ifdef HAVE_LANGINFO_CODESET
 883   codeset_name = nl_langinfo (CODESET);
 884 # else
 885   codeset_name = getenv ("LC_ALL");
 886   if (codeset_name == NULL || codeset_name[0] == '\0')
 887     codeset_name = getenv ("LC_CTYPE");
 888   if (codeset_name == NULL || codeset_name[0] == '\0')
 889     codeset_name = getenv ("LANG");
 890   if (codeset_name == NULL)
 891     codeset_name = "";
 892   else if (strchr (codeset_name, '.') !=  NULL)
 893     codeset_name = strchr (codeset_name, '.') + 1;
 894 # endif
 895
 896   if (strcasecmp (codeset_name, "UTF-8") == 0
 897       || strcasecmp (codeset_name, "UTF8") == 0)
 898     dfa->is_utf8 = 1;
 899
 900   /* We check exhaustively in the loop below if this charset is a
 901      superset of ASCII.  */
 902   dfa->map_notascii = 0;
 903 #endif
 904
 905 #ifdef RE_ENABLE_I18N
 906   if (dfa->mb_cur_max > 1)
 907     {
 908       if (dfa->is_utf8)
 909         dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
 910       else
 911         {
 912           int i, j, ch;
 913
 914           dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
 915           if (BE (dfa->sb_char == NULL, 0))
 916             return REG_ESPACE;
 917
 918           /* Set the bits corresponding to single byte chars.  */
 919           for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 920             for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 921               {
 922                 wint_t wch = __btowc (ch);
 923                 if (wch != WEOF)
 924                   dfa->sb_char[i] |= (bitset_word_t) 1 << j;
 925 # ifndef _LIBC
 926                 if (isascii (ch) && wch != ch)
 927                   dfa->map_notascii = 1;
 928 # endif
 929               }
 930         }
 931     }
 932 #endif
 933
 934   if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
 935     return REG_ESPACE;
 936   return REG_NOERROR;
 937 }
 938
 939 /* Initialize WORD_CHAR table, which indicate which character is
 940    "word".  In this case "word" means that it is the word construction
 941    character used by some operators like "\<", "\>", etc.  */
 942
 943 static void
 944 internal_function
 945 init_word_char (re_dfa_t *dfa)
 946 {
 947   int i, j, ch;
 948   dfa->word_ops_used = 1;
 949   for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
 950     for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
 951       if (isalnum (ch) || ch == '_')
 952         dfa->word_char[i] |= (bitset_word_t) 1 << j;
 953 }
 954
 955 /* Free the work area which are only used while compiling.  */
 956
 957 static void
 958 free_workarea_compile (regex_t *preg)
 959 {
 960   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 961   bin_tree_storage_t *storage, *next;
 962   for (storage = dfa->str_tree_storage; storage; storage = next)
 963     {
 964       next = storage->next;
 965       re_free (storage);
 966     }
 967   dfa->str_tree_storage = NULL;
 968   dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
 969   dfa->str_tree = NULL;
 970   re_free (dfa->org_indices);
 971   dfa->org_indices = NULL;
 972 }
 973
 974 /* Create initial states for all contexts.  */
 975
 976 static reg_errcode_t
 977 create_initial_state (re_dfa_t *dfa)
 978 {
 979   Idx first, i;
 980   reg_errcode_t err;
 981   re_node_set init_nodes;
 982
 983   /* Initial states have the epsilon closure of the node which is
 984      the first node of the regular expression.  */
 985   first = dfa->str_tree->first->node_idx;
 986   dfa->init_node = first;
 987   err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
 988   if (BE (err != REG_NOERROR, 0))
 989     return err;
 990
 991   /* The back-references which are in initial states can epsilon transit,
 992      since in this case all of the subexpressions can be null.
 993      Then we add epsilon closures of the nodes which are the next nodes of
 994      the back-references.  */
 995   if (dfa->nbackref > 0)
 996     for (i = 0; i < init_nodes.nelem; ++i)
 997       {
 998         Idx node_idx = init_nodes.elems[i];
 999         re_token_type_t type = dfa->nodes[node_idx].type;
1000
1001         Idx clexp_idx;
1002         if (type != OP_BACK_REF)
1003           continue;
1004         for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
1005           {
1006             re_token_t *clexp_node;
1007             clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
1008             if (clexp_node->type == OP_CLOSE_SUBEXP
1009                 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
1010               break;
1011           }
1012         if (clexp_idx == init_nodes.nelem)
1013           continue;
1014
1015         if (type == OP_BACK_REF)
1016           {
1017             Idx dest_idx = dfa->edests[node_idx].elems[0];
1018             if (!re_node_set_contains (&init_nodes, dest_idx))
1019               {
1020                 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
1021                 i = 0;
1022               }
1023           }
1024       }
1025
1026   /* It must be the first time to invoke acquire_state.  */
1027   dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
1028   /* We don't check ERR here, since the initial state must not be NULL.  */
1029   if (BE (dfa->init_state == NULL, 0))
1030     return err;
1031   if (dfa->init_state->has_constraint)
1032     {
1033       dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1034                                                        CONTEXT_WORD);
1035       dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1036                                                      CONTEXT_NEWLINE);
1037       dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1038                                                          &init_nodes,
1039                                                          CONTEXT_NEWLINE
1040                                                          | CONTEXT_BEGBUF);
1041       if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1042               || dfa->init_state_begbuf == NULL, 0))
1043         return err;
1044     }
1045   else
1046     dfa->init_state_word = dfa->init_state_nl
1047       = dfa->init_state_begbuf = dfa->init_state;
1048
1049   re_node_set_free (&init_nodes);
1050   return REG_NOERROR;
1051 }
1052 \f
1053 #ifdef RE_ENABLE_I18N
1054 /* If it is possible to do searching in single byte encoding instead of UTF-8
1055    to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1056    DFA nodes where needed.  */
1057
1058 static void
1059 optimize_utf8 (re_dfa_t *dfa)
1060 {
1061   Idx node;
1062   int i;
1063   bool mb_chars = false;
1064   bool has_period = false;
1065
1066   for (node = 0; node < dfa->nodes_len; ++node)
1067     switch (dfa->nodes[node].type)
1068       {
1069       case CHARACTER:
1070         if (dfa->nodes[node].opr.c >= ASCII_CHARS)
1071           mb_chars = true;
1072         break;
1073       case ANCHOR:
1074         switch (dfa->nodes[node].opr.idx)
1075           {
1076           case LINE_FIRST:
1077           case LINE_LAST:
1078           case BUF_FIRST:
1079           case BUF_LAST:
1080             break;
1081           default:
1082             /* Word anchors etc. cannot be handled.  */
1083             return;
1084           }
1085         break;
1086       case OP_PERIOD:
1087         has_period = true;
1088         break;
1089       case OP_BACK_REF:
1090       case OP_ALT:
1091       case END_OF_RE:
1092       case OP_DUP_ASTERISK:
1093       case OP_OPEN_SUBEXP:
1094       case OP_CLOSE_SUBEXP:
1095         break;
1096       case COMPLEX_BRACKET:
1097         return;
1098       case SIMPLE_BRACKET:
1099         /* Just double check.  */
1100         {
1101           int rshift = (ASCII_CHARS % BITSET_WORD_BITS == 0
1102                         ? 0
1103                         : BITSET_WORD_BITS - ASCII_CHARS % BITSET_WORD_BITS);
1104           for (i = ASCII_CHARS / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1105             {
1106               if (dfa->nodes[node].opr.sbcset[i] >> rshift != 0)
1107                 return;
1108               rshift = 0;
1109             }
1110         }
1111         break;
1112       default:
1113         abort ();
1114       }
1115
1116   if (mb_chars || has_period)
1117     for (node = 0; node < dfa->nodes_len; ++node)
1118       {
1119         if (dfa->nodes[node].type == CHARACTER
1120             && dfa->nodes[node].opr.c >= ASCII_CHARS)
1121           dfa->nodes[node].mb_partial = 0;
1122         else if (dfa->nodes[node].type == OP_PERIOD)
1123           dfa->nodes[node].type = OP_UTF8_PERIOD;
1124       }
1125
1126   /* The search can be in single byte locale.  */
1127   dfa->mb_cur_max = 1;
1128   dfa->is_utf8 = 0;
1129   dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1130 }
1131 #endif
1132 \f
1133 /* Analyze the structure tree, and calculate "first", "next", "edest",
1134    "eclosure", and "inveclosure".  */
1135
1136 static reg_errcode_t
1137 analyze (regex_t *preg)
1138 {
1139   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1140   reg_errcode_t ret;
1141
1142   /* Allocate arrays.  */
1143   dfa->nexts = re_malloc (Idx, dfa->nodes_alloc);
1144   dfa->org_indices = re_malloc (Idx, dfa->nodes_alloc);
1145   dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1146   dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1147   if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1148           || dfa->eclosures == NULL, 0))
1149     return REG_ESPACE;
1150
1151   dfa->subexp_map = re_malloc (Idx, preg->re_nsub);
1152   if (dfa->subexp_map != NULL)
1153     {
1154       Idx i;
1155       for (i = 0; i < preg->re_nsub; i++)
1156         dfa->subexp_map[i] = i;
1157       preorder (dfa->str_tree, optimize_subexps, dfa);
1158       for (i = 0; i < preg->re_nsub; i++)
1159         if (dfa->subexp_map[i] != i)
1160           break;
1161       if (i == preg->re_nsub)
1162         {
1163           free (dfa->subexp_map);
1164           dfa->subexp_map = NULL;
1165         }
1166     }
1167
1168   ret = postorder (dfa->str_tree, lower_subexps, preg);
1169   if (BE (ret != REG_NOERROR, 0))
1170     return ret;
1171   ret = postorder (dfa->str_tree, calc_first, dfa);
1172   if (BE (ret != REG_NOERROR, 0))
1173     return ret;
1174   preorder (dfa->str_tree, calc_next, dfa);
1175   ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1176   if (BE (ret != REG_NOERROR, 0))
1177     return ret;
1178   ret = calc_eclosure (dfa);
1179   if (BE (ret != REG_NOERROR, 0))
1180     return ret;
1181
1182   /* We only need this during the prune_impossible_nodes pass in regexec.c;
1183      skip it if p_i_n will not run, as calc_inveclosure can be quadratic.  */
1184   if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1185       || dfa->nbackref)
1186     {
1187       dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1188       if (BE (dfa->inveclosures == NULL, 0))
1189         return REG_ESPACE;
1190       ret = calc_inveclosure (dfa);
1191     }
1192
1193   return ret;
1194 }
1195
1196 /* Our parse trees are very unbalanced, so we cannot use a stack to
1197    implement parse tree visits.  Instead, we use parent pointers and
1198    some hairy code in these two functions.  */
1199 static reg_errcode_t
1200 postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1201            void *extra)
1202 {
1203   bin_tree_t *node, *prev;
1204
1205   for (node = root; ; )
1206     {
1207       /* Descend down the tree, preferably to the left (or to the right
1208          if that's the only child).  */
1209       while (node->left || node->right)
1210         if (node->left)
1211           node = node->left;
1212         else
1213           node = node->right;
1214
1215       do
1216         {
1217           reg_errcode_t err = fn (extra, node);
1218           if (BE (err != REG_NOERROR, 0))
1219             return err;
1220           if (node->parent == NULL)
1221             return REG_NOERROR;
1222           prev = node;
1223           node = node->parent;
1224         }
1225       /* Go up while we have a node that is reached from the right.  */
1226       while (node->right == prev || node->right == NULL);
1227       node = node->right;
1228     }
1229 }
1230
1231 static reg_errcode_t
1232 preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1233           void *extra)
1234 {
1235   bin_tree_t *node;
1236
1237   for (node = root; ; )
1238     {
1239       reg_errcode_t err = fn (extra, node);
1240       if (BE (err != REG_NOERROR, 0))
1241         return err;
1242
1243       /* Go to the left node, or up and to the right.  */
1244       if (node->left)
1245         node = node->left;
1246       else
1247         {
1248           bin_tree_t *prev = NULL;
1249           while (node->right == prev || node->right == NULL)
1250             {
1251               prev = node;
1252               node = node->parent;
1253               if (!node)
1254                 return REG_NOERROR;
1255             }
1256           node = node->right;
1257         }
1258     }
1259 }
1260
1261 /* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1262    re_search_internal to map the inner one's opr.idx to this one's.  Adjust
1263    backreferences as well.  Requires a preorder visit.  */
1264 static reg_errcode_t
1265 optimize_subexps (void *extra, bin_tree_t *node)
1266 {
1267   re_dfa_t *dfa = (re_dfa_t *) extra;
1268
1269   if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1270     {
1271       int idx = node->token.opr.idx;
1272       node->token.opr.idx = dfa->subexp_map[idx];
1273       dfa->used_bkref_map |= 1 << node->token.opr.idx;
1274     }
1275
1276   else if (node->token.type == SUBEXP
1277            && node->left && node->left->token.type == SUBEXP)
1278     {
1279       Idx other_idx = node->left->token.opr.idx;
1280
1281       node->left = node->left->left;
1282       if (node->left)
1283         node->left->parent = node;
1284
1285       dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1286       if (other_idx < BITSET_WORD_BITS)
1287         dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1288     }
1289
1290   return REG_NOERROR;
1291 }
1292
1293 /* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1294    of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP.  */
1295 static reg_errcode_t
1296 lower_subexps (void *extra, bin_tree_t *node)
1297 {
1298   regex_t *preg = (regex_t *) extra;
1299   reg_errcode_t err = REG_NOERROR;
1300
1301   if (node->left && node->left->token.type == SUBEXP)
1302     {
1303       node->left = lower_subexp (&err, preg, node->left);
1304       if (node->left)
1305         node->left->parent = node;
1306     }
1307   if (node->right && node->right->token.type == SUBEXP)
1308     {
1309       node->right = lower_subexp (&err, preg, node->right);
1310       if (node->right)
1311         node->right->parent = node;
1312     }
1313
1314   return err;
1315 }
1316
1317 static bin_tree_t *
1318 lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1319 {
1320   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1321   bin_tree_t *body = node->left;
1322   bin_tree_t *op, *cls, *tree1, *tree;
1323
1324   if (preg->no_sub
1325       /* We do not optimize empty subexpressions, because otherwise we may
1326          have bad CONCAT nodes with NULL children.  This is obviously not
1327          very common, so we do not lose much.  An example that triggers
1328          this case is the sed "script" /\(\)/x.  */
1329       && node->left != NULL
1330       && (node->token.opr.idx >= BITSET_WORD_BITS
1331           || !(dfa->used_bkref_map
1332                & ((bitset_word_t) 1 << node->token.opr.idx))))
1333     return node->left;
1334
1335   /* Convert the SUBEXP node to the concatenation of an
1336      OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP.  */
1337   op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1338   cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1339   tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1340   tree = create_tree (dfa, op, tree1, CONCAT);
1341   if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1342     {
1343       *err = REG_ESPACE;
1344       return NULL;
1345     }
1346
1347   op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1348   op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1349   return tree;
1350 }
1351
1352 /* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1353    nodes.  Requires a postorder visit.  */
1354 static reg_errcode_t
1355 calc_first (void *extra, bin_tree_t *node)
1356 {
1357   re_dfa_t *dfa = (re_dfa_t *) extra;
1358   if (node->token.type == CONCAT)
1359     {
1360       node->first = node->left->first;
1361       node->node_idx = node->left->node_idx;
1362     }
1363   else
1364     {
1365       node->first = node;
1366       node->node_idx = re_dfa_add_node (dfa, node->token);
1367       if (BE (node->node_idx == REG_MISSING, 0))
1368         return REG_ESPACE;
1369     }
1370   return REG_NOERROR;
1371 }
1372
1373 /* Pass 2: compute NEXT on the tree.  Preorder visit.  */
1374 static reg_errcode_t
1375 calc_next (void *extra, bin_tree_t *node)
1376 {
1377   switch (node->token.type)
1378     {
1379     case OP_DUP_ASTERISK:
1380       node->left->next = node;
1381       break;
1382     case CONCAT:
1383       node->left->next = node->right->first;
1384       node->right->next = node->next;
1385       break;
1386     default:
1387       if (node->left)
1388         node->left->next = node->next;
1389       if (node->right)
1390         node->right->next = node->next;
1391       break;
1392     }
1393   return REG_NOERROR;
1394 }
1395
1396 /* Pass 3: link all DFA nodes to their NEXT node (any order will do).  */
1397 static reg_errcode_t
1398 link_nfa_nodes (void *extra, bin_tree_t *node)
1399 {
1400   re_dfa_t *dfa = (re_dfa_t *) extra;
1401   Idx idx = node->node_idx;
1402   reg_errcode_t err = REG_NOERROR;
1403
1404   switch (node->token.type)
1405     {
1406     case CONCAT:
1407       break;
1408
1409     case END_OF_RE:
1410       assert (node->next == NULL);
1411       break;
1412
1413     case OP_DUP_ASTERISK:
1414     case OP_ALT:
1415       {
1416         Idx left, right;
1417         dfa->has_plural_match = 1;
1418         if (node->left != NULL)
1419           left = node->left->first->node_idx;
1420         else
1421           left = node->next->node_idx;
1422         if (node->right != NULL)
1423           right = node->right->first->node_idx;
1424         else
1425           right = node->next->node_idx;
1426         assert (REG_VALID_INDEX (left));
1427         assert (REG_VALID_INDEX (right));
1428         err = re_node_set_init_2 (dfa->edests + idx, left, right);
1429       }
1430       break;
1431
1432     case ANCHOR:
1433     case OP_OPEN_SUBEXP:
1434     case OP_CLOSE_SUBEXP:
1435       err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1436       break;
1437
1438     case OP_BACK_REF:
1439       dfa->nexts[idx] = node->next->node_idx;
1440       if (node->token.type == OP_BACK_REF)
1441         re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1442       break;
1443
1444     default:
1445       assert (!IS_EPSILON_NODE (node->token.type));
1446       dfa->nexts[idx] = node->next->node_idx;
1447       break;
1448     }
1449
1450   return err;
1451 }
1452
1453 /* Duplicate the epsilon closure of the node ROOT_NODE.
1454    Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1455    to their own constraint.  */
1456
1457 static reg_errcode_t
1458 internal_function
1459 duplicate_node_closure (re_dfa_t *dfa, Idx top_org_node, Idx top_clone_node,
1460                         Idx root_node, unsigned int init_constraint)
1461 {
1462   Idx org_node, clone_node;
1463   bool ok;
1464   unsigned int constraint = init_constraint;
1465   for (org_node = top_org_node, clone_node = top_clone_node;;)
1466     {
1467       Idx org_dest, clone_dest;
1468       if (dfa->nodes[org_node].type == OP_BACK_REF)
1469         {
1470           /* If the back reference epsilon-transit, its destination must
1471              also have the constraint.  Then duplicate the epsilon closure
1472              of the destination of the back reference, and store it in
1473              edests of the back reference.  */
1474           org_dest = dfa->nexts[org_node];
1475           re_node_set_empty (dfa->edests + clone_node);
1476           clone_dest = duplicate_node (dfa, org_dest, constraint);
1477           if (BE (clone_dest == REG_MISSING, 0))
1478             return REG_ESPACE;
1479           dfa->nexts[clone_node] = dfa->nexts[org_node];
1480           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1481           if (BE (! ok, 0))
1482             return REG_ESPACE;
1483         }
1484       else if (dfa->edests[org_node].nelem == 0)
1485         {
1486           /* In case of the node can't epsilon-transit, don't duplicate the
1487              destination and store the original destination as the
1488              destination of the node.  */
1489           dfa->nexts[clone_node] = dfa->nexts[org_node];
1490           break;
1491         }
1492       else if (dfa->edests[org_node].nelem == 1)
1493         {
1494           /* In case of the node can epsilon-transit, and it has only one
1495              destination.  */
1496           org_dest = dfa->edests[org_node].elems[0];
1497           re_node_set_empty (dfa->edests + clone_node);
1498           if (dfa->nodes[org_node].type == ANCHOR)
1499             {
1500               /* In case of the node has another constraint, append it.  */
1501               if (org_node == root_node && clone_node != org_node)
1502                 {
1503                   /* ...but if the node is root_node itself, it means the
1504                      epsilon closure have a loop, then tie it to the
1505                      destination of the root_node.  */
1506                   ok = re_node_set_insert (dfa->edests + clone_node, org_dest);
1507                   if (BE (! ok, 0))
1508                     return REG_ESPACE;
1509                   break;
1510                 }
1511               constraint |= dfa->nodes[org_node].opr.ctx_type;
1512             }
1513           clone_dest = duplicate_node (dfa, org_dest, constraint);
1514           if (BE (clone_dest == REG_MISSING, 0))
1515             return REG_ESPACE;
1516           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1517           if (BE (! ok, 0))
1518             return REG_ESPACE;
1519         }
1520       else /* dfa->edests[org_node].nelem == 2 */
1521         {
1522           /* In case of the node can epsilon-transit, and it has two
1523              destinations. In the bin_tree_t and DFA, that's '|' and '*'.   */
1524           org_dest = dfa->edests[org_node].elems[0];
1525           re_node_set_empty (dfa->edests + clone_node);
1526           /* Search for a duplicated node which satisfies the constraint.  */
1527           clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1528           if (clone_dest == REG_MISSING)
1529             {
1530               /* There are no such a duplicated node, create a new one.  */
1531               reg_errcode_t err;
1532               clone_dest = duplicate_node (dfa, org_dest, constraint);
1533               if (BE (clone_dest == REG_MISSING, 0))
1534                 return REG_ESPACE;
1535               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1536               if (BE (! ok, 0))
1537                 return REG_ESPACE;
1538               err = duplicate_node_closure (dfa, org_dest, clone_dest,
1539                                             root_node, constraint);
1540               if (BE (err != REG_NOERROR, 0))
1541                 return err;
1542             }
1543           else
1544             {
1545               /* There are a duplicated node which satisfy the constraint,
1546                  use it to avoid infinite loop.  */
1547               ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1548               if (BE (! ok, 0))
1549                 return REG_ESPACE;
1550             }
1551
1552           org_dest = dfa->edests[org_node].elems[1];
1553           clone_dest = duplicate_node (dfa, org_dest, constraint);
1554           if (BE (clone_dest == REG_MISSING, 0))
1555             return REG_ESPACE;
1556           ok = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1557           if (BE (! ok, 0))
1558             return REG_ESPACE;
1559         }
1560       org_node = org_dest;
1561       clone_node = clone_dest;
1562     }
1563   return REG_NOERROR;
1564 }
1565
1566 /* Search for a node which is duplicated from the node ORG_NODE, and
1567    satisfies the constraint CONSTRAINT.  */
1568
1569 static Idx
1570 search_duplicated_node (const re_dfa_t *dfa, Idx org_node,
1571                         unsigned int constraint)
1572 {
1573   Idx idx;
1574   for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1575     {
1576       if (org_node == dfa->org_indices[idx]
1577           && constraint == dfa->nodes[idx].constraint)
1578         return idx; /* Found.  */
1579     }
1580   return REG_MISSING; /* Not found.  */
1581 }
1582
1583 /* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1584    Return the index of the new node, or REG_MISSING if insufficient storage is
1585    available.  */
1586
1587 static Idx
1588 duplicate_node (re_dfa_t *dfa, Idx org_idx, unsigned int constraint)
1589 {
1590   Idx dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1591   if (BE (dup_idx != REG_MISSING, 1))
1592     {
1593       dfa->nodes[dup_idx].constraint = constraint;
1594       if (dfa->nodes[org_idx].type == ANCHOR)
1595         dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1596       dfa->nodes[dup_idx].duplicated = 1;
1597
1598       /* Store the index of the original node.  */
1599       dfa->org_indices[dup_idx] = org_idx;
1600     }
1601   return dup_idx;
1602 }
1603
1604 static reg_errcode_t
1605 calc_inveclosure (re_dfa_t *dfa)
1606 {
1607   Idx src, idx;
1608   bool ok;
1609   for (idx = 0; idx < dfa->nodes_len; ++idx)
1610     re_node_set_init_empty (dfa->inveclosures + idx);
1611
1612   for (src = 0; src < dfa->nodes_len; ++src)
1613     {
1614       Idx *elems = dfa->eclosures[src].elems;
1615       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1616         {
1617           ok = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1618           if (BE (! ok, 0))
1619             return REG_ESPACE;
1620         }
1621     }
1622
1623   return REG_NOERROR;
1624 }
1625
1626 /* Calculate "eclosure" for all the node in DFA.  */
1627
1628 static reg_errcode_t
1629 calc_eclosure (re_dfa_t *dfa)
1630 {
1631   Idx node_idx;
1632   bool incomplete;
1633 #ifdef DEBUG
1634   assert (dfa->nodes_len > 0);
1635 #endif
1636   incomplete = false;
1637   /* For each nodes, calculate epsilon closure.  */
1638   for (node_idx = 0; ; ++node_idx)
1639     {
1640       reg_errcode_t err;
1641       re_node_set eclosure_elem;
1642       if (node_idx == dfa->nodes_len)
1643         {
1644           if (!incomplete)
1645             break;
1646           incomplete = false;
1647           node_idx = 0;
1648         }
1649
1650 #ifdef DEBUG
1651       assert (dfa->eclosures[node_idx].nelem != REG_MISSING);
1652 #endif
1653
1654       /* If we have already calculated, skip it.  */
1655       if (dfa->eclosures[node_idx].nelem != 0)
1656         continue;
1657       /* Calculate epsilon closure of `node_idx'.  */
1658       err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, true);
1659       if (BE (err != REG_NOERROR, 0))
1660         return err;
1661
1662       if (dfa->eclosures[node_idx].nelem == 0)
1663         {
1664           incomplete = true;
1665           re_node_set_free (&eclosure_elem);
1666         }
1667     }
1668   return REG_NOERROR;
1669 }
1670
1671 /* Calculate epsilon closure of NODE.  */
1672
1673 static reg_errcode_t
1674 calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, Idx node, bool root)
1675 {
1676   reg_errcode_t err;
1677   unsigned int constraint;
1678   Idx i;
1679   bool incomplete;
1680   bool ok;
1681   re_node_set eclosure;
1682   incomplete = false;
1683   err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1684   if (BE (err != REG_NOERROR, 0))
1685     return err;
1686
1687   /* This indicates that we are calculating this node now.
1688      We reference this value to avoid infinite loop.  */
1689   dfa->eclosures[node].nelem = REG_MISSING;
1690
1691   constraint = ((dfa->nodes[node].type == ANCHOR)
1692                 ? dfa->nodes[node].opr.ctx_type : 0);
1693   /* If the current node has constraints, duplicate all nodes.
1694      Since they must inherit the constraints.  */
1695   if (constraint
1696       && dfa->edests[node].nelem
1697       && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1698     {
1699       err = duplicate_node_closure (dfa, node, node, node, constraint);
1700       if (BE (err != REG_NOERROR, 0))
1701         return err;
1702     }
1703
1704   /* Expand each epsilon destination nodes.  */
1705   if (IS_EPSILON_NODE(dfa->nodes[node].type))
1706     for (i = 0; i < dfa->edests[node].nelem; ++i)
1707       {
1708         re_node_set eclosure_elem;
1709         Idx edest = dfa->edests[node].elems[i];
1710         /* If calculating the epsilon closure of `edest' is in progress,
1711            return intermediate result.  */
1712         if (dfa->eclosures[edest].nelem == REG_MISSING)
1713           {
1714             incomplete = true;
1715             continue;
1716           }
1717         /* If we haven't calculated the epsilon closure of `edest' yet,
1718            calculate now. Otherwise use calculated epsilon closure.  */
1719         if (dfa->eclosures[edest].nelem == 0)
1720           {
1721             err = calc_eclosure_iter (&eclosure_elem, dfa, edest, false);
1722             if (BE (err != REG_NOERROR, 0))
1723               return err;
1724           }
1725         else
1726           eclosure_elem = dfa->eclosures[edest];
1727         /* Merge the epsilon closure of `edest'.  */
1728         re_node_set_merge (&eclosure, &eclosure_elem);
1729         /* If the epsilon closure of `edest' is incomplete,
1730            the epsilon closure of this node is also incomplete.  */
1731         if (dfa->eclosures[edest].nelem == 0)
1732           {
1733             incomplete = true;
1734             re_node_set_free (&eclosure_elem);
1735           }
1736       }
1737
1738   /* Epsilon closures include itself.  */
1739   ok = re_node_set_insert (&eclosure, node);
1740   if (BE (! ok, 0))
1741     return REG_ESPACE;
1742   if (incomplete && !root)
1743     dfa->eclosures[node].nelem = 0;
1744   else
1745     dfa->eclosures[node] = eclosure;
1746   *new_set = eclosure;
1747   return REG_NOERROR;
1748 }
1749 \f
1750 /* Functions for token which are used in the parser.  */
1751
1752 /* Fetch a token from INPUT.
1753    We must not use this function inside bracket expressions.  */
1754
1755 static void
1756 internal_function
1757 fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1758 {
1759   re_string_skip_bytes (input, peek_token (result, input, syntax));
1760 }
1761
1762 /* Peek a token from INPUT, and return the length of the token.
1763    We must not use this function inside bracket expressions.  */
1764
1765 static int
1766 internal_function
1767 peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1768 {
1769   unsigned char c;
1770
1771   if (re_string_eoi (input))
1772     {
1773       token->type = END_OF_RE;
1774       return 0;
1775     }
1776
1777   c = re_string_peek_byte (input, 0);
1778   token->opr.c = c;
1779
1780   token->word_char = 0;
1781 #ifdef RE_ENABLE_I18N
1782   token->mb_partial = 0;
1783   if (input->mb_cur_max > 1 &&
1784       !re_string_first_byte (input, re_string_cur_idx (input)))
1785     {
1786       token->type = CHARACTER;
1787       token->mb_partial = 1;
1788       return 1;
1789     }
1790 #endif
1791   if (c == '\\')
1792     {
1793       unsigned char c2;
1794       if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1795         {
1796           token->type = BACK_SLASH;
1797           return 1;
1798         }
1799
1800       c2 = re_string_peek_byte_case (input, 1);
1801       token->opr.c = c2;
1802       token->type = CHARACTER;
1803 #ifdef RE_ENABLE_I18N
1804       if (input->mb_cur_max > 1)
1805         {
1806           wint_t wc = re_string_wchar_at (input,
1807                                           re_string_cur_idx (input) + 1);
1808           token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1809         }
1810       else
1811 #endif
1812         token->word_char = IS_WORD_CHAR (c2) != 0;
1813
1814       switch (c2)
1815         {
1816         case '|':
1817           if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1818             token->type = OP_ALT;
1819           break;
1820         case '1': case '2': case '3': case '4': case '5':
1821         case '6': case '7': case '8': case '9':
1822           if (!(syntax & RE_NO_BK_REFS))
1823             {
1824               token->type = OP_BACK_REF;
1825               token->opr.idx = c2 - '1';
1826             }
1827           break;
1828         case '<':
1829           if (!(syntax & RE_NO_GNU_OPS))
1830             {
1831               token->type = ANCHOR;
1832               token->opr.ctx_type = WORD_FIRST;
1833             }
1834           break;
1835         case '>':
1836           if (!(syntax & RE_NO_GNU_OPS))
1837             {
1838               token->type = ANCHOR;
1839               token->opr.ctx_type = WORD_LAST;
1840             }
1841           break;
1842         case 'b':
1843           if (!(syntax & RE_NO_GNU_OPS))
1844             {
1845               token->type = ANCHOR;
1846               token->opr.ctx_type = WORD_DELIM;
1847             }
1848           break;
1849         case 'B':
1850           if (!(syntax & RE_NO_GNU_OPS))
1851             {
1852               token->type = ANCHOR;
1853               token->opr.ctx_type = NOT_WORD_DELIM;
1854             }
1855           break;
1856         case 'w':
1857           if (!(syntax & RE_NO_GNU_OPS))
1858             token->type = OP_WORD;
1859           break;
1860         case 'W':
1861           if (!(syntax & RE_NO_GNU_OPS))
1862             token->type = OP_NOTWORD;
1863           break;
1864         case 's':
1865           if (!(syntax & RE_NO_GNU_OPS))
1866             token->type = OP_SPACE;
1867           break;
1868         case 'S':
1869           if (!(syntax & RE_NO_GNU_OPS))
1870             token->type = OP_NOTSPACE;
1871           break;
1872         case '`':
1873           if (!(syntax & RE_NO_GNU_OPS))
1874             {
1875               token->type = ANCHOR;
1876               token->opr.ctx_type = BUF_FIRST;
1877             }
1878           break;
1879         case '\'':
1880           if (!(syntax & RE_NO_GNU_OPS))
1881             {
1882               token->type = ANCHOR;
1883               token->opr.ctx_type = BUF_LAST;
1884             }
1885           break;
1886         case '(':
1887           if (!(syntax & RE_NO_BK_PARENS))
1888             token->type = OP_OPEN_SUBEXP;
1889           break;
1890         case ')':
1891           if (!(syntax & RE_NO_BK_PARENS))
1892             token->type = OP_CLOSE_SUBEXP;
1893           break;
1894         case '+':
1895           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1896             token->type = OP_DUP_PLUS;
1897           break;
1898         case '?':
1899           if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1900             token->type = OP_DUP_QUESTION;
1901           break;
1902         case '{':
1903           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1904             token->type = OP_OPEN_DUP_NUM;
1905           break;
1906         case '}':
1907           if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1908             token->type = OP_CLOSE_DUP_NUM;
1909           break;
1910         default:
1911           break;
1912         }
1913       return 2;
1914     }
1915
1916   token->type = CHARACTER;
1917 #ifdef RE_ENABLE_I18N
1918   if (input->mb_cur_max > 1)
1919     {
1920       wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1921       token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1922     }
1923   else
1924 #endif
1925     token->word_char = IS_WORD_CHAR (token->opr.c);
1926
1927   switch (c)
1928     {
1929     case '\n':
1930       if (syntax & RE_NEWLINE_ALT)
1931         token->type = OP_ALT;
1932       break;
1933     case '|':
1934       if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1935         token->type = OP_ALT;
1936       break;
1937     case '*':
1938       token->type = OP_DUP_ASTERISK;
1939       break;
1940     case '+':
1941       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1942         token->type = OP_DUP_PLUS;
1943       break;
1944     case '?':
1945       if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1946         token->type = OP_DUP_QUESTION;
1947       break;
1948     case '{':
1949       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1950         token->type = OP_OPEN_DUP_NUM;
1951       break;
1952     case '}':
1953       if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1954         token->type = OP_CLOSE_DUP_NUM;
1955       break;
1956     case '(':
1957       if (syntax & RE_NO_BK_PARENS)
1958         token->type = OP_OPEN_SUBEXP;
1959       break;
1960     case ')':
1961       if (syntax & RE_NO_BK_PARENS)
1962         token->type = OP_CLOSE_SUBEXP;
1963       break;
1964     case '[':
1965       token->type = OP_OPEN_BRACKET;
1966       break;
1967     case '.':
1968       token->type = OP_PERIOD;
1969       break;
1970     case '^':
1971       if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1972           re_string_cur_idx (input) != 0)
1973         {
1974           char prev = re_string_peek_byte (input, -1);
1975           if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1976             break;
1977         }
1978       token->type = ANCHOR;
1979       token->opr.ctx_type = LINE_FIRST;
1980       break;
1981     case '$':
1982       if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1983           re_string_cur_idx (input) + 1 != re_string_length (input))
1984         {
1985           re_token_t next;
1986           re_string_skip_bytes (input, 1);
1987           peek_token (&next, input, syntax);
1988           re_string_skip_bytes (input, -1);
1989           if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1990             break;
1991         }
1992       token->type = ANCHOR;
1993       token->opr.ctx_type = LINE_LAST;
1994       break;
1995     default:
1996       break;
1997     }
1998   return 1;
1999 }
2000
2001 /* Peek a token from INPUT, and return the length of the token.
2002    We must not use this function out of bracket expressions.  */
2003
2004 static int
2005 internal_function
2006 peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
2007 {
2008   unsigned char c;
2009   if (re_string_eoi (input))
2010     {
2011       token->type = END_OF_RE;
2012       return 0;
2013     }
2014   c = re_string_peek_byte (input, 0);
2015   token->opr.c = c;
2016
2017 #ifdef RE_ENABLE_I18N
2018   if (input->mb_cur_max > 1 &&
2019       !re_string_first_byte (input, re_string_cur_idx (input)))
2020     {
2021       token->type = CHARACTER;
2022       return 1;
2023     }
2024 #endif /* RE_ENABLE_I18N */
2025
2026   if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
2027       && re_string_cur_idx (input) + 1 < re_string_length (input))
2028     {
2029       /* In this case, '\' escape a character.  */
2030       unsigned char c2;
2031       re_string_skip_bytes (input, 1);
2032       c2 = re_string_peek_byte (input, 0);
2033       token->opr.c = c2;
2034       token->type = CHARACTER;
2035       return 1;
2036     }
2037   if (c == '[') /* '[' is a special char in a bracket exps.  */
2038     {
2039       unsigned char c2;
2040       int token_len;
2041       if (re_string_cur_idx (input) + 1 < re_string_length (input))
2042         c2 = re_string_peek_byte (input, 1);
2043       else
2044         c2 = 0;
2045       token->opr.c = c2;
2046       token_len = 2;
2047       switch (c2)
2048         {
2049         case '.':
2050           token->type = OP_OPEN_COLL_ELEM;
2051           break;
2052         case '=':
2053           token->type = OP_OPEN_EQUIV_CLASS;
2054           break;
2055         case ':':
2056           if (syntax & RE_CHAR_CLASSES)
2057             {
2058               token->type = OP_OPEN_CHAR_CLASS;
2059               break;
2060             }
2061           /* else fall through.  */
2062         default:
2063           token->type = CHARACTER;
2064           token->opr.c = c;
2065           token_len = 1;
2066           break;
2067         }
2068       return token_len;
2069     }
2070   switch (c)
2071     {
2072     case '-':
2073       token->type = OP_CHARSET_RANGE;
2074       break;
2075     case ']':
2076       token->type = OP_CLOSE_BRACKET;
2077       break;
2078     case '^':
2079       token->type = OP_NON_MATCH_LIST;
2080       break;
2081     default:
2082       token->type = CHARACTER;
2083     }
2084   return 1;
2085 }
2086 \f
2087 /* Functions for parser.  */
2088
2089 /* Entry point of the parser.
2090    Parse the regular expression REGEXP and return the structure tree.
2091    If an error is occured, ERR is set by error code, and return NULL.
2092    This function build the following tree, from regular expression <reg_exp>:
2093            CAT
2094            / \
2095           /   \
2096    <reg_exp>  EOR
2097
2098    CAT means concatenation.
2099    EOR means end of regular expression.  */
2100
2101 static bin_tree_t *
2102 parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2103        reg_errcode_t *err)
2104 {
2105   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2106   bin_tree_t *tree, *eor, *root;
2107   re_token_t current_token;
2108   dfa->syntax = syntax;
2109   fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2110   tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2111   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2112     return NULL;
2113   eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2114   if (tree != NULL)
2115     root = create_tree (dfa, tree, eor, CONCAT);
2116   else
2117     root = eor;
2118   if (BE (eor == NULL || root == NULL, 0))
2119     {
2120       *err = REG_ESPACE;
2121       return NULL;
2122     }
2123   return root;
2124 }
2125
2126 /* This function build the following tree, from regular expression
2127    <branch1>|<branch2>:
2128            ALT
2129            / \
2130           /   \
2131    <branch1> <branch2>
2132
2133    ALT means alternative, which represents the operator `|'.  */
2134
2135 static bin_tree_t *
2136 parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2137                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2138 {
2139   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2140   bin_tree_t *tree, *branch = NULL;
2141   tree = parse_branch (regexp, preg, token, syntax, nest, err);
2142   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2143     return NULL;
2144
2145   while (token->type == OP_ALT)
2146     {
2147       fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2148       if (token->type != OP_ALT && token->type != END_OF_RE
2149           && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2150         {
2151           branch = parse_branch (regexp, preg, token, syntax, nest, err);
2152           if (BE (*err != REG_NOERROR && branch == NULL, 0))
2153             return NULL;
2154         }
2155       else
2156         branch = NULL;
2157       tree = create_tree (dfa, tree, branch, OP_ALT);
2158       if (BE (tree == NULL, 0))
2159         {
2160           *err = REG_ESPACE;
2161           return NULL;
2162         }
2163     }
2164   return tree;
2165 }
2166
2167 /* This function build the following tree, from regular expression
2168    <exp1><exp2>:
2169         CAT
2170         / \
2171        /   \
2172    <exp1> <exp2>
2173
2174    CAT means concatenation.  */
2175
2176 static bin_tree_t *
2177 parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2178               reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2179 {
2180   bin_tree_t *tree, *expr;
2181   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2182   tree = parse_expression (regexp, preg, token, syntax, nest, err);
2183   if (BE (*err != REG_NOERROR && tree == NULL, 0))
2184     return NULL;
2185
2186   while (token->type != OP_ALT && token->type != END_OF_RE
2187          && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2188     {
2189       expr = parse_expression (regexp, preg, token, syntax, nest, err);
2190       if (BE (*err != REG_NOERROR && expr == NULL, 0))
2191         {
2192           return NULL;
2193         }
2194       if (tree != NULL && expr != NULL)
2195         {
2196           tree = create_tree (dfa, tree, expr, CONCAT);
2197           if (tree == NULL)
2198             {
2199               *err = REG_ESPACE;
2200               return NULL;
2201             }
2202         }
2203       else if (tree == NULL)
2204         tree = expr;
2205       /* Otherwise expr == NULL, we don't need to create new tree.  */
2206     }
2207   return tree;
2208 }
2209
2210 /* This function build the following tree, from regular expression a*:
2211          *
2212          |
2213          a
2214 */
2215
2216 static bin_tree_t *
2217 parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2218                   reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2219 {
2220   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2221   bin_tree_t *tree;
2222   switch (token->type)
2223     {
2224     case CHARACTER:
2225       tree = create_token_tree (dfa, NULL, NULL, token);
2226       if (BE (tree == NULL, 0))
2227         {
2228           *err = REG_ESPACE;
2229           return NULL;
2230         }
2231 #ifdef RE_ENABLE_I18N
2232       if (dfa->mb_cur_max > 1)
2233         {
2234           while (!re_string_eoi (regexp)
2235                  && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2236             {
2237               bin_tree_t *mbc_remain;
2238               fetch_token (token, regexp, syntax);
2239               mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2240               tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2241               if (BE (mbc_remain == NULL || tree == NULL, 0))
2242                 {
2243                   *err = REG_ESPACE;
2244                   return NULL;
2245                 }
2246             }
2247         }
2248 #endif
2249       break;
2250     case OP_OPEN_SUBEXP:
2251       tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2252       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2253         return NULL;
2254       break;
2255     case OP_OPEN_BRACKET:
2256       tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2257       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2258         return NULL;
2259       break;
2260     case OP_BACK_REF:
2261       if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2262         {
2263           *err = REG_ESUBREG;
2264           return NULL;
2265         }
2266       dfa->used_bkref_map |= 1 << token->opr.idx;
2267       tree = create_token_tree (dfa, NULL, NULL, token);
2268       if (BE (tree == NULL, 0))
2269         {
2270           *err = REG_ESPACE;
2271           return NULL;
2272         }
2273       ++dfa->nbackref;
2274       dfa->has_mb_node = 1;
2275       break;
2276     case OP_OPEN_DUP_NUM:
2277       if (syntax & RE_CONTEXT_INVALID_DUP)
2278         {
2279           *err = REG_BADRPT;
2280           return NULL;
2281         }
2282       /* FALLTHROUGH */
2283     case OP_DUP_ASTERISK:
2284     case OP_DUP_PLUS:
2285     case OP_DUP_QUESTION:
2286       if (syntax & RE_CONTEXT_INVALID_OPS)
2287         {
2288           *err = REG_BADRPT;
2289           return NULL;
2290         }
2291       else if (syntax & RE_CONTEXT_INDEP_OPS)
2292         {
2293           fetch_token (token, regexp, syntax);
2294           return parse_expression (regexp, preg, token, syntax, nest, err);
2295         }
2296       /* else fall through  */
2297     case OP_CLOSE_SUBEXP:
2298       if ((token->type == OP_CLOSE_SUBEXP) &&
2299           !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2300         {
2301           *err = REG_ERPAREN;
2302           return NULL;
2303         }
2304       /* else fall through  */
2305     case OP_CLOSE_DUP_NUM:
2306       /* We treat it as a normal character.  */
2307
2308       /* Then we can these characters as normal characters.  */
2309       token->type = CHARACTER;
2310       /* mb_partial and word_char bits should be initialized already
2311          by peek_token.  */
2312       tree = create_token_tree (dfa, NULL, NULL, token);
2313       if (BE (tree == NULL, 0))
2314         {
2315           *err = REG_ESPACE;
2316           return NULL;
2317         }
2318       break;
2319     case ANCHOR:
2320       if ((token->opr.ctx_type
2321            & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2322           && dfa->word_ops_used == 0)
2323         init_word_char (dfa);
2324       if (token->opr.ctx_type == WORD_DELIM
2325           || token->opr.ctx_type == NOT_WORD_DELIM)
2326         {
2327           bin_tree_t *tree_first, *tree_last;
2328           if (token->opr.ctx_type == WORD_DELIM)
2329             {
2330               token->opr.ctx_type = WORD_FIRST;
2331               tree_first = create_token_tree (dfa, NULL, NULL, token);
2332               token->opr.ctx_type = WORD_LAST;
2333             }
2334           else
2335             {
2336               token->opr.ctx_type = INSIDE_WORD;
2337               tree_first = create_token_tree (dfa, NULL, NULL, token);
2338               token->opr.ctx_type = INSIDE_NOTWORD;
2339             }
2340           tree_last = create_token_tree (dfa, NULL, NULL, token);
2341           tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2342           if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2343             {
2344               *err = REG_ESPACE;
2345               return NULL;
2346             }
2347         }
2348       else
2349         {
2350           tree = create_token_tree (dfa, NULL, NULL, token);
2351           if (BE (tree == NULL, 0))
2352             {
2353               *err = REG_ESPACE;
2354               return NULL;
2355             }
2356         }
2357       /* We must return here, since ANCHORs can't be followed
2358          by repetition operators.
2359          eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2360              it must not be "<ANCHOR(^)><REPEAT(*)>".  */
2361       fetch_token (token, regexp, syntax);
2362       return tree;
2363     case OP_PERIOD:
2364       tree = create_token_tree (dfa, NULL, NULL, token);
2365       if (BE (tree == NULL, 0))
2366         {
2367           *err = REG_ESPACE;
2368           return NULL;
2369         }
2370       if (dfa->mb_cur_max > 1)
2371         dfa->has_mb_node = 1;
2372       break;
2373     case OP_WORD:
2374     case OP_NOTWORD:
2375       tree = build_charclass_op (dfa, regexp->trans,
2376                                  (const unsigned char *) "alnum",
2377                                  (const unsigned char *) "_",
2378                                  token->type == OP_NOTWORD, err);
2379       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2380         return NULL;
2381       break;
2382     case OP_SPACE:
2383     case OP_NOTSPACE:
2384       tree = build_charclass_op (dfa, regexp->trans,
2385                                  (const unsigned char *) "space",
2386                                  (const unsigned char *) "",
2387                                  token->type == OP_NOTSPACE, err);
2388       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2389         return NULL;
2390       break;
2391     case OP_ALT:
2392     case END_OF_RE:
2393       return NULL;
2394     case BACK_SLASH:
2395       *err = REG_EESCAPE;
2396       return NULL;
2397     default:
2398       /* Must not happen?  */
2399 #ifdef DEBUG
2400       assert (0);
2401 #endif
2402       return NULL;
2403     }
2404   fetch_token (token, regexp, syntax);
2405
2406   while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2407          || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2408     {
2409       tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2410       if (BE (*err != REG_NOERROR && tree == NULL, 0))
2411         return NULL;
2412       /* In BRE consecutive duplications are not allowed.  */
2413       if ((syntax & RE_CONTEXT_INVALID_DUP)
2414           && (token->type == OP_DUP_ASTERISK
2415               || token->type == OP_OPEN_DUP_NUM))
2416         {
2417           *err = REG_BADRPT;
2418           return NULL;
2419         }
2420     }
2421
2422   return tree;
2423 }
2424
2425 /* This function build the following tree, from regular expression
2426    (<reg_exp>):
2427          SUBEXP
2428             |
2429         <reg_exp>
2430 */
2431
2432 static bin_tree_t *
2433 parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2434                reg_syntax_t syntax, Idx nest, reg_errcode_t *err)
2435 {
2436   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2437   bin_tree_t *tree;
2438   size_t cur_nsub;
2439   cur_nsub = preg->re_nsub++;
2440
2441   fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2442
2443   /* The subexpression may be a null string.  */
2444   if (token->type == OP_CLOSE_SUBEXP)
2445     tree = NULL;
2446   else
2447     {
2448       tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2449       if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2450         *err = REG_EPAREN;
2451       if (BE (*err != REG_NOERROR, 0))
2452         return NULL;
2453     }
2454
2455   if (cur_nsub <= '9' - '1')
2456     dfa->completed_bkref_map |= 1 << cur_nsub;
2457
2458   tree = create_tree (dfa, tree, NULL, SUBEXP);
2459   if (BE (tree == NULL, 0))
2460     {
2461       *err = REG_ESPACE;
2462       return NULL;
2463     }
2464   tree->token.opr.idx = cur_nsub;
2465   return tree;
2466 }
2467
2468 /* This function parse repetition operators like "*", "+", "{1,3}" etc.  */
2469
2470 static bin_tree_t *
2471 parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2472               re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2473 {
2474   bin_tree_t *tree = NULL, *old_tree = NULL;
2475   Idx i, start, end, start_idx = re_string_cur_idx (regexp);
2476   re_token_t start_token = *token;
2477
2478   if (token->type == OP_OPEN_DUP_NUM)
2479     {
2480       end = 0;
2481       start = fetch_number (regexp, token, syntax);
2482       if (start == REG_MISSING)
2483         {
2484           if (token->type == CHARACTER && token->opr.c == ',')
2485             start = 0; /* We treat "{,m}" as "{0,m}".  */
2486           else
2487             {
2488               *err = REG_BADBR; /* <re>{} is invalid.  */
2489               return NULL;
2490             }
2491         }
2492       if (BE (start != REG_ERROR, 1))
2493         {
2494           /* We treat "{n}" as "{n,n}".  */
2495           end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2496                  : ((token->type == CHARACTER && token->opr.c == ',')
2497                     ? fetch_number (regexp, token, syntax) : REG_ERROR));
2498         }
2499       if (BE (start == REG_ERROR || end == REG_ERROR, 0))
2500         {
2501           /* Invalid sequence.  */
2502           if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2503             {
2504               if (token->type == END_OF_RE)
2505                 *err = REG_EBRACE;
2506               else
2507                 *err = REG_BADBR;
2508
2509               return NULL;
2510             }
2511
2512           /* If the syntax bit is set, rollback.  */
2513           re_string_set_index (regexp, start_idx);
2514           *token = start_token;
2515           token->type = CHARACTER;
2516           /* mb_partial and word_char bits should be already initialized by
2517              peek_token.  */
2518           return elem;
2519         }
2520
2521       if (BE (end != REG_MISSING && start > end, 0))
2522         {
2523           /* First number greater than second.  */
2524           *err = REG_BADBR;
2525           return NULL;
2526         }
2527     }
2528   else
2529     {
2530       start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2531       end = (token->type == OP_DUP_QUESTION) ? 1 : REG_MISSING;
2532     }
2533
2534   fetch_token (token, regexp, syntax);
2535
2536   if (BE (elem == NULL, 0))
2537     return NULL;
2538   if (BE (start == 0 && end == 0, 0))
2539     {
2540       postorder (elem, free_tree, NULL);
2541       return NULL;
2542     }
2543
2544   /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}".  */
2545   if (BE (start > 0, 0))
2546     {
2547       tree = elem;
2548       for (i = 2; i <= start; ++i)
2549         {
2550           elem = duplicate_tree (elem, dfa);
2551           tree = create_tree (dfa, tree, elem, CONCAT);
2552           if (BE (elem == NULL || tree == NULL, 0))
2553             goto parse_dup_op_espace;
2554         }
2555
2556       if (start == end)
2557         return tree;
2558
2559       /* Duplicate ELEM before it is marked optional.  */
2560       elem = duplicate_tree (elem, dfa);
2561       old_tree = tree;
2562     }
2563   else
2564     old_tree = NULL;
2565
2566   if (elem->token.type == SUBEXP)
2567     postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2568
2569   tree = create_tree (dfa, elem, NULL,
2570                       (end == REG_MISSING ? OP_DUP_ASTERISK : OP_ALT));
2571   if (BE (tree == NULL, 0))
2572     goto parse_dup_op_espace;
2573
2574   /* This loop is actually executed only when end != REG_MISSING,
2575      to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?...  We have
2576      already created the start+1-th copy.  */
2577   if ((Idx) -1 < 0 || end != REG_MISSING)
2578     for (i = start + 2; i <= end; ++i)
2579       {
2580         elem = duplicate_tree (elem, dfa);
2581         tree = create_tree (dfa, tree, elem, CONCAT);
2582         if (BE (elem == NULL || tree == NULL, 0))
2583           goto parse_dup_op_espace;
2584
2585         tree = create_tree (dfa, tree, NULL, OP_ALT);
2586         if (BE (tree == NULL, 0))
2587           goto parse_dup_op_espace;
2588       }
2589
2590   if (old_tree)
2591     tree = create_tree (dfa, old_tree, tree, CONCAT);
2592
2593   return tree;
2594
2595  parse_dup_op_espace:
2596   *err = REG_ESPACE;
2597   return NULL;
2598 }
2599
2600 /* Size of the names for collating symbol/equivalence_class/character_class.
2601    I'm not sure, but maybe enough.  */
2602 #define BRACKET_NAME_BUF_SIZE 32
2603
2604 #ifndef _LIBC
2605   /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2606      Build the range expression which starts from START_ELEM, and ends
2607      at END_ELEM.  The result are written to MBCSET and SBCSET.
2608      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2609      mbcset->range_ends, is a pointer argument sinse we may
2610      update it.  */
2611
2612 static reg_errcode_t
2613 internal_function
2614 # ifdef RE_ENABLE_I18N
2615 build_range_exp (bitset_t sbcset, re_charset_t *mbcset, Idx *range_alloc,
2616                  bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2617 # else /* not RE_ENABLE_I18N */
2618 build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2619                  bracket_elem_t *end_elem)
2620 # endif /* not RE_ENABLE_I18N */
2621 {
2622   unsigned int start_ch, end_ch;
2623   /* Equivalence Classes and Character Classes can't be a range start/end.  */
2624   if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2625           || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2626           0))
2627     return REG_ERANGE;
2628
2629   /* We can handle no multi character collating elements without libc
2630      support.  */
2631   if (BE ((start_elem->type == COLL_SYM
2632            && strlen ((char *) start_elem->opr.name) > 1)
2633           || (end_elem->type == COLL_SYM
2634               && strlen ((char *) end_elem->opr.name) > 1), 0))
2635     return REG_ECOLLATE;
2636
2637 # ifdef RE_ENABLE_I18N
2638   {
2639     wchar_t wc;
2640     wint_t start_wc;
2641     wint_t end_wc;
2642     wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2643
2644     start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2645                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2646                    : 0));
2647     end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2648               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2649                  : 0));
2650     start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2651                 ? __btowc (start_ch) : start_elem->opr.wch);
2652     end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2653               ? __btowc (end_ch) : end_elem->opr.wch);
2654     if (start_wc == WEOF || end_wc == WEOF)
2655       return REG_ECOLLATE;
2656     cmp_buf[0] = start_wc;
2657     cmp_buf[4] = end_wc;
2658     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2659       return REG_ERANGE;
2660
2661     /* Got valid collation sequence values, add them as a new entry.
2662        However, for !_LIBC we have no collation elements: if the
2663        character set is single byte, the single byte character set
2664        that we build below suffices.  parse_bracket_exp passes
2665        no MBCSET if dfa->mb_cur_max == 1.  */
2666     if (mbcset)
2667       {
2668         /* Check the space of the arrays.  */
2669         if (BE (*range_alloc == mbcset->nranges, 0))
2670           {
2671             /* There is not enough space, need realloc.  */
2672             wchar_t *new_array_start, *new_array_end;
2673             Idx new_nranges;
2674
2675             /* +1 in case of mbcset->nranges is 0.  */
2676             new_nranges = 2 * mbcset->nranges + 1;
2677             /* Use realloc since mbcset->range_starts and mbcset->range_ends
2678                are NULL if *range_alloc == 0.  */
2679             new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2680                                           new_nranges);
2681             new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2682                                         new_nranges);
2683
2684             if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2685               return REG_ESPACE;
2686
2687             mbcset->range_starts = new_array_start;
2688             mbcset->range_ends = new_array_end;
2689             *range_alloc = new_nranges;
2690           }
2691
2692         mbcset->range_starts[mbcset->nranges] = start_wc;
2693         mbcset->range_ends[mbcset->nranges++] = end_wc;
2694       }
2695
2696     /* Build the table for single byte characters.  */
2697     for (wc = 0; wc < SBC_MAX; ++wc)
2698       {
2699         cmp_buf[2] = wc;
2700         if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2701             && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2702           bitset_set (sbcset, wc);
2703       }
2704   }
2705 # else /* not RE_ENABLE_I18N */
2706   {
2707     unsigned int ch;
2708     start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2709                 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2710                    : 0));
2711     end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2712               : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2713                  : 0));
2714     if (start_ch > end_ch)
2715       return REG_ERANGE;
2716     /* Build the table for single byte characters.  */
2717     for (ch = 0; ch < SBC_MAX; ++ch)
2718       if (start_ch <= ch  && ch <= end_ch)
2719         bitset_set (sbcset, ch);
2720   }
2721 # endif /* not RE_ENABLE_I18N */
2722   return REG_NOERROR;
2723 }
2724 #endif /* not _LIBC */
2725
2726 #ifndef _LIBC
2727 /* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2728    Build the collating element which is represented by NAME.
2729    The result are written to MBCSET and SBCSET.
2730    COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2731    pointer argument since we may update it.  */
2732
2733 static reg_errcode_t
2734 internal_function
2735 build_collating_symbol (bitset_t sbcset,
2736 # ifdef RE_ENABLE_I18N
2737                         re_charset_t *mbcset, Idx *coll_sym_alloc,
2738 # endif
2739                         const unsigned char *name)
2740 {
2741   size_t name_len = strlen ((const char *) name);
2742   if (BE (name_len != 1, 0))
2743     return REG_ECOLLATE;
2744   else
2745     {
2746       bitset_set (sbcset, name[0]);
2747       return REG_NOERROR;
2748     }
2749 }
2750 #endif /* not _LIBC */
2751
2752 /* This function parse bracket expression like "[abc]", "[a-c]",
2753    "[[.a-a.]]" etc.  */
2754
2755 static bin_tree_t *
2756 parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2757                    reg_syntax_t syntax, reg_errcode_t *err)
2758 {
2759 #ifdef _LIBC
2760   const unsigned char *collseqmb;
2761   const char *collseqwc;
2762   uint32_t nrules;
2763   int32_t table_size;
2764   const int32_t *symb_table;
2765   const unsigned char *extra;
2766
2767   /* Local function for parse_bracket_exp used in _LIBC environement.
2768      Seek the collating symbol entry correspondings to NAME.
2769      Return the index of the symbol in the SYMB_TABLE.  */
2770
2771   auto inline int32_t
2772   __attribute ((always_inline))
2773   seek_collating_symbol_entry (name, name_len)
2774          const unsigned char *name;
2775          size_t name_len;
2776     {
2777       int32_t hash = elem_hash ((const char *) name, name_len);
2778       int32_t elem = hash % table_size;
2779       if (symb_table[2 * elem] != 0)
2780         {
2781           int32_t second = hash % (table_size - 2) + 1;
2782
2783           do
2784             {
2785               /* First compare the hashing value.  */
2786               if (symb_table[2 * elem] == hash
2787                   /* Compare the length of the name.  */
2788                   && name_len == extra[symb_table[2 * elem + 1]]
2789                   /* Compare the name.  */
2790                   && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2791                              name_len) == 0)
2792                 {
2793                   /* Yep, this is the entry.  */
2794                   break;
2795                 }
2796
2797               /* Next entry.  */
2798               elem += second;
2799             }
2800           while (symb_table[2 * elem] != 0);
2801         }
2802       return elem;
2803     }
2804
2805   /* Local function for parse_bracket_exp used in _LIBC environement.
2806      Look up the collation sequence value of BR_ELEM.
2807      Return the value if succeeded, UINT_MAX otherwise.  */
2808
2809   auto inline unsigned int
2810   __attribute ((always_inline))
2811   lookup_collation_sequence_value (br_elem)
2812          bracket_elem_t *br_elem;
2813     {
2814       if (br_elem->type == SB_CHAR)
2815         {
2816           /*
2817           if (MB_CUR_MAX == 1)
2818           */
2819           if (nrules == 0)
2820             return collseqmb[br_elem->opr.ch];
2821           else
2822             {
2823               wint_t wc = __btowc (br_elem->opr.ch);
2824               return __collseq_table_lookup (collseqwc, wc);
2825             }
2826         }
2827       else if (br_elem->type == MB_CHAR)
2828         {
2829           return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2830         }
2831       else if (br_elem->type == COLL_SYM)
2832         {
2833           size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2834           if (nrules != 0)
2835             {
2836               int32_t elem, idx;
2837               elem = seek_collating_symbol_entry (br_elem->opr.name,
2838                                                   sym_name_len);
2839               if (symb_table[2 * elem] != 0)
2840                 {
2841                   /* We found the entry.  */
2842                   idx = symb_table[2 * elem + 1];
2843                   /* Skip the name of collating element name.  */
2844                   idx += 1 + extra[idx];
2845                   /* Skip the byte sequence of the collating element.  */
2846                   idx += 1 + extra[idx];
2847                   /* Adjust for the alignment.  */
2848                   idx = (idx + 3) & ~3;
2849                   /* Skip the multibyte collation sequence value.  */
2850                   idx += sizeof (unsigned int);
2851                   /* Skip the wide char sequence of the collating element.  */
2852                   idx += sizeof (unsigned int) *
2853                     (1 + *(unsigned int *) (extra + idx));
2854                   /* Return the collation sequence value.  */
2855                   return *(unsigned int *) (extra + idx);
2856                 }
2857               else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2858                 {
2859                   /* No valid character.  Match it as a single byte
2860                      character.  */
2861                   return collseqmb[br_elem->opr.name[0]];
2862                 }
2863             }
2864           else if (sym_name_len == 1)
2865             return collseqmb[br_elem->opr.name[0]];
2866         }
2867       return UINT_MAX;
2868     }
2869
2870   /* Local function for parse_bracket_exp used in _LIBC environement.
2871      Build the range expression which starts from START_ELEM, and ends
2872      at END_ELEM.  The result are written to MBCSET and SBCSET.
2873      RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2874      mbcset->range_ends, is a pointer argument sinse we may
2875      update it.  */
2876
2877   auto inline reg_errcode_t
2878   __attribute ((always_inline))
2879   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2880          re_charset_t *mbcset;
2881          Idx *range_alloc;
2882          bitset_t sbcset;
2883          bracket_elem_t *start_elem, *end_elem;
2884     {
2885       unsigned int ch;
2886       uint32_t start_collseq;
2887       uint32_t end_collseq;
2888
2889       /* Equivalence Classes and Character Classes can't be a range
2890          start/end.  */
2891       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2892               || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2893               0))
2894         return REG_ERANGE;
2895
2896       start_collseq = lookup_collation_sequence_value (start_elem);
2897       end_collseq = lookup_collation_sequence_value (end_elem);
2898       /* Check start/end collation sequence values.  */
2899       if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2900         return REG_ECOLLATE;
2901       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2902         return REG_ERANGE;
2903
2904       /* Got valid collation sequence values, add them as a new entry.
2905          However, if we have no collation elements, and the character set
2906          is single byte, the single byte character set that we
2907          build below suffices. */
2908       if (nrules > 0 || dfa->mb_cur_max > 1)
2909         {
2910           /* Check the space of the arrays.  */
2911           if (BE (*range_alloc == mbcset->nranges, 0))
2912             {
2913               /* There is not enough space, need realloc.  */
2914               uint32_t *new_array_start;
2915               uint32_t *new_array_end;
2916               Idx new_nranges;
2917
2918               /* +1 in case of mbcset->nranges is 0.  */
2919               new_nranges = 2 * mbcset->nranges + 1;
2920               new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2921                                             new_nranges);
2922               new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2923                                           new_nranges);
2924
2925               if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2926                 return REG_ESPACE;
2927
2928               mbcset->range_starts = new_array_start;
2929               mbcset->range_ends = new_array_end;
2930               *range_alloc = new_nranges;
2931             }
2932
2933           mbcset->range_starts[mbcset->nranges] = start_collseq;
2934           mbcset->range_ends[mbcset->nranges++] = end_collseq;
2935         }
2936
2937       /* Build the table for single byte characters.  */
2938       for (ch = 0; ch < SBC_MAX; ch++)
2939         {
2940           uint32_t ch_collseq;
2941           /*
2942           if (MB_CUR_MAX == 1)
2943           */
2944           if (nrules == 0)
2945             ch_collseq = collseqmb[ch];
2946           else
2947             ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2948           if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2949             bitset_set (sbcset, ch);
2950         }
2951       return REG_NOERROR;
2952     }
2953
2954   /* Local function for parse_bracket_exp used in _LIBC environement.
2955      Build the collating element which is represented by NAME.
2956      The result are written to MBCSET and SBCSET.
2957      COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2958      pointer argument sinse we may update it.  */
2959
2960   auto inline reg_errcode_t
2961   __attribute ((always_inline))
2962   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2963          re_charset_t *mbcset;
2964          Idx *coll_sym_alloc;
2965          bitset_t sbcset;
2966          const unsigned char *name;
2967     {
2968       int32_t elem, idx;
2969       size_t name_len = strlen ((const char *) name);
2970       if (nrules != 0)
2971         {
2972           elem = seek_collating_symbol_entry (name, name_len);
2973           if (symb_table[2 * elem] != 0)
2974             {
2975               /* We found the entry.  */
2976               idx = symb_table[2 * elem + 1];
2977               /* Skip the name of collating element name.  */
2978               idx += 1 + extra[idx];
2979             }
2980           else if (symb_table[2 * elem] == 0 && name_len == 1)
2981             {
2982               /* No valid character, treat it as a normal
2983                  character.  */
2984               bitset_set (sbcset, name[0]);
2985               return REG_NOERROR;
2986             }
2987           else
2988             return REG_ECOLLATE;
2989
2990           /* Got valid collation sequence, add it as a new entry.  */
2991           /* Check the space of the arrays.  */
2992           if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2993             {
2994               /* Not enough, realloc it.  */
2995               /* +1 in case of mbcset->ncoll_syms is 0.  */
2996               Idx new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2997               /* Use realloc since mbcset->coll_syms is NULL
2998                  if *alloc == 0.  */
2999               int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
3000                                                    new_coll_sym_alloc);
3001               if (BE (new_coll_syms == NULL, 0))
3002                 return REG_ESPACE;
3003               mbcset->coll_syms = new_coll_syms;
3004               *coll_sym_alloc = new_coll_sym_alloc;
3005             }
3006           mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
3007           return REG_NOERROR;
3008         }
3009       else
3010         {
3011           if (BE (name_len != 1, 0))
3012             return REG_ECOLLATE;
3013           else
3014             {
3015               bitset_set (sbcset, name[0]);
3016               return REG_NOERROR;
3017             }
3018         }
3019     }
3020 #endif
3021
3022   re_token_t br_token;
3023   re_bitset_ptr_t sbcset;
3024 #ifdef RE_ENABLE_I18N
3025   re_charset_t *mbcset;
3026   Idx coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
3027   Idx equiv_class_alloc = 0, char_class_alloc = 0;
3028 #endif /* not RE_ENABLE_I18N */
3029   bool non_match = false;
3030   bin_tree_t *work_tree;
3031   int token_len;
3032   bool first_round = true;
3033 #ifdef _LIBC
3034   collseqmb = (const unsigned char *)
3035     _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3036   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3037   if (nrules)
3038     {
3039       /*
3040       if (MB_CUR_MAX > 1)
3041       */
3042       collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3043       table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3044       symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3045                                                   _NL_COLLATE_SYMB_TABLEMB);
3046       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3047                                                    _NL_COLLATE_SYMB_EXTRAMB);
3048     }
3049 #endif
3050   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3051 #ifdef RE_ENABLE_I18N
3052   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3053 #endif /* RE_ENABLE_I18N */
3054 #ifdef RE_ENABLE_I18N
3055   if (BE (sbcset == NULL || mbcset == NULL, 0))
3056 #else
3057   if (BE (sbcset == NULL, 0))
3058 #endif /* RE_ENABLE_I18N */
3059     {
3060       *err = REG_ESPACE;
3061       return NULL;
3062     }
3063
3064   token_len = peek_token_bracket (token, regexp, syntax);
3065   if (BE (token->type == END_OF_RE, 0))
3066     {
3067       *err = REG_BADPAT;
3068       goto parse_bracket_exp_free_return;
3069     }
3070   if (token->type == OP_NON_MATCH_LIST)
3071     {
3072 #ifdef RE_ENABLE_I18N
3073       mbcset->non_match = 1;
3074 #endif /* not RE_ENABLE_I18N */
3075       non_match = true;
3076       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3077         bitset_set (sbcset, '\0');
3078       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3079       token_len = peek_token_bracket (token, regexp, syntax);
3080       if (BE (token->type == END_OF_RE, 0))
3081         {
3082           *err = REG_BADPAT;
3083           goto parse_bracket_exp_free_return;
3084         }
3085     }
3086
3087   /* We treat the first ']' as a normal character.  */
3088   if (token->type == OP_CLOSE_BRACKET)
3089     token->type = CHARACTER;
3090
3091   while (1)
3092     {
3093       bracket_elem_t start_elem, end_elem;
3094       unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3095       unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3096       reg_errcode_t ret;
3097       int token_len2 = 0;
3098       bool is_range_exp = false;
3099       re_token_t token2;
3100
3101       start_elem.opr.name = start_name_buf;
3102       ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3103                                    syntax, first_round);
3104       if (BE (ret != REG_NOERROR, 0))
3105         {
3106           *err = ret;
3107           goto parse_bracket_exp_free_return;
3108         }
3109       first_round = false;
3110
3111       /* Get information about the next token.  We need it in any case.  */
3112       token_len = peek_token_bracket (token, regexp, syntax);
3113
3114       /* Do not check for ranges if we know they are not allowed.  */
3115       if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3116         {
3117           if (BE (token->type == END_OF_RE, 0))
3118             {
3119               *err = REG_EBRACK;
3120               goto parse_bracket_exp_free_return;
3121             }
3122           if (token->type == OP_CHARSET_RANGE)
3123             {
3124               re_string_skip_bytes (regexp, token_len); /* Skip '-'.  */
3125               token_len2 = peek_token_bracket (&token2, regexp, syntax);
3126               if (BE (token2.type == END_OF_RE, 0))
3127                 {
3128                   *err = REG_EBRACK;
3129                   goto parse_bracket_exp_free_return;
3130                 }
3131               if (token2.type == OP_CLOSE_BRACKET)
3132                 {
3133                   /* We treat the last '-' as a normal character.  */
3134                   re_string_skip_bytes (regexp, -token_len);
3135                   token->type = CHARACTER;
3136                 }
3137               else
3138                 is_range_exp = true;
3139             }
3140         }
3141
3142       if (is_range_exp == true)
3143         {
3144           end_elem.opr.name = end_name_buf;
3145           ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3146                                        dfa, syntax, true);
3147           if (BE (ret != REG_NOERROR, 0))
3148             {
3149               *err = ret;
3150               goto parse_bracket_exp_free_return;
3151             }
3152
3153           token_len = peek_token_bracket (token, regexp, syntax);
3154
3155 #ifdef _LIBC
3156           *err = build_range_exp (sbcset, mbcset, &range_alloc,
3157                                   &start_elem, &end_elem);
3158 #else
3159 # ifdef RE_ENABLE_I18N
3160           *err = build_range_exp (sbcset,
3161                                   dfa->mb_cur_max > 1 ? mbcset : NULL,
3162                                   &range_alloc, &start_elem, &end_elem);
3163 # else
3164           *err = build_range_exp (sbcset, &start_elem, &end_elem);
3165 # endif
3166 #endif /* RE_ENABLE_I18N */
3167           if (BE (*err != REG_NOERROR, 0))
3168             goto parse_bracket_exp_free_return;
3169         }
3170       else
3171         {
3172           switch (start_elem.type)
3173             {
3174             case SB_CHAR:
3175               bitset_set (sbcset, start_elem.opr.ch);
3176               break;
3177 #ifdef RE_ENABLE_I18N
3178             case MB_CHAR:
3179               /* Check whether the array has enough space.  */
3180               if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3181                 {
3182                   wchar_t *new_mbchars;
3183                   /* Not enough, realloc it.  */
3184                   /* +1 in case of mbcset->nmbchars is 0.  */
3185                   mbchar_alloc = 2 * mbcset->nmbchars + 1;
3186                   /* Use realloc since array is NULL if *alloc == 0.  */
3187                   new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3188                                             mbchar_alloc);
3189                   if (BE (new_mbchars == NULL, 0))
3190                     goto parse_bracket_exp_espace;
3191                   mbcset->mbchars = new_mbchars;
3192                 }
3193               mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3194               break;
3195 #endif /* RE_ENABLE_I18N */
3196             case EQUIV_CLASS:
3197               *err = build_equiv_class (sbcset,
3198 #ifdef RE_ENABLE_I18N
3199                                         mbcset, &equiv_class_alloc,
3200 #endif /* RE_ENABLE_I18N */
3201                                         start_elem.opr.name);
3202               if (BE (*err != REG_NOERROR, 0))
3203                 goto parse_bracket_exp_free_return;
3204               break;
3205             case COLL_SYM:
3206               *err = build_collating_symbol (sbcset,
3207 #ifdef RE_ENABLE_I18N
3208                                              mbcset, &coll_sym_alloc,
3209 #endif /* RE_ENABLE_I18N */
3210                                              start_elem.opr.name);
3211               if (BE (*err != REG_NOERROR, 0))
3212                 goto parse_bracket_exp_free_return;
3213               break;
3214             case CHAR_CLASS:
3215               *err = build_charclass (regexp->trans, sbcset,
3216 #ifdef RE_ENABLE_I18N
3217                                       mbcset, &char_class_alloc,
3218 #endif /* RE_ENABLE_I18N */
3219                                       start_elem.opr.name, syntax);
3220               if (BE (*err != REG_NOERROR, 0))
3221                goto parse_bracket_exp_free_return;
3222               break;
3223             default:
3224               assert (0);
3225               break;
3226             }
3227         }
3228       if (BE (token->type == END_OF_RE, 0))
3229         {
3230           *err = REG_EBRACK;
3231           goto parse_bracket_exp_free_return;
3232         }
3233       if (token->type == OP_CLOSE_BRACKET)
3234         break;
3235     }
3236
3237   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3238
3239   /* If it is non-matching list.  */
3240   if (non_match)
3241     bitset_not (sbcset);
3242
3243 #ifdef RE_ENABLE_I18N
3244   /* Ensure only single byte characters are set.  */
3245   if (dfa->mb_cur_max > 1)
3246     bitset_mask (sbcset, dfa->sb_char);
3247
3248   if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3249       || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3250                                                      || mbcset->non_match)))
3251     {
3252       bin_tree_t *mbc_tree;
3253       int sbc_idx;
3254       /* Build a tree for complex bracket.  */
3255       dfa->has_mb_node = 1;
3256       br_token.type = COMPLEX_BRACKET;
3257       br_token.opr.mbcset = mbcset;
3258       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3259       if (BE (mbc_tree == NULL, 0))
3260         goto parse_bracket_exp_espace;
3261       for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3262         if (sbcset[sbc_idx])
3263           break;
3264       /* If there are no bits set in sbcset, there is no point
3265          of having both SIMPLE_BRACKET and COMPLEX_BRACKET.  */
3266       if (sbc_idx < BITSET_WORDS)
3267         {
3268           /* Build a tree for simple bracket.  */
3269           br_token.type = SIMPLE_BRACKET;
3270           br_token.opr.sbcset = sbcset;
3271           work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3272           if (BE (work_tree == NULL, 0))
3273             goto parse_bracket_exp_espace;
3274
3275           /* Then join them by ALT node.  */
3276           work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3277           if (BE (work_tree == NULL, 0))
3278             goto parse_bracket_exp_espace;
3279         }
3280       else
3281         {
3282           re_free (sbcset);
3283           work_tree = mbc_tree;
3284         }
3285     }
3286   else
3287 #endif /* not RE_ENABLE_I18N */
3288     {
3289 #ifdef RE_ENABLE_I18N
3290       free_charset (mbcset);
3291 #endif
3292       /* Build a tree for simple bracket.  */
3293       br_token.type = SIMPLE_BRACKET;
3294       br_token.opr.sbcset = sbcset;
3295       work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3296       if (BE (work_tree == NULL, 0))
3297         goto parse_bracket_exp_espace;
3298     }
3299   return work_tree;
3300
3301  parse_bracket_exp_espace:
3302   *err = REG_ESPACE;
3303  parse_bracket_exp_free_return:
3304   re_free (sbcset);
3305 #ifdef RE_ENABLE_I18N
3306   free_charset (mbcset);
3307 #endif /* RE_ENABLE_I18N */
3308   return NULL;
3309 }
3310
3311 /* Parse an element in the bracket expression.  */
3312
3313 static reg_errcode_t
3314 parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3315                        re_token_t *token, int token_len, re_dfa_t *dfa,
3316                        reg_syntax_t syntax, bool accept_hyphen)
3317 {
3318 #ifdef RE_ENABLE_I18N
3319   int cur_char_size;
3320   cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3321   if (cur_char_size > 1)
3322     {
3323       elem->type = MB_CHAR;
3324       elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3325       re_string_skip_bytes (regexp, cur_char_size);
3326       return REG_NOERROR;
3327     }
3328 #endif /* RE_ENABLE_I18N */
3329   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
3330   if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3331       || token->type == OP_OPEN_EQUIV_CLASS)
3332     return parse_bracket_symbol (elem, regexp, token);
3333   if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3334     {
3335       /* A '-' must only appear as anything but a range indicator before
3336          the closing bracket.  Everything else is an error.  */
3337       re_token_t token2;
3338       (void) peek_token_bracket (&token2, regexp, syntax);
3339       if (token2.type != OP_CLOSE_BRACKET)
3340         /* The actual error value is not standardized since this whole
3341            case is undefined.  But ERANGE makes good sense.  */
3342         return REG_ERANGE;
3343     }
3344   elem->type = SB_CHAR;
3345   elem->opr.ch = token->opr.c;
3346   return REG_NOERROR;
3347 }
3348
3349 /* Parse a bracket symbol in the bracket expression.  Bracket symbols are
3350    such as [:<character_class>:], [.<collating_element>.], and
3351    [=<equivalent_class>=].  */
3352
3353 static reg_errcode_t
3354 parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3355                       re_token_t *token)
3356 {
3357   unsigned char ch, delim = token->opr.c;
3358   int i = 0;
3359   if (re_string_eoi(regexp))
3360     return REG_EBRACK;
3361   for (;; ++i)
3362     {
3363       if (i >= BRACKET_NAME_BUF_SIZE)
3364         return REG_EBRACK;
3365       if (token->type == OP_OPEN_CHAR_CLASS)
3366         ch = re_string_fetch_byte_case (regexp);
3367       else
3368         ch = re_string_fetch_byte (regexp);
3369       if (re_string_eoi(regexp))
3370         return REG_EBRACK;
3371       if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3372         break;
3373       elem->opr.name[i] = ch;
3374     }
3375   re_string_skip_bytes (regexp, 1);
3376   elem->opr.name[i] = '\0';
3377   switch (token->type)
3378     {
3379     case OP_OPEN_COLL_ELEM:
3380       elem->type = COLL_SYM;
3381       break;
3382     case OP_OPEN_EQUIV_CLASS:
3383       elem->type = EQUIV_CLASS;
3384       break;
3385     case OP_OPEN_CHAR_CLASS:
3386       elem->type = CHAR_CLASS;
3387       break;
3388     default:
3389       break;
3390     }
3391   return REG_NOERROR;
3392 }
3393
3394   /* Helper function for parse_bracket_exp.
3395      Build the equivalence class which is represented by NAME.
3396      The result are written to MBCSET and SBCSET.
3397      EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3398      is a pointer argument sinse we may update it.  */
3399
3400 static reg_errcode_t
3401 #ifdef RE_ENABLE_I18N
3402 build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3403                    Idx *equiv_class_alloc, const unsigned char *name)
3404 #else /* not RE_ENABLE_I18N */
3405 build_equiv_class (bitset_t sbcset, const unsigned char *name)
3406 #endif /* not RE_ENABLE_I18N */
3407 {
3408 #ifdef _LIBC
3409   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3410   if (nrules != 0)
3411     {
3412       const int32_t *table, *indirect;
3413       const unsigned char *weights, *extra, *cp;
3414       unsigned char char_buf[2];
3415       int32_t idx1, idx2;
3416       unsigned int ch;
3417       size_t len;
3418       /* This #include defines a local function!  */
3419 # include <locale/weight.h>
3420       /* Calculate the index for equivalence class.  */
3421       cp = name;
3422       table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3423       weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3424                                                _NL_COLLATE_WEIGHTMB);
3425       extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3426                                                    _NL_COLLATE_EXTRAMB);
3427       indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3428                                                 _NL_COLLATE_INDIRECTMB);
3429       idx1 = findidx (&cp);
3430       if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3431         /* This isn't a valid character.  */
3432         return REG_ECOLLATE;
3433
3434       /* Build single byte matcing table for this equivalence class.  */
3435       char_buf[1] = (unsigned char) '\0';
3436       len = weights[idx1];
3437       for (ch = 0; ch < SBC_MAX; ++ch)
3438         {
3439           char_buf[0] = ch;
3440           cp = char_buf;
3441           idx2 = findidx (&cp);
3442 /*
3443           idx2 = table[ch];
3444 */
3445           if (idx2 == 0)
3446             /* This isn't a valid character.  */
3447             continue;
3448           if (len == weights[idx2])
3449             {
3450               int cnt = 0;
3451               while (cnt <= len &&
3452                      weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3453                 ++cnt;
3454
3455               if (cnt > len)
3456                 bitset_set (sbcset, ch);
3457             }
3458         }
3459       /* Check whether the array has enough space.  */
3460       if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3461         {
3462           /* Not enough, realloc it.  */
3463           /* +1 in case of mbcset->nequiv_classes is 0.  */
3464           Idx new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3465           /* Use realloc since the array is NULL if *alloc == 0.  */
3466           int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3467                                                    int32_t,
3468                                                    new_equiv_class_alloc);
3469           if (BE (new_equiv_classes == NULL, 0))
3470             return REG_ESPACE;
3471           mbcset->equiv_classes = new_equiv_classes;
3472           *equiv_class_alloc = new_equiv_class_alloc;
3473         }
3474       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3475     }
3476   else
3477 #endif /* _LIBC */
3478     {
3479       if (BE (strlen ((const char *) name) != 1, 0))
3480         return REG_ECOLLATE;
3481       bitset_set (sbcset, *name);
3482     }
3483   return REG_NOERROR;
3484 }
3485
3486   /* Helper function for parse_bracket_exp.
3487      Build the character class which is represented by NAME.
3488      The result are written to MBCSET and SBCSET.
3489      CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3490      is a pointer argument sinse we may update it.  */
3491
3492 static reg_errcode_t
3493 #ifdef RE_ENABLE_I18N
3494 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3495                  re_charset_t *mbcset, Idx *char_class_alloc,
3496                  const unsigned char *class_name, reg_syntax_t syntax)
3497 #else /* not RE_ENABLE_I18N */
3498 build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3499                  const unsigned char *class_name, reg_syntax_t syntax)
3500 #endif /* not RE_ENABLE_I18N */
3501 {
3502   int i;
3503   const char *name = (const char *) class_name;
3504
3505   /* In case of REG_ICASE "upper" and "lower" match the both of
3506      upper and lower cases.  */
3507   if ((syntax & RE_ICASE)
3508       && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3509     name = "alpha";
3510
3511 #ifdef RE_ENABLE_I18N
3512   /* Check the space of the arrays.  */
3513   if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3514     {
3515       /* Not enough, realloc it.  */
3516       /* +1 in case of mbcset->nchar_classes is 0.  */
3517       Idx new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3518       /* Use realloc since array is NULL if *alloc == 0.  */
3519       wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3520                                                new_char_class_alloc);
3521       if (BE (new_char_classes == NULL, 0))
3522         return REG_ESPACE;
3523       mbcset->char_classes = new_char_classes;
3524       *char_class_alloc = new_char_class_alloc;
3525     }
3526   mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3527 #endif /* RE_ENABLE_I18N */
3528
3529 #define BUILD_CHARCLASS_LOOP(ctype_func)        \
3530   do {                                          \
3531     if (BE (trans != NULL, 0))                  \
3532       {                                         \
3533         for (i = 0; i < SBC_MAX; ++i)           \
3534           if (ctype_func (i))                   \
3535             bitset_set (sbcset, trans[i]);      \
3536       }                                         \
3537     else                                        \
3538       {                                         \
3539         for (i = 0; i < SBC_MAX; ++i)           \
3540           if (ctype_func (i))                   \
3541             bitset_set (sbcset, i);             \
3542       }                                         \
3543   } while (0)
3544
3545   if (strcmp (name, "alnum") == 0)
3546     BUILD_CHARCLASS_LOOP (isalnum);
3547   else if (strcmp (name, "cntrl") == 0)
3548     BUILD_CHARCLASS_LOOP (iscntrl);
3549   else if (strcmp (name, "lower") == 0)
3550     BUILD_CHARCLASS_LOOP (islower);
3551   else if (strcmp (name, "space") == 0)
3552     BUILD_CHARCLASS_LOOP (isspace);
3553   else if (strcmp (name, "alpha") == 0)
3554     BUILD_CHARCLASS_LOOP (isalpha);
3555   else if (strcmp (name, "digit") == 0)
3556     BUILD_CHARCLASS_LOOP (isdigit);
3557   else if (strcmp (name, "print") == 0)
3558     BUILD_CHARCLASS_LOOP (isprint);
3559   else if (strcmp (name, "upper") == 0)
3560     BUILD_CHARCLASS_LOOP (isupper);
3561   else if (strcmp (name, "blank") == 0)
3562     BUILD_CHARCLASS_LOOP (isblank);
3563   else if (strcmp (name, "graph") == 0)
3564     BUILD_CHARCLASS_LOOP (isgraph);
3565   else if (strcmp (name, "punct") == 0)
3566     BUILD_CHARCLASS_LOOP (ispunct);
3567   else if (strcmp (name, "xdigit") == 0)
3568     BUILD_CHARCLASS_LOOP (isxdigit);
3569   else
3570     return REG_ECTYPE;
3571
3572   return REG_NOERROR;
3573 }
3574
3575 static bin_tree_t *
3576 build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3577                     const unsigned char *class_name,
3578                     const unsigned char *extra, bool non_match,
3579                     reg_errcode_t *err)
3580 {
3581   re_bitset_ptr_t sbcset;
3582 #ifdef RE_ENABLE_I18N
3583   re_charset_t *mbcset;
3584   Idx alloc = 0;
3585 #endif /* not RE_ENABLE_I18N */
3586   reg_errcode_t ret;
3587   re_token_t br_token;
3588   bin_tree_t *tree;
3589
3590   sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3591 #ifdef RE_ENABLE_I18N
3592   mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3593 #endif /* RE_ENABLE_I18N */
3594
3595 #ifdef RE_ENABLE_I18N
3596   if (BE (sbcset == NULL || mbcset == NULL, 0))
3597 #else /* not RE_ENABLE_I18N */
3598   if (BE (sbcset == NULL, 0))
3599 #endif /* not RE_ENABLE_I18N */
3600     {
3601       *err = REG_ESPACE;
3602       return NULL;
3603     }
3604
3605   if (non_match)
3606     {
3607 #ifdef RE_ENABLE_I18N
3608       /*
3609       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3610         bitset_set(cset->sbcset, '\0');
3611       */
3612       mbcset->non_match = 1;
3613 #endif /* not RE_ENABLE_I18N */
3614     }
3615
3616   /* We don't care the syntax in this case.  */
3617   ret = build_charclass (trans, sbcset,
3618 #ifdef RE_ENABLE_I18N
3619                          mbcset, &alloc,
3620 #endif /* RE_ENABLE_I18N */
3621                          class_name, 0);
3622
3623   if (BE (ret != REG_NOERROR, 0))
3624     {
3625       re_free (sbcset);
3626 #ifdef RE_ENABLE_I18N
3627       free_charset (mbcset);
3628 #endif /* RE_ENABLE_I18N */
3629       *err = ret;
3630       return NULL;
3631     }
3632   /* \w match '_' also.  */
3633   for (; *extra; extra++)
3634     bitset_set (sbcset, *extra);
3635
3636   /* If it is non-matching list.  */
3637   if (non_match)
3638     bitset_not (sbcset);
3639
3640 #ifdef RE_ENABLE_I18N
3641   /* Ensure only single byte characters are set.  */
3642   if (dfa->mb_cur_max > 1)
3643     bitset_mask (sbcset, dfa->sb_char);
3644 #endif
3645
3646   /* Build a tree for simple bracket.  */
3647   br_token.type = SIMPLE_BRACKET;
3648   br_token.opr.sbcset = sbcset;
3649   tree = create_token_tree (dfa, NULL, NULL, &br_token);
3650   if (BE (tree == NULL, 0))
3651     goto build_word_op_espace;
3652
3653 #ifdef RE_ENABLE_I18N
3654   if (dfa->mb_cur_max > 1)
3655     {
3656       bin_tree_t *mbc_tree;
3657       /* Build a tree for complex bracket.  */
3658       br_token.type = COMPLEX_BRACKET;
3659       br_token.opr.mbcset = mbcset;
3660       dfa->has_mb_node = 1;
3661       mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3662       if (BE (mbc_tree == NULL, 0))
3663         goto build_word_op_espace;
3664       /* Then join them by ALT node.  */
3665       tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3666       if (BE (mbc_tree != NULL, 1))
3667         return tree;
3668     }
3669   else
3670     {
3671       free_charset (mbcset);
3672       return tree;
3673     }
3674 #else /* not RE_ENABLE_I18N */
3675   return tree;
3676 #endif /* not RE_ENABLE_I18N */
3677
3678  build_word_op_espace:
3679   re_free (sbcset);
3680 #ifdef RE_ENABLE_I18N
3681   free_charset (mbcset);
3682 #endif /* RE_ENABLE_I18N */
3683   *err = REG_ESPACE;
3684   return NULL;
3685 }
3686
3687 /* This is intended for the expressions like "a{1,3}".
3688    Fetch a number from `input', and return the number.
3689    Return REG_MISSING if the number field is empty like "{,1}".
3690    Return REG_ERROR if an error occurred.  */
3691
3692 static Idx
3693 fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3694 {
3695   Idx num = REG_MISSING;
3696   unsigned char c;
3697   while (1)
3698     {
3699       fetch_token (token, input, syntax);
3700       c = token->opr.c;
3701       if (BE (token->type == END_OF_RE, 0))
3702         return REG_ERROR;
3703       if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3704         break;
3705       num = ((token->type != CHARACTER || c < '0' || '9' < c
3706               || num == REG_ERROR)
3707              ? REG_ERROR
3708              : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
3709       num = (num > RE_DUP_MAX) ? REG_ERROR : num;
3710     }
3711   return num;
3712 }
3713 \f
3714 #ifdef RE_ENABLE_I18N
3715 static void
3716 free_charset (re_charset_t *cset)
3717 {
3718   re_free (cset->mbchars);
3719 # ifdef _LIBC
3720   re_free (cset->coll_syms);
3721   re_free (cset->equiv_classes);
3722   re_free (cset->range_starts);
3723   re_free (cset->range_ends);
3724 # endif
3725   re_free (cset->char_classes);
3726   re_free (cset);
3727 }
3728 #endif /* RE_ENABLE_I18N */
3729 \f
3730 /* Functions for binary tree operation.  */
3731
3732 /* Create a tree node.  */
3733
3734 static bin_tree_t *
3735 create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3736              re_token_type_t type)
3737 {
3738   re_token_t t;
3739   t.type = type;
3740   return create_token_tree (dfa, left, right, &t);
3741 }
3742
3743 static bin_tree_t *
3744 create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3745                    const re_token_t *token)
3746 {
3747   bin_tree_t *tree;
3748   if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3749     {
3750       bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3751
3752       if (storage == NULL)
3753         return NULL;
3754       storage->next = dfa->str_tree_storage;
3755       dfa->str_tree_storage = storage;
3756       dfa->str_tree_storage_idx = 0;
3757     }
3758   tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3759
3760   tree->parent = NULL;
3761   tree->left = left;
3762   tree->right = right;
3763   tree->token = *token;
3764   tree->token.duplicated = 0;
3765   tree->token.opt_subexp = 0;
3766   tree->first = NULL;
3767   tree->next = NULL;
3768   tree->node_idx = REG_MISSING;
3769
3770   if (left != NULL)
3771     left->parent = tree;
3772   if (right != NULL)
3773     right->parent = tree;
3774   return tree;
3775 }
3776
3777 /* Mark the tree SRC as an optional subexpression.
3778    To be called from preorder or postorder.  */
3779
3780 static reg_errcode_t
3781 mark_opt_subexp (void *extra, bin_tree_t *node)
3782 {
3783   Idx idx = (Idx) (long) extra;
3784   if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3785     node->token.opt_subexp = 1;
3786
3787   return REG_NOERROR;
3788 }
3789
3790 /* Free the allocated memory inside NODE. */
3791
3792 static void
3793 free_token (re_token_t *node)
3794 {
3795 #ifdef RE_ENABLE_I18N
3796   if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3797     free_charset (node->opr.mbcset);
3798   else
3799 #endif /* RE_ENABLE_I18N */
3800     if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3801       re_free (node->opr.sbcset);
3802 }
3803
3804 /* Worker function for tree walking.  Free the allocated memory inside NODE
3805    and its children. */
3806
3807 static reg_errcode_t
3808 free_tree (void *extra, bin_tree_t *node)
3809 {
3810   free_token (&node->token);
3811   return REG_NOERROR;
3812 }
3813
3814
3815 /* Duplicate the node SRC, and return new node.  This is a preorder
3816    visit similar to the one implemented by the generic visitor, but
3817    we need more infrastructure to maintain two parallel trees --- so,
3818    it's easier to duplicate.  */
3819
3820 static bin_tree_t *
3821 duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3822 {
3823   const bin_tree_t *node;
3824   bin_tree_t *dup_root;
3825   bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3826
3827   for (node = root; ; )
3828     {
3829       /* Create a new tree and link it back to the current parent.  */
3830       *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3831       if (*p_new == NULL)
3832         return NULL;
3833       (*p_new)->parent = dup_node;
3834       (*p_new)->token.duplicated = 1;
3835       dup_node = *p_new;
3836
3837       /* Go to the left node, or up and to the right.  */
3838       if (node->left)
3839         {
3840           node = node->left;
3841           p_new = &dup_node->left;
3842         }
3843       else
3844         {
3845           const bin_tree_t *prev = NULL;
3846           while (node->right == prev || node->right == NULL)
3847             {
3848               prev = node;
3849               node = node->parent;
3850               dup_node = dup_node->parent;
3851               if (!node)
3852                 return dup_root;
3853             }
3854           node = node->right;
3855           p_new = &dup_node->right;
3856         }
3857     }
3858 }