compat/regex/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19    02110-1301 USA.  */
  20
  21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                      int n) internal_function;
  23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24 static void match_ctx_free (re_match_context_t *cache) internal_function;
  25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  26                                           int str_idx, int from, int to)
  27      internal_function;
  28 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
  29      internal_function;
  30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  31                                            int str_idx) internal_function;
  32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  33                                                    int node, int str_idx)
  34      internal_function;
  35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  36                            re_dfastate_t **limited_sts, int last_node,
  37                            int last_str_idx)
  38      internal_function;
  39 static reg_errcode_t re_search_internal (const regex_t *preg,
  40                                          const char *string, int length,
  41                                          int start, int range, int stop,
  42                                          size_t nmatch, regmatch_t pmatch[],
  43                                          int eflags)
  44      internal_function;
  45 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  46                              const char *string1, int length1,
  47                              const char *string2, int length2,
  48                              int start, int range, struct re_registers *regs,
  49                              int stop, int ret_len)
  50      internal_function;
  51 static int re_search_stub (struct re_pattern_buffer *bufp,
  52                            const char *string, int length, int start,
  53                            int range, int stop, struct re_registers *regs,
  54                            int ret_len)
  55      internal_function;
  56 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  57                               int nregs, int regs_allocated)
  58      internal_function;
  59 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  60      internal_function;
  61 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  62                            int *p_match_first) internal_function;
  63 static int check_halt_state_context (const re_match_context_t *mctx,
  64                                      const re_dfastate_t *state, int idx)
  65      internal_function;
  66 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  67                          regmatch_t *prev_idx_match, int cur_node,
  68                          int cur_idx, int nmatch) internal_function;
  69 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  70                                       int str_idx, int dest_node, int nregs,
  71                                       regmatch_t *regs,
  72                                       re_node_set *eps_via_nodes)
  73      internal_function;
  74 static reg_errcode_t set_regs (const regex_t *preg,
  75                                const re_match_context_t *mctx,
  76                                size_t nmatch, regmatch_t *pmatch,
  77                                int fl_backtrack) internal_function;
  78 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
  79      internal_function;
  80
  81 #ifdef RE_ENABLE_I18N
  82 static int sift_states_iter_mb (const re_match_context_t *mctx,
  83                                 re_sift_context_t *sctx,
  84                                 int node_idx, int str_idx, int max_str_idx)
  85      internal_function;
  86 #endif /* RE_ENABLE_I18N */
  87 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  88                                            re_sift_context_t *sctx)
  89      internal_function;
  90 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  91                                           re_sift_context_t *sctx, int str_idx,
  92                                           re_node_set *cur_dest)
  93      internal_function;
  94 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  95                                               re_sift_context_t *sctx,
  96                                               int str_idx,
  97                                               re_node_set *dest_nodes)
  98      internal_function;
  99 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
 100                                             re_node_set *dest_nodes,
 101                                             const re_node_set *candidates)
 102      internal_function;
 103 static int check_dst_limits (const re_match_context_t *mctx,
 104                              re_node_set *limits,
 105                              int dst_node, int dst_idx, int src_node,
 106                              int src_idx) internal_function;
 107 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
 108                                         int boundaries, int subexp_idx,
 109                                         int from_node, int bkref_idx)
 110      internal_function;
 111 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
 112                                       int limit, int subexp_idx,
 113                                       int node, int str_idx,
 114                                       int bkref_idx) internal_function;
 115 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
 116                                           re_node_set *dest_nodes,
 117                                           const re_node_set *candidates,
 118                                           re_node_set *limits,
 119                                           struct re_backref_cache_entry *bkref_ents,
 120                                           int str_idx) internal_function;
 121 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 122                                         re_sift_context_t *sctx,
 123                                         int str_idx, const re_node_set *candidates)
 124      internal_function;
 125 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 126                                         re_dfastate_t **dst,
 127                                         re_dfastate_t **src, int num)
 128      internal_function;
 129 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 130                                          re_match_context_t *mctx) internal_function;
 131 static re_dfastate_t *transit_state (reg_errcode_t *err,
 132                                      re_match_context_t *mctx,
 133                                      re_dfastate_t *state) internal_function;
 134 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 135                                             re_match_context_t *mctx,
 136                                             re_dfastate_t *next_state)
 137      internal_function;
 138 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 139                                                 re_node_set *cur_nodes,
 140                                                 int str_idx) internal_function;
 141 #if 0
 142 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 143                                         re_match_context_t *mctx,
 144                                         re_dfastate_t *pstate)
 145      internal_function;
 146 #endif
 147 #ifdef RE_ENABLE_I18N
 148 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 149                                        re_dfastate_t *pstate)
 150      internal_function;
 151 #endif /* RE_ENABLE_I18N */
 152 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 153                                           const re_node_set *nodes)
 154      internal_function;
 155 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 156                                  int bkref_node, int bkref_str_idx)
 157      internal_function;
 158 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 159                                      const re_sub_match_top_t *sub_top,
 160                                      re_sub_match_last_t *sub_last,
 161                                      int bkref_node, int bkref_str)
 162      internal_function;
 163 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 164                              int subexp_idx, int type) internal_function;
 165 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 166                                     state_array_t *path, int top_node,
 167                                     int top_str, int last_node, int last_str,
 168                                     int type) internal_function;
 169 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 170                                                    int str_idx,
 171                                                    re_node_set *cur_nodes,
 172                                                    re_node_set *next_nodes)
 173      internal_function;
 174 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 175                                                re_node_set *cur_nodes,
 176                                                int ex_subexp, int type)
 177      internal_function;
 178 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 179                                                    re_node_set *dst_nodes,
 180                                                    int target, int ex_subexp,
 181                                                    int type) internal_function;
 182 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 183                                          re_node_set *cur_nodes, int cur_str,
 184                                          int subexp_num, int type)
 185      internal_function;
 186 static int build_trtable (const re_dfa_t *dfa,
 187                           re_dfastate_t *state) internal_function;
 188 #ifdef RE_ENABLE_I18N
 189 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 190                                     const re_string_t *input, int idx)
 191      internal_function;
 192 # ifdef _LIBC
 193 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 194                                                    size_t name_len)
 195      internal_function;
 196 # endif /* _LIBC */
 197 #endif /* RE_ENABLE_I18N */
 198 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
 199                                        const re_dfastate_t *state,
 200                                        re_node_set *states_node,
 201                                        bitset_t *states_ch) internal_function;
 202 static int check_node_accept (const re_match_context_t *mctx,
 203                               const re_token_t *node, int idx)
 204      internal_function;
 205 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
 206      internal_function;
 207 \f
 208 /* Entry point for POSIX code.  */
 209
 210 /* regexec searches for a given pattern, specified by PREG, in the
 211    string STRING.
 212
 213    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 214    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 215    least NMATCH elements, and we set them to the offsets of the
 216    corresponding matched substrings.
 217
 218    EFLAGS specifies `execution flags' which affect matching: if
 219    REG_NOTBOL is set, then ^ does not match at the beginning of the
 220    string; if REG_NOTEOL is set, then $ does not match at the end.
 221
 222    We return 0 if we find a match and REG_NOMATCH if not.  */
 223
 224 int
 225 regexec (
 226         const regex_t *__restrict preg,
 227         const char *__restrict string,
 228         size_t nmatch,
 229         regmatch_t pmatch[],
 230         int eflags)
 231 {
 232   reg_errcode_t err;
 233   int start, length;
 234
 235   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 236     return REG_BADPAT;
 237
 238   if (eflags & REG_STARTEND)
 239     {
 240       start = pmatch[0].rm_so;
 241       length = pmatch[0].rm_eo;
 242     }
 243   else
 244     {
 245       start = 0;
 246       length = strlen (string);
 247     }
 248
 249   __libc_lock_lock (dfa->lock);
 250   if (preg->no_sub)
 251     err = re_search_internal (preg, string, length, start, length - start,
 252                               length, 0, NULL, eflags);
 253   else
 254     err = re_search_internal (preg, string, length, start, length - start,
 255                               length, nmatch, pmatch, eflags);
 256   __libc_lock_unlock (dfa->lock);
 257   return err != REG_NOERROR;
 258 }
 259
 260 #ifdef _LIBC
 261 # include <shlib-compat.h>
 262 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 263
 264 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 265 __typeof__ (__regexec) __compat_regexec;
 266
 267 int
 268 attribute_compat_text_section
 269 __compat_regexec (const regex_t *__restrict preg,
 270                   const char *__restrict string, size_t nmatch,
 271                   regmatch_t pmatch[], int eflags)
 272 {
 273   return regexec (preg, string, nmatch, pmatch,
 274                   eflags & (REG_NOTBOL | REG_NOTEOL));
 275 }
 276 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 277 # endif
 278 #endif
 279
 280 /* Entry points for GNU code.  */
 281
 282 /* re_match, re_search, re_match_2, re_search_2
 283
 284    The former two functions operate on STRING with length LENGTH,
 285    while the later two operate on concatenation of STRING1 and STRING2
 286    with lengths LENGTH1 and LENGTH2, respectively.
 287
 288    re_match() matches the compiled pattern in BUFP against the string,
 289    starting at index START.
 290
 291    re_search() first tries matching at index START, then it tries to match
 292    starting from index START + 1, and so on.  The last start position tried
 293    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 294    way as re_match().)
 295
 296    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 297    the first STOP characters of the concatenation of the strings should be
 298    concerned.
 299
 300    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 301    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 302    computed relative to the concatenation, not relative to the individual
 303    strings.)
 304
 305    On success, re_match* functions return the length of the match, re_search*
 306    return the position of the start of the match.  Return value -1 means no
 307    match was found and -2 indicates an internal error.  */
 308
 309 int
 310 re_match (struct re_pattern_buffer *bufp,
 311           const char *string,
 312           int length,
 313           int start,
 314           struct re_registers *regs)
 315 {
 316   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 317 }
 318 #ifdef _LIBC
 319 weak_alias (__re_match, re_match)
 320 #endif
 321
 322 int
 323 re_search (struct re_pattern_buffer *bufp,
 324            const char *string,
 325            int length, int start, int range,
 326            struct re_registers *regs)
 327 {
 328   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 329 }
 330 #ifdef _LIBC
 331 weak_alias (__re_search, re_search)
 332 #endif
 333
 334 int
 335 re_match_2 (struct re_pattern_buffer *bufp,
 336             const char *string1, int length1,
 337             const char *string2, int length2, int start,
 338             struct re_registers *regs, int stop)
 339 {
 340   return re_search_2_stub (bufp, string1, length1, string2, length2,
 341                            start, 0, regs, stop, 1);
 342 }
 343 #ifdef _LIBC
 344 weak_alias (__re_match_2, re_match_2)
 345 #endif
 346
 347 int
 348 re_search_2 (struct re_pattern_buffer *bufp,
 349              const char *string1, int length1,
 350              const char *string2, int length2, int start,
 351              int range, struct re_registers *regs,  int stop)
 352 {
 353   return re_search_2_stub (bufp, string1, length1, string2, length2,
 354                            start, range, regs, stop, 0);
 355 }
 356 #ifdef _LIBC
 357 weak_alias (__re_search_2, re_search_2)
 358 #endif
 359
 360 static int internal_function
 361 re_search_2_stub (struct re_pattern_buffer *bufp,
 362                   const char *string1, int length1,
 363                   const char *string2, int length2, int start,
 364                   int range, struct re_registers *regs,
 365                   int stop, int ret_len)
 366 {
 367   const char *str;
 368   int rval;
 369   int len = length1 + length2;
 370   int free_str = 0;
 371
 372   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 373     return -2;
 374
 375   /* Concatenate the strings.  */
 376   if (length2 > 0)
 377     if (length1 > 0)
 378       {
 379         char *s = re_malloc (char, len);
 380
 381         if (BE (s == NULL, 0))
 382           return -2;
 383         memcpy (s, string1, length1);
 384         memcpy (s + length1, string2, length2);
 385         str = s;
 386         free_str = 1;
 387       }
 388     else
 389       str = string2;
 390   else
 391     str = string1;
 392
 393   rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
 394   if (free_str)
 395     re_free ((char *) str);
 396   return rval;
 397 }
 398
 399 /* The parameters have the same meaning as those of re_search.
 400    Additional parameters:
 401    If RET_LEN is nonzero the length of the match is returned (re_match style);
 402    otherwise the position of the match is returned.  */
 403
 404 static int internal_function
 405 re_search_stub (struct re_pattern_buffer *bufp,
 406                 const char *string, int length, int start,
 407                 int range, int stop,
 408                 struct re_registers *regs, int ret_len)
 409 {
 410   reg_errcode_t result;
 411   regmatch_t *pmatch;
 412   int nregs, rval;
 413   int eflags = 0;
 414
 415   /* Check for out-of-range.  */
 416   if (BE (start < 0 || start > length, 0))
 417     return -1;
 418   if (BE (start + range > length, 0))
 419     range = length - start;
 420   else if (BE (start + range < 0, 0))
 421     range = -start;
 422
 423   __libc_lock_lock (dfa->lock);
 424
 425   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 426   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 427
 428   /* Compile fastmap if we haven't yet.  */
 429   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 430     re_compile_fastmap (bufp);
 431
 432   if (BE (bufp->no_sub, 0))
 433     regs = NULL;
 434
 435   /* We need at least 1 register.  */
 436   if (regs == NULL)
 437     nregs = 1;
 438   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 439                regs->num_regs < bufp->re_nsub + 1, 0))
 440     {
 441       nregs = regs->num_regs;
 442       if (BE (nregs < 1, 0))
 443         {
 444           /* Nothing can be copied to regs.  */
 445           regs = NULL;
 446           nregs = 1;
 447         }
 448     }
 449   else
 450     nregs = bufp->re_nsub + 1;
 451   pmatch = re_malloc (regmatch_t, nregs);
 452   if (BE (pmatch == NULL, 0))
 453     {
 454       rval = -2;
 455       goto out;
 456     }
 457
 458   result = re_search_internal (bufp, string, length, start, range, stop,
 459                                nregs, pmatch, eflags);
 460
 461   rval = 0;
 462
 463   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 464   if (result != REG_NOERROR)
 465     rval = -1;
 466   else if (regs != NULL)
 467     {
 468       /* If caller wants register contents data back, copy them.  */
 469       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 470                                            bufp->regs_allocated);
 471       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 472         rval = -2;
 473     }
 474
 475   if (BE (rval == 0, 1))
 476     {
 477       if (ret_len)
 478         {
 479           assert (pmatch[0].rm_so == start);
 480           rval = pmatch[0].rm_eo - start;
 481         }
 482       else
 483         rval = pmatch[0].rm_so;
 484     }
 485   re_free (pmatch);
 486  out:
 487   __libc_lock_unlock (dfa->lock);
 488   return rval;
 489 }
 490
 491 static unsigned internal_function
 492 re_copy_regs (struct re_registers *regs,
 493               regmatch_t *pmatch,
 494               int nregs, int regs_allocated)
 495 {
 496   int rval = REGS_REALLOCATE;
 497   int i;
 498   int need_regs = nregs + 1;
 499   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 500      uses.  */
 501
 502   /* Have the register data arrays been allocated?  */
 503   if (regs_allocated == REGS_UNALLOCATED)
 504     { /* No.  So allocate them with malloc.  */
 505       regs->start = re_malloc (regoff_t, need_regs);
 506       if (BE (regs->start == NULL, 0))
 507         return REGS_UNALLOCATED;
 508       regs->end = re_malloc (regoff_t, need_regs);
 509       if (BE (regs->end == NULL, 0))
 510         {
 511           re_free (regs->start);
 512           return REGS_UNALLOCATED;
 513         }
 514       regs->num_regs = need_regs;
 515     }
 516   else if (regs_allocated == REGS_REALLOCATE)
 517     { /* Yes.  If we need more elements than were already
 518          allocated, reallocate them.  If we need fewer, just
 519          leave it alone.  */
 520       if (BE (need_regs > regs->num_regs, 0))
 521         {
 522           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 523           regoff_t *new_end;
 524           if (BE (new_start == NULL, 0))
 525             return REGS_UNALLOCATED;
 526           new_end = re_realloc (regs->end, regoff_t, need_regs);
 527           if (BE (new_end == NULL, 0))
 528             {
 529               re_free (new_start);
 530               return REGS_UNALLOCATED;
 531             }
 532           regs->start = new_start;
 533           regs->end = new_end;
 534           regs->num_regs = need_regs;
 535         }
 536     }
 537   else
 538     {
 539       assert (regs_allocated == REGS_FIXED);
 540       /* This function may not be called with REGS_FIXED and nregs too big.  */
 541       assert (regs->num_regs >= nregs);
 542       rval = REGS_FIXED;
 543     }
 544
 545   /* Copy the regs.  */
 546   for (i = 0; i < nregs; ++i)
 547     {
 548       regs->start[i] = pmatch[i].rm_so;
 549       regs->end[i] = pmatch[i].rm_eo;
 550     }
 551   for ( ; i < regs->num_regs; ++i)
 552     regs->start[i] = regs->end[i] = -1;
 553
 554   return rval;
 555 }
 556
 557 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 558    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 559    this memory for recording register information.  STARTS and ENDS
 560    must be allocated using the malloc library routine, and must each
 561    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 562
 563    If NUM_REGS == 0, then subsequent matches should allocate their own
 564    register data.
 565
 566    Unless this function is called, the first search or match using
 567    PATTERN_BUFFER will allocate its own register data, without
 568    freeing the old data.  */
 569
 570 void
 571 re_set_registers (struct re_pattern_buffer *bufp,
 572                   struct re_registers *regs,
 573                   unsigned num_regs,
 574                   regoff_t *starts,
 575                   regoff_t *ends)
 576 {
 577   if (num_regs)
 578     {
 579       bufp->regs_allocated = REGS_REALLOCATE;
 580       regs->num_regs = num_regs;
 581       regs->start = starts;
 582       regs->end = ends;
 583     }
 584   else
 585     {
 586       bufp->regs_allocated = REGS_UNALLOCATED;
 587       regs->num_regs = 0;
 588       regs->start = regs->end = (regoff_t *) 0;
 589     }
 590 }
 591 #ifdef _LIBC
 592 weak_alias (__re_set_registers, re_set_registers)
 593 #endif
 594 \f
 595 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 596    them unless specifically requested.  */
 597
 598 #if defined _REGEX_RE_COMP || defined _LIBC
 599 int
 600 # ifdef _LIBC
 601 weak_function
 602 # endif
 603 re_exec (s)
 604      const char *s;
 605 {
 606   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 607 }
 608 #endif /* _REGEX_RE_COMP */
 609 \f
 610 /* Internal entry point.  */
 611
 612 /* Searches for a compiled pattern PREG in the string STRING, whose
 613    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 614    mingings with regexec.  START, and RANGE have the same meanings
 615    with re_search.
 616    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 617    otherwise return the error code.
 618    Note: We assume front end functions already check ranges.
 619    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 620
 621 static reg_errcode_t internal_function
 622 re_search_internal (const regex_t *preg,
 623                     const char *string,
 624                     int length, int start, int range, int stop,
 625                     size_t nmatch, regmatch_t pmatch[],
 626                     int eflags)
 627 {
 628   reg_errcode_t err;
 629   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
 630   int left_lim, right_lim, incr;
 631   int fl_longest_match, match_first, match_kind, match_last = -1;
 632   int extra_nmatch;
 633   int sb, ch;
 634 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 635   re_match_context_t mctx = { .dfa = dfa };
 636 #else
 637   re_match_context_t mctx;
 638 #endif
 639   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 640                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 641   RE_TRANSLATE_TYPE t = preg->translate;
 642
 643 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 644   memset (&mctx, '\0', sizeof (re_match_context_t));
 645   mctx.dfa = dfa;
 646 #endif
 647
 648   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 649   nmatch -= extra_nmatch;
 650
 651   /* Check if the DFA haven't been compiled.  */
 652   if (BE (preg->used == 0 || dfa->init_state == NULL
 653           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 654           || dfa->init_state_begbuf == NULL, 0))
 655     return REG_NOMATCH;
 656
 657 #ifdef DEBUG
 658   /* We assume front-end functions already check them.  */
 659   assert (start + range >= 0 && start + range <= length);
 660 #endif
 661
 662   /* If initial states with non-begbuf contexts have no elements,
 663      the regex must be anchored.  If preg->newline_anchor is set,
 664      we'll never use init_state_nl, so do not check it.  */
 665   if (dfa->init_state->nodes.nelem == 0
 666       && dfa->init_state_word->nodes.nelem == 0
 667       && (dfa->init_state_nl->nodes.nelem == 0
 668           || !preg->newline_anchor))
 669     {
 670       if (start != 0 && start + range != 0)
 671         return REG_NOMATCH;
 672       start = range = 0;
 673     }
 674
 675   /* We must check the longest matching, if nmatch > 0.  */
 676   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 677
 678   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 679                             preg->translate, preg->syntax & RE_ICASE, dfa);
 680   if (BE (err != REG_NOERROR, 0))
 681     goto free_return;
 682   mctx.input.stop = stop;
 683   mctx.input.raw_stop = stop;
 684   mctx.input.newline_anchor = preg->newline_anchor;
 685
 686   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 687   if (BE (err != REG_NOERROR, 0))
 688     goto free_return;
 689
 690   /* We will log all the DFA states through which the dfa pass,
 691      if nmatch > 1, or this dfa has "multibyte node", which is a
 692      back-reference or a node which can accept multibyte character or
 693      multi character collating element.  */
 694   if (nmatch > 1 || dfa->has_mb_node)
 695     {
 696       /* Avoid overflow.  */
 697       if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
 698         {
 699           err = REG_ESPACE;
 700           goto free_return;
 701         }
 702
 703       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 704       if (BE (mctx.state_log == NULL, 0))
 705         {
 706           err = REG_ESPACE;
 707           goto free_return;
 708         }
 709     }
 710   else
 711     mctx.state_log = NULL;
 712
 713   match_first = start;
 714   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 715                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 716
 717   /* Check incrementally whether of not the input string match.  */
 718   incr = (range < 0) ? -1 : 1;
 719   left_lim = (range < 0) ? start + range : start;
 720   right_lim = (range < 0) ? start : start + range;
 721   sb = dfa->mb_cur_max == 1;
 722   match_kind =
 723     (fastmap
 724      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 725         | (range >= 0 ? 2 : 0)
 726         | (t != NULL ? 1 : 0))
 727      : 8);
 728
 729   for (;; match_first += incr)
 730     {
 731       err = REG_NOMATCH;
 732       if (match_first < left_lim || right_lim < match_first)
 733         goto free_return;
 734
 735       /* Advance as rapidly as possible through the string, until we
 736          find a plausible place to start matching.  This may be done
 737          with varying efficiency, so there are various possibilities:
 738          only the most common of them are specialized, in order to
 739          save on code size.  We use a switch statement for speed.  */
 740       switch (match_kind)
 741         {
 742         case 8:
 743           /* No fastmap.  */
 744           break;
 745
 746         case 7:
 747           /* Fastmap with single-byte translation, match forward.  */
 748           while (BE (match_first < right_lim, 1)
 749                  && !fastmap[t[(unsigned char) string[match_first]]])
 750             ++match_first;
 751           goto forward_match_found_start_or_reached_end;
 752
 753         case 6:
 754           /* Fastmap without translation, match forward.  */
 755           while (BE (match_first < right_lim, 1)
 756                  && !fastmap[(unsigned char) string[match_first]])
 757             ++match_first;
 758
 759         forward_match_found_start_or_reached_end:
 760           if (BE (match_first == right_lim, 0))
 761             {
 762               ch = match_first >= length
 763                        ? 0 : (unsigned char) string[match_first];
 764               if (!fastmap[t ? t[ch] : ch])
 765                 goto free_return;
 766             }
 767           break;
 768
 769         case 4:
 770         case 5:
 771           /* Fastmap without multi-byte translation, match backwards.  */
 772           while (match_first >= left_lim)
 773             {
 774               ch = match_first >= length
 775                        ? 0 : (unsigned char) string[match_first];
 776               if (fastmap[t ? t[ch] : ch])
 777                 break;
 778               --match_first;
 779             }
 780           if (match_first < left_lim)
 781             goto free_return;
 782           break;
 783
 784         default:
 785           /* In this case, we can't determine easily the current byte,
 786              since it might be a component byte of a multibyte
 787              character.  Then we use the constructed buffer instead.  */
 788           for (;;)
 789             {
 790               /* If MATCH_FIRST is out of the valid range, reconstruct the
 791                  buffers.  */
 792               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 793               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 794                 {
 795                   err = re_string_reconstruct (&mctx.input, match_first,
 796                                                eflags);
 797                   if (BE (err != REG_NOERROR, 0))
 798                     goto free_return;
 799
 800                   offset = match_first - mctx.input.raw_mbs_idx;
 801                 }
 802               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 803                  Note that MATCH_FIRST must not be smaller than 0.  */
 804               ch = (match_first >= length
 805                     ? 0 : re_string_byte_at (&mctx.input, offset));
 806               if (fastmap[ch])
 807                 break;
 808               match_first += incr;
 809               if (match_first < left_lim || match_first > right_lim)
 810                 {
 811                   err = REG_NOMATCH;
 812                   goto free_return;
 813                 }
 814             }
 815           break;
 816         }
 817
 818       /* Reconstruct the buffers so that the matcher can assume that
 819          the matching starts from the beginning of the buffer.  */
 820       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 821       if (BE (err != REG_NOERROR, 0))
 822         goto free_return;
 823
 824 #ifdef RE_ENABLE_I18N
 825      /* Don't consider this char as a possible match start if it part,
 826         yet isn't the head, of a multibyte character.  */
 827       if (!sb && !re_string_first_byte (&mctx.input, 0))
 828         continue;
 829 #endif
 830
 831       /* It seems to be appropriate one, then use the matcher.  */
 832       /* We assume that the matching starts from 0.  */
 833       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 834       match_last = check_matching (&mctx, fl_longest_match,
 835                                    range >= 0 ? &match_first : NULL);
 836       if (match_last != -1)
 837         {
 838           if (BE (match_last == -2, 0))
 839             {
 840               err = REG_ESPACE;
 841               goto free_return;
 842             }
 843           else
 844             {
 845               mctx.match_last = match_last;
 846               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 847                 {
 848                   re_dfastate_t *pstate = mctx.state_log[match_last];
 849                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 850                                                              match_last);
 851                 }
 852               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 853                   || dfa->nbackref)
 854                 {
 855                   err = prune_impossible_nodes (&mctx);
 856                   if (err == REG_NOERROR)
 857                     break;
 858                   if (BE (err != REG_NOMATCH, 0))
 859                     goto free_return;
 860                   match_last = -1;
 861                 }
 862               else
 863                 break; /* We found a match.  */
 864             }
 865         }
 866
 867       match_ctx_clean (&mctx);
 868     }
 869
 870 #ifdef DEBUG
 871   assert (match_last != -1);
 872   assert (err == REG_NOERROR);
 873 #endif
 874
 875   /* Set pmatch[] if we need.  */
 876   if (nmatch > 0)
 877     {
 878       int reg_idx;
 879
 880       /* Initialize registers.  */
 881       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 882         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 883
 884       /* Set the points where matching start/end.  */
 885       pmatch[0].rm_so = 0;
 886       pmatch[0].rm_eo = mctx.match_last;
 887
 888       if (!preg->no_sub && nmatch > 1)
 889         {
 890           err = set_regs (preg, &mctx, nmatch, pmatch,
 891                           dfa->has_plural_match && dfa->nbackref > 0);
 892           if (BE (err != REG_NOERROR, 0))
 893             goto free_return;
 894         }
 895
 896       /* At last, add the offset to the each registers, since we slided
 897          the buffers so that we could assume that the matching starts
 898          from 0.  */
 899       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 900         if (pmatch[reg_idx].rm_so != -1)
 901           {
 902 #ifdef RE_ENABLE_I18N
 903             if (BE (mctx.input.offsets_needed != 0, 0))
 904               {
 905                 pmatch[reg_idx].rm_so =
 906                   (pmatch[reg_idx].rm_so == mctx.input.valid_len
 907                    ? mctx.input.valid_raw_len
 908                    : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 909                 pmatch[reg_idx].rm_eo =
 910                   (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 911                    ? mctx.input.valid_raw_len
 912                    : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 913               }
 914 #else
 915             assert (mctx.input.offsets_needed == 0);
 916 #endif
 917             pmatch[reg_idx].rm_so += match_first;
 918             pmatch[reg_idx].rm_eo += match_first;
 919           }
 920       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 921         {
 922           pmatch[nmatch + reg_idx].rm_so = -1;
 923           pmatch[nmatch + reg_idx].rm_eo = -1;
 924         }
 925
 926       if (dfa->subexp_map)
 927         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 928           if (dfa->subexp_map[reg_idx] != reg_idx)
 929             {
 930               pmatch[reg_idx + 1].rm_so
 931                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 932               pmatch[reg_idx + 1].rm_eo
 933                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 934             }
 935     }
 936
 937  free_return:
 938   re_free (mctx.state_log);
 939   if (dfa->nbackref)
 940     match_ctx_free (&mctx);
 941   re_string_destruct (&mctx.input);
 942   return err;
 943 }
 944
 945 static reg_errcode_t internal_function
 946 prune_impossible_nodes (re_match_context_t *mctx)
 947 {
 948   const re_dfa_t *const dfa = mctx->dfa;
 949   int halt_node, match_last;
 950   reg_errcode_t ret;
 951   re_dfastate_t **sifted_states;
 952   re_dfastate_t **lim_states = NULL;
 953   re_sift_context_t sctx;
 954 #ifdef DEBUG
 955   assert (mctx->state_log != NULL);
 956 #endif
 957   match_last = mctx->match_last;
 958   halt_node = mctx->last_node;
 959
 960   /* Avoid overflow.  */
 961   if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
 962     return REG_ESPACE;
 963
 964   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 965   if (BE (sifted_states == NULL, 0))
 966     {
 967       ret = REG_ESPACE;
 968       goto free_return;
 969     }
 970   if (dfa->nbackref)
 971     {
 972       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 973       if (BE (lim_states == NULL, 0))
 974         {
 975           ret = REG_ESPACE;
 976           goto free_return;
 977         }
 978       while (1)
 979         {
 980           memset (lim_states, '\0',
 981                   sizeof (re_dfastate_t *) * (match_last + 1));
 982           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 983                          match_last);
 984           ret = sift_states_backward (mctx, &sctx);
 985           re_node_set_free (&sctx.limits);
 986           if (BE (ret != REG_NOERROR, 0))
 987               goto free_return;
 988           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 989             break;
 990           do
 991             {
 992               --match_last;
 993               if (match_last < 0)
 994                 {
 995                   ret = REG_NOMATCH;
 996                   goto free_return;
 997                 }
 998             } while (mctx->state_log[match_last] == NULL
 999                      || !mctx->state_log[match_last]->halt);
1000           halt_node = check_halt_state_context (mctx,
1001                                                 mctx->state_log[match_last],
1002                                                 match_last);
1003         }
1004       ret = merge_state_array (dfa, sifted_states, lim_states,
1005                                match_last + 1);
1006       re_free (lim_states);
1007       lim_states = NULL;
1008       if (BE (ret != REG_NOERROR, 0))
1009         goto free_return;
1010     }
1011   else
1012     {
1013       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1014       ret = sift_states_backward (mctx, &sctx);
1015       re_node_set_free (&sctx.limits);
1016       if (BE (ret != REG_NOERROR, 0))
1017         goto free_return;
1018       if (sifted_states[0] == NULL)
1019         {
1020           ret = REG_NOMATCH;
1021           goto free_return;
1022         }
1023     }
1024   re_free (mctx->state_log);
1025   mctx->state_log = sifted_states;
1026   sifted_states = NULL;
1027   mctx->last_node = halt_node;
1028   mctx->match_last = match_last;
1029   ret = REG_NOERROR;
1030  free_return:
1031   re_free (sifted_states);
1032   re_free (lim_states);
1033   return ret;
1034 }
1035
1036 /* Acquire an initial state and return it.
1037    We must select appropriate initial state depending on the context,
1038    since initial states may have constraints like "\<", "^", etc..  */
1039
1040 static inline re_dfastate_t *
1041 __attribute ((always_inline)) internal_function
1042 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1043                             int idx)
1044 {
1045   const re_dfa_t *const dfa = mctx->dfa;
1046   if (dfa->init_state->has_constraint)
1047     {
1048       unsigned int context;
1049       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1050       if (IS_WORD_CONTEXT (context))
1051         return dfa->init_state_word;
1052       else if (IS_ORDINARY_CONTEXT (context))
1053         return dfa->init_state;
1054       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1055         return dfa->init_state_begbuf;
1056       else if (IS_NEWLINE_CONTEXT (context))
1057         return dfa->init_state_nl;
1058       else if (IS_BEGBUF_CONTEXT (context))
1059         {
1060           /* It is relatively rare case, then calculate on demand.  */
1061           return re_acquire_state_context (err, dfa,
1062                                            dfa->init_state->entrance_nodes,
1063                                            context);
1064         }
1065       else
1066         /* Must not happen?  */
1067         return dfa->init_state;
1068     }
1069   else
1070     return dfa->init_state;
1071 }
1072
1073 /* Check whether the regular expression match input string INPUT or not,
1074    and return the index where the matching end, return -1 if not match,
1075    or return -2 in case of an error.
1076    FL_LONGEST_MATCH means we want the POSIX longest matching.
1077    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1078    next place where we may want to try matching.
1079    Note that the matcher assume that the maching starts from the current
1080    index of the buffer.  */
1081
1082 static int
1083 internal_function
1084 check_matching (re_match_context_t *mctx, int fl_longest_match,
1085                 int *p_match_first)
1086 {
1087   const re_dfa_t *const dfa = mctx->dfa;
1088   reg_errcode_t err;
1089   int match = 0;
1090   int match_last = -1;
1091   int cur_str_idx = re_string_cur_idx (&mctx->input);
1092   re_dfastate_t *cur_state;
1093   int at_init_state = p_match_first != NULL;
1094   int next_start_idx = cur_str_idx;
1095
1096   err = REG_NOERROR;
1097   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1098   /* An initial state must not be NULL (invalid).  */
1099   if (BE (cur_state == NULL, 0))
1100     {
1101       assert (err == REG_ESPACE);
1102       return -2;
1103     }
1104
1105   if (mctx->state_log != NULL)
1106     {
1107       mctx->state_log[cur_str_idx] = cur_state;
1108
1109       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1110          later.  E.g. Processing back references.  */
1111       if (BE (dfa->nbackref, 0))
1112         {
1113           at_init_state = 0;
1114           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1115           if (BE (err != REG_NOERROR, 0))
1116             return err;
1117
1118           if (cur_state->has_backref)
1119             {
1120               err = transit_state_bkref (mctx, &cur_state->nodes);
1121               if (BE (err != REG_NOERROR, 0))
1122                 return err;
1123             }
1124         }
1125     }
1126
1127   /* If the RE accepts NULL string.  */
1128   if (BE (cur_state->halt, 0))
1129     {
1130       if (!cur_state->has_constraint
1131           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1132         {
1133           if (!fl_longest_match)
1134             return cur_str_idx;
1135           else
1136             {
1137               match_last = cur_str_idx;
1138               match = 1;
1139             }
1140         }
1141     }
1142
1143   while (!re_string_eoi (&mctx->input))
1144     {
1145       re_dfastate_t *old_state = cur_state;
1146       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1147
1148       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1149           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1150               && mctx->input.valid_len < mctx->input.len))
1151         {
1152           err = extend_buffers (mctx);
1153           if (BE (err != REG_NOERROR, 0))
1154             {
1155               assert (err == REG_ESPACE);
1156               return -2;
1157             }
1158         }
1159
1160       cur_state = transit_state (&err, mctx, cur_state);
1161       if (mctx->state_log != NULL)
1162         cur_state = merge_state_with_log (&err, mctx, cur_state);
1163
1164       if (cur_state == NULL)
1165         {
1166           /* Reached the invalid state or an error.  Try to recover a valid
1167              state using the state log, if available and if we have not
1168              already found a valid (even if not the longest) match.  */
1169           if (BE (err != REG_NOERROR, 0))
1170             return -2;
1171
1172           if (mctx->state_log == NULL
1173               || (match && !fl_longest_match)
1174               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1175             break;
1176         }
1177
1178       if (BE (at_init_state, 0))
1179         {
1180           if (old_state == cur_state)
1181             next_start_idx = next_char_idx;
1182           else
1183             at_init_state = 0;
1184         }
1185
1186       if (cur_state->halt)
1187         {
1188           /* Reached a halt state.
1189              Check the halt state can satisfy the current context.  */
1190           if (!cur_state->has_constraint
1191               || check_halt_state_context (mctx, cur_state,
1192                                            re_string_cur_idx (&mctx->input)))
1193             {
1194               /* We found an appropriate halt state.  */
1195               match_last = re_string_cur_idx (&mctx->input);
1196               match = 1;
1197
1198               /* We found a match, do not modify match_first below.  */
1199               p_match_first = NULL;
1200               if (!fl_longest_match)
1201                 break;
1202             }
1203         }
1204     }
1205
1206   if (p_match_first)
1207     *p_match_first += next_start_idx;
1208
1209   return match_last;
1210 }
1211
1212 /* Check NODE match the current context.  */
1213
1214 static int
1215 internal_function
1216 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1217 {
1218   re_token_type_t type = dfa->nodes[node].type;
1219   unsigned int constraint = dfa->nodes[node].constraint;
1220   if (type != END_OF_RE)
1221     return 0;
1222   if (!constraint)
1223     return 1;
1224   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1225     return 0;
1226   return 1;
1227 }
1228
1229 /* Check the halt state STATE match the current context.
1230    Return 0 if not match, if the node, STATE has, is a halt node and
1231    match the context, return the node.  */
1232
1233 static int
1234 internal_function
1235 check_halt_state_context (const re_match_context_t *mctx,
1236                           const re_dfastate_t *state, int idx)
1237 {
1238   int i;
1239   unsigned int context;
1240 #ifdef DEBUG
1241   assert (state->halt);
1242 #endif
1243   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1244   for (i = 0; i < state->nodes.nelem; ++i)
1245     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1246       return state->nodes.elems[i];
1247   return 0;
1248 }
1249
1250 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1251    corresponding to the DFA).
1252    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1253    of errors.  */
1254
1255 static int
1256 internal_function
1257 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1258                    int *pidx, int node, re_node_set *eps_via_nodes,
1259                    struct re_fail_stack_t *fs)
1260 {
1261   const re_dfa_t *const dfa = mctx->dfa;
1262   int i, err;
1263   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1264     {
1265       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1266       re_node_set *edests = &dfa->edests[node];
1267       int dest_node;
1268       err = re_node_set_insert (eps_via_nodes, node);
1269       if (BE (err < 0, 0))
1270         return -2;
1271       /* Pick up a valid destination, or return -1 if none is found.  */
1272       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1273         {
1274           int candidate = edests->elems[i];
1275           if (!re_node_set_contains (cur_nodes, candidate))
1276             continue;
1277           if (dest_node == -1)
1278             dest_node = candidate;
1279
1280           else
1281             {
1282               /* In order to avoid infinite loop like "(a*)*", return the second
1283                  epsilon-transition if the first was already considered.  */
1284               if (re_node_set_contains (eps_via_nodes, dest_node))
1285                 return candidate;
1286
1287               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1288               else if (fs != NULL
1289                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1290                                            eps_via_nodes))
1291                 return -2;
1292
1293               /* We know we are going to exit.  */
1294               break;
1295             }
1296         }
1297       return dest_node;
1298     }
1299   else
1300     {
1301       int naccepted = 0;
1302       re_token_type_t type = dfa->nodes[node].type;
1303
1304 #ifdef RE_ENABLE_I18N
1305       if (dfa->nodes[node].accept_mb)
1306         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1307       else
1308 #endif /* RE_ENABLE_I18N */
1309       if (type == OP_BACK_REF)
1310         {
1311           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1312           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1313           if (fs != NULL)
1314             {
1315               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1316                 return -1;
1317               else if (naccepted)
1318                 {
1319                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1320                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1321                               naccepted) != 0)
1322                     return -1;
1323                 }
1324             }
1325
1326           if (naccepted == 0)
1327             {
1328               int dest_node;
1329               err = re_node_set_insert (eps_via_nodes, node);
1330               if (BE (err < 0, 0))
1331                 return -2;
1332               dest_node = dfa->edests[node].elems[0];
1333               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1334                                         dest_node))
1335                 return dest_node;
1336             }
1337         }
1338
1339       if (naccepted != 0
1340           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1341         {
1342           int dest_node = dfa->nexts[node];
1343           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1344           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1345                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1346                                                dest_node)))
1347             return -1;
1348           re_node_set_empty (eps_via_nodes);
1349           return dest_node;
1350         }
1351     }
1352   return -1;
1353 }
1354
1355 static reg_errcode_t
1356 internal_function
1357 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1358                  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1359 {
1360   reg_errcode_t err;
1361   int num = fs->num++;
1362   if (fs->num == fs->alloc)
1363     {
1364       struct re_fail_stack_ent_t *new_array;
1365       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1366                                        * fs->alloc * 2));
1367       if (new_array == NULL)
1368         return REG_ESPACE;
1369       fs->alloc *= 2;
1370       fs->stack = new_array;
1371     }
1372   fs->stack[num].idx = str_idx;
1373   fs->stack[num].node = dest_node;
1374   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1375   if (fs->stack[num].regs == NULL)
1376     return REG_ESPACE;
1377   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1378   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1379   return err;
1380 }
1381
1382 static int
1383 internal_function
1384 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1385                 regmatch_t *regs, re_node_set *eps_via_nodes)
1386 {
1387   int num = --fs->num;
1388   assert (num >= 0);
1389   *pidx = fs->stack[num].idx;
1390   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1391   re_node_set_free (eps_via_nodes);
1392   re_free (fs->stack[num].regs);
1393   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1394   return fs->stack[num].node;
1395 }
1396
1397 /* Set the positions where the subexpressions are starts/ends to registers
1398    PMATCH.
1399    Note: We assume that pmatch[0] is already set, and
1400    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1401
1402 static reg_errcode_t
1403 internal_function
1404 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1405           regmatch_t *pmatch, int fl_backtrack)
1406 {
1407   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1408   int idx, cur_node;
1409   re_node_set eps_via_nodes;
1410   struct re_fail_stack_t *fs;
1411   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1412   regmatch_t *prev_idx_match;
1413   int prev_idx_match_malloced = 0;
1414
1415 #ifdef DEBUG
1416   assert (nmatch > 1);
1417   assert (mctx->state_log != NULL);
1418 #endif
1419   if (fl_backtrack)
1420     {
1421       fs = &fs_body;
1422       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1423       if (fs->stack == NULL)
1424         return REG_ESPACE;
1425     }
1426   else
1427     fs = NULL;
1428
1429   cur_node = dfa->init_node;
1430   re_node_set_init_empty (&eps_via_nodes);
1431
1432 #ifdef HAVE_ALLOCA
1433   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1434     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1435   else
1436 #endif
1437     {
1438       prev_idx_match = re_malloc (regmatch_t, nmatch);
1439       if (prev_idx_match == NULL)
1440         {
1441           free_fail_stack_return (fs);
1442           return REG_ESPACE;
1443         }
1444       prev_idx_match_malloced = 1;
1445     }
1446   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1447
1448   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1449     {
1450       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1451
1452       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1453         {
1454           int reg_idx;
1455           if (fs)
1456             {
1457               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1458                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1459                   break;
1460               if (reg_idx == nmatch)
1461                 {
1462                   re_node_set_free (&eps_via_nodes);
1463                   if (prev_idx_match_malloced)
1464                     re_free (prev_idx_match);
1465                   return free_fail_stack_return (fs);
1466                 }
1467               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1468                                          &eps_via_nodes);
1469             }
1470           else
1471             {
1472               re_node_set_free (&eps_via_nodes);
1473               if (prev_idx_match_malloced)
1474                 re_free (prev_idx_match);
1475               return REG_NOERROR;
1476             }
1477         }
1478
1479       /* Proceed to next node.  */
1480       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1481                                     &eps_via_nodes, fs);
1482
1483       if (BE (cur_node < 0, 0))
1484         {
1485           if (BE (cur_node == -2, 0))
1486             {
1487               re_node_set_free (&eps_via_nodes);
1488               if (prev_idx_match_malloced)
1489                 re_free (prev_idx_match);
1490               free_fail_stack_return (fs);
1491               return REG_ESPACE;
1492             }
1493           if (fs)
1494             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1495                                        &eps_via_nodes);
1496           else
1497             {
1498               re_node_set_free (&eps_via_nodes);
1499               if (prev_idx_match_malloced)
1500                 re_free (prev_idx_match);
1501               return REG_NOMATCH;
1502             }
1503         }
1504     }
1505   re_node_set_free (&eps_via_nodes);
1506   if (prev_idx_match_malloced)
1507     re_free (prev_idx_match);
1508   return free_fail_stack_return (fs);
1509 }
1510
1511 static reg_errcode_t
1512 internal_function
1513 free_fail_stack_return (struct re_fail_stack_t *fs)
1514 {
1515   if (fs)
1516     {
1517       int fs_idx;
1518       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1519         {
1520           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1521           re_free (fs->stack[fs_idx].regs);
1522         }
1523       re_free (fs->stack);
1524     }
1525   return REG_NOERROR;
1526 }
1527
1528 static void
1529 internal_function
1530 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1531              regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1532 {
1533   int type = dfa->nodes[cur_node].type;
1534   if (type == OP_OPEN_SUBEXP)
1535     {
1536       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1537
1538       /* We are at the first node of this sub expression.  */
1539       if (reg_num < nmatch)
1540         {
1541           pmatch[reg_num].rm_so = cur_idx;
1542           pmatch[reg_num].rm_eo = -1;
1543         }
1544     }
1545   else if (type == OP_CLOSE_SUBEXP)
1546     {
1547       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1548       if (reg_num < nmatch)
1549         {
1550           /* We are at the last node of this sub expression.  */
1551           if (pmatch[reg_num].rm_so < cur_idx)
1552             {
1553               pmatch[reg_num].rm_eo = cur_idx;
1554               /* This is a non-empty match or we are not inside an optional
1555                  subexpression.  Accept this right away.  */
1556               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1557             }
1558           else
1559             {
1560               if (dfa->nodes[cur_node].opt_subexp
1561                   && prev_idx_match[reg_num].rm_so != -1)
1562                 /* We transited through an empty match for an optional
1563                    subexpression, like (a?)*, and this is not the subexp's
1564                    first match.  Copy back the old content of the registers
1565                    so that matches of an inner subexpression are undone as
1566                    well, like in ((a?))*.  */
1567                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1568               else
1569                 /* We completed a subexpression, but it may be part of
1570                    an optional one, so do not update PREV_IDX_MATCH.  */
1571                 pmatch[reg_num].rm_eo = cur_idx;
1572             }
1573         }
1574     }
1575 }
1576
1577 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1578    and sift the nodes in each states according to the following rules.
1579    Updated state_log will be wrote to STATE_LOG.
1580
1581    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1582      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1583         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1584         the LAST_NODE, we throw away the node `a'.
1585      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1586         string `s' and transit to `b':
1587         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1588            away the node `a'.
1589         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1590             thrown away, we throw away the node `a'.
1591      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1592         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1593            node `a'.
1594         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1595             we throw away the node `a'.  */
1596
1597 #define STATE_NODE_CONTAINS(state,node) \
1598   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1599
1600 static reg_errcode_t
1601 internal_function
1602 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1603 {
1604   reg_errcode_t err;
1605   int null_cnt = 0;
1606   int str_idx = sctx->last_str_idx;
1607   re_node_set cur_dest;
1608
1609 #ifdef DEBUG
1610   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1611 #endif
1612
1613   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1614      transit to the last_node and the last_node itself.  */
1615   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1616   if (BE (err != REG_NOERROR, 0))
1617     return err;
1618   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1619   if (BE (err != REG_NOERROR, 0))
1620     goto free_return;
1621
1622   /* Then check each states in the state_log.  */
1623   while (str_idx > 0)
1624     {
1625       /* Update counters.  */
1626       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1627       if (null_cnt > mctx->max_mb_elem_len)
1628         {
1629           memset (sctx->sifted_states, '\0',
1630                   sizeof (re_dfastate_t *) * str_idx);
1631           re_node_set_free (&cur_dest);
1632           return REG_NOERROR;
1633         }
1634       re_node_set_empty (&cur_dest);
1635       --str_idx;
1636
1637       if (mctx->state_log[str_idx])
1638         {
1639           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1640           if (BE (err != REG_NOERROR, 0))
1641             goto free_return;
1642         }
1643
1644       /* Add all the nodes which satisfy the following conditions:
1645          - It can epsilon transit to a node in CUR_DEST.
1646          - It is in CUR_SRC.
1647          And update state_log.  */
1648       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1649       if (BE (err != REG_NOERROR, 0))
1650         goto free_return;
1651     }
1652   err = REG_NOERROR;
1653  free_return:
1654   re_node_set_free (&cur_dest);
1655   return err;
1656 }
1657
1658 static reg_errcode_t
1659 internal_function
1660 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1661                      int str_idx, re_node_set *cur_dest)
1662 {
1663   const re_dfa_t *const dfa = mctx->dfa;
1664   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1665   int i;
1666
1667   /* Then build the next sifted state.
1668      We build the next sifted state on `cur_dest', and update
1669      `sifted_states[str_idx]' with `cur_dest'.
1670      Note:
1671      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1672      `cur_src' points the node_set of the old `state_log[str_idx]'
1673      (with the epsilon nodes pre-filtered out).  */
1674   for (i = 0; i < cur_src->nelem; i++)
1675     {
1676       int prev_node = cur_src->elems[i];
1677       int naccepted = 0;
1678       int ret;
1679
1680 #ifdef DEBUG
1681       re_token_type_t type = dfa->nodes[prev_node].type;
1682       assert (!IS_EPSILON_NODE (type));
1683 #endif
1684 #ifdef RE_ENABLE_I18N
1685       /* If the node may accept `multi byte'.  */
1686       if (dfa->nodes[prev_node].accept_mb)
1687         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1688                                          str_idx, sctx->last_str_idx);
1689 #endif /* RE_ENABLE_I18N */
1690
1691       /* We don't check backreferences here.
1692          See update_cur_sifted_state().  */
1693       if (!naccepted
1694           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1695           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1696                                   dfa->nexts[prev_node]))
1697         naccepted = 1;
1698
1699       if (naccepted == 0)
1700         continue;
1701
1702       if (sctx->limits.nelem)
1703         {
1704           int to_idx = str_idx + naccepted;
1705           if (check_dst_limits (mctx, &sctx->limits,
1706                                 dfa->nexts[prev_node], to_idx,
1707                                 prev_node, str_idx))
1708             continue;
1709         }
1710       ret = re_node_set_insert (cur_dest, prev_node);
1711       if (BE (ret == -1, 0))
1712         return REG_ESPACE;
1713     }
1714
1715   return REG_NOERROR;
1716 }
1717
1718 /* Helper functions.  */
1719
1720 static reg_errcode_t
1721 internal_function
1722 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1723 {
1724   int top = mctx->state_log_top;
1725
1726   if (next_state_log_idx >= mctx->input.bufs_len
1727       || (next_state_log_idx >= mctx->input.valid_len
1728           && mctx->input.valid_len < mctx->input.len))
1729     {
1730       reg_errcode_t err;
1731       err = extend_buffers (mctx);
1732       if (BE (err != REG_NOERROR, 0))
1733         return err;
1734     }
1735
1736   if (top < next_state_log_idx)
1737     {
1738       memset (mctx->state_log + top + 1, '\0',
1739               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1740       mctx->state_log_top = next_state_log_idx;
1741     }
1742   return REG_NOERROR;
1743 }
1744
1745 static reg_errcode_t
1746 internal_function
1747 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1748                    re_dfastate_t **src, int num)
1749 {
1750   int st_idx;
1751   reg_errcode_t err;
1752   for (st_idx = 0; st_idx < num; ++st_idx)
1753     {
1754       if (dst[st_idx] == NULL)
1755         dst[st_idx] = src[st_idx];
1756       else if (src[st_idx] != NULL)
1757         {
1758           re_node_set merged_set;
1759           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1760                                         &src[st_idx]->nodes);
1761           if (BE (err != REG_NOERROR, 0))
1762             return err;
1763           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1764           re_node_set_free (&merged_set);
1765           if (BE (err != REG_NOERROR, 0))
1766             return err;
1767         }
1768     }
1769   return REG_NOERROR;
1770 }
1771
1772 static reg_errcode_t
1773 internal_function
1774 update_cur_sifted_state (const re_match_context_t *mctx,
1775                          re_sift_context_t *sctx, int str_idx,
1776                          re_node_set *dest_nodes)
1777 {
1778   const re_dfa_t *const dfa = mctx->dfa;
1779   reg_errcode_t err = REG_NOERROR;
1780   const re_node_set *candidates;
1781   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1782                 : &mctx->state_log[str_idx]->nodes);
1783
1784   if (dest_nodes->nelem == 0)
1785     sctx->sifted_states[str_idx] = NULL;
1786   else
1787     {
1788       if (candidates)
1789         {
1790           /* At first, add the nodes which can epsilon transit to a node in
1791              DEST_NODE.  */
1792           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1793           if (BE (err != REG_NOERROR, 0))
1794             return err;
1795
1796           /* Then, check the limitations in the current sift_context.  */
1797           if (sctx->limits.nelem)
1798             {
1799               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1800                                          mctx->bkref_ents, str_idx);
1801               if (BE (err != REG_NOERROR, 0))
1802                 return err;
1803             }
1804         }
1805
1806       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1807       if (BE (err != REG_NOERROR, 0))
1808         return err;
1809     }
1810
1811   if (candidates && mctx->state_log[str_idx]->has_backref)
1812     {
1813       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1814       if (BE (err != REG_NOERROR, 0))
1815         return err;
1816     }
1817   return REG_NOERROR;
1818 }
1819
1820 static reg_errcode_t
1821 internal_function
1822 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1823                        const re_node_set *candidates)
1824 {
1825   reg_errcode_t err = REG_NOERROR;
1826   int i;
1827
1828   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1829   if (BE (err != REG_NOERROR, 0))
1830     return err;
1831
1832   if (!state->inveclosure.alloc)
1833     {
1834       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1835       if (BE (err != REG_NOERROR, 0))
1836         return REG_ESPACE;
1837       for (i = 0; i < dest_nodes->nelem; i++)
1838         {
1839           err = re_node_set_merge (&state->inveclosure,
1840                                    dfa->inveclosures + dest_nodes->elems[i]);
1841           if (BE (err != REG_NOERROR, 0))
1842             return REG_ESPACE;
1843         }
1844     }
1845   return re_node_set_add_intersect (dest_nodes, candidates,
1846                                     &state->inveclosure);
1847 }
1848
1849 static reg_errcode_t
1850 internal_function
1851 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1852                        const re_node_set *candidates)
1853 {
1854     int ecl_idx;
1855     reg_errcode_t err;
1856     re_node_set *inv_eclosure = dfa->inveclosures + node;
1857     re_node_set except_nodes;
1858     re_node_set_init_empty (&except_nodes);
1859     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1860       {
1861         int cur_node = inv_eclosure->elems[ecl_idx];
1862         if (cur_node == node)
1863           continue;
1864         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1865           {
1866             int edst1 = dfa->edests[cur_node].elems[0];
1867             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1868                          ? dfa->edests[cur_node].elems[1] : -1);
1869             if ((!re_node_set_contains (inv_eclosure, edst1)
1870                  && re_node_set_contains (dest_nodes, edst1))
1871                 || (edst2 > 0
1872                     && !re_node_set_contains (inv_eclosure, edst2)
1873                     && re_node_set_contains (dest_nodes, edst2)))
1874               {
1875                 err = re_node_set_add_intersect (&except_nodes, candidates,
1876                                                  dfa->inveclosures + cur_node);
1877                 if (BE (err != REG_NOERROR, 0))
1878                   {
1879                     re_node_set_free (&except_nodes);
1880                     return err;
1881                   }
1882               }
1883           }
1884       }
1885     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1886       {
1887         int cur_node = inv_eclosure->elems[ecl_idx];
1888         if (!re_node_set_contains (&except_nodes, cur_node))
1889           {
1890             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1891             re_node_set_remove_at (dest_nodes, idx);
1892           }
1893       }
1894     re_node_set_free (&except_nodes);
1895     return REG_NOERROR;
1896 }
1897
1898 static int
1899 internal_function
1900 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1901                   int dst_node, int dst_idx, int src_node, int src_idx)
1902 {
1903   const re_dfa_t *const dfa = mctx->dfa;
1904   int lim_idx, src_pos, dst_pos;
1905
1906   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1907   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1908   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1909     {
1910       int subexp_idx;
1911       struct re_backref_cache_entry *ent;
1912       ent = mctx->bkref_ents + limits->elems[lim_idx];
1913       subexp_idx = dfa->nodes[ent->node].opr.idx;
1914
1915       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1916                                            subexp_idx, dst_node, dst_idx,
1917                                            dst_bkref_idx);
1918       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1919                                            subexp_idx, src_node, src_idx,
1920                                            src_bkref_idx);
1921
1922       /* In case of:
1923          <src> <dst> ( <subexp> )
1924          ( <subexp> ) <src> <dst>
1925          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1926       if (src_pos == dst_pos)
1927         continue; /* This is unrelated limitation.  */
1928       else
1929         return 1;
1930     }
1931   return 0;
1932 }
1933
1934 static int
1935 internal_function
1936 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1937                              int subexp_idx, int from_node, int bkref_idx)
1938 {
1939   const re_dfa_t *const dfa = mctx->dfa;
1940   const re_node_set *eclosures = dfa->eclosures + from_node;
1941   int node_idx;
1942
1943   /* Else, we are on the boundary: examine the nodes on the epsilon
1944      closure.  */
1945   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1946     {
1947       int node = eclosures->elems[node_idx];
1948       switch (dfa->nodes[node].type)
1949         {
1950         case OP_BACK_REF:
1951           if (bkref_idx != -1)
1952             {
1953               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1954               do
1955                 {
1956                   int dst, cpos;
1957
1958                   if (ent->node != node)
1959                     continue;
1960
1961                   if (subexp_idx < BITSET_WORD_BITS
1962                       && !(ent->eps_reachable_subexps_map
1963                            & ((bitset_word_t) 1 << subexp_idx)))
1964                     continue;
1965
1966                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1967                      OP_CLOSE_SUBEXP cases below.  But, if the
1968                      destination node is the same node as the source
1969                      node, don't recurse because it would cause an
1970                      infinite loop: a regex that exhibits this behavior
1971                      is ()\1*\1*  */
1972                   dst = dfa->edests[node].elems[0];
1973                   if (dst == from_node)
1974                     {
1975                       if (boundaries & 1)
1976                         return -1;
1977                       else /* if (boundaries & 2) */
1978                         return 0;
1979                     }
1980
1981                   cpos =
1982                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1983                                                  dst, bkref_idx);
1984                   if (cpos == -1 /* && (boundaries & 1) */)
1985                     return -1;
1986                   if (cpos == 0 && (boundaries & 2))
1987                     return 0;
1988
1989                   if (subexp_idx < BITSET_WORD_BITS)
1990                     ent->eps_reachable_subexps_map
1991                       &= ~((bitset_word_t) 1 << subexp_idx);
1992                 }
1993               while (ent++->more);
1994             }
1995           break;
1996
1997         case OP_OPEN_SUBEXP:
1998           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1999             return -1;
2000           break;
2001
2002         case OP_CLOSE_SUBEXP:
2003           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2004             return 0;
2005           break;
2006
2007         default:
2008             break;
2009         }
2010     }
2011
2012   return (boundaries & 2) ? 1 : 0;
2013 }
2014
2015 static int
2016 internal_function
2017 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2018                            int subexp_idx, int from_node, int str_idx,
2019                            int bkref_idx)
2020 {
2021   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2022   int boundaries;
2023
2024   /* If we are outside the range of the subexpression, return -1 or 1.  */
2025   if (str_idx < lim->subexp_from)
2026     return -1;
2027
2028   if (lim->subexp_to < str_idx)
2029     return 1;
2030
2031   /* If we are within the subexpression, return 0.  */
2032   boundaries = (str_idx == lim->subexp_from);
2033   boundaries |= (str_idx == lim->subexp_to) << 1;
2034   if (boundaries == 0)
2035     return 0;
2036
2037   /* Else, examine epsilon closure.  */
2038   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2039                                       from_node, bkref_idx);
2040 }
2041
2042 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2043    which are against limitations from DEST_NODES. */
2044
2045 static reg_errcode_t
2046 internal_function
2047 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2048                      const re_node_set *candidates, re_node_set *limits,
2049                      struct re_backref_cache_entry *bkref_ents, int str_idx)
2050 {
2051   reg_errcode_t err;
2052   int node_idx, lim_idx;
2053
2054   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2055     {
2056       int subexp_idx;
2057       struct re_backref_cache_entry *ent;
2058       ent = bkref_ents + limits->elems[lim_idx];
2059
2060       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2061         continue; /* This is unrelated limitation.  */
2062
2063       subexp_idx = dfa->nodes[ent->node].opr.idx;
2064       if (ent->subexp_to == str_idx)
2065         {
2066           int ops_node = -1;
2067           int cls_node = -1;
2068           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2069             {
2070               int node = dest_nodes->elems[node_idx];
2071               re_token_type_t type = dfa->nodes[node].type;
2072               if (type == OP_OPEN_SUBEXP
2073                   && subexp_idx == dfa->nodes[node].opr.idx)
2074                 ops_node = node;
2075               else if (type == OP_CLOSE_SUBEXP
2076                        && subexp_idx == dfa->nodes[node].opr.idx)
2077                 cls_node = node;
2078             }
2079
2080           /* Check the limitation of the open subexpression.  */
2081           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2082           if (ops_node >= 0)
2083             {
2084               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2085                                            candidates);
2086               if (BE (err != REG_NOERROR, 0))
2087                 return err;
2088             }
2089
2090           /* Check the limitation of the close subexpression.  */
2091           if (cls_node >= 0)
2092             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2093               {
2094                 int node = dest_nodes->elems[node_idx];
2095                 if (!re_node_set_contains (dfa->inveclosures + node,
2096                                            cls_node)
2097                     && !re_node_set_contains (dfa->eclosures + node,
2098                                               cls_node))
2099                   {
2100                     /* It is against this limitation.
2101                        Remove it form the current sifted state.  */
2102                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2103                                                  candidates);
2104                     if (BE (err != REG_NOERROR, 0))
2105                       return err;
2106                     --node_idx;
2107                   }
2108               }
2109         }
2110       else /* (ent->subexp_to != str_idx)  */
2111         {
2112           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2113             {
2114               int node = dest_nodes->elems[node_idx];
2115               re_token_type_t type = dfa->nodes[node].type;
2116               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2117                 {
2118                   if (subexp_idx != dfa->nodes[node].opr.idx)
2119                     continue;
2120                   /* It is against this limitation.
2121                      Remove it form the current sifted state.  */
2122                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2123                                                candidates);
2124                   if (BE (err != REG_NOERROR, 0))
2125                     return err;
2126                 }
2127             }
2128         }
2129     }
2130   return REG_NOERROR;
2131 }
2132
2133 static reg_errcode_t
2134 internal_function
2135 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2136                    int str_idx, const re_node_set *candidates)
2137 {
2138   const re_dfa_t *const dfa = mctx->dfa;
2139   reg_errcode_t err;
2140   int node_idx, node;
2141   re_sift_context_t local_sctx;
2142   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2143
2144   if (first_idx == -1)
2145     return REG_NOERROR;
2146
2147   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2148
2149   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2150     {
2151       int enabled_idx;
2152       re_token_type_t type;
2153       struct re_backref_cache_entry *entry;
2154       node = candidates->elems[node_idx];
2155       type = dfa->nodes[node].type;
2156       /* Avoid infinite loop for the REs like "()\1+".  */
2157       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2158         continue;
2159       if (type != OP_BACK_REF)
2160         continue;
2161
2162       entry = mctx->bkref_ents + first_idx;
2163       enabled_idx = first_idx;
2164       do
2165         {
2166           int subexp_len;
2167           int to_idx;
2168           int dst_node;
2169           int ret;
2170           re_dfastate_t *cur_state;
2171
2172           if (entry->node != node)
2173             continue;
2174           subexp_len = entry->subexp_to - entry->subexp_from;
2175           to_idx = str_idx + subexp_len;
2176           dst_node = (subexp_len ? dfa->nexts[node]
2177                       : dfa->edests[node].elems[0]);
2178
2179           if (to_idx > sctx->last_str_idx
2180               || sctx->sifted_states[to_idx] == NULL
2181               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2182               || check_dst_limits (mctx, &sctx->limits, node,
2183                                    str_idx, dst_node, to_idx))
2184             continue;
2185
2186           if (local_sctx.sifted_states == NULL)
2187             {
2188               local_sctx = *sctx;
2189               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2190               if (BE (err != REG_NOERROR, 0))
2191                 goto free_return;
2192             }
2193           local_sctx.last_node = node;
2194           local_sctx.last_str_idx = str_idx;
2195           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2196           if (BE (ret < 0, 0))
2197             {
2198               err = REG_ESPACE;
2199               goto free_return;
2200             }
2201           cur_state = local_sctx.sifted_states[str_idx];
2202           err = sift_states_backward (mctx, &local_sctx);
2203           if (BE (err != REG_NOERROR, 0))
2204             goto free_return;
2205           if (sctx->limited_states != NULL)
2206             {
2207               err = merge_state_array (dfa, sctx->limited_states,
2208                                        local_sctx.sifted_states,
2209                                        str_idx + 1);
2210               if (BE (err != REG_NOERROR, 0))
2211                 goto free_return;
2212             }
2213           local_sctx.sifted_states[str_idx] = cur_state;
2214           re_node_set_remove (&local_sctx.limits, enabled_idx);
2215
2216           /* mctx->bkref_ents may have changed, reload the pointer.  */
2217           entry = mctx->bkref_ents + enabled_idx;
2218         }
2219       while (enabled_idx++, entry++->more);
2220     }
2221   err = REG_NOERROR;
2222  free_return:
2223   if (local_sctx.sifted_states != NULL)
2224     {
2225       re_node_set_free (&local_sctx.limits);
2226     }
2227
2228   return err;
2229 }
2230
2231
2232 #ifdef RE_ENABLE_I18N
2233 static int
2234 internal_function
2235 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2236                      int node_idx, int str_idx, int max_str_idx)
2237 {
2238   const re_dfa_t *const dfa = mctx->dfa;
2239   int naccepted;
2240   /* Check the node can accept `multi byte'.  */
2241   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2242   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2243       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2244                             dfa->nexts[node_idx]))
2245     /* The node can't accept the `multi byte', or the
2246        destination was already thrown away, then the node
2247        could't accept the current input `multi byte'.   */
2248     naccepted = 0;
2249   /* Otherwise, it is sure that the node could accept
2250      `naccepted' bytes input.  */
2251   return naccepted;
2252 }
2253 #endif /* RE_ENABLE_I18N */
2254
2255 \f
2256 /* Functions for state transition.  */
2257
2258 /* Return the next state to which the current state STATE will transit by
2259    accepting the current input byte, and update STATE_LOG if necessary.
2260    If STATE can accept a multibyte char/collating element/back reference
2261    update the destination of STATE_LOG.  */
2262
2263 static re_dfastate_t *
2264 internal_function
2265 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2266                re_dfastate_t *state)
2267 {
2268   re_dfastate_t **trtable;
2269   unsigned char ch;
2270
2271 #ifdef RE_ENABLE_I18N
2272   /* If the current state can accept multibyte.  */
2273   if (BE (state->accept_mb, 0))
2274     {
2275       *err = transit_state_mb (mctx, state);
2276       if (BE (*err != REG_NOERROR, 0))
2277         return NULL;
2278     }
2279 #endif /* RE_ENABLE_I18N */
2280
2281   /* Then decide the next state with the single byte.  */
2282 #if 0
2283   if (0)
2284     /* don't use transition table  */
2285     return transit_state_sb (err, mctx, state);
2286 #endif
2287
2288   /* Use transition table  */
2289   ch = re_string_fetch_byte (&mctx->input);
2290   for (;;)
2291     {
2292       trtable = state->trtable;
2293       if (BE (trtable != NULL, 1))
2294         return trtable[ch];
2295
2296       trtable = state->word_trtable;
2297       if (BE (trtable != NULL, 1))
2298         {
2299           unsigned int context;
2300           context
2301             = re_string_context_at (&mctx->input,
2302                                     re_string_cur_idx (&mctx->input) - 1,
2303                                     mctx->eflags);
2304           if (IS_WORD_CONTEXT (context))
2305             return trtable[ch + SBC_MAX];
2306           else
2307             return trtable[ch];
2308         }
2309
2310       if (!build_trtable (mctx->dfa, state))
2311         {
2312           *err = REG_ESPACE;
2313           return NULL;
2314         }
2315
2316       /* Retry, we now have a transition table.  */
2317     }
2318 }
2319
2320 /* Update the state_log if we need */
2321 re_dfastate_t *
2322 internal_function
2323 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2324                       re_dfastate_t *next_state)
2325 {
2326   const re_dfa_t *const dfa = mctx->dfa;
2327   int cur_idx = re_string_cur_idx (&mctx->input);
2328
2329   if (cur_idx > mctx->state_log_top)
2330     {
2331       mctx->state_log[cur_idx] = next_state;
2332       mctx->state_log_top = cur_idx;
2333     }
2334   else if (mctx->state_log[cur_idx] == 0)
2335     {
2336       mctx->state_log[cur_idx] = next_state;
2337     }
2338   else
2339     {
2340       re_dfastate_t *pstate;
2341       unsigned int context;
2342       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2343       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2344          the destination of a multibyte char/collating element/
2345          back reference.  Then the next state is the union set of
2346          these destinations and the results of the transition table.  */
2347       pstate = mctx->state_log[cur_idx];
2348       log_nodes = pstate->entrance_nodes;
2349       if (next_state != NULL)
2350         {
2351           table_nodes = next_state->entrance_nodes;
2352           *err = re_node_set_init_union (&next_nodes, table_nodes,
2353                                              log_nodes);
2354           if (BE (*err != REG_NOERROR, 0))
2355             return NULL;
2356         }
2357       else
2358         next_nodes = *log_nodes;
2359       /* Note: We already add the nodes of the initial state,
2360          then we don't need to add them here.  */
2361
2362       context = re_string_context_at (&mctx->input,
2363                                       re_string_cur_idx (&mctx->input) - 1,
2364                                       mctx->eflags);
2365       next_state = mctx->state_log[cur_idx]
2366         = re_acquire_state_context (err, dfa, &next_nodes, context);
2367       /* We don't need to check errors here, since the return value of
2368          this function is next_state and ERR is already set.  */
2369
2370       if (table_nodes != NULL)
2371         re_node_set_free (&next_nodes);
2372     }
2373
2374   if (BE (dfa->nbackref, 0) && next_state != NULL)
2375     {
2376       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2377          later.  We must check them here, since the back references in the
2378          next state might use them.  */
2379       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2380                                         cur_idx);
2381       if (BE (*err != REG_NOERROR, 0))
2382         return NULL;
2383
2384       /* If the next state has back references.  */
2385       if (next_state->has_backref)
2386         {
2387           *err = transit_state_bkref (mctx, &next_state->nodes);
2388           if (BE (*err != REG_NOERROR, 0))
2389             return NULL;
2390           next_state = mctx->state_log[cur_idx];
2391         }
2392     }
2393
2394   return next_state;
2395 }
2396
2397 /* Skip bytes in the input that correspond to part of a
2398    multi-byte match, then look in the log for a state
2399    from which to restart matching.  */
2400 re_dfastate_t *
2401 internal_function
2402 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2403 {
2404   re_dfastate_t *cur_state;
2405   do
2406     {
2407       int max = mctx->state_log_top;
2408       int cur_str_idx = re_string_cur_idx (&mctx->input);
2409
2410       do
2411         {
2412           if (++cur_str_idx > max)
2413             return NULL;
2414           re_string_skip_bytes (&mctx->input, 1);
2415         }
2416       while (mctx->state_log[cur_str_idx] == NULL);
2417
2418       cur_state = merge_state_with_log (err, mctx, NULL);
2419     }
2420   while (*err == REG_NOERROR && cur_state == NULL);
2421   return cur_state;
2422 }
2423
2424 /* Helper functions for transit_state.  */
2425
2426 /* From the node set CUR_NODES, pick up the nodes whose types are
2427    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2428    expression. And register them to use them later for evaluating the
2429    correspoding back references.  */
2430
2431 static reg_errcode_t
2432 internal_function
2433 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2434                            int str_idx)
2435 {
2436   const re_dfa_t *const dfa = mctx->dfa;
2437   int node_idx;
2438   reg_errcode_t err;
2439
2440   /* TODO: This isn't efficient.
2441            Because there might be more than one nodes whose types are
2442            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2443            nodes.
2444            E.g. RE: (a){2}  */
2445   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2446     {
2447       int node = cur_nodes->elems[node_idx];
2448       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2449           && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2450           && (dfa->used_bkref_map
2451               & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2452         {
2453           err = match_ctx_add_subtop (mctx, node, str_idx);
2454           if (BE (err != REG_NOERROR, 0))
2455             return err;
2456         }
2457     }
2458   return REG_NOERROR;
2459 }
2460
2461 #if 0
2462 /* Return the next state to which the current state STATE will transit by
2463    accepting the current input byte.  */
2464
2465 static re_dfastate_t *
2466 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2467                   re_dfastate_t *state)
2468 {
2469   const re_dfa_t *const dfa = mctx->dfa;
2470   re_node_set next_nodes;
2471   re_dfastate_t *next_state;
2472   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2473   unsigned int context;
2474
2475   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2476   if (BE (*err != REG_NOERROR, 0))
2477     return NULL;
2478   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2479     {
2480       int cur_node = state->nodes.elems[node_cnt];
2481       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2482         {
2483           *err = re_node_set_merge (&next_nodes,
2484                                     dfa->eclosures + dfa->nexts[cur_node]);
2485           if (BE (*err != REG_NOERROR, 0))
2486             {
2487               re_node_set_free (&next_nodes);
2488               return NULL;
2489             }
2490         }
2491     }
2492   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2493   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2494   /* We don't need to check errors here, since the return value of
2495      this function is next_state and ERR is already set.  */
2496
2497   re_node_set_free (&next_nodes);
2498   re_string_skip_bytes (&mctx->input, 1);
2499   return next_state;
2500 }
2501 #endif
2502
2503 #ifdef RE_ENABLE_I18N
2504 static reg_errcode_t
2505 internal_function
2506 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2507 {
2508   const re_dfa_t *const dfa = mctx->dfa;
2509   reg_errcode_t err;
2510   int i;
2511
2512   for (i = 0; i < pstate->nodes.nelem; ++i)
2513     {
2514       re_node_set dest_nodes, *new_nodes;
2515       int cur_node_idx = pstate->nodes.elems[i];
2516       int naccepted, dest_idx;
2517       unsigned int context;
2518       re_dfastate_t *dest_state;
2519
2520       if (!dfa->nodes[cur_node_idx].accept_mb)
2521         continue;
2522
2523       if (dfa->nodes[cur_node_idx].constraint)
2524         {
2525           context = re_string_context_at (&mctx->input,
2526                                           re_string_cur_idx (&mctx->input),
2527                                           mctx->eflags);
2528           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2529                                            context))
2530             continue;
2531         }
2532
2533       /* How many bytes the node can accept?  */
2534       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2535                                            re_string_cur_idx (&mctx->input));
2536       if (naccepted == 0)
2537         continue;
2538
2539       /* The node can accepts `naccepted' bytes.  */
2540       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2541       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2542                                : mctx->max_mb_elem_len);
2543       err = clean_state_log_if_needed (mctx, dest_idx);
2544       if (BE (err != REG_NOERROR, 0))
2545         return err;
2546 #ifdef DEBUG
2547       assert (dfa->nexts[cur_node_idx] != -1);
2548 #endif
2549       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2550
2551       dest_state = mctx->state_log[dest_idx];
2552       if (dest_state == NULL)
2553         dest_nodes = *new_nodes;
2554       else
2555         {
2556           err = re_node_set_init_union (&dest_nodes,
2557                                         dest_state->entrance_nodes, new_nodes);
2558           if (BE (err != REG_NOERROR, 0))
2559             return err;
2560         }
2561       context = re_string_context_at (&mctx->input, dest_idx - 1,
2562                                       mctx->eflags);
2563       mctx->state_log[dest_idx]
2564         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2565       if (dest_state != NULL)
2566         re_node_set_free (&dest_nodes);
2567       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2568         return err;
2569     }
2570   return REG_NOERROR;
2571 }
2572 #endif /* RE_ENABLE_I18N */
2573
2574 static reg_errcode_t
2575 internal_function
2576 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2577 {
2578   const re_dfa_t *const dfa = mctx->dfa;
2579   reg_errcode_t err;
2580   int i;
2581   int cur_str_idx = re_string_cur_idx (&mctx->input);
2582
2583   for (i = 0; i < nodes->nelem; ++i)
2584     {
2585       int dest_str_idx, prev_nelem, bkc_idx;
2586       int node_idx = nodes->elems[i];
2587       unsigned int context;
2588       const re_token_t *node = dfa->nodes + node_idx;
2589       re_node_set *new_dest_nodes;
2590
2591       /* Check whether `node' is a backreference or not.  */
2592       if (node->type != OP_BACK_REF)
2593         continue;
2594
2595       if (node->constraint)
2596         {
2597           context = re_string_context_at (&mctx->input, cur_str_idx,
2598                                           mctx->eflags);
2599           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2600             continue;
2601         }
2602
2603       /* `node' is a backreference.
2604          Check the substring which the substring matched.  */
2605       bkc_idx = mctx->nbkref_ents;
2606       err = get_subexp (mctx, node_idx, cur_str_idx);
2607       if (BE (err != REG_NOERROR, 0))
2608         goto free_return;
2609
2610       /* And add the epsilon closures (which is `new_dest_nodes') of
2611          the backreference to appropriate state_log.  */
2612 #ifdef DEBUG
2613       assert (dfa->nexts[node_idx] != -1);
2614 #endif
2615       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2616         {
2617           int subexp_len;
2618           re_dfastate_t *dest_state;
2619           struct re_backref_cache_entry *bkref_ent;
2620           bkref_ent = mctx->bkref_ents + bkc_idx;
2621           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2622             continue;
2623           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2624           new_dest_nodes = (subexp_len == 0
2625                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2626                             : dfa->eclosures + dfa->nexts[node_idx]);
2627           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2628                           - bkref_ent->subexp_from);
2629           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2630                                           mctx->eflags);
2631           dest_state = mctx->state_log[dest_str_idx];
2632           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2633                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2634           /* Add `new_dest_node' to state_log.  */
2635           if (dest_state == NULL)
2636             {
2637               mctx->state_log[dest_str_idx]
2638                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2639                                             context);
2640               if (BE (mctx->state_log[dest_str_idx] == NULL
2641                       && err != REG_NOERROR, 0))
2642                 goto free_return;
2643             }
2644           else
2645             {
2646               re_node_set dest_nodes;
2647               err = re_node_set_init_union (&dest_nodes,
2648                                             dest_state->entrance_nodes,
2649                                             new_dest_nodes);
2650               if (BE (err != REG_NOERROR, 0))
2651                 {
2652                   re_node_set_free (&dest_nodes);
2653                   goto free_return;
2654                 }
2655               mctx->state_log[dest_str_idx]
2656                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2657               re_node_set_free (&dest_nodes);
2658               if (BE (mctx->state_log[dest_str_idx] == NULL
2659                       && err != REG_NOERROR, 0))
2660                 goto free_return;
2661             }
2662           /* We need to check recursively if the backreference can epsilon
2663              transit.  */
2664           if (subexp_len == 0
2665               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2666             {
2667               err = check_subexp_matching_top (mctx, new_dest_nodes,
2668                                                cur_str_idx);
2669               if (BE (err != REG_NOERROR, 0))
2670                 goto free_return;
2671               err = transit_state_bkref (mctx, new_dest_nodes);
2672               if (BE (err != REG_NOERROR, 0))
2673                 goto free_return;
2674             }
2675         }
2676     }
2677   err = REG_NOERROR;
2678  free_return:
2679   return err;
2680 }
2681
2682 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2683    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2684    Note that we might collect inappropriate candidates here.
2685    However, the cost of checking them strictly here is too high, then we
2686    delay these checking for prune_impossible_nodes().  */
2687
2688 static reg_errcode_t
2689 internal_function
2690 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2691 {
2692   const re_dfa_t *const dfa = mctx->dfa;
2693   int subexp_num, sub_top_idx;
2694   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2695   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2696   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2697   if (cache_idx != -1)
2698     {
2699       const struct re_backref_cache_entry *entry
2700         = mctx->bkref_ents + cache_idx;
2701       do
2702         if (entry->node == bkref_node)
2703           return REG_NOERROR; /* We already checked it.  */
2704       while (entry++->more);
2705     }
2706
2707   subexp_num = dfa->nodes[bkref_node].opr.idx;
2708
2709   /* For each sub expression  */
2710   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2711     {
2712       reg_errcode_t err;
2713       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2714       re_sub_match_last_t *sub_last;
2715       int sub_last_idx, sl_str, bkref_str_off;
2716
2717       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2718         continue; /* It isn't related.  */
2719
2720       sl_str = sub_top->str_idx;
2721       bkref_str_off = bkref_str_idx;
2722       /* At first, check the last node of sub expressions we already
2723          evaluated.  */
2724       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2725         {
2726           int sl_str_diff;
2727           sub_last = sub_top->lasts[sub_last_idx];
2728           sl_str_diff = sub_last->str_idx - sl_str;
2729           /* The matched string by the sub expression match with the substring
2730              at the back reference?  */
2731           if (sl_str_diff > 0)
2732             {
2733               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2734                 {
2735                   /* Not enough chars for a successful match.  */
2736                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2737                     break;
2738
2739                   err = clean_state_log_if_needed (mctx,
2740                                                    bkref_str_off
2741                                                    + sl_str_diff);
2742                   if (BE (err != REG_NOERROR, 0))
2743                     return err;
2744                   buf = (const char *) re_string_get_buffer (&mctx->input);
2745                 }
2746               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2747                 /* We don't need to search this sub expression any more.  */
2748                 break;
2749             }
2750           bkref_str_off += sl_str_diff;
2751           sl_str += sl_str_diff;
2752           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2753                                 bkref_str_idx);
2754
2755           /* Reload buf, since the preceding call might have reallocated
2756              the buffer.  */
2757           buf = (const char *) re_string_get_buffer (&mctx->input);
2758
2759           if (err == REG_NOMATCH)
2760             continue;
2761           if (BE (err != REG_NOERROR, 0))
2762             return err;
2763         }
2764
2765       if (sub_last_idx < sub_top->nlasts)
2766         continue;
2767       if (sub_last_idx > 0)
2768         ++sl_str;
2769       /* Then, search for the other last nodes of the sub expression.  */
2770       for (; sl_str <= bkref_str_idx; ++sl_str)
2771         {
2772           int cls_node, sl_str_off;
2773           const re_node_set *nodes;
2774           sl_str_off = sl_str - sub_top->str_idx;
2775           /* The matched string by the sub expression match with the substring
2776              at the back reference?  */
2777           if (sl_str_off > 0)
2778             {
2779               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2780                 {
2781                   /* If we are at the end of the input, we cannot match.  */
2782                   if (bkref_str_off >= mctx->input.len)
2783                     break;
2784
2785                   err = extend_buffers (mctx);
2786                   if (BE (err != REG_NOERROR, 0))
2787                     return err;
2788
2789                   buf = (const char *) re_string_get_buffer (&mctx->input);
2790                 }
2791               if (buf [bkref_str_off++] != buf[sl_str - 1])
2792                 break; /* We don't need to search this sub expression
2793                           any more.  */
2794             }
2795           if (mctx->state_log[sl_str] == NULL)
2796             continue;
2797           /* Does this state have a ')' of the sub expression?  */
2798           nodes = &mctx->state_log[sl_str]->nodes;
2799           cls_node = find_subexp_node (dfa, nodes, subexp_num,
2800                                        OP_CLOSE_SUBEXP);
2801           if (cls_node == -1)
2802             continue; /* No.  */
2803           if (sub_top->path == NULL)
2804             {
2805               sub_top->path = calloc (sizeof (state_array_t),
2806                                       sl_str - sub_top->str_idx + 1);
2807               if (sub_top->path == NULL)
2808                 return REG_ESPACE;
2809             }
2810           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2811              in the current context?  */
2812           err = check_arrival (mctx, sub_top->path, sub_top->node,
2813                                sub_top->str_idx, cls_node, sl_str,
2814                                OP_CLOSE_SUBEXP);
2815           if (err == REG_NOMATCH)
2816               continue;
2817           if (BE (err != REG_NOERROR, 0))
2818               return err;
2819           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2820           if (BE (sub_last == NULL, 0))
2821             return REG_ESPACE;
2822           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2823                                 bkref_str_idx);
2824           if (err == REG_NOMATCH)
2825             continue;
2826         }
2827     }
2828   return REG_NOERROR;
2829 }
2830
2831 /* Helper functions for get_subexp().  */
2832
2833 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2834    If it can arrive, register the sub expression expressed with SUB_TOP
2835    and SUB_LAST.  */
2836
2837 static reg_errcode_t
2838 internal_function
2839 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2840                 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2841 {
2842   reg_errcode_t err;
2843   int to_idx;
2844   /* Can the subexpression arrive the back reference?  */
2845   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2846                        sub_last->str_idx, bkref_node, bkref_str,
2847                        OP_OPEN_SUBEXP);
2848   if (err != REG_NOERROR)
2849     return err;
2850   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2851                              sub_last->str_idx);
2852   if (BE (err != REG_NOERROR, 0))
2853     return err;
2854   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2855   return clean_state_log_if_needed (mctx, to_idx);
2856 }
2857
2858 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2859    Search '(' if FL_OPEN, or search ')' otherwise.
2860    TODO: This function isn't efficient...
2861          Because there might be more than one nodes whose types are
2862          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2863          nodes.
2864          E.g. RE: (a){2}  */
2865
2866 static int
2867 internal_function
2868 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2869                   int subexp_idx, int type)
2870 {
2871   int cls_idx;
2872   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2873     {
2874       int cls_node = nodes->elems[cls_idx];
2875       const re_token_t *node = dfa->nodes + cls_node;
2876       if (node->type == type
2877           && node->opr.idx == subexp_idx)
2878         return cls_node;
2879     }
2880   return -1;
2881 }
2882
2883 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2884    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2885    heavily reused.
2886    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2887
2888 static reg_errcode_t
2889 internal_function
2890 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2891                int top_str, int last_node, int last_str, int type)
2892 {
2893   const re_dfa_t *const dfa = mctx->dfa;
2894   reg_errcode_t err = REG_NOERROR;
2895   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2896   re_dfastate_t *cur_state = NULL;
2897   re_node_set *cur_nodes, next_nodes;
2898   re_dfastate_t **backup_state_log;
2899   unsigned int context;
2900
2901   subexp_num = dfa->nodes[top_node].opr.idx;
2902   /* Extend the buffer if we need.  */
2903   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2904     {
2905       re_dfastate_t **new_array;
2906       int old_alloc = path->alloc;
2907       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2908       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2909       if (BE (new_array == NULL, 0))
2910         {
2911           path->alloc = old_alloc;
2912           return REG_ESPACE;
2913         }
2914       path->array = new_array;
2915       memset (new_array + old_alloc, '\0',
2916               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2917     }
2918
2919   str_idx = path->next_idx ? path->next_idx : top_str;
2920
2921   /* Temporary modify MCTX.  */
2922   backup_state_log = mctx->state_log;
2923   backup_cur_idx = mctx->input.cur_idx;
2924   mctx->state_log = path->array;
2925   mctx->input.cur_idx = str_idx;
2926
2927   /* Setup initial node set.  */
2928   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2929   if (str_idx == top_str)
2930     {
2931       err = re_node_set_init_1 (&next_nodes, top_node);
2932       if (BE (err != REG_NOERROR, 0))
2933         return err;
2934       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2935       if (BE (err != REG_NOERROR, 0))
2936         {
2937           re_node_set_free (&next_nodes);
2938           return err;
2939         }
2940     }
2941   else
2942     {
2943       cur_state = mctx->state_log[str_idx];
2944       if (cur_state && cur_state->has_backref)
2945         {
2946           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2947           if (BE (err != REG_NOERROR, 0))
2948             return err;
2949         }
2950       else
2951         re_node_set_init_empty (&next_nodes);
2952     }
2953   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2954     {
2955       if (next_nodes.nelem)
2956         {
2957           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2958                                     subexp_num, type);
2959           if (BE (err != REG_NOERROR, 0))
2960             {
2961               re_node_set_free (&next_nodes);
2962               return err;
2963             }
2964         }
2965       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2966       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2967         {
2968           re_node_set_free (&next_nodes);
2969           return err;
2970         }
2971       mctx->state_log[str_idx] = cur_state;
2972     }
2973
2974   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2975     {
2976       re_node_set_empty (&next_nodes);
2977       if (mctx->state_log[str_idx + 1])
2978         {
2979           err = re_node_set_merge (&next_nodes,
2980                                    &mctx->state_log[str_idx + 1]->nodes);
2981           if (BE (err != REG_NOERROR, 0))
2982             {
2983               re_node_set_free (&next_nodes);
2984               return err;
2985             }
2986         }
2987       if (cur_state)
2988         {
2989           err = check_arrival_add_next_nodes (mctx, str_idx,
2990                                               &cur_state->non_eps_nodes,
2991                                               &next_nodes);
2992           if (BE (err != REG_NOERROR, 0))
2993             {
2994               re_node_set_free (&next_nodes);
2995               return err;
2996             }
2997         }
2998       ++str_idx;
2999       if (next_nodes.nelem)
3000         {
3001           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3002           if (BE (err != REG_NOERROR, 0))
3003             {
3004               re_node_set_free (&next_nodes);
3005               return err;
3006             }
3007           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3008                                     subexp_num, type);
3009           if (BE (err != REG_NOERROR, 0))
3010             {
3011               re_node_set_free (&next_nodes);
3012               return err;
3013             }
3014         }
3015       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3016       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3017       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3018         {
3019           re_node_set_free (&next_nodes);
3020           return err;
3021         }
3022       mctx->state_log[str_idx] = cur_state;
3023       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3024     }
3025   re_node_set_free (&next_nodes);
3026   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3027                : &mctx->state_log[last_str]->nodes);
3028   path->next_idx = str_idx;
3029
3030   /* Fix MCTX.  */
3031   mctx->state_log = backup_state_log;
3032   mctx->input.cur_idx = backup_cur_idx;
3033
3034   /* Then check the current node set has the node LAST_NODE.  */
3035   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3036     return REG_NOERROR;
3037
3038   return REG_NOMATCH;
3039 }
3040
3041 /* Helper functions for check_arrival.  */
3042
3043 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3044    to NEXT_NODES.
3045    TODO: This function is similar to the functions transit_state*(),
3046          however this function has many additional works.
3047          Can't we unify them?  */
3048
3049 static reg_errcode_t
3050 internal_function
3051 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3052                               re_node_set *cur_nodes, re_node_set *next_nodes)
3053 {
3054   const re_dfa_t *const dfa = mctx->dfa;
3055   int result;
3056   int cur_idx;
3057 #ifdef RE_ENABLE_I18N
3058   reg_errcode_t err = REG_NOERROR;
3059 #endif
3060   re_node_set union_set;
3061   re_node_set_init_empty (&union_set);
3062   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3063     {
3064       int naccepted = 0;
3065       int cur_node = cur_nodes->elems[cur_idx];
3066 #ifdef DEBUG
3067       re_token_type_t type = dfa->nodes[cur_node].type;
3068       assert (!IS_EPSILON_NODE (type));
3069 #endif
3070 #ifdef RE_ENABLE_I18N
3071       /* If the node may accept `multi byte'.  */
3072       if (dfa->nodes[cur_node].accept_mb)
3073         {
3074           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3075                                                str_idx);
3076           if (naccepted > 1)
3077             {
3078               re_dfastate_t *dest_state;
3079               int next_node = dfa->nexts[cur_node];
3080               int next_idx = str_idx + naccepted;
3081               dest_state = mctx->state_log[next_idx];
3082               re_node_set_empty (&union_set);
3083               if (dest_state)
3084                 {
3085                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3086                   if (BE (err != REG_NOERROR, 0))
3087                     {
3088                       re_node_set_free (&union_set);
3089                       return err;
3090                     }
3091                 }
3092               result = re_node_set_insert (&union_set, next_node);
3093               if (BE (result < 0, 0))
3094                 {
3095                   re_node_set_free (&union_set);
3096                   return REG_ESPACE;
3097                 }
3098               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3099                                                             &union_set);
3100               if (BE (mctx->state_log[next_idx] == NULL
3101                       && err != REG_NOERROR, 0))
3102                 {
3103                   re_node_set_free (&union_set);
3104                   return err;
3105                 }
3106             }
3107         }
3108 #endif /* RE_ENABLE_I18N */
3109       if (naccepted
3110           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3111         {
3112           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3113           if (BE (result < 0, 0))
3114             {
3115               re_node_set_free (&union_set);
3116               return REG_ESPACE;
3117             }
3118         }
3119     }
3120   re_node_set_free (&union_set);
3121   return REG_NOERROR;
3122 }
3123
3124 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3125    CUR_NODES, however exclude the nodes which are:
3126     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3127     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3128 */
3129
3130 static reg_errcode_t
3131 internal_function
3132 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3133                           int ex_subexp, int type)
3134 {
3135   reg_errcode_t err;
3136   int idx, outside_node;
3137   re_node_set new_nodes;
3138 #ifdef DEBUG
3139   assert (cur_nodes->nelem);
3140 #endif
3141   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3142   if (BE (err != REG_NOERROR, 0))
3143     return err;
3144   /* Create a new node set NEW_NODES with the nodes which are epsilon
3145      closures of the node in CUR_NODES.  */
3146
3147   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3148     {
3149       int cur_node = cur_nodes->elems[idx];
3150       const re_node_set *eclosure = dfa->eclosures + cur_node;
3151       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3152       if (outside_node == -1)
3153         {
3154           /* There are no problematic nodes, just merge them.  */
3155           err = re_node_set_merge (&new_nodes, eclosure);
3156           if (BE (err != REG_NOERROR, 0))
3157             {
3158               re_node_set_free (&new_nodes);
3159               return err;
3160             }
3161         }
3162       else
3163         {
3164           /* There are problematic nodes, re-calculate incrementally.  */
3165           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3166                                               ex_subexp, type);
3167           if (BE (err != REG_NOERROR, 0))
3168             {
3169               re_node_set_free (&new_nodes);
3170               return err;
3171             }
3172         }
3173     }
3174   re_node_set_free (cur_nodes);
3175   *cur_nodes = new_nodes;
3176   return REG_NOERROR;
3177 }
3178
3179 /* Helper function for check_arrival_expand_ecl.
3180    Check incrementally the epsilon closure of TARGET, and if it isn't
3181    problematic append it to DST_NODES.  */
3182
3183 static reg_errcode_t
3184 internal_function
3185 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3186                               int target, int ex_subexp, int type)
3187 {
3188   int cur_node;
3189   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3190     {
3191       int err;
3192
3193       if (dfa->nodes[cur_node].type == type
3194           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3195         {
3196           if (type == OP_CLOSE_SUBEXP)
3197             {
3198               err = re_node_set_insert (dst_nodes, cur_node);
3199               if (BE (err == -1, 0))
3200                 return REG_ESPACE;
3201             }
3202           break;
3203         }
3204       err = re_node_set_insert (dst_nodes, cur_node);
3205       if (BE (err == -1, 0))
3206         return REG_ESPACE;
3207       if (dfa->edests[cur_node].nelem == 0)
3208         break;
3209       if (dfa->edests[cur_node].nelem == 2)
3210         {
3211           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3212                                               dfa->edests[cur_node].elems[1],
3213                                               ex_subexp, type);
3214           if (BE (err != REG_NOERROR, 0))
3215             return err;
3216         }
3217       cur_node = dfa->edests[cur_node].elems[0];
3218     }
3219   return REG_NOERROR;
3220 }
3221
3222
3223 /* For all the back references in the current state, calculate the
3224    destination of the back references by the appropriate entry
3225    in MCTX->BKREF_ENTS.  */
3226
3227 static reg_errcode_t
3228 internal_function
3229 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3230                     int cur_str, int subexp_num, int type)
3231 {
3232   const re_dfa_t *const dfa = mctx->dfa;
3233   reg_errcode_t err;
3234   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3235   struct re_backref_cache_entry *ent;
3236
3237   if (cache_idx_start == -1)
3238     return REG_NOERROR;
3239
3240  restart:
3241   ent = mctx->bkref_ents + cache_idx_start;
3242   do
3243     {
3244       int to_idx, next_node;
3245
3246       /* Is this entry ENT is appropriate?  */
3247       if (!re_node_set_contains (cur_nodes, ent->node))
3248         continue; /* No.  */
3249
3250       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3251       /* Calculate the destination of the back reference, and append it
3252          to MCTX->STATE_LOG.  */
3253       if (to_idx == cur_str)
3254         {
3255           /* The backreference did epsilon transit, we must re-check all the
3256              node in the current state.  */
3257           re_node_set new_dests;
3258           reg_errcode_t err2, err3;
3259           next_node = dfa->edests[ent->node].elems[0];
3260           if (re_node_set_contains (cur_nodes, next_node))
3261             continue;
3262           err = re_node_set_init_1 (&new_dests, next_node);
3263           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3264           err3 = re_node_set_merge (cur_nodes, &new_dests);
3265           re_node_set_free (&new_dests);
3266           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3267                   || err3 != REG_NOERROR, 0))
3268             {
3269               err = (err != REG_NOERROR ? err
3270                      : (err2 != REG_NOERROR ? err2 : err3));
3271               return err;
3272             }
3273           /* TODO: It is still inefficient...  */
3274           goto restart;
3275         }
3276       else
3277         {
3278           re_node_set union_set;
3279           next_node = dfa->nexts[ent->node];
3280           if (mctx->state_log[to_idx])
3281             {
3282               int ret;
3283               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3284                                         next_node))
3285                 continue;
3286               err = re_node_set_init_copy (&union_set,
3287                                            &mctx->state_log[to_idx]->nodes);
3288               ret = re_node_set_insert (&union_set, next_node);
3289               if (BE (err != REG_NOERROR || ret < 0, 0))
3290                 {
3291                   re_node_set_free (&union_set);
3292                   err = err != REG_NOERROR ? err : REG_ESPACE;
3293                   return err;
3294                 }
3295             }
3296           else
3297             {
3298               err = re_node_set_init_1 (&union_set, next_node);
3299               if (BE (err != REG_NOERROR, 0))
3300                 return err;
3301             }
3302           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3303           re_node_set_free (&union_set);
3304           if (BE (mctx->state_log[to_idx] == NULL
3305                   && err != REG_NOERROR, 0))
3306             return err;
3307         }
3308     }
3309   while (ent++->more);
3310   return REG_NOERROR;
3311 }
3312
3313 /* Build transition table for the state.
3314    Return 1 if succeeded, otherwise return NULL.  */
3315
3316 static int
3317 internal_function
3318 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3319 {
3320   reg_errcode_t err;
3321   int i, j, ch, need_word_trtable = 0;
3322   bitset_word_t elem, mask;
3323   bool dests_node_malloced = false;
3324   bool dest_states_malloced = false;
3325   int ndests; /* Number of the destination states from `state'.  */
3326   re_dfastate_t **trtable;
3327   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3328   re_node_set follows, *dests_node;
3329   bitset_t *dests_ch;
3330   bitset_t acceptable;
3331
3332   struct dests_alloc
3333   {
3334     re_node_set dests_node[SBC_MAX];
3335     bitset_t dests_ch[SBC_MAX];
3336   } *dests_alloc;
3337
3338   /* We build DFA states which corresponds to the destination nodes
3339      from `state'.  `dests_node[i]' represents the nodes which i-th
3340      destination state contains, and `dests_ch[i]' represents the
3341      characters which i-th destination state accepts.  */
3342 #ifdef HAVE_ALLOCA
3343   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3344     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3345   else
3346 #endif
3347     {
3348       dests_alloc = re_malloc (struct dests_alloc, 1);
3349       if (BE (dests_alloc == NULL, 0))
3350         return 0;
3351       dests_node_malloced = true;
3352     }
3353   dests_node = dests_alloc->dests_node;
3354   dests_ch = dests_alloc->dests_ch;
3355
3356   /* Initialize transiton table.  */
3357   state->word_trtable = state->trtable = NULL;
3358
3359   /* At first, group all nodes belonging to `state' into several
3360      destinations.  */
3361   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3362   if (BE (ndests <= 0, 0))
3363     {
3364       if (dests_node_malloced)
3365         free (dests_alloc);
3366       /* Return 0 in case of an error, 1 otherwise.  */
3367       if (ndests == 0)
3368         {
3369           state->trtable = (re_dfastate_t **)
3370             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3371           return 1;
3372         }
3373       return 0;
3374     }
3375
3376   err = re_node_set_alloc (&follows, ndests + 1);
3377   if (BE (err != REG_NOERROR, 0))
3378     goto out_free;
3379
3380   /* Avoid arithmetic overflow in size calculation.  */
3381   if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3382             / (3 * sizeof (re_dfastate_t *)))
3383            < ndests),
3384           0))
3385     goto out_free;
3386
3387 #ifdef HAVE_ALLOCA
3388   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3389                          + ndests * 3 * sizeof (re_dfastate_t *)))
3390     dest_states = (re_dfastate_t **)
3391       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3392   else
3393 #endif
3394     {
3395       dest_states = (re_dfastate_t **)
3396         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3397       if (BE (dest_states == NULL, 0))
3398         {
3399 out_free:
3400           if (dest_states_malloced)
3401             free (dest_states);
3402           re_node_set_free (&follows);
3403           for (i = 0; i < ndests; ++i)
3404             re_node_set_free (dests_node + i);
3405           if (dests_node_malloced)
3406             free (dests_alloc);
3407           return 0;
3408         }
3409       dest_states_malloced = true;
3410     }
3411   dest_states_word = dest_states + ndests;
3412   dest_states_nl = dest_states_word + ndests;
3413   bitset_empty (acceptable);
3414
3415   /* Then build the states for all destinations.  */
3416   for (i = 0; i < ndests; ++i)
3417     {
3418       int next_node;
3419       re_node_set_empty (&follows);
3420       /* Merge the follows of this destination states.  */
3421       for (j = 0; j < dests_node[i].nelem; ++j)
3422         {
3423           next_node = dfa->nexts[dests_node[i].elems[j]];
3424           if (next_node != -1)
3425             {
3426               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3427               if (BE (err != REG_NOERROR, 0))
3428                 goto out_free;
3429             }
3430         }
3431       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3432       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3433         goto out_free;
3434       /* If the new state has context constraint,
3435          build appropriate states for these contexts.  */
3436       if (dest_states[i]->has_constraint)
3437         {
3438           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3439                                                           CONTEXT_WORD);
3440           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3441             goto out_free;
3442
3443           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3444             need_word_trtable = 1;
3445
3446           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3447                                                         CONTEXT_NEWLINE);
3448           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3449             goto out_free;
3450         }
3451       else
3452         {
3453           dest_states_word[i] = dest_states[i];
3454           dest_states_nl[i] = dest_states[i];
3455         }
3456       bitset_merge (acceptable, dests_ch[i]);
3457     }
3458
3459   if (!BE (need_word_trtable, 0))
3460     {
3461       /* We don't care about whether the following character is a word
3462          character, or we are in a single-byte character set so we can
3463          discern by looking at the character code: allocate a
3464          256-entry transition table.  */
3465       trtable = state->trtable =
3466         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3467       if (BE (trtable == NULL, 0))
3468         goto out_free;
3469
3470       /* For all characters ch...:  */
3471       for (i = 0; i < BITSET_WORDS; ++i)
3472         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3473              elem;
3474              mask <<= 1, elem >>= 1, ++ch)
3475           if (BE (elem & 1, 0))
3476             {
3477               /* There must be exactly one destination which accepts
3478                  character ch.  See group_nodes_into_DFAstates.  */
3479               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3480                 ;
3481
3482               /* j-th destination accepts the word character ch.  */
3483               if (dfa->word_char[i] & mask)
3484                 trtable[ch] = dest_states_word[j];
3485               else
3486                 trtable[ch] = dest_states[j];
3487             }
3488     }
3489   else
3490     {
3491       /* We care about whether the following character is a word
3492          character, and we are in a multi-byte character set: discern
3493          by looking at the character code: build two 256-entry
3494          transition tables, one starting at trtable[0] and one
3495          starting at trtable[SBC_MAX].  */
3496       trtable = state->word_trtable =
3497         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3498       if (BE (trtable == NULL, 0))
3499         goto out_free;
3500
3501       /* For all characters ch...:  */
3502       for (i = 0; i < BITSET_WORDS; ++i)
3503         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3504              elem;
3505              mask <<= 1, elem >>= 1, ++ch)
3506           if (BE (elem & 1, 0))
3507             {
3508               /* There must be exactly one destination which accepts
3509                  character ch.  See group_nodes_into_DFAstates.  */
3510               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3511                 ;
3512
3513               /* j-th destination accepts the word character ch.  */
3514               trtable[ch] = dest_states[j];
3515               trtable[ch + SBC_MAX] = dest_states_word[j];
3516             }
3517     }
3518
3519   /* new line */
3520   if (bitset_contain (acceptable, NEWLINE_CHAR))
3521     {
3522       /* The current state accepts newline character.  */
3523       for (j = 0; j < ndests; ++j)
3524         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3525           {
3526             /* k-th destination accepts newline character.  */
3527             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3528             if (need_word_trtable)
3529               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3530             /* There must be only one destination which accepts
3531                newline.  See group_nodes_into_DFAstates.  */
3532             break;
3533           }
3534     }
3535
3536   if (dest_states_malloced)
3537     free (dest_states);
3538
3539   re_node_set_free (&follows);
3540   for (i = 0; i < ndests; ++i)
3541     re_node_set_free (dests_node + i);
3542
3543   if (dests_node_malloced)
3544     free (dests_alloc);
3545
3546   return 1;
3547 }
3548
3549 /* Group all nodes belonging to STATE into several destinations.
3550    Then for all destinations, set the nodes belonging to the destination
3551    to DESTS_NODE[i] and set the characters accepted by the destination
3552    to DEST_CH[i].  This function return the number of destinations.  */
3553
3554 static int
3555 internal_function
3556 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3557                             re_node_set *dests_node, bitset_t *dests_ch)
3558 {
3559   reg_errcode_t err;
3560   int result;
3561   int i, j, k;
3562   int ndests; /* Number of the destinations from `state'.  */
3563   bitset_t accepts; /* Characters a node can accept.  */
3564   const re_node_set *cur_nodes = &state->nodes;
3565   bitset_empty (accepts);
3566   ndests = 0;
3567
3568   /* For all the nodes belonging to `state',  */
3569   for (i = 0; i < cur_nodes->nelem; ++i)
3570     {
3571       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3572       re_token_type_t type = node->type;
3573       unsigned int constraint = node->constraint;
3574
3575       /* Enumerate all single byte character this node can accept.  */
3576       if (type == CHARACTER)
3577         bitset_set (accepts, node->opr.c);
3578       else if (type == SIMPLE_BRACKET)
3579         {
3580           bitset_merge (accepts, node->opr.sbcset);
3581         }
3582       else if (type == OP_PERIOD)
3583         {
3584 #ifdef RE_ENABLE_I18N
3585           if (dfa->mb_cur_max > 1)
3586             bitset_merge (accepts, dfa->sb_char);
3587           else
3588 #endif
3589             bitset_set_all (accepts);
3590           if (!(dfa->syntax & RE_DOT_NEWLINE))
3591             bitset_clear (accepts, '\n');
3592           if (dfa->syntax & RE_DOT_NOT_NULL)
3593             bitset_clear (accepts, '\0');
3594         }
3595 #ifdef RE_ENABLE_I18N
3596       else if (type == OP_UTF8_PERIOD)
3597         {
3598           memset (accepts, '\xff', sizeof (bitset_t) / 2);
3599           if (!(dfa->syntax & RE_DOT_NEWLINE))
3600             bitset_clear (accepts, '\n');
3601           if (dfa->syntax & RE_DOT_NOT_NULL)
3602             bitset_clear (accepts, '\0');
3603         }
3604 #endif
3605       else
3606         continue;
3607
3608       /* Check the `accepts' and sift the characters which are not
3609          match it the context.  */
3610       if (constraint)
3611         {
3612           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3613             {
3614               bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3615               bitset_empty (accepts);
3616               if (accepts_newline)
3617                 bitset_set (accepts, NEWLINE_CHAR);
3618               else
3619                 continue;
3620             }
3621           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3622             {
3623               bitset_empty (accepts);
3624               continue;
3625             }
3626
3627           if (constraint & NEXT_WORD_CONSTRAINT)
3628             {
3629               bitset_word_t any_set = 0;
3630               if (type == CHARACTER && !node->word_char)
3631                 {
3632                   bitset_empty (accepts);
3633                   continue;
3634                 }
3635 #ifdef RE_ENABLE_I18N
3636               if (dfa->mb_cur_max > 1)
3637                 for (j = 0; j < BITSET_WORDS; ++j)
3638                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3639               else
3640 #endif
3641                 for (j = 0; j < BITSET_WORDS; ++j)
3642                   any_set |= (accepts[j] &= dfa->word_char[j]);
3643               if (!any_set)
3644                 continue;
3645             }
3646           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3647             {
3648               bitset_word_t any_set = 0;
3649               if (type == CHARACTER && node->word_char)
3650                 {
3651                   bitset_empty (accepts);
3652                   continue;
3653                 }
3654 #ifdef RE_ENABLE_I18N
3655               if (dfa->mb_cur_max > 1)
3656                 for (j = 0; j < BITSET_WORDS; ++j)
3657                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3658               else
3659 #endif
3660                 for (j = 0; j < BITSET_WORDS; ++j)
3661                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3662               if (!any_set)
3663                 continue;
3664             }
3665         }
3666
3667       /* Then divide `accepts' into DFA states, or create a new
3668          state.  Above, we make sure that accepts is not empty.  */
3669       for (j = 0; j < ndests; ++j)
3670         {
3671           bitset_t intersec; /* Intersection sets, see below.  */
3672           bitset_t remains;
3673           /* Flags, see below.  */
3674           bitset_word_t has_intersec, not_subset, not_consumed;
3675
3676           /* Optimization, skip if this state doesn't accept the character.  */
3677           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3678             continue;
3679
3680           /* Enumerate the intersection set of this state and `accepts'.  */
3681           has_intersec = 0;
3682           for (k = 0; k < BITSET_WORDS; ++k)
3683             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3684           /* And skip if the intersection set is empty.  */
3685           if (!has_intersec)
3686             continue;
3687
3688           /* Then check if this state is a subset of `accepts'.  */
3689           not_subset = not_consumed = 0;
3690           for (k = 0; k < BITSET_WORDS; ++k)
3691             {
3692               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3693               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3694             }
3695
3696           /* If this state isn't a subset of `accepts', create a
3697              new group state, which has the `remains'. */
3698           if (not_subset)
3699             {
3700               bitset_copy (dests_ch[ndests], remains);
3701               bitset_copy (dests_ch[j], intersec);
3702               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3703               if (BE (err != REG_NOERROR, 0))
3704                 goto error_return;
3705               ++ndests;
3706             }
3707
3708           /* Put the position in the current group. */
3709           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3710           if (BE (result < 0, 0))
3711             goto error_return;
3712
3713           /* If all characters are consumed, go to next node. */
3714           if (!not_consumed)
3715             break;
3716         }
3717       /* Some characters remain, create a new group. */
3718       if (j == ndests)
3719         {
3720           bitset_copy (dests_ch[ndests], accepts);
3721           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3722           if (BE (err != REG_NOERROR, 0))
3723             goto error_return;
3724           ++ndests;
3725           bitset_empty (accepts);
3726         }
3727     }
3728   return ndests;
3729  error_return:
3730   for (j = 0; j < ndests; ++j)
3731     re_node_set_free (dests_node + j);
3732   return -1;
3733 }
3734
3735 #ifdef RE_ENABLE_I18N
3736 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3737    Return the number of the bytes the node accepts.
3738    STR_IDX is the current index of the input string.
3739
3740    This function handles the nodes which can accept one character, or
3741    one collating element like '.', '[a-z]', opposite to the other nodes
3742    can only accept one byte.  */
3743
3744 static int
3745 internal_function
3746 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3747                          const re_string_t *input, int str_idx)
3748 {
3749   const re_token_t *node = dfa->nodes + node_idx;
3750   int char_len, elem_len;
3751   int i;
3752   wint_t wc;
3753
3754   if (BE (node->type == OP_UTF8_PERIOD, 0))
3755     {
3756       unsigned char c = re_string_byte_at (input, str_idx), d;
3757       if (BE (c < 0xc2, 1))
3758         return 0;
3759
3760       if (str_idx + 2 > input->len)
3761         return 0;
3762
3763       d = re_string_byte_at (input, str_idx + 1);
3764       if (c < 0xe0)
3765         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3766       else if (c < 0xf0)
3767         {
3768           char_len = 3;
3769           if (c == 0xe0 && d < 0xa0)
3770             return 0;
3771         }
3772       else if (c < 0xf8)
3773         {
3774           char_len = 4;
3775           if (c == 0xf0 && d < 0x90)
3776             return 0;
3777         }
3778       else if (c < 0xfc)
3779         {
3780           char_len = 5;
3781           if (c == 0xf8 && d < 0x88)
3782             return 0;
3783         }
3784       else if (c < 0xfe)
3785         {
3786           char_len = 6;
3787           if (c == 0xfc && d < 0x84)
3788             return 0;
3789         }
3790       else
3791         return 0;
3792
3793       if (str_idx + char_len > input->len)
3794         return 0;
3795
3796       for (i = 1; i < char_len; ++i)
3797         {
3798           d = re_string_byte_at (input, str_idx + i);
3799           if (d < 0x80 || d > 0xbf)
3800             return 0;
3801         }
3802       return char_len;
3803     }
3804
3805   char_len = re_string_char_size_at (input, str_idx);
3806   if (node->type == OP_PERIOD)
3807     {
3808       if (char_len <= 1)
3809         return 0;
3810       /* FIXME: I don't think this if is needed, as both '\n'
3811          and '\0' are char_len == 1.  */
3812       /* '.' accepts any one character except the following two cases.  */
3813       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3814            re_string_byte_at (input, str_idx) == '\n') ||
3815           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3816            re_string_byte_at (input, str_idx) == '\0'))
3817         return 0;
3818       return char_len;
3819     }
3820
3821   elem_len = re_string_elem_size_at (input, str_idx);
3822   wc = __btowc(*(input->mbs+str_idx));
3823   if (((elem_len <= 1 && char_len <= 1) || char_len == 0) && (wc != WEOF && wc < SBC_MAX))
3824     return 0;
3825
3826   if (node->type == COMPLEX_BRACKET)
3827     {
3828       const re_charset_t *cset = node->opr.mbcset;
3829 # ifdef _LIBC
3830       const unsigned char *pin
3831         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3832       int j;
3833       uint32_t nrules;
3834 # endif /* _LIBC */
3835       int match_len = 0;
3836       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3837                     ? re_string_wchar_at (input, str_idx) : 0);
3838
3839       /* match with multibyte character?  */
3840       for (i = 0; i < cset->nmbchars; ++i)
3841         if (wc == cset->mbchars[i])
3842           {
3843             match_len = char_len;
3844             goto check_node_accept_bytes_match;
3845           }
3846       /* match with character_class?  */
3847       for (i = 0; i < cset->nchar_classes; ++i)
3848         {
3849           wctype_t wt = cset->char_classes[i];
3850           if (__iswctype (wc, wt))
3851             {
3852               match_len = char_len;
3853               goto check_node_accept_bytes_match;
3854             }
3855         }
3856
3857 # ifdef _LIBC
3858       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3859       if (nrules != 0)
3860         {
3861           unsigned int in_collseq = 0;
3862           const int32_t *table, *indirect;
3863           const unsigned char *weights, *extra;
3864           const char *collseqwc;
3865           /* This #include defines a local function!  */
3866 #  include <locale/weight.h>
3867
3868           /* match with collating_symbol?  */
3869           if (cset->ncoll_syms)
3870             extra = (const unsigned char *)
3871               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3872           for (i = 0; i < cset->ncoll_syms; ++i)
3873             {
3874               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3875               /* Compare the length of input collating element and
3876                  the length of current collating element.  */
3877               if (*coll_sym != elem_len)
3878                 continue;
3879               /* Compare each bytes.  */
3880               for (j = 0; j < *coll_sym; j++)
3881                 if (pin[j] != coll_sym[1 + j])
3882                   break;
3883               if (j == *coll_sym)
3884                 {
3885                   /* Match if every bytes is equal.  */
3886                   match_len = j;
3887                   goto check_node_accept_bytes_match;
3888                 }
3889             }
3890
3891           if (cset->nranges)
3892             {
3893               if (elem_len <= char_len)
3894                 {
3895                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3896                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3897                 }
3898               else
3899                 in_collseq = find_collation_sequence_value (pin, elem_len);
3900             }
3901           /* match with range expression?  */
3902           for (i = 0; i < cset->nranges; ++i)
3903             if (cset->range_starts[i] <= in_collseq
3904                 && in_collseq <= cset->range_ends[i])
3905               {
3906                 match_len = elem_len;
3907                 goto check_node_accept_bytes_match;
3908               }
3909
3910           /* match with equivalence_class?  */
3911           if (cset->nequiv_classes)
3912             {
3913               const unsigned char *cp = pin;
3914               table = (const int32_t *)
3915                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3916               weights = (const unsigned char *)
3917                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3918               extra = (const unsigned char *)
3919                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3920               indirect = (const int32_t *)
3921                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3922               int32_t idx = findidx (&cp);
3923               if (idx > 0)
3924                 for (i = 0; i < cset->nequiv_classes; ++i)
3925                   {
3926                     int32_t equiv_class_idx = cset->equiv_classes[i];
3927                     size_t weight_len = weights[idx & 0xffffff];
3928                     if (weight_len == weights[equiv_class_idx & 0xffffff]
3929                         && (idx >> 24) == (equiv_class_idx >> 24))
3930                       {
3931                         int cnt = 0;
3932
3933                         idx &= 0xffffff;
3934                         equiv_class_idx &= 0xffffff;
3935
3936                         while (cnt <= weight_len
3937                                && (weights[equiv_class_idx + 1 + cnt]
3938                                    == weights[idx + 1 + cnt]))
3939                           ++cnt;
3940                         if (cnt > weight_len)
3941                           {
3942                             match_len = elem_len;
3943                             goto check_node_accept_bytes_match;
3944                           }
3945                       }
3946                   }
3947             }
3948         }
3949       else
3950 # endif /* _LIBC */
3951         {
3952           /* match with range expression?  */
3953 #if __GNUC__ >= 2
3954           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3955 #else
3956           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3957           cmp_buf[2] = wc;
3958 #endif
3959           for (i = 0; i < cset->nranges; ++i)
3960             {
3961               cmp_buf[0] = cset->range_starts[i];
3962               cmp_buf[4] = cset->range_ends[i];
3963               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3964                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3965                 {
3966                   match_len = char_len;
3967                   goto check_node_accept_bytes_match;
3968                 }
3969             }
3970         }
3971     check_node_accept_bytes_match:
3972       if (!cset->non_match)
3973         return match_len;
3974       else
3975         {
3976           if (match_len > 0)
3977             return 0;
3978           else
3979             return (elem_len > char_len) ? elem_len : char_len;
3980         }
3981     }
3982   return 0;
3983 }
3984
3985 # ifdef _LIBC
3986 static unsigned int
3987 internal_function
3988 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3989 {
3990   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3991   if (nrules == 0)
3992     {
3993       if (mbs_len == 1)
3994         {
3995           /* No valid character.  Match it as a single byte character.  */
3996           const unsigned char *collseq = (const unsigned char *)
3997             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3998           return collseq[mbs[0]];
3999         }
4000       return UINT_MAX;
4001     }
4002   else
4003     {
4004       int32_t idx;
4005       const unsigned char *extra = (const unsigned char *)
4006         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4007       int32_t extrasize = (const unsigned char *)
4008         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4009
4010       for (idx = 0; idx < extrasize;)
4011         {
4012           int mbs_cnt, found = 0;
4013           int32_t elem_mbs_len;
4014           /* Skip the name of collating element name.  */
4015           idx = idx + extra[idx] + 1;
4016           elem_mbs_len = extra[idx++];
4017           if (mbs_len == elem_mbs_len)
4018             {
4019               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4020                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4021                   break;
4022               if (mbs_cnt == elem_mbs_len)
4023                 /* Found the entry.  */
4024                 found = 1;
4025             }
4026           /* Skip the byte sequence of the collating element.  */
4027           idx += elem_mbs_len;
4028           /* Adjust for the alignment.  */
4029           idx = (idx + 3) & ~3;
4030           /* Skip the collation sequence value.  */
4031           idx += sizeof (uint32_t);
4032           /* Skip the wide char sequence of the collating element.  */
4033           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4034           /* If we found the entry, return the sequence value.  */
4035           if (found)
4036             return *(uint32_t *) (extra + idx);
4037           /* Skip the collation sequence value.  */
4038           idx += sizeof (uint32_t);
4039         }
4040       return UINT_MAX;
4041     }
4042 }
4043 # endif /* _LIBC */
4044 #endif /* RE_ENABLE_I18N */
4045
4046 /* Check whether the node accepts the byte which is IDX-th
4047    byte of the INPUT.  */
4048
4049 static int
4050 internal_function
4051 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4052                    int idx)
4053 {
4054   unsigned char ch;
4055   ch = re_string_byte_at (&mctx->input, idx);
4056   switch (node->type)
4057     {
4058     case CHARACTER:
4059       if (node->opr.c != ch)
4060         return 0;
4061       break;
4062
4063     case SIMPLE_BRACKET:
4064       if (!bitset_contain (node->opr.sbcset, ch))
4065         return 0;
4066       break;
4067
4068 #ifdef RE_ENABLE_I18N
4069     case OP_UTF8_PERIOD:
4070       if (ch >= 0x80)
4071         return 0;
4072       /* FALLTHROUGH */
4073 #endif
4074     case OP_PERIOD:
4075       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4076           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4077         return 0;
4078       break;
4079
4080     default:
4081       return 0;
4082     }
4083
4084   if (node->constraint)
4085     {
4086       /* The node has constraints.  Check whether the current context
4087          satisfies the constraints.  */
4088       unsigned int context = re_string_context_at (&mctx->input, idx,
4089                                                    mctx->eflags);
4090       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4091         return 0;
4092     }
4093
4094   return 1;
4095 }
4096
4097 /* Extend the buffers, if the buffers have run out.  */
4098
4099 static reg_errcode_t
4100 internal_function
4101 extend_buffers (re_match_context_t *mctx)
4102 {
4103   reg_errcode_t ret;
4104   re_string_t *pstr = &mctx->input;
4105
4106   /* Avoid overflow.  */
4107   if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4108     return REG_ESPACE;
4109
4110   /* Double the lengthes of the buffers.  */
4111   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4112   if (BE (ret != REG_NOERROR, 0))
4113     return ret;
4114
4115   if (mctx->state_log != NULL)
4116     {
4117       /* And double the length of state_log.  */
4118       /* XXX We have no indication of the size of this buffer.  If this
4119          allocation fail we have no indication that the state_log array
4120          does not have the right size.  */
4121       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4122                                               pstr->bufs_len + 1);
4123       if (BE (new_array == NULL, 0))
4124         return REG_ESPACE;
4125       mctx->state_log = new_array;
4126     }
4127
4128   /* Then reconstruct the buffers.  */
4129   if (pstr->icase)
4130     {
4131 #ifdef RE_ENABLE_I18N
4132       if (pstr->mb_cur_max > 1)
4133         {
4134           ret = build_wcs_upper_buffer (pstr);
4135           if (BE (ret != REG_NOERROR, 0))
4136             return ret;
4137         }
4138       else
4139 #endif /* RE_ENABLE_I18N  */
4140         build_upper_buffer (pstr);
4141     }
4142   else
4143     {
4144 #ifdef RE_ENABLE_I18N
4145       if (pstr->mb_cur_max > 1)
4146         build_wcs_buffer (pstr);
4147       else
4148 #endif /* RE_ENABLE_I18N  */
4149         {
4150           if (pstr->trans != NULL)
4151             re_string_translate_buffer (pstr);
4152         }
4153     }
4154   return REG_NOERROR;
4155 }
4156
4157 \f
4158 /* Functions for matching context.  */
4159
4160 /* Initialize MCTX.  */
4161
4162 static reg_errcode_t
4163 internal_function
4164 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4165 {
4166   mctx->eflags = eflags;
4167   mctx->match_last = -1;
4168   if (n > 0)
4169     {
4170       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4171       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4172       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4173         return REG_ESPACE;
4174     }
4175   /* Already zero-ed by the caller.
4176      else
4177        mctx->bkref_ents = NULL;
4178      mctx->nbkref_ents = 0;
4179      mctx->nsub_tops = 0;  */
4180   mctx->abkref_ents = n;
4181   mctx->max_mb_elem_len = 1;
4182   mctx->asub_tops = n;
4183   return REG_NOERROR;
4184 }
4185
4186 /* Clean the entries which depend on the current input in MCTX.
4187    This function must be invoked when the matcher changes the start index
4188    of the input, or changes the input string.  */
4189
4190 static void
4191 internal_function
4192 match_ctx_clean (re_match_context_t *mctx)
4193 {
4194   int st_idx;
4195   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4196     {
4197       int sl_idx;
4198       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4199       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4200         {
4201           re_sub_match_last_t *last = top->lasts[sl_idx];
4202           re_free (last->path.array);
4203           re_free (last);
4204         }
4205       re_free (top->lasts);
4206       if (top->path)
4207         {
4208           re_free (top->path->array);
4209           re_free (top->path);
4210         }
4211       free (top);
4212     }
4213
4214   mctx->nsub_tops = 0;
4215   mctx->nbkref_ents = 0;
4216 }
4217
4218 /* Free all the memory associated with MCTX.  */
4219
4220 static void
4221 internal_function
4222 match_ctx_free (re_match_context_t *mctx)
4223 {
4224   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4225   match_ctx_clean (mctx);
4226   re_free (mctx->sub_tops);
4227   re_free (mctx->bkref_ents);
4228 }
4229
4230 /* Add a new backreference entry to MCTX.
4231    Note that we assume that caller never call this function with duplicate
4232    entry, and call with STR_IDX which isn't smaller than any existing entry.
4233 */
4234
4235 static reg_errcode_t
4236 internal_function
4237 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4238                      int to)
4239 {
4240   if (mctx->nbkref_ents >= mctx->abkref_ents)
4241     {
4242       struct re_backref_cache_entry* new_entry;
4243       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4244                               mctx->abkref_ents * 2);
4245       if (BE (new_entry == NULL, 0))
4246         {
4247           re_free (mctx->bkref_ents);
4248           return REG_ESPACE;
4249         }
4250       mctx->bkref_ents = new_entry;
4251       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4252               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4253       mctx->abkref_ents *= 2;
4254     }
4255   if (mctx->nbkref_ents > 0
4256       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4257     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4258
4259   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4260   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4261   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4262   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4263
4264   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4265      If bit N is clear, means that this entry won't epsilon-transition to
4266      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4267      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4268      such node.
4269
4270      A backreference does not epsilon-transition unless it is empty, so set
4271      to all zeros if FROM != TO.  */
4272   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4273     = (from == to ? ~0 : 0);
4274
4275   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4276   if (mctx->max_mb_elem_len < to - from)
4277     mctx->max_mb_elem_len = to - from;
4278   return REG_NOERROR;
4279 }
4280
4281 /* Search for the first entry which has the same str_idx, or -1 if none is
4282    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4283
4284 static int
4285 internal_function
4286 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4287 {
4288   int left, right, mid, last;
4289   last = right = mctx->nbkref_ents;
4290   for (left = 0; left < right;)
4291     {
4292       mid = (left + right) / 2;
4293       if (mctx->bkref_ents[mid].str_idx < str_idx)
4294         left = mid + 1;
4295       else
4296         right = mid;
4297     }
4298   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4299     return left;
4300   else
4301     return -1;
4302 }
4303
4304 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4305    at STR_IDX.  */
4306
4307 static reg_errcode_t
4308 internal_function
4309 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4310 {
4311 #ifdef DEBUG
4312   assert (mctx->sub_tops != NULL);
4313   assert (mctx->asub_tops > 0);
4314 #endif
4315   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4316     {
4317       int new_asub_tops = mctx->asub_tops * 2;
4318       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4319                                                    re_sub_match_top_t *,
4320                                                    new_asub_tops);
4321       if (BE (new_array == NULL, 0))
4322         return REG_ESPACE;
4323       mctx->sub_tops = new_array;
4324       mctx->asub_tops = new_asub_tops;
4325     }
4326   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4327   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4328     return REG_ESPACE;
4329   mctx->sub_tops[mctx->nsub_tops]->node = node;
4330   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4331   return REG_NOERROR;
4332 }
4333
4334 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4335    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4336
4337 static re_sub_match_last_t *
4338 internal_function
4339 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4340 {
4341   re_sub_match_last_t *new_entry;
4342   if (BE (subtop->nlasts == subtop->alasts, 0))
4343     {
4344       int new_alasts = 2 * subtop->alasts + 1;
4345       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4346                                                     re_sub_match_last_t *,
4347                                                     new_alasts);
4348       if (BE (new_array == NULL, 0))
4349         return NULL;
4350       subtop->lasts = new_array;
4351       subtop->alasts = new_alasts;
4352     }
4353   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4354   if (BE (new_entry != NULL, 1))
4355     {
4356       subtop->lasts[subtop->nlasts] = new_entry;
4357       new_entry->node = node;
4358       new_entry->str_idx = str_idx;
4359       ++subtop->nlasts;
4360     }
4361   return new_entry;
4362 }
4363
4364 static void
4365 internal_function
4366 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4367                re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4368 {
4369   sctx->sifted_states = sifted_sts;
4370   sctx->limited_states = limited_sts;
4371   sctx->last_node = last_node;
4372   sctx->last_str_idx = last_str_idx;
4373   re_node_set_init_empty (&sctx->limits);
4374 }