posix/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2007 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                      int n) internal_function;
  23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24 static void match_ctx_free (re_match_context_t *cache) internal_function;
  25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  26                                           int str_idx, int from, int to)
  27      internal_function;
  28 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
  29      internal_function;
  30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  31                                            int str_idx) internal_function;
  32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  33                                                    int node, int str_idx)
  34      internal_function;
  35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  36                            re_dfastate_t **limited_sts, int last_node,
  37                            int last_str_idx)
  38      internal_function;
  39 static reg_errcode_t re_search_internal (const regex_t *preg,
  40                                          const char *string, int length,
  41                                          int start, int range, int stop,
  42                                          size_t nmatch, regmatch_t pmatch[],
  43                                          int eflags) internal_function;
  44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  45                              const char *string1, int length1,
  46                              const char *string2, int length2,
  47                              int start, int range, struct re_registers *regs,
  48                              int stop, int ret_len) internal_function;
  49 static int re_search_stub (struct re_pattern_buffer *bufp,
  50                            const char *string, int length, int start,
  51                            int range, int stop, struct re_registers *regs,
  52                            int ret_len) internal_function;
  53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  54                               int nregs, int regs_allocated) internal_function;
  55 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  56      internal_function;
  57 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  58                            int *p_match_first) internal_function;
  59 static int check_halt_state_context (const re_match_context_t *mctx,
  60                                      const re_dfastate_t *state, int idx)
  61      internal_function;
  62 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  63                          regmatch_t *prev_idx_match, int cur_node,
  64                          int cur_idx, int nmatch) internal_function;
  65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  66                                       int str_idx, int dest_node, int nregs,
  67                                       regmatch_t *regs,
  68                                       re_node_set *eps_via_nodes)
  69      internal_function;
  70 static reg_errcode_t set_regs (const regex_t *preg,
  71                                const re_match_context_t *mctx,
  72                                size_t nmatch, regmatch_t *pmatch,
  73                                int fl_backtrack) internal_function;
  74 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
  75      internal_function;
  76
  77 #ifdef RE_ENABLE_I18N
  78 static int sift_states_iter_mb (const re_match_context_t *mctx,
  79                                 re_sift_context_t *sctx,
  80                                 int node_idx, int str_idx, int max_str_idx)
  81      internal_function;
  82 #endif /* RE_ENABLE_I18N */
  83 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  84                                            re_sift_context_t *sctx)
  85      internal_function;
  86 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  87                                           re_sift_context_t *sctx, int str_idx,
  88                                           re_node_set *cur_dest)
  89      internal_function;
  90 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  91                                               re_sift_context_t *sctx,
  92                                               int str_idx,
  93                                               re_node_set *dest_nodes)
  94      internal_function;
  95 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  96                                             re_node_set *dest_nodes,
  97                                             const re_node_set *candidates)
  98      internal_function;
  99 static int check_dst_limits (const re_match_context_t *mctx,
 100                              re_node_set *limits,
 101                              int dst_node, int dst_idx, int src_node,
 102                              int src_idx) internal_function;
 103 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
 104                                         int boundaries, int subexp_idx,
 105                                         int from_node, int bkref_idx)
 106      internal_function;
 107 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
 108                                       int limit, int subexp_idx,
 109                                       int node, int str_idx,
 110                                       int bkref_idx) internal_function;
 111 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
 112                                           re_node_set *dest_nodes,
 113                                           const re_node_set *candidates,
 114                                           re_node_set *limits,
 115                                           struct re_backref_cache_entry *bkref_ents,
 116                                           int str_idx) internal_function;
 117 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 118                                         re_sift_context_t *sctx,
 119                                         int str_idx, const re_node_set *candidates)
 120      internal_function;
 121 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 122                                         re_dfastate_t **dst,
 123                                         re_dfastate_t **src, int num)
 124      internal_function;
 125 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 126                                          re_match_context_t *mctx) internal_function;
 127 static re_dfastate_t *transit_state (reg_errcode_t *err,
 128                                      re_match_context_t *mctx,
 129                                      re_dfastate_t *state) internal_function;
 130 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 131                                             re_match_context_t *mctx,
 132                                             re_dfastate_t *next_state)
 133      internal_function;
 134 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 135                                                 re_node_set *cur_nodes,
 136                                                 int str_idx) internal_function;
 137 #if 0
 138 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 139                                         re_match_context_t *mctx,
 140                                         re_dfastate_t *pstate)
 141      internal_function;
 142 #endif
 143 #ifdef RE_ENABLE_I18N
 144 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 145                                        re_dfastate_t *pstate)
 146      internal_function;
 147 #endif /* RE_ENABLE_I18N */
 148 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 149                                           const re_node_set *nodes)
 150      internal_function;
 151 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 152                                  int bkref_node, int bkref_str_idx)
 153      internal_function;
 154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 155                                      const re_sub_match_top_t *sub_top,
 156                                      re_sub_match_last_t *sub_last,
 157                                      int bkref_node, int bkref_str)
 158      internal_function;
 159 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 160                              int subexp_idx, int type) internal_function;
 161 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 162                                     state_array_t *path, int top_node,
 163                                     int top_str, int last_node, int last_str,
 164                                     int type) internal_function;
 165 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 166                                                    int str_idx,
 167                                                    re_node_set *cur_nodes,
 168                                                    re_node_set *next_nodes)
 169      internal_function;
 170 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 171                                                re_node_set *cur_nodes,
 172                                                int ex_subexp, int type)
 173      internal_function;
 174 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 175                                                    re_node_set *dst_nodes,
 176                                                    int target, int ex_subexp,
 177                                                    int type) internal_function;
 178 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 179                                          re_node_set *cur_nodes, int cur_str,
 180                                          int subexp_num, int type)
 181      internal_function;
 182 static int build_trtable (const re_dfa_t *dfa,
 183                           re_dfastate_t *state) internal_function;
 184 #ifdef RE_ENABLE_I18N
 185 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 186                                     const re_string_t *input, int idx)
 187      internal_function;
 188 # ifdef _LIBC
 189 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 190                                                    size_t name_len)
 191      internal_function;
 192 # endif /* _LIBC */
 193 #endif /* RE_ENABLE_I18N */
 194 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
 195                                        const re_dfastate_t *state,
 196                                        re_node_set *states_node,
 197                                        bitset_t *states_ch) internal_function;
 198 static int check_node_accept (const re_match_context_t *mctx,
 199                               const re_token_t *node, int idx)
 200      internal_function;
 201 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
 202      internal_function;
 203 \f
 204 /* Entry point for POSIX code.  */
 205
 206 /* regexec searches for a given pattern, specified by PREG, in the
 207    string STRING.
 208
 209    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 210    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 211    least NMATCH elements, and we set them to the offsets of the
 212    corresponding matched substrings.
 213
 214    EFLAGS specifies `execution flags' which affect matching: if
 215    REG_NOTBOL is set, then ^ does not match at the beginning of the
 216    string; if REG_NOTEOL is set, then $ does not match at the end.
 217
 218    We return 0 if we find a match and REG_NOMATCH if not.  */
 219
 220 int
 221 regexec (preg, string, nmatch, pmatch, eflags)
 222     const regex_t *__restrict preg;
 223     const char *__restrict string;
 224     size_t nmatch;
 225     regmatch_t pmatch[];
 226     int eflags;
 227 {
 228   reg_errcode_t err;
 229   int start, length;
 230   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 231
 232   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 233     return REG_BADPAT;
 234
 235   if (eflags & REG_STARTEND)
 236     {
 237       start = pmatch[0].rm_so;
 238       length = pmatch[0].rm_eo;
 239     }
 240   else
 241     {
 242       start = 0;
 243       length = strlen (string);
 244     }
 245
 246   __libc_lock_lock (dfa->lock);
 247   if (preg->no_sub)
 248     err = re_search_internal (preg, string, length, start, length - start,
 249                               length, 0, NULL, eflags);
 250   else
 251     err = re_search_internal (preg, string, length, start, length - start,
 252                               length, nmatch, pmatch, eflags);
 253   __libc_lock_unlock (dfa->lock);
 254   return err != REG_NOERROR;
 255 }
 256
 257 #ifdef _LIBC
 258 # include <shlib-compat.h>
 259 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 260
 261 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 262 __typeof__ (__regexec) __compat_regexec;
 263
 264 int
 265 attribute_compat_text_section
 266 __compat_regexec (const regex_t *__restrict preg,
 267                   const char *__restrict string, size_t nmatch,
 268                   regmatch_t pmatch[], int eflags)
 269 {
 270   return regexec (preg, string, nmatch, pmatch,
 271                   eflags & (REG_NOTBOL | REG_NOTEOL));
 272 }
 273 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 274 # endif
 275 #endif
 276
 277 /* Entry points for GNU code.  */
 278
 279 /* re_match, re_search, re_match_2, re_search_2
 280
 281    The former two functions operate on STRING with length LENGTH,
 282    while the later two operate on concatenation of STRING1 and STRING2
 283    with lengths LENGTH1 and LENGTH2, respectively.
 284
 285    re_match() matches the compiled pattern in BUFP against the string,
 286    starting at index START.
 287
 288    re_search() first tries matching at index START, then it tries to match
 289    starting from index START + 1, and so on.  The last start position tried
 290    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 291    way as re_match().)
 292
 293    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 294    the first STOP characters of the concatenation of the strings should be
 295    concerned.
 296
 297    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 298    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 299    computed relative to the concatenation, not relative to the individual
 300    strings.)
 301
 302    On success, re_match* functions return the length of the match, re_search*
 303    return the position of the start of the match.  Return value -1 means no
 304    match was found and -2 indicates an internal error.  */
 305
 306 int
 307 re_match (bufp, string, length, start, regs)
 308     struct re_pattern_buffer *bufp;
 309     const char *string;
 310     int length, start;
 311     struct re_registers *regs;
 312 {
 313   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 314 }
 315 #ifdef _LIBC
 316 weak_alias (__re_match, re_match)
 317 #endif
 318
 319 int
 320 re_search (bufp, string, length, start, range, regs)
 321     struct re_pattern_buffer *bufp;
 322     const char *string;
 323     int length, start, range;
 324     struct re_registers *regs;
 325 {
 326   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 327 }
 328 #ifdef _LIBC
 329 weak_alias (__re_search, re_search)
 330 #endif
 331
 332 int
 333 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 334     struct re_pattern_buffer *bufp;
 335     const char *string1, *string2;
 336     int length1, length2, start, stop;
 337     struct re_registers *regs;
 338 {
 339   return re_search_2_stub (bufp, string1, length1, string2, length2,
 340                            start, 0, regs, stop, 1);
 341 }
 342 #ifdef _LIBC
 343 weak_alias (__re_match_2, re_match_2)
 344 #endif
 345
 346 int
 347 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 348     struct re_pattern_buffer *bufp;
 349     const char *string1, *string2;
 350     int length1, length2, start, range, stop;
 351     struct re_registers *regs;
 352 {
 353   return re_search_2_stub (bufp, string1, length1, string2, length2,
 354                            start, range, regs, stop, 0);
 355 }
 356 #ifdef _LIBC
 357 weak_alias (__re_search_2, re_search_2)
 358 #endif
 359
 360 static int
 361 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 362                   stop, ret_len)
 363     struct re_pattern_buffer *bufp;
 364     const char *string1, *string2;
 365     int length1, length2, start, range, stop, ret_len;
 366     struct re_registers *regs;
 367 {
 368   const char *str;
 369   int rval;
 370   int len = length1 + length2;
 371   int free_str = 0;
 372
 373   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 374     return -2;
 375
 376   /* Concatenate the strings.  */
 377   if (length2 > 0)
 378     if (length1 > 0)
 379       {
 380         char *s = re_malloc (char, len);
 381
 382         if (BE (s == NULL, 0))
 383           return -2;
 384 #ifdef _LIBC
 385         memcpy (__mempcpy (s, string1, length1), string2, length2);
 386 #else
 387         memcpy (s, string1, length1);
 388         memcpy (s + length1, string2, length2);
 389 #endif
 390         str = s;
 391         free_str = 1;
 392       }
 393     else
 394       str = string2;
 395   else
 396     str = string1;
 397
 398   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 399                          ret_len);
 400   if (free_str)
 401     re_free ((char *) str);
 402   return rval;
 403 }
 404
 405 /* The parameters have the same meaning as those of re_search.
 406    Additional parameters:
 407    If RET_LEN is nonzero the length of the match is returned (re_match style);
 408    otherwise the position of the match is returned.  */
 409
 410 static int
 411 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 412     struct re_pattern_buffer *bufp;
 413     const char *string;
 414     int length, start, range, stop, ret_len;
 415     struct re_registers *regs;
 416 {
 417   reg_errcode_t result;
 418   regmatch_t *pmatch;
 419   int nregs, rval;
 420   int eflags = 0;
 421   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 422
 423   /* Check for out-of-range.  */
 424   if (BE (start < 0 || start > length, 0))
 425     return -1;
 426   if (BE (start + range > length, 0))
 427     range = length - start;
 428   else if (BE (start + range < 0, 0))
 429     range = -start;
 430
 431   __libc_lock_lock (dfa->lock);
 432
 433   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 434   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 435
 436   /* Compile fastmap if we haven't yet.  */
 437   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 438     re_compile_fastmap (bufp);
 439
 440   if (BE (bufp->no_sub, 0))
 441     regs = NULL;
 442
 443   /* We need at least 1 register.  */
 444   if (regs == NULL)
 445     nregs = 1;
 446   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 447                regs->num_regs < bufp->re_nsub + 1, 0))
 448     {
 449       nregs = regs->num_regs;
 450       if (BE (nregs < 1, 0))
 451         {
 452           /* Nothing can be copied to regs.  */
 453           regs = NULL;
 454           nregs = 1;
 455         }
 456     }
 457   else
 458     nregs = bufp->re_nsub + 1;
 459   pmatch = re_malloc (regmatch_t, nregs);
 460   if (BE (pmatch == NULL, 0))
 461     {
 462       rval = -2;
 463       goto out;
 464     }
 465
 466   result = re_search_internal (bufp, string, length, start, range, stop,
 467                                nregs, pmatch, eflags);
 468
 469   rval = 0;
 470
 471   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 472   if (result != REG_NOERROR)
 473     rval = -1;
 474   else if (regs != NULL)
 475     {
 476       /* If caller wants register contents data back, copy them.  */
 477       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 478                                            bufp->regs_allocated);
 479       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 480         rval = -2;
 481     }
 482
 483   if (BE (rval == 0, 1))
 484     {
 485       if (ret_len)
 486         {
 487           assert (pmatch[0].rm_so == start);
 488           rval = pmatch[0].rm_eo - start;
 489         }
 490       else
 491         rval = pmatch[0].rm_so;
 492     }
 493   re_free (pmatch);
 494  out:
 495   __libc_lock_unlock (dfa->lock);
 496   return rval;
 497 }
 498
 499 static unsigned
 500 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 501     struct re_registers *regs;
 502     regmatch_t *pmatch;
 503     int nregs, regs_allocated;
 504 {
 505   int rval = REGS_REALLOCATE;
 506   int i;
 507   int need_regs = nregs + 1;
 508   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 509      uses.  */
 510
 511   /* Have the register data arrays been allocated?  */
 512   if (regs_allocated == REGS_UNALLOCATED)
 513     { /* No.  So allocate them with malloc.  */
 514       regs->start = re_malloc (regoff_t, need_regs);
 515       regs->end = re_malloc (regoff_t, need_regs);
 516       if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
 517         return REGS_UNALLOCATED;
 518       regs->num_regs = need_regs;
 519     }
 520   else if (regs_allocated == REGS_REALLOCATE)
 521     { /* Yes.  If we need more elements than were already
 522          allocated, reallocate them.  If we need fewer, just
 523          leave it alone.  */
 524       if (BE (need_regs > regs->num_regs, 0))
 525         {
 526           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 527           regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
 528           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 529             return REGS_UNALLOCATED;
 530           regs->start = new_start;
 531           regs->end = new_end;
 532           regs->num_regs = need_regs;
 533         }
 534     }
 535   else
 536     {
 537       assert (regs_allocated == REGS_FIXED);
 538       /* This function may not be called with REGS_FIXED and nregs too big.  */
 539       assert (regs->num_regs >= nregs);
 540       rval = REGS_FIXED;
 541     }
 542
 543   /* Copy the regs.  */
 544   for (i = 0; i < nregs; ++i)
 545     {
 546       regs->start[i] = pmatch[i].rm_so;
 547       regs->end[i] = pmatch[i].rm_eo;
 548     }
 549   for ( ; i < regs->num_regs; ++i)
 550     regs->start[i] = regs->end[i] = -1;
 551
 552   return rval;
 553 }
 554
 555 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 556    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 557    this memory for recording register information.  STARTS and ENDS
 558    must be allocated using the malloc library routine, and must each
 559    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 560
 561    If NUM_REGS == 0, then subsequent matches should allocate their own
 562    register data.
 563
 564    Unless this function is called, the first search or match using
 565    PATTERN_BUFFER will allocate its own register data, without
 566    freeing the old data.  */
 567
 568 void
 569 re_set_registers (bufp, regs, num_regs, starts, ends)
 570     struct re_pattern_buffer *bufp;
 571     struct re_registers *regs;
 572     unsigned num_regs;
 573     regoff_t *starts, *ends;
 574 {
 575   if (num_regs)
 576     {
 577       bufp->regs_allocated = REGS_REALLOCATE;
 578       regs->num_regs = num_regs;
 579       regs->start = starts;
 580       regs->end = ends;
 581     }
 582   else
 583     {
 584       bufp->regs_allocated = REGS_UNALLOCATED;
 585       regs->num_regs = 0;
 586       regs->start = regs->end = (regoff_t *) 0;
 587     }
 588 }
 589 #ifdef _LIBC
 590 weak_alias (__re_set_registers, re_set_registers)
 591 #endif
 592 \f
 593 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 594    them unless specifically requested.  */
 595
 596 #if defined _REGEX_RE_COMP || defined _LIBC
 597 int
 598 # ifdef _LIBC
 599 weak_function
 600 # endif
 601 re_exec (s)
 602      const char *s;
 603 {
 604   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 605 }
 606 #endif /* _REGEX_RE_COMP */
 607 \f
 608 /* Internal entry point.  */
 609
 610 /* Searches for a compiled pattern PREG in the string STRING, whose
 611    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 612    mingings with regexec.  START, and RANGE have the same meanings
 613    with re_search.
 614    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 615    otherwise return the error code.
 616    Note: We assume front end functions already check ranges.
 617    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 618
 619 static reg_errcode_t
 620 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 621                     eflags)
 622     const regex_t *preg;
 623     const char *string;
 624     int length, start, range, stop, eflags;
 625     size_t nmatch;
 626     regmatch_t pmatch[];
 627 {
 628   reg_errcode_t err;
 629   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
 630   int left_lim, right_lim, incr;
 631   int fl_longest_match, match_first, match_kind, match_last = -1;
 632   int extra_nmatch;
 633   int sb, ch;
 634 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 635   re_match_context_t mctx = { .dfa = dfa };
 636 #else
 637   re_match_context_t mctx;
 638 #endif
 639   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 640                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 641   RE_TRANSLATE_TYPE t = preg->translate;
 642
 643 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 644   memset (&mctx, '\0', sizeof (re_match_context_t));
 645   mctx.dfa = dfa;
 646 #endif
 647
 648   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 649   nmatch -= extra_nmatch;
 650
 651   /* Check if the DFA haven't been compiled.  */
 652   if (BE (preg->used == 0 || dfa->init_state == NULL
 653           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 654           || dfa->init_state_begbuf == NULL, 0))
 655     return REG_NOMATCH;
 656
 657 #ifdef DEBUG
 658   /* We assume front-end functions already check them.  */
 659   assert (start + range >= 0 && start + range <= length);
 660 #endif
 661
 662   /* If initial states with non-begbuf contexts have no elements,
 663      the regex must be anchored.  If preg->newline_anchor is set,
 664      we'll never use init_state_nl, so do not check it.  */
 665   if (dfa->init_state->nodes.nelem == 0
 666       && dfa->init_state_word->nodes.nelem == 0
 667       && (dfa->init_state_nl->nodes.nelem == 0
 668           || !preg->newline_anchor))
 669     {
 670       if (start != 0 && start + range != 0)
 671         return REG_NOMATCH;
 672       start = range = 0;
 673     }
 674
 675   /* We must check the longest matching, if nmatch > 0.  */
 676   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 677
 678   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 679                             preg->translate, preg->syntax & RE_ICASE, dfa);
 680   if (BE (err != REG_NOERROR, 0))
 681     goto free_return;
 682   mctx.input.stop = stop;
 683   mctx.input.raw_stop = stop;
 684   mctx.input.newline_anchor = preg->newline_anchor;
 685
 686   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 687   if (BE (err != REG_NOERROR, 0))
 688     goto free_return;
 689
 690   /* We will log all the DFA states through which the dfa pass,
 691      if nmatch > 1, or this dfa has "multibyte node", which is a
 692      back-reference or a node which can accept multibyte character or
 693      multi character collating element.  */
 694   if (nmatch > 1 || dfa->has_mb_node)
 695     {
 696       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 697       if (BE (mctx.state_log == NULL, 0))
 698         {
 699           err = REG_ESPACE;
 700           goto free_return;
 701         }
 702     }
 703   else
 704     mctx.state_log = NULL;
 705
 706   match_first = start;
 707   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 708                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 709
 710   /* Check incrementally whether of not the input string match.  */
 711   incr = (range < 0) ? -1 : 1;
 712   left_lim = (range < 0) ? start + range : start;
 713   right_lim = (range < 0) ? start : start + range;
 714   sb = dfa->mb_cur_max == 1;
 715   match_kind =
 716     (fastmap
 717      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 718         | (range >= 0 ? 2 : 0)
 719         | (t != NULL ? 1 : 0))
 720      : 8);
 721
 722   for (;; match_first += incr)
 723     {
 724       err = REG_NOMATCH;
 725       if (match_first < left_lim || right_lim < match_first)
 726         goto free_return;
 727
 728       /* Advance as rapidly as possible through the string, until we
 729          find a plausible place to start matching.  This may be done
 730          with varying efficiency, so there are various possibilities:
 731          only the most common of them are specialized, in order to
 732          save on code size.  We use a switch statement for speed.  */
 733       switch (match_kind)
 734         {
 735         case 8:
 736           /* No fastmap.  */
 737           break;
 738
 739         case 7:
 740           /* Fastmap with single-byte translation, match forward.  */
 741           while (BE (match_first < right_lim, 1)
 742                  && !fastmap[t[(unsigned char) string[match_first]]])
 743             ++match_first;
 744           goto forward_match_found_start_or_reached_end;
 745
 746         case 6:
 747           /* Fastmap without translation, match forward.  */
 748           while (BE (match_first < right_lim, 1)
 749                  && !fastmap[(unsigned char) string[match_first]])
 750             ++match_first;
 751
 752         forward_match_found_start_or_reached_end:
 753           if (BE (match_first == right_lim, 0))
 754             {
 755               ch = match_first >= length
 756                        ? 0 : (unsigned char) string[match_first];
 757               if (!fastmap[t ? t[ch] : ch])
 758                 goto free_return;
 759             }
 760           break;
 761
 762         case 4:
 763         case 5:
 764           /* Fastmap without multi-byte translation, match backwards.  */
 765           while (match_first >= left_lim)
 766             {
 767               ch = match_first >= length
 768                        ? 0 : (unsigned char) string[match_first];
 769               if (fastmap[t ? t[ch] : ch])
 770                 break;
 771               --match_first;
 772             }
 773           if (match_first < left_lim)
 774             goto free_return;
 775           break;
 776
 777         default:
 778           /* In this case, we can't determine easily the current byte,
 779              since it might be a component byte of a multibyte
 780              character.  Then we use the constructed buffer instead.  */
 781           for (;;)
 782             {
 783               /* If MATCH_FIRST is out of the valid range, reconstruct the
 784                  buffers.  */
 785               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 786               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 787                 {
 788                   err = re_string_reconstruct (&mctx.input, match_first,
 789                                                eflags);
 790                   if (BE (err != REG_NOERROR, 0))
 791                     goto free_return;
 792
 793                   offset = match_first - mctx.input.raw_mbs_idx;
 794                 }
 795               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 796                  Note that MATCH_FIRST must not be smaller than 0.  */
 797               ch = (match_first >= length
 798                     ? 0 : re_string_byte_at (&mctx.input, offset));
 799               if (fastmap[ch])
 800                 break;
 801               match_first += incr;
 802               if (match_first < left_lim || match_first > right_lim)
 803                 {
 804                   err = REG_NOMATCH;
 805                   goto free_return;
 806                 }
 807             }
 808           break;
 809         }
 810
 811       /* Reconstruct the buffers so that the matcher can assume that
 812          the matching starts from the beginning of the buffer.  */
 813       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 814       if (BE (err != REG_NOERROR, 0))
 815         goto free_return;
 816
 817 #ifdef RE_ENABLE_I18N
 818      /* Don't consider this char as a possible match start if it part,
 819         yet isn't the head, of a multibyte character.  */
 820       if (!sb && !re_string_first_byte (&mctx.input, 0))
 821         continue;
 822 #endif
 823
 824       /* It seems to be appropriate one, then use the matcher.  */
 825       /* We assume that the matching starts from 0.  */
 826       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 827       match_last = check_matching (&mctx, fl_longest_match,
 828                                    range >= 0 ? &match_first : NULL);
 829       if (match_last != -1)
 830         {
 831           if (BE (match_last == -2, 0))
 832             {
 833               err = REG_ESPACE;
 834               goto free_return;
 835             }
 836           else
 837             {
 838               mctx.match_last = match_last;
 839               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 840                 {
 841                   re_dfastate_t *pstate = mctx.state_log[match_last];
 842                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 843                                                              match_last);
 844                 }
 845               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 846                   || dfa->nbackref)
 847                 {
 848                   err = prune_impossible_nodes (&mctx);
 849                   if (err == REG_NOERROR)
 850                     break;
 851                   if (BE (err != REG_NOMATCH, 0))
 852                     goto free_return;
 853                   match_last = -1;
 854                 }
 855               else
 856                 break; /* We found a match.  */
 857             }
 858         }
 859
 860       match_ctx_clean (&mctx);
 861     }
 862
 863 #ifdef DEBUG
 864   assert (match_last != -1);
 865   assert (err == REG_NOERROR);
 866 #endif
 867
 868   /* Set pmatch[] if we need.  */
 869   if (nmatch > 0)
 870     {
 871       int reg_idx;
 872
 873       /* Initialize registers.  */
 874       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 875         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 876
 877       /* Set the points where matching start/end.  */
 878       pmatch[0].rm_so = 0;
 879       pmatch[0].rm_eo = mctx.match_last;
 880
 881       if (!preg->no_sub && nmatch > 1)
 882         {
 883           err = set_regs (preg, &mctx, nmatch, pmatch,
 884                           dfa->has_plural_match && dfa->nbackref > 0);
 885           if (BE (err != REG_NOERROR, 0))
 886             goto free_return;
 887         }
 888
 889       /* At last, add the offset to the each registers, since we slided
 890          the buffers so that we could assume that the matching starts
 891          from 0.  */
 892       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 893         if (pmatch[reg_idx].rm_so != -1)
 894           {
 895 #ifdef RE_ENABLE_I18N
 896             if (BE (mctx.input.offsets_needed != 0, 0))
 897               {
 898                 pmatch[reg_idx].rm_so =
 899                   (pmatch[reg_idx].rm_so == mctx.input.valid_len
 900                    ? mctx.input.valid_raw_len
 901                    : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 902                 pmatch[reg_idx].rm_eo =
 903                   (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 904                    ? mctx.input.valid_raw_len
 905                    : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 906               }
 907 #else
 908             assert (mctx.input.offsets_needed == 0);
 909 #endif
 910             pmatch[reg_idx].rm_so += match_first;
 911             pmatch[reg_idx].rm_eo += match_first;
 912           }
 913       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 914         {
 915           pmatch[nmatch + reg_idx].rm_so = -1;
 916           pmatch[nmatch + reg_idx].rm_eo = -1;
 917         }
 918
 919       if (dfa->subexp_map)
 920         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 921           if (dfa->subexp_map[reg_idx] != reg_idx)
 922             {
 923               pmatch[reg_idx + 1].rm_so
 924                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 925               pmatch[reg_idx + 1].rm_eo
 926                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 927             }
 928     }
 929
 930  free_return:
 931   re_free (mctx.state_log);
 932   if (dfa->nbackref)
 933     match_ctx_free (&mctx);
 934   re_string_destruct (&mctx.input);
 935   return err;
 936 }
 937
 938 static reg_errcode_t
 939 prune_impossible_nodes (mctx)
 940      re_match_context_t *mctx;
 941 {
 942   const re_dfa_t *const dfa = mctx->dfa;
 943   int halt_node, match_last;
 944   reg_errcode_t ret;
 945   re_dfastate_t **sifted_states;
 946   re_dfastate_t **lim_states = NULL;
 947   re_sift_context_t sctx;
 948 #ifdef DEBUG
 949   assert (mctx->state_log != NULL);
 950 #endif
 951   match_last = mctx->match_last;
 952   halt_node = mctx->last_node;
 953   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 954   if (BE (sifted_states == NULL, 0))
 955     {
 956       ret = REG_ESPACE;
 957       goto free_return;
 958     }
 959   if (dfa->nbackref)
 960     {
 961       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 962       if (BE (lim_states == NULL, 0))
 963         {
 964           ret = REG_ESPACE;
 965           goto free_return;
 966         }
 967       while (1)
 968         {
 969           memset (lim_states, '\0',
 970                   sizeof (re_dfastate_t *) * (match_last + 1));
 971           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 972                          match_last);
 973           ret = sift_states_backward (mctx, &sctx);
 974           re_node_set_free (&sctx.limits);
 975           if (BE (ret != REG_NOERROR, 0))
 976               goto free_return;
 977           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 978             break;
 979           do
 980             {
 981               --match_last;
 982               if (match_last < 0)
 983                 {
 984                   ret = REG_NOMATCH;
 985                   goto free_return;
 986                 }
 987             } while (mctx->state_log[match_last] == NULL
 988                      || !mctx->state_log[match_last]->halt);
 989           halt_node = check_halt_state_context (mctx,
 990                                                 mctx->state_log[match_last],
 991                                                 match_last);
 992         }
 993       ret = merge_state_array (dfa, sifted_states, lim_states,
 994                                match_last + 1);
 995       re_free (lim_states);
 996       lim_states = NULL;
 997       if (BE (ret != REG_NOERROR, 0))
 998         goto free_return;
 999     }
1000   else
1001     {
1002       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1003       ret = sift_states_backward (mctx, &sctx);
1004       re_node_set_free (&sctx.limits);
1005       if (BE (ret != REG_NOERROR, 0))
1006         goto free_return;
1007     }
1008   re_free (mctx->state_log);
1009   mctx->state_log = sifted_states;
1010   sifted_states = NULL;
1011   mctx->last_node = halt_node;
1012   mctx->match_last = match_last;
1013   ret = REG_NOERROR;
1014  free_return:
1015   re_free (sifted_states);
1016   re_free (lim_states);
1017   return ret;
1018 }
1019
1020 /* Acquire an initial state and return it.
1021    We must select appropriate initial state depending on the context,
1022    since initial states may have constraints like "\<", "^", etc..  */
1023
1024 static inline re_dfastate_t *
1025 __attribute ((always_inline)) internal_function
1026 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1027                             int idx)
1028 {
1029   const re_dfa_t *const dfa = mctx->dfa;
1030   if (dfa->init_state->has_constraint)
1031     {
1032       unsigned int context;
1033       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1034       if (IS_WORD_CONTEXT (context))
1035         return dfa->init_state_word;
1036       else if (IS_ORDINARY_CONTEXT (context))
1037         return dfa->init_state;
1038       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1039         return dfa->init_state_begbuf;
1040       else if (IS_NEWLINE_CONTEXT (context))
1041         return dfa->init_state_nl;
1042       else if (IS_BEGBUF_CONTEXT (context))
1043         {
1044           /* It is relatively rare case, then calculate on demand.  */
1045           return re_acquire_state_context (err, dfa,
1046                                            dfa->init_state->entrance_nodes,
1047                                            context);
1048         }
1049       else
1050         /* Must not happen?  */
1051         return dfa->init_state;
1052     }
1053   else
1054     return dfa->init_state;
1055 }
1056
1057 /* Check whether the regular expression match input string INPUT or not,
1058    and return the index where the matching end, return -1 if not match,
1059    or return -2 in case of an error.
1060    FL_LONGEST_MATCH means we want the POSIX longest matching.
1061    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1062    next place where we may want to try matching.
1063    Note that the matcher assume that the maching starts from the current
1064    index of the buffer.  */
1065
1066 static int
1067 internal_function
1068 check_matching (re_match_context_t *mctx, int fl_longest_match,
1069                 int *p_match_first)
1070 {
1071   const re_dfa_t *const dfa = mctx->dfa;
1072   reg_errcode_t err;
1073   int match = 0;
1074   int match_last = -1;
1075   int cur_str_idx = re_string_cur_idx (&mctx->input);
1076   re_dfastate_t *cur_state;
1077   int at_init_state = p_match_first != NULL;
1078   int next_start_idx = cur_str_idx;
1079
1080   err = REG_NOERROR;
1081   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1082   /* An initial state must not be NULL (invalid).  */
1083   if (BE (cur_state == NULL, 0))
1084     {
1085       assert (err == REG_ESPACE);
1086       return -2;
1087     }
1088
1089   if (mctx->state_log != NULL)
1090     {
1091       mctx->state_log[cur_str_idx] = cur_state;
1092
1093       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1094          later.  E.g. Processing back references.  */
1095       if (BE (dfa->nbackref, 0))
1096         {
1097           at_init_state = 0;
1098           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1099           if (BE (err != REG_NOERROR, 0))
1100             return err;
1101
1102           if (cur_state->has_backref)
1103             {
1104               err = transit_state_bkref (mctx, &cur_state->nodes);
1105               if (BE (err != REG_NOERROR, 0))
1106                 return err;
1107             }
1108         }
1109     }
1110
1111   /* If the RE accepts NULL string.  */
1112   if (BE (cur_state->halt, 0))
1113     {
1114       if (!cur_state->has_constraint
1115           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1116         {
1117           if (!fl_longest_match)
1118             return cur_str_idx;
1119           else
1120             {
1121               match_last = cur_str_idx;
1122               match = 1;
1123             }
1124         }
1125     }
1126
1127   while (!re_string_eoi (&mctx->input))
1128     {
1129       re_dfastate_t *old_state = cur_state;
1130       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1131
1132       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1133           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1134               && mctx->input.valid_len < mctx->input.len))
1135         {
1136           err = extend_buffers (mctx);
1137           if (BE (err != REG_NOERROR, 0))
1138             {
1139               assert (err == REG_ESPACE);
1140               return -2;
1141             }
1142         }
1143
1144       cur_state = transit_state (&err, mctx, cur_state);
1145       if (mctx->state_log != NULL)
1146         cur_state = merge_state_with_log (&err, mctx, cur_state);
1147
1148       if (cur_state == NULL)
1149         {
1150           /* Reached the invalid state or an error.  Try to recover a valid
1151              state using the state log, if available and if we have not
1152              already found a valid (even if not the longest) match.  */
1153           if (BE (err != REG_NOERROR, 0))
1154             return -2;
1155
1156           if (mctx->state_log == NULL
1157               || (match && !fl_longest_match)
1158               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1159             break;
1160         }
1161
1162       if (BE (at_init_state, 0))
1163         {
1164           if (old_state == cur_state)
1165             next_start_idx = next_char_idx;
1166           else
1167             at_init_state = 0;
1168         }
1169
1170       if (cur_state->halt)
1171         {
1172           /* Reached a halt state.
1173              Check the halt state can satisfy the current context.  */
1174           if (!cur_state->has_constraint
1175               || check_halt_state_context (mctx, cur_state,
1176                                            re_string_cur_idx (&mctx->input)))
1177             {
1178               /* We found an appropriate halt state.  */
1179               match_last = re_string_cur_idx (&mctx->input);
1180               match = 1;
1181
1182               /* We found a match, do not modify match_first below.  */
1183               p_match_first = NULL;
1184               if (!fl_longest_match)
1185                 break;
1186             }
1187         }
1188     }
1189
1190   if (p_match_first)
1191     *p_match_first += next_start_idx;
1192
1193   return match_last;
1194 }
1195
1196 /* Check NODE match the current context.  */
1197
1198 static int
1199 internal_function
1200 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1201 {
1202   re_token_type_t type = dfa->nodes[node].type;
1203   unsigned int constraint = dfa->nodes[node].constraint;
1204   if (type != END_OF_RE)
1205     return 0;
1206   if (!constraint)
1207     return 1;
1208   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1209     return 0;
1210   return 1;
1211 }
1212
1213 /* Check the halt state STATE match the current context.
1214    Return 0 if not match, if the node, STATE has, is a halt node and
1215    match the context, return the node.  */
1216
1217 static int
1218 internal_function
1219 check_halt_state_context (const re_match_context_t *mctx,
1220                           const re_dfastate_t *state, int idx)
1221 {
1222   int i;
1223   unsigned int context;
1224 #ifdef DEBUG
1225   assert (state->halt);
1226 #endif
1227   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1228   for (i = 0; i < state->nodes.nelem; ++i)
1229     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1230       return state->nodes.elems[i];
1231   return 0;
1232 }
1233
1234 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1235    corresponding to the DFA).
1236    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1237    of errors.  */
1238
1239 static int
1240 internal_function
1241 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1242                    int *pidx, int node, re_node_set *eps_via_nodes,
1243                    struct re_fail_stack_t *fs)
1244 {
1245   const re_dfa_t *const dfa = mctx->dfa;
1246   int i, err;
1247   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1248     {
1249       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1250       re_node_set *edests = &dfa->edests[node];
1251       int dest_node;
1252       err = re_node_set_insert (eps_via_nodes, node);
1253       if (BE (err < 0, 0))
1254         return -2;
1255       /* Pick up a valid destination, or return -1 if none is found.  */
1256       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1257         {
1258           int candidate = edests->elems[i];
1259           if (!re_node_set_contains (cur_nodes, candidate))
1260             continue;
1261           if (dest_node == -1)
1262             dest_node = candidate;
1263
1264           else
1265             {
1266               /* In order to avoid infinite loop like "(a*)*", return the second
1267                  epsilon-transition if the first was already considered.  */
1268               if (re_node_set_contains (eps_via_nodes, dest_node))
1269                 return candidate;
1270
1271               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1272               else if (fs != NULL
1273                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1274                                            eps_via_nodes))
1275                 return -2;
1276
1277               /* We know we are going to exit.  */
1278               break;
1279             }
1280         }
1281       return dest_node;
1282     }
1283   else
1284     {
1285       int naccepted = 0;
1286       re_token_type_t type = dfa->nodes[node].type;
1287
1288 #ifdef RE_ENABLE_I18N
1289       if (dfa->nodes[node].accept_mb)
1290         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1291       else
1292 #endif /* RE_ENABLE_I18N */
1293       if (type == OP_BACK_REF)
1294         {
1295           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1296           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1297           if (fs != NULL)
1298             {
1299               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1300                 return -1;
1301               else if (naccepted)
1302                 {
1303                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1304                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1305                               naccepted) != 0)
1306                     return -1;
1307                 }
1308             }
1309
1310           if (naccepted == 0)
1311             {
1312               int dest_node;
1313               err = re_node_set_insert (eps_via_nodes, node);
1314               if (BE (err < 0, 0))
1315                 return -2;
1316               dest_node = dfa->edests[node].elems[0];
1317               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1318                                         dest_node))
1319                 return dest_node;
1320             }
1321         }
1322
1323       if (naccepted != 0
1324           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1325         {
1326           int dest_node = dfa->nexts[node];
1327           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1328           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1329                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1330                                                dest_node)))
1331             return -1;
1332           re_node_set_empty (eps_via_nodes);
1333           return dest_node;
1334         }
1335     }
1336   return -1;
1337 }
1338
1339 static reg_errcode_t
1340 internal_function
1341 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1342                  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1343 {
1344   reg_errcode_t err;
1345   int num = fs->num++;
1346   if (fs->num == fs->alloc)
1347     {
1348       struct re_fail_stack_ent_t *new_array;
1349       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1350                                        * fs->alloc * 2));
1351       if (new_array == NULL)
1352         return REG_ESPACE;
1353       fs->alloc *= 2;
1354       fs->stack = new_array;
1355     }
1356   fs->stack[num].idx = str_idx;
1357   fs->stack[num].node = dest_node;
1358   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1359   if (fs->stack[num].regs == NULL)
1360     return REG_ESPACE;
1361   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1362   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1363   return err;
1364 }
1365
1366 static int
1367 internal_function
1368 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1369                 regmatch_t *regs, re_node_set *eps_via_nodes)
1370 {
1371   int num = --fs->num;
1372   assert (num >= 0);
1373   *pidx = fs->stack[num].idx;
1374   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1375   re_node_set_free (eps_via_nodes);
1376   re_free (fs->stack[num].regs);
1377   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1378   return fs->stack[num].node;
1379 }
1380
1381 /* Set the positions where the subexpressions are starts/ends to registers
1382    PMATCH.
1383    Note: We assume that pmatch[0] is already set, and
1384    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1385
1386 static reg_errcode_t
1387 internal_function
1388 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1389           regmatch_t *pmatch, int fl_backtrack)
1390 {
1391   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1392   int idx, cur_node;
1393   re_node_set eps_via_nodes;
1394   struct re_fail_stack_t *fs;
1395   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1396   regmatch_t *prev_idx_match;
1397   int prev_idx_match_malloced = 0;
1398
1399 #ifdef DEBUG
1400   assert (nmatch > 1);
1401   assert (mctx->state_log != NULL);
1402 #endif
1403   if (fl_backtrack)
1404     {
1405       fs = &fs_body;
1406       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1407       if (fs->stack == NULL)
1408         return REG_ESPACE;
1409     }
1410   else
1411     fs = NULL;
1412
1413   cur_node = dfa->init_node;
1414   re_node_set_init_empty (&eps_via_nodes);
1415
1416   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1417     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1418   else
1419     {
1420       prev_idx_match = re_malloc (regmatch_t, nmatch);
1421       if (prev_idx_match == NULL)
1422         {
1423           free_fail_stack_return (fs);
1424           return REG_ESPACE;
1425         }
1426       prev_idx_match_malloced = 1;
1427     }
1428   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1429
1430   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1431     {
1432       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1433
1434       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1435         {
1436           int reg_idx;
1437           if (fs)
1438             {
1439               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1440                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1441                   break;
1442               if (reg_idx == nmatch)
1443                 {
1444                   re_node_set_free (&eps_via_nodes);
1445                   if (prev_idx_match_malloced)
1446                     re_free (prev_idx_match);
1447                   return free_fail_stack_return (fs);
1448                 }
1449               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1450                                          &eps_via_nodes);
1451             }
1452           else
1453             {
1454               re_node_set_free (&eps_via_nodes);
1455               if (prev_idx_match_malloced)
1456                 re_free (prev_idx_match);
1457               return REG_NOERROR;
1458             }
1459         }
1460
1461       /* Proceed to next node.  */
1462       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1463                                     &eps_via_nodes, fs);
1464
1465       if (BE (cur_node < 0, 0))
1466         {
1467           if (BE (cur_node == -2, 0))
1468             {
1469               re_node_set_free (&eps_via_nodes);
1470               if (prev_idx_match_malloced)
1471                 re_free (prev_idx_match);
1472               free_fail_stack_return (fs);
1473               return REG_ESPACE;
1474             }
1475           if (fs)
1476             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1477                                        &eps_via_nodes);
1478           else
1479             {
1480               re_node_set_free (&eps_via_nodes);
1481               if (prev_idx_match_malloced)
1482                 re_free (prev_idx_match);
1483               return REG_NOMATCH;
1484             }
1485         }
1486     }
1487   re_node_set_free (&eps_via_nodes);
1488   if (prev_idx_match_malloced)
1489     re_free (prev_idx_match);
1490   return free_fail_stack_return (fs);
1491 }
1492
1493 static reg_errcode_t
1494 internal_function
1495 free_fail_stack_return (struct re_fail_stack_t *fs)
1496 {
1497   if (fs)
1498     {
1499       int fs_idx;
1500       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1501         {
1502           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1503           re_free (fs->stack[fs_idx].regs);
1504         }
1505       re_free (fs->stack);
1506     }
1507   return REG_NOERROR;
1508 }
1509
1510 static void
1511 internal_function
1512 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1513              regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1514 {
1515   int type = dfa->nodes[cur_node].type;
1516   if (type == OP_OPEN_SUBEXP)
1517     {
1518       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1519
1520       /* We are at the first node of this sub expression.  */
1521       if (reg_num < nmatch)
1522         {
1523           pmatch[reg_num].rm_so = cur_idx;
1524           pmatch[reg_num].rm_eo = -1;
1525         }
1526     }
1527   else if (type == OP_CLOSE_SUBEXP)
1528     {
1529       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1530       if (reg_num < nmatch)
1531         {
1532           /* We are at the last node of this sub expression.  */
1533           if (pmatch[reg_num].rm_so < cur_idx)
1534             {
1535               pmatch[reg_num].rm_eo = cur_idx;
1536               /* This is a non-empty match or we are not inside an optional
1537                  subexpression.  Accept this right away.  */
1538               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1539             }
1540           else
1541             {
1542               if (dfa->nodes[cur_node].opt_subexp
1543                   && prev_idx_match[reg_num].rm_so != -1)
1544                 /* We transited through an empty match for an optional
1545                    subexpression, like (a?)*, and this is not the subexp's
1546                    first match.  Copy back the old content of the registers
1547                    so that matches of an inner subexpression are undone as
1548                    well, like in ((a?))*.  */
1549                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1550               else
1551                 /* We completed a subexpression, but it may be part of
1552                    an optional one, so do not update PREV_IDX_MATCH.  */
1553                 pmatch[reg_num].rm_eo = cur_idx;
1554             }
1555         }
1556     }
1557 }
1558
1559 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1560    and sift the nodes in each states according to the following rules.
1561    Updated state_log will be wrote to STATE_LOG.
1562
1563    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1564      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1565         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1566         the LAST_NODE, we throw away the node `a'.
1567      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1568         string `s' and transit to `b':
1569         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1570            away the node `a'.
1571         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1572             thrown away, we throw away the node `a'.
1573      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1574         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1575            node `a'.
1576         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1577             we throw away the node `a'.  */
1578
1579 #define STATE_NODE_CONTAINS(state,node) \
1580   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1581
1582 static reg_errcode_t
1583 internal_function
1584 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1585 {
1586   reg_errcode_t err;
1587   int null_cnt = 0;
1588   int str_idx = sctx->last_str_idx;
1589   re_node_set cur_dest;
1590
1591 #ifdef DEBUG
1592   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1593 #endif
1594
1595   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1596      transit to the last_node and the last_node itself.  */
1597   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1598   if (BE (err != REG_NOERROR, 0))
1599     return err;
1600   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1601   if (BE (err != REG_NOERROR, 0))
1602     goto free_return;
1603
1604   /* Then check each states in the state_log.  */
1605   while (str_idx > 0)
1606     {
1607       /* Update counters.  */
1608       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1609       if (null_cnt > mctx->max_mb_elem_len)
1610         {
1611           memset (sctx->sifted_states, '\0',
1612                   sizeof (re_dfastate_t *) * str_idx);
1613           re_node_set_free (&cur_dest);
1614           return REG_NOERROR;
1615         }
1616       re_node_set_empty (&cur_dest);
1617       --str_idx;
1618
1619       if (mctx->state_log[str_idx])
1620         {
1621           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1622           if (BE (err != REG_NOERROR, 0))
1623             goto free_return;
1624         }
1625
1626       /* Add all the nodes which satisfy the following conditions:
1627          - It can epsilon transit to a node in CUR_DEST.
1628          - It is in CUR_SRC.
1629          And update state_log.  */
1630       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1631       if (BE (err != REG_NOERROR, 0))
1632         goto free_return;
1633     }
1634   err = REG_NOERROR;
1635  free_return:
1636   re_node_set_free (&cur_dest);
1637   return err;
1638 }
1639
1640 static reg_errcode_t
1641 internal_function
1642 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1643                      int str_idx, re_node_set *cur_dest)
1644 {
1645   const re_dfa_t *const dfa = mctx->dfa;
1646   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1647   int i;
1648
1649   /* Then build the next sifted state.
1650      We build the next sifted state on `cur_dest', and update
1651      `sifted_states[str_idx]' with `cur_dest'.
1652      Note:
1653      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1654      `cur_src' points the node_set of the old `state_log[str_idx]'
1655      (with the epsilon nodes pre-filtered out).  */
1656   for (i = 0; i < cur_src->nelem; i++)
1657     {
1658       int prev_node = cur_src->elems[i];
1659       int naccepted = 0;
1660       int ret;
1661
1662 #ifdef DEBUG
1663       re_token_type_t type = dfa->nodes[prev_node].type;
1664       assert (!IS_EPSILON_NODE (type));
1665 #endif
1666 #ifdef RE_ENABLE_I18N
1667       /* If the node may accept `multi byte'.  */
1668       if (dfa->nodes[prev_node].accept_mb)
1669         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1670                                          str_idx, sctx->last_str_idx);
1671 #endif /* RE_ENABLE_I18N */
1672
1673       /* We don't check backreferences here.
1674          See update_cur_sifted_state().  */
1675       if (!naccepted
1676           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1677           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1678                                   dfa->nexts[prev_node]))
1679         naccepted = 1;
1680
1681       if (naccepted == 0)
1682         continue;
1683
1684       if (sctx->limits.nelem)
1685         {
1686           int to_idx = str_idx + naccepted;
1687           if (check_dst_limits (mctx, &sctx->limits,
1688                                 dfa->nexts[prev_node], to_idx,
1689                                 prev_node, str_idx))
1690             continue;
1691         }
1692       ret = re_node_set_insert (cur_dest, prev_node);
1693       if (BE (ret == -1, 0))
1694         return REG_ESPACE;
1695     }
1696
1697   return REG_NOERROR;
1698 }
1699
1700 /* Helper functions.  */
1701
1702 static reg_errcode_t
1703 internal_function
1704 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1705 {
1706   int top = mctx->state_log_top;
1707
1708   if (next_state_log_idx >= mctx->input.bufs_len
1709       || (next_state_log_idx >= mctx->input.valid_len
1710           && mctx->input.valid_len < mctx->input.len))
1711     {
1712       reg_errcode_t err;
1713       err = extend_buffers (mctx);
1714       if (BE (err != REG_NOERROR, 0))
1715         return err;
1716     }
1717
1718   if (top < next_state_log_idx)
1719     {
1720       memset (mctx->state_log + top + 1, '\0',
1721               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1722       mctx->state_log_top = next_state_log_idx;
1723     }
1724   return REG_NOERROR;
1725 }
1726
1727 static reg_errcode_t
1728 internal_function
1729 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1730                    re_dfastate_t **src, int num)
1731 {
1732   int st_idx;
1733   reg_errcode_t err;
1734   for (st_idx = 0; st_idx < num; ++st_idx)
1735     {
1736       if (dst[st_idx] == NULL)
1737         dst[st_idx] = src[st_idx];
1738       else if (src[st_idx] != NULL)
1739         {
1740           re_node_set merged_set;
1741           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1742                                         &src[st_idx]->nodes);
1743           if (BE (err != REG_NOERROR, 0))
1744             return err;
1745           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1746           re_node_set_free (&merged_set);
1747           if (BE (err != REG_NOERROR, 0))
1748             return err;
1749         }
1750     }
1751   return REG_NOERROR;
1752 }
1753
1754 static reg_errcode_t
1755 internal_function
1756 update_cur_sifted_state (const re_match_context_t *mctx,
1757                          re_sift_context_t *sctx, int str_idx,
1758                          re_node_set *dest_nodes)
1759 {
1760   const re_dfa_t *const dfa = mctx->dfa;
1761   reg_errcode_t err = REG_NOERROR;
1762   const re_node_set *candidates;
1763   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1764                 : &mctx->state_log[str_idx]->nodes);
1765
1766   if (dest_nodes->nelem == 0)
1767     sctx->sifted_states[str_idx] = NULL;
1768   else
1769     {
1770       if (candidates)
1771         {
1772           /* At first, add the nodes which can epsilon transit to a node in
1773              DEST_NODE.  */
1774           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1775           if (BE (err != REG_NOERROR, 0))
1776             return err;
1777
1778           /* Then, check the limitations in the current sift_context.  */
1779           if (sctx->limits.nelem)
1780             {
1781               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1782                                          mctx->bkref_ents, str_idx);
1783               if (BE (err != REG_NOERROR, 0))
1784                 return err;
1785             }
1786         }
1787
1788       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1789       if (BE (err != REG_NOERROR, 0))
1790         return err;
1791     }
1792
1793   if (candidates && mctx->state_log[str_idx]->has_backref)
1794     {
1795       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1796       if (BE (err != REG_NOERROR, 0))
1797         return err;
1798     }
1799   return REG_NOERROR;
1800 }
1801
1802 static reg_errcode_t
1803 internal_function
1804 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1805                        const re_node_set *candidates)
1806 {
1807   reg_errcode_t err = REG_NOERROR;
1808   int i;
1809
1810   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1811   if (BE (err != REG_NOERROR, 0))
1812     return err;
1813
1814   if (!state->inveclosure.alloc)
1815     {
1816       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1817       if (BE (err != REG_NOERROR, 0))
1818         return REG_ESPACE;
1819       for (i = 0; i < dest_nodes->nelem; i++)
1820         re_node_set_merge (&state->inveclosure,
1821                            dfa->inveclosures + dest_nodes->elems[i]);
1822     }
1823   return re_node_set_add_intersect (dest_nodes, candidates,
1824                                     &state->inveclosure);
1825 }
1826
1827 static reg_errcode_t
1828 internal_function
1829 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1830                        const re_node_set *candidates)
1831 {
1832     int ecl_idx;
1833     reg_errcode_t err;
1834     re_node_set *inv_eclosure = dfa->inveclosures + node;
1835     re_node_set except_nodes;
1836     re_node_set_init_empty (&except_nodes);
1837     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1838       {
1839         int cur_node = inv_eclosure->elems[ecl_idx];
1840         if (cur_node == node)
1841           continue;
1842         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1843           {
1844             int edst1 = dfa->edests[cur_node].elems[0];
1845             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1846                          ? dfa->edests[cur_node].elems[1] : -1);
1847             if ((!re_node_set_contains (inv_eclosure, edst1)
1848                  && re_node_set_contains (dest_nodes, edst1))
1849                 || (edst2 > 0
1850                     && !re_node_set_contains (inv_eclosure, edst2)
1851                     && re_node_set_contains (dest_nodes, edst2)))
1852               {
1853                 err = re_node_set_add_intersect (&except_nodes, candidates,
1854                                                  dfa->inveclosures + cur_node);
1855                 if (BE (err != REG_NOERROR, 0))
1856                   {
1857                     re_node_set_free (&except_nodes);
1858                     return err;
1859                   }
1860               }
1861           }
1862       }
1863     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1864       {
1865         int cur_node = inv_eclosure->elems[ecl_idx];
1866         if (!re_node_set_contains (&except_nodes, cur_node))
1867           {
1868             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1869             re_node_set_remove_at (dest_nodes, idx);
1870           }
1871       }
1872     re_node_set_free (&except_nodes);
1873     return REG_NOERROR;
1874 }
1875
1876 static int
1877 internal_function
1878 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1879                   int dst_node, int dst_idx, int src_node, int src_idx)
1880 {
1881   const re_dfa_t *const dfa = mctx->dfa;
1882   int lim_idx, src_pos, dst_pos;
1883
1884   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1885   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1886   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1887     {
1888       int subexp_idx;
1889       struct re_backref_cache_entry *ent;
1890       ent = mctx->bkref_ents + limits->elems[lim_idx];
1891       subexp_idx = dfa->nodes[ent->node].opr.idx;
1892
1893       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1894                                            subexp_idx, dst_node, dst_idx,
1895                                            dst_bkref_idx);
1896       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1897                                            subexp_idx, src_node, src_idx,
1898                                            src_bkref_idx);
1899
1900       /* In case of:
1901          <src> <dst> ( <subexp> )
1902          ( <subexp> ) <src> <dst>
1903          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1904       if (src_pos == dst_pos)
1905         continue; /* This is unrelated limitation.  */
1906       else
1907         return 1;
1908     }
1909   return 0;
1910 }
1911
1912 static int
1913 internal_function
1914 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1915                              int subexp_idx, int from_node, int bkref_idx)
1916 {
1917   const re_dfa_t *const dfa = mctx->dfa;
1918   const re_node_set *eclosures = dfa->eclosures + from_node;
1919   int node_idx;
1920
1921   /* Else, we are on the boundary: examine the nodes on the epsilon
1922      closure.  */
1923   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1924     {
1925       int node = eclosures->elems[node_idx];
1926       switch (dfa->nodes[node].type)
1927         {
1928         case OP_BACK_REF:
1929           if (bkref_idx != -1)
1930             {
1931               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1932               do
1933                 {
1934                   int dst, cpos;
1935
1936                   if (ent->node != node)
1937                     continue;
1938
1939                   if (subexp_idx < BITSET_WORD_BITS
1940                       && !(ent->eps_reachable_subexps_map
1941                            & ((bitset_word_t) 1 << subexp_idx)))
1942                     continue;
1943
1944                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1945                      OP_CLOSE_SUBEXP cases below.  But, if the
1946                      destination node is the same node as the source
1947                      node, don't recurse because it would cause an
1948                      infinite loop: a regex that exhibits this behavior
1949                      is ()\1*\1*  */
1950                   dst = dfa->edests[node].elems[0];
1951                   if (dst == from_node)
1952                     {
1953                       if (boundaries & 1)
1954                         return -1;
1955                       else /* if (boundaries & 2) */
1956                         return 0;
1957                     }
1958
1959                   cpos =
1960                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1961                                                  dst, bkref_idx);
1962                   if (cpos == -1 /* && (boundaries & 1) */)
1963                     return -1;
1964                   if (cpos == 0 && (boundaries & 2))
1965                     return 0;
1966
1967                   if (subexp_idx < BITSET_WORD_BITS)
1968                     ent->eps_reachable_subexps_map
1969                       &= ~((bitset_word_t) 1 << subexp_idx);
1970                 }
1971               while (ent++->more);
1972             }
1973           break;
1974
1975         case OP_OPEN_SUBEXP:
1976           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1977             return -1;
1978           break;
1979
1980         case OP_CLOSE_SUBEXP:
1981           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1982             return 0;
1983           break;
1984
1985         default:
1986             break;
1987         }
1988     }
1989
1990   return (boundaries & 2) ? 1 : 0;
1991 }
1992
1993 static int
1994 internal_function
1995 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
1996                            int subexp_idx, int from_node, int str_idx,
1997                            int bkref_idx)
1998 {
1999   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2000   int boundaries;
2001
2002   /* If we are outside the range of the subexpression, return -1 or 1.  */
2003   if (str_idx < lim->subexp_from)
2004     return -1;
2005
2006   if (lim->subexp_to < str_idx)
2007     return 1;
2008
2009   /* If we are within the subexpression, return 0.  */
2010   boundaries = (str_idx == lim->subexp_from);
2011   boundaries |= (str_idx == lim->subexp_to) << 1;
2012   if (boundaries == 0)
2013     return 0;
2014
2015   /* Else, examine epsilon closure.  */
2016   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2017                                       from_node, bkref_idx);
2018 }
2019
2020 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2021    which are against limitations from DEST_NODES. */
2022
2023 static reg_errcode_t
2024 internal_function
2025 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2026                      const re_node_set *candidates, re_node_set *limits,
2027                      struct re_backref_cache_entry *bkref_ents, int str_idx)
2028 {
2029   reg_errcode_t err;
2030   int node_idx, lim_idx;
2031
2032   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2033     {
2034       int subexp_idx;
2035       struct re_backref_cache_entry *ent;
2036       ent = bkref_ents + limits->elems[lim_idx];
2037
2038       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2039         continue; /* This is unrelated limitation.  */
2040
2041       subexp_idx = dfa->nodes[ent->node].opr.idx;
2042       if (ent->subexp_to == str_idx)
2043         {
2044           int ops_node = -1;
2045           int cls_node = -1;
2046           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2047             {
2048               int node = dest_nodes->elems[node_idx];
2049               re_token_type_t type = dfa->nodes[node].type;
2050               if (type == OP_OPEN_SUBEXP
2051                   && subexp_idx == dfa->nodes[node].opr.idx)
2052                 ops_node = node;
2053               else if (type == OP_CLOSE_SUBEXP
2054                        && subexp_idx == dfa->nodes[node].opr.idx)
2055                 cls_node = node;
2056             }
2057
2058           /* Check the limitation of the open subexpression.  */
2059           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2060           if (ops_node >= 0)
2061             {
2062               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2063                                            candidates);
2064               if (BE (err != REG_NOERROR, 0))
2065                 return err;
2066             }
2067
2068           /* Check the limitation of the close subexpression.  */
2069           if (cls_node >= 0)
2070             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2071               {
2072                 int node = dest_nodes->elems[node_idx];
2073                 if (!re_node_set_contains (dfa->inveclosures + node,
2074                                            cls_node)
2075                     && !re_node_set_contains (dfa->eclosures + node,
2076                                               cls_node))
2077                   {
2078                     /* It is against this limitation.
2079                        Remove it form the current sifted state.  */
2080                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2081                                                  candidates);
2082                     if (BE (err != REG_NOERROR, 0))
2083                       return err;
2084                     --node_idx;
2085                   }
2086               }
2087         }
2088       else /* (ent->subexp_to != str_idx)  */
2089         {
2090           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2091             {
2092               int node = dest_nodes->elems[node_idx];
2093               re_token_type_t type = dfa->nodes[node].type;
2094               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2095                 {
2096                   if (subexp_idx != dfa->nodes[node].opr.idx)
2097                     continue;
2098                   /* It is against this limitation.
2099                      Remove it form the current sifted state.  */
2100                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2101                                                candidates);
2102                   if (BE (err != REG_NOERROR, 0))
2103                     return err;
2104                 }
2105             }
2106         }
2107     }
2108   return REG_NOERROR;
2109 }
2110
2111 static reg_errcode_t
2112 internal_function
2113 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2114                    int str_idx, const re_node_set *candidates)
2115 {
2116   const re_dfa_t *const dfa = mctx->dfa;
2117   reg_errcode_t err;
2118   int node_idx, node;
2119   re_sift_context_t local_sctx;
2120   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2121
2122   if (first_idx == -1)
2123     return REG_NOERROR;
2124
2125   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2126
2127   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2128     {
2129       int enabled_idx;
2130       re_token_type_t type;
2131       struct re_backref_cache_entry *entry;
2132       node = candidates->elems[node_idx];
2133       type = dfa->nodes[node].type;
2134       /* Avoid infinite loop for the REs like "()\1+".  */
2135       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2136         continue;
2137       if (type != OP_BACK_REF)
2138         continue;
2139
2140       entry = mctx->bkref_ents + first_idx;
2141       enabled_idx = first_idx;
2142       do
2143         {
2144           int subexp_len;
2145           int to_idx;
2146           int dst_node;
2147           int ret;
2148           re_dfastate_t *cur_state;
2149
2150           if (entry->node != node)
2151             continue;
2152           subexp_len = entry->subexp_to - entry->subexp_from;
2153           to_idx = str_idx + subexp_len;
2154           dst_node = (subexp_len ? dfa->nexts[node]
2155                       : dfa->edests[node].elems[0]);
2156
2157           if (to_idx > sctx->last_str_idx
2158               || sctx->sifted_states[to_idx] == NULL
2159               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2160               || check_dst_limits (mctx, &sctx->limits, node,
2161                                    str_idx, dst_node, to_idx))
2162             continue;
2163
2164           if (local_sctx.sifted_states == NULL)
2165             {
2166               local_sctx = *sctx;
2167               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2168               if (BE (err != REG_NOERROR, 0))
2169                 goto free_return;
2170             }
2171           local_sctx.last_node = node;
2172           local_sctx.last_str_idx = str_idx;
2173           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2174           if (BE (ret < 0, 0))
2175             {
2176               err = REG_ESPACE;
2177               goto free_return;
2178             }
2179           cur_state = local_sctx.sifted_states[str_idx];
2180           err = sift_states_backward (mctx, &local_sctx);
2181           if (BE (err != REG_NOERROR, 0))
2182             goto free_return;
2183           if (sctx->limited_states != NULL)
2184             {
2185               err = merge_state_array (dfa, sctx->limited_states,
2186                                        local_sctx.sifted_states,
2187                                        str_idx + 1);
2188               if (BE (err != REG_NOERROR, 0))
2189                 goto free_return;
2190             }
2191           local_sctx.sifted_states[str_idx] = cur_state;
2192           re_node_set_remove (&local_sctx.limits, enabled_idx);
2193
2194           /* mctx->bkref_ents may have changed, reload the pointer.  */
2195           entry = mctx->bkref_ents + enabled_idx;
2196         }
2197       while (enabled_idx++, entry++->more);
2198     }
2199   err = REG_NOERROR;
2200  free_return:
2201   if (local_sctx.sifted_states != NULL)
2202     {
2203       re_node_set_free (&local_sctx.limits);
2204     }
2205
2206   return err;
2207 }
2208
2209
2210 #ifdef RE_ENABLE_I18N
2211 static int
2212 internal_function
2213 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2214                      int node_idx, int str_idx, int max_str_idx)
2215 {
2216   const re_dfa_t *const dfa = mctx->dfa;
2217   int naccepted;
2218   /* Check the node can accept `multi byte'.  */
2219   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2220   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2221       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2222                             dfa->nexts[node_idx]))
2223     /* The node can't accept the `multi byte', or the
2224        destination was already thrown away, then the node
2225        could't accept the current input `multi byte'.   */
2226     naccepted = 0;
2227   /* Otherwise, it is sure that the node could accept
2228      `naccepted' bytes input.  */
2229   return naccepted;
2230 }
2231 #endif /* RE_ENABLE_I18N */
2232
2233 \f
2234 /* Functions for state transition.  */
2235
2236 /* Return the next state to which the current state STATE will transit by
2237    accepting the current input byte, and update STATE_LOG if necessary.
2238    If STATE can accept a multibyte char/collating element/back reference
2239    update the destination of STATE_LOG.  */
2240
2241 static re_dfastate_t *
2242 internal_function
2243 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2244                re_dfastate_t *state)
2245 {
2246   re_dfastate_t **trtable;
2247   unsigned char ch;
2248
2249 #ifdef RE_ENABLE_I18N
2250   /* If the current state can accept multibyte.  */
2251   if (BE (state->accept_mb, 0))
2252     {
2253       *err = transit_state_mb (mctx, state);
2254       if (BE (*err != REG_NOERROR, 0))
2255         return NULL;
2256     }
2257 #endif /* RE_ENABLE_I18N */
2258
2259   /* Then decide the next state with the single byte.  */
2260 #if 0
2261   if (0)
2262     /* don't use transition table  */
2263     return transit_state_sb (err, mctx, state);
2264 #endif
2265
2266   /* Use transition table  */
2267   ch = re_string_fetch_byte (&mctx->input);
2268   for (;;)
2269     {
2270       trtable = state->trtable;
2271       if (BE (trtable != NULL, 1))
2272         return trtable[ch];
2273
2274       trtable = state->word_trtable;
2275       if (BE (trtable != NULL, 1))
2276         {
2277           unsigned int context;
2278           context
2279             = re_string_context_at (&mctx->input,
2280                                     re_string_cur_idx (&mctx->input) - 1,
2281                                     mctx->eflags);
2282           if (IS_WORD_CONTEXT (context))
2283             return trtable[ch + SBC_MAX];
2284           else
2285             return trtable[ch];
2286         }
2287
2288       if (!build_trtable (mctx->dfa, state))
2289         {
2290           *err = REG_ESPACE;
2291           return NULL;
2292         }
2293
2294       /* Retry, we now have a transition table.  */
2295     }
2296 }
2297
2298 /* Update the state_log if we need */
2299 re_dfastate_t *
2300 internal_function
2301 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2302                       re_dfastate_t *next_state)
2303 {
2304   const re_dfa_t *const dfa = mctx->dfa;
2305   int cur_idx = re_string_cur_idx (&mctx->input);
2306
2307   if (cur_idx > mctx->state_log_top)
2308     {
2309       mctx->state_log[cur_idx] = next_state;
2310       mctx->state_log_top = cur_idx;
2311     }
2312   else if (mctx->state_log[cur_idx] == 0)
2313     {
2314       mctx->state_log[cur_idx] = next_state;
2315     }
2316   else
2317     {
2318       re_dfastate_t *pstate;
2319       unsigned int context;
2320       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2321       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2322          the destination of a multibyte char/collating element/
2323          back reference.  Then the next state is the union set of
2324          these destinations and the results of the transition table.  */
2325       pstate = mctx->state_log[cur_idx];
2326       log_nodes = pstate->entrance_nodes;
2327       if (next_state != NULL)
2328         {
2329           table_nodes = next_state->entrance_nodes;
2330           *err = re_node_set_init_union (&next_nodes, table_nodes,
2331                                              log_nodes);
2332           if (BE (*err != REG_NOERROR, 0))
2333             return NULL;
2334         }
2335       else
2336         next_nodes = *log_nodes;
2337       /* Note: We already add the nodes of the initial state,
2338          then we don't need to add them here.  */
2339
2340       context = re_string_context_at (&mctx->input,
2341                                       re_string_cur_idx (&mctx->input) - 1,
2342                                       mctx->eflags);
2343       next_state = mctx->state_log[cur_idx]
2344         = re_acquire_state_context (err, dfa, &next_nodes, context);
2345       /* We don't need to check errors here, since the return value of
2346          this function is next_state and ERR is already set.  */
2347
2348       if (table_nodes != NULL)
2349         re_node_set_free (&next_nodes);
2350     }
2351
2352   if (BE (dfa->nbackref, 0) && next_state != NULL)
2353     {
2354       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2355          later.  We must check them here, since the back references in the
2356          next state might use them.  */
2357       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2358                                         cur_idx);
2359       if (BE (*err != REG_NOERROR, 0))
2360         return NULL;
2361
2362       /* If the next state has back references.  */
2363       if (next_state->has_backref)
2364         {
2365           *err = transit_state_bkref (mctx, &next_state->nodes);
2366           if (BE (*err != REG_NOERROR, 0))
2367             return NULL;
2368           next_state = mctx->state_log[cur_idx];
2369         }
2370     }
2371
2372   return next_state;
2373 }
2374
2375 /* Skip bytes in the input that correspond to part of a
2376    multi-byte match, then look in the log for a state
2377    from which to restart matching.  */
2378 re_dfastate_t *
2379 internal_function
2380 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2381 {
2382   re_dfastate_t *cur_state;
2383   do
2384     {
2385       int max = mctx->state_log_top;
2386       int cur_str_idx = re_string_cur_idx (&mctx->input);
2387
2388       do
2389         {
2390           if (++cur_str_idx > max)
2391             return NULL;
2392           re_string_skip_bytes (&mctx->input, 1);
2393         }
2394       while (mctx->state_log[cur_str_idx] == NULL);
2395
2396       cur_state = merge_state_with_log (err, mctx, NULL);
2397     }
2398   while (*err == REG_NOERROR && cur_state == NULL);
2399   return cur_state;
2400 }
2401
2402 /* Helper functions for transit_state.  */
2403
2404 /* From the node set CUR_NODES, pick up the nodes whose types are
2405    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2406    expression. And register them to use them later for evaluating the
2407    correspoding back references.  */
2408
2409 static reg_errcode_t
2410 internal_function
2411 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2412                            int str_idx)
2413 {
2414   const re_dfa_t *const dfa = mctx->dfa;
2415   int node_idx;
2416   reg_errcode_t err;
2417
2418   /* TODO: This isn't efficient.
2419            Because there might be more than one nodes whose types are
2420            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2421            nodes.
2422            E.g. RE: (a){2}  */
2423   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2424     {
2425       int node = cur_nodes->elems[node_idx];
2426       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2427           && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2428           && (dfa->used_bkref_map
2429               & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2430         {
2431           err = match_ctx_add_subtop (mctx, node, str_idx);
2432           if (BE (err != REG_NOERROR, 0))
2433             return err;
2434         }
2435     }
2436   return REG_NOERROR;
2437 }
2438
2439 #if 0
2440 /* Return the next state to which the current state STATE will transit by
2441    accepting the current input byte.  */
2442
2443 static re_dfastate_t *
2444 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2445                   re_dfastate_t *state)
2446 {
2447   const re_dfa_t *const dfa = mctx->dfa;
2448   re_node_set next_nodes;
2449   re_dfastate_t *next_state;
2450   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2451   unsigned int context;
2452
2453   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2454   if (BE (*err != REG_NOERROR, 0))
2455     return NULL;
2456   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2457     {
2458       int cur_node = state->nodes.elems[node_cnt];
2459       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2460         {
2461           *err = re_node_set_merge (&next_nodes,
2462                                     dfa->eclosures + dfa->nexts[cur_node]);
2463           if (BE (*err != REG_NOERROR, 0))
2464             {
2465               re_node_set_free (&next_nodes);
2466               return NULL;
2467             }
2468         }
2469     }
2470   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2471   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2472   /* We don't need to check errors here, since the return value of
2473      this function is next_state and ERR is already set.  */
2474
2475   re_node_set_free (&next_nodes);
2476   re_string_skip_bytes (&mctx->input, 1);
2477   return next_state;
2478 }
2479 #endif
2480
2481 #ifdef RE_ENABLE_I18N
2482 static reg_errcode_t
2483 internal_function
2484 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2485 {
2486   const re_dfa_t *const dfa = mctx->dfa;
2487   reg_errcode_t err;
2488   int i;
2489
2490   for (i = 0; i < pstate->nodes.nelem; ++i)
2491     {
2492       re_node_set dest_nodes, *new_nodes;
2493       int cur_node_idx = pstate->nodes.elems[i];
2494       int naccepted, dest_idx;
2495       unsigned int context;
2496       re_dfastate_t *dest_state;
2497
2498       if (!dfa->nodes[cur_node_idx].accept_mb)
2499         continue;
2500
2501       if (dfa->nodes[cur_node_idx].constraint)
2502         {
2503           context = re_string_context_at (&mctx->input,
2504                                           re_string_cur_idx (&mctx->input),
2505                                           mctx->eflags);
2506           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2507                                            context))
2508             continue;
2509         }
2510
2511       /* How many bytes the node can accept?  */
2512       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2513                                            re_string_cur_idx (&mctx->input));
2514       if (naccepted == 0)
2515         continue;
2516
2517       /* The node can accepts `naccepted' bytes.  */
2518       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2519       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2520                                : mctx->max_mb_elem_len);
2521       err = clean_state_log_if_needed (mctx, dest_idx);
2522       if (BE (err != REG_NOERROR, 0))
2523         return err;
2524 #ifdef DEBUG
2525       assert (dfa->nexts[cur_node_idx] != -1);
2526 #endif
2527       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2528
2529       dest_state = mctx->state_log[dest_idx];
2530       if (dest_state == NULL)
2531         dest_nodes = *new_nodes;
2532       else
2533         {
2534           err = re_node_set_init_union (&dest_nodes,
2535                                         dest_state->entrance_nodes, new_nodes);
2536           if (BE (err != REG_NOERROR, 0))
2537             return err;
2538         }
2539       context = re_string_context_at (&mctx->input, dest_idx - 1,
2540                                       mctx->eflags);
2541       mctx->state_log[dest_idx]
2542         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2543       if (dest_state != NULL)
2544         re_node_set_free (&dest_nodes);
2545       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2546         return err;
2547     }
2548   return REG_NOERROR;
2549 }
2550 #endif /* RE_ENABLE_I18N */
2551
2552 static reg_errcode_t
2553 internal_function
2554 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2555 {
2556   const re_dfa_t *const dfa = mctx->dfa;
2557   reg_errcode_t err;
2558   int i;
2559   int cur_str_idx = re_string_cur_idx (&mctx->input);
2560
2561   for (i = 0; i < nodes->nelem; ++i)
2562     {
2563       int dest_str_idx, prev_nelem, bkc_idx;
2564       int node_idx = nodes->elems[i];
2565       unsigned int context;
2566       const re_token_t *node = dfa->nodes + node_idx;
2567       re_node_set *new_dest_nodes;
2568
2569       /* Check whether `node' is a backreference or not.  */
2570       if (node->type != OP_BACK_REF)
2571         continue;
2572
2573       if (node->constraint)
2574         {
2575           context = re_string_context_at (&mctx->input, cur_str_idx,
2576                                           mctx->eflags);
2577           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2578             continue;
2579         }
2580
2581       /* `node' is a backreference.
2582          Check the substring which the substring matched.  */
2583       bkc_idx = mctx->nbkref_ents;
2584       err = get_subexp (mctx, node_idx, cur_str_idx);
2585       if (BE (err != REG_NOERROR, 0))
2586         goto free_return;
2587
2588       /* And add the epsilon closures (which is `new_dest_nodes') of
2589          the backreference to appropriate state_log.  */
2590 #ifdef DEBUG
2591       assert (dfa->nexts[node_idx] != -1);
2592 #endif
2593       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2594         {
2595           int subexp_len;
2596           re_dfastate_t *dest_state;
2597           struct re_backref_cache_entry *bkref_ent;
2598           bkref_ent = mctx->bkref_ents + bkc_idx;
2599           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2600             continue;
2601           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2602           new_dest_nodes = (subexp_len == 0
2603                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2604                             : dfa->eclosures + dfa->nexts[node_idx]);
2605           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2606                           - bkref_ent->subexp_from);
2607           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2608                                           mctx->eflags);
2609           dest_state = mctx->state_log[dest_str_idx];
2610           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2611                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2612           /* Add `new_dest_node' to state_log.  */
2613           if (dest_state == NULL)
2614             {
2615               mctx->state_log[dest_str_idx]
2616                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2617                                             context);
2618               if (BE (mctx->state_log[dest_str_idx] == NULL
2619                       && err != REG_NOERROR, 0))
2620                 goto free_return;
2621             }
2622           else
2623             {
2624               re_node_set dest_nodes;
2625               err = re_node_set_init_union (&dest_nodes,
2626                                             dest_state->entrance_nodes,
2627                                             new_dest_nodes);
2628               if (BE (err != REG_NOERROR, 0))
2629                 {
2630                   re_node_set_free (&dest_nodes);
2631                   goto free_return;
2632                 }
2633               mctx->state_log[dest_str_idx]
2634                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2635               re_node_set_free (&dest_nodes);
2636               if (BE (mctx->state_log[dest_str_idx] == NULL
2637                       && err != REG_NOERROR, 0))
2638                 goto free_return;
2639             }
2640           /* We need to check recursively if the backreference can epsilon
2641              transit.  */
2642           if (subexp_len == 0
2643               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2644             {
2645               err = check_subexp_matching_top (mctx, new_dest_nodes,
2646                                                cur_str_idx);
2647               if (BE (err != REG_NOERROR, 0))
2648                 goto free_return;
2649               err = transit_state_bkref (mctx, new_dest_nodes);
2650               if (BE (err != REG_NOERROR, 0))
2651                 goto free_return;
2652             }
2653         }
2654     }
2655   err = REG_NOERROR;
2656  free_return:
2657   return err;
2658 }
2659
2660 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2661    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2662    Note that we might collect inappropriate candidates here.
2663    However, the cost of checking them strictly here is too high, then we
2664    delay these checking for prune_impossible_nodes().  */
2665
2666 static reg_errcode_t
2667 internal_function
2668 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2669 {
2670   const re_dfa_t *const dfa = mctx->dfa;
2671   int subexp_num, sub_top_idx;
2672   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2673   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2674   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2675   if (cache_idx != -1)
2676     {
2677       const struct re_backref_cache_entry *entry
2678         = mctx->bkref_ents + cache_idx;
2679       do
2680         if (entry->node == bkref_node)
2681           return REG_NOERROR; /* We already checked it.  */
2682       while (entry++->more);
2683     }
2684
2685   subexp_num = dfa->nodes[bkref_node].opr.idx;
2686
2687   /* For each sub expression  */
2688   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2689     {
2690       reg_errcode_t err;
2691       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2692       re_sub_match_last_t *sub_last;
2693       int sub_last_idx, sl_str, bkref_str_off;
2694
2695       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2696         continue; /* It isn't related.  */
2697
2698       sl_str = sub_top->str_idx;
2699       bkref_str_off = bkref_str_idx;
2700       /* At first, check the last node of sub expressions we already
2701          evaluated.  */
2702       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2703         {
2704           int sl_str_diff;
2705           sub_last = sub_top->lasts[sub_last_idx];
2706           sl_str_diff = sub_last->str_idx - sl_str;
2707           /* The matched string by the sub expression match with the substring
2708              at the back reference?  */
2709           if (sl_str_diff > 0)
2710             {
2711               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2712                 {
2713                   /* Not enough chars for a successful match.  */
2714                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2715                     break;
2716
2717                   err = clean_state_log_if_needed (mctx,
2718                                                    bkref_str_off
2719                                                    + sl_str_diff);
2720                   if (BE (err != REG_NOERROR, 0))
2721                     return err;
2722                   buf = (const char *) re_string_get_buffer (&mctx->input);
2723                 }
2724               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2725                 /* We don't need to search this sub expression any more.  */
2726                 break;
2727             }
2728           bkref_str_off += sl_str_diff;
2729           sl_str += sl_str_diff;
2730           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2731                                 bkref_str_idx);
2732
2733           /* Reload buf, since the preceding call might have reallocated
2734              the buffer.  */
2735           buf = (const char *) re_string_get_buffer (&mctx->input);
2736
2737           if (err == REG_NOMATCH)
2738             continue;
2739           if (BE (err != REG_NOERROR, 0))
2740             return err;
2741         }
2742
2743       if (sub_last_idx < sub_top->nlasts)
2744         continue;
2745       if (sub_last_idx > 0)
2746         ++sl_str;
2747       /* Then, search for the other last nodes of the sub expression.  */
2748       for (; sl_str <= bkref_str_idx; ++sl_str)
2749         {
2750           int cls_node, sl_str_off;
2751           const re_node_set *nodes;
2752           sl_str_off = sl_str - sub_top->str_idx;
2753           /* The matched string by the sub expression match with the substring
2754              at the back reference?  */
2755           if (sl_str_off > 0)
2756             {
2757               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2758                 {
2759                   /* If we are at the end of the input, we cannot match.  */
2760                   if (bkref_str_off >= mctx->input.len)
2761                     break;
2762
2763                   err = extend_buffers (mctx);
2764                   if (BE (err != REG_NOERROR, 0))
2765                     return err;
2766
2767                   buf = (const char *) re_string_get_buffer (&mctx->input);
2768                 }
2769               if (buf [bkref_str_off++] != buf[sl_str - 1])
2770                 break; /* We don't need to search this sub expression
2771                           any more.  */
2772             }
2773           if (mctx->state_log[sl_str] == NULL)
2774             continue;
2775           /* Does this state have a ')' of the sub expression?  */
2776           nodes = &mctx->state_log[sl_str]->nodes;
2777           cls_node = find_subexp_node (dfa, nodes, subexp_num,
2778                                        OP_CLOSE_SUBEXP);
2779           if (cls_node == -1)
2780             continue; /* No.  */
2781           if (sub_top->path == NULL)
2782             {
2783               sub_top->path = calloc (sizeof (state_array_t),
2784                                       sl_str - sub_top->str_idx + 1);
2785               if (sub_top->path == NULL)
2786                 return REG_ESPACE;
2787             }
2788           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2789              in the current context?  */
2790           err = check_arrival (mctx, sub_top->path, sub_top->node,
2791                                sub_top->str_idx, cls_node, sl_str,
2792                                OP_CLOSE_SUBEXP);
2793           if (err == REG_NOMATCH)
2794               continue;
2795           if (BE (err != REG_NOERROR, 0))
2796               return err;
2797           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2798           if (BE (sub_last == NULL, 0))
2799             return REG_ESPACE;
2800           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2801                                 bkref_str_idx);
2802           if (err == REG_NOMATCH)
2803             continue;
2804         }
2805     }
2806   return REG_NOERROR;
2807 }
2808
2809 /* Helper functions for get_subexp().  */
2810
2811 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2812    If it can arrive, register the sub expression expressed with SUB_TOP
2813    and SUB_LAST.  */
2814
2815 static reg_errcode_t
2816 internal_function
2817 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2818                 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2819 {
2820   reg_errcode_t err;
2821   int to_idx;
2822   /* Can the subexpression arrive the back reference?  */
2823   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2824                        sub_last->str_idx, bkref_node, bkref_str,
2825                        OP_OPEN_SUBEXP);
2826   if (err != REG_NOERROR)
2827     return err;
2828   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2829                              sub_last->str_idx);
2830   if (BE (err != REG_NOERROR, 0))
2831     return err;
2832   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2833   return clean_state_log_if_needed (mctx, to_idx);
2834 }
2835
2836 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2837    Search '(' if FL_OPEN, or search ')' otherwise.
2838    TODO: This function isn't efficient...
2839          Because there might be more than one nodes whose types are
2840          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2841          nodes.
2842          E.g. RE: (a){2}  */
2843
2844 static int
2845 internal_function
2846 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2847                   int subexp_idx, int type)
2848 {
2849   int cls_idx;
2850   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2851     {
2852       int cls_node = nodes->elems[cls_idx];
2853       const re_token_t *node = dfa->nodes + cls_node;
2854       if (node->type == type
2855           && node->opr.idx == subexp_idx)
2856         return cls_node;
2857     }
2858   return -1;
2859 }
2860
2861 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2862    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2863    heavily reused.
2864    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2865
2866 static reg_errcode_t
2867 internal_function
2868 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2869                int top_str, int last_node, int last_str, int type)
2870 {
2871   const re_dfa_t *const dfa = mctx->dfa;
2872   reg_errcode_t err = REG_NOERROR;
2873   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2874   re_dfastate_t *cur_state = NULL;
2875   re_node_set *cur_nodes, next_nodes;
2876   re_dfastate_t **backup_state_log;
2877   unsigned int context;
2878
2879   subexp_num = dfa->nodes[top_node].opr.idx;
2880   /* Extend the buffer if we need.  */
2881   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2882     {
2883       re_dfastate_t **new_array;
2884       int old_alloc = path->alloc;
2885       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2886       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2887       if (BE (new_array == NULL, 0))
2888         {
2889           path->alloc = old_alloc;
2890           return REG_ESPACE;
2891         }
2892       path->array = new_array;
2893       memset (new_array + old_alloc, '\0',
2894               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2895     }
2896
2897   str_idx = path->next_idx ?: top_str;
2898
2899   /* Temporary modify MCTX.  */
2900   backup_state_log = mctx->state_log;
2901   backup_cur_idx = mctx->input.cur_idx;
2902   mctx->state_log = path->array;
2903   mctx->input.cur_idx = str_idx;
2904
2905   /* Setup initial node set.  */
2906   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2907   if (str_idx == top_str)
2908     {
2909       err = re_node_set_init_1 (&next_nodes, top_node);
2910       if (BE (err != REG_NOERROR, 0))
2911         return err;
2912       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2913       if (BE (err != REG_NOERROR, 0))
2914         {
2915           re_node_set_free (&next_nodes);
2916           return err;
2917         }
2918     }
2919   else
2920     {
2921       cur_state = mctx->state_log[str_idx];
2922       if (cur_state && cur_state->has_backref)
2923         {
2924           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2925           if (BE (err != REG_NOERROR, 0))
2926             return err;
2927         }
2928       else
2929         re_node_set_init_empty (&next_nodes);
2930     }
2931   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2932     {
2933       if (next_nodes.nelem)
2934         {
2935           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2936                                     subexp_num, type);
2937           if (BE (err != REG_NOERROR, 0))
2938             {
2939               re_node_set_free (&next_nodes);
2940               return err;
2941             }
2942         }
2943       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2944       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2945         {
2946           re_node_set_free (&next_nodes);
2947           return err;
2948         }
2949       mctx->state_log[str_idx] = cur_state;
2950     }
2951
2952   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2953     {
2954       re_node_set_empty (&next_nodes);
2955       if (mctx->state_log[str_idx + 1])
2956         {
2957           err = re_node_set_merge (&next_nodes,
2958                                    &mctx->state_log[str_idx + 1]->nodes);
2959           if (BE (err != REG_NOERROR, 0))
2960             {
2961               re_node_set_free (&next_nodes);
2962               return err;
2963             }
2964         }
2965       if (cur_state)
2966         {
2967           err = check_arrival_add_next_nodes (mctx, str_idx,
2968                                               &cur_state->non_eps_nodes,
2969                                               &next_nodes);
2970           if (BE (err != REG_NOERROR, 0))
2971             {
2972               re_node_set_free (&next_nodes);
2973               return err;
2974             }
2975         }
2976       ++str_idx;
2977       if (next_nodes.nelem)
2978         {
2979           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2980           if (BE (err != REG_NOERROR, 0))
2981             {
2982               re_node_set_free (&next_nodes);
2983               return err;
2984             }
2985           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2986                                     subexp_num, type);
2987           if (BE (err != REG_NOERROR, 0))
2988             {
2989               re_node_set_free (&next_nodes);
2990               return err;
2991             }
2992         }
2993       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2994       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2995       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2996         {
2997           re_node_set_free (&next_nodes);
2998           return err;
2999         }
3000       mctx->state_log[str_idx] = cur_state;
3001       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3002     }
3003   re_node_set_free (&next_nodes);
3004   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3005                : &mctx->state_log[last_str]->nodes);
3006   path->next_idx = str_idx;
3007
3008   /* Fix MCTX.  */
3009   mctx->state_log = backup_state_log;
3010   mctx->input.cur_idx = backup_cur_idx;
3011
3012   /* Then check the current node set has the node LAST_NODE.  */
3013   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3014     return REG_NOERROR;
3015
3016   return REG_NOMATCH;
3017 }
3018
3019 /* Helper functions for check_arrival.  */
3020
3021 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3022    to NEXT_NODES.
3023    TODO: This function is similar to the functions transit_state*(),
3024          however this function has many additional works.
3025          Can't we unify them?  */
3026
3027 static reg_errcode_t
3028 internal_function
3029 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3030                               re_node_set *cur_nodes, re_node_set *next_nodes)
3031 {
3032   const re_dfa_t *const dfa = mctx->dfa;
3033   int result;
3034   int cur_idx;
3035   reg_errcode_t err = REG_NOERROR;
3036   re_node_set union_set;
3037   re_node_set_init_empty (&union_set);
3038   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3039     {
3040       int naccepted = 0;
3041       int cur_node = cur_nodes->elems[cur_idx];
3042 #ifdef DEBUG
3043       re_token_type_t type = dfa->nodes[cur_node].type;
3044       assert (!IS_EPSILON_NODE (type));
3045 #endif
3046 #ifdef RE_ENABLE_I18N
3047       /* If the node may accept `multi byte'.  */
3048       if (dfa->nodes[cur_node].accept_mb)
3049         {
3050           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3051                                                str_idx);
3052           if (naccepted > 1)
3053             {
3054               re_dfastate_t *dest_state;
3055               int next_node = dfa->nexts[cur_node];
3056               int next_idx = str_idx + naccepted;
3057               dest_state = mctx->state_log[next_idx];
3058               re_node_set_empty (&union_set);
3059               if (dest_state)
3060                 {
3061                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3062                   if (BE (err != REG_NOERROR, 0))
3063                     {
3064                       re_node_set_free (&union_set);
3065                       return err;
3066                     }
3067                 }
3068               result = re_node_set_insert (&union_set, next_node);
3069               if (BE (result < 0, 0))
3070                 {
3071                   re_node_set_free (&union_set);
3072                   return REG_ESPACE;
3073                 }
3074               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3075                                                             &union_set);
3076               if (BE (mctx->state_log[next_idx] == NULL
3077                       && err != REG_NOERROR, 0))
3078                 {
3079                   re_node_set_free (&union_set);
3080                   return err;
3081                 }
3082             }
3083         }
3084 #endif /* RE_ENABLE_I18N */
3085       if (naccepted
3086           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3087         {
3088           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3089           if (BE (result < 0, 0))
3090             {
3091               re_node_set_free (&union_set);
3092               return REG_ESPACE;
3093             }
3094         }
3095     }
3096   re_node_set_free (&union_set);
3097   return REG_NOERROR;
3098 }
3099
3100 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3101    CUR_NODES, however exclude the nodes which are:
3102     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3103     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3104 */
3105
3106 static reg_errcode_t
3107 internal_function
3108 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3109                           int ex_subexp, int type)
3110 {
3111   reg_errcode_t err;
3112   int idx, outside_node;
3113   re_node_set new_nodes;
3114 #ifdef DEBUG
3115   assert (cur_nodes->nelem);
3116 #endif
3117   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3118   if (BE (err != REG_NOERROR, 0))
3119     return err;
3120   /* Create a new node set NEW_NODES with the nodes which are epsilon
3121      closures of the node in CUR_NODES.  */
3122
3123   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3124     {
3125       int cur_node = cur_nodes->elems[idx];
3126       const re_node_set *eclosure = dfa->eclosures + cur_node;
3127       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3128       if (outside_node == -1)
3129         {
3130           /* There are no problematic nodes, just merge them.  */
3131           err = re_node_set_merge (&new_nodes, eclosure);
3132           if (BE (err != REG_NOERROR, 0))
3133             {
3134               re_node_set_free (&new_nodes);
3135               return err;
3136             }
3137         }
3138       else
3139         {
3140           /* There are problematic nodes, re-calculate incrementally.  */
3141           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3142                                               ex_subexp, type);
3143           if (BE (err != REG_NOERROR, 0))
3144             {
3145               re_node_set_free (&new_nodes);
3146               return err;
3147             }
3148         }
3149     }
3150   re_node_set_free (cur_nodes);
3151   *cur_nodes = new_nodes;
3152   return REG_NOERROR;
3153 }
3154
3155 /* Helper function for check_arrival_expand_ecl.
3156    Check incrementally the epsilon closure of TARGET, and if it isn't
3157    problematic append it to DST_NODES.  */
3158
3159 static reg_errcode_t
3160 internal_function
3161 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3162                               int target, int ex_subexp, int type)
3163 {
3164   int cur_node;
3165   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3166     {
3167       int err;
3168
3169       if (dfa->nodes[cur_node].type == type
3170           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3171         {
3172           if (type == OP_CLOSE_SUBEXP)
3173             {
3174               err = re_node_set_insert (dst_nodes, cur_node);
3175               if (BE (err == -1, 0))
3176                 return REG_ESPACE;
3177             }
3178           break;
3179         }
3180       err = re_node_set_insert (dst_nodes, cur_node);
3181       if (BE (err == -1, 0))
3182         return REG_ESPACE;
3183       if (dfa->edests[cur_node].nelem == 0)
3184         break;
3185       if (dfa->edests[cur_node].nelem == 2)
3186         {
3187           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3188                                               dfa->edests[cur_node].elems[1],
3189                                               ex_subexp, type);
3190           if (BE (err != REG_NOERROR, 0))
3191             return err;
3192         }
3193       cur_node = dfa->edests[cur_node].elems[0];
3194     }
3195   return REG_NOERROR;
3196 }
3197
3198
3199 /* For all the back references in the current state, calculate the
3200    destination of the back references by the appropriate entry
3201    in MCTX->BKREF_ENTS.  */
3202
3203 static reg_errcode_t
3204 internal_function
3205 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3206                     int cur_str, int subexp_num, int type)
3207 {
3208   const re_dfa_t *const dfa = mctx->dfa;
3209   reg_errcode_t err;
3210   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3211   struct re_backref_cache_entry *ent;
3212
3213   if (cache_idx_start == -1)
3214     return REG_NOERROR;
3215
3216  restart:
3217   ent = mctx->bkref_ents + cache_idx_start;
3218   do
3219     {
3220       int to_idx, next_node;
3221
3222       /* Is this entry ENT is appropriate?  */
3223       if (!re_node_set_contains (cur_nodes, ent->node))
3224         continue; /* No.  */
3225
3226       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3227       /* Calculate the destination of the back reference, and append it
3228          to MCTX->STATE_LOG.  */
3229       if (to_idx == cur_str)
3230         {
3231           /* The backreference did epsilon transit, we must re-check all the
3232              node in the current state.  */
3233           re_node_set new_dests;
3234           reg_errcode_t err2, err3;
3235           next_node = dfa->edests[ent->node].elems[0];
3236           if (re_node_set_contains (cur_nodes, next_node))
3237             continue;
3238           err = re_node_set_init_1 (&new_dests, next_node);
3239           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3240           err3 = re_node_set_merge (cur_nodes, &new_dests);
3241           re_node_set_free (&new_dests);
3242           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3243                   || err3 != REG_NOERROR, 0))
3244             {
3245               err = (err != REG_NOERROR ? err
3246                      : (err2 != REG_NOERROR ? err2 : err3));
3247               return err;
3248             }
3249           /* TODO: It is still inefficient...  */
3250           goto restart;
3251         }
3252       else
3253         {
3254           re_node_set union_set;
3255           next_node = dfa->nexts[ent->node];
3256           if (mctx->state_log[to_idx])
3257             {
3258               int ret;
3259               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3260                                         next_node))
3261                 continue;
3262               err = re_node_set_init_copy (&union_set,
3263                                            &mctx->state_log[to_idx]->nodes);
3264               ret = re_node_set_insert (&union_set, next_node);
3265               if (BE (err != REG_NOERROR || ret < 0, 0))
3266                 {
3267                   re_node_set_free (&union_set);
3268                   err = err != REG_NOERROR ? err : REG_ESPACE;
3269                   return err;
3270                 }
3271             }
3272           else
3273             {
3274               err = re_node_set_init_1 (&union_set, next_node);
3275               if (BE (err != REG_NOERROR, 0))
3276                 return err;
3277             }
3278           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3279           re_node_set_free (&union_set);
3280           if (BE (mctx->state_log[to_idx] == NULL
3281                   && err != REG_NOERROR, 0))
3282             return err;
3283         }
3284     }
3285   while (ent++->more);
3286   return REG_NOERROR;
3287 }
3288
3289 /* Build transition table for the state.
3290    Return 1 if succeeded, otherwise return NULL.  */
3291
3292 static int
3293 internal_function
3294 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3295 {
3296   reg_errcode_t err;
3297   int i, j, ch, need_word_trtable = 0;
3298   bitset_word_t elem, mask;
3299   bool dests_node_malloced = false;
3300   bool dest_states_malloced = false;
3301   int ndests; /* Number of the destination states from `state'.  */
3302   re_dfastate_t **trtable;
3303   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3304   re_node_set follows, *dests_node;
3305   bitset_t *dests_ch;
3306   bitset_t acceptable;
3307
3308   struct dests_alloc
3309   {
3310     re_node_set dests_node[SBC_MAX];
3311     bitset_t dests_ch[SBC_MAX];
3312   } *dests_alloc;
3313
3314   /* We build DFA states which corresponds to the destination nodes
3315      from `state'.  `dests_node[i]' represents the nodes which i-th
3316      destination state contains, and `dests_ch[i]' represents the
3317      characters which i-th destination state accepts.  */
3318   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3319     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3320   else
3321     {
3322       dests_alloc = re_malloc (struct dests_alloc, 1);
3323       if (BE (dests_alloc == NULL, 0))
3324         return 0;
3325       dests_node_malloced = true;
3326     }
3327   dests_node = dests_alloc->dests_node;
3328   dests_ch = dests_alloc->dests_ch;
3329
3330   /* Initialize transiton table.  */
3331   state->word_trtable = state->trtable = NULL;
3332
3333   /* At first, group all nodes belonging to `state' into several
3334      destinations.  */
3335   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3336   if (BE (ndests <= 0, 0))
3337     {
3338       if (dests_node_malloced)
3339         free (dests_alloc);
3340       /* Return 0 in case of an error, 1 otherwise.  */
3341       if (ndests == 0)
3342         {
3343           state->trtable = (re_dfastate_t **)
3344             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3345           return 1;
3346         }
3347       return 0;
3348     }
3349
3350   err = re_node_set_alloc (&follows, ndests + 1);
3351   if (BE (err != REG_NOERROR, 0))
3352     goto out_free;
3353
3354   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3355                          + ndests * 3 * sizeof (re_dfastate_t *)))
3356     dest_states = (re_dfastate_t **)
3357       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3358   else
3359     {
3360       dest_states = (re_dfastate_t **)
3361         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3362       if (BE (dest_states == NULL, 0))
3363         {
3364 out_free:
3365           if (dest_states_malloced)
3366             free (dest_states);
3367           re_node_set_free (&follows);
3368           for (i = 0; i < ndests; ++i)
3369             re_node_set_free (dests_node + i);
3370           if (dests_node_malloced)
3371             free (dests_alloc);
3372           return 0;
3373         }
3374       dest_states_malloced = true;
3375     }
3376   dest_states_word = dest_states + ndests;
3377   dest_states_nl = dest_states_word + ndests;
3378   bitset_empty (acceptable);
3379
3380   /* Then build the states for all destinations.  */
3381   for (i = 0; i < ndests; ++i)
3382     {
3383       int next_node;
3384       re_node_set_empty (&follows);
3385       /* Merge the follows of this destination states.  */
3386       for (j = 0; j < dests_node[i].nelem; ++j)
3387         {
3388           next_node = dfa->nexts[dests_node[i].elems[j]];
3389           if (next_node != -1)
3390             {
3391               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3392               if (BE (err != REG_NOERROR, 0))
3393                 goto out_free;
3394             }
3395         }
3396       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3397       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3398         goto out_free;
3399       /* If the new state has context constraint,
3400          build appropriate states for these contexts.  */
3401       if (dest_states[i]->has_constraint)
3402         {
3403           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3404                                                           CONTEXT_WORD);
3405           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3406             goto out_free;
3407
3408           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3409             need_word_trtable = 1;
3410
3411           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3412                                                         CONTEXT_NEWLINE);
3413           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3414             goto out_free;
3415         }
3416       else
3417         {
3418           dest_states_word[i] = dest_states[i];
3419           dest_states_nl[i] = dest_states[i];
3420         }
3421       bitset_merge (acceptable, dests_ch[i]);
3422     }
3423
3424   if (!BE (need_word_trtable, 0))
3425     {
3426       /* We don't care about whether the following character is a word
3427          character, or we are in a single-byte character set so we can
3428          discern by looking at the character code: allocate a
3429          256-entry transition table.  */
3430       trtable = state->trtable =
3431         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3432       if (BE (trtable == NULL, 0))
3433         goto out_free;
3434
3435       /* For all characters ch...:  */
3436       for (i = 0; i < BITSET_WORDS; ++i)
3437         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3438              elem;
3439              mask <<= 1, elem >>= 1, ++ch)
3440           if (BE (elem & 1, 0))
3441             {
3442               /* There must be exactly one destination which accepts
3443                  character ch.  See group_nodes_into_DFAstates.  */
3444               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3445                 ;
3446
3447               /* j-th destination accepts the word character ch.  */
3448               if (dfa->word_char[i] & mask)
3449                 trtable[ch] = dest_states_word[j];
3450               else
3451                 trtable[ch] = dest_states[j];
3452             }
3453     }
3454   else
3455     {
3456       /* We care about whether the following character is a word
3457          character, and we are in a multi-byte character set: discern
3458          by looking at the character code: build two 256-entry
3459          transition tables, one starting at trtable[0] and one
3460          starting at trtable[SBC_MAX].  */
3461       trtable = state->word_trtable =
3462         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3463       if (BE (trtable == NULL, 0))
3464         goto out_free;
3465
3466       /* For all characters ch...:  */
3467       for (i = 0; i < BITSET_WORDS; ++i)
3468         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3469              elem;
3470              mask <<= 1, elem >>= 1, ++ch)
3471           if (BE (elem & 1, 0))
3472             {
3473               /* There must be exactly one destination which accepts
3474                  character ch.  See group_nodes_into_DFAstates.  */
3475               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3476                 ;
3477
3478               /* j-th destination accepts the word character ch.  */
3479               trtable[ch] = dest_states[j];
3480               trtable[ch + SBC_MAX] = dest_states_word[j];
3481             }
3482     }
3483
3484   /* new line */
3485   if (bitset_contain (acceptable, NEWLINE_CHAR))
3486     {
3487       /* The current state accepts newline character.  */
3488       for (j = 0; j < ndests; ++j)
3489         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3490           {
3491             /* k-th destination accepts newline character.  */
3492             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3493             if (need_word_trtable)
3494               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3495             /* There must be only one destination which accepts
3496                newline.  See group_nodes_into_DFAstates.  */
3497             break;
3498           }
3499     }
3500
3501   if (dest_states_malloced)
3502     free (dest_states);
3503
3504   re_node_set_free (&follows);
3505   for (i = 0; i < ndests; ++i)
3506     re_node_set_free (dests_node + i);
3507
3508   if (dests_node_malloced)
3509     free (dests_alloc);
3510
3511   return 1;
3512 }
3513
3514 /* Group all nodes belonging to STATE into several destinations.
3515    Then for all destinations, set the nodes belonging to the destination
3516    to DESTS_NODE[i] and set the characters accepted by the destination
3517    to DEST_CH[i].  This function return the number of destinations.  */
3518
3519 static int
3520 internal_function
3521 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3522                             re_node_set *dests_node, bitset_t *dests_ch)
3523 {
3524   reg_errcode_t err;
3525   int result;
3526   int i, j, k;
3527   int ndests; /* Number of the destinations from `state'.  */
3528   bitset_t accepts; /* Characters a node can accept.  */
3529   const re_node_set *cur_nodes = &state->nodes;
3530   bitset_empty (accepts);
3531   ndests = 0;
3532
3533   /* For all the nodes belonging to `state',  */
3534   for (i = 0; i < cur_nodes->nelem; ++i)
3535     {
3536       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3537       re_token_type_t type = node->type;
3538       unsigned int constraint = node->constraint;
3539
3540       /* Enumerate all single byte character this node can accept.  */
3541       if (type == CHARACTER)
3542         bitset_set (accepts, node->opr.c);
3543       else if (type == SIMPLE_BRACKET)
3544         {
3545           bitset_merge (accepts, node->opr.sbcset);
3546         }
3547       else if (type == OP_PERIOD)
3548         {
3549 #ifdef RE_ENABLE_I18N
3550           if (dfa->mb_cur_max > 1)
3551             bitset_merge (accepts, dfa->sb_char);
3552           else
3553 #endif
3554             bitset_set_all (accepts);
3555           if (!(dfa->syntax & RE_DOT_NEWLINE))
3556             bitset_clear (accepts, '\n');
3557           if (dfa->syntax & RE_DOT_NOT_NULL)
3558             bitset_clear (accepts, '\0');
3559         }
3560 #ifdef RE_ENABLE_I18N
3561       else if (type == OP_UTF8_PERIOD)
3562         {
3563           memset (accepts, '\xff', sizeof (bitset_t) / 2);
3564           if (!(dfa->syntax & RE_DOT_NEWLINE))
3565             bitset_clear (accepts, '\n');
3566           if (dfa->syntax & RE_DOT_NOT_NULL)
3567             bitset_clear (accepts, '\0');
3568         }
3569 #endif
3570       else
3571         continue;
3572
3573       /* Check the `accepts' and sift the characters which are not
3574          match it the context.  */
3575       if (constraint)
3576         {
3577           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3578             {
3579               bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3580               bitset_empty (accepts);
3581               if (accepts_newline)
3582                 bitset_set (accepts, NEWLINE_CHAR);
3583               else
3584                 continue;
3585             }
3586           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3587             {
3588               bitset_empty (accepts);
3589               continue;
3590             }
3591
3592           if (constraint & NEXT_WORD_CONSTRAINT)
3593             {
3594               bitset_word_t any_set = 0;
3595               if (type == CHARACTER && !node->word_char)
3596                 {
3597                   bitset_empty (accepts);
3598                   continue;
3599                 }
3600 #ifdef RE_ENABLE_I18N
3601               if (dfa->mb_cur_max > 1)
3602                 for (j = 0; j < BITSET_WORDS; ++j)
3603                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3604               else
3605 #endif
3606                 for (j = 0; j < BITSET_WORDS; ++j)
3607                   any_set |= (accepts[j] &= dfa->word_char[j]);
3608               if (!any_set)
3609                 continue;
3610             }
3611           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3612             {
3613               bitset_word_t any_set = 0;
3614               if (type == CHARACTER && node->word_char)
3615                 {
3616                   bitset_empty (accepts);
3617                   continue;
3618                 }
3619 #ifdef RE_ENABLE_I18N
3620               if (dfa->mb_cur_max > 1)
3621                 for (j = 0; j < BITSET_WORDS; ++j)
3622                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3623               else
3624 #endif
3625                 for (j = 0; j < BITSET_WORDS; ++j)
3626                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3627               if (!any_set)
3628                 continue;
3629             }
3630         }
3631
3632       /* Then divide `accepts' into DFA states, or create a new
3633          state.  Above, we make sure that accepts is not empty.  */
3634       for (j = 0; j < ndests; ++j)
3635         {
3636           bitset_t intersec; /* Intersection sets, see below.  */
3637           bitset_t remains;
3638           /* Flags, see below.  */
3639           bitset_word_t has_intersec, not_subset, not_consumed;
3640
3641           /* Optimization, skip if this state doesn't accept the character.  */
3642           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3643             continue;
3644
3645           /* Enumerate the intersection set of this state and `accepts'.  */
3646           has_intersec = 0;
3647           for (k = 0; k < BITSET_WORDS; ++k)
3648             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3649           /* And skip if the intersection set is empty.  */
3650           if (!has_intersec)
3651             continue;
3652
3653           /* Then check if this state is a subset of `accepts'.  */
3654           not_subset = not_consumed = 0;
3655           for (k = 0; k < BITSET_WORDS; ++k)
3656             {
3657               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3658               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3659             }
3660
3661           /* If this state isn't a subset of `accepts', create a
3662              new group state, which has the `remains'. */
3663           if (not_subset)
3664             {
3665               bitset_copy (dests_ch[ndests], remains);
3666               bitset_copy (dests_ch[j], intersec);
3667               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3668               if (BE (err != REG_NOERROR, 0))
3669                 goto error_return;
3670               ++ndests;
3671             }
3672
3673           /* Put the position in the current group. */
3674           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3675           if (BE (result < 0, 0))
3676             goto error_return;
3677
3678           /* If all characters are consumed, go to next node. */
3679           if (!not_consumed)
3680             break;
3681         }
3682       /* Some characters remain, create a new group. */
3683       if (j == ndests)
3684         {
3685           bitset_copy (dests_ch[ndests], accepts);
3686           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3687           if (BE (err != REG_NOERROR, 0))
3688             goto error_return;
3689           ++ndests;
3690           bitset_empty (accepts);
3691         }
3692     }
3693   return ndests;
3694  error_return:
3695   for (j = 0; j < ndests; ++j)
3696     re_node_set_free (dests_node + j);
3697   return -1;
3698 }
3699
3700 #ifdef RE_ENABLE_I18N
3701 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3702    Return the number of the bytes the node accepts.
3703    STR_IDX is the current index of the input string.
3704
3705    This function handles the nodes which can accept one character, or
3706    one collating element like '.', '[a-z]', opposite to the other nodes
3707    can only accept one byte.  */
3708
3709 static int
3710 internal_function
3711 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3712                          const re_string_t *input, int str_idx)
3713 {
3714   const re_token_t *node = dfa->nodes + node_idx;
3715   int char_len, elem_len;
3716   int i;
3717
3718   if (BE (node->type == OP_UTF8_PERIOD, 0))
3719     {
3720       unsigned char c = re_string_byte_at (input, str_idx), d;
3721       if (BE (c < 0xc2, 1))
3722         return 0;
3723
3724       if (str_idx + 2 > input->len)
3725         return 0;
3726
3727       d = re_string_byte_at (input, str_idx + 1);
3728       if (c < 0xe0)
3729         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3730       else if (c < 0xf0)
3731         {
3732           char_len = 3;
3733           if (c == 0xe0 && d < 0xa0)
3734             return 0;
3735         }
3736       else if (c < 0xf8)
3737         {
3738           char_len = 4;
3739           if (c == 0xf0 && d < 0x90)
3740             return 0;
3741         }
3742       else if (c < 0xfc)
3743         {
3744           char_len = 5;
3745           if (c == 0xf8 && d < 0x88)
3746             return 0;
3747         }
3748       else if (c < 0xfe)
3749         {
3750           char_len = 6;
3751           if (c == 0xfc && d < 0x84)
3752             return 0;
3753         }
3754       else
3755         return 0;
3756
3757       if (str_idx + char_len > input->len)
3758         return 0;
3759
3760       for (i = 1; i < char_len; ++i)
3761         {
3762           d = re_string_byte_at (input, str_idx + i);
3763           if (d < 0x80 || d > 0xbf)
3764             return 0;
3765         }
3766       return char_len;
3767     }
3768
3769   char_len = re_string_char_size_at (input, str_idx);
3770   if (node->type == OP_PERIOD)
3771     {
3772       if (char_len <= 1)
3773         return 0;
3774       /* FIXME: I don't think this if is needed, as both '\n'
3775          and '\0' are char_len == 1.  */
3776       /* '.' accepts any one character except the following two cases.  */
3777       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3778            re_string_byte_at (input, str_idx) == '\n') ||
3779           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3780            re_string_byte_at (input, str_idx) == '\0'))
3781         return 0;
3782       return char_len;
3783     }
3784
3785   elem_len = re_string_elem_size_at (input, str_idx);
3786   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3787     return 0;
3788
3789   if (node->type == COMPLEX_BRACKET)
3790     {
3791       const re_charset_t *cset = node->opr.mbcset;
3792 # ifdef _LIBC
3793       const unsigned char *pin
3794         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3795       int j;
3796       uint32_t nrules;
3797 # endif /* _LIBC */
3798       int match_len = 0;
3799       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3800                     ? re_string_wchar_at (input, str_idx) : 0);
3801
3802       /* match with multibyte character?  */
3803       for (i = 0; i < cset->nmbchars; ++i)
3804         if (wc == cset->mbchars[i])
3805           {
3806             match_len = char_len;
3807             goto check_node_accept_bytes_match;
3808           }
3809       /* match with character_class?  */
3810       for (i = 0; i < cset->nchar_classes; ++i)
3811         {
3812           wctype_t wt = cset->char_classes[i];
3813           if (__iswctype (wc, wt))
3814             {
3815               match_len = char_len;
3816               goto check_node_accept_bytes_match;
3817             }
3818         }
3819
3820 # ifdef _LIBC
3821       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3822       if (nrules != 0)
3823         {
3824           unsigned int in_collseq = 0;
3825           const int32_t *table, *indirect;
3826           const unsigned char *weights, *extra;
3827           const char *collseqwc;
3828           /* This #include defines a local function!  */
3829 #  include <locale/weight.h>
3830
3831           /* match with collating_symbol?  */
3832           if (cset->ncoll_syms)
3833             extra = (const unsigned char *)
3834               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3835           for (i = 0; i < cset->ncoll_syms; ++i)
3836             {
3837               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3838               /* Compare the length of input collating element and
3839                  the length of current collating element.  */
3840               if (*coll_sym != elem_len)
3841                 continue;
3842               /* Compare each bytes.  */
3843               for (j = 0; j < *coll_sym; j++)
3844                 if (pin[j] != coll_sym[1 + j])
3845                   break;
3846               if (j == *coll_sym)
3847                 {
3848                   /* Match if every bytes is equal.  */
3849                   match_len = j;
3850                   goto check_node_accept_bytes_match;
3851                 }
3852             }
3853
3854           if (cset->nranges)
3855             {
3856               if (elem_len <= char_len)
3857                 {
3858                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3859                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3860                 }
3861               else
3862                 in_collseq = find_collation_sequence_value (pin, elem_len);
3863             }
3864           /* match with range expression?  */
3865           for (i = 0; i < cset->nranges; ++i)
3866             if (cset->range_starts[i] <= in_collseq
3867                 && in_collseq <= cset->range_ends[i])
3868               {
3869                 match_len = elem_len;
3870                 goto check_node_accept_bytes_match;
3871               }
3872
3873           /* match with equivalence_class?  */
3874           if (cset->nequiv_classes)
3875             {
3876               const unsigned char *cp = pin;
3877               table = (const int32_t *)
3878                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3879               weights = (const unsigned char *)
3880                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3881               extra = (const unsigned char *)
3882                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3883               indirect = (const int32_t *)
3884                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3885               int32_t idx = findidx (&cp);
3886               if (idx > 0)
3887                 for (i = 0; i < cset->nequiv_classes; ++i)
3888                   {
3889                     int32_t equiv_class_idx = cset->equiv_classes[i];
3890                     size_t weight_len = weights[idx & 0xffffff];
3891                     if (weight_len == weights[equiv_class_idx & 0xffffff]
3892                         && (idx >> 24) == (equiv_class_idx >> 24))
3893                       {
3894                         int cnt = 0;
3895
3896                         idx &= 0xffffff;
3897                         equiv_class_idx &= 0xffffff;
3898
3899                         while (cnt <= weight_len
3900                                && (weights[equiv_class_idx + 1 + cnt]
3901                                    == weights[idx + 1 + cnt]))
3902                           ++cnt;
3903                         if (cnt > weight_len)
3904                           {
3905                             match_len = elem_len;
3906                             goto check_node_accept_bytes_match;
3907                           }
3908                       }
3909                   }
3910             }
3911         }
3912       else
3913 # endif /* _LIBC */
3914         {
3915           /* match with range expression?  */
3916 #if __GNUC__ >= 2
3917           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3918 #else
3919           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3920           cmp_buf[2] = wc;
3921 #endif
3922           for (i = 0; i < cset->nranges; ++i)
3923             {
3924               cmp_buf[0] = cset->range_starts[i];
3925               cmp_buf[4] = cset->range_ends[i];
3926               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3927                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3928                 {
3929                   match_len = char_len;
3930                   goto check_node_accept_bytes_match;
3931                 }
3932             }
3933         }
3934     check_node_accept_bytes_match:
3935       if (!cset->non_match)
3936         return match_len;
3937       else
3938         {
3939           if (match_len > 0)
3940             return 0;
3941           else
3942             return (elem_len > char_len) ? elem_len : char_len;
3943         }
3944     }
3945   return 0;
3946 }
3947
3948 # ifdef _LIBC
3949 static unsigned int
3950 internal_function
3951 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3952 {
3953   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3954   if (nrules == 0)
3955     {
3956       if (mbs_len == 1)
3957         {
3958           /* No valid character.  Match it as a single byte character.  */
3959           const unsigned char *collseq = (const unsigned char *)
3960             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3961           return collseq[mbs[0]];
3962         }
3963       return UINT_MAX;
3964     }
3965   else
3966     {
3967       int32_t idx;
3968       const unsigned char *extra = (const unsigned char *)
3969         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3970       int32_t extrasize = (const unsigned char *)
3971         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3972
3973       for (idx = 0; idx < extrasize;)
3974         {
3975           int mbs_cnt, found = 0;
3976           int32_t elem_mbs_len;
3977           /* Skip the name of collating element name.  */
3978           idx = idx + extra[idx] + 1;
3979           elem_mbs_len = extra[idx++];
3980           if (mbs_len == elem_mbs_len)
3981             {
3982               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3983                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3984                   break;
3985               if (mbs_cnt == elem_mbs_len)
3986                 /* Found the entry.  */
3987                 found = 1;
3988             }
3989           /* Skip the byte sequence of the collating element.  */
3990           idx += elem_mbs_len;
3991           /* Adjust for the alignment.  */
3992           idx = (idx + 3) & ~3;
3993           /* Skip the collation sequence value.  */
3994           idx += sizeof (uint32_t);
3995           /* Skip the wide char sequence of the collating element.  */
3996           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3997           /* If we found the entry, return the sequence value.  */
3998           if (found)
3999             return *(uint32_t *) (extra + idx);
4000           /* Skip the collation sequence value.  */
4001           idx += sizeof (uint32_t);
4002         }
4003       return UINT_MAX;
4004     }
4005 }
4006 # endif /* _LIBC */
4007 #endif /* RE_ENABLE_I18N */
4008
4009 /* Check whether the node accepts the byte which is IDX-th
4010    byte of the INPUT.  */
4011
4012 static int
4013 internal_function
4014 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4015                    int idx)
4016 {
4017   unsigned char ch;
4018   ch = re_string_byte_at (&mctx->input, idx);
4019   switch (node->type)
4020     {
4021     case CHARACTER:
4022       if (node->opr.c != ch)
4023         return 0;
4024       break;
4025
4026     case SIMPLE_BRACKET:
4027       if (!bitset_contain (node->opr.sbcset, ch))
4028         return 0;
4029       break;
4030
4031 #ifdef RE_ENABLE_I18N
4032     case OP_UTF8_PERIOD:
4033       if (ch >= 0x80)
4034         return 0;
4035       /* FALLTHROUGH */
4036 #endif
4037     case OP_PERIOD:
4038       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4039           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4040         return 0;
4041       break;
4042
4043     default:
4044       return 0;
4045     }
4046
4047   if (node->constraint)
4048     {
4049       /* The node has constraints.  Check whether the current context
4050          satisfies the constraints.  */
4051       unsigned int context = re_string_context_at (&mctx->input, idx,
4052                                                    mctx->eflags);
4053       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4054         return 0;
4055     }
4056
4057   return 1;
4058 }
4059
4060 /* Extend the buffers, if the buffers have run out.  */
4061
4062 static reg_errcode_t
4063 internal_function
4064 extend_buffers (re_match_context_t *mctx)
4065 {
4066   reg_errcode_t ret;
4067   re_string_t *pstr = &mctx->input;
4068
4069   /* Double the lengthes of the buffers.  */
4070   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4071   if (BE (ret != REG_NOERROR, 0))
4072     return ret;
4073
4074   if (mctx->state_log != NULL)
4075     {
4076       /* And double the length of state_log.  */
4077       /* XXX We have no indication of the size of this buffer.  If this
4078          allocation fail we have no indication that the state_log array
4079          does not have the right size.  */
4080       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4081                                               pstr->bufs_len + 1);
4082       if (BE (new_array == NULL, 0))
4083         return REG_ESPACE;
4084       mctx->state_log = new_array;
4085     }
4086
4087   /* Then reconstruct the buffers.  */
4088   if (pstr->icase)
4089     {
4090 #ifdef RE_ENABLE_I18N
4091       if (pstr->mb_cur_max > 1)
4092         {
4093           ret = build_wcs_upper_buffer (pstr);
4094           if (BE (ret != REG_NOERROR, 0))
4095             return ret;
4096         }
4097       else
4098 #endif /* RE_ENABLE_I18N  */
4099         build_upper_buffer (pstr);
4100     }
4101   else
4102     {
4103 #ifdef RE_ENABLE_I18N
4104       if (pstr->mb_cur_max > 1)
4105         build_wcs_buffer (pstr);
4106       else
4107 #endif /* RE_ENABLE_I18N  */
4108         {
4109           if (pstr->trans != NULL)
4110             re_string_translate_buffer (pstr);
4111         }
4112     }
4113   return REG_NOERROR;
4114 }
4115
4116 \f
4117 /* Functions for matching context.  */
4118
4119 /* Initialize MCTX.  */
4120
4121 static reg_errcode_t
4122 internal_function
4123 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4124 {
4125   mctx->eflags = eflags;
4126   mctx->match_last = -1;
4127   if (n > 0)
4128     {
4129       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4130       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4131       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4132         return REG_ESPACE;
4133     }
4134   /* Already zero-ed by the caller.
4135      else
4136        mctx->bkref_ents = NULL;
4137      mctx->nbkref_ents = 0;
4138      mctx->nsub_tops = 0;  */
4139   mctx->abkref_ents = n;
4140   mctx->max_mb_elem_len = 1;
4141   mctx->asub_tops = n;
4142   return REG_NOERROR;
4143 }
4144
4145 /* Clean the entries which depend on the current input in MCTX.
4146    This function must be invoked when the matcher changes the start index
4147    of the input, or changes the input string.  */
4148
4149 static void
4150 internal_function
4151 match_ctx_clean (re_match_context_t *mctx)
4152 {
4153   int st_idx;
4154   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4155     {
4156       int sl_idx;
4157       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4158       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4159         {
4160           re_sub_match_last_t *last = top->lasts[sl_idx];
4161           re_free (last->path.array);
4162           re_free (last);
4163         }
4164       re_free (top->lasts);
4165       if (top->path)
4166         {
4167           re_free (top->path->array);
4168           re_free (top->path);
4169         }
4170       free (top);
4171     }
4172
4173   mctx->nsub_tops = 0;
4174   mctx->nbkref_ents = 0;
4175 }
4176
4177 /* Free all the memory associated with MCTX.  */
4178
4179 static void
4180 internal_function
4181 match_ctx_free (re_match_context_t *mctx)
4182 {
4183   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4184   match_ctx_clean (mctx);
4185   re_free (mctx->sub_tops);
4186   re_free (mctx->bkref_ents);
4187 }
4188
4189 /* Add a new backreference entry to MCTX.
4190    Note that we assume that caller never call this function with duplicate
4191    entry, and call with STR_IDX which isn't smaller than any existing entry.
4192 */
4193
4194 static reg_errcode_t
4195 internal_function
4196 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4197                      int to)
4198 {
4199   if (mctx->nbkref_ents >= mctx->abkref_ents)
4200     {
4201       struct re_backref_cache_entry* new_entry;
4202       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4203                               mctx->abkref_ents * 2);
4204       if (BE (new_entry == NULL, 0))
4205         {
4206           re_free (mctx->bkref_ents);
4207           return REG_ESPACE;
4208         }
4209       mctx->bkref_ents = new_entry;
4210       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4211               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4212       mctx->abkref_ents *= 2;
4213     }
4214   if (mctx->nbkref_ents > 0
4215       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4216     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4217
4218   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4219   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4220   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4221   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4222
4223   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4224      If bit N is clear, means that this entry won't epsilon-transition to
4225      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4226      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4227      such node.
4228
4229      A backreference does not epsilon-transition unless it is empty, so set
4230      to all zeros if FROM != TO.  */
4231   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4232     = (from == to ? ~0 : 0);
4233
4234   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4235   if (mctx->max_mb_elem_len < to - from)
4236     mctx->max_mb_elem_len = to - from;
4237   return REG_NOERROR;
4238 }
4239
4240 /* Search for the first entry which has the same str_idx, or -1 if none is
4241    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4242
4243 static int
4244 internal_function
4245 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4246 {
4247   int left, right, mid, last;
4248   last = right = mctx->nbkref_ents;
4249   for (left = 0; left < right;)
4250     {
4251       mid = (left + right) / 2;
4252       if (mctx->bkref_ents[mid].str_idx < str_idx)
4253         left = mid + 1;
4254       else
4255         right = mid;
4256     }
4257   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4258     return left;
4259   else
4260     return -1;
4261 }
4262
4263 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4264    at STR_IDX.  */
4265
4266 static reg_errcode_t
4267 internal_function
4268 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4269 {
4270 #ifdef DEBUG
4271   assert (mctx->sub_tops != NULL);
4272   assert (mctx->asub_tops > 0);
4273 #endif
4274   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4275     {
4276       int new_asub_tops = mctx->asub_tops * 2;
4277       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4278                                                    re_sub_match_top_t *,
4279                                                    new_asub_tops);
4280       if (BE (new_array == NULL, 0))
4281         return REG_ESPACE;
4282       mctx->sub_tops = new_array;
4283       mctx->asub_tops = new_asub_tops;
4284     }
4285   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4286   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4287     return REG_ESPACE;
4288   mctx->sub_tops[mctx->nsub_tops]->node = node;
4289   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4290   return REG_NOERROR;
4291 }
4292
4293 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4294    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4295
4296 static re_sub_match_last_t *
4297 internal_function
4298 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4299 {
4300   re_sub_match_last_t *new_entry;
4301   if (BE (subtop->nlasts == subtop->alasts, 0))
4302     {
4303       int new_alasts = 2 * subtop->alasts + 1;
4304       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4305                                                     re_sub_match_last_t *,
4306                                                     new_alasts);
4307       if (BE (new_array == NULL, 0))
4308         return NULL;
4309       subtop->lasts = new_array;
4310       subtop->alasts = new_alasts;
4311     }
4312   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4313   if (BE (new_entry != NULL, 1))
4314     {
4315       subtop->lasts[subtop->nlasts] = new_entry;
4316       new_entry->node = node;
4317       new_entry->str_idx = str_idx;
4318       ++subtop->nlasts;
4319     }
4320   return new_entry;
4321 }
4322
4323 static void
4324 internal_function
4325 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4326                re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4327 {
4328   sctx->sifted_states = sifted_sts;
4329   sctx->limited_states = limited_sts;
4330   sctx->last_node = last_node;
4331   sctx->last_str_idx = last_str_idx;
4332   re_node_set_init_empty (&sctx->limits);
4333 }