posix/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                      int n) internal_function;
  23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24 static void match_ctx_free (re_match_context_t *cache) internal_function;
  25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  26                                           int str_idx, int from, int to)
  27      internal_function;
  28 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
  29      internal_function;
  30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  31                                            int str_idx) internal_function;
  32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  33                                                    int node, int str_idx)
  34      internal_function;
  35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  36                            re_dfastate_t **limited_sts, int last_node,
  37                            int last_str_idx)
  38      internal_function;
  39 static reg_errcode_t re_search_internal (const regex_t *preg,
  40                                          const char *string, int length,
  41                                          int start, int range, int stop,
  42                                          size_t nmatch, regmatch_t pmatch[],
  43                                          int eflags) internal_function;
  44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  45                              const char *string1, int length1,
  46                              const char *string2, int length2,
  47                              int start, int range, struct re_registers *regs,
  48                              int stop, int ret_len) internal_function;
  49 static int re_search_stub (struct re_pattern_buffer *bufp,
  50                            const char *string, int length, int start,
  51                            int range, int stop, struct re_registers *regs,
  52                            int ret_len) internal_function;
  53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  54                               int nregs, int regs_allocated) internal_function;
  55 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  56      internal_function;
  57 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  58                            int *p_match_first) internal_function;
  59 static int check_halt_state_context (const re_match_context_t *mctx,
  60                                      const re_dfastate_t *state, int idx)
  61      internal_function;
  62 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  63                          regmatch_t *prev_idx_match, int cur_node,
  64                          int cur_idx, int nmatch) internal_function;
  65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  66                                       int str_idx, int dest_node, int nregs,
  67                                       regmatch_t *regs,
  68                                       re_node_set *eps_via_nodes)
  69      internal_function;
  70 static reg_errcode_t set_regs (const regex_t *preg,
  71                                const re_match_context_t *mctx,
  72                                size_t nmatch, regmatch_t *pmatch,
  73                                int fl_backtrack) internal_function;
  74 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
  75      internal_function;
  76
  77 #ifdef RE_ENABLE_I18N
  78 static int sift_states_iter_mb (const re_match_context_t *mctx,
  79                                 re_sift_context_t *sctx,
  80                                 int node_idx, int str_idx, int max_str_idx)
  81      internal_function;
  82 #endif /* RE_ENABLE_I18N */
  83 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  84                                            re_sift_context_t *sctx)
  85      internal_function;
  86 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  87                                           re_sift_context_t *sctx, int str_idx,
  88                                           re_node_set *cur_dest)
  89      internal_function;
  90 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  91                                               re_sift_context_t *sctx,
  92                                               int str_idx,
  93                                               re_node_set *dest_nodes)
  94      internal_function;
  95 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  96                                             re_node_set *dest_nodes,
  97                                             const re_node_set *candidates)
  98      internal_function;
  99 static int check_dst_limits (const re_match_context_t *mctx,
 100                              re_node_set *limits,
 101                              int dst_node, int dst_idx, int src_node,
 102                              int src_idx) internal_function;
 103 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
 104                                         int boundaries, int subexp_idx,
 105                                         int from_node, int bkref_idx)
 106      internal_function;
 107 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
 108                                       int limit, int subexp_idx,
 109                                       int node, int str_idx,
 110                                       int bkref_idx) internal_function;
 111 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
 112                                           re_node_set *dest_nodes,
 113                                           const re_node_set *candidates,
 114                                           re_node_set *limits,
 115                                           struct re_backref_cache_entry *bkref_ents,
 116                                           int str_idx) internal_function;
 117 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 118                                         re_sift_context_t *sctx,
 119                                         int str_idx, const re_node_set *candidates)
 120      internal_function;
 121 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 122                                         re_dfastate_t **dst,
 123                                         re_dfastate_t **src, int num)
 124      internal_function;
 125 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 126                                          re_match_context_t *mctx) internal_function;
 127 static re_dfastate_t *transit_state (reg_errcode_t *err,
 128                                      re_match_context_t *mctx,
 129                                      re_dfastate_t *state) internal_function;
 130 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 131                                             re_match_context_t *mctx,
 132                                             re_dfastate_t *next_state)
 133      internal_function;
 134 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 135                                                 re_node_set *cur_nodes,
 136                                                 int str_idx) internal_function;
 137 #if 0
 138 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 139                                         re_match_context_t *mctx,
 140                                         re_dfastate_t *pstate)
 141      internal_function;
 142 #endif
 143 #ifdef RE_ENABLE_I18N
 144 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 145                                        re_dfastate_t *pstate)
 146      internal_function;
 147 #endif /* RE_ENABLE_I18N */
 148 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 149                                           const re_node_set *nodes)
 150      internal_function;
 151 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 152                                  int bkref_node, int bkref_str_idx)
 153      internal_function;
 154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 155                                      const re_sub_match_top_t *sub_top,
 156                                      re_sub_match_last_t *sub_last,
 157                                      int bkref_node, int bkref_str)
 158      internal_function;
 159 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 160                              int subexp_idx, int type) internal_function;
 161 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 162                                     state_array_t *path, int top_node,
 163                                     int top_str, int last_node, int last_str,
 164                                     int type) internal_function;
 165 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 166                                                    int str_idx,
 167                                                    re_node_set *cur_nodes,
 168                                                    re_node_set *next_nodes)
 169      internal_function;
 170 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 171                                                re_node_set *cur_nodes,
 172                                                int ex_subexp, int type)
 173      internal_function;
 174 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 175                                                    re_node_set *dst_nodes,
 176                                                    int target, int ex_subexp,
 177                                                    int type) internal_function;
 178 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 179                                          re_node_set *cur_nodes, int cur_str,
 180                                          int subexp_num, int type)
 181      internal_function;
 182 static int build_trtable (const re_dfa_t *dfa,
 183                           re_dfastate_t *state) internal_function;
 184 #ifdef RE_ENABLE_I18N
 185 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 186                                     const re_string_t *input, int idx)
 187      internal_function;
 188 # ifdef _LIBC
 189 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 190                                                    size_t name_len)
 191      internal_function;
 192 # endif /* _LIBC */
 193 #endif /* RE_ENABLE_I18N */
 194 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
 195                                        const re_dfastate_t *state,
 196                                        re_node_set *states_node,
 197                                        bitset_t *states_ch) internal_function;
 198 static int check_node_accept (const re_match_context_t *mctx,
 199                               const re_token_t *node, int idx)
 200      internal_function;
 201 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
 202      internal_function;
 203 \f
 204 /* Entry point for POSIX code.  */
 205
 206 /* POSIX entry point: search STRING for the pattern compiled into PREG.
 207
 208    EFLAGS may combine REG_NOTBOL (^ does not match at the beginning of
 209    the string), REG_NOTEOL ($ does not match at the end) and
 210    REG_STARTEND; any other bit is rejected with REG_BADPAT.  Under
 211    REG_STARTEND the search starts at offset PMATCH[0].rm_so and the
 212    subject ends at offset PMATCH[0].rm_eo; otherwise the subject is the
 213    whole NUL-terminated STRING.
 214
 215    NMATCH and PMATCH are handed to the matcher, which records the offsets
 216    of the matched substrings there; PMATCH is assumed to have at least
 217    NMATCH elements.  When PREG->no_sub is set, no offsets are requested.
 218
 219    Returns 0 if a match was found and 1 otherwise: the internal status
 220    is only compared with REG_NOERROR, so a failed search is reported the
 221    same way as a mismatch.  */
 222
 223 int
 224 regexec (const regex_t *__restrict preg, const char *__restrict string,
 225          size_t nmatch, regmatch_t pmatch[], int eflags)
 226 {
 227   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 228   reg_errcode_t status;
 229   int first, len;
 230
 231   /* Refuse execution flags other than the three handled below.  */
 232   if ((eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND)) != 0)
 233     return REG_BADPAT;
 234
 235   /* Pick the part of STRING that is searched.  */
 236   if ((eflags & REG_STARTEND) == 0)
 237     {
 238       first = 0;
 239       len = strlen (string);
 240     }
 241   else
 242     {
 243       first = pmatch[0].rm_so;
 244       len = pmatch[0].rm_eo;
 245     }
 246
 247   /* The search runs with the DFA lock held; a pattern with no_sub set
 248      is searched without any match registers.  */
 249   __libc_lock_lock (dfa->lock);
 250   status = re_search_internal (preg, string, len, first, len - first, len,
 251                                preg->no_sub ? 0 : nmatch,
 252                                preg->no_sub ? NULL : pmatch, eflags);
 253   __libc_lock_unlock (dfa->lock);
 254   return status != REG_NOERROR;
 255 }
 256
 257 #ifdef _LIBC
 258 # include <shlib-compat.h>
 259 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 260 /* Compatibility variant of regexec, tied to GLIBC_2_0 by compat_symbol.  */
 261 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 262 __typeof__ (__regexec) __compat_regexec;
 263 /* Calls regexec with only the REG_NOTBOL and REG_NOTEOL bits of EFLAGS.  */
 264 int
 265 attribute_compat_text_section
 266 __compat_regexec (const regex_t *__restrict preg,
 267                   const char *__restrict string, size_t nmatch,
 268                   regmatch_t pmatch[], int eflags)
 269 {
 270   return regexec (preg, string, nmatch, pmatch,
 271                   eflags & (REG_NOTBOL | REG_NOTEOL));
 272 }
 273 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 274 # endif /* SHLIB_COMPAT */
 275 #endif /* _LIBC */
 276
 277 /* Entry points for GNU code.  */
 278
 279 /* re_match, re_search, re_match_2, re_search_2
 280
 281    The former two functions operate on STRING with length LENGTH,
 282    while the latter two operate on the concatenation of STRING1 and STRING2
 283    with lengths LENGTH1 and LENGTH2, respectively.
 284
 285    re_match() matches the compiled pattern in BUFP against the string,
 286    starting at index START.
 287
 288    re_search() first tries matching at index START, then it tries to match
 289    starting from index START + 1, and so on.  The last start position tried
 290    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 291    way as re_match().)
 292
 293    The parameter STOP of re_{match,search}_2 specifies that no match
 294    extending beyond the first STOP characters of the concatenation of the
 295    strings should be considered.
 296
 297    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 298    and of all groups are stored in REGS.  (For the "_2" variants, the offsets
 299    are computed relative to the concatenation, not relative to the individual
 300    strings.)
 301
 302    On success, re_match* functions return the length of the match, re_search*
 303    return the position of the start of the match.  Return value -1 means no
 304    match was found and -2 indicates an internal error.  */
 305
 306 /* Match BUFP against STRING at offset START only (a search whose RANGE
 307    is 0), limited to LENGTH; the length of the match is returned.  */
 308 int
 309 re_match (struct re_pattern_buffer *bufp, const char *string, int length,
 310           int start, struct re_registers *regs)
 311 {
 312   return re_search_stub (bufp, string, length, start, /* range */ 0,
 313                          /* stop */ length, regs, /* ret_len */ 1);
 314 }
 315 #ifdef _LIBC
 316 weak_alias (__re_match, re_match)
 317 #endif
 318
 319 /* Search STRING (of length LENGTH) for BUFP, trying start offsets from
 320    START through START + RANGE; the offset of the match is returned.  */
 321 int
 322 re_search (struct re_pattern_buffer *bufp, const char *string, int length,
 323            int start, int range, struct re_registers *regs)
 324 {
 325   return re_search_stub (bufp, string, length, start, range,
 326                          /* stop */ length, regs, /* ret_len */ 0);
 327 }
 328 #ifdef _LIBC
 329 weak_alias (__re_search, re_search)
 330 #endif
 331
 332 /* Match BUFP at offset START only (RANGE 0) in the concatenation of
 333    STRING1 and STRING2; STOP is passed on and the match length returned.  */
 334 int
 335 re_match_2 (struct re_pattern_buffer *bufp, const char *string1,
 336             int length1, const char *string2, int length2, int start,
 337             struct re_registers *regs, int stop)
 338 {
 339   return re_search_2_stub (bufp, string1, length1, string2, length2, start,
 340                            /* range */ 0, regs, stop, /* ret_len */ 1);
 341 }
 342 #ifdef _LIBC
 343 weak_alias (__re_match_2, re_match_2)
 344 #endif
 345
 346 /* Search the concatenation of STRING1 and STRING2 for BUFP, from START
 347    over RANGE; STOP is passed on and the match position is returned.  */
 348 int
 349 re_search_2 (struct re_pattern_buffer *bufp, const char *string1,
 350              int length1, const char *string2, int length2, int start,
 351              int range, struct re_registers *regs, int stop)
 352 {
 353   return re_search_2_stub (bufp, string1, length1, string2, length2, start,
 354                            range, regs, stop, /* ret_len */ 0);
 355 }
 356 #ifdef _LIBC
 357 weak_alias (__re_search_2, re_search_2)
 358 #endif
 359
 360 /* Common code of re_match_2 and re_search_2: run re_search_stub on the
 361    concatenation of STRING1 (LENGTH1 bytes) and STRING2 (LENGTH2 bytes).
 362    If both parts are non-empty they are copied into a temporary buffer,
 363    freed before returning; otherwise the non-empty part is used as is.
 364    Returns the result of re_search_stub, or -2 if a length or STOP is
 365    negative, the total length overflows int, or allocation fails.  */
 366 static int
 367 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
 368                   int length1, const char *string2, int length2, int start,
 369                   int range, struct re_registers *regs, int stop,
 370                   int ret_len)
 371 {
 372   const char *str;
 373   char *s = NULL;
 374   unsigned int total;
 375   int len, rval;
 376
 377   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 378     return -2;
 379   /* Add as unsigned so nothing overflows; the bound is the largest int.  */
 380   total = (unsigned int) length1 + (unsigned int) length2;
 381   if (BE (total > ((unsigned int) -1 >> 1), 0))
 382     return -2;
 383   len = (int) total;
 384
 385   if (length1 > 0 && length2 > 0)
 386     {
 387       s = re_malloc (char, len);
 388       if (BE (s == NULL, 0))
 389         return -2;
 390       memcpy (s, string1, length1);
 391       memcpy (s + length1, string2, length2);
 392       str = s;
 393     }
 394   else
 395     str = length2 > 0 ? string2 : string1;
 396
 397   rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
 398   re_free (s);
 399   return rval;
 400 }
 401
 402 /* Common code of re_match and re_search.  BUFP, STRING, LENGTH, START,
 403    RANGE and REGS have the same meaning as for re_search; STOP is passed
 404    to re_search_internal as the search limit.  If RET_LEN is nonzero the
 405    length of the match is returned (re_match style); otherwise the
 406    position of the match is returned.  Returns -1 if START is outside
 407    [0, LENGTH] or nothing matched, and -2 if memory ran out or the
 408    matcher failed with anything other than REG_NOMATCH.  */
 409 static int
 410 re_search_stub (struct re_pattern_buffer *bufp, const char *string,
 411                 int length, int start, int range, int stop,
 412                 struct re_registers *regs, int ret_len)
 413 {
 414   reg_errcode_t result;
 415   regmatch_t *pmatch;
 416   int nregs, rval;
 417   int eflags = 0;
 418   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 419
 420   /* Check for out-of-range.  */
 421   if (BE (start < 0 || start > length, 0))
 422     return -1;
 423   /* Clamp RANGE so that START + RANGE lies in [0, LENGTH].  The sum is
 424      never formed, since it could overflow; LENGTH - START and -START
 425      cannot, because 0 <= START <= LENGTH holds here.  */
 426   if (BE (range > length - start, 0))
 427     range = length - start;
 428   else if (BE (range < -start, 0))
 429     range = -start;
 430
 431   __libc_lock_lock (dfa->lock);
 432
 433   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 434   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 435
 436   /* Compile fastmap if we haven't yet.  */
 437   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 438     re_compile_fastmap (bufp);
 439
 440   if (BE (bufp->no_sub, 0))
 441     regs = NULL;
 442
 443   /* We need at least 1 register.  */
 444   if (regs == NULL)
 445     nregs = 1;
 446   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 447                regs->num_regs < bufp->re_nsub + 1, 0))
 448     {
 449       /* REGS is fixed and too small for every group: fill what fits.  */
 450       nregs = regs->num_regs;
 451       if (BE (nregs < 1, 0))
 452         {
 453           /* Nothing can be copied to regs.  */
 454           regs = NULL;
 455           nregs = 1;
 456         }
 457     }
 458   else
 459     nregs = bufp->re_nsub + 1;
 460   pmatch = re_malloc (regmatch_t, nregs);
 461   if (BE (pmatch == NULL, 0))
 462     {
 463       rval = -2;
 464       goto out;
 465     }
 466
 467   result = re_search_internal (bufp, string, length, start, range, stop,
 468                                nregs, pmatch, eflags);
 469
 470   rval = 0;
 471   /* I hope we needn't fill their regs with -1's when no match was found.
 472      Only REG_NOMATCH means "no match"; any other failure is an error.  */
 473   if (result != REG_NOERROR)
 474     rval = (result == REG_NOMATCH) ? -1 : -2;
 475   else if (regs != NULL)
 476     {
 477       /* If caller wants register contents data back, copy them.  */
 478       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 479                                            bufp->regs_allocated);
 480       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 481         rval = -2;
 482     }
 483
 484   if (BE (rval == 0, 1))
 485     {
 486       /* RET_LEN callers search with RANGE 0, so a match begins at START.  */
 487       assert (!ret_len || pmatch[0].rm_so == start);
 488       rval = ret_len ? pmatch[0].rm_eo - start : pmatch[0].rm_so;
 489     }
 490   re_free (pmatch);
 491  out:
 492   __libc_lock_unlock (dfa->lock);
 493   return rval;
 494 }
 495
 496 /* Copy the NREGS match offsets in PMATCH into REGS, allocating or
 497    growing REGS->start and REGS->end as REGS_ALLOCATED permits, and set
 498    the remaining registers to -1.  One element beyond NREGS is requested
 499    for the `-1' marker GNU code uses.  Returns the new allocation state,
 500    or REGS_UNALLOCATED if memory ran out; a failed call never leaves
 501    REGS->start or REGS->end pointing at memory this function freed.  */
 502 static unsigned
 503 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, int nregs,
 504               int regs_allocated)
 505 {
 506   int rval = REGS_REALLOCATE;
 507   int i;
 508   int need_regs = nregs + 1;
 509
 510   /* Have the register data arrays been allocated?  */
 511   if (regs_allocated == REGS_UNALLOCATED)
 512     { /* No.  So allocate them with malloc.  */
 513       regs->start = re_malloc (regoff_t, need_regs);
 514       if (BE (regs->start == NULL, 0))
 515         return REGS_UNALLOCATED;
 516       regs->end = re_malloc (regoff_t, need_regs);
 517       if (BE (regs->end == NULL, 0))
 518         {
 519           re_free (regs->start);
 520           regs->start = NULL;
 521           return REGS_UNALLOCATED;
 522         }
 523       regs->num_regs = need_regs;
 524     }
 525   else if (regs_allocated == REGS_REALLOCATE)
 526     { /* Yes.  Grow them if more elements are needed than were already
 527          allocated; if fewer are needed, leave them alone.  */
 528       if (BE (need_regs > regs->num_regs, 0))
 529         {
 530           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 531           regoff_t *new_end;
 532           if (BE (new_start == NULL, 0))
 533             return REGS_UNALLOCATED;
 534           /* The old block may be gone now: record its replacement at once
 535              so REGS->start stays valid even if the next step fails.  */
 536           regs->start = new_start;
 537           new_end = re_realloc (regs->end, regoff_t, need_regs);
 538           if (BE (new_end == NULL, 0))
 539             return REGS_UNALLOCATED;
 540           regs->end = new_end;
 541           regs->num_regs = need_regs;
 542         }
 543     }
 544   else
 545     {
 546       assert (regs_allocated == REGS_FIXED);
 547       /* This function may not be called with REGS_FIXED and nregs too big.  */
 548       assert (regs->num_regs >= nregs);
 549       rval = REGS_FIXED;
 550     }
 551
 552   /* Copy the regs.  */
 553   for (i = 0; i < nregs; ++i)
 554     {
 555       regs->start[i] = pmatch[i].rm_so;
 556       regs->end[i] = pmatch[i].rm_eo;
 557     }
 558   for ( ; i < regs->num_regs; ++i)
 559     regs->start[i] = regs->end[i] = -1;
 560   return rval;
 561 }
 562
 563 /* Install caller-provided register storage: REGS is set to hold NUM_REGS
 564    registers kept in STARTS and ENDS, and BUFP is marked so that
 565    subsequent matches using BUFP and REGS record register information
 566    in that memory.  STARTS and ENDS must be allocated using the malloc
 567    library routine, and must each be at least
 568    NUM_REGS * sizeof (regoff_t) bytes long.
 569
 570    If NUM_REGS == 0, REGS is emptied instead (STARTS and ENDS are not
 571    used) and subsequent matches allocate their own register data.
 572
 573    Unless this function is called, the first search or match using BUFP
 574    will allocate its own register data, without freeing the old data.  */
 575
 576 void
 577 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
 578                   unsigned num_regs, regoff_t *starts, regoff_t *ends)
 579 {
 580   if (num_regs == 0)
 581     {
 582       /* Empty register set: the next match allocates storage itself.  */
 583       bufp->regs_allocated = REGS_UNALLOCATED;
 584       regs->num_regs = 0;
 585       regs->start = (regoff_t *) 0;
 586       regs->end = (regoff_t *) 0;
 587       return;
 588     }
 589
 590   /* Caller-supplied arrays: REGS_REALLOCATE lets later matches grow them
 591      (see re_copy_regs).  */
 592   bufp->regs_allocated = REGS_REALLOCATE;
 593   regs->num_regs = num_regs;
 594   regs->start = starts;
 595   regs->end = ends;
 596 }
 597 #ifdef _LIBC
 598 weak_alias (__re_set_registers, re_set_registers)
 599 #endif
 600 \f
 601 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 602    them unless specifically requested.  */
 603
 604 #if defined _REGEX_RE_COMP || defined _LIBC
 605 /* Return 1 if regexec finds the pattern in re_comp_buf in S, else 0.  */
 606 int
 607 # ifdef _LIBC
 608 weak_function
 609 # endif
 610 re_exec (const char *s)
 611 {
 612   return regexec (&re_comp_buf, s, 0, NULL, 0) == 0;
 613 }
 614 #endif /* _REGEX_RE_COMP */
 615 \f
 616 /* Internal entry point.  */
 617
 618 /* Searches for a compiled pattern PREG in the string STRING, whose
 619    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 620    meanings as in regexec.  START and RANGE have the same meanings
 621    as in re_search.
 622    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 623    otherwise return the error code.
 624    Note: We assume front end functions already check ranges.
 625    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 626
 627 static reg_errcode_t
 628 __attribute_warn_unused_result__
 629 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 630                     eflags)
 631     const regex_t *preg;
 632     const char *string;
 633     int length, start, range, stop, eflags;
 634     size_t nmatch;
 635     regmatch_t pmatch[];
 636 {
 637   reg_errcode_t err;
 638   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
 639   int left_lim, right_lim, incr;
 640   int fl_longest_match, match_first, match_kind, match_last = -1;
 641   int extra_nmatch;
 642   int sb, ch;
 643 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 644   re_match_context_t mctx = { .dfa = dfa };
 645 #else
 646   re_match_context_t mctx;
 647 #endif
 648   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 649                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 650   RE_TRANSLATE_TYPE t = preg->translate;
 651
 652 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 653   memset (&mctx, '\0', sizeof (re_match_context_t));
 654   mctx.dfa = dfa;
 655 #endif
 656
 657   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 658   nmatch -= extra_nmatch;
 659
 660   /* Check if the DFA haven't been compiled.  */
 661   if (BE (preg->used == 0 || dfa->init_state == NULL
 662           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 663           || dfa->init_state_begbuf == NULL, 0))
 664     return REG_NOMATCH;
 665
 666 #ifdef DEBUG
 667   /* We assume front-end functions already check them.  */
 668   assert (start + range >= 0 && start + range <= length);
 669 #endif
 670
 671   /* If initial states with non-begbuf contexts have no elements,
 672      the regex must be anchored.  If preg->newline_anchor is set,
 673      we'll never use init_state_nl, so do not check it.  */
 674   if (dfa->init_state->nodes.nelem == 0
 675       && dfa->init_state_word->nodes.nelem == 0
 676       && (dfa->init_state_nl->nodes.nelem == 0
 677           || !preg->newline_anchor))
 678     {
 679       if (start != 0 && start + range != 0)
 680         return REG_NOMATCH;
 681       start = range = 0;
 682     }
 683
 684   /* We must check the longest matching, if nmatch > 0.  */
 685   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 686
 687   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 688                             preg->translate, preg->syntax & RE_ICASE, dfa);
 689   if (BE (err != REG_NOERROR, 0))
 690     goto free_return;
 691   mctx.input.stop = stop;
 692   mctx.input.raw_stop = stop;
 693   mctx.input.newline_anchor = preg->newline_anchor;
 694
 695   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 696   if (BE (err != REG_NOERROR, 0))
 697     goto free_return;
 698
 699   /* We will log all the DFA states through which the dfa pass,
 700      if nmatch > 1, or this dfa has "multibyte node", which is a
 701      back-reference or a node which can accept multibyte character or
 702      multi character collating element.  */
 703   if (nmatch > 1 || dfa->has_mb_node)
 704     {
 705       /* Avoid overflow.  */
 706       if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
 707         {
 708           err = REG_ESPACE;
 709           goto free_return;
 710         }
 711
 712       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 713       if (BE (mctx.state_log == NULL, 0))
 714         {
 715           err = REG_ESPACE;
 716           goto free_return;
 717         }
 718     }
 719   else
 720     mctx.state_log = NULL;
 721
 722   match_first = start;
 723   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 724                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 725
 726   /* Check incrementally whether of not the input string match.  */
 727   incr = (range < 0) ? -1 : 1;
 728   left_lim = (range < 0) ? start + range : start;
 729   right_lim = (range < 0) ? start : start + range;
 730   sb = dfa->mb_cur_max == 1;
 731   match_kind =
 732     (fastmap
 733      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 734         | (range >= 0 ? 2 : 0)
 735         | (t != NULL ? 1 : 0))
 736      : 8);
 737
 738   for (;; match_first += incr)
 739     {
 740       err = REG_NOMATCH;
 741       if (match_first < left_lim || right_lim < match_first)
 742         goto free_return;
 743
 744       /* Advance as rapidly as possible through the string, until we
 745          find a plausible place to start matching.  This may be done
 746          with varying efficiency, so there are various possibilities:
 747          only the most common of them are specialized, in order to
 748          save on code size.  We use a switch statement for speed.  */
 749       switch (match_kind)
 750         {
 751         case 8:
 752           /* No fastmap.  */
 753           break;
 754
 755         case 7:
 756           /* Fastmap with single-byte translation, match forward.  */
 757           while (BE (match_first < right_lim, 1)
 758                  && !fastmap[t[(unsigned char) string[match_first]]])
 759             ++match_first;
 760           goto forward_match_found_start_or_reached_end;
 761
 762         case 6:
 763           /* Fastmap without translation, match forward.  */
 764           while (BE (match_first < right_lim, 1)
 765                  && !fastmap[(unsigned char) string[match_first]])
 766             ++match_first;
 767
 768         forward_match_found_start_or_reached_end:
 769           if (BE (match_first == right_lim, 0))
 770             {
 771               ch = match_first >= length
 772                        ? 0 : (unsigned char) string[match_first];
 773               if (!fastmap[t ? t[ch] : ch])
 774                 goto free_return;
 775             }
 776           break;
 777
 778         case 4:
 779         case 5:
 780           /* Fastmap without multi-byte translation, match backwards.  */
 781           while (match_first >= left_lim)
 782             {
 783               ch = match_first >= length
 784                        ? 0 : (unsigned char) string[match_first];
 785               if (fastmap[t ? t[ch] : ch])
 786                 break;
 787               --match_first;
 788             }
 789           if (match_first < left_lim)
 790             goto free_return;
 791           break;
 792
 793         default:
 794           /* In this case, we can't determine easily the current byte,
 795              since it might be a component byte of a multibyte
 796              character.  Then we use the constructed buffer instead.  */
 797           for (;;)
 798             {
 799               /* If MATCH_FIRST is out of the valid range, reconstruct the
 800                  buffers.  */
 801               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 802               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 803                 {
 804                   err = re_string_reconstruct (&mctx.input, match_first,
 805                                                eflags);
 806                   if (BE (err != REG_NOERROR, 0))
 807                     goto free_return;
 808
 809                   offset = match_first - mctx.input.raw_mbs_idx;
 810                 }
 811               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 812                  Note that MATCH_FIRST must not be smaller than 0.  */
 813               ch = (match_first >= length
 814                     ? 0 : re_string_byte_at (&mctx.input, offset));
 815               if (fastmap[ch])
 816                 break;
 817               match_first += incr;
 818               if (match_first < left_lim || match_first > right_lim)
 819                 {
 820                   err = REG_NOMATCH;
 821                   goto free_return;
 822                 }
 823             }
 824           break;
 825         }
 826
 827       /* Reconstruct the buffers so that the matcher can assume that
 828          the matching starts from the beginning of the buffer.  */
 829       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 830       if (BE (err != REG_NOERROR, 0))
 831         goto free_return;
 832
 833 #ifdef RE_ENABLE_I18N
 834      /* Don't consider this char as a possible match start if it part,
 835         yet isn't the head, of a multibyte character.  */
 836       if (!sb && !re_string_first_byte (&mctx.input, 0))
 837         continue;
 838 #endif
 839
 840       /* It seems to be appropriate one, then use the matcher.  */
 841       /* We assume that the matching starts from 0.  */
 842       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 843       match_last = check_matching (&mctx, fl_longest_match,
 844                                    range >= 0 ? &match_first : NULL);
 845       if (match_last != -1)
 846         {
 847           if (BE (match_last == -2, 0))
 848             {
 849               err = REG_ESPACE;
 850               goto free_return;
 851             }
 852           else
 853             {
 854               mctx.match_last = match_last;
 855               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 856                 {
 857                   re_dfastate_t *pstate = mctx.state_log[match_last];
 858                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 859                                                              match_last);
 860                 }
 861               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 862                   || dfa->nbackref)
 863                 {
 864                   err = prune_impossible_nodes (&mctx);
 865                   if (err == REG_NOERROR)
 866                     break;
 867                   if (BE (err != REG_NOMATCH, 0))
 868                     goto free_return;
 869                   match_last = -1;
 870                 }
 871               else
 872                 break; /* We found a match.  */
 873             }
 874         }
 875
 876       match_ctx_clean (&mctx);
 877     }
 878
 879 #ifdef DEBUG
 880   assert (match_last != -1);
 881   assert (err == REG_NOERROR);
 882 #endif
 883
 884   /* Set pmatch[] if we need.  */
 885   if (nmatch > 0)
 886     {
 887       int reg_idx;
 888
 889       /* Initialize registers.  */
 890       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 891         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 892
 893       /* Set the points where matching start/end.  */
 894       pmatch[0].rm_so = 0;
 895       pmatch[0].rm_eo = mctx.match_last;
 896
 897       if (!preg->no_sub && nmatch > 1)
 898         {
 899           err = set_regs (preg, &mctx, nmatch, pmatch,
 900                           dfa->has_plural_match && dfa->nbackref > 0);
 901           if (BE (err != REG_NOERROR, 0))
 902             goto free_return;
 903         }
 904
 905       /* At last, add the offset to the each registers, since we slided
 906          the buffers so that we could assume that the matching starts
 907          from 0.  */
 908       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 909         if (pmatch[reg_idx].rm_so != -1)
 910           {
 911 #ifdef RE_ENABLE_I18N
 912             if (BE (mctx.input.offsets_needed != 0, 0))
 913               {
 914                 pmatch[reg_idx].rm_so =
 915                   (pmatch[reg_idx].rm_so == mctx.input.valid_len
 916                    ? mctx.input.valid_raw_len
 917                    : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 918                 pmatch[reg_idx].rm_eo =
 919                   (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 920                    ? mctx.input.valid_raw_len
 921                    : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 922               }
 923 #else
 924             assert (mctx.input.offsets_needed == 0);
 925 #endif
 926             pmatch[reg_idx].rm_so += match_first;
 927             pmatch[reg_idx].rm_eo += match_first;
 928           }
 929       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 930         {
 931           pmatch[nmatch + reg_idx].rm_so = -1;
 932           pmatch[nmatch + reg_idx].rm_eo = -1;
 933         }
 934
 935       if (dfa->subexp_map)
 936         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 937           if (dfa->subexp_map[reg_idx] != reg_idx)
 938             {
 939               pmatch[reg_idx + 1].rm_so
 940                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 941               pmatch[reg_idx + 1].rm_eo
 942                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 943             }
 944     }
 945
 946  free_return:
 947   re_free (mctx.state_log);
 948   if (dfa->nbackref)
 949     match_ctx_free (&mctx);
 950   re_string_destruct (&mctx.input);
 951   return err;
 952 }
 953
 954 static reg_errcode_t
 955 __attribute_warn_unused_result__
 956 prune_impossible_nodes (mctx)
 957      re_match_context_t *mctx;
 958 {
 959   const re_dfa_t *const dfa = mctx->dfa;
 960   int halt_node, match_last;
 961   reg_errcode_t ret;
 962   re_dfastate_t **sifted_states;
 963   re_dfastate_t **lim_states = NULL;
 964   re_sift_context_t sctx;
 965 #ifdef DEBUG
 966   assert (mctx->state_log != NULL);
 967 #endif
 968   match_last = mctx->match_last;
 969   halt_node = mctx->last_node;
 970
 971   /* Avoid overflow.  */
 972   if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
 973     return REG_ESPACE;
 974
 975   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 976   if (BE (sifted_states == NULL, 0))
 977     {
 978       ret = REG_ESPACE;
 979       goto free_return;
 980     }
 981   if (dfa->nbackref)
 982     {
 983       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 984       if (BE (lim_states == NULL, 0))
 985         {
 986           ret = REG_ESPACE;
 987           goto free_return;
 988         }
 989       while (1)
 990         {
 991           memset (lim_states, '\0',
 992                   sizeof (re_dfastate_t *) * (match_last + 1));
 993           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 994                          match_last);
 995           ret = sift_states_backward (mctx, &sctx);
 996           re_node_set_free (&sctx.limits);
 997           if (BE (ret != REG_NOERROR, 0))
 998               goto free_return;
 999           if (sifted_states[0] != NULL || lim_states[0] != NULL)
1000             break;
1001           do
1002             {
1003               --match_last;
1004               if (match_last < 0)
1005                 {
1006                   ret = REG_NOMATCH;
1007                   goto free_return;
1008                 }
1009             } while (mctx->state_log[match_last] == NULL
1010                      || !mctx->state_log[match_last]->halt);
1011           halt_node = check_halt_state_context (mctx,
1012                                                 mctx->state_log[match_last],
1013                                                 match_last);
1014         }
1015       ret = merge_state_array (dfa, sifted_states, lim_states,
1016                                match_last + 1);
1017       re_free (lim_states);
1018       lim_states = NULL;
1019       if (BE (ret != REG_NOERROR, 0))
1020         goto free_return;
1021     }
1022   else
1023     {
1024       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1025       ret = sift_states_backward (mctx, &sctx);
1026       re_node_set_free (&sctx.limits);
1027       if (BE (ret != REG_NOERROR, 0))
1028         goto free_return;
1029       if (sifted_states[0] == NULL)
1030         {
1031           ret = REG_NOMATCH;
1032           goto free_return;
1033         }
1034     }
1035   re_free (mctx->state_log);
1036   mctx->state_log = sifted_states;
1037   sifted_states = NULL;
1038   mctx->last_node = halt_node;
1039   mctx->match_last = match_last;
1040   ret = REG_NOERROR;
1041  free_return:
1042   re_free (sifted_states);
1043   re_free (lim_states);
1044   return ret;
1045 }
1046
1047 /* Acquire an initial state and return it.
1048    We must select appropriate initial state depending on the context,
1049    since initial states may have constraints like "\<", "^", etc..  */
1050
1051 static inline re_dfastate_t *
1052 __attribute ((always_inline)) internal_function
1053 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1054                             int idx)
1055 {
1056   const re_dfa_t *const dfa = mctx->dfa;
1057   if (dfa->init_state->has_constraint)
1058     {
1059       unsigned int context;
1060       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1061       if (IS_WORD_CONTEXT (context))
1062         return dfa->init_state_word;
1063       else if (IS_ORDINARY_CONTEXT (context))
1064         return dfa->init_state;
1065       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1066         return dfa->init_state_begbuf;
1067       else if (IS_NEWLINE_CONTEXT (context))
1068         return dfa->init_state_nl;
1069       else if (IS_BEGBUF_CONTEXT (context))
1070         {
1071           /* It is relatively rare case, then calculate on demand.  */
1072           return re_acquire_state_context (err, dfa,
1073                                            dfa->init_state->entrance_nodes,
1074                                            context);
1075         }
1076       else
1077         /* Must not happen?  */
1078         return dfa->init_state;
1079     }
1080   else
1081     return dfa->init_state;
1082 }
1083
1084 /* Check whether the regular expression match input string INPUT or not,
1085    and return the index where the matching end, return -1 if not match,
1086    or return -2 in case of an error.
1087    FL_LONGEST_MATCH means we want the POSIX longest matching.
1088    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1089    next place where we may want to try matching.
1090    Note that the matcher assume that the maching starts from the current
1091    index of the buffer.  */
1092
1093 static int
1094 internal_function __attribute_warn_unused_result__
1095 check_matching (re_match_context_t *mctx, int fl_longest_match,
1096                 int *p_match_first)
1097 {
1098   const re_dfa_t *const dfa = mctx->dfa;
1099   reg_errcode_t err;
1100   int match = 0;
1101   int match_last = -1;
1102   int cur_str_idx = re_string_cur_idx (&mctx->input);
1103   re_dfastate_t *cur_state;
1104   int at_init_state = p_match_first != NULL;
1105   int next_start_idx = cur_str_idx;
1106
1107   err = REG_NOERROR;
1108   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1109   /* An initial state must not be NULL (invalid).  */
1110   if (BE (cur_state == NULL, 0))
1111     {
1112       assert (err == REG_ESPACE);
1113       return -2;
1114     }
1115
1116   if (mctx->state_log != NULL)
1117     {
1118       mctx->state_log[cur_str_idx] = cur_state;
1119
1120       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1121          later.  E.g. Processing back references.  */
1122       if (BE (dfa->nbackref, 0))
1123         {
1124           at_init_state = 0;
1125           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1126           if (BE (err != REG_NOERROR, 0))
1127             return err;
1128
1129           if (cur_state->has_backref)
1130             {
1131               err = transit_state_bkref (mctx, &cur_state->nodes);
1132               if (BE (err != REG_NOERROR, 0))
1133                 return err;
1134             }
1135         }
1136     }
1137
1138   /* If the RE accepts NULL string.  */
1139   if (BE (cur_state->halt, 0))
1140     {
1141       if (!cur_state->has_constraint
1142           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1143         {
1144           if (!fl_longest_match)
1145             return cur_str_idx;
1146           else
1147             {
1148               match_last = cur_str_idx;
1149               match = 1;
1150             }
1151         }
1152     }
1153
1154   while (!re_string_eoi (&mctx->input))
1155     {
1156       re_dfastate_t *old_state = cur_state;
1157       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1158
1159       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1160           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1161               && mctx->input.valid_len < mctx->input.len))
1162         {
1163           err = extend_buffers (mctx);
1164           if (BE (err != REG_NOERROR, 0))
1165             {
1166               assert (err == REG_ESPACE);
1167               return -2;
1168             }
1169         }
1170
1171       cur_state = transit_state (&err, mctx, cur_state);
1172       if (mctx->state_log != NULL)
1173         cur_state = merge_state_with_log (&err, mctx, cur_state);
1174
1175       if (cur_state == NULL)
1176         {
1177           /* Reached the invalid state or an error.  Try to recover a valid
1178              state using the state log, if available and if we have not
1179              already found a valid (even if not the longest) match.  */
1180           if (BE (err != REG_NOERROR, 0))
1181             return -2;
1182
1183           if (mctx->state_log == NULL
1184               || (match && !fl_longest_match)
1185               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1186             break;
1187         }
1188
1189       if (BE (at_init_state, 0))
1190         {
1191           if (old_state == cur_state)
1192             next_start_idx = next_char_idx;
1193           else
1194             at_init_state = 0;
1195         }
1196
1197       if (cur_state->halt)
1198         {
1199           /* Reached a halt state.
1200              Check the halt state can satisfy the current context.  */
1201           if (!cur_state->has_constraint
1202               || check_halt_state_context (mctx, cur_state,
1203                                            re_string_cur_idx (&mctx->input)))
1204             {
1205               /* We found an appropriate halt state.  */
1206               match_last = re_string_cur_idx (&mctx->input);
1207               match = 1;
1208
1209               /* We found a match, do not modify match_first below.  */
1210               p_match_first = NULL;
1211               if (!fl_longest_match)
1212                 break;
1213             }
1214         }
1215     }
1216
1217   if (p_match_first)
1218     *p_match_first += next_start_idx;
1219
1220   return match_last;
1221 }
1222
1223 /* Check NODE match the current context.  */
1224
1225 static int
1226 internal_function
1227 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1228 {
1229   re_token_type_t type = dfa->nodes[node].type;
1230   unsigned int constraint = dfa->nodes[node].constraint;
1231   if (type != END_OF_RE)
1232     return 0;
1233   if (!constraint)
1234     return 1;
1235   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1236     return 0;
1237   return 1;
1238 }
1239
1240 /* Check the halt state STATE match the current context.
1241    Return 0 if not match, if the node, STATE has, is a halt node and
1242    match the context, return the node.  */
1243
1244 static int
1245 internal_function
1246 check_halt_state_context (const re_match_context_t *mctx,
1247                           const re_dfastate_t *state, int idx)
1248 {
1249   int i;
1250   unsigned int context;
1251 #ifdef DEBUG
1252   assert (state->halt);
1253 #endif
1254   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1255   for (i = 0; i < state->nodes.nelem; ++i)
1256     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1257       return state->nodes.elems[i];
1258   return 0;
1259 }
1260
1261 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1262    corresponding to the DFA).
1263    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1264    of errors.  */
1265
1266 static int
1267 internal_function
1268 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1269                    int *pidx, int node, re_node_set *eps_via_nodes,
1270                    struct re_fail_stack_t *fs)
1271 {
1272   const re_dfa_t *const dfa = mctx->dfa;
1273   int i, err;
1274   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1275     {
1276       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1277       re_node_set *edests = &dfa->edests[node];
1278       int dest_node;
1279       err = re_node_set_insert (eps_via_nodes, node);
1280       if (BE (err < 0, 0))
1281         return -2;
1282       /* Pick up a valid destination, or return -1 if none is found.  */
1283       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1284         {
1285           int candidate = edests->elems[i];
1286           if (!re_node_set_contains (cur_nodes, candidate))
1287             continue;
1288           if (dest_node == -1)
1289             dest_node = candidate;
1290
1291           else
1292             {
1293               /* In order to avoid infinite loop like "(a*)*", return the second
1294                  epsilon-transition if the first was already considered.  */
1295               if (re_node_set_contains (eps_via_nodes, dest_node))
1296                 return candidate;
1297
1298               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1299               else if (fs != NULL
1300                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1301                                            eps_via_nodes))
1302                 return -2;
1303
1304               /* We know we are going to exit.  */
1305               break;
1306             }
1307         }
1308       return dest_node;
1309     }
1310   else
1311     {
1312       int naccepted = 0;
1313       re_token_type_t type = dfa->nodes[node].type;
1314
1315 #ifdef RE_ENABLE_I18N
1316       if (dfa->nodes[node].accept_mb)
1317         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1318       else
1319 #endif /* RE_ENABLE_I18N */
1320       if (type == OP_BACK_REF)
1321         {
1322           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1323           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1324           if (fs != NULL)
1325             {
1326               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1327                 return -1;
1328               else if (naccepted)
1329                 {
1330                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1331                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1332                               naccepted) != 0)
1333                     return -1;
1334                 }
1335             }
1336
1337           if (naccepted == 0)
1338             {
1339               int dest_node;
1340               err = re_node_set_insert (eps_via_nodes, node);
1341               if (BE (err < 0, 0))
1342                 return -2;
1343               dest_node = dfa->edests[node].elems[0];
1344               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1345                                         dest_node))
1346                 return dest_node;
1347             }
1348         }
1349
1350       if (naccepted != 0
1351           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1352         {
1353           int dest_node = dfa->nexts[node];
1354           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1355           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1356                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1357                                                dest_node)))
1358             return -1;
1359           re_node_set_empty (eps_via_nodes);
1360           return dest_node;
1361         }
1362     }
1363   return -1;
1364 }
1365
1366 static reg_errcode_t
1367 internal_function __attribute_warn_unused_result__
1368 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1369                  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1370 {
1371   reg_errcode_t err;
1372   int num = fs->num++;
1373   if (fs->num == fs->alloc)
1374     {
1375       struct re_fail_stack_ent_t *new_array;
1376       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1377                                        * fs->alloc * 2));
1378       if (new_array == NULL)
1379         return REG_ESPACE;
1380       fs->alloc *= 2;
1381       fs->stack = new_array;
1382     }
1383   fs->stack[num].idx = str_idx;
1384   fs->stack[num].node = dest_node;
1385   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1386   if (fs->stack[num].regs == NULL)
1387     return REG_ESPACE;
1388   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1389   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1390   return err;
1391 }
1392
1393 static int
1394 internal_function
1395 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1396                 regmatch_t *regs, re_node_set *eps_via_nodes)
1397 {
1398   int num = --fs->num;
1399   assert (num >= 0);
1400   *pidx = fs->stack[num].idx;
1401   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1402   re_node_set_free (eps_via_nodes);
1403   re_free (fs->stack[num].regs);
1404   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1405   return fs->stack[num].node;
1406 }
1407
1408 /* Set the positions where the subexpressions are starts/ends to registers
1409    PMATCH.
1410    Note: We assume that pmatch[0] is already set, and
1411    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1412
1413 static reg_errcode_t
1414 internal_function __attribute_warn_unused_result__
1415 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1416           regmatch_t *pmatch, int fl_backtrack)
1417 {
1418   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1419   int idx, cur_node;
1420   re_node_set eps_via_nodes;
1421   struct re_fail_stack_t *fs;
1422   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1423   regmatch_t *prev_idx_match;
1424   int prev_idx_match_malloced = 0;
1425
1426 #ifdef DEBUG
1427   assert (nmatch > 1);
1428   assert (mctx->state_log != NULL);
1429 #endif
1430   if (fl_backtrack)
1431     {
1432       fs = &fs_body;
1433       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1434       if (fs->stack == NULL)
1435         return REG_ESPACE;
1436     }
1437   else
1438     fs = NULL;
1439
1440   cur_node = dfa->init_node;
1441   re_node_set_init_empty (&eps_via_nodes);
1442
1443   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1444     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1445   else
1446     {
1447       prev_idx_match = re_malloc (regmatch_t, nmatch);
1448       if (prev_idx_match == NULL)
1449         {
1450           free_fail_stack_return (fs);
1451           return REG_ESPACE;
1452         }
1453       prev_idx_match_malloced = 1;
1454     }
1455   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1456
1457   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1458     {
1459       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1460
1461       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1462         {
1463           int reg_idx;
1464           if (fs)
1465             {
1466               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1467                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1468                   break;
1469               if (reg_idx == nmatch)
1470                 {
1471                   re_node_set_free (&eps_via_nodes);
1472                   if (prev_idx_match_malloced)
1473                     re_free (prev_idx_match);
1474                   return free_fail_stack_return (fs);
1475                 }
1476               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1477                                          &eps_via_nodes);
1478             }
1479           else
1480             {
1481               re_node_set_free (&eps_via_nodes);
1482               if (prev_idx_match_malloced)
1483                 re_free (prev_idx_match);
1484               return REG_NOERROR;
1485             }
1486         }
1487
1488       /* Proceed to next node.  */
1489       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1490                                     &eps_via_nodes, fs);
1491
1492       if (BE (cur_node < 0, 0))
1493         {
1494           if (BE (cur_node == -2, 0))
1495             {
1496               re_node_set_free (&eps_via_nodes);
1497               if (prev_idx_match_malloced)
1498                 re_free (prev_idx_match);
1499               free_fail_stack_return (fs);
1500               return REG_ESPACE;
1501             }
1502           if (fs)
1503             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1504                                        &eps_via_nodes);
1505           else
1506             {
1507               re_node_set_free (&eps_via_nodes);
1508               if (prev_idx_match_malloced)
1509                 re_free (prev_idx_match);
1510               return REG_NOMATCH;
1511             }
1512         }
1513     }
1514   re_node_set_free (&eps_via_nodes);
1515   if (prev_idx_match_malloced)
1516     re_free (prev_idx_match);
1517   return free_fail_stack_return (fs);
1518 }
1519
1520 static reg_errcode_t
1521 internal_function
1522 free_fail_stack_return (struct re_fail_stack_t *fs)
1523 {
1524   if (fs)
1525     {
1526       int fs_idx;
1527       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1528         {
1529           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1530           re_free (fs->stack[fs_idx].regs);
1531         }
1532       re_free (fs->stack);
1533     }
1534   return REG_NOERROR;
1535 }
1536
1537 static void
1538 internal_function
1539 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1540              regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1541 {
1542   int type = dfa->nodes[cur_node].type;
1543   if (type == OP_OPEN_SUBEXP)
1544     {
1545       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1546
1547       /* We are at the first node of this sub expression.  */
1548       if (reg_num < nmatch)
1549         {
1550           pmatch[reg_num].rm_so = cur_idx;
1551           pmatch[reg_num].rm_eo = -1;
1552         }
1553     }
1554   else if (type == OP_CLOSE_SUBEXP)
1555     {
1556       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1557       if (reg_num < nmatch)
1558         {
1559           /* We are at the last node of this sub expression.  */
1560           if (pmatch[reg_num].rm_so < cur_idx)
1561             {
1562               pmatch[reg_num].rm_eo = cur_idx;
1563               /* This is a non-empty match or we are not inside an optional
1564                  subexpression.  Accept this right away.  */
1565               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1566             }
1567           else
1568             {
1569               if (dfa->nodes[cur_node].opt_subexp
1570                   && prev_idx_match[reg_num].rm_so != -1)
1571                 /* We transited through an empty match for an optional
1572                    subexpression, like (a?)*, and this is not the subexp's
1573                    first match.  Copy back the old content of the registers
1574                    so that matches of an inner subexpression are undone as
1575                    well, like in ((a?))*.  */
1576                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1577               else
1578                 /* We completed a subexpression, but it may be part of
1579                    an optional one, so do not update PREV_IDX_MATCH.  */
1580                 pmatch[reg_num].rm_eo = cur_idx;
1581             }
1582         }
1583     }
1584 }
1585
/* The sifting function below checks the STATE_LOG from SCTX->last_str_idx
   down to 0 and sifts the nodes in each state according to the following
   rules.  The updated state log is written to STATE_LOG.

   Rules: We throw away the node `a' in STATE_LOG[STR_IDX] if...
     1. When STR_IDX == MATCH_LAST (the last index in the state log):
        If `a' isn't the LAST_NODE and `a' can't epsilon-transit to
        the LAST_NODE, we throw away the node `a'.
     2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
        string `s' and transits to `b':
        i. If `b' isn't in STATE_LOG[STR_IDX+strlen('s')], we throw
           away the node `a'.
        ii. If `b' is in STATE_LOG[STR_IDX+strlen('s')] but `b' is
            thrown away, we throw away the node `a'.
     3. When 0 <= STR_IDX < MATCH_LAST and `a' epsilon-transits to `b':
        i. If `b' isn't in STATE_LOG[STR_IDX], we throw away the
           node `a'.
        ii. If `b' is in STATE_LOG[STR_IDX] but `b' is thrown away,
            we throw away the node `a'.  */

/* Nonzero if STATE is not NULL and its node set contains NODE.
   STATE is evaluated twice.  */
#define STATE_NODE_CONTAINS(state,node) \
  ((state) != NULL && re_node_set_contains (&(state)->nodes, (node)))
1608
1609 static reg_errcode_t
1610 internal_function
1611 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1612 {
1613   reg_errcode_t err;
1614   int null_cnt = 0;
1615   int str_idx = sctx->last_str_idx;
1616   re_node_set cur_dest;
1617
1618 #ifdef DEBUG
1619   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1620 #endif
1621
1622   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1623      transit to the last_node and the last_node itself.  */
1624   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1625   if (BE (err != REG_NOERROR, 0))
1626     return err;
1627   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1628   if (BE (err != REG_NOERROR, 0))
1629     goto free_return;
1630
1631   /* Then check each states in the state_log.  */
1632   while (str_idx > 0)
1633     {
1634       /* Update counters.  */
1635       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1636       if (null_cnt > mctx->max_mb_elem_len)
1637         {
1638           memset (sctx->sifted_states, '\0',
1639                   sizeof (re_dfastate_t *) * str_idx);
1640           re_node_set_free (&cur_dest);
1641           return REG_NOERROR;
1642         }
1643       re_node_set_empty (&cur_dest);
1644       --str_idx;
1645
1646       if (mctx->state_log[str_idx])
1647         {
1648           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1649           if (BE (err != REG_NOERROR, 0))
1650             goto free_return;
1651         }
1652
1653       /* Add all the nodes which satisfy the following conditions:
1654          - It can epsilon transit to a node in CUR_DEST.
1655          - It is in CUR_SRC.
1656          And update state_log.  */
1657       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1658       if (BE (err != REG_NOERROR, 0))
1659         goto free_return;
1660     }
1661   err = REG_NOERROR;
1662  free_return:
1663   re_node_set_free (&cur_dest);
1664   return err;
1665 }
1666
1667 static reg_errcode_t
1668 internal_function __attribute_warn_unused_result__
1669 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1670                      int str_idx, re_node_set *cur_dest)
1671 {
1672   const re_dfa_t *const dfa = mctx->dfa;
1673   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1674   int i;
1675
1676   /* Then build the next sifted state.
1677      We build the next sifted state on `cur_dest', and update
1678      `sifted_states[str_idx]' with `cur_dest'.
1679      Note:
1680      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1681      `cur_src' points the node_set of the old `state_log[str_idx]'
1682      (with the epsilon nodes pre-filtered out).  */
1683   for (i = 0; i < cur_src->nelem; i++)
1684     {
1685       int prev_node = cur_src->elems[i];
1686       int naccepted = 0;
1687       int ret;
1688
1689 #ifdef DEBUG
1690       re_token_type_t type = dfa->nodes[prev_node].type;
1691       assert (!IS_EPSILON_NODE (type));
1692 #endif
1693 #ifdef RE_ENABLE_I18N
1694       /* If the node may accept `multi byte'.  */
1695       if (dfa->nodes[prev_node].accept_mb)
1696         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1697                                          str_idx, sctx->last_str_idx);
1698 #endif /* RE_ENABLE_I18N */
1699
1700       /* We don't check backreferences here.
1701          See update_cur_sifted_state().  */
1702       if (!naccepted
1703           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1704           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1705                                   dfa->nexts[prev_node]))
1706         naccepted = 1;
1707
1708       if (naccepted == 0)
1709         continue;
1710
1711       if (sctx->limits.nelem)
1712         {
1713           int to_idx = str_idx + naccepted;
1714           if (check_dst_limits (mctx, &sctx->limits,
1715                                 dfa->nexts[prev_node], to_idx,
1716                                 prev_node, str_idx))
1717             continue;
1718         }
1719       ret = re_node_set_insert (cur_dest, prev_node);
1720       if (BE (ret == -1, 0))
1721         return REG_ESPACE;
1722     }
1723
1724   return REG_NOERROR;
1725 }
1726
1727 /* Helper functions.  */
1728
1729 static reg_errcode_t
1730 internal_function
1731 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1732 {
1733   int top = mctx->state_log_top;
1734
1735   if (next_state_log_idx >= mctx->input.bufs_len
1736       || (next_state_log_idx >= mctx->input.valid_len
1737           && mctx->input.valid_len < mctx->input.len))
1738     {
1739       reg_errcode_t err;
1740       err = extend_buffers (mctx);
1741       if (BE (err != REG_NOERROR, 0))
1742         return err;
1743     }
1744
1745   if (top < next_state_log_idx)
1746     {
1747       memset (mctx->state_log + top + 1, '\0',
1748               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1749       mctx->state_log_top = next_state_log_idx;
1750     }
1751   return REG_NOERROR;
1752 }
1753
1754 static reg_errcode_t
1755 internal_function
1756 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1757                    re_dfastate_t **src, int num)
1758 {
1759   int st_idx;
1760   reg_errcode_t err;
1761   for (st_idx = 0; st_idx < num; ++st_idx)
1762     {
1763       if (dst[st_idx] == NULL)
1764         dst[st_idx] = src[st_idx];
1765       else if (src[st_idx] != NULL)
1766         {
1767           re_node_set merged_set;
1768           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1769                                         &src[st_idx]->nodes);
1770           if (BE (err != REG_NOERROR, 0))
1771             return err;
1772           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1773           re_node_set_free (&merged_set);
1774           if (BE (err != REG_NOERROR, 0))
1775             return err;
1776         }
1777     }
1778   return REG_NOERROR;
1779 }
1780
1781 static reg_errcode_t
1782 internal_function
1783 update_cur_sifted_state (const re_match_context_t *mctx,
1784                          re_sift_context_t *sctx, int str_idx,
1785                          re_node_set *dest_nodes)
1786 {
1787   const re_dfa_t *const dfa = mctx->dfa;
1788   reg_errcode_t err = REG_NOERROR;
1789   const re_node_set *candidates;
1790   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1791                 : &mctx->state_log[str_idx]->nodes);
1792
1793   if (dest_nodes->nelem == 0)
1794     sctx->sifted_states[str_idx] = NULL;
1795   else
1796     {
1797       if (candidates)
1798         {
1799           /* At first, add the nodes which can epsilon transit to a node in
1800              DEST_NODE.  */
1801           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1802           if (BE (err != REG_NOERROR, 0))
1803             return err;
1804
1805           /* Then, check the limitations in the current sift_context.  */
1806           if (sctx->limits.nelem)
1807             {
1808               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1809                                          mctx->bkref_ents, str_idx);
1810               if (BE (err != REG_NOERROR, 0))
1811                 return err;
1812             }
1813         }
1814
1815       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1816       if (BE (err != REG_NOERROR, 0))
1817         return err;
1818     }
1819
1820   if (candidates && mctx->state_log[str_idx]->has_backref)
1821     {
1822       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1823       if (BE (err != REG_NOERROR, 0))
1824         return err;
1825     }
1826   return REG_NOERROR;
1827 }
1828
1829 static reg_errcode_t
1830 internal_function __attribute_warn_unused_result__
1831 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1832                        const re_node_set *candidates)
1833 {
1834   reg_errcode_t err = REG_NOERROR;
1835   int i;
1836
1837   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1838   if (BE (err != REG_NOERROR, 0))
1839     return err;
1840
1841   if (!state->inveclosure.alloc)
1842     {
1843       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1844       if (BE (err != REG_NOERROR, 0))
1845         return REG_ESPACE;
1846       for (i = 0; i < dest_nodes->nelem; i++)
1847         {
1848           err = re_node_set_merge (&state->inveclosure,
1849                                    dfa->inveclosures + dest_nodes->elems[i]);
1850           if (BE (err != REG_NOERROR, 0))
1851             return REG_ESPACE;
1852         }
1853     }
1854   return re_node_set_add_intersect (dest_nodes, candidates,
1855                                     &state->inveclosure);
1856 }
1857
1858 static reg_errcode_t
1859 internal_function
1860 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1861                        const re_node_set *candidates)
1862 {
1863     int ecl_idx;
1864     reg_errcode_t err;
1865     re_node_set *inv_eclosure = dfa->inveclosures + node;
1866     re_node_set except_nodes;
1867     re_node_set_init_empty (&except_nodes);
1868     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1869       {
1870         int cur_node = inv_eclosure->elems[ecl_idx];
1871         if (cur_node == node)
1872           continue;
1873         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1874           {
1875             int edst1 = dfa->edests[cur_node].elems[0];
1876             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1877                          ? dfa->edests[cur_node].elems[1] : -1);
1878             if ((!re_node_set_contains (inv_eclosure, edst1)
1879                  && re_node_set_contains (dest_nodes, edst1))
1880                 || (edst2 > 0
1881                     && !re_node_set_contains (inv_eclosure, edst2)
1882                     && re_node_set_contains (dest_nodes, edst2)))
1883               {
1884                 err = re_node_set_add_intersect (&except_nodes, candidates,
1885                                                  dfa->inveclosures + cur_node);
1886                 if (BE (err != REG_NOERROR, 0))
1887                   {
1888                     re_node_set_free (&except_nodes);
1889                     return err;
1890                   }
1891               }
1892           }
1893       }
1894     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1895       {
1896         int cur_node = inv_eclosure->elems[ecl_idx];
1897         if (!re_node_set_contains (&except_nodes, cur_node))
1898           {
1899             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1900             re_node_set_remove_at (dest_nodes, idx);
1901           }
1902       }
1903     re_node_set_free (&except_nodes);
1904     return REG_NOERROR;
1905 }
1906
1907 static int
1908 internal_function
1909 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1910                   int dst_node, int dst_idx, int src_node, int src_idx)
1911 {
1912   const re_dfa_t *const dfa = mctx->dfa;
1913   int lim_idx, src_pos, dst_pos;
1914
1915   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1916   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1917   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1918     {
1919       int subexp_idx;
1920       struct re_backref_cache_entry *ent;
1921       ent = mctx->bkref_ents + limits->elems[lim_idx];
1922       subexp_idx = dfa->nodes[ent->node].opr.idx;
1923
1924       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1925                                            subexp_idx, dst_node, dst_idx,
1926                                            dst_bkref_idx);
1927       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1928                                            subexp_idx, src_node, src_idx,
1929                                            src_bkref_idx);
1930
1931       /* In case of:
1932          <src> <dst> ( <subexp> )
1933          ( <subexp> ) <src> <dst>
1934          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1935       if (src_pos == dst_pos)
1936         continue; /* This is unrelated limitation.  */
1937       else
1938         return 1;
1939     }
1940   return 0;
1941 }
1942
1943 static int
1944 internal_function
1945 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1946                              int subexp_idx, int from_node, int bkref_idx)
1947 {
1948   const re_dfa_t *const dfa = mctx->dfa;
1949   const re_node_set *eclosures = dfa->eclosures + from_node;
1950   int node_idx;
1951
1952   /* Else, we are on the boundary: examine the nodes on the epsilon
1953      closure.  */
1954   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1955     {
1956       int node = eclosures->elems[node_idx];
1957       switch (dfa->nodes[node].type)
1958         {
1959         case OP_BACK_REF:
1960           if (bkref_idx != -1)
1961             {
1962               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1963               do
1964                 {
1965                   int dst, cpos;
1966
1967                   if (ent->node != node)
1968                     continue;
1969
1970                   if (subexp_idx < BITSET_WORD_BITS
1971                       && !(ent->eps_reachable_subexps_map
1972                            & ((bitset_word_t) 1 << subexp_idx)))
1973                     continue;
1974
1975                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1976                      OP_CLOSE_SUBEXP cases below.  But, if the
1977                      destination node is the same node as the source
1978                      node, don't recurse because it would cause an
1979                      infinite loop: a regex that exhibits this behavior
1980                      is ()\1*\1*  */
1981                   dst = dfa->edests[node].elems[0];
1982                   if (dst == from_node)
1983                     {
1984                       if (boundaries & 1)
1985                         return -1;
1986                       else /* if (boundaries & 2) */
1987                         return 0;
1988                     }
1989
1990                   cpos =
1991                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1992                                                  dst, bkref_idx);
1993                   if (cpos == -1 /* && (boundaries & 1) */)
1994                     return -1;
1995                   if (cpos == 0 && (boundaries & 2))
1996                     return 0;
1997
1998                   if (subexp_idx < BITSET_WORD_BITS)
1999                     ent->eps_reachable_subexps_map
2000                       &= ~((bitset_word_t) 1 << subexp_idx);
2001                 }
2002               while (ent++->more);
2003             }
2004           break;
2005
2006         case OP_OPEN_SUBEXP:
2007           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2008             return -1;
2009           break;
2010
2011         case OP_CLOSE_SUBEXP:
2012           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2013             return 0;
2014           break;
2015
2016         default:
2017             break;
2018         }
2019     }
2020
2021   return (boundaries & 2) ? 1 : 0;
2022 }
2023
2024 static int
2025 internal_function
2026 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2027                            int subexp_idx, int from_node, int str_idx,
2028                            int bkref_idx)
2029 {
2030   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2031   int boundaries;
2032
2033   /* If we are outside the range of the subexpression, return -1 or 1.  */
2034   if (str_idx < lim->subexp_from)
2035     return -1;
2036
2037   if (lim->subexp_to < str_idx)
2038     return 1;
2039
2040   /* If we are within the subexpression, return 0.  */
2041   boundaries = (str_idx == lim->subexp_from);
2042   boundaries |= (str_idx == lim->subexp_to) << 1;
2043   if (boundaries == 0)
2044     return 0;
2045
2046   /* Else, examine epsilon closure.  */
2047   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2048                                       from_node, bkref_idx);
2049 }
2050
2051 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2052    which are against limitations from DEST_NODES. */
2053
2054 static reg_errcode_t
2055 internal_function
2056 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2057                      const re_node_set *candidates, re_node_set *limits,
2058                      struct re_backref_cache_entry *bkref_ents, int str_idx)
2059 {
2060   reg_errcode_t err;
2061   int node_idx, lim_idx;
2062
2063   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2064     {
2065       int subexp_idx;
2066       struct re_backref_cache_entry *ent;
2067       ent = bkref_ents + limits->elems[lim_idx];
2068
2069       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2070         continue; /* This is unrelated limitation.  */
2071
2072       subexp_idx = dfa->nodes[ent->node].opr.idx;
2073       if (ent->subexp_to == str_idx)
2074         {
2075           int ops_node = -1;
2076           int cls_node = -1;
2077           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2078             {
2079               int node = dest_nodes->elems[node_idx];
2080               re_token_type_t type = dfa->nodes[node].type;
2081               if (type == OP_OPEN_SUBEXP
2082                   && subexp_idx == dfa->nodes[node].opr.idx)
2083                 ops_node = node;
2084               else if (type == OP_CLOSE_SUBEXP
2085                        && subexp_idx == dfa->nodes[node].opr.idx)
2086                 cls_node = node;
2087             }
2088
2089           /* Check the limitation of the open subexpression.  */
2090           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2091           if (ops_node >= 0)
2092             {
2093               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2094                                            candidates);
2095               if (BE (err != REG_NOERROR, 0))
2096                 return err;
2097             }
2098
2099           /* Check the limitation of the close subexpression.  */
2100           if (cls_node >= 0)
2101             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2102               {
2103                 int node = dest_nodes->elems[node_idx];
2104                 if (!re_node_set_contains (dfa->inveclosures + node,
2105                                            cls_node)
2106                     && !re_node_set_contains (dfa->eclosures + node,
2107                                               cls_node))
2108                   {
2109                     /* It is against this limitation.
2110                        Remove it form the current sifted state.  */
2111                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2112                                                  candidates);
2113                     if (BE (err != REG_NOERROR, 0))
2114                       return err;
2115                     --node_idx;
2116                   }
2117               }
2118         }
2119       else /* (ent->subexp_to != str_idx)  */
2120         {
2121           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2122             {
2123               int node = dest_nodes->elems[node_idx];
2124               re_token_type_t type = dfa->nodes[node].type;
2125               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2126                 {
2127                   if (subexp_idx != dfa->nodes[node].opr.idx)
2128                     continue;
2129                   /* It is against this limitation.
2130                      Remove it form the current sifted state.  */
2131                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2132                                                candidates);
2133                   if (BE (err != REG_NOERROR, 0))
2134                     return err;
2135                 }
2136             }
2137         }
2138     }
2139   return REG_NOERROR;
2140 }
2141
2142 static reg_errcode_t
2143 internal_function __attribute_warn_unused_result__
2144 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2145                    int str_idx, const re_node_set *candidates)
2146 {
2147   const re_dfa_t *const dfa = mctx->dfa;
2148   reg_errcode_t err;
2149   int node_idx, node;
2150   re_sift_context_t local_sctx;
2151   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2152
2153   if (first_idx == -1)
2154     return REG_NOERROR;
2155
2156   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2157
2158   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2159     {
2160       int enabled_idx;
2161       re_token_type_t type;
2162       struct re_backref_cache_entry *entry;
2163       node = candidates->elems[node_idx];
2164       type = dfa->nodes[node].type;
2165       /* Avoid infinite loop for the REs like "()\1+".  */
2166       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2167         continue;
2168       if (type != OP_BACK_REF)
2169         continue;
2170
2171       entry = mctx->bkref_ents + first_idx;
2172       enabled_idx = first_idx;
2173       do
2174         {
2175           int subexp_len;
2176           int to_idx;
2177           int dst_node;
2178           int ret;
2179           re_dfastate_t *cur_state;
2180
2181           if (entry->node != node)
2182             continue;
2183           subexp_len = entry->subexp_to - entry->subexp_from;
2184           to_idx = str_idx + subexp_len;
2185           dst_node = (subexp_len ? dfa->nexts[node]
2186                       : dfa->edests[node].elems[0]);
2187
2188           if (to_idx > sctx->last_str_idx
2189               || sctx->sifted_states[to_idx] == NULL
2190               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2191               || check_dst_limits (mctx, &sctx->limits, node,
2192                                    str_idx, dst_node, to_idx))
2193             continue;
2194
2195           if (local_sctx.sifted_states == NULL)
2196             {
2197               local_sctx = *sctx;
2198               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2199               if (BE (err != REG_NOERROR, 0))
2200                 goto free_return;
2201             }
2202           local_sctx.last_node = node;
2203           local_sctx.last_str_idx = str_idx;
2204           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2205           if (BE (ret < 0, 0))
2206             {
2207               err = REG_ESPACE;
2208               goto free_return;
2209             }
2210           cur_state = local_sctx.sifted_states[str_idx];
2211           err = sift_states_backward (mctx, &local_sctx);
2212           if (BE (err != REG_NOERROR, 0))
2213             goto free_return;
2214           if (sctx->limited_states != NULL)
2215             {
2216               err = merge_state_array (dfa, sctx->limited_states,
2217                                        local_sctx.sifted_states,
2218                                        str_idx + 1);
2219               if (BE (err != REG_NOERROR, 0))
2220                 goto free_return;
2221             }
2222           local_sctx.sifted_states[str_idx] = cur_state;
2223           re_node_set_remove (&local_sctx.limits, enabled_idx);
2224
2225           /* mctx->bkref_ents may have changed, reload the pointer.  */
2226           entry = mctx->bkref_ents + enabled_idx;
2227         }
2228       while (enabled_idx++, entry++->more);
2229     }
2230   err = REG_NOERROR;
2231  free_return:
2232   if (local_sctx.sifted_states != NULL)
2233     {
2234       re_node_set_free (&local_sctx.limits);
2235     }
2236
2237   return err;
2238 }
2239
2240
2241 #ifdef RE_ENABLE_I18N
2242 static int
2243 internal_function
2244 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2245                      int node_idx, int str_idx, int max_str_idx)
2246 {
2247   const re_dfa_t *const dfa = mctx->dfa;
2248   int naccepted;
2249   /* Check the node can accept `multi byte'.  */
2250   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2251   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2252       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2253                             dfa->nexts[node_idx]))
2254     /* The node can't accept the `multi byte', or the
2255        destination was already thrown away, then the node
2256        could't accept the current input `multi byte'.   */
2257     naccepted = 0;
2258   /* Otherwise, it is sure that the node could accept
2259      `naccepted' bytes input.  */
2260   return naccepted;
2261 }
2262 #endif /* RE_ENABLE_I18N */
2263
2264 \f
2265 /* Functions for state transition.  */
2266
2267 /* Return the next state to which the current state STATE will transit by
2268    accepting the current input byte, and update STATE_LOG if necessary.
2269    If STATE can accept a multibyte char/collating element/back reference
2270    update the destination of STATE_LOG.  */
2271
2272 static re_dfastate_t *
2273 internal_function __attribute_warn_unused_result__
2274 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2275                re_dfastate_t *state)
2276 {
2277   re_dfastate_t **trtable;
2278   unsigned char ch;
2279
2280 #ifdef RE_ENABLE_I18N
2281   /* If the current state can accept multibyte.  */
2282   if (BE (state->accept_mb, 0))
2283     {
2284       *err = transit_state_mb (mctx, state);
2285       if (BE (*err != REG_NOERROR, 0))
2286         return NULL;
2287     }
2288 #endif /* RE_ENABLE_I18N */
2289
2290   /* Then decide the next state with the single byte.  */
2291 #if 0
2292   if (0)
2293     /* don't use transition table  */
2294     return transit_state_sb (err, mctx, state);
2295 #endif
2296
2297   /* Use transition table  */
2298   ch = re_string_fetch_byte (&mctx->input);
2299   for (;;)
2300     {
2301       trtable = state->trtable;
2302       if (BE (trtable != NULL, 1))
2303         return trtable[ch];
2304
2305       trtable = state->word_trtable;
2306       if (BE (trtable != NULL, 1))
2307         {
2308           unsigned int context;
2309           context
2310             = re_string_context_at (&mctx->input,
2311                                     re_string_cur_idx (&mctx->input) - 1,
2312                                     mctx->eflags);
2313           if (IS_WORD_CONTEXT (context))
2314             return trtable[ch + SBC_MAX];
2315           else
2316             return trtable[ch];
2317         }
2318
2319       if (!build_trtable (mctx->dfa, state))
2320         {
2321           *err = REG_ESPACE;
2322           return NULL;
2323         }
2324
2325       /* Retry, we now have a transition table.  */
2326     }
2327 }
2328
2329 /* Update the state_log if we need */
2330 re_dfastate_t *
2331 internal_function
2332 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2333                       re_dfastate_t *next_state)
2334 {
2335   const re_dfa_t *const dfa = mctx->dfa;
2336   int cur_idx = re_string_cur_idx (&mctx->input);
2337
2338   if (cur_idx > mctx->state_log_top)
2339     {
2340       mctx->state_log[cur_idx] = next_state;
2341       mctx->state_log_top = cur_idx;
2342     }
2343   else if (mctx->state_log[cur_idx] == 0)
2344     {
2345       mctx->state_log[cur_idx] = next_state;
2346     }
2347   else
2348     {
2349       re_dfastate_t *pstate;
2350       unsigned int context;
2351       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2352       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2353          the destination of a multibyte char/collating element/
2354          back reference.  Then the next state is the union set of
2355          these destinations and the results of the transition table.  */
2356       pstate = mctx->state_log[cur_idx];
2357       log_nodes = pstate->entrance_nodes;
2358       if (next_state != NULL)
2359         {
2360           table_nodes = next_state->entrance_nodes;
2361           *err = re_node_set_init_union (&next_nodes, table_nodes,
2362                                              log_nodes);
2363           if (BE (*err != REG_NOERROR, 0))
2364             return NULL;
2365         }
2366       else
2367         next_nodes = *log_nodes;
2368       /* Note: We already add the nodes of the initial state,
2369          then we don't need to add them here.  */
2370
2371       context = re_string_context_at (&mctx->input,
2372                                       re_string_cur_idx (&mctx->input) - 1,
2373                                       mctx->eflags);
2374       next_state = mctx->state_log[cur_idx]
2375         = re_acquire_state_context (err, dfa, &next_nodes, context);
2376       /* We don't need to check errors here, since the return value of
2377          this function is next_state and ERR is already set.  */
2378
2379       if (table_nodes != NULL)
2380         re_node_set_free (&next_nodes);
2381     }
2382
2383   if (BE (dfa->nbackref, 0) && next_state != NULL)
2384     {
2385       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2386          later.  We must check them here, since the back references in the
2387          next state might use them.  */
2388       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2389                                         cur_idx);
2390       if (BE (*err != REG_NOERROR, 0))
2391         return NULL;
2392
2393       /* If the next state has back references.  */
2394       if (next_state->has_backref)
2395         {
2396           *err = transit_state_bkref (mctx, &next_state->nodes);
2397           if (BE (*err != REG_NOERROR, 0))
2398             return NULL;
2399           next_state = mctx->state_log[cur_idx];
2400         }
2401     }
2402
2403   return next_state;
2404 }
2405
2406 /* Skip bytes in the input that correspond to part of a
2407    multi-byte match, then look in the log for a state
2408    from which to restart matching.  */
2409 re_dfastate_t *
2410 internal_function
2411 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2412 {
2413   re_dfastate_t *cur_state;
2414   do
2415     {
2416       int max = mctx->state_log_top;
2417       int cur_str_idx = re_string_cur_idx (&mctx->input);
2418
2419       do
2420         {
2421           if (++cur_str_idx > max)
2422             return NULL;
2423           re_string_skip_bytes (&mctx->input, 1);
2424         }
2425       while (mctx->state_log[cur_str_idx] == NULL);
2426
2427       cur_state = merge_state_with_log (err, mctx, NULL);
2428     }
2429   while (*err == REG_NOERROR && cur_state == NULL);
2430   return cur_state;
2431 }
2432
2433 /* Helper functions for transit_state.  */
2434
2435 /* From the node set CUR_NODES, pick up the nodes whose types are
2436    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2437    expression. And register them to use them later for evaluating the
2438    correspoding back references.  */
2439
2440 static reg_errcode_t
2441 internal_function
2442 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2443                            int str_idx)
2444 {
2445   const re_dfa_t *const dfa = mctx->dfa;
2446   int node_idx;
2447   reg_errcode_t err;
2448
2449   /* TODO: This isn't efficient.
2450            Because there might be more than one nodes whose types are
2451            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2452            nodes.
2453            E.g. RE: (a){2}  */
2454   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2455     {
2456       int node = cur_nodes->elems[node_idx];
2457       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2458           && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2459           && (dfa->used_bkref_map
2460               & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2461         {
2462           err = match_ctx_add_subtop (mctx, node, str_idx);
2463           if (BE (err != REG_NOERROR, 0))
2464             return err;
2465         }
2466     }
2467   return REG_NOERROR;
2468 }
2469
#if 0
/* Disabled code.  Compute the state reached from STATE by accepting the
   input byte at the current position: the epsilon closures of the
   successors of every node of STATE that accepts that byte are merged
   into one node set, and the matching DFA state is acquired for it.
   The input is advanced by one byte before returning.  On failure NULL
   is returned with *ERR set.  */

static re_dfastate_t *
transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
                  re_dfastate_t *state)
{
  const re_dfa_t *const dfa = mctx->dfa;
  const int str_pos = re_string_cur_idx (&mctx->input);
  re_node_set dest_nodes;
  re_dfastate_t *dest_state;
  unsigned int context;
  int i;

  *err = re_node_set_alloc (&dest_nodes, state->nodes.nelem + 1);
  if (BE (*err != REG_NOERROR, 0))
    return NULL;

  for (i = 0; i < state->nodes.nelem; ++i)
    {
      const int node = state->nodes.elems[i];

      if (!check_node_accept (mctx, dfa->nodes + node, str_pos))
        continue;

      *err = re_node_set_merge (&dest_nodes,
                                dfa->eclosures + dfa->nexts[node]);
      if (BE (*err != REG_NOERROR, 0))
        {
          re_node_set_free (&dest_nodes);
          return NULL;
        }
    }

  context = re_string_context_at (&mctx->input, str_pos, mctx->eflags);
  /* No error check is needed here: the result is returned as is and
     ERR has already been set by the callee.  */
  dest_state = re_acquire_state_context (err, dfa, &dest_nodes, context);

  re_node_set_free (&dest_nodes);
  re_string_skip_bytes (&mctx->input, 1);
  return dest_state;
}
#endif
2511
#ifdef RE_ENABLE_I18N
/* For every node of PSTATE that is flagged accept_mb and accepts one or
   more bytes at the current input position, record the transition in
   MCTX->state_log: the epsilon closure of the node's successor is merged
   with whatever is already logged at the index just past the accepted
   bytes.  MCTX->max_mb_elem_len is raised to the largest number of bytes
   accepted so far.
   Returns REG_NOERROR, or the first error encountered.  */
static reg_errcode_t
internal_function
transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
{
  const re_dfa_t *const dfa = mctx->dfa;
  reg_errcode_t err;
  int n;

  for (n = 0; n < pstate->nodes.nelem; ++n)
    {
      const int node = pstate->nodes.elems[n];
      re_node_set merged, *closure;
      re_dfastate_t *logged;
      unsigned int context;
      int nbytes, target_idx;

      if (!dfa->nodes[node].accept_mb)
        continue;

      if (dfa->nodes[node].constraint)
        {
          context = re_string_context_at (&mctx->input,
                                          re_string_cur_idx (&mctx->input),
                                          mctx->eflags);
          if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[node].constraint,
                                           context))
            continue;
        }

      /* Number of bytes this node accepts here; zero means no match.  */
      nbytes = check_node_accept_bytes (dfa, node, &mctx->input,
                                        re_string_cur_idx (&mctx->input));
      if (nbytes == 0)
        continue;

      target_idx = re_string_cur_idx (&mctx->input) + nbytes;
      if (mctx->max_mb_elem_len < nbytes)
        mctx->max_mb_elem_len = nbytes;

      err = clean_state_log_if_needed (mctx, target_idx);
      if (BE (err != REG_NOERROR, 0))
        return err;
#ifdef DEBUG
      assert (dfa->nexts[node] != -1);
#endif
      closure = dfa->eclosures + dfa->nexts[node];

      logged = mctx->state_log[target_idx];
      if (logged != NULL)
        {
          /* Something is already logged there: build an owned union.  */
          err = re_node_set_init_union (&merged, logged->entrance_nodes,
                                        closure);
          if (BE (err != REG_NOERROR, 0))
            return err;
        }
      else
        /* Shallow copy only; MERGED does not own the elements, so it is
           not freed below in this case.  */
        merged = *closure;

      context = re_string_context_at (&mctx->input, target_idx - 1,
                                      mctx->eflags);
      mctx->state_log[target_idx]
        = re_acquire_state_context (&err, dfa, &merged, context);
      if (logged != NULL)
        re_node_set_free (&merged);
      if (BE (mctx->state_log[target_idx] == NULL && err != REG_NOERROR, 0))
        return err;
    }
  return REG_NOERROR;
}
#endif /* RE_ENABLE_I18N */
2582
2583 static reg_errcode_t
2584 internal_function
2585 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2586 {
2587   const re_dfa_t *const dfa = mctx->dfa;
2588   reg_errcode_t err;
2589   int i;
2590   int cur_str_idx = re_string_cur_idx (&mctx->input);
2591
2592   for (i = 0; i < nodes->nelem; ++i)
2593     {
2594       int dest_str_idx, prev_nelem, bkc_idx;
2595       int node_idx = nodes->elems[i];
2596       unsigned int context;
2597       const re_token_t *node = dfa->nodes + node_idx;
2598       re_node_set *new_dest_nodes;
2599
2600       /* Check whether `node' is a backreference or not.  */
2601       if (node->type != OP_BACK_REF)
2602         continue;
2603
2604       if (node->constraint)
2605         {
2606           context = re_string_context_at (&mctx->input, cur_str_idx,
2607                                           mctx->eflags);
2608           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2609             continue;
2610         }
2611
2612       /* `node' is a backreference.
2613          Check the substring which the substring matched.  */
2614       bkc_idx = mctx->nbkref_ents;
2615       err = get_subexp (mctx, node_idx, cur_str_idx);
2616       if (BE (err != REG_NOERROR, 0))
2617         goto free_return;
2618
2619       /* And add the epsilon closures (which is `new_dest_nodes') of
2620          the backreference to appropriate state_log.  */
2621 #ifdef DEBUG
2622       assert (dfa->nexts[node_idx] != -1);
2623 #endif
2624       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2625         {
2626           int subexp_len;
2627           re_dfastate_t *dest_state;
2628           struct re_backref_cache_entry *bkref_ent;
2629           bkref_ent = mctx->bkref_ents + bkc_idx;
2630           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2631             continue;
2632           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2633           new_dest_nodes = (subexp_len == 0
2634                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2635                             : dfa->eclosures + dfa->nexts[node_idx]);
2636           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2637                           - bkref_ent->subexp_from);
2638           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2639                                           mctx->eflags);
2640           dest_state = mctx->state_log[dest_str_idx];
2641           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2642                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2643           /* Add `new_dest_node' to state_log.  */
2644           if (dest_state == NULL)
2645             {
2646               mctx->state_log[dest_str_idx]
2647                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2648                                             context);
2649               if (BE (mctx->state_log[dest_str_idx] == NULL
2650                       && err != REG_NOERROR, 0))
2651                 goto free_return;
2652             }
2653           else
2654             {
2655               re_node_set dest_nodes;
2656               err = re_node_set_init_union (&dest_nodes,
2657                                             dest_state->entrance_nodes,
2658                                             new_dest_nodes);
2659               if (BE (err != REG_NOERROR, 0))
2660                 {
2661                   re_node_set_free (&dest_nodes);
2662                   goto free_return;
2663                 }
2664               mctx->state_log[dest_str_idx]
2665                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2666               re_node_set_free (&dest_nodes);
2667               if (BE (mctx->state_log[dest_str_idx] == NULL
2668                       && err != REG_NOERROR, 0))
2669                 goto free_return;
2670             }
2671           /* We need to check recursively if the backreference can epsilon
2672              transit.  */
2673           if (subexp_len == 0
2674               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2675             {
2676               err = check_subexp_matching_top (mctx, new_dest_nodes,
2677                                                cur_str_idx);
2678               if (BE (err != REG_NOERROR, 0))
2679                 goto free_return;
2680               err = transit_state_bkref (mctx, new_dest_nodes);
2681               if (BE (err != REG_NOERROR, 0))
2682                 goto free_return;
2683             }
2684         }
2685     }
2686   err = REG_NOERROR;
2687  free_return:
2688   return err;
2689 }
2690
2691 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2692    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2693    Note that we might collect inappropriate candidates here.
2694    However, the cost of checking them strictly here is too high, then we
2695    delay these checking for prune_impossible_nodes().  */
2696
2697 static reg_errcode_t
2698 internal_function __attribute_warn_unused_result__
2699 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2700 {
2701   const re_dfa_t *const dfa = mctx->dfa;
2702   int subexp_num, sub_top_idx;
2703   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2704   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2705   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2706   if (cache_idx != -1)
2707     {
2708       const struct re_backref_cache_entry *entry
2709         = mctx->bkref_ents + cache_idx;
2710       do
2711         if (entry->node == bkref_node)
2712           return REG_NOERROR; /* We already checked it.  */
2713       while (entry++->more);
2714     }
2715
2716   subexp_num = dfa->nodes[bkref_node].opr.idx;
2717
2718   /* For each sub expression  */
2719   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2720     {
2721       reg_errcode_t err;
2722       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2723       re_sub_match_last_t *sub_last;
2724       int sub_last_idx, sl_str, bkref_str_off;
2725
2726       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2727         continue; /* It isn't related.  */
2728
2729       sl_str = sub_top->str_idx;
2730       bkref_str_off = bkref_str_idx;
2731       /* At first, check the last node of sub expressions we already
2732          evaluated.  */
2733       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2734         {
2735           int sl_str_diff;
2736           sub_last = sub_top->lasts[sub_last_idx];
2737           sl_str_diff = sub_last->str_idx - sl_str;
2738           /* The matched string by the sub expression match with the substring
2739              at the back reference?  */
2740           if (sl_str_diff > 0)
2741             {
2742               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2743                 {
2744                   /* Not enough chars for a successful match.  */
2745                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2746                     break;
2747
2748                   err = clean_state_log_if_needed (mctx,
2749                                                    bkref_str_off
2750                                                    + sl_str_diff);
2751                   if (BE (err != REG_NOERROR, 0))
2752                     return err;
2753                   buf = (const char *) re_string_get_buffer (&mctx->input);
2754                 }
2755               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2756                 /* We don't need to search this sub expression any more.  */
2757                 break;
2758             }
2759           bkref_str_off += sl_str_diff;
2760           sl_str += sl_str_diff;
2761           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2762                                 bkref_str_idx);
2763
2764           /* Reload buf, since the preceding call might have reallocated
2765              the buffer.  */
2766           buf = (const char *) re_string_get_buffer (&mctx->input);
2767
2768           if (err == REG_NOMATCH)
2769             continue;
2770           if (BE (err != REG_NOERROR, 0))
2771             return err;
2772         }
2773
2774       if (sub_last_idx < sub_top->nlasts)
2775         continue;
2776       if (sub_last_idx > 0)
2777         ++sl_str;
2778       /* Then, search for the other last nodes of the sub expression.  */
2779       for (; sl_str <= bkref_str_idx; ++sl_str)
2780         {
2781           int cls_node, sl_str_off;
2782           const re_node_set *nodes;
2783           sl_str_off = sl_str - sub_top->str_idx;
2784           /* The matched string by the sub expression match with the substring
2785              at the back reference?  */
2786           if (sl_str_off > 0)
2787             {
2788               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2789                 {
2790                   /* If we are at the end of the input, we cannot match.  */
2791                   if (bkref_str_off >= mctx->input.len)
2792                     break;
2793
2794                   err = extend_buffers (mctx);
2795                   if (BE (err != REG_NOERROR, 0))
2796                     return err;
2797
2798                   buf = (const char *) re_string_get_buffer (&mctx->input);
2799                 }
2800               if (buf [bkref_str_off++] != buf[sl_str - 1])
2801                 break; /* We don't need to search this sub expression
2802                           any more.  */
2803             }
2804           if (mctx->state_log[sl_str] == NULL)
2805             continue;
2806           /* Does this state have a ')' of the sub expression?  */
2807           nodes = &mctx->state_log[sl_str]->nodes;
2808           cls_node = find_subexp_node (dfa, nodes, subexp_num,
2809                                        OP_CLOSE_SUBEXP);
2810           if (cls_node == -1)
2811             continue; /* No.  */
2812           if (sub_top->path == NULL)
2813             {
2814               sub_top->path = calloc (sizeof (state_array_t),
2815                                       sl_str - sub_top->str_idx + 1);
2816               if (sub_top->path == NULL)
2817                 return REG_ESPACE;
2818             }
2819           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2820              in the current context?  */
2821           err = check_arrival (mctx, sub_top->path, sub_top->node,
2822                                sub_top->str_idx, cls_node, sl_str,
2823                                OP_CLOSE_SUBEXP);
2824           if (err == REG_NOMATCH)
2825               continue;
2826           if (BE (err != REG_NOERROR, 0))
2827               return err;
2828           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2829           if (BE (sub_last == NULL, 0))
2830             return REG_ESPACE;
2831           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2832                                 bkref_str_idx);
2833           if (err == REG_NOMATCH)
2834             continue;
2835         }
2836     }
2837   return REG_NOERROR;
2838 }
2839
2840 /* Helper functions for get_subexp().  */
2841
2842 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2843    If it can arrive, register the sub expression expressed with SUB_TOP
2844    and SUB_LAST.  */
2845
2846 static reg_errcode_t
2847 internal_function
2848 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2849                 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2850 {
2851   reg_errcode_t err;
2852   int to_idx;
2853   /* Can the subexpression arrive the back reference?  */
2854   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2855                        sub_last->str_idx, bkref_node, bkref_str,
2856                        OP_OPEN_SUBEXP);
2857   if (err != REG_NOERROR)
2858     return err;
2859   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2860                              sub_last->str_idx);
2861   if (BE (err != REG_NOERROR, 0))
2862     return err;
2863   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2864   return clean_state_log_if_needed (mctx, to_idx);
2865 }
2866
2867 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2868    Search '(' if FL_OPEN, or search ')' otherwise.
2869    TODO: This function isn't efficient...
2870          Because there might be more than one nodes whose types are
2871          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2872          nodes.
2873          E.g. RE: (a){2}  */
2874
2875 static int
2876 internal_function
2877 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2878                   int subexp_idx, int type)
2879 {
2880   int cls_idx;
2881   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2882     {
2883       int cls_node = nodes->elems[cls_idx];
2884       const re_token_t *node = dfa->nodes + cls_node;
2885       if (node->type == type
2886           && node->opr.idx == subexp_idx)
2887         return cls_node;
2888     }
2889   return -1;
2890 }
2891
2892 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2893    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2894    heavily reused.
2895    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2896
2897 static reg_errcode_t
2898 internal_function __attribute_warn_unused_result__
2899 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2900                int top_str, int last_node, int last_str, int type)
2901 {
2902   const re_dfa_t *const dfa = mctx->dfa;
2903   reg_errcode_t err = REG_NOERROR;
2904   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2905   re_dfastate_t *cur_state = NULL;
2906   re_node_set *cur_nodes, next_nodes;
2907   re_dfastate_t **backup_state_log;
2908   unsigned int context;
2909
2910   subexp_num = dfa->nodes[top_node].opr.idx;
2911   /* Extend the buffer if we need.  */
2912   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2913     {
2914       re_dfastate_t **new_array;
2915       int old_alloc = path->alloc;
2916       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2917       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2918       if (BE (new_array == NULL, 0))
2919         {
2920           path->alloc = old_alloc;
2921           return REG_ESPACE;
2922         }
2923       path->array = new_array;
2924       memset (new_array + old_alloc, '\0',
2925               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2926     }
2927
2928   str_idx = path->next_idx ?: top_str;
2929
2930   /* Temporary modify MCTX.  */
2931   backup_state_log = mctx->state_log;
2932   backup_cur_idx = mctx->input.cur_idx;
2933   mctx->state_log = path->array;
2934   mctx->input.cur_idx = str_idx;
2935
2936   /* Setup initial node set.  */
2937   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2938   if (str_idx == top_str)
2939     {
2940       err = re_node_set_init_1 (&next_nodes, top_node);
2941       if (BE (err != REG_NOERROR, 0))
2942         return err;
2943       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2944       if (BE (err != REG_NOERROR, 0))
2945         {
2946           re_node_set_free (&next_nodes);
2947           return err;
2948         }
2949     }
2950   else
2951     {
2952       cur_state = mctx->state_log[str_idx];
2953       if (cur_state && cur_state->has_backref)
2954         {
2955           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2956           if (BE (err != REG_NOERROR, 0))
2957             return err;
2958         }
2959       else
2960         re_node_set_init_empty (&next_nodes);
2961     }
2962   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2963     {
2964       if (next_nodes.nelem)
2965         {
2966           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2967                                     subexp_num, type);
2968           if (BE (err != REG_NOERROR, 0))
2969             {
2970               re_node_set_free (&next_nodes);
2971               return err;
2972             }
2973         }
2974       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2975       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2976         {
2977           re_node_set_free (&next_nodes);
2978           return err;
2979         }
2980       mctx->state_log[str_idx] = cur_state;
2981     }
2982
2983   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2984     {
2985       re_node_set_empty (&next_nodes);
2986       if (mctx->state_log[str_idx + 1])
2987         {
2988           err = re_node_set_merge (&next_nodes,
2989                                    &mctx->state_log[str_idx + 1]->nodes);
2990           if (BE (err != REG_NOERROR, 0))
2991             {
2992               re_node_set_free (&next_nodes);
2993               return err;
2994             }
2995         }
2996       if (cur_state)
2997         {
2998           err = check_arrival_add_next_nodes (mctx, str_idx,
2999                                               &cur_state->non_eps_nodes,
3000                                               &next_nodes);
3001           if (BE (err != REG_NOERROR, 0))
3002             {
3003               re_node_set_free (&next_nodes);
3004               return err;
3005             }
3006         }
3007       ++str_idx;
3008       if (next_nodes.nelem)
3009         {
3010           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3011           if (BE (err != REG_NOERROR, 0))
3012             {
3013               re_node_set_free (&next_nodes);
3014               return err;
3015             }
3016           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3017                                     subexp_num, type);
3018           if (BE (err != REG_NOERROR, 0))
3019             {
3020               re_node_set_free (&next_nodes);
3021               return err;
3022             }
3023         }
3024       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3025       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3026       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3027         {
3028           re_node_set_free (&next_nodes);
3029           return err;
3030         }
3031       mctx->state_log[str_idx] = cur_state;
3032       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3033     }
3034   re_node_set_free (&next_nodes);
3035   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3036                : &mctx->state_log[last_str]->nodes);
3037   path->next_idx = str_idx;
3038
3039   /* Fix MCTX.  */
3040   mctx->state_log = backup_state_log;
3041   mctx->input.cur_idx = backup_cur_idx;
3042
3043   /* Then check the current node set has the node LAST_NODE.  */
3044   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3045     return REG_NOERROR;
3046
3047   return REG_NOMATCH;
3048 }
3049
3050 /* Helper functions for check_arrival.  */
3051
3052 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3053    to NEXT_NODES.
3054    TODO: This function is similar to the functions transit_state*(),
3055          however this function has many additional works.
3056          Can't we unify them?  */
3057
3058 static reg_errcode_t
3059 internal_function __attribute_warn_unused_result__
3060 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3061                               re_node_set *cur_nodes, re_node_set *next_nodes)
3062 {
3063   const re_dfa_t *const dfa = mctx->dfa;
3064   int result;
3065   int cur_idx;
3066   reg_errcode_t err = REG_NOERROR;
3067   re_node_set union_set;
3068   re_node_set_init_empty (&union_set);
3069   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3070     {
3071       int naccepted = 0;
3072       int cur_node = cur_nodes->elems[cur_idx];
3073 #ifdef DEBUG
3074       re_token_type_t type = dfa->nodes[cur_node].type;
3075       assert (!IS_EPSILON_NODE (type));
3076 #endif
3077 #ifdef RE_ENABLE_I18N
3078       /* If the node may accept `multi byte'.  */
3079       if (dfa->nodes[cur_node].accept_mb)
3080         {
3081           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3082                                                str_idx);
3083           if (naccepted > 1)
3084             {
3085               re_dfastate_t *dest_state;
3086               int next_node = dfa->nexts[cur_node];
3087               int next_idx = str_idx + naccepted;
3088               dest_state = mctx->state_log[next_idx];
3089               re_node_set_empty (&union_set);
3090               if (dest_state)
3091                 {
3092                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3093                   if (BE (err != REG_NOERROR, 0))
3094                     {
3095                       re_node_set_free (&union_set);
3096                       return err;
3097                     }
3098                 }
3099               result = re_node_set_insert (&union_set, next_node);
3100               if (BE (result < 0, 0))
3101                 {
3102                   re_node_set_free (&union_set);
3103                   return REG_ESPACE;
3104                 }
3105               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3106                                                             &union_set);
3107               if (BE (mctx->state_log[next_idx] == NULL
3108                       && err != REG_NOERROR, 0))
3109                 {
3110                   re_node_set_free (&union_set);
3111                   return err;
3112                 }
3113             }
3114         }
3115 #endif /* RE_ENABLE_I18N */
3116       if (naccepted
3117           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3118         {
3119           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3120           if (BE (result < 0, 0))
3121             {
3122               re_node_set_free (&union_set);
3123               return REG_ESPACE;
3124             }
3125         }
3126     }
3127   re_node_set_free (&union_set);
3128   return REG_NOERROR;
3129 }
3130
3131 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3132    CUR_NODES, however exclude the nodes which are:
3133     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3134     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3135 */
3136
3137 static reg_errcode_t
3138 internal_function
3139 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3140                           int ex_subexp, int type)
3141 {
3142   reg_errcode_t err;
3143   int idx, outside_node;
3144   re_node_set new_nodes;
3145 #ifdef DEBUG
3146   assert (cur_nodes->nelem);
3147 #endif
3148   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3149   if (BE (err != REG_NOERROR, 0))
3150     return err;
3151   /* Create a new node set NEW_NODES with the nodes which are epsilon
3152      closures of the node in CUR_NODES.  */
3153
3154   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3155     {
3156       int cur_node = cur_nodes->elems[idx];
3157       const re_node_set *eclosure = dfa->eclosures + cur_node;
3158       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3159       if (outside_node == -1)
3160         {
3161           /* There are no problematic nodes, just merge them.  */
3162           err = re_node_set_merge (&new_nodes, eclosure);
3163           if (BE (err != REG_NOERROR, 0))
3164             {
3165               re_node_set_free (&new_nodes);
3166               return err;
3167             }
3168         }
3169       else
3170         {
3171           /* There are problematic nodes, re-calculate incrementally.  */
3172           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3173                                               ex_subexp, type);
3174           if (BE (err != REG_NOERROR, 0))
3175             {
3176               re_node_set_free (&new_nodes);
3177               return err;
3178             }
3179         }
3180     }
3181   re_node_set_free (cur_nodes);
3182   *cur_nodes = new_nodes;
3183   return REG_NOERROR;
3184 }
3185
3186 /* Helper function for check_arrival_expand_ecl.
3187    Check incrementally the epsilon closure of TARGET, and if it isn't
3188    problematic append it to DST_NODES.  */
3189
3190 static reg_errcode_t
3191 internal_function __attribute_warn_unused_result__
3192 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3193                               int target, int ex_subexp, int type)
3194 {
3195   int cur_node;
3196   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3197     {
3198       int err;
3199
3200       if (dfa->nodes[cur_node].type == type
3201           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3202         {
3203           if (type == OP_CLOSE_SUBEXP)
3204             {
3205               err = re_node_set_insert (dst_nodes, cur_node);
3206               if (BE (err == -1, 0))
3207                 return REG_ESPACE;
3208             }
3209           break;
3210         }
3211       err = re_node_set_insert (dst_nodes, cur_node);
3212       if (BE (err == -1, 0))
3213         return REG_ESPACE;
3214       if (dfa->edests[cur_node].nelem == 0)
3215         break;
3216       if (dfa->edests[cur_node].nelem == 2)
3217         {
3218           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3219                                               dfa->edests[cur_node].elems[1],
3220                                               ex_subexp, type);
3221           if (BE (err != REG_NOERROR, 0))
3222             return err;
3223         }
3224       cur_node = dfa->edests[cur_node].elems[0];
3225     }
3226   return REG_NOERROR;
3227 }
3228
3229
3230 /* For all the back references in the current state, calculate the
3231    destination of the back references by the appropriate entry
3232    in MCTX->BKREF_ENTS.  */
3233
3234 static reg_errcode_t
3235 internal_function __attribute_warn_unused_result__
3236 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3237                     int cur_str, int subexp_num, int type)
3238 {
3239   const re_dfa_t *const dfa = mctx->dfa;
3240   reg_errcode_t err;
3241   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3242   struct re_backref_cache_entry *ent;
3243
3244   if (cache_idx_start == -1)
3245     return REG_NOERROR;
3246
3247  restart:
3248   ent = mctx->bkref_ents + cache_idx_start;
3249   do
3250     {
3251       int to_idx, next_node;
3252
3253       /* Is this entry ENT is appropriate?  */
3254       if (!re_node_set_contains (cur_nodes, ent->node))
3255         continue; /* No.  */
3256
3257       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3258       /* Calculate the destination of the back reference, and append it
3259          to MCTX->STATE_LOG.  */
3260       if (to_idx == cur_str)
3261         {
3262           /* The backreference did epsilon transit, we must re-check all the
3263              node in the current state.  */
3264           re_node_set new_dests;
3265           reg_errcode_t err2, err3;
3266           next_node = dfa->edests[ent->node].elems[0];
3267           if (re_node_set_contains (cur_nodes, next_node))
3268             continue;
3269           err = re_node_set_init_1 (&new_dests, next_node);
3270           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3271           err3 = re_node_set_merge (cur_nodes, &new_dests);
3272           re_node_set_free (&new_dests);
3273           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3274                   || err3 != REG_NOERROR, 0))
3275             {
3276               err = (err != REG_NOERROR ? err
3277                      : (err2 != REG_NOERROR ? err2 : err3));
3278               return err;
3279             }
3280           /* TODO: It is still inefficient...  */
3281           goto restart;
3282         }
3283       else
3284         {
3285           re_node_set union_set;
3286           next_node = dfa->nexts[ent->node];
3287           if (mctx->state_log[to_idx])
3288             {
3289               int ret;
3290               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3291                                         next_node))
3292                 continue;
3293               err = re_node_set_init_copy (&union_set,
3294                                            &mctx->state_log[to_idx]->nodes);
3295               ret = re_node_set_insert (&union_set, next_node);
3296               if (BE (err != REG_NOERROR || ret < 0, 0))
3297                 {
3298                   re_node_set_free (&union_set);
3299                   err = err != REG_NOERROR ? err : REG_ESPACE;
3300                   return err;
3301                 }
3302             }
3303           else
3304             {
3305               err = re_node_set_init_1 (&union_set, next_node);
3306               if (BE (err != REG_NOERROR, 0))
3307                 return err;
3308             }
3309           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3310           re_node_set_free (&union_set);
3311           if (BE (mctx->state_log[to_idx] == NULL
3312                   && err != REG_NOERROR, 0))
3313             return err;
3314         }
3315     }
3316   while (ent++->more);
3317   return REG_NOERROR;
3318 }
3319
/* Build the transition table for STATE.

   The nodes of STATE are first partitioned into destinations by
   group_nodes_into_DFAstates.  A DFA state is then acquired for each
   destination (plus word-context and newline-context variants when the
   destination state has a constraint).  Finally either STATE->trtable
   (SBC_MAX entries) or STATE->word_trtable (2 * SBC_MAX entries) is
   allocated and filled in.

   Return 1 if succeeded, otherwise return 0.  */

static int
internal_function
build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
{
  reg_errcode_t err;
  int i, j, ch, need_word_trtable = 0;
  bitset_word_t elem, mask;
  /* These two flags record whether the corresponding buffer came from
     malloc (and so must be freed) rather than from alloca.  */
  bool dests_node_malloced = false;
  bool dest_states_malloced = false;
  int ndests; /* Number of the destination states from `state'.  */
  re_dfastate_t **trtable;
  re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
  re_node_set follows, *dests_node;
  bitset_t *dests_ch;
  bitset_t acceptable;

  struct dests_alloc
  {
    re_node_set dests_node[SBC_MAX];
    bitset_t dests_ch[SBC_MAX];
  } *dests_alloc;

  /* We build DFA states which corresponds to the destination nodes
     from `state'.  `dests_node[i]' represents the nodes which i-th
     destination state contains, and `dests_ch[i]' represents the
     characters which i-th destination state accepts.  */
  if (__libc_use_alloca (sizeof (struct dests_alloc)))
    dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
  else
    {
      dests_alloc = re_malloc (struct dests_alloc, 1);
      if (BE (dests_alloc == NULL, 0))
        return 0;
      dests_node_malloced = true;
    }
  dests_node = dests_alloc->dests_node;
  dests_ch = dests_alloc->dests_ch;

  /* Initialize transition table.  */
  state->word_trtable = state->trtable = NULL;

  /* At first, group all nodes belonging to `state' into several
     destinations.  */
  ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
  if (BE (ndests <= 0, 0))
    {
      if (dests_node_malloced)
        free (dests_alloc);
      /* Return 0 in case of an error, 1 otherwise.  With no destination
         at all, an all-NULL table is installed.  */
      if (ndests == 0)
        {
          state->trtable = (re_dfastate_t **)
            calloc (sizeof (re_dfastate_t *), SBC_MAX);
          if (BE (state->trtable == NULL, 0))
            return 0;
          return 1;
        }
      return 0;
    }

  err = re_node_set_alloc (&follows, ndests + 1);
  if (BE (err != REG_NOERROR, 0))
    goto out_free;

  /* Avoid arithmetic overflow in size calculation.  */
  if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
            / (3 * sizeof (re_dfastate_t *)))
           < ndests),
          0))
    goto out_free;

  /* One buffer holds three arrays of NDESTS entries each: the plain
     destination states, the word-context ones and the newline-context
     ones (see the pointer setup below).  */
  if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
                         + ndests * 3 * sizeof (re_dfastate_t *)))
    dest_states = (re_dfastate_t **)
      alloca (ndests * 3 * sizeof (re_dfastate_t *));
  else
    {
      dest_states = (re_dfastate_t **)
        malloc (ndests * 3 * sizeof (re_dfastate_t *));
      if (BE (dest_states == NULL, 0))
        {
          /* Common failure exit, also reached by `goto' from both before
             and after this point; it releases everything acquired so far
             and returns 0.  */
out_free:
          if (dest_states_malloced)
            free (dest_states);
          re_node_set_free (&follows);
          for (i = 0; i < ndests; ++i)
            re_node_set_free (dests_node + i);
          if (dests_node_malloced)
            free (dests_alloc);
          return 0;
        }
      dest_states_malloced = true;
    }
  dest_states_word = dest_states + ndests;
  dest_states_nl = dest_states_word + ndests;
  bitset_empty (acceptable);

  /* Then build the states for all destinations.  */
  for (i = 0; i < ndests; ++i)
    {
      int next_node;
      re_node_set_empty (&follows);
      /* Merge the follows of this destination states.  */
      for (j = 0; j < dests_node[i].nelem; ++j)
        {
          next_node = dfa->nexts[dests_node[i].elems[j]];
          if (next_node != -1)
            {
              err = re_node_set_merge (&follows, dfa->eclosures + next_node);
              if (BE (err != REG_NOERROR, 0))
                goto out_free;
            }
        }
      dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
      if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
        goto out_free;
      /* If the new state has context constraint,
         build appropriate states for these contexts.  */
      if (dest_states[i]->has_constraint)
        {
          dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
                                                          CONTEXT_WORD);
          if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
            goto out_free;

          if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
            need_word_trtable = 1;

          dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
                                                        CONTEXT_NEWLINE);
          if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
            goto out_free;
        }
      else
        {
          dest_states_word[i] = dest_states[i];
          dest_states_nl[i] = dest_states[i];
        }
      /* ACCEPTABLE accumulates every byte accepted by some destination.  */
      bitset_merge (acceptable, dests_ch[i]);
    }

  if (!BE (need_word_trtable, 0))
    {
      /* We don't care about whether the following character is a word
         character, or we are in a single-byte character set so we can
         discern by looking at the character code: allocate a
         256-entry transition table.  */
      trtable = state->trtable =
        (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
      if (BE (trtable == NULL, 0))
        goto out_free;

      /* For all characters ch...:  */
      for (i = 0; i < BITSET_WORDS; ++i)
        for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
             elem;
             mask <<= 1, elem >>= 1, ++ch)
          if (BE (elem & 1, 0))
            {
              /* There must be exactly one destination which accepts
                 character ch.  See group_nodes_into_DFAstates.  */
              for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
                ;

              /* j-th destination accepts the character ch; use the
                 word-context state when ch is a word character.  */
              if (dfa->word_char[i] & mask)
                trtable[ch] = dest_states_word[j];
              else
                trtable[ch] = dest_states[j];
            }
    }
  else
    {
      /* We care about whether the following character is a word
         character, and we are in a multi-byte character set: discern
         by looking at the character code: build two 256-entry
         transition tables, one starting at trtable[0] and one
         starting at trtable[SBC_MAX].  */
      trtable = state->word_trtable =
        (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
      if (BE (trtable == NULL, 0))
        goto out_free;

      /* For all characters ch...:  */
      for (i = 0; i < BITSET_WORDS; ++i)
        for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
             elem;
             mask <<= 1, elem >>= 1, ++ch)
          if (BE (elem & 1, 0))
            {
              /* There must be exactly one destination which accepts
                 character ch.  See group_nodes_into_DFAstates.  */
              for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
                ;

              /* j-th destination accepts the character ch: the first
                 table gets the plain state, the second one the
                 word-context state.  */
              trtable[ch] = dest_states[j];
              trtable[ch + SBC_MAX] = dest_states_word[j];
            }
    }

  /* new line */
  if (bitset_contain (acceptable, NEWLINE_CHAR))
    {
      /* The current state accepts newline character.  */
      for (j = 0; j < ndests; ++j)
        if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
          {
            /* j-th destination accepts newline character.  */
            trtable[NEWLINE_CHAR] = dest_states_nl[j];
            if (need_word_trtable)
              trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
            /* There must be only one destination which accepts
               newline.  See group_nodes_into_DFAstates.  */
            break;
          }
    }

  /* Success: release the temporary buffers; the table itself stays
     attached to STATE.  */
  if (dest_states_malloced)
    free (dest_states);

  re_node_set_free (&follows);
  for (i = 0; i < ndests; ++i)
    re_node_set_free (dests_node + i);

  if (dests_node_malloced)
    free (dests_alloc);

  return 1;
}
3553
/* Group all nodes belonging to STATE into several destinations.
   Then for all destinations, set the nodes belonging to the destination
   to DESTS_NODE[i] and set the characters accepted by the destination
   to DESTS_CH[i].  The character sets of distinct destinations are kept
   disjoint by splitting an existing destination whenever a node accepts
   only part of it.

   This function returns the number of destinations, or -1 if a node set
   operation fails; in that case the node sets built so far in DESTS_NODE
   have already been freed.  */

static int
internal_function
group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
                            re_node_set *dests_node, bitset_t *dests_ch)
{
  reg_errcode_t err;
  int result;
  int i, j, k;
  int ndests; /* Number of the destinations from `state'.  */
  bitset_t accepts; /* Characters a node can accept.  */
  const re_node_set *cur_nodes = &state->nodes;
  bitset_empty (accepts);
  ndests = 0;

  /* For all the nodes belonging to `state',  */
  for (i = 0; i < cur_nodes->nelem; ++i)
    {
      re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
      re_token_type_t type = node->type;
      unsigned int constraint = node->constraint;

      /* Enumerate all single byte character this node can accept.
         ACCEPTS is empty on entry to every iteration: each path that
         leaves the loop body leaves it empty again.  */
      if (type == CHARACTER)
        bitset_set (accepts, node->opr.c);
      else if (type == SIMPLE_BRACKET)
        {
          bitset_merge (accepts, node->opr.sbcset);
        }
      else if (type == OP_PERIOD)
        {
#ifdef RE_ENABLE_I18N
          if (dfa->mb_cur_max > 1)
            bitset_merge (accepts, dfa->sb_char);
          else
#endif
            bitset_set_all (accepts);
          if (!(dfa->syntax & RE_DOT_NEWLINE))
            bitset_clear (accepts, '\n');
          if (dfa->syntax & RE_DOT_NOT_NULL)
            bitset_clear (accepts, '\0');
        }
#ifdef RE_ENABLE_I18N
      else if (type == OP_UTF8_PERIOD)
        {
          /* Set the first half of the bitset, then apply the same
             newline/NUL exclusions as for OP_PERIOD.  */
          memset (accepts, '\xff', sizeof (bitset_t) / 2);
          if (!(dfa->syntax & RE_DOT_NEWLINE))
            bitset_clear (accepts, '\n');
          if (dfa->syntax & RE_DOT_NOT_NULL)
            bitset_clear (accepts, '\0');
        }
#endif
      else
        /* Other node types are not grouped here.  */
        continue;

      /* Check `accepts' and sift out the characters which do not
         match in the context required by the node's constraint.  */
      if (constraint)
        {
          if (constraint & NEXT_NEWLINE_CONSTRAINT)
            {
              /* Only the newline character can survive.  */
              bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
              bitset_empty (accepts);
              if (accepts_newline)
                bitset_set (accepts, NEWLINE_CHAR);
              else
                continue;
            }
          if (constraint & NEXT_ENDBUF_CONSTRAINT)
            {
              /* No character can survive.  */
              bitset_empty (accepts);
              continue;
            }

          if (constraint & NEXT_WORD_CONSTRAINT)
            {
              bitset_word_t any_set = 0;
              if (type == CHARACTER && !node->word_char)
                {
                  bitset_empty (accepts);
                  continue;
                }
#ifdef RE_ENABLE_I18N
              if (dfa->mb_cur_max > 1)
                for (j = 0; j < BITSET_WORDS; ++j)
                  any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
              else
#endif
                for (j = 0; j < BITSET_WORDS; ++j)
                  any_set |= (accepts[j] &= dfa->word_char[j]);
              /* Nothing left after filtering: skip this node.  */
              if (!any_set)
                continue;
            }
          if (constraint & NEXT_NOTWORD_CONSTRAINT)
            {
              bitset_word_t any_set = 0;
              if (type == CHARACTER && node->word_char)
                {
                  bitset_empty (accepts);
                  continue;
                }
#ifdef RE_ENABLE_I18N
              if (dfa->mb_cur_max > 1)
                for (j = 0; j < BITSET_WORDS; ++j)
                  any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
              else
#endif
                for (j = 0; j < BITSET_WORDS; ++j)
                  any_set |= (accepts[j] &= ~dfa->word_char[j]);
              /* Nothing left after filtering: skip this node.  */
              if (!any_set)
                continue;
            }
        }

      /* Then divide `accepts' into DFA states, or create a new
         state.  Above, we make sure that accepts is not empty.  */
      for (j = 0; j < ndests; ++j)
        {
          bitset_t intersec; /* Intersection sets, see below.  */
          bitset_t remains;
          /* Flags, see below.  */
          bitset_word_t has_intersec, not_subset, not_consumed;

          /* Optimization, skip if this state doesn't accept the character.  */
          if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
            continue;

          /* Enumerate the intersection set of this state and `accepts'.  */
          has_intersec = 0;
          for (k = 0; k < BITSET_WORDS; ++k)
            has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
          /* And skip if the intersection set is empty.  */
          if (!has_intersec)
            continue;

          /* Then check if this state is a subset of `accepts'.
             `remains' gets the characters of this state which are not in
             `accepts'; `accepts' itself is reduced to the characters not
             covered by this state.  */
          not_subset = not_consumed = 0;
          for (k = 0; k < BITSET_WORDS; ++k)
            {
              not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
              not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
            }

          /* If this state isn't a subset of `accepts', create a
             new group state, which has the `remains'.  The j-th group
             keeps only the intersection, and the new group starts with a
             copy of the j-th group's nodes.  */
          if (not_subset)
            {
              bitset_copy (dests_ch[ndests], remains);
              bitset_copy (dests_ch[j], intersec);
              err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
              if (BE (err != REG_NOERROR, 0))
                goto error_return;
              ++ndests;
            }

          /* Put the position in the current group. */
          result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
          if (BE (result < 0, 0))
            goto error_return;

          /* If all characters are consumed, go to next node. */
          if (!not_consumed)
            break;
        }
      /* Some characters remain, create a new group.  (j == ndests means
         the loop above ran to completion without a `break'.)  */
      if (j == ndests)
        {
          bitset_copy (dests_ch[ndests], accepts);
          err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
          if (BE (err != REG_NOERROR, 0))
            goto error_return;
          ++ndests;
          bitset_empty (accepts);
        }
    }
  return ndests;
 error_return:
  /* Release the node sets of the groups created so far.  */
  for (j = 0; j < ndests; ++j)
    re_node_set_free (dests_node + j);
  return -1;
}
3739
3740 #ifdef RE_ENABLE_I18N
3741 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3742    Return the number of the bytes the node accepts.
3743    STR_IDX is the current index of the input string.
3744
3745    This function handles the nodes which can accept one character, or
3746    one collating element like '.', '[a-z]', opposite to the other nodes
3747    can only accept one byte.  */
3748
3749 static int
3750 internal_function
3751 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3752                          const re_string_t *input, int str_idx)
3753 {
3754   const re_token_t *node = dfa->nodes + node_idx;
3755   int char_len, elem_len;
3756   int i;
3757
3758   if (BE (node->type == OP_UTF8_PERIOD, 0))
3759     {
3760       unsigned char c = re_string_byte_at (input, str_idx), d;
3761       if (BE (c < 0xc2, 1))
3762         return 0;
3763
3764       if (str_idx + 2 > input->len)
3765         return 0;
3766
3767       d = re_string_byte_at (input, str_idx + 1);
3768       if (c < 0xe0)
3769         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3770       else if (c < 0xf0)
3771         {
3772           char_len = 3;
3773           if (c == 0xe0 && d < 0xa0)
3774             return 0;
3775         }
3776       else if (c < 0xf8)
3777         {
3778           char_len = 4;
3779           if (c == 0xf0 && d < 0x90)
3780             return 0;
3781         }
3782       else if (c < 0xfc)
3783         {
3784           char_len = 5;
3785           if (c == 0xf8 && d < 0x88)
3786             return 0;
3787         }
3788       else if (c < 0xfe)
3789         {
3790           char_len = 6;
3791           if (c == 0xfc && d < 0x84)
3792             return 0;
3793         }
3794       else
3795         return 0;
3796
3797       if (str_idx + char_len > input->len)
3798         return 0;
3799
3800       for (i = 1; i < char_len; ++i)
3801         {
3802           d = re_string_byte_at (input, str_idx + i);
3803           if (d < 0x80 || d > 0xbf)
3804             return 0;
3805         }
3806       return char_len;
3807     }
3808
3809   char_len = re_string_char_size_at (input, str_idx);
3810   if (node->type == OP_PERIOD)
3811     {
3812       if (char_len <= 1)
3813         return 0;
3814       /* FIXME: I don't think this if is needed, as both '\n'
3815          and '\0' are char_len == 1.  */
3816       /* '.' accepts any one character except the following two cases.  */
3817       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3818            re_string_byte_at (input, str_idx) == '\n') ||
3819           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3820            re_string_byte_at (input, str_idx) == '\0'))
3821         return 0;
3822       return char_len;
3823     }
3824
3825   elem_len = re_string_elem_size_at (input, str_idx);
3826   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3827     return 0;
3828
3829   if (node->type == COMPLEX_BRACKET)
3830     {
3831       const re_charset_t *cset = node->opr.mbcset;
3832 # ifdef _LIBC
3833       const unsigned char *pin
3834         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3835       int j;
3836       uint32_t nrules;
3837 # endif /* _LIBC */
3838       int match_len = 0;
3839       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3840                     ? re_string_wchar_at (input, str_idx) : 0);
3841
3842       /* match with multibyte character?  */
3843       for (i = 0; i < cset->nmbchars; ++i)
3844         if (wc == cset->mbchars[i])
3845           {
3846             match_len = char_len;
3847             goto check_node_accept_bytes_match;
3848           }
3849       /* match with character_class?  */
3850       for (i = 0; i < cset->nchar_classes; ++i)
3851         {
3852           wctype_t wt = cset->char_classes[i];
3853           if (__iswctype (wc, wt))
3854             {
3855               match_len = char_len;
3856               goto check_node_accept_bytes_match;
3857             }
3858         }
3859
3860 # ifdef _LIBC
3861       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3862       if (nrules != 0)
3863         {
3864           unsigned int in_collseq = 0;
3865           const int32_t *table, *indirect;
3866           const unsigned char *weights, *extra;
3867           const char *collseqwc;
3868           /* This #include defines a local function!  */
3869 #  include <locale/weight.h>
3870
3871           /* match with collating_symbol?  */
3872           if (cset->ncoll_syms)
3873             extra = (const unsigned char *)
3874               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3875           for (i = 0; i < cset->ncoll_syms; ++i)
3876             {
3877               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3878               /* Compare the length of input collating element and
3879                  the length of current collating element.  */
3880               if (*coll_sym != elem_len)
3881                 continue;
3882               /* Compare each bytes.  */
3883               for (j = 0; j < *coll_sym; j++)
3884                 if (pin[j] != coll_sym[1 + j])
3885                   break;
3886               if (j == *coll_sym)
3887                 {
3888                   /* Match if every bytes is equal.  */
3889                   match_len = j;
3890                   goto check_node_accept_bytes_match;
3891                 }
3892             }
3893
3894           if (cset->nranges)
3895             {
3896               if (elem_len <= char_len)
3897                 {
3898                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3899                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3900                 }
3901               else
3902                 in_collseq = find_collation_sequence_value (pin, elem_len);
3903             }
3904           /* match with range expression?  */
3905           for (i = 0; i < cset->nranges; ++i)
3906             if (cset->range_starts[i] <= in_collseq
3907                 && in_collseq <= cset->range_ends[i])
3908               {
3909                 match_len = elem_len;
3910                 goto check_node_accept_bytes_match;
3911               }
3912
3913           /* match with equivalence_class?  */
3914           if (cset->nequiv_classes)
3915             {
3916               const unsigned char *cp = pin;
3917               table = (const int32_t *)
3918                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3919               weights = (const unsigned char *)
3920                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3921               extra = (const unsigned char *)
3922                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3923               indirect = (const int32_t *)
3924                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3925               int32_t idx = findidx (&cp);
3926               if (idx > 0)
3927                 for (i = 0; i < cset->nequiv_classes; ++i)
3928                   {
3929                     int32_t equiv_class_idx = cset->equiv_classes[i];
3930                     size_t weight_len = weights[idx & 0xffffff];
3931                     if (weight_len == weights[equiv_class_idx & 0xffffff]
3932                         && (idx >> 24) == (equiv_class_idx >> 24))
3933                       {
3934                         int cnt = 0;
3935
3936                         idx &= 0xffffff;
3937                         equiv_class_idx &= 0xffffff;
3938
3939                         while (cnt <= weight_len
3940                                && (weights[equiv_class_idx + 1 + cnt]
3941                                    == weights[idx + 1 + cnt]))
3942                           ++cnt;
3943                         if (cnt > weight_len)
3944                           {
3945                             match_len = elem_len;
3946                             goto check_node_accept_bytes_match;
3947                           }
3948                       }
3949                   }
3950             }
3951         }
3952       else
3953 # endif /* _LIBC */
3954         {
3955           /* match with range expression?  */
3956 #if __GNUC__ >= 2
3957           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3958 #else
3959           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3960           cmp_buf[2] = wc;
3961 #endif
3962           for (i = 0; i < cset->nranges; ++i)
3963             {
3964               cmp_buf[0] = cset->range_starts[i];
3965               cmp_buf[4] = cset->range_ends[i];
3966               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3967                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3968                 {
3969                   match_len = char_len;
3970                   goto check_node_accept_bytes_match;
3971                 }
3972             }
3973         }
3974     check_node_accept_bytes_match:
3975       if (!cset->non_match)
3976         return match_len;
3977       else
3978         {
3979           if (match_len > 0)
3980             return 0;
3981           else
3982             return (elem_len > char_len) ? elem_len : char_len;
3983         }
3984     }
3985   return 0;
3986 }
3987
3988 # ifdef _LIBC
3989 static unsigned int
3990 internal_function
3991 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3992 {
3993   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3994   if (nrules == 0)
3995     {
3996       if (mbs_len == 1)
3997         {
3998           /* No valid character.  Match it as a single byte character.  */
3999           const unsigned char *collseq = (const unsigned char *)
4000             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4001           return collseq[mbs[0]];
4002         }
4003       return UINT_MAX;
4004     }
4005   else
4006     {
4007       int32_t idx;
4008       const unsigned char *extra = (const unsigned char *)
4009         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4010       int32_t extrasize = (const unsigned char *)
4011         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4012
4013       for (idx = 0; idx < extrasize;)
4014         {
4015           int mbs_cnt, found = 0;
4016           int32_t elem_mbs_len;
4017           /* Skip the name of collating element name.  */
4018           idx = idx + extra[idx] + 1;
4019           elem_mbs_len = extra[idx++];
4020           if (mbs_len == elem_mbs_len)
4021             {
4022               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4023                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4024                   break;
4025               if (mbs_cnt == elem_mbs_len)
4026                 /* Found the entry.  */
4027                 found = 1;
4028             }
4029           /* Skip the byte sequence of the collating element.  */
4030           idx += elem_mbs_len;
4031           /* Adjust for the alignment.  */
4032           idx = (idx + 3) & ~3;
4033           /* Skip the collation sequence value.  */
4034           idx += sizeof (uint32_t);
4035           /* Skip the wide char sequence of the collating element.  */
4036           idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
4037           /* If we found the entry, return the sequence value.  */
4038           if (found)
4039             return *(uint32_t *) (extra + idx);
4040           /* Skip the collation sequence value.  */
4041           idx += sizeof (uint32_t);
4042         }
4043       return UINT_MAX;
4044     }
4045 }
4046 # endif /* _LIBC */
4047 #endif /* RE_ENABLE_I18N */
4048
4049 /* Check whether the node accepts the byte which is IDX-th
4050    byte of the INPUT.  */
4051
4052 static int
4053 internal_function
4054 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4055                    int idx)
4056 {
4057   unsigned char ch;
4058   ch = re_string_byte_at (&mctx->input, idx);
4059   switch (node->type)
4060     {
4061     case CHARACTER:
4062       if (node->opr.c != ch)
4063         return 0;
4064       break;
4065
4066     case SIMPLE_BRACKET:
4067       if (!bitset_contain (node->opr.sbcset, ch))
4068         return 0;
4069       break;
4070
4071 #ifdef RE_ENABLE_I18N
4072     case OP_UTF8_PERIOD:
4073       if (ch >= 0x80)
4074         return 0;
4075       /* FALLTHROUGH */
4076 #endif
4077     case OP_PERIOD:
4078       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4079           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4080         return 0;
4081       break;
4082
4083     default:
4084       return 0;
4085     }
4086
4087   if (node->constraint)
4088     {
4089       /* The node has constraints.  Check whether the current context
4090          satisfies the constraints.  */
4091       unsigned int context = re_string_context_at (&mctx->input, idx,
4092                                                    mctx->eflags);
4093       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4094         return 0;
4095     }
4096
4097   return 1;
4098 }
4099
4100 /* Extend the buffers, if the buffers have run out.  */
4101
4102 static reg_errcode_t
4103 internal_function __attribute_warn_unused_result__
4104 extend_buffers (re_match_context_t *mctx)
4105 {
4106   reg_errcode_t ret;
4107   re_string_t *pstr = &mctx->input;
4108
4109   /* Avoid overflow.  */
4110   if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4111     return REG_ESPACE;
4112
4113   /* Double the lengthes of the buffers.  */
4114   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4115   if (BE (ret != REG_NOERROR, 0))
4116     return ret;
4117
4118   if (mctx->state_log != NULL)
4119     {
4120       /* And double the length of state_log.  */
4121       /* XXX We have no indication of the size of this buffer.  If this
4122          allocation fail we have no indication that the state_log array
4123          does not have the right size.  */
4124       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4125                                               pstr->bufs_len + 1);
4126       if (BE (new_array == NULL, 0))
4127         return REG_ESPACE;
4128       mctx->state_log = new_array;
4129     }
4130
4131   /* Then reconstruct the buffers.  */
4132   if (pstr->icase)
4133     {
4134 #ifdef RE_ENABLE_I18N
4135       if (pstr->mb_cur_max > 1)
4136         {
4137           ret = build_wcs_upper_buffer (pstr);
4138           if (BE (ret != REG_NOERROR, 0))
4139             return ret;
4140         }
4141       else
4142 #endif /* RE_ENABLE_I18N  */
4143         build_upper_buffer (pstr);
4144     }
4145   else
4146     {
4147 #ifdef RE_ENABLE_I18N
4148       if (pstr->mb_cur_max > 1)
4149         build_wcs_buffer (pstr);
4150       else
4151 #endif /* RE_ENABLE_I18N  */
4152         {
4153           if (pstr->trans != NULL)
4154             re_string_translate_buffer (pstr);
4155         }
4156     }
4157   return REG_NOERROR;
4158 }
4159
4160 \f
4161 /* Functions for matching context.  */
4162
4163 /* Initialize MCTX.  */
4164
4165 static reg_errcode_t
4166 internal_function __attribute_warn_unused_result__
4167 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4168 {
4169   mctx->eflags = eflags;
4170   mctx->match_last = -1;
4171   if (n > 0)
4172     {
4173       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4174       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4175       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4176         return REG_ESPACE;
4177     }
4178   /* Already zero-ed by the caller.
4179      else
4180        mctx->bkref_ents = NULL;
4181      mctx->nbkref_ents = 0;
4182      mctx->nsub_tops = 0;  */
4183   mctx->abkref_ents = n;
4184   mctx->max_mb_elem_len = 1;
4185   mctx->asub_tops = n;
4186   return REG_NOERROR;
4187 }
4188
4189 /* Clean the entries which depend on the current input in MCTX.
4190    This function must be invoked when the matcher changes the start index
4191    of the input, or changes the input string.  */
4192
4193 static void
4194 internal_function
4195 match_ctx_clean (re_match_context_t *mctx)
4196 {
4197   int st_idx;
4198   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4199     {
4200       int sl_idx;
4201       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4202       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4203         {
4204           re_sub_match_last_t *last = top->lasts[sl_idx];
4205           re_free (last->path.array);
4206           re_free (last);
4207         }
4208       re_free (top->lasts);
4209       if (top->path)
4210         {
4211           re_free (top->path->array);
4212           re_free (top->path);
4213         }
4214       free (top);
4215     }
4216
4217   mctx->nsub_tops = 0;
4218   mctx->nbkref_ents = 0;
4219 }
4220
4221 /* Free all the memory associated with MCTX.  */
4222
4223 static void
4224 internal_function
4225 match_ctx_free (re_match_context_t *mctx)
4226 {
4227   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4228   match_ctx_clean (mctx);
4229   re_free (mctx->sub_tops);
4230   re_free (mctx->bkref_ents);
4231 }
4232
4233 /* Add a new backreference entry to MCTX.
4234    Note that we assume that caller never call this function with duplicate
4235    entry, and call with STR_IDX which isn't smaller than any existing entry.
4236 */
4237
4238 static reg_errcode_t
4239 internal_function __attribute_warn_unused_result__
4240 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4241                      int to)
4242 {
4243   if (mctx->nbkref_ents >= mctx->abkref_ents)
4244     {
4245       struct re_backref_cache_entry* new_entry;
4246       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4247                               mctx->abkref_ents * 2);
4248       if (BE (new_entry == NULL, 0))
4249         {
4250           re_free (mctx->bkref_ents);
4251           return REG_ESPACE;
4252         }
4253       mctx->bkref_ents = new_entry;
4254       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4255               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4256       mctx->abkref_ents *= 2;
4257     }
4258   if (mctx->nbkref_ents > 0
4259       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4260     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4261
4262   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4263   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4264   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4265   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4266
4267   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4268      If bit N is clear, means that this entry won't epsilon-transition to
4269      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4270      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4271      such node.
4272
4273      A backreference does not epsilon-transition unless it is empty, so set
4274      to all zeros if FROM != TO.  */
4275   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4276     = (from == to ? ~0 : 0);
4277
4278   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4279   if (mctx->max_mb_elem_len < to - from)
4280     mctx->max_mb_elem_len = to - from;
4281   return REG_NOERROR;
4282 }
4283
4284 /* Search for the first entry which has the same str_idx, or -1 if none is
4285    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4286
4287 static int
4288 internal_function
4289 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4290 {
4291   int left, right, mid, last;
4292   last = right = mctx->nbkref_ents;
4293   for (left = 0; left < right;)
4294     {
4295       mid = (left + right) / 2;
4296       if (mctx->bkref_ents[mid].str_idx < str_idx)
4297         left = mid + 1;
4298       else
4299         right = mid;
4300     }
4301   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4302     return left;
4303   else
4304     return -1;
4305 }
4306
4307 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4308    at STR_IDX.  */
4309
4310 static reg_errcode_t
4311 internal_function __attribute_warn_unused_result__
4312 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4313 {
4314 #ifdef DEBUG
4315   assert (mctx->sub_tops != NULL);
4316   assert (mctx->asub_tops > 0);
4317 #endif
4318   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4319     {
4320       int new_asub_tops = mctx->asub_tops * 2;
4321       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4322                                                    re_sub_match_top_t *,
4323                                                    new_asub_tops);
4324       if (BE (new_array == NULL, 0))
4325         return REG_ESPACE;
4326       mctx->sub_tops = new_array;
4327       mctx->asub_tops = new_asub_tops;
4328     }
4329   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4330   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4331     return REG_ESPACE;
4332   mctx->sub_tops[mctx->nsub_tops]->node = node;
4333   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4334   return REG_NOERROR;
4335 }
4336
4337 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4338    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4339
4340 static re_sub_match_last_t *
4341 internal_function
4342 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4343 {
4344   re_sub_match_last_t *new_entry;
4345   if (BE (subtop->nlasts == subtop->alasts, 0))
4346     {
4347       int new_alasts = 2 * subtop->alasts + 1;
4348       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4349                                                     re_sub_match_last_t *,
4350                                                     new_alasts);
4351       if (BE (new_array == NULL, 0))
4352         return NULL;
4353       subtop->lasts = new_array;
4354       subtop->alasts = new_alasts;
4355     }
4356   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4357   if (BE (new_entry != NULL, 1))
4358     {
4359       subtop->lasts[subtop->nlasts] = new_entry;
4360       new_entry->node = node;
4361       new_entry->str_idx = str_idx;
4362       ++subtop->nlasts;
4363     }
4364   return new_entry;
4365 }
4366
4367 static void
4368 internal_function
4369 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4370                re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4371 {
4372   sctx->sifted_states = sifted_sts;
4373   sctx->limited_states = limited_sts;
4374   sctx->last_node = last_node;
4375   sctx->last_str_idx = last_str_idx;
4376   re_node_set_init_empty (&sctx->limits);
4377 }