posix/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2005, 2007, 2009, 2010 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                      int n) internal_function;
  23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24 static void match_ctx_free (re_match_context_t *cache) internal_function;
  25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  26                                           int str_idx, int from, int to)
  27      internal_function;
  28 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
  29      internal_function;
  30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  31                                            int str_idx) internal_function;
  32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  33                                                    int node, int str_idx)
  34      internal_function;
  35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  36                            re_dfastate_t **limited_sts, int last_node,
  37                            int last_str_idx)
  38      internal_function;
  39 static reg_errcode_t re_search_internal (const regex_t *preg,
  40                                          const char *string, int length,
  41                                          int start, int range, int stop,
  42                                          size_t nmatch, regmatch_t pmatch[],
  43                                          int eflags) internal_function;
  44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  45                              const char *string1, int length1,
  46                              const char *string2, int length2,
  47                              int start, int range, struct re_registers *regs,
  48                              int stop, int ret_len) internal_function;
  49 static int re_search_stub (struct re_pattern_buffer *bufp,
  50                            const char *string, int length, int start,
  51                            int range, int stop, struct re_registers *regs,
  52                            int ret_len) internal_function;
  53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  54                               int nregs, int regs_allocated) internal_function;
  55 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  56      internal_function;
  57 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  58                            int *p_match_first) internal_function;
  59 static int check_halt_state_context (const re_match_context_t *mctx,
  60                                      const re_dfastate_t *state, int idx)
  61      internal_function;
  62 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  63                          regmatch_t *prev_idx_match, int cur_node,
  64                          int cur_idx, int nmatch) internal_function;
  65 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  66                                       int str_idx, int dest_node, int nregs,
  67                                       regmatch_t *regs,
  68                                       re_node_set *eps_via_nodes)
  69      internal_function;
  70 static reg_errcode_t set_regs (const regex_t *preg,
  71                                const re_match_context_t *mctx,
  72                                size_t nmatch, regmatch_t *pmatch,
  73                                int fl_backtrack) internal_function;
  74 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
  75      internal_function;
  76
  77 #ifdef RE_ENABLE_I18N
  78 static int sift_states_iter_mb (const re_match_context_t *mctx,
  79                                 re_sift_context_t *sctx,
  80                                 int node_idx, int str_idx, int max_str_idx)
  81      internal_function;
  82 #endif /* RE_ENABLE_I18N */
  83 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  84                                            re_sift_context_t *sctx)
  85      internal_function;
  86 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  87                                           re_sift_context_t *sctx, int str_idx,
  88                                           re_node_set *cur_dest)
  89      internal_function;
  90 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  91                                               re_sift_context_t *sctx,
  92                                               int str_idx,
  93                                               re_node_set *dest_nodes)
  94      internal_function;
  95 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  96                                             re_node_set *dest_nodes,
  97                                             const re_node_set *candidates)
  98      internal_function;
  99 static int check_dst_limits (const re_match_context_t *mctx,
 100                              re_node_set *limits,
 101                              int dst_node, int dst_idx, int src_node,
 102                              int src_idx) internal_function;
 103 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
 104                                         int boundaries, int subexp_idx,
 105                                         int from_node, int bkref_idx)
 106      internal_function;
 107 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
 108                                       int limit, int subexp_idx,
 109                                       int node, int str_idx,
 110                                       int bkref_idx) internal_function;
 111 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
 112                                           re_node_set *dest_nodes,
 113                                           const re_node_set *candidates,
 114                                           re_node_set *limits,
 115                                           struct re_backref_cache_entry *bkref_ents,
 116                                           int str_idx) internal_function;
 117 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 118                                         re_sift_context_t *sctx,
 119                                         int str_idx, const re_node_set *candidates)
 120      internal_function;
 121 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 122                                         re_dfastate_t **dst,
 123                                         re_dfastate_t **src, int num)
 124      internal_function;
 125 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 126                                          re_match_context_t *mctx) internal_function;
 127 static re_dfastate_t *transit_state (reg_errcode_t *err,
 128                                      re_match_context_t *mctx,
 129                                      re_dfastate_t *state) internal_function;
 130 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 131                                             re_match_context_t *mctx,
 132                                             re_dfastate_t *next_state)
 133      internal_function;
 134 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 135                                                 re_node_set *cur_nodes,
 136                                                 int str_idx) internal_function;
 137 #if 0
 138 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 139                                         re_match_context_t *mctx,
 140                                         re_dfastate_t *pstate)
 141      internal_function;
 142 #endif
 143 #ifdef RE_ENABLE_I18N
 144 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 145                                        re_dfastate_t *pstate)
 146      internal_function;
 147 #endif /* RE_ENABLE_I18N */
 148 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 149                                           const re_node_set *nodes)
 150      internal_function;
 151 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 152                                  int bkref_node, int bkref_str_idx)
 153      internal_function;
 154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 155                                      const re_sub_match_top_t *sub_top,
 156                                      re_sub_match_last_t *sub_last,
 157                                      int bkref_node, int bkref_str)
 158      internal_function;
 159 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 160                              int subexp_idx, int type) internal_function;
 161 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 162                                     state_array_t *path, int top_node,
 163                                     int top_str, int last_node, int last_str,
 164                                     int type) internal_function;
 165 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 166                                                    int str_idx,
 167                                                    re_node_set *cur_nodes,
 168                                                    re_node_set *next_nodes)
 169      internal_function;
 170 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 171                                                re_node_set *cur_nodes,
 172                                                int ex_subexp, int type)
 173      internal_function;
 174 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 175                                                    re_node_set *dst_nodes,
 176                                                    int target, int ex_subexp,
 177                                                    int type) internal_function;
 178 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 179                                          re_node_set *cur_nodes, int cur_str,
 180                                          int subexp_num, int type)
 181      internal_function;
 182 static int build_trtable (const re_dfa_t *dfa,
 183                           re_dfastate_t *state) internal_function;
 184 #ifdef RE_ENABLE_I18N
 185 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 186                                     const re_string_t *input, int idx)
 187      internal_function;
 188 # ifdef _LIBC
 189 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 190                                                    size_t name_len)
 191      internal_function;
 192 # endif /* _LIBC */
 193 #endif /* RE_ENABLE_I18N */
 194 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
 195                                        const re_dfastate_t *state,
 196                                        re_node_set *states_node,
 197                                        bitset_t *states_ch) internal_function;
 198 static int check_node_accept (const re_match_context_t *mctx,
 199                               const re_token_t *node, int idx)
 200      internal_function;
 201 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
 202      internal_function;
 203 \f
 204 /* Entry point for POSIX code.  */
 205
 206 /* regexec searches for a given pattern, specified by PREG, in the
 207    string STRING.
 208
 209    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 210    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 211    least NMATCH elements, and we set them to the offsets of the
 212    corresponding matched substrings.
 213
 214    EFLAGS specifies `execution flags' which affect matching: if
 215    REG_NOTBOL is set, then ^ does not match at the beginning of the
 216    string; if REG_NOTEOL is set, then $ does not match at the end.
 217
 218    We return 0 if we find a match and REG_NOMATCH if not.  */
 219
 220 int
 221 regexec (preg, string, nmatch, pmatch, eflags)
 222     const regex_t *__restrict preg;
 223     const char *__restrict string;
 224     size_t nmatch;
 225     regmatch_t pmatch[];
 226     int eflags;
 227 {
 228   reg_errcode_t err;
 229   int start, length;
 230   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 231
 232   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 233     return REG_BADPAT;
 234
 235   if (eflags & REG_STARTEND)
 236     {
 237       start = pmatch[0].rm_so;
 238       length = pmatch[0].rm_eo;
 239     }
 240   else
 241     {
 242       start = 0;
 243       length = strlen (string);
 244     }
 245
 246   __libc_lock_lock (dfa->lock);
 247   if (preg->no_sub)
 248     err = re_search_internal (preg, string, length, start, length - start,
 249                               length, 0, NULL, eflags);
 250   else
 251     err = re_search_internal (preg, string, length, start, length - start,
 252                               length, nmatch, pmatch, eflags);
 253   __libc_lock_unlock (dfa->lock);
 254   return err != REG_NOERROR;
 255 }
 256
 257 #ifdef _LIBC
 258 # include <shlib-compat.h>
 259 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 260
 261 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 262 __typeof__ (__regexec) __compat_regexec;
 263
 264 int
 265 attribute_compat_text_section
 266 __compat_regexec (const regex_t *__restrict preg,
 267                   const char *__restrict string, size_t nmatch,
 268                   regmatch_t pmatch[], int eflags)
 269 {
 270   return regexec (preg, string, nmatch, pmatch,
 271                   eflags & (REG_NOTBOL | REG_NOTEOL));
 272 }
 273 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 274 # endif
 275 #endif
 276
 277 /* Entry points for GNU code.  */
 278
 279 /* re_match, re_search, re_match_2, re_search_2
 280
 281    The former two functions operate on STRING with length LENGTH,
 282    while the later two operate on concatenation of STRING1 and STRING2
 283    with lengths LENGTH1 and LENGTH2, respectively.
 284
 285    re_match() matches the compiled pattern in BUFP against the string,
 286    starting at index START.
 287
 288    re_search() first tries matching at index START, then it tries to match
 289    starting from index START + 1, and so on.  The last start position tried
 290    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 291    way as re_match().)
 292
 293    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 294    the first STOP characters of the concatenation of the strings should be
 295    concerned.
 296
 297    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 298    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 299    computed relative to the concatenation, not relative to the individual
 300    strings.)
 301
 302    On success, re_match* functions return the length of the match, re_search*
 303    return the position of the start of the match.  Return value -1 means no
 304    match was found and -2 indicates an internal error.  */
 305
 306 int
 307 re_match (bufp, string, length, start, regs)
 308     struct re_pattern_buffer *bufp;
 309     const char *string;
 310     int length, start;
 311     struct re_registers *regs;
 312 {
 313   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 314 }
 315 #ifdef _LIBC
 316 weak_alias (__re_match, re_match)
 317 #endif
 318
 319 int
 320 re_search (bufp, string, length, start, range, regs)
 321     struct re_pattern_buffer *bufp;
 322     const char *string;
 323     int length, start, range;
 324     struct re_registers *regs;
 325 {
 326   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 327 }
 328 #ifdef _LIBC
 329 weak_alias (__re_search, re_search)
 330 #endif
 331
 332 int
 333 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 334     struct re_pattern_buffer *bufp;
 335     const char *string1, *string2;
 336     int length1, length2, start, stop;
 337     struct re_registers *regs;
 338 {
 339   return re_search_2_stub (bufp, string1, length1, string2, length2,
 340                            start, 0, regs, stop, 1);
 341 }
 342 #ifdef _LIBC
 343 weak_alias (__re_match_2, re_match_2)
 344 #endif
 345
 346 int
 347 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 348     struct re_pattern_buffer *bufp;
 349     const char *string1, *string2;
 350     int length1, length2, start, range, stop;
 351     struct re_registers *regs;
 352 {
 353   return re_search_2_stub (bufp, string1, length1, string2, length2,
 354                            start, range, regs, stop, 0);
 355 }
 356 #ifdef _LIBC
 357 weak_alias (__re_search_2, re_search_2)
 358 #endif
 359
 360 static int
 361 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 362                   stop, ret_len)
 363     struct re_pattern_buffer *bufp;
 364     const char *string1, *string2;
 365     int length1, length2, start, range, stop, ret_len;
 366     struct re_registers *regs;
 367 {
 368   const char *str;
 369   int rval;
 370   int len = length1 + length2;
 371   int free_str = 0;
 372
 373   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 374     return -2;
 375
 376   /* Concatenate the strings.  */
 377   if (length2 > 0)
 378     if (length1 > 0)
 379       {
 380         char *s = re_malloc (char, len);
 381
 382         if (BE (s == NULL, 0))
 383           return -2;
 384 #ifdef _LIBC
 385         memcpy (__mempcpy (s, string1, length1), string2, length2);
 386 #else
 387         memcpy (s, string1, length1);
 388         memcpy (s + length1, string2, length2);
 389 #endif
 390         str = s;
 391         free_str = 1;
 392       }
 393     else
 394       str = string2;
 395   else
 396     str = string1;
 397
 398   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 399                          ret_len);
 400   if (free_str)
 401     re_free ((char *) str);
 402   return rval;
 403 }
 404
 405 /* The parameters have the same meaning as those of re_search.
 406    Additional parameters:
 407    If RET_LEN is nonzero the length of the match is returned (re_match style);
 408    otherwise the position of the match is returned.  */
 409
 410 static int
 411 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 412     struct re_pattern_buffer *bufp;
 413     const char *string;
 414     int length, start, range, stop, ret_len;
 415     struct re_registers *regs;
 416 {
 417   reg_errcode_t result;
 418   regmatch_t *pmatch;
 419   int nregs, rval;
 420   int eflags = 0;
 421   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 422
 423   /* Check for out-of-range.  */
 424   if (BE (start < 0 || start > length, 0))
 425     return -1;
 426   if (BE (start + range > length, 0))
 427     range = length - start;
 428   else if (BE (start + range < 0, 0))
 429     range = -start;
 430
 431   __libc_lock_lock (dfa->lock);
 432
 433   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 434   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 435
 436   /* Compile fastmap if we haven't yet.  */
 437   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 438     re_compile_fastmap (bufp);
 439
 440   if (BE (bufp->no_sub, 0))
 441     regs = NULL;
 442
 443   /* We need at least 1 register.  */
 444   if (regs == NULL)
 445     nregs = 1;
 446   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 447                regs->num_regs < bufp->re_nsub + 1, 0))
 448     {
 449       nregs = regs->num_regs;
 450       if (BE (nregs < 1, 0))
 451         {
 452           /* Nothing can be copied to regs.  */
 453           regs = NULL;
 454           nregs = 1;
 455         }
 456     }
 457   else
 458     nregs = bufp->re_nsub + 1;
 459   pmatch = re_malloc (regmatch_t, nregs);
 460   if (BE (pmatch == NULL, 0))
 461     {
 462       rval = -2;
 463       goto out;
 464     }
 465
 466   result = re_search_internal (bufp, string, length, start, range, stop,
 467                                nregs, pmatch, eflags);
 468
 469   rval = 0;
 470
 471   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 472   if (result != REG_NOERROR)
 473     rval = -1;
 474   else if (regs != NULL)
 475     {
 476       /* If caller wants register contents data back, copy them.  */
 477       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 478                                            bufp->regs_allocated);
 479       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 480         rval = -2;
 481     }
 482
 483   if (BE (rval == 0, 1))
 484     {
 485       if (ret_len)
 486         {
 487           assert (pmatch[0].rm_so == start);
 488           rval = pmatch[0].rm_eo - start;
 489         }
 490       else
 491         rval = pmatch[0].rm_so;
 492     }
 493   re_free (pmatch);
 494  out:
 495   __libc_lock_unlock (dfa->lock);
 496   return rval;
 497 }
 498
 499 static unsigned
 500 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 501     struct re_registers *regs;
 502     regmatch_t *pmatch;
 503     int nregs, regs_allocated;
 504 {
 505   int rval = REGS_REALLOCATE;
 506   int i;
 507   int need_regs = nregs + 1;
 508   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 509      uses.  */
 510
 511   /* Have the register data arrays been allocated?  */
 512   if (regs_allocated == REGS_UNALLOCATED)
 513     { /* No.  So allocate them with malloc.  */
 514       regs->start = re_malloc (regoff_t, need_regs);
 515       regs->end = re_malloc (regoff_t, need_regs);
 516       if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
 517         return REGS_UNALLOCATED;
 518       regs->num_regs = need_regs;
 519     }
 520   else if (regs_allocated == REGS_REALLOCATE)
 521     { /* Yes.  If we need more elements than were already
 522          allocated, reallocate them.  If we need fewer, just
 523          leave it alone.  */
 524       if (BE (need_regs > regs->num_regs, 0))
 525         {
 526           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 527           regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
 528           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 529             return REGS_UNALLOCATED;
 530           regs->start = new_start;
 531           regs->end = new_end;
 532           regs->num_regs = need_regs;
 533         }
 534     }
 535   else
 536     {
 537       assert (regs_allocated == REGS_FIXED);
 538       /* This function may not be called with REGS_FIXED and nregs too big.  */
 539       assert (regs->num_regs >= nregs);
 540       rval = REGS_FIXED;
 541     }
 542
 543   /* Copy the regs.  */
 544   for (i = 0; i < nregs; ++i)
 545     {
 546       regs->start[i] = pmatch[i].rm_so;
 547       regs->end[i] = pmatch[i].rm_eo;
 548     }
 549   for ( ; i < regs->num_regs; ++i)
 550     regs->start[i] = regs->end[i] = -1;
 551
 552   return rval;
 553 }
 554
 555 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 556    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 557    this memory for recording register information.  STARTS and ENDS
 558    must be allocated using the malloc library routine, and must each
 559    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 560
 561    If NUM_REGS == 0, then subsequent matches should allocate their own
 562    register data.
 563
 564    Unless this function is called, the first search or match using
 565    PATTERN_BUFFER will allocate its own register data, without
 566    freeing the old data.  */
 567
 568 void
 569 re_set_registers (bufp, regs, num_regs, starts, ends)
 570     struct re_pattern_buffer *bufp;
 571     struct re_registers *regs;
 572     unsigned num_regs;
 573     regoff_t *starts, *ends;
 574 {
 575   if (num_regs)
 576     {
 577       bufp->regs_allocated = REGS_REALLOCATE;
 578       regs->num_regs = num_regs;
 579       regs->start = starts;
 580       regs->end = ends;
 581     }
 582   else
 583     {
 584       bufp->regs_allocated = REGS_UNALLOCATED;
 585       regs->num_regs = 0;
 586       regs->start = regs->end = (regoff_t *) 0;
 587     }
 588 }
 589 #ifdef _LIBC
 590 weak_alias (__re_set_registers, re_set_registers)
 591 #endif
 592 \f
 593 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 594    them unless specifically requested.  */
 595
 596 #if defined _REGEX_RE_COMP || defined _LIBC
 597 int
 598 # ifdef _LIBC
 599 weak_function
 600 # endif
 601 re_exec (s)
 602      const char *s;
 603 {
 604   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 605 }
 606 #endif /* _REGEX_RE_COMP */
 607 \f
 608 /* Internal entry point.  */
 609
 610 /* Searches for a compiled pattern PREG in the string STRING, whose
 611    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 612    mingings with regexec.  START, and RANGE have the same meanings
 613    with re_search.
 614    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 615    otherwise return the error code.
 616    Note: We assume front end functions already check ranges.
 617    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 618
 619 static reg_errcode_t
 620 __attribute_warn_unused_result__
 621 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 622                     eflags)
 623     const regex_t *preg;
 624     const char *string;
 625     int length, start, range, stop, eflags;
 626     size_t nmatch;
 627     regmatch_t pmatch[];
 628 {
 629   reg_errcode_t err;
 630   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
 631   int left_lim, right_lim, incr;
 632   int fl_longest_match, match_first, match_kind, match_last = -1;
 633   int extra_nmatch;
 634   int sb, ch;
 635 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 636   re_match_context_t mctx = { .dfa = dfa };
 637 #else
 638   re_match_context_t mctx;
 639 #endif
 640   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 641                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 642   RE_TRANSLATE_TYPE t = preg->translate;
 643
 644 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 645   memset (&mctx, '\0', sizeof (re_match_context_t));
 646   mctx.dfa = dfa;
 647 #endif
 648
 649   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 650   nmatch -= extra_nmatch;
 651
 652   /* Check if the DFA haven't been compiled.  */
 653   if (BE (preg->used == 0 || dfa->init_state == NULL
 654           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 655           || dfa->init_state_begbuf == NULL, 0))
 656     return REG_NOMATCH;
 657
 658 #ifdef DEBUG
 659   /* We assume front-end functions already check them.  */
 660   assert (start + range >= 0 && start + range <= length);
 661 #endif
 662
 663   /* If initial states with non-begbuf contexts have no elements,
 664      the regex must be anchored.  If preg->newline_anchor is set,
 665      we'll never use init_state_nl, so do not check it.  */
 666   if (dfa->init_state->nodes.nelem == 0
 667       && dfa->init_state_word->nodes.nelem == 0
 668       && (dfa->init_state_nl->nodes.nelem == 0
 669           || !preg->newline_anchor))
 670     {
 671       if (start != 0 && start + range != 0)
 672         return REG_NOMATCH;
 673       start = range = 0;
 674     }
 675
 676   /* We must check the longest matching, if nmatch > 0.  */
 677   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 678
 679   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 680                             preg->translate, preg->syntax & RE_ICASE, dfa);
 681   if (BE (err != REG_NOERROR, 0))
 682     goto free_return;
 683   mctx.input.stop = stop;
 684   mctx.input.raw_stop = stop;
 685   mctx.input.newline_anchor = preg->newline_anchor;
 686
 687   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 688   if (BE (err != REG_NOERROR, 0))
 689     goto free_return;
 690
 691   /* We will log all the DFA states through which the dfa pass,
 692      if nmatch > 1, or this dfa has "multibyte node", which is a
 693      back-reference or a node which can accept multibyte character or
 694      multi character collating element.  */
 695   if (nmatch > 1 || dfa->has_mb_node)
 696     {
 697       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 698       if (BE (mctx.state_log == NULL, 0))
 699         {
 700           err = REG_ESPACE;
 701           goto free_return;
 702         }
 703     }
 704   else
 705     mctx.state_log = NULL;
 706
 707   match_first = start;
 708   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 709                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 710
 711   /* Check incrementally whether of not the input string match.  */
 712   incr = (range < 0) ? -1 : 1;
 713   left_lim = (range < 0) ? start + range : start;
 714   right_lim = (range < 0) ? start : start + range;
 715   sb = dfa->mb_cur_max == 1;
 716   match_kind =
 717     (fastmap
 718      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 719         | (range >= 0 ? 2 : 0)
 720         | (t != NULL ? 1 : 0))
 721      : 8);
 722
 723   for (;; match_first += incr)
 724     {
 725       err = REG_NOMATCH;
 726       if (match_first < left_lim || right_lim < match_first)
 727         goto free_return;
 728
 729       /* Advance as rapidly as possible through the string, until we
 730          find a plausible place to start matching.  This may be done
 731          with varying efficiency, so there are various possibilities:
 732          only the most common of them are specialized, in order to
 733          save on code size.  We use a switch statement for speed.  */
 734       switch (match_kind)
 735         {
 736         case 8:
 737           /* No fastmap.  */
 738           break;
 739
 740         case 7:
 741           /* Fastmap with single-byte translation, match forward.  */
 742           while (BE (match_first < right_lim, 1)
 743                  && !fastmap[t[(unsigned char) string[match_first]]])
 744             ++match_first;
 745           goto forward_match_found_start_or_reached_end;
 746
 747         case 6:
 748           /* Fastmap without translation, match forward.  */
 749           while (BE (match_first < right_lim, 1)
 750                  && !fastmap[(unsigned char) string[match_first]])
 751             ++match_first;
 752
 753         forward_match_found_start_or_reached_end:
 754           if (BE (match_first == right_lim, 0))
 755             {
 756               ch = match_first >= length
 757                        ? 0 : (unsigned char) string[match_first];
 758               if (!fastmap[t ? t[ch] : ch])
 759                 goto free_return;
 760             }
 761           break;
 762
 763         case 4:
 764         case 5:
 765           /* Fastmap without multi-byte translation, match backwards.  */
 766           while (match_first >= left_lim)
 767             {
 768               ch = match_first >= length
 769                        ? 0 : (unsigned char) string[match_first];
 770               if (fastmap[t ? t[ch] : ch])
 771                 break;
 772               --match_first;
 773             }
 774           if (match_first < left_lim)
 775             goto free_return;
 776           break;
 777
 778         default:
 779           /* In this case, we can't determine easily the current byte,
 780              since it might be a component byte of a multibyte
 781              character.  Then we use the constructed buffer instead.  */
 782           for (;;)
 783             {
 784               /* If MATCH_FIRST is out of the valid range, reconstruct the
 785                  buffers.  */
 786               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 787               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 788                 {
 789                   err = re_string_reconstruct (&mctx.input, match_first,
 790                                                eflags);
 791                   if (BE (err != REG_NOERROR, 0))
 792                     goto free_return;
 793
 794                   offset = match_first - mctx.input.raw_mbs_idx;
 795                 }
 796               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 797                  Note that MATCH_FIRST must not be smaller than 0.  */
 798               ch = (match_first >= length
 799                     ? 0 : re_string_byte_at (&mctx.input, offset));
 800               if (fastmap[ch])
 801                 break;
 802               match_first += incr;
 803               if (match_first < left_lim || match_first > right_lim)
 804                 {
 805                   err = REG_NOMATCH;
 806                   goto free_return;
 807                 }
 808             }
 809           break;
 810         }
 811
 812       /* Reconstruct the buffers so that the matcher can assume that
 813          the matching starts from the beginning of the buffer.  */
 814       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 815       if (BE (err != REG_NOERROR, 0))
 816         goto free_return;
 817
 818 #ifdef RE_ENABLE_I18N
 819      /* Don't consider this char as a possible match start if it part,
 820         yet isn't the head, of a multibyte character.  */
 821       if (!sb && !re_string_first_byte (&mctx.input, 0))
 822         continue;
 823 #endif
 824
 825       /* It seems to be appropriate one, then use the matcher.  */
 826       /* We assume that the matching starts from 0.  */
 827       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 828       match_last = check_matching (&mctx, fl_longest_match,
 829                                    range >= 0 ? &match_first : NULL);
 830       if (match_last != -1)
 831         {
 832           if (BE (match_last == -2, 0))
 833             {
 834               err = REG_ESPACE;
 835               goto free_return;
 836             }
 837           else
 838             {
 839               mctx.match_last = match_last;
 840               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 841                 {
 842                   re_dfastate_t *pstate = mctx.state_log[match_last];
 843                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 844                                                              match_last);
 845                 }
 846               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 847                   || dfa->nbackref)
 848                 {
 849                   err = prune_impossible_nodes (&mctx);
 850                   if (err == REG_NOERROR)
 851                     break;
 852                   if (BE (err != REG_NOMATCH, 0))
 853                     goto free_return;
 854                   match_last = -1;
 855                 }
 856               else
 857                 break; /* We found a match.  */
 858             }
 859         }
 860
 861       match_ctx_clean (&mctx);
 862     }
 863
 864 #ifdef DEBUG
 865   assert (match_last != -1);
 866   assert (err == REG_NOERROR);
 867 #endif
 868
 869   /* Set pmatch[] if we need.  */
 870   if (nmatch > 0)
 871     {
 872       int reg_idx;
 873
 874       /* Initialize registers.  */
 875       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 876         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 877
 878       /* Set the points where matching start/end.  */
 879       pmatch[0].rm_so = 0;
 880       pmatch[0].rm_eo = mctx.match_last;
 881
 882       if (!preg->no_sub && nmatch > 1)
 883         {
 884           err = set_regs (preg, &mctx, nmatch, pmatch,
 885                           dfa->has_plural_match && dfa->nbackref > 0);
 886           if (BE (err != REG_NOERROR, 0))
 887             goto free_return;
 888         }
 889
 890       /* At last, add the offset to the each registers, since we slided
 891          the buffers so that we could assume that the matching starts
 892          from 0.  */
 893       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 894         if (pmatch[reg_idx].rm_so != -1)
 895           {
 896 #ifdef RE_ENABLE_I18N
 897             if (BE (mctx.input.offsets_needed != 0, 0))
 898               {
 899                 pmatch[reg_idx].rm_so =
 900                   (pmatch[reg_idx].rm_so == mctx.input.valid_len
 901                    ? mctx.input.valid_raw_len
 902                    : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 903                 pmatch[reg_idx].rm_eo =
 904                   (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 905                    ? mctx.input.valid_raw_len
 906                    : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 907               }
 908 #else
 909             assert (mctx.input.offsets_needed == 0);
 910 #endif
 911             pmatch[reg_idx].rm_so += match_first;
 912             pmatch[reg_idx].rm_eo += match_first;
 913           }
 914       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 915         {
 916           pmatch[nmatch + reg_idx].rm_so = -1;
 917           pmatch[nmatch + reg_idx].rm_eo = -1;
 918         }
 919
 920       if (dfa->subexp_map)
 921         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 922           if (dfa->subexp_map[reg_idx] != reg_idx)
 923             {
 924               pmatch[reg_idx + 1].rm_so
 925                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 926               pmatch[reg_idx + 1].rm_eo
 927                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 928             }
 929     }
 930
 931  free_return:
 932   re_free (mctx.state_log);
 933   if (dfa->nbackref)
 934     match_ctx_free (&mctx);
 935   re_string_destruct (&mctx.input);
 936   return err;
 937 }
 938
 939 static reg_errcode_t
 940 __attribute_warn_unused_result__
 941 prune_impossible_nodes (mctx)
 942      re_match_context_t *mctx;
 943 {
 944   const re_dfa_t *const dfa = mctx->dfa;
 945   int halt_node, match_last;
 946   reg_errcode_t ret;
 947   re_dfastate_t **sifted_states;
 948   re_dfastate_t **lim_states = NULL;
 949   re_sift_context_t sctx;
 950 #ifdef DEBUG
 951   assert (mctx->state_log != NULL);
 952 #endif
 953   match_last = mctx->match_last;
 954   halt_node = mctx->last_node;
 955   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 956   if (BE (sifted_states == NULL, 0))
 957     {
 958       ret = REG_ESPACE;
 959       goto free_return;
 960     }
 961   if (dfa->nbackref)
 962     {
 963       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 964       if (BE (lim_states == NULL, 0))
 965         {
 966           ret = REG_ESPACE;
 967           goto free_return;
 968         }
 969       while (1)
 970         {
 971           memset (lim_states, '\0',
 972                   sizeof (re_dfastate_t *) * (match_last + 1));
 973           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 974                          match_last);
 975           ret = sift_states_backward (mctx, &sctx);
 976           re_node_set_free (&sctx.limits);
 977           if (BE (ret != REG_NOERROR, 0))
 978               goto free_return;
 979           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 980             break;
 981           do
 982             {
 983               --match_last;
 984               if (match_last < 0)
 985                 {
 986                   ret = REG_NOMATCH;
 987                   goto free_return;
 988                 }
 989             } while (mctx->state_log[match_last] == NULL
 990                      || !mctx->state_log[match_last]->halt);
 991           halt_node = check_halt_state_context (mctx,
 992                                                 mctx->state_log[match_last],
 993                                                 match_last);
 994         }
 995       ret = merge_state_array (dfa, sifted_states, lim_states,
 996                                match_last + 1);
 997       re_free (lim_states);
 998       lim_states = NULL;
 999       if (BE (ret != REG_NOERROR, 0))
1000         goto free_return;
1001     }
1002   else
1003     {
1004       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1005       ret = sift_states_backward (mctx, &sctx);
1006       re_node_set_free (&sctx.limits);
1007       if (BE (ret != REG_NOERROR, 0))
1008         goto free_return;
1009       if (sifted_states[0] == NULL)
1010         {
1011           ret = REG_NOMATCH;
1012           goto free_return;
1013         }
1014     }
1015   re_free (mctx->state_log);
1016   mctx->state_log = sifted_states;
1017   sifted_states = NULL;
1018   mctx->last_node = halt_node;
1019   mctx->match_last = match_last;
1020   ret = REG_NOERROR;
1021  free_return:
1022   re_free (sifted_states);
1023   re_free (lim_states);
1024   return ret;
1025 }
1026
1027 /* Acquire an initial state and return it.
1028    We must select appropriate initial state depending on the context,
1029    since initial states may have constraints like "\<", "^", etc..  */
1030
1031 static inline re_dfastate_t *
1032 __attribute ((always_inline)) internal_function
1033 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1034                             int idx)
1035 {
1036   const re_dfa_t *const dfa = mctx->dfa;
1037   if (dfa->init_state->has_constraint)
1038     {
1039       unsigned int context;
1040       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1041       if (IS_WORD_CONTEXT (context))
1042         return dfa->init_state_word;
1043       else if (IS_ORDINARY_CONTEXT (context))
1044         return dfa->init_state;
1045       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1046         return dfa->init_state_begbuf;
1047       else if (IS_NEWLINE_CONTEXT (context))
1048         return dfa->init_state_nl;
1049       else if (IS_BEGBUF_CONTEXT (context))
1050         {
1051           /* It is relatively rare case, then calculate on demand.  */
1052           return re_acquire_state_context (err, dfa,
1053                                            dfa->init_state->entrance_nodes,
1054                                            context);
1055         }
1056       else
1057         /* Must not happen?  */
1058         return dfa->init_state;
1059     }
1060   else
1061     return dfa->init_state;
1062 }
1063
1064 /* Check whether the regular expression match input string INPUT or not,
1065    and return the index where the matching end, return -1 if not match,
1066    or return -2 in case of an error.
1067    FL_LONGEST_MATCH means we want the POSIX longest matching.
1068    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1069    next place where we may want to try matching.
1070    Note that the matcher assume that the maching starts from the current
1071    index of the buffer.  */
1072
1073 static int
1074 internal_function __attribute_warn_unused_result__
1075 check_matching (re_match_context_t *mctx, int fl_longest_match,
1076                 int *p_match_first)
1077 {
1078   const re_dfa_t *const dfa = mctx->dfa;
1079   reg_errcode_t err;
1080   int match = 0;
1081   int match_last = -1;
1082   int cur_str_idx = re_string_cur_idx (&mctx->input);
1083   re_dfastate_t *cur_state;
1084   int at_init_state = p_match_first != NULL;
1085   int next_start_idx = cur_str_idx;
1086
1087   err = REG_NOERROR;
1088   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1089   /* An initial state must not be NULL (invalid).  */
1090   if (BE (cur_state == NULL, 0))
1091     {
1092       assert (err == REG_ESPACE);
1093       return -2;
1094     }
1095
1096   if (mctx->state_log != NULL)
1097     {
1098       mctx->state_log[cur_str_idx] = cur_state;
1099
1100       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1101          later.  E.g. Processing back references.  */
1102       if (BE (dfa->nbackref, 0))
1103         {
1104           at_init_state = 0;
1105           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1106           if (BE (err != REG_NOERROR, 0))
1107             return err;
1108
1109           if (cur_state->has_backref)
1110             {
1111               err = transit_state_bkref (mctx, &cur_state->nodes);
1112               if (BE (err != REG_NOERROR, 0))
1113                 return err;
1114             }
1115         }
1116     }
1117
1118   /* If the RE accepts NULL string.  */
1119   if (BE (cur_state->halt, 0))
1120     {
1121       if (!cur_state->has_constraint
1122           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1123         {
1124           if (!fl_longest_match)
1125             return cur_str_idx;
1126           else
1127             {
1128               match_last = cur_str_idx;
1129               match = 1;
1130             }
1131         }
1132     }
1133
1134   while (!re_string_eoi (&mctx->input))
1135     {
1136       re_dfastate_t *old_state = cur_state;
1137       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1138
1139       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1140           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1141               && mctx->input.valid_len < mctx->input.len))
1142         {
1143           err = extend_buffers (mctx);
1144           if (BE (err != REG_NOERROR, 0))
1145             {
1146               assert (err == REG_ESPACE);
1147               return -2;
1148             }
1149         }
1150
1151       cur_state = transit_state (&err, mctx, cur_state);
1152       if (mctx->state_log != NULL)
1153         cur_state = merge_state_with_log (&err, mctx, cur_state);
1154
1155       if (cur_state == NULL)
1156         {
1157           /* Reached the invalid state or an error.  Try to recover a valid
1158              state using the state log, if available and if we have not
1159              already found a valid (even if not the longest) match.  */
1160           if (BE (err != REG_NOERROR, 0))
1161             return -2;
1162
1163           if (mctx->state_log == NULL
1164               || (match && !fl_longest_match)
1165               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1166             break;
1167         }
1168
1169       if (BE (at_init_state, 0))
1170         {
1171           if (old_state == cur_state)
1172             next_start_idx = next_char_idx;
1173           else
1174             at_init_state = 0;
1175         }
1176
1177       if (cur_state->halt)
1178         {
1179           /* Reached a halt state.
1180              Check the halt state can satisfy the current context.  */
1181           if (!cur_state->has_constraint
1182               || check_halt_state_context (mctx, cur_state,
1183                                            re_string_cur_idx (&mctx->input)))
1184             {
1185               /* We found an appropriate halt state.  */
1186               match_last = re_string_cur_idx (&mctx->input);
1187               match = 1;
1188
1189               /* We found a match, do not modify match_first below.  */
1190               p_match_first = NULL;
1191               if (!fl_longest_match)
1192                 break;
1193             }
1194         }
1195     }
1196
1197   if (p_match_first)
1198     *p_match_first += next_start_idx;
1199
1200   return match_last;
1201 }
1202
1203 /* Check NODE match the current context.  */
1204
1205 static int
1206 internal_function
1207 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1208 {
1209   re_token_type_t type = dfa->nodes[node].type;
1210   unsigned int constraint = dfa->nodes[node].constraint;
1211   if (type != END_OF_RE)
1212     return 0;
1213   if (!constraint)
1214     return 1;
1215   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1216     return 0;
1217   return 1;
1218 }
1219
1220 /* Check the halt state STATE match the current context.
1221    Return 0 if not match, if the node, STATE has, is a halt node and
1222    match the context, return the node.  */
1223
1224 static int
1225 internal_function
1226 check_halt_state_context (const re_match_context_t *mctx,
1227                           const re_dfastate_t *state, int idx)
1228 {
1229   int i;
1230   unsigned int context;
1231 #ifdef DEBUG
1232   assert (state->halt);
1233 #endif
1234   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1235   for (i = 0; i < state->nodes.nelem; ++i)
1236     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1237       return state->nodes.elems[i];
1238   return 0;
1239 }
1240
1241 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1242    corresponding to the DFA).
1243    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1244    of errors.  */
1245
1246 static int
1247 internal_function
1248 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1249                    int *pidx, int node, re_node_set *eps_via_nodes,
1250                    struct re_fail_stack_t *fs)
1251 {
1252   const re_dfa_t *const dfa = mctx->dfa;
1253   int i, err;
1254   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1255     {
1256       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1257       re_node_set *edests = &dfa->edests[node];
1258       int dest_node;
1259       err = re_node_set_insert (eps_via_nodes, node);
1260       if (BE (err < 0, 0))
1261         return -2;
1262       /* Pick up a valid destination, or return -1 if none is found.  */
1263       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1264         {
1265           int candidate = edests->elems[i];
1266           if (!re_node_set_contains (cur_nodes, candidate))
1267             continue;
1268           if (dest_node == -1)
1269             dest_node = candidate;
1270
1271           else
1272             {
1273               /* In order to avoid infinite loop like "(a*)*", return the second
1274                  epsilon-transition if the first was already considered.  */
1275               if (re_node_set_contains (eps_via_nodes, dest_node))
1276                 return candidate;
1277
1278               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1279               else if (fs != NULL
1280                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1281                                            eps_via_nodes))
1282                 return -2;
1283
1284               /* We know we are going to exit.  */
1285               break;
1286             }
1287         }
1288       return dest_node;
1289     }
1290   else
1291     {
1292       int naccepted = 0;
1293       re_token_type_t type = dfa->nodes[node].type;
1294
1295 #ifdef RE_ENABLE_I18N
1296       if (dfa->nodes[node].accept_mb)
1297         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1298       else
1299 #endif /* RE_ENABLE_I18N */
1300       if (type == OP_BACK_REF)
1301         {
1302           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1303           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1304           if (fs != NULL)
1305             {
1306               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1307                 return -1;
1308               else if (naccepted)
1309                 {
1310                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1311                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1312                               naccepted) != 0)
1313                     return -1;
1314                 }
1315             }
1316
1317           if (naccepted == 0)
1318             {
1319               int dest_node;
1320               err = re_node_set_insert (eps_via_nodes, node);
1321               if (BE (err < 0, 0))
1322                 return -2;
1323               dest_node = dfa->edests[node].elems[0];
1324               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1325                                         dest_node))
1326                 return dest_node;
1327             }
1328         }
1329
1330       if (naccepted != 0
1331           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1332         {
1333           int dest_node = dfa->nexts[node];
1334           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1335           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1336                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1337                                                dest_node)))
1338             return -1;
1339           re_node_set_empty (eps_via_nodes);
1340           return dest_node;
1341         }
1342     }
1343   return -1;
1344 }
1345
1346 static reg_errcode_t
1347 internal_function __attribute_warn_unused_result__
1348 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1349                  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1350 {
1351   reg_errcode_t err;
1352   int num = fs->num++;
1353   if (fs->num == fs->alloc)
1354     {
1355       struct re_fail_stack_ent_t *new_array;
1356       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1357                                        * fs->alloc * 2));
1358       if (new_array == NULL)
1359         return REG_ESPACE;
1360       fs->alloc *= 2;
1361       fs->stack = new_array;
1362     }
1363   fs->stack[num].idx = str_idx;
1364   fs->stack[num].node = dest_node;
1365   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1366   if (fs->stack[num].regs == NULL)
1367     return REG_ESPACE;
1368   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1369   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1370   return err;
1371 }
1372
1373 static int
1374 internal_function
1375 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1376                 regmatch_t *regs, re_node_set *eps_via_nodes)
1377 {
1378   int num = --fs->num;
1379   assert (num >= 0);
1380   *pidx = fs->stack[num].idx;
1381   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1382   re_node_set_free (eps_via_nodes);
1383   re_free (fs->stack[num].regs);
1384   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1385   return fs->stack[num].node;
1386 }
1387
1388 /* Set the positions where the subexpressions are starts/ends to registers
1389    PMATCH.
1390    Note: We assume that pmatch[0] is already set, and
1391    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1392
1393 static reg_errcode_t
1394 internal_function __attribute_warn_unused_result__
1395 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1396           regmatch_t *pmatch, int fl_backtrack)
1397 {
1398   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1399   int idx, cur_node;
1400   re_node_set eps_via_nodes;
1401   struct re_fail_stack_t *fs;
1402   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1403   regmatch_t *prev_idx_match;
1404   int prev_idx_match_malloced = 0;
1405
1406 #ifdef DEBUG
1407   assert (nmatch > 1);
1408   assert (mctx->state_log != NULL);
1409 #endif
1410   if (fl_backtrack)
1411     {
1412       fs = &fs_body;
1413       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1414       if (fs->stack == NULL)
1415         return REG_ESPACE;
1416     }
1417   else
1418     fs = NULL;
1419
1420   cur_node = dfa->init_node;
1421   re_node_set_init_empty (&eps_via_nodes);
1422
1423   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1424     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1425   else
1426     {
1427       prev_idx_match = re_malloc (regmatch_t, nmatch);
1428       if (prev_idx_match == NULL)
1429         {
1430           free_fail_stack_return (fs);
1431           return REG_ESPACE;
1432         }
1433       prev_idx_match_malloced = 1;
1434     }
1435   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1436
1437   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1438     {
1439       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1440
1441       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1442         {
1443           int reg_idx;
1444           if (fs)
1445             {
1446               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1447                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1448                   break;
1449               if (reg_idx == nmatch)
1450                 {
1451                   re_node_set_free (&eps_via_nodes);
1452                   if (prev_idx_match_malloced)
1453                     re_free (prev_idx_match);
1454                   return free_fail_stack_return (fs);
1455                 }
1456               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1457                                          &eps_via_nodes);
1458             }
1459           else
1460             {
1461               re_node_set_free (&eps_via_nodes);
1462               if (prev_idx_match_malloced)
1463                 re_free (prev_idx_match);
1464               return REG_NOERROR;
1465             }
1466         }
1467
1468       /* Proceed to next node.  */
1469       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1470                                     &eps_via_nodes, fs);
1471
1472       if (BE (cur_node < 0, 0))
1473         {
1474           if (BE (cur_node == -2, 0))
1475             {
1476               re_node_set_free (&eps_via_nodes);
1477               if (prev_idx_match_malloced)
1478                 re_free (prev_idx_match);
1479               free_fail_stack_return (fs);
1480               return REG_ESPACE;
1481             }
1482           if (fs)
1483             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1484                                        &eps_via_nodes);
1485           else
1486             {
1487               re_node_set_free (&eps_via_nodes);
1488               if (prev_idx_match_malloced)
1489                 re_free (prev_idx_match);
1490               return REG_NOMATCH;
1491             }
1492         }
1493     }
1494   re_node_set_free (&eps_via_nodes);
1495   if (prev_idx_match_malloced)
1496     re_free (prev_idx_match);
1497   return free_fail_stack_return (fs);
1498 }
1499
1500 static reg_errcode_t
1501 internal_function
1502 free_fail_stack_return (struct re_fail_stack_t *fs)
1503 {
1504   if (fs)
1505     {
1506       int fs_idx;
1507       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1508         {
1509           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1510           re_free (fs->stack[fs_idx].regs);
1511         }
1512       re_free (fs->stack);
1513     }
1514   return REG_NOERROR;
1515 }
1516
1517 static void
1518 internal_function
1519 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1520              regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1521 {
1522   int type = dfa->nodes[cur_node].type;
1523   if (type == OP_OPEN_SUBEXP)
1524     {
1525       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1526
1527       /* We are at the first node of this sub expression.  */
1528       if (reg_num < nmatch)
1529         {
1530           pmatch[reg_num].rm_so = cur_idx;
1531           pmatch[reg_num].rm_eo = -1;
1532         }
1533     }
1534   else if (type == OP_CLOSE_SUBEXP)
1535     {
1536       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1537       if (reg_num < nmatch)
1538         {
1539           /* We are at the last node of this sub expression.  */
1540           if (pmatch[reg_num].rm_so < cur_idx)
1541             {
1542               pmatch[reg_num].rm_eo = cur_idx;
1543               /* This is a non-empty match or we are not inside an optional
1544                  subexpression.  Accept this right away.  */
1545               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1546             }
1547           else
1548             {
1549               if (dfa->nodes[cur_node].opt_subexp
1550                   && prev_idx_match[reg_num].rm_so != -1)
1551                 /* We transited through an empty match for an optional
1552                    subexpression, like (a?)*, and this is not the subexp's
1553                    first match.  Copy back the old content of the registers
1554                    so that matches of an inner subexpression are undone as
1555                    well, like in ((a?))*.  */
1556                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1557               else
1558                 /* We completed a subexpression, but it may be part of
1559                    an optional one, so do not update PREV_IDX_MATCH.  */
1560                 pmatch[reg_num].rm_eo = cur_idx;
1561             }
1562         }
1563     }
1564 }
1565
1566 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1567    and sift the nodes in each states according to the following rules.
1568    Updated state_log will be wrote to STATE_LOG.
1569
1570    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1571      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1572         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1573         the LAST_NODE, we throw away the node `a'.
1574      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1575         string `s' and transit to `b':
1576         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1577            away the node `a'.
1578         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1579             thrown away, we throw away the node `a'.
1580      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1581         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1582            node `a'.
1583         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1584             we throw away the node `a'.  */
1585
1586 #define STATE_NODE_CONTAINS(state,node) \
1587   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1588
1589 static reg_errcode_t
1590 internal_function
1591 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1592 {
1593   reg_errcode_t err;
1594   int null_cnt = 0;
1595   int str_idx = sctx->last_str_idx;
1596   re_node_set cur_dest;
1597
1598 #ifdef DEBUG
1599   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1600 #endif
1601
1602   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1603      transit to the last_node and the last_node itself.  */
1604   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1605   if (BE (err != REG_NOERROR, 0))
1606     return err;
1607   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1608   if (BE (err != REG_NOERROR, 0))
1609     goto free_return;
1610
1611   /* Then check each states in the state_log.  */
1612   while (str_idx > 0)
1613     {
1614       /* Update counters.  */
1615       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1616       if (null_cnt > mctx->max_mb_elem_len)
1617         {
1618           memset (sctx->sifted_states, '\0',
1619                   sizeof (re_dfastate_t *) * str_idx);
1620           re_node_set_free (&cur_dest);
1621           return REG_NOERROR;
1622         }
1623       re_node_set_empty (&cur_dest);
1624       --str_idx;
1625
1626       if (mctx->state_log[str_idx])
1627         {
1628           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1629           if (BE (err != REG_NOERROR, 0))
1630             goto free_return;
1631         }
1632
1633       /* Add all the nodes which satisfy the following conditions:
1634          - It can epsilon transit to a node in CUR_DEST.
1635          - It is in CUR_SRC.
1636          And update state_log.  */
1637       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1638       if (BE (err != REG_NOERROR, 0))
1639         goto free_return;
1640     }
1641   err = REG_NOERROR;
1642  free_return:
1643   re_node_set_free (&cur_dest);
1644   return err;
1645 }
1646
1647 static reg_errcode_t
1648 internal_function __attribute_warn_unused_result__
1649 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1650                      int str_idx, re_node_set *cur_dest)
1651 {
1652   const re_dfa_t *const dfa = mctx->dfa;
1653   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1654   int i;
1655
1656   /* Then build the next sifted state.
1657      We build the next sifted state on `cur_dest', and update
1658      `sifted_states[str_idx]' with `cur_dest'.
1659      Note:
1660      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1661      `cur_src' points the node_set of the old `state_log[str_idx]'
1662      (with the epsilon nodes pre-filtered out).  */
1663   for (i = 0; i < cur_src->nelem; i++)
1664     {
1665       int prev_node = cur_src->elems[i];
1666       int naccepted = 0;
1667       int ret;
1668
1669 #ifdef DEBUG
1670       re_token_type_t type = dfa->nodes[prev_node].type;
1671       assert (!IS_EPSILON_NODE (type));
1672 #endif
1673 #ifdef RE_ENABLE_I18N
1674       /* If the node may accept `multi byte'.  */
1675       if (dfa->nodes[prev_node].accept_mb)
1676         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1677                                          str_idx, sctx->last_str_idx);
1678 #endif /* RE_ENABLE_I18N */
1679
1680       /* We don't check backreferences here.
1681          See update_cur_sifted_state().  */
1682       if (!naccepted
1683           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1684           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1685                                   dfa->nexts[prev_node]))
1686         naccepted = 1;
1687
1688       if (naccepted == 0)
1689         continue;
1690
1691       if (sctx->limits.nelem)
1692         {
1693           int to_idx = str_idx + naccepted;
1694           if (check_dst_limits (mctx, &sctx->limits,
1695                                 dfa->nexts[prev_node], to_idx,
1696                                 prev_node, str_idx))
1697             continue;
1698         }
1699       ret = re_node_set_insert (cur_dest, prev_node);
1700       if (BE (ret == -1, 0))
1701         return REG_ESPACE;
1702     }
1703
1704   return REG_NOERROR;
1705 }
1706
1707 /* Helper functions.  */
1708
1709 static reg_errcode_t
1710 internal_function
1711 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1712 {
1713   int top = mctx->state_log_top;
1714
1715   if (next_state_log_idx >= mctx->input.bufs_len
1716       || (next_state_log_idx >= mctx->input.valid_len
1717           && mctx->input.valid_len < mctx->input.len))
1718     {
1719       reg_errcode_t err;
1720       err = extend_buffers (mctx);
1721       if (BE (err != REG_NOERROR, 0))
1722         return err;
1723     }
1724
1725   if (top < next_state_log_idx)
1726     {
1727       memset (mctx->state_log + top + 1, '\0',
1728               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1729       mctx->state_log_top = next_state_log_idx;
1730     }
1731   return REG_NOERROR;
1732 }
1733
1734 static reg_errcode_t
1735 internal_function
1736 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1737                    re_dfastate_t **src, int num)
1738 {
1739   int st_idx;
1740   reg_errcode_t err;
1741   for (st_idx = 0; st_idx < num; ++st_idx)
1742     {
1743       if (dst[st_idx] == NULL)
1744         dst[st_idx] = src[st_idx];
1745       else if (src[st_idx] != NULL)
1746         {
1747           re_node_set merged_set;
1748           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1749                                         &src[st_idx]->nodes);
1750           if (BE (err != REG_NOERROR, 0))
1751             return err;
1752           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1753           re_node_set_free (&merged_set);
1754           if (BE (err != REG_NOERROR, 0))
1755             return err;
1756         }
1757     }
1758   return REG_NOERROR;
1759 }
1760
1761 static reg_errcode_t
1762 internal_function
1763 update_cur_sifted_state (const re_match_context_t *mctx,
1764                          re_sift_context_t *sctx, int str_idx,
1765                          re_node_set *dest_nodes)
1766 {
1767   const re_dfa_t *const dfa = mctx->dfa;
1768   reg_errcode_t err = REG_NOERROR;
1769   const re_node_set *candidates;
1770   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1771                 : &mctx->state_log[str_idx]->nodes);
1772
1773   if (dest_nodes->nelem == 0)
1774     sctx->sifted_states[str_idx] = NULL;
1775   else
1776     {
1777       if (candidates)
1778         {
1779           /* At first, add the nodes which can epsilon transit to a node in
1780              DEST_NODE.  */
1781           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1782           if (BE (err != REG_NOERROR, 0))
1783             return err;
1784
1785           /* Then, check the limitations in the current sift_context.  */
1786           if (sctx->limits.nelem)
1787             {
1788               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1789                                          mctx->bkref_ents, str_idx);
1790               if (BE (err != REG_NOERROR, 0))
1791                 return err;
1792             }
1793         }
1794
1795       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1796       if (BE (err != REG_NOERROR, 0))
1797         return err;
1798     }
1799
1800   if (candidates && mctx->state_log[str_idx]->has_backref)
1801     {
1802       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1803       if (BE (err != REG_NOERROR, 0))
1804         return err;
1805     }
1806   return REG_NOERROR;
1807 }
1808
1809 static reg_errcode_t
1810 internal_function __attribute_warn_unused_result__
1811 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1812                        const re_node_set *candidates)
1813 {
1814   reg_errcode_t err = REG_NOERROR;
1815   int i;
1816
1817   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1818   if (BE (err != REG_NOERROR, 0))
1819     return err;
1820
1821   if (!state->inveclosure.alloc)
1822     {
1823       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1824       if (BE (err != REG_NOERROR, 0))
1825         return REG_ESPACE;
1826       for (i = 0; i < dest_nodes->nelem; i++)
1827         {
1828           err = re_node_set_merge (&state->inveclosure,
1829                                    dfa->inveclosures + dest_nodes->elems[i]);
1830           if (BE (err != REG_NOERROR, 0))
1831             return REG_ESPACE;
1832         }
1833     }
1834   return re_node_set_add_intersect (dest_nodes, candidates,
1835                                     &state->inveclosure);
1836 }
1837
1838 static reg_errcode_t
1839 internal_function
1840 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1841                        const re_node_set *candidates)
1842 {
1843     int ecl_idx;
1844     reg_errcode_t err;
1845     re_node_set *inv_eclosure = dfa->inveclosures + node;
1846     re_node_set except_nodes;
1847     re_node_set_init_empty (&except_nodes);
1848     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1849       {
1850         int cur_node = inv_eclosure->elems[ecl_idx];
1851         if (cur_node == node)
1852           continue;
1853         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1854           {
1855             int edst1 = dfa->edests[cur_node].elems[0];
1856             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1857                          ? dfa->edests[cur_node].elems[1] : -1);
1858             if ((!re_node_set_contains (inv_eclosure, edst1)
1859                  && re_node_set_contains (dest_nodes, edst1))
1860                 || (edst2 > 0
1861                     && !re_node_set_contains (inv_eclosure, edst2)
1862                     && re_node_set_contains (dest_nodes, edst2)))
1863               {
1864                 err = re_node_set_add_intersect (&except_nodes, candidates,
1865                                                  dfa->inveclosures + cur_node);
1866                 if (BE (err != REG_NOERROR, 0))
1867                   {
1868                     re_node_set_free (&except_nodes);
1869                     return err;
1870                   }
1871               }
1872           }
1873       }
1874     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1875       {
1876         int cur_node = inv_eclosure->elems[ecl_idx];
1877         if (!re_node_set_contains (&except_nodes, cur_node))
1878           {
1879             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1880             re_node_set_remove_at (dest_nodes, idx);
1881           }
1882       }
1883     re_node_set_free (&except_nodes);
1884     return REG_NOERROR;
1885 }
1886
1887 static int
1888 internal_function
1889 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1890                   int dst_node, int dst_idx, int src_node, int src_idx)
1891 {
1892   const re_dfa_t *const dfa = mctx->dfa;
1893   int lim_idx, src_pos, dst_pos;
1894
1895   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1896   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1897   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1898     {
1899       int subexp_idx;
1900       struct re_backref_cache_entry *ent;
1901       ent = mctx->bkref_ents + limits->elems[lim_idx];
1902       subexp_idx = dfa->nodes[ent->node].opr.idx;
1903
1904       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1905                                            subexp_idx, dst_node, dst_idx,
1906                                            dst_bkref_idx);
1907       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1908                                            subexp_idx, src_node, src_idx,
1909                                            src_bkref_idx);
1910
1911       /* In case of:
1912          <src> <dst> ( <subexp> )
1913          ( <subexp> ) <src> <dst>
1914          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1915       if (src_pos == dst_pos)
1916         continue; /* This is unrelated limitation.  */
1917       else
1918         return 1;
1919     }
1920   return 0;
1921 }
1922
1923 static int
1924 internal_function
1925 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1926                              int subexp_idx, int from_node, int bkref_idx)
1927 {
1928   const re_dfa_t *const dfa = mctx->dfa;
1929   const re_node_set *eclosures = dfa->eclosures + from_node;
1930   int node_idx;
1931
1932   /* Else, we are on the boundary: examine the nodes on the epsilon
1933      closure.  */
1934   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1935     {
1936       int node = eclosures->elems[node_idx];
1937       switch (dfa->nodes[node].type)
1938         {
1939         case OP_BACK_REF:
1940           if (bkref_idx != -1)
1941             {
1942               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1943               do
1944                 {
1945                   int dst, cpos;
1946
1947                   if (ent->node != node)
1948                     continue;
1949
1950                   if (subexp_idx < BITSET_WORD_BITS
1951                       && !(ent->eps_reachable_subexps_map
1952                            & ((bitset_word_t) 1 << subexp_idx)))
1953                     continue;
1954
1955                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1956                      OP_CLOSE_SUBEXP cases below.  But, if the
1957                      destination node is the same node as the source
1958                      node, don't recurse because it would cause an
1959                      infinite loop: a regex that exhibits this behavior
1960                      is ()\1*\1*  */
1961                   dst = dfa->edests[node].elems[0];
1962                   if (dst == from_node)
1963                     {
1964                       if (boundaries & 1)
1965                         return -1;
1966                       else /* if (boundaries & 2) */
1967                         return 0;
1968                     }
1969
1970                   cpos =
1971                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1972                                                  dst, bkref_idx);
1973                   if (cpos == -1 /* && (boundaries & 1) */)
1974                     return -1;
1975                   if (cpos == 0 && (boundaries & 2))
1976                     return 0;
1977
1978                   if (subexp_idx < BITSET_WORD_BITS)
1979                     ent->eps_reachable_subexps_map
1980                       &= ~((bitset_word_t) 1 << subexp_idx);
1981                 }
1982               while (ent++->more);
1983             }
1984           break;
1985
1986         case OP_OPEN_SUBEXP:
1987           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1988             return -1;
1989           break;
1990
1991         case OP_CLOSE_SUBEXP:
1992           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1993             return 0;
1994           break;
1995
1996         default:
1997             break;
1998         }
1999     }
2000
2001   return (boundaries & 2) ? 1 : 0;
2002 }
2003
2004 static int
2005 internal_function
2006 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2007                            int subexp_idx, int from_node, int str_idx,
2008                            int bkref_idx)
2009 {
2010   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2011   int boundaries;
2012
2013   /* If we are outside the range of the subexpression, return -1 or 1.  */
2014   if (str_idx < lim->subexp_from)
2015     return -1;
2016
2017   if (lim->subexp_to < str_idx)
2018     return 1;
2019
2020   /* If we are within the subexpression, return 0.  */
2021   boundaries = (str_idx == lim->subexp_from);
2022   boundaries |= (str_idx == lim->subexp_to) << 1;
2023   if (boundaries == 0)
2024     return 0;
2025
2026   /* Else, examine epsilon closure.  */
2027   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2028                                       from_node, bkref_idx);
2029 }
2030
2031 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2032    which are against limitations from DEST_NODES. */
2033
2034 static reg_errcode_t
2035 internal_function
2036 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2037                      const re_node_set *candidates, re_node_set *limits,
2038                      struct re_backref_cache_entry *bkref_ents, int str_idx)
2039 {
2040   reg_errcode_t err;
2041   int node_idx, lim_idx;
2042
2043   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2044     {
2045       int subexp_idx;
2046       struct re_backref_cache_entry *ent;
2047       ent = bkref_ents + limits->elems[lim_idx];
2048
2049       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2050         continue; /* This is unrelated limitation.  */
2051
2052       subexp_idx = dfa->nodes[ent->node].opr.idx;
2053       if (ent->subexp_to == str_idx)
2054         {
2055           int ops_node = -1;
2056           int cls_node = -1;
2057           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2058             {
2059               int node = dest_nodes->elems[node_idx];
2060               re_token_type_t type = dfa->nodes[node].type;
2061               if (type == OP_OPEN_SUBEXP
2062                   && subexp_idx == dfa->nodes[node].opr.idx)
2063                 ops_node = node;
2064               else if (type == OP_CLOSE_SUBEXP
2065                        && subexp_idx == dfa->nodes[node].opr.idx)
2066                 cls_node = node;
2067             }
2068
2069           /* Check the limitation of the open subexpression.  */
2070           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2071           if (ops_node >= 0)
2072             {
2073               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2074                                            candidates);
2075               if (BE (err != REG_NOERROR, 0))
2076                 return err;
2077             }
2078
2079           /* Check the limitation of the close subexpression.  */
2080           if (cls_node >= 0)
2081             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2082               {
2083                 int node = dest_nodes->elems[node_idx];
2084                 if (!re_node_set_contains (dfa->inveclosures + node,
2085                                            cls_node)
2086                     && !re_node_set_contains (dfa->eclosures + node,
2087                                               cls_node))
2088                   {
2089                     /* It is against this limitation.
2090                        Remove it form the current sifted state.  */
2091                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2092                                                  candidates);
2093                     if (BE (err != REG_NOERROR, 0))
2094                       return err;
2095                     --node_idx;
2096                   }
2097               }
2098         }
2099       else /* (ent->subexp_to != str_idx)  */
2100         {
2101           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2102             {
2103               int node = dest_nodes->elems[node_idx];
2104               re_token_type_t type = dfa->nodes[node].type;
2105               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2106                 {
2107                   if (subexp_idx != dfa->nodes[node].opr.idx)
2108                     continue;
2109                   /* It is against this limitation.
2110                      Remove it form the current sifted state.  */
2111                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2112                                                candidates);
2113                   if (BE (err != REG_NOERROR, 0))
2114                     return err;
2115                 }
2116             }
2117         }
2118     }
2119   return REG_NOERROR;
2120 }
2121
2122 static reg_errcode_t
2123 internal_function __attribute_warn_unused_result__
2124 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2125                    int str_idx, const re_node_set *candidates)
2126 {
2127   const re_dfa_t *const dfa = mctx->dfa;
2128   reg_errcode_t err;
2129   int node_idx, node;
2130   re_sift_context_t local_sctx;
2131   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2132
2133   if (first_idx == -1)
2134     return REG_NOERROR;
2135
2136   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2137
2138   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2139     {
2140       int enabled_idx;
2141       re_token_type_t type;
2142       struct re_backref_cache_entry *entry;
2143       node = candidates->elems[node_idx];
2144       type = dfa->nodes[node].type;
2145       /* Avoid infinite loop for the REs like "()\1+".  */
2146       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2147         continue;
2148       if (type != OP_BACK_REF)
2149         continue;
2150
2151       entry = mctx->bkref_ents + first_idx;
2152       enabled_idx = first_idx;
2153       do
2154         {
2155           int subexp_len;
2156           int to_idx;
2157           int dst_node;
2158           int ret;
2159           re_dfastate_t *cur_state;
2160
2161           if (entry->node != node)
2162             continue;
2163           subexp_len = entry->subexp_to - entry->subexp_from;
2164           to_idx = str_idx + subexp_len;
2165           dst_node = (subexp_len ? dfa->nexts[node]
2166                       : dfa->edests[node].elems[0]);
2167
2168           if (to_idx > sctx->last_str_idx
2169               || sctx->sifted_states[to_idx] == NULL
2170               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2171               || check_dst_limits (mctx, &sctx->limits, node,
2172                                    str_idx, dst_node, to_idx))
2173             continue;
2174
2175           if (local_sctx.sifted_states == NULL)
2176             {
2177               local_sctx = *sctx;
2178               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2179               if (BE (err != REG_NOERROR, 0))
2180                 goto free_return;
2181             }
2182           local_sctx.last_node = node;
2183           local_sctx.last_str_idx = str_idx;
2184           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2185           if (BE (ret < 0, 0))
2186             {
2187               err = REG_ESPACE;
2188               goto free_return;
2189             }
2190           cur_state = local_sctx.sifted_states[str_idx];
2191           err = sift_states_backward (mctx, &local_sctx);
2192           if (BE (err != REG_NOERROR, 0))
2193             goto free_return;
2194           if (sctx->limited_states != NULL)
2195             {
2196               err = merge_state_array (dfa, sctx->limited_states,
2197                                        local_sctx.sifted_states,
2198                                        str_idx + 1);
2199               if (BE (err != REG_NOERROR, 0))
2200                 goto free_return;
2201             }
2202           local_sctx.sifted_states[str_idx] = cur_state;
2203           re_node_set_remove (&local_sctx.limits, enabled_idx);
2204
2205           /* mctx->bkref_ents may have changed, reload the pointer.  */
2206           entry = mctx->bkref_ents + enabled_idx;
2207         }
2208       while (enabled_idx++, entry++->more);
2209     }
2210   err = REG_NOERROR;
2211  free_return:
2212   if (local_sctx.sifted_states != NULL)
2213     {
2214       re_node_set_free (&local_sctx.limits);
2215     }
2216
2217   return err;
2218 }
2219
2220
2221 #ifdef RE_ENABLE_I18N
2222 static int
2223 internal_function
2224 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2225                      int node_idx, int str_idx, int max_str_idx)
2226 {
2227   const re_dfa_t *const dfa = mctx->dfa;
2228   int naccepted;
2229   /* Check the node can accept `multi byte'.  */
2230   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2231   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2232       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2233                             dfa->nexts[node_idx]))
2234     /* The node can't accept the `multi byte', or the
2235        destination was already thrown away, then the node
2236        could't accept the current input `multi byte'.   */
2237     naccepted = 0;
2238   /* Otherwise, it is sure that the node could accept
2239      `naccepted' bytes input.  */
2240   return naccepted;
2241 }
2242 #endif /* RE_ENABLE_I18N */
2243
2244 \f
2245 /* Functions for state transition.  */
2246
2247 /* Return the next state to which the current state STATE will transit by
2248    accepting the current input byte, and update STATE_LOG if necessary.
2249    If STATE can accept a multibyte char/collating element/back reference
2250    update the destination of STATE_LOG.  */
2251
2252 static re_dfastate_t *
2253 internal_function __attribute_warn_unused_result__
2254 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2255                re_dfastate_t *state)
2256 {
2257   re_dfastate_t **trtable;
2258   unsigned char ch;
2259
2260 #ifdef RE_ENABLE_I18N
2261   /* If the current state can accept multibyte.  */
2262   if (BE (state->accept_mb, 0))
2263     {
2264       *err = transit_state_mb (mctx, state);
2265       if (BE (*err != REG_NOERROR, 0))
2266         return NULL;
2267     }
2268 #endif /* RE_ENABLE_I18N */
2269
2270   /* Then decide the next state with the single byte.  */
2271 #if 0
2272   if (0)
2273     /* don't use transition table  */
2274     return transit_state_sb (err, mctx, state);
2275 #endif
2276
2277   /* Use transition table  */
2278   ch = re_string_fetch_byte (&mctx->input);
2279   for (;;)
2280     {
2281       trtable = state->trtable;
2282       if (BE (trtable != NULL, 1))
2283         return trtable[ch];
2284
2285       trtable = state->word_trtable;
2286       if (BE (trtable != NULL, 1))
2287         {
2288           unsigned int context;
2289           context
2290             = re_string_context_at (&mctx->input,
2291                                     re_string_cur_idx (&mctx->input) - 1,
2292                                     mctx->eflags);
2293           if (IS_WORD_CONTEXT (context))
2294             return trtable[ch + SBC_MAX];
2295           else
2296             return trtable[ch];
2297         }
2298
2299       if (!build_trtable (mctx->dfa, state))
2300         {
2301           *err = REG_ESPACE;
2302           return NULL;
2303         }
2304
2305       /* Retry, we now have a transition table.  */
2306     }
2307 }
2308
2309 /* Update the state_log if we need */
2310 re_dfastate_t *
2311 internal_function
2312 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2313                       re_dfastate_t *next_state)
2314 {
2315   const re_dfa_t *const dfa = mctx->dfa;
2316   int cur_idx = re_string_cur_idx (&mctx->input);
2317
2318   if (cur_idx > mctx->state_log_top)
2319     {
2320       mctx->state_log[cur_idx] = next_state;
2321       mctx->state_log_top = cur_idx;
2322     }
2323   else if (mctx->state_log[cur_idx] == 0)
2324     {
2325       mctx->state_log[cur_idx] = next_state;
2326     }
2327   else
2328     {
2329       re_dfastate_t *pstate;
2330       unsigned int context;
2331       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2332       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2333          the destination of a multibyte char/collating element/
2334          back reference.  Then the next state is the union set of
2335          these destinations and the results of the transition table.  */
2336       pstate = mctx->state_log[cur_idx];
2337       log_nodes = pstate->entrance_nodes;
2338       if (next_state != NULL)
2339         {
2340           table_nodes = next_state->entrance_nodes;
2341           *err = re_node_set_init_union (&next_nodes, table_nodes,
2342                                              log_nodes);
2343           if (BE (*err != REG_NOERROR, 0))
2344             return NULL;
2345         }
2346       else
2347         next_nodes = *log_nodes;
2348       /* Note: We already add the nodes of the initial state,
2349          then we don't need to add them here.  */
2350
2351       context = re_string_context_at (&mctx->input,
2352                                       re_string_cur_idx (&mctx->input) - 1,
2353                                       mctx->eflags);
2354       next_state = mctx->state_log[cur_idx]
2355         = re_acquire_state_context (err, dfa, &next_nodes, context);
2356       /* We don't need to check errors here, since the return value of
2357          this function is next_state and ERR is already set.  */
2358
2359       if (table_nodes != NULL)
2360         re_node_set_free (&next_nodes);
2361     }
2362
2363   if (BE (dfa->nbackref, 0) && next_state != NULL)
2364     {
2365       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2366          later.  We must check them here, since the back references in the
2367          next state might use them.  */
2368       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2369                                         cur_idx);
2370       if (BE (*err != REG_NOERROR, 0))
2371         return NULL;
2372
2373       /* If the next state has back references.  */
2374       if (next_state->has_backref)
2375         {
2376           *err = transit_state_bkref (mctx, &next_state->nodes);
2377           if (BE (*err != REG_NOERROR, 0))
2378             return NULL;
2379           next_state = mctx->state_log[cur_idx];
2380         }
2381     }
2382
2383   return next_state;
2384 }
2385
2386 /* Skip bytes in the input that correspond to part of a
2387    multi-byte match, then look in the log for a state
2388    from which to restart matching.  */
2389 re_dfastate_t *
2390 internal_function
2391 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2392 {
2393   re_dfastate_t *cur_state;
2394   do
2395     {
2396       int max = mctx->state_log_top;
2397       int cur_str_idx = re_string_cur_idx (&mctx->input);
2398
2399       do
2400         {
2401           if (++cur_str_idx > max)
2402             return NULL;
2403           re_string_skip_bytes (&mctx->input, 1);
2404         }
2405       while (mctx->state_log[cur_str_idx] == NULL);
2406
2407       cur_state = merge_state_with_log (err, mctx, NULL);
2408     }
2409   while (*err == REG_NOERROR && cur_state == NULL);
2410   return cur_state;
2411 }
2412
2413 /* Helper functions for transit_state.  */
2414
2415 /* From the node set CUR_NODES, pick up the nodes whose types are
2416    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2417    expression. And register them to use them later for evaluating the
2418    correspoding back references.  */
2419
2420 static reg_errcode_t
2421 internal_function
2422 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2423                            int str_idx)
2424 {
2425   const re_dfa_t *const dfa = mctx->dfa;
2426   int node_idx;
2427   reg_errcode_t err;
2428
2429   /* TODO: This isn't efficient.
2430            Because there might be more than one nodes whose types are
2431            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2432            nodes.
2433            E.g. RE: (a){2}  */
2434   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2435     {
2436       int node = cur_nodes->elems[node_idx];
2437       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2438           && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2439           && (dfa->used_bkref_map
2440               & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2441         {
2442           err = match_ctx_add_subtop (mctx, node, str_idx);
2443           if (BE (err != REG_NOERROR, 0))
2444             return err;
2445         }
2446     }
2447   return REG_NOERROR;
2448 }
2449
2450 #if 0
2451 /* Return the next state to which the current state STATE will transit by
2452    accepting the current input byte.  */
2453
2454 static re_dfastate_t *
2455 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2456                   re_dfastate_t *state)
2457 {
2458   const re_dfa_t *const dfa = mctx->dfa;
2459   re_node_set next_nodes;
2460   re_dfastate_t *next_state;
2461   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2462   unsigned int context;
2463
2464   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2465   if (BE (*err != REG_NOERROR, 0))
2466     return NULL;
2467   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2468     {
2469       int cur_node = state->nodes.elems[node_cnt];
2470       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2471         {
2472           *err = re_node_set_merge (&next_nodes,
2473                                     dfa->eclosures + dfa->nexts[cur_node]);
2474           if (BE (*err != REG_NOERROR, 0))
2475             {
2476               re_node_set_free (&next_nodes);
2477               return NULL;
2478             }
2479         }
2480     }
2481   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2482   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2483   /* We don't need to check errors here, since the return value of
2484      this function is next_state and ERR is already set.  */
2485
2486   re_node_set_free (&next_nodes);
2487   re_string_skip_bytes (&mctx->input, 1);
2488   return next_state;
2489 }
2490 #endif
2491
2492 #ifdef RE_ENABLE_I18N
2493 static reg_errcode_t
2494 internal_function
2495 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2496 {
2497   const re_dfa_t *const dfa = mctx->dfa;
2498   reg_errcode_t err;
2499   int i;
2500
2501   for (i = 0; i < pstate->nodes.nelem; ++i)
2502     {
2503       re_node_set dest_nodes, *new_nodes;
2504       int cur_node_idx = pstate->nodes.elems[i];
2505       int naccepted, dest_idx;
2506       unsigned int context;
2507       re_dfastate_t *dest_state;
2508
2509       if (!dfa->nodes[cur_node_idx].accept_mb)
2510         continue;
2511
2512       if (dfa->nodes[cur_node_idx].constraint)
2513         {
2514           context = re_string_context_at (&mctx->input,
2515                                           re_string_cur_idx (&mctx->input),
2516                                           mctx->eflags);
2517           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2518                                            context))
2519             continue;
2520         }
2521
2522       /* How many bytes the node can accept?  */
2523       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2524                                            re_string_cur_idx (&mctx->input));
2525       if (naccepted == 0)
2526         continue;
2527
2528       /* The node can accepts `naccepted' bytes.  */
2529       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2530       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2531                                : mctx->max_mb_elem_len);
2532       err = clean_state_log_if_needed (mctx, dest_idx);
2533       if (BE (err != REG_NOERROR, 0))
2534         return err;
2535 #ifdef DEBUG
2536       assert (dfa->nexts[cur_node_idx] != -1);
2537 #endif
2538       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2539
2540       dest_state = mctx->state_log[dest_idx];
2541       if (dest_state == NULL)
2542         dest_nodes = *new_nodes;
2543       else
2544         {
2545           err = re_node_set_init_union (&dest_nodes,
2546                                         dest_state->entrance_nodes, new_nodes);
2547           if (BE (err != REG_NOERROR, 0))
2548             return err;
2549         }
2550       context = re_string_context_at (&mctx->input, dest_idx - 1,
2551                                       mctx->eflags);
2552       mctx->state_log[dest_idx]
2553         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2554       if (dest_state != NULL)
2555         re_node_set_free (&dest_nodes);
2556       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2557         return err;
2558     }
2559   return REG_NOERROR;
2560 }
2561 #endif /* RE_ENABLE_I18N */
2562
2563 static reg_errcode_t
2564 internal_function
2565 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2566 {
2567   const re_dfa_t *const dfa = mctx->dfa;
2568   reg_errcode_t err;
2569   int i;
2570   int cur_str_idx = re_string_cur_idx (&mctx->input);
2571
2572   for (i = 0; i < nodes->nelem; ++i)
2573     {
2574       int dest_str_idx, prev_nelem, bkc_idx;
2575       int node_idx = nodes->elems[i];
2576       unsigned int context;
2577       const re_token_t *node = dfa->nodes + node_idx;
2578       re_node_set *new_dest_nodes;
2579
2580       /* Check whether `node' is a backreference or not.  */
2581       if (node->type != OP_BACK_REF)
2582         continue;
2583
2584       if (node->constraint)
2585         {
2586           context = re_string_context_at (&mctx->input, cur_str_idx,
2587                                           mctx->eflags);
2588           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2589             continue;
2590         }
2591
2592       /* `node' is a backreference.
2593          Check the substring which the substring matched.  */
2594       bkc_idx = mctx->nbkref_ents;
2595       err = get_subexp (mctx, node_idx, cur_str_idx);
2596       if (BE (err != REG_NOERROR, 0))
2597         goto free_return;
2598
2599       /* And add the epsilon closures (which is `new_dest_nodes') of
2600          the backreference to appropriate state_log.  */
2601 #ifdef DEBUG
2602       assert (dfa->nexts[node_idx] != -1);
2603 #endif
2604       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2605         {
2606           int subexp_len;
2607           re_dfastate_t *dest_state;
2608           struct re_backref_cache_entry *bkref_ent;
2609           bkref_ent = mctx->bkref_ents + bkc_idx;
2610           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2611             continue;
2612           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2613           new_dest_nodes = (subexp_len == 0
2614                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2615                             : dfa->eclosures + dfa->nexts[node_idx]);
2616           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2617                           - bkref_ent->subexp_from);
2618           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2619                                           mctx->eflags);
2620           dest_state = mctx->state_log[dest_str_idx];
2621           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2622                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2623           /* Add `new_dest_node' to state_log.  */
2624           if (dest_state == NULL)
2625             {
2626               mctx->state_log[dest_str_idx]
2627                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2628                                             context);
2629               if (BE (mctx->state_log[dest_str_idx] == NULL
2630                       && err != REG_NOERROR, 0))
2631                 goto free_return;
2632             }
2633           else
2634             {
2635               re_node_set dest_nodes;
2636               err = re_node_set_init_union (&dest_nodes,
2637                                             dest_state->entrance_nodes,
2638                                             new_dest_nodes);
2639               if (BE (err != REG_NOERROR, 0))
2640                 {
2641                   re_node_set_free (&dest_nodes);
2642                   goto free_return;
2643                 }
2644               mctx->state_log[dest_str_idx]
2645                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2646               re_node_set_free (&dest_nodes);
2647               if (BE (mctx->state_log[dest_str_idx] == NULL
2648                       && err != REG_NOERROR, 0))
2649                 goto free_return;
2650             }
2651           /* We need to check recursively if the backreference can epsilon
2652              transit.  */
2653           if (subexp_len == 0
2654               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2655             {
2656               err = check_subexp_matching_top (mctx, new_dest_nodes,
2657                                                cur_str_idx);
2658               if (BE (err != REG_NOERROR, 0))
2659                 goto free_return;
2660               err = transit_state_bkref (mctx, new_dest_nodes);
2661               if (BE (err != REG_NOERROR, 0))
2662                 goto free_return;
2663             }
2664         }
2665     }
2666   err = REG_NOERROR;
2667  free_return:
2668   return err;
2669 }
2670
2671 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2672    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2673    Note that we might collect inappropriate candidates here.
2674    However, the cost of checking them strictly here is too high, then we
2675    delay these checking for prune_impossible_nodes().  */
2676
2677 static reg_errcode_t
2678 internal_function __attribute_warn_unused_result__
2679 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2680 {
2681   const re_dfa_t *const dfa = mctx->dfa;
2682   int subexp_num, sub_top_idx;
2683   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2684   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2685   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2686   if (cache_idx != -1)
2687     {
2688       const struct re_backref_cache_entry *entry
2689         = mctx->bkref_ents + cache_idx;
2690       do
2691         if (entry->node == bkref_node)
2692           return REG_NOERROR; /* We already checked it.  */
2693       while (entry++->more);
2694     }
2695
2696   subexp_num = dfa->nodes[bkref_node].opr.idx;
2697
2698   /* For each sub expression  */
2699   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2700     {
2701       reg_errcode_t err;
2702       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2703       re_sub_match_last_t *sub_last;
2704       int sub_last_idx, sl_str, bkref_str_off;
2705
2706       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2707         continue; /* It isn't related.  */
2708
2709       sl_str = sub_top->str_idx;
2710       bkref_str_off = bkref_str_idx;
2711       /* At first, check the last node of sub expressions we already
2712          evaluated.  */
2713       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2714         {
2715           int sl_str_diff;
2716           sub_last = sub_top->lasts[sub_last_idx];
2717           sl_str_diff = sub_last->str_idx - sl_str;
2718           /* The matched string by the sub expression match with the substring
2719              at the back reference?  */
2720           if (sl_str_diff > 0)
2721             {
2722               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2723                 {
2724                   /* Not enough chars for a successful match.  */
2725                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2726                     break;
2727
2728                   err = clean_state_log_if_needed (mctx,
2729                                                    bkref_str_off
2730                                                    + sl_str_diff);
2731                   if (BE (err != REG_NOERROR, 0))
2732                     return err;
2733                   buf = (const char *) re_string_get_buffer (&mctx->input);
2734                 }
2735               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2736                 /* We don't need to search this sub expression any more.  */
2737                 break;
2738             }
2739           bkref_str_off += sl_str_diff;
2740           sl_str += sl_str_diff;
2741           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2742                                 bkref_str_idx);
2743
2744           /* Reload buf, since the preceding call might have reallocated
2745              the buffer.  */
2746           buf = (const char *) re_string_get_buffer (&mctx->input);
2747
2748           if (err == REG_NOMATCH)
2749             continue;
2750           if (BE (err != REG_NOERROR, 0))
2751             return err;
2752         }
2753
2754       if (sub_last_idx < sub_top->nlasts)
2755         continue;
2756       if (sub_last_idx > 0)
2757         ++sl_str;
2758       /* Then, search for the other last nodes of the sub expression.  */
2759       for (; sl_str <= bkref_str_idx; ++sl_str)
2760         {
2761           int cls_node, sl_str_off;
2762           const re_node_set *nodes;
2763           sl_str_off = sl_str - sub_top->str_idx;
2764           /* The matched string by the sub expression match with the substring
2765              at the back reference?  */
2766           if (sl_str_off > 0)
2767             {
2768               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2769                 {
2770                   /* If we are at the end of the input, we cannot match.  */
2771                   if (bkref_str_off >= mctx->input.len)
2772                     break;
2773
2774                   err = extend_buffers (mctx);
2775                   if (BE (err != REG_NOERROR, 0))
2776                     return err;
2777
2778                   buf = (const char *) re_string_get_buffer (&mctx->input);
2779                 }
2780               if (buf [bkref_str_off++] != buf[sl_str - 1])
2781                 break; /* We don't need to search this sub expression
2782                           any more.  */
2783             }
2784           if (mctx->state_log[sl_str] == NULL)
2785             continue;
2786           /* Does this state have a ')' of the sub expression?  */
2787           nodes = &mctx->state_log[sl_str]->nodes;
2788           cls_node = find_subexp_node (dfa, nodes, subexp_num,
2789                                        OP_CLOSE_SUBEXP);
2790           if (cls_node == -1)
2791             continue; /* No.  */
2792           if (sub_top->path == NULL)
2793             {
2794               sub_top->path = calloc (sizeof (state_array_t),
2795                                       sl_str - sub_top->str_idx + 1);
2796               if (sub_top->path == NULL)
2797                 return REG_ESPACE;
2798             }
2799           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2800              in the current context?  */
2801           err = check_arrival (mctx, sub_top->path, sub_top->node,
2802                                sub_top->str_idx, cls_node, sl_str,
2803                                OP_CLOSE_SUBEXP);
2804           if (err == REG_NOMATCH)
2805               continue;
2806           if (BE (err != REG_NOERROR, 0))
2807               return err;
2808           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2809           if (BE (sub_last == NULL, 0))
2810             return REG_ESPACE;
2811           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2812                                 bkref_str_idx);
2813           if (err == REG_NOMATCH)
2814             continue;
2815         }
2816     }
2817   return REG_NOERROR;
2818 }
2819
2820 /* Helper functions for get_subexp().  */
2821
2822 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2823    If it can arrive, register the sub expression expressed with SUB_TOP
2824    and SUB_LAST.  */
2825
2826 static reg_errcode_t
2827 internal_function
2828 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2829                 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2830 {
2831   reg_errcode_t err;
2832   int to_idx;
2833   /* Can the subexpression arrive the back reference?  */
2834   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2835                        sub_last->str_idx, bkref_node, bkref_str,
2836                        OP_OPEN_SUBEXP);
2837   if (err != REG_NOERROR)
2838     return err;
2839   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2840                              sub_last->str_idx);
2841   if (BE (err != REG_NOERROR, 0))
2842     return err;
2843   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2844   return clean_state_log_if_needed (mctx, to_idx);
2845 }
2846
2847 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2848    Search '(' if FL_OPEN, or search ')' otherwise.
2849    TODO: This function isn't efficient...
2850          Because there might be more than one nodes whose types are
2851          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2852          nodes.
2853          E.g. RE: (a){2}  */
2854
2855 static int
2856 internal_function
2857 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2858                   int subexp_idx, int type)
2859 {
2860   int cls_idx;
2861   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2862     {
2863       int cls_node = nodes->elems[cls_idx];
2864       const re_token_t *node = dfa->nodes + cls_node;
2865       if (node->type == type
2866           && node->opr.idx == subexp_idx)
2867         return cls_node;
2868     }
2869   return -1;
2870 }
2871
2872 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2873    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2874    heavily reused.
2875    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2876
2877 static reg_errcode_t
2878 internal_function __attribute_warn_unused_result__
2879 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2880                int top_str, int last_node, int last_str, int type)
2881 {
2882   const re_dfa_t *const dfa = mctx->dfa;
2883   reg_errcode_t err = REG_NOERROR;
2884   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2885   re_dfastate_t *cur_state = NULL;
2886   re_node_set *cur_nodes, next_nodes;
2887   re_dfastate_t **backup_state_log;
2888   unsigned int context;
2889
2890   subexp_num = dfa->nodes[top_node].opr.idx;
2891   /* Extend the buffer if we need.  */
2892   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2893     {
2894       re_dfastate_t **new_array;
2895       int old_alloc = path->alloc;
2896       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2897       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2898       if (BE (new_array == NULL, 0))
2899         {
2900           path->alloc = old_alloc;
2901           return REG_ESPACE;
2902         }
2903       path->array = new_array;
2904       memset (new_array + old_alloc, '\0',
2905               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2906     }
2907
2908   str_idx = path->next_idx ?: top_str;
2909
2910   /* Temporary modify MCTX.  */
2911   backup_state_log = mctx->state_log;
2912   backup_cur_idx = mctx->input.cur_idx;
2913   mctx->state_log = path->array;
2914   mctx->input.cur_idx = str_idx;
2915
2916   /* Setup initial node set.  */
2917   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2918   if (str_idx == top_str)
2919     {
2920       err = re_node_set_init_1 (&next_nodes, top_node);
2921       if (BE (err != REG_NOERROR, 0))
2922         return err;
2923       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2924       if (BE (err != REG_NOERROR, 0))
2925         {
2926           re_node_set_free (&next_nodes);
2927           return err;
2928         }
2929     }
2930   else
2931     {
2932       cur_state = mctx->state_log[str_idx];
2933       if (cur_state && cur_state->has_backref)
2934         {
2935           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2936           if (BE (err != REG_NOERROR, 0))
2937             return err;
2938         }
2939       else
2940         re_node_set_init_empty (&next_nodes);
2941     }
2942   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2943     {
2944       if (next_nodes.nelem)
2945         {
2946           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2947                                     subexp_num, type);
2948           if (BE (err != REG_NOERROR, 0))
2949             {
2950               re_node_set_free (&next_nodes);
2951               return err;
2952             }
2953         }
2954       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2955       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2956         {
2957           re_node_set_free (&next_nodes);
2958           return err;
2959         }
2960       mctx->state_log[str_idx] = cur_state;
2961     }
2962
2963   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2964     {
2965       re_node_set_empty (&next_nodes);
2966       if (mctx->state_log[str_idx + 1])
2967         {
2968           err = re_node_set_merge (&next_nodes,
2969                                    &mctx->state_log[str_idx + 1]->nodes);
2970           if (BE (err != REG_NOERROR, 0))
2971             {
2972               re_node_set_free (&next_nodes);
2973               return err;
2974             }
2975         }
2976       if (cur_state)
2977         {
2978           err = check_arrival_add_next_nodes (mctx, str_idx,
2979                                               &cur_state->non_eps_nodes,
2980                                               &next_nodes);
2981           if (BE (err != REG_NOERROR, 0))
2982             {
2983               re_node_set_free (&next_nodes);
2984               return err;
2985             }
2986         }
2987       ++str_idx;
2988       if (next_nodes.nelem)
2989         {
2990           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2991           if (BE (err != REG_NOERROR, 0))
2992             {
2993               re_node_set_free (&next_nodes);
2994               return err;
2995             }
2996           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2997                                     subexp_num, type);
2998           if (BE (err != REG_NOERROR, 0))
2999             {
3000               re_node_set_free (&next_nodes);
3001               return err;
3002             }
3003         }
3004       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3005       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3006       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3007         {
3008           re_node_set_free (&next_nodes);
3009           return err;
3010         }
3011       mctx->state_log[str_idx] = cur_state;
3012       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3013     }
3014   re_node_set_free (&next_nodes);
3015   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3016                : &mctx->state_log[last_str]->nodes);
3017   path->next_idx = str_idx;
3018
3019   /* Fix MCTX.  */
3020   mctx->state_log = backup_state_log;
3021   mctx->input.cur_idx = backup_cur_idx;
3022
3023   /* Then check the current node set has the node LAST_NODE.  */
3024   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3025     return REG_NOERROR;
3026
3027   return REG_NOMATCH;
3028 }
3029
3030 /* Helper functions for check_arrival.  */
3031
3032 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3033    to NEXT_NODES.
3034    TODO: This function is similar to the functions transit_state*(),
3035          however this function has many additional works.
3036          Can't we unify them?  */
3037
3038 static reg_errcode_t
3039 internal_function __attribute_warn_unused_result__
3040 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3041                               re_node_set *cur_nodes, re_node_set *next_nodes)
3042 {
3043   const re_dfa_t *const dfa = mctx->dfa;
3044   int result;
3045   int cur_idx;
3046   reg_errcode_t err = REG_NOERROR;
3047   re_node_set union_set;
3048   re_node_set_init_empty (&union_set);
3049   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3050     {
3051       int naccepted = 0;
3052       int cur_node = cur_nodes->elems[cur_idx];
3053 #ifdef DEBUG
3054       re_token_type_t type = dfa->nodes[cur_node].type;
3055       assert (!IS_EPSILON_NODE (type));
3056 #endif
3057 #ifdef RE_ENABLE_I18N
3058       /* If the node may accept `multi byte'.  */
3059       if (dfa->nodes[cur_node].accept_mb)
3060         {
3061           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3062                                                str_idx);
3063           if (naccepted > 1)
3064             {
3065               re_dfastate_t *dest_state;
3066               int next_node = dfa->nexts[cur_node];
3067               int next_idx = str_idx + naccepted;
3068               dest_state = mctx->state_log[next_idx];
3069               re_node_set_empty (&union_set);
3070               if (dest_state)
3071                 {
3072                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3073                   if (BE (err != REG_NOERROR, 0))
3074                     {
3075                       re_node_set_free (&union_set);
3076                       return err;
3077                     }
3078                 }
3079               result = re_node_set_insert (&union_set, next_node);
3080               if (BE (result < 0, 0))
3081                 {
3082                   re_node_set_free (&union_set);
3083                   return REG_ESPACE;
3084                 }
3085               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3086                                                             &union_set);
3087               if (BE (mctx->state_log[next_idx] == NULL
3088                       && err != REG_NOERROR, 0))
3089                 {
3090                   re_node_set_free (&union_set);
3091                   return err;
3092                 }
3093             }
3094         }
3095 #endif /* RE_ENABLE_I18N */
3096       if (naccepted
3097           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3098         {
3099           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3100           if (BE (result < 0, 0))
3101             {
3102               re_node_set_free (&union_set);
3103               return REG_ESPACE;
3104             }
3105         }
3106     }
3107   re_node_set_free (&union_set);
3108   return REG_NOERROR;
3109 }
3110
3111 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3112    CUR_NODES, however exclude the nodes which are:
3113     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3114     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3115 */
3116
3117 static reg_errcode_t
3118 internal_function
3119 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3120                           int ex_subexp, int type)
3121 {
3122   reg_errcode_t err;
3123   int idx, outside_node;
3124   re_node_set new_nodes;
3125 #ifdef DEBUG
3126   assert (cur_nodes->nelem);
3127 #endif
3128   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3129   if (BE (err != REG_NOERROR, 0))
3130     return err;
3131   /* Create a new node set NEW_NODES with the nodes which are epsilon
3132      closures of the node in CUR_NODES.  */
3133
3134   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3135     {
3136       int cur_node = cur_nodes->elems[idx];
3137       const re_node_set *eclosure = dfa->eclosures + cur_node;
3138       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3139       if (outside_node == -1)
3140         {
3141           /* There are no problematic nodes, just merge them.  */
3142           err = re_node_set_merge (&new_nodes, eclosure);
3143           if (BE (err != REG_NOERROR, 0))
3144             {
3145               re_node_set_free (&new_nodes);
3146               return err;
3147             }
3148         }
3149       else
3150         {
3151           /* There are problematic nodes, re-calculate incrementally.  */
3152           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3153                                               ex_subexp, type);
3154           if (BE (err != REG_NOERROR, 0))
3155             {
3156               re_node_set_free (&new_nodes);
3157               return err;
3158             }
3159         }
3160     }
3161   re_node_set_free (cur_nodes);
3162   *cur_nodes = new_nodes;
3163   return REG_NOERROR;
3164 }
3165
3166 /* Helper function for check_arrival_expand_ecl.
3167    Check incrementally the epsilon closure of TARGET, and if it isn't
3168    problematic append it to DST_NODES.  */
3169
3170 static reg_errcode_t
3171 internal_function __attribute_warn_unused_result__
3172 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3173                               int target, int ex_subexp, int type)
3174 {
3175   int cur_node;
3176   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3177     {
3178       int err;
3179
3180       if (dfa->nodes[cur_node].type == type
3181           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3182         {
3183           if (type == OP_CLOSE_SUBEXP)
3184             {
3185               err = re_node_set_insert (dst_nodes, cur_node);
3186               if (BE (err == -1, 0))
3187                 return REG_ESPACE;
3188             }
3189           break;
3190         }
3191       err = re_node_set_insert (dst_nodes, cur_node);
3192       if (BE (err == -1, 0))
3193         return REG_ESPACE;
3194       if (dfa->edests[cur_node].nelem == 0)
3195         break;
3196       if (dfa->edests[cur_node].nelem == 2)
3197         {
3198           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3199                                               dfa->edests[cur_node].elems[1],
3200                                               ex_subexp, type);
3201           if (BE (err != REG_NOERROR, 0))
3202             return err;
3203         }
3204       cur_node = dfa->edests[cur_node].elems[0];
3205     }
3206   return REG_NOERROR;
3207 }
3208
3209
3210 /* For all the back references in the current state, calculate the
3211    destination of the back references by the appropriate entry
3212    in MCTX->BKREF_ENTS.  */
3213
3214 static reg_errcode_t
3215 internal_function __attribute_warn_unused_result__
3216 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3217                     int cur_str, int subexp_num, int type)
3218 {
3219   const re_dfa_t *const dfa = mctx->dfa;
3220   reg_errcode_t err;
3221   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3222   struct re_backref_cache_entry *ent;
3223
3224   if (cache_idx_start == -1)
3225     return REG_NOERROR;
3226
3227  restart:
3228   ent = mctx->bkref_ents + cache_idx_start;
3229   do
3230     {
3231       int to_idx, next_node;
3232
3233       /* Is this entry ENT is appropriate?  */
3234       if (!re_node_set_contains (cur_nodes, ent->node))
3235         continue; /* No.  */
3236
3237       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3238       /* Calculate the destination of the back reference, and append it
3239          to MCTX->STATE_LOG.  */
3240       if (to_idx == cur_str)
3241         {
3242           /* The backreference did epsilon transit, we must re-check all the
3243              node in the current state.  */
3244           re_node_set new_dests;
3245           reg_errcode_t err2, err3;
3246           next_node = dfa->edests[ent->node].elems[0];
3247           if (re_node_set_contains (cur_nodes, next_node))
3248             continue;
3249           err = re_node_set_init_1 (&new_dests, next_node);
3250           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3251           err3 = re_node_set_merge (cur_nodes, &new_dests);
3252           re_node_set_free (&new_dests);
3253           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3254                   || err3 != REG_NOERROR, 0))
3255             {
3256               err = (err != REG_NOERROR ? err
3257                      : (err2 != REG_NOERROR ? err2 : err3));
3258               return err;
3259             }
3260           /* TODO: It is still inefficient...  */
3261           goto restart;
3262         }
3263       else
3264         {
3265           re_node_set union_set;
3266           next_node = dfa->nexts[ent->node];
3267           if (mctx->state_log[to_idx])
3268             {
3269               int ret;
3270               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3271                                         next_node))
3272                 continue;
3273               err = re_node_set_init_copy (&union_set,
3274                                            &mctx->state_log[to_idx]->nodes);
3275               ret = re_node_set_insert (&union_set, next_node);
3276               if (BE (err != REG_NOERROR || ret < 0, 0))
3277                 {
3278                   re_node_set_free (&union_set);
3279                   err = err != REG_NOERROR ? err : REG_ESPACE;
3280                   return err;
3281                 }
3282             }
3283           else
3284             {
3285               err = re_node_set_init_1 (&union_set, next_node);
3286               if (BE (err != REG_NOERROR, 0))
3287                 return err;
3288             }
3289           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3290           re_node_set_free (&union_set);
3291           if (BE (mctx->state_log[to_idx] == NULL
3292                   && err != REG_NOERROR, 0))
3293             return err;
3294         }
3295     }
3296   while (ent++->more);
3297   return REG_NOERROR;
3298 }
3299
3300 /* Build transition table for the state.
3301    Return 1 if succeeded, otherwise return NULL.  */
3302
3303 static int
3304 internal_function
3305 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3306 {
3307   reg_errcode_t err;
3308   int i, j, ch, need_word_trtable = 0;
3309   bitset_word_t elem, mask;
3310   bool dests_node_malloced = false;
3311   bool dest_states_malloced = false;
3312   int ndests; /* Number of the destination states from `state'.  */
3313   re_dfastate_t **trtable;
3314   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3315   re_node_set follows, *dests_node;
3316   bitset_t *dests_ch;
3317   bitset_t acceptable;
3318
3319   struct dests_alloc
3320   {
3321     re_node_set dests_node[SBC_MAX];
3322     bitset_t dests_ch[SBC_MAX];
3323   } *dests_alloc;
3324
3325   /* We build DFA states which corresponds to the destination nodes
3326      from `state'.  `dests_node[i]' represents the nodes which i-th
3327      destination state contains, and `dests_ch[i]' represents the
3328      characters which i-th destination state accepts.  */
3329   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3330     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3331   else
3332     {
3333       dests_alloc = re_malloc (struct dests_alloc, 1);
3334       if (BE (dests_alloc == NULL, 0))
3335         return 0;
3336       dests_node_malloced = true;
3337     }
3338   dests_node = dests_alloc->dests_node;
3339   dests_ch = dests_alloc->dests_ch;
3340
3341   /* Initialize transiton table.  */
3342   state->word_trtable = state->trtable = NULL;
3343
3344   /* At first, group all nodes belonging to `state' into several
3345      destinations.  */
3346   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3347   if (BE (ndests <= 0, 0))
3348     {
3349       if (dests_node_malloced)
3350         free (dests_alloc);
3351       /* Return 0 in case of an error, 1 otherwise.  */
3352       if (ndests == 0)
3353         {
3354           state->trtable = (re_dfastate_t **)
3355             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3356           return 1;
3357         }
3358       return 0;
3359     }
3360
3361   err = re_node_set_alloc (&follows, ndests + 1);
3362   if (BE (err != REG_NOERROR, 0))
3363     goto out_free;
3364
3365   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3366                          + ndests * 3 * sizeof (re_dfastate_t *)))
3367     dest_states = (re_dfastate_t **)
3368       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3369   else
3370     {
3371       dest_states = (re_dfastate_t **)
3372         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3373       if (BE (dest_states == NULL, 0))
3374         {
3375 out_free:
3376           if (dest_states_malloced)
3377             free (dest_states);
3378           re_node_set_free (&follows);
3379           for (i = 0; i < ndests; ++i)
3380             re_node_set_free (dests_node + i);
3381           if (dests_node_malloced)
3382             free (dests_alloc);
3383           return 0;
3384         }
3385       dest_states_malloced = true;
3386     }
3387   dest_states_word = dest_states + ndests;
3388   dest_states_nl = dest_states_word + ndests;
3389   bitset_empty (acceptable);
3390
3391   /* Then build the states for all destinations.  */
3392   for (i = 0; i < ndests; ++i)
3393     {
3394       int next_node;
3395       re_node_set_empty (&follows);
3396       /* Merge the follows of this destination states.  */
3397       for (j = 0; j < dests_node[i].nelem; ++j)
3398         {
3399           next_node = dfa->nexts[dests_node[i].elems[j]];
3400           if (next_node != -1)
3401             {
3402               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3403               if (BE (err != REG_NOERROR, 0))
3404                 goto out_free;
3405             }
3406         }
3407       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3408       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3409         goto out_free;
3410       /* If the new state has context constraint,
3411          build appropriate states for these contexts.  */
3412       if (dest_states[i]->has_constraint)
3413         {
3414           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3415                                                           CONTEXT_WORD);
3416           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3417             goto out_free;
3418
3419           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3420             need_word_trtable = 1;
3421
3422           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3423                                                         CONTEXT_NEWLINE);
3424           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3425             goto out_free;
3426         }
3427       else
3428         {
3429           dest_states_word[i] = dest_states[i];
3430           dest_states_nl[i] = dest_states[i];
3431         }
3432       bitset_merge (acceptable, dests_ch[i]);
3433     }
3434
3435   if (!BE (need_word_trtable, 0))
3436     {
3437       /* We don't care about whether the following character is a word
3438          character, or we are in a single-byte character set so we can
3439          discern by looking at the character code: allocate a
3440          256-entry transition table.  */
3441       trtable = state->trtable =
3442         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3443       if (BE (trtable == NULL, 0))
3444         goto out_free;
3445
3446       /* For all characters ch...:  */
3447       for (i = 0; i < BITSET_WORDS; ++i)
3448         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3449              elem;
3450              mask <<= 1, elem >>= 1, ++ch)
3451           if (BE (elem & 1, 0))
3452             {
3453               /* There must be exactly one destination which accepts
3454                  character ch.  See group_nodes_into_DFAstates.  */
3455               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3456                 ;
3457
3458               /* j-th destination accepts the word character ch.  */
3459               if (dfa->word_char[i] & mask)
3460                 trtable[ch] = dest_states_word[j];
3461               else
3462                 trtable[ch] = dest_states[j];
3463             }
3464     }
3465   else
3466     {
3467       /* We care about whether the following character is a word
3468          character, and we are in a multi-byte character set: discern
3469          by looking at the character code: build two 256-entry
3470          transition tables, one starting at trtable[0] and one
3471          starting at trtable[SBC_MAX].  */
3472       trtable = state->word_trtable =
3473         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3474       if (BE (trtable == NULL, 0))
3475         goto out_free;
3476
3477       /* For all characters ch...:  */
3478       for (i = 0; i < BITSET_WORDS; ++i)
3479         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3480              elem;
3481              mask <<= 1, elem >>= 1, ++ch)
3482           if (BE (elem & 1, 0))
3483             {
3484               /* There must be exactly one destination which accepts
3485                  character ch.  See group_nodes_into_DFAstates.  */
3486               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3487                 ;
3488
3489               /* j-th destination accepts the word character ch.  */
3490               trtable[ch] = dest_states[j];
3491               trtable[ch + SBC_MAX] = dest_states_word[j];
3492             }
3493     }
3494
3495   /* new line */
3496   if (bitset_contain (acceptable, NEWLINE_CHAR))
3497     {
3498       /* The current state accepts newline character.  */
3499       for (j = 0; j < ndests; ++j)
3500         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3501           {
3502             /* k-th destination accepts newline character.  */
3503             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3504             if (need_word_trtable)
3505               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3506             /* There must be only one destination which accepts
3507                newline.  See group_nodes_into_DFAstates.  */
3508             break;
3509           }
3510     }
3511
3512   if (dest_states_malloced)
3513     free (dest_states);
3514
3515   re_node_set_free (&follows);
3516   for (i = 0; i < ndests; ++i)
3517     re_node_set_free (dests_node + i);
3518
3519   if (dests_node_malloced)
3520     free (dests_alloc);
3521
3522   return 1;
3523 }
3524
3525 /* Group all nodes belonging to STATE into several destinations.
3526    Then for all destinations, set the nodes belonging to the destination
3527    to DESTS_NODE[i] and set the characters accepted by the destination
3528    to DEST_CH[i].  This function return the number of destinations.  */
3529
3530 static int
3531 internal_function
3532 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3533                             re_node_set *dests_node, bitset_t *dests_ch)
3534 {
3535   reg_errcode_t err;
3536   int result;
3537   int i, j, k;
3538   int ndests; /* Number of the destinations from `state'.  */
3539   bitset_t accepts; /* Characters a node can accept.  */
3540   const re_node_set *cur_nodes = &state->nodes;
3541   bitset_empty (accepts);
3542   ndests = 0;
3543
3544   /* For all the nodes belonging to `state',  */
3545   for (i = 0; i < cur_nodes->nelem; ++i)
3546     {
3547       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3548       re_token_type_t type = node->type;
3549       unsigned int constraint = node->constraint;
3550
3551       /* Enumerate all single byte character this node can accept.  */
3552       if (type == CHARACTER)
3553         bitset_set (accepts, node->opr.c);
3554       else if (type == SIMPLE_BRACKET)
3555         {
3556           bitset_merge (accepts, node->opr.sbcset);
3557         }
3558       else if (type == OP_PERIOD)
3559         {
3560 #ifdef RE_ENABLE_I18N
3561           if (dfa->mb_cur_max > 1)
3562             bitset_merge (accepts, dfa->sb_char);
3563           else
3564 #endif
3565             bitset_set_all (accepts);
3566           if (!(dfa->syntax & RE_DOT_NEWLINE))
3567             bitset_clear (accepts, '\n');
3568           if (dfa->syntax & RE_DOT_NOT_NULL)
3569             bitset_clear (accepts, '\0');
3570         }
3571 #ifdef RE_ENABLE_I18N
3572       else if (type == OP_UTF8_PERIOD)
3573         {
3574           memset (accepts, '\xff', sizeof (bitset_t) / 2);
3575           if (!(dfa->syntax & RE_DOT_NEWLINE))
3576             bitset_clear (accepts, '\n');
3577           if (dfa->syntax & RE_DOT_NOT_NULL)
3578             bitset_clear (accepts, '\0');
3579         }
3580 #endif
3581       else
3582         continue;
3583
3584       /* Check the `accepts' and sift the characters which are not
3585          match it the context.  */
3586       if (constraint)
3587         {
3588           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3589             {
3590               bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3591               bitset_empty (accepts);
3592               if (accepts_newline)
3593                 bitset_set (accepts, NEWLINE_CHAR);
3594               else
3595                 continue;
3596             }
3597           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3598             {
3599               bitset_empty (accepts);
3600               continue;
3601             }
3602
3603           if (constraint & NEXT_WORD_CONSTRAINT)
3604             {
3605               bitset_word_t any_set = 0;
3606               if (type == CHARACTER && !node->word_char)
3607                 {
3608                   bitset_empty (accepts);
3609                   continue;
3610                 }
3611 #ifdef RE_ENABLE_I18N
3612               if (dfa->mb_cur_max > 1)
3613                 for (j = 0; j < BITSET_WORDS; ++j)
3614                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3615               else
3616 #endif
3617                 for (j = 0; j < BITSET_WORDS; ++j)
3618                   any_set |= (accepts[j] &= dfa->word_char[j]);
3619               if (!any_set)
3620                 continue;
3621             }
3622           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3623             {
3624               bitset_word_t any_set = 0;
3625               if (type == CHARACTER && node->word_char)
3626                 {
3627                   bitset_empty (accepts);
3628                   continue;
3629                 }
3630 #ifdef RE_ENABLE_I18N
3631               if (dfa->mb_cur_max > 1)
3632                 for (j = 0; j < BITSET_WORDS; ++j)
3633                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3634               else
3635 #endif
3636                 for (j = 0; j < BITSET_WORDS; ++j)
3637                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3638               if (!any_set)
3639                 continue;
3640             }
3641         }
3642
3643       /* Then divide `accepts' into DFA states, or create a new
3644          state.  Above, we make sure that accepts is not empty.  */
3645       for (j = 0; j < ndests; ++j)
3646         {
3647           bitset_t intersec; /* Intersection sets, see below.  */
3648           bitset_t remains;
3649           /* Flags, see below.  */
3650           bitset_word_t has_intersec, not_subset, not_consumed;
3651
3652           /* Optimization, skip if this state doesn't accept the character.  */
3653           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3654             continue;
3655
3656           /* Enumerate the intersection set of this state and `accepts'.  */
3657           has_intersec = 0;
3658           for (k = 0; k < BITSET_WORDS; ++k)
3659             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3660           /* And skip if the intersection set is empty.  */
3661           if (!has_intersec)
3662             continue;
3663
3664           /* Then check if this state is a subset of `accepts'.  */
3665           not_subset = not_consumed = 0;
3666           for (k = 0; k < BITSET_WORDS; ++k)
3667             {
3668               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3669               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3670             }
3671
3672           /* If this state isn't a subset of `accepts', create a
3673              new group state, which has the `remains'. */
3674           if (not_subset)
3675             {
3676               bitset_copy (dests_ch[ndests], remains);
3677               bitset_copy (dests_ch[j], intersec);
3678               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3679               if (BE (err != REG_NOERROR, 0))
3680                 goto error_return;
3681               ++ndests;
3682             }
3683
3684           /* Put the position in the current group. */
3685           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3686           if (BE (result < 0, 0))
3687             goto error_return;
3688
3689           /* If all characters are consumed, go to next node. */
3690           if (!not_consumed)
3691             break;
3692         }
3693       /* Some characters remain, create a new group. */
3694       if (j == ndests)
3695         {
3696           bitset_copy (dests_ch[ndests], accepts);
3697           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3698           if (BE (err != REG_NOERROR, 0))
3699             goto error_return;
3700           ++ndests;
3701           bitset_empty (accepts);
3702         }
3703     }
3704   return ndests;
3705  error_return:
3706   for (j = 0; j < ndests; ++j)
3707     re_node_set_free (dests_node + j);
3708   return -1;
3709 }
3710
3711 #ifdef RE_ENABLE_I18N
3712 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3713    Return the number of the bytes the node accepts.
3714    STR_IDX is the current index of the input string.
3715
3716    This function handles the nodes which can accept one character, or
3717    one collating element like '.', '[a-z]', opposite to the other nodes
3718    can only accept one byte.  */
3719
3720 static int
3721 internal_function
3722 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3723                          const re_string_t *input, int str_idx)
3724 {
3725   const re_token_t *node = dfa->nodes + node_idx;
3726   int char_len, elem_len;
3727   int i;
3728
3729   if (BE (node->type == OP_UTF8_PERIOD, 0))
3730     {
3731       unsigned char c = re_string_byte_at (input, str_idx), d;
3732       if (BE (c < 0xc2, 1))
3733         return 0;
3734
3735       if (str_idx + 2 > input->len)
3736         return 0;
3737
3738       d = re_string_byte_at (input, str_idx + 1);
3739       if (c < 0xe0)
3740         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3741       else if (c < 0xf0)
3742         {
3743           char_len = 3;
3744           if (c == 0xe0 && d < 0xa0)
3745             return 0;
3746         }
3747       else if (c < 0xf8)
3748         {
3749           char_len = 4;
3750           if (c == 0xf0 && d < 0x90)
3751             return 0;
3752         }
3753       else if (c < 0xfc)
3754         {
3755           char_len = 5;
3756           if (c == 0xf8 && d < 0x88)
3757             return 0;
3758         }
3759       else if (c < 0xfe)
3760         {
3761           char_len = 6;
3762           if (c == 0xfc && d < 0x84)
3763             return 0;
3764         }
3765       else
3766         return 0;
3767
3768       if (str_idx + char_len > input->len)
3769         return 0;
3770
3771       for (i = 1; i < char_len; ++i)
3772         {
3773           d = re_string_byte_at (input, str_idx + i);
3774           if (d < 0x80 || d > 0xbf)
3775             return 0;
3776         }
3777       return char_len;
3778     }
3779
3780   char_len = re_string_char_size_at (input, str_idx);
3781   if (node->type == OP_PERIOD)
3782     {
3783       if (char_len <= 1)
3784         return 0;
3785       /* FIXME: I don't think this if is needed, as both '\n'
3786          and '\0' are char_len == 1.  */
3787       /* '.' accepts any one character except the following two cases.  */
3788       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3789            re_string_byte_at (input, str_idx) == '\n') ||
3790           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3791            re_string_byte_at (input, str_idx) == '\0'))
3792         return 0;
3793       return char_len;
3794     }
3795
3796   elem_len = re_string_elem_size_at (input, str_idx);
3797   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3798     return 0;
3799
3800   if (node->type == COMPLEX_BRACKET)
3801     {
3802       const re_charset_t *cset = node->opr.mbcset;
3803 # ifdef _LIBC
3804       const unsigned char *pin
3805         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3806       int j;
3807       uint32_t nrules;
3808 # endif /* _LIBC */
3809       int match_len = 0;
3810       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3811                     ? re_string_wchar_at (input, str_idx) : 0);
3812
3813       /* match with multibyte character?  */
3814       for (i = 0; i < cset->nmbchars; ++i)
3815         if (wc == cset->mbchars[i])
3816           {
3817             match_len = char_len;
3818             goto check_node_accept_bytes_match;
3819           }
3820       /* match with character_class?  */
3821       for (i = 0; i < cset->nchar_classes; ++i)
3822         {
3823           wctype_t wt = cset->char_classes[i];
3824           if (__iswctype (wc, wt))
3825             {
3826               match_len = char_len;
3827               goto check_node_accept_bytes_match;
3828             }
3829         }
3830
3831 # ifdef _LIBC
3832       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3833       if (nrules != 0)
3834         {
3835           unsigned int in_collseq = 0;
3836           const int32_t *table, *indirect;
3837           const unsigned char *weights, *extra;
3838           const char *collseqwc;
3839           /* This #include defines a local function!  */
3840 #  include <locale/weight.h>
3841
3842           /* match with collating_symbol?  */
3843           if (cset->ncoll_syms)
3844             extra = (const unsigned char *)
3845               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3846           for (i = 0; i < cset->ncoll_syms; ++i)
3847             {
3848               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3849               /* Compare the length of input collating element and
3850                  the length of current collating element.  */
3851               if (*coll_sym != elem_len)
3852                 continue;
3853               /* Compare each bytes.  */
3854               for (j = 0; j < *coll_sym; j++)
3855                 if (pin[j] != coll_sym[1 + j])
3856                   break;
3857               if (j == *coll_sym)
3858                 {
3859                   /* Match if every bytes is equal.  */
3860                   match_len = j;
3861                   goto check_node_accept_bytes_match;
3862                 }
3863             }
3864
3865           if (cset->nranges)
3866             {
3867               if (elem_len <= char_len)
3868                 {
3869                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3870                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3871                 }
3872               else
3873                 in_collseq = find_collation_sequence_value (pin, elem_len);
3874             }
3875           /* match with range expression?  */
3876           for (i = 0; i < cset->nranges; ++i)
3877             if (cset->range_starts[i] <= in_collseq
3878                 && in_collseq <= cset->range_ends[i])
3879               {
3880                 match_len = elem_len;
3881                 goto check_node_accept_bytes_match;
3882               }
3883
3884           /* match with equivalence_class?  */
3885           if (cset->nequiv_classes)
3886             {
3887               const unsigned char *cp = pin;
3888               table = (const int32_t *)
3889                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3890               weights = (const unsigned char *)
3891                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3892               extra = (const unsigned char *)
3893                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3894               indirect = (const int32_t *)
3895                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3896               int32_t idx = findidx (&cp);
3897               if (idx > 0)
3898                 for (i = 0; i < cset->nequiv_classes; ++i)
3899                   {
3900                     int32_t equiv_class_idx = cset->equiv_classes[i];
3901                     size_t weight_len = weights[idx & 0xffffff];
3902                     if (weight_len == weights[equiv_class_idx & 0xffffff]
3903                         && (idx >> 24) == (equiv_class_idx >> 24))
3904                       {
3905                         int cnt = 0;
3906
3907                         idx &= 0xffffff;
3908                         equiv_class_idx &= 0xffffff;
3909
3910                         while (cnt <= weight_len
3911                                && (weights[equiv_class_idx + 1 + cnt]
3912                                    == weights[idx + 1 + cnt]))
3913                           ++cnt;
3914                         if (cnt > weight_len)
3915                           {
3916                             match_len = elem_len;
3917                             goto check_node_accept_bytes_match;
3918                           }
3919                       }
3920                   }
3921             }
3922         }
3923       else
3924 # endif /* _LIBC */
3925         {
3926           /* match with range expression?  */
3927 #if __GNUC__ >= 2
3928           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3929 #else
3930           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3931           cmp_buf[2] = wc;
3932 #endif
3933           for (i = 0; i < cset->nranges; ++i)
3934             {
3935               cmp_buf[0] = cset->range_starts[i];
3936               cmp_buf[4] = cset->range_ends[i];
3937               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3938                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3939                 {
3940                   match_len = char_len;
3941                   goto check_node_accept_bytes_match;
3942                 }
3943             }
3944         }
3945     check_node_accept_bytes_match:
3946       if (!cset->non_match)
3947         return match_len;
3948       else
3949         {
3950           if (match_len > 0)
3951             return 0;
3952           else
3953             return (elem_len > char_len) ? elem_len : char_len;
3954         }
3955     }
3956   return 0;
3957 }
3958
3959 # ifdef _LIBC
3960 static unsigned int
3961 internal_function
3962 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3963 {
3964   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3965   if (nrules == 0)
3966     {
3967       if (mbs_len == 1)
3968         {
3969           /* No valid character.  Match it as a single byte character.  */
3970           const unsigned char *collseq = (const unsigned char *)
3971             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3972           return collseq[mbs[0]];
3973         }
3974       return UINT_MAX;
3975     }
3976   else
3977     {
3978       int32_t idx;
3979       const unsigned char *extra = (const unsigned char *)
3980         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3981       int32_t extrasize = (const unsigned char *)
3982         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3983
3984       for (idx = 0; idx < extrasize;)
3985         {
3986           int mbs_cnt, found = 0;
3987           int32_t elem_mbs_len;
3988           /* Skip the name of collating element name.  */
3989           idx = idx + extra[idx] + 1;
3990           elem_mbs_len = extra[idx++];
3991           if (mbs_len == elem_mbs_len)
3992             {
3993               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3994                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3995                   break;
3996               if (mbs_cnt == elem_mbs_len)
3997                 /* Found the entry.  */
3998                 found = 1;
3999             }
4000           /* Skip the byte sequence of the collating element.  */
4001           idx += elem_mbs_len;
4002           /* Adjust for the alignment.  */
4003           idx = (idx + 3) & ~3;
4004           /* Skip the collation sequence value.  */
4005           idx += sizeof (uint32_t);
4006           /* Skip the wide char sequence of the collating element.  */
4007           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
4008           /* If we found the entry, return the sequence value.  */
4009           if (found)
4010             return *(uint32_t *) (extra + idx);
4011           /* Skip the collation sequence value.  */
4012           idx += sizeof (uint32_t);
4013         }
4014       return UINT_MAX;
4015     }
4016 }
4017 # endif /* _LIBC */
4018 #endif /* RE_ENABLE_I18N */
4019
4020 /* Check whether the node accepts the byte which is IDX-th
4021    byte of the INPUT.  */
4022
4023 static int
4024 internal_function
4025 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4026                    int idx)
4027 {
4028   unsigned char ch;
4029   ch = re_string_byte_at (&mctx->input, idx);
4030   switch (node->type)
4031     {
4032     case CHARACTER:
4033       if (node->opr.c != ch)
4034         return 0;
4035       break;
4036
4037     case SIMPLE_BRACKET:
4038       if (!bitset_contain (node->opr.sbcset, ch))
4039         return 0;
4040       break;
4041
4042 #ifdef RE_ENABLE_I18N
4043     case OP_UTF8_PERIOD:
4044       if (ch >= 0x80)
4045         return 0;
4046       /* FALLTHROUGH */
4047 #endif
4048     case OP_PERIOD:
4049       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4050           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4051         return 0;
4052       break;
4053
4054     default:
4055       return 0;
4056     }
4057
4058   if (node->constraint)
4059     {
4060       /* The node has constraints.  Check whether the current context
4061          satisfies the constraints.  */
4062       unsigned int context = re_string_context_at (&mctx->input, idx,
4063                                                    mctx->eflags);
4064       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4065         return 0;
4066     }
4067
4068   return 1;
4069 }
4070
4071 /* Extend the buffers, if the buffers have run out.  */
4072
4073 static reg_errcode_t
4074 internal_function __attribute_warn_unused_result__
4075 extend_buffers (re_match_context_t *mctx)
4076 {
4077   reg_errcode_t ret;
4078   re_string_t *pstr = &mctx->input;
4079
4080   /* Double the lengthes of the buffers.  */
4081   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4082   if (BE (ret != REG_NOERROR, 0))
4083     return ret;
4084
4085   if (mctx->state_log != NULL)
4086     {
4087       /* And double the length of state_log.  */
4088       /* XXX We have no indication of the size of this buffer.  If this
4089          allocation fail we have no indication that the state_log array
4090          does not have the right size.  */
4091       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4092                                               pstr->bufs_len + 1);
4093       if (BE (new_array == NULL, 0))
4094         return REG_ESPACE;
4095       mctx->state_log = new_array;
4096     }
4097
4098   /* Then reconstruct the buffers.  */
4099   if (pstr->icase)
4100     {
4101 #ifdef RE_ENABLE_I18N
4102       if (pstr->mb_cur_max > 1)
4103         {
4104           ret = build_wcs_upper_buffer (pstr);
4105           if (BE (ret != REG_NOERROR, 0))
4106             return ret;
4107         }
4108       else
4109 #endif /* RE_ENABLE_I18N  */
4110         build_upper_buffer (pstr);
4111     }
4112   else
4113     {
4114 #ifdef RE_ENABLE_I18N
4115       if (pstr->mb_cur_max > 1)
4116         build_wcs_buffer (pstr);
4117       else
4118 #endif /* RE_ENABLE_I18N  */
4119         {
4120           if (pstr->trans != NULL)
4121             re_string_translate_buffer (pstr);
4122         }
4123     }
4124   return REG_NOERROR;
4125 }
4126
4127 \f
4128 /* Functions for matching context.  */
4129
4130 /* Initialize MCTX.  */
4131
4132 static reg_errcode_t
4133 internal_function __attribute_warn_unused_result__
4134 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4135 {
4136   mctx->eflags = eflags;
4137   mctx->match_last = -1;
4138   if (n > 0)
4139     {
4140       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4141       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4142       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4143         return REG_ESPACE;
4144     }
4145   /* Already zero-ed by the caller.
4146      else
4147        mctx->bkref_ents = NULL;
4148      mctx->nbkref_ents = 0;
4149      mctx->nsub_tops = 0;  */
4150   mctx->abkref_ents = n;
4151   mctx->max_mb_elem_len = 1;
4152   mctx->asub_tops = n;
4153   return REG_NOERROR;
4154 }
4155
4156 /* Clean the entries which depend on the current input in MCTX.
4157    This function must be invoked when the matcher changes the start index
4158    of the input, or changes the input string.  */
4159
4160 static void
4161 internal_function
4162 match_ctx_clean (re_match_context_t *mctx)
4163 {
4164   int st_idx;
4165   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4166     {
4167       int sl_idx;
4168       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4169       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4170         {
4171           re_sub_match_last_t *last = top->lasts[sl_idx];
4172           re_free (last->path.array);
4173           re_free (last);
4174         }
4175       re_free (top->lasts);
4176       if (top->path)
4177         {
4178           re_free (top->path->array);
4179           re_free (top->path);
4180         }
4181       free (top);
4182     }
4183
4184   mctx->nsub_tops = 0;
4185   mctx->nbkref_ents = 0;
4186 }
4187
4188 /* Free all the memory associated with MCTX.  */
4189
4190 static void
4191 internal_function
4192 match_ctx_free (re_match_context_t *mctx)
4193 {
4194   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4195   match_ctx_clean (mctx);
4196   re_free (mctx->sub_tops);
4197   re_free (mctx->bkref_ents);
4198 }
4199
4200 /* Add a new backreference entry to MCTX.
4201    Note that we assume that caller never call this function with duplicate
4202    entry, and call with STR_IDX which isn't smaller than any existing entry.
4203 */
4204
4205 static reg_errcode_t
4206 internal_function __attribute_warn_unused_result__
4207 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4208                      int to)
4209 {
4210   if (mctx->nbkref_ents >= mctx->abkref_ents)
4211     {
4212       struct re_backref_cache_entry* new_entry;
4213       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4214                               mctx->abkref_ents * 2);
4215       if (BE (new_entry == NULL, 0))
4216         {
4217           re_free (mctx->bkref_ents);
4218           return REG_ESPACE;
4219         }
4220       mctx->bkref_ents = new_entry;
4221       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4222               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4223       mctx->abkref_ents *= 2;
4224     }
4225   if (mctx->nbkref_ents > 0
4226       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4227     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4228
4229   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4230   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4231   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4232   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4233
4234   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4235      If bit N is clear, means that this entry won't epsilon-transition to
4236      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4237      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4238      such node.
4239
4240      A backreference does not epsilon-transition unless it is empty, so set
4241      to all zeros if FROM != TO.  */
4242   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4243     = (from == to ? ~0 : 0);
4244
4245   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4246   if (mctx->max_mb_elem_len < to - from)
4247     mctx->max_mb_elem_len = to - from;
4248   return REG_NOERROR;
4249 }
4250
4251 /* Search for the first entry which has the same str_idx, or -1 if none is
4252    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4253
4254 static int
4255 internal_function
4256 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4257 {
4258   int left, right, mid, last;
4259   last = right = mctx->nbkref_ents;
4260   for (left = 0; left < right;)
4261     {
4262       mid = (left + right) / 2;
4263       if (mctx->bkref_ents[mid].str_idx < str_idx)
4264         left = mid + 1;
4265       else
4266         right = mid;
4267     }
4268   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4269     return left;
4270   else
4271     return -1;
4272 }
4273
4274 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4275    at STR_IDX.  */
4276
4277 static reg_errcode_t
4278 internal_function __attribute_warn_unused_result__
4279 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4280 {
4281 #ifdef DEBUG
4282   assert (mctx->sub_tops != NULL);
4283   assert (mctx->asub_tops > 0);
4284 #endif
4285   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4286     {
4287       int new_asub_tops = mctx->asub_tops * 2;
4288       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4289                                                    re_sub_match_top_t *,
4290                                                    new_asub_tops);
4291       if (BE (new_array == NULL, 0))
4292         return REG_ESPACE;
4293       mctx->sub_tops = new_array;
4294       mctx->asub_tops = new_asub_tops;
4295     }
4296   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4297   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4298     return REG_ESPACE;
4299   mctx->sub_tops[mctx->nsub_tops]->node = node;
4300   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4301   return REG_NOERROR;
4302 }
4303
4304 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4305    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4306
4307 static re_sub_match_last_t *
4308 internal_function
4309 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4310 {
4311   re_sub_match_last_t *new_entry;
4312   if (BE (subtop->nlasts == subtop->alasts, 0))
4313     {
4314       int new_alasts = 2 * subtop->alasts + 1;
4315       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4316                                                     re_sub_match_last_t *,
4317                                                     new_alasts);
4318       if (BE (new_array == NULL, 0))
4319         return NULL;
4320       subtop->lasts = new_array;
4321       subtop->alasts = new_alasts;
4322     }
4323   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4324   if (BE (new_entry != NULL, 1))
4325     {
4326       subtop->lasts[subtop->nlasts] = new_entry;
4327       new_entry->node = node;
4328       new_entry->str_idx = str_idx;
4329       ++subtop->nlasts;
4330     }
4331   return new_entry;
4332 }
4333
4334 static void
4335 internal_function
4336 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4337                re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4338 {
4339   sctx->sifted_states = sifted_sts;
4340   sctx->limited_states = limited_sts;
4341   sctx->last_node = last_node;
4342   sctx->last_str_idx = last_str_idx;
4343   re_node_set_init_empty (&sctx->limits);
4344 }