posix/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                      int n) internal_function;
  23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24 static void match_ctx_free (re_match_context_t *cache) internal_function;
  25 static void match_ctx_free_subtops (re_match_context_t *mctx)
  26      internal_function;
  27 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  28                                           int str_idx, int from, int to)
  29      internal_function;
  30 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
  31      internal_function;
  32 static void match_ctx_clear_flag (re_match_context_t *mctx) internal_function;
  33 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  34                                            int str_idx) internal_function;
  35 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  36                                                    int node, int str_idx)
  37      internal_function;
  38 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  39                            re_dfastate_t **limited_sts, int last_node,
  40                            int last_str_idx, int check_subexp)
  41      internal_function;
  42 static reg_errcode_t re_search_internal (const regex_t *preg,
  43                                          const char *string, int length,
  44                                          int start, int range, int stop,
  45                                          size_t nmatch, regmatch_t pmatch[],
  46                                          int eflags) internal_function;
  47 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  48                              const char *string1, int length1,
  49                              const char *string2, int length2,
  50                              int start, int range, struct re_registers *regs,
  51                              int stop, int ret_len) internal_function;
  52 static int re_search_stub (struct re_pattern_buffer *bufp,
  53                            const char *string, int length, int start,
  54                            int range, int stop, struct re_registers *regs,
  55                            int ret_len) internal_function;
  56 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  57                               int nregs, int regs_allocated) internal_function;
  58 static inline re_dfastate_t *acquire_init_state_context
  59      (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
  60      __attribute ((always_inline)) internal_function;
  61 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  62      internal_function;
  63 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  64                            int *p_match_first)
  65      internal_function;
  66 static int check_halt_node_context (const re_dfa_t *dfa, int node,
  67                                     unsigned int context) internal_function;
  68 static int check_halt_state_context (const re_match_context_t *mctx,
  69                                      const re_dfastate_t *state, int idx)
  70      internal_function;
  71 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
  72                          regmatch_t *prev_idx_match, int cur_node,
  73                          int cur_idx, int nmatch) internal_function;
  74 static int proceed_next_node (const re_match_context_t *mctx,
  75                               int nregs, regmatch_t *regs,
  76                               int *pidx, int node, re_node_set *eps_via_nodes,
  77                               struct re_fail_stack_t *fs) internal_function;
  78 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  79                                       int str_idx, int *dests, int nregs,
  80                                       regmatch_t *regs,
  81                                       re_node_set *eps_via_nodes) internal_function;
  82 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
  83                            regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
  84 static reg_errcode_t set_regs (const regex_t *preg,
  85                                const re_match_context_t *mctx,
  86                                size_t nmatch, regmatch_t *pmatch,
  87                                int fl_backtrack) internal_function;
  88 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
  89
  90 #ifdef RE_ENABLE_I18N
  91 static int sift_states_iter_mb (const re_match_context_t *mctx,
  92                                 re_sift_context_t *sctx,
  93                                 int node_idx, int str_idx, int max_str_idx) internal_function;
  94 #endif /* RE_ENABLE_I18N */
  95 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
  96                                            re_sift_context_t *sctx) internal_function;
  97 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
  98                                               re_sift_context_t *sctx,
  99                                               int str_idx,
 100                                               re_node_set *dest_nodes) internal_function;
 101 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
 102                                             re_node_set *dest_nodes,
 103                                             const re_node_set *candidates) internal_function;
 104 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
 105                                             re_node_set *dest_nodes,
 106                                             const re_node_set *and_nodes) internal_function;
 107 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
 108                              int dst_node, int dst_idx, int src_node,
 109                              int src_idx) internal_function;
 110 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
 111                                       int limit, re_node_set *eclosures,
 112                                       int subexp_idx, int node, int str_idx) internal_function;
 113 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
 114                                           re_node_set *dest_nodes,
 115                                           const re_node_set *candidates,
 116                                           re_node_set *limits,
 117                                           struct re_backref_cache_entry *bkref_ents,
 118                                           int str_idx) internal_function;
 119 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
 120                                         re_sift_context_t *sctx,
 121                                         int str_idx, re_node_set *dest_nodes) internal_function;
 122 static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
 123                                                 int next_state_log_idx) internal_function;
 124 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
 125                                         re_dfastate_t **src, int num) internal_function;
 126 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 127                                          re_match_context_t *mctx) internal_function;
 128 static re_dfastate_t *transit_state (reg_errcode_t *err,
 129                                      re_match_context_t *mctx,
 130                                      re_dfastate_t *state) internal_function;
 131 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 132                                             re_match_context_t *mctx,
 133                                             re_dfastate_t *next_state) internal_function;
 134 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 135                                                 re_node_set *cur_nodes,
 136                                                 int str_idx) internal_function;
 137 #if 0
 138 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 139                                         re_match_context_t *mctx,
 140                                         re_dfastate_t *pstate) internal_function;
 141 #endif
 142 #ifdef RE_ENABLE_I18N
 143 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 144                                        re_dfastate_t *pstate) internal_function;
 145 #endif /* RE_ENABLE_I18N */
 146 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 147                                           const re_node_set *nodes) internal_function;
 148 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 149                                  int bkref_node, int bkref_str_idx) internal_function;
 150 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 151                                      const re_sub_match_top_t *sub_top,
 152                                      re_sub_match_last_t *sub_last,
 153                                      int bkref_node, int bkref_str) internal_function;
 154 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 155                              int subexp_idx, int type) internal_function;
 156 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 157                                     state_array_t *path, int top_node,
 158                                     int top_str, int last_node, int last_str,
 159                                     int type) internal_function;
 160 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 161                                                    int str_idx,
 162                                                    re_node_set *cur_nodes,
 163                                                    re_node_set *next_nodes) internal_function;
 164 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
 165                                                re_node_set *cur_nodes,
 166                                                int ex_subexp, int type) internal_function;
 167 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
 168                                                    re_node_set *dst_nodes,
 169                                                    int target, int ex_subexp,
 170                                                    int type) internal_function;
 171 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 172                                          re_node_set *cur_nodes, int cur_str,
 173                                          int last_str, int subexp_num,
 174                                          int type) internal_function;
 175 static re_dfastate_t **build_trtable (re_dfa_t *dfa,
 176                                       re_dfastate_t *state) internal_function;
 177 #ifdef RE_ENABLE_I18N
 178 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
 179                                     const re_string_t *input, int idx) internal_function;
 180 # ifdef _LIBC
 181 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 182                                                    size_t name_len) internal_function;
 183 # endif /* _LIBC */
 184 #endif /* RE_ENABLE_I18N */
 185 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
 186                                        const re_dfastate_t *state,
 187                                        re_node_set *states_node,
 188                                        bitset *states_ch) internal_function;
 189 static int check_node_accept (const re_match_context_t *mctx,
 190                               const re_token_t *node, int idx) internal_function;
 191 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
 192 \f
 193 /* Entry point for POSIX code.  */
 194
 195 /* regexec searches for a given pattern, specified by PREG, in the
 196    string STRING.
 197
 198    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 199    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 200    least NMATCH elements, and we set them to the offsets of the
 201    corresponding matched substrings.
 202
 203    EFLAGS specifies `execution flags' which affect matching: if
 204    REG_NOTBOL is set, then ^ does not match at the beginning of the
 205    string; if REG_NOTEOL is set, then $ does not match at the end.
 206
 207    We return 0 if we find a match and REG_NOMATCH if not.  */
 208
 209 int
 210 regexec (preg, string, nmatch, pmatch, eflags)
 211     const regex_t *__restrict preg;
 212     const char *__restrict string;
 213     size_t nmatch;
 214     regmatch_t pmatch[];
 215     int eflags;
 216 {
 217   reg_errcode_t err;
 218   int start, length;
 219
 220   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 221     return REG_BADPAT;
 222
 223   if (eflags & REG_STARTEND)
 224     {
 225       start = pmatch[0].rm_so;
 226       length = pmatch[0].rm_eo;
 227     }
 228   else
 229     {
 230       start = 0;
 231       length = strlen (string);
 232     }
 233   if (preg->no_sub)
 234     err = re_search_internal (preg, string, length, start, length - start,
 235                               length, 0, NULL, eflags);
 236   else
 237     err = re_search_internal (preg, string, length, start, length - start,
 238                               length, nmatch, pmatch, eflags);
 239   return err != REG_NOERROR;
 240 }
 241
 242 #ifdef _LIBC
 243 # include <shlib-compat.h>
 244 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 245
 246 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 247 __typeof__ (__regexec) __compat_regexec;
 248
 249 int
 250 attribute_compat_text_section
 251 __compat_regexec (const regex_t *__restrict preg,
 252                   const char *__restrict string, size_t nmatch,
 253                   regmatch_t pmatch[], int eflags)
 254 {
 255   return regexec (preg, string, nmatch, pmatch,
 256                   eflags & (REG_NOTBOL | REG_NOTEOL));
 257 }
 258 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 259 # endif
 260 #endif
 261
 262 /* Entry points for GNU code.  */
 263
 264 /* re_match, re_search, re_match_2, re_search_2
 265
 266    The former two functions operate on STRING with length LENGTH,
 267    while the later two operate on concatenation of STRING1 and STRING2
 268    with lengths LENGTH1 and LENGTH2, respectively.
 269
 270    re_match() matches the compiled pattern in BUFP against the string,
 271    starting at index START.
 272
 273    re_search() first tries matching at index START, then it tries to match
 274    starting from index START + 1, and so on.  The last start position tried
 275    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 276    way as re_match().)
 277
 278    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 279    the first STOP characters of the concatenation of the strings should be
 280    concerned.
 281
 282    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 283    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 284    computed relative to the concatenation, not relative to the individual
 285    strings.)
 286
 287    On success, re_match* functions return the length of the match, re_search*
 288    return the position of the start of the match.  Return value -1 means no
 289    match was found and -2 indicates an internal error.  */
 290
 291 int
 292 re_match (bufp, string, length, start, regs)
 293     struct re_pattern_buffer *bufp;
 294     const char *string;
 295     int length, start;
 296     struct re_registers *regs;
 297 {
 298   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 299 }
 300 #ifdef _LIBC
 301 weak_alias (__re_match, re_match)
 302 #endif
 303
 304 int
 305 re_search (bufp, string, length, start, range, regs)
 306     struct re_pattern_buffer *bufp;
 307     const char *string;
 308     int length, start, range;
 309     struct re_registers *regs;
 310 {
 311   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 312 }
 313 #ifdef _LIBC
 314 weak_alias (__re_search, re_search)
 315 #endif
 316
 317 int
 318 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 319     struct re_pattern_buffer *bufp;
 320     const char *string1, *string2;
 321     int length1, length2, start, stop;
 322     struct re_registers *regs;
 323 {
 324   return re_search_2_stub (bufp, string1, length1, string2, length2,
 325                            start, 0, regs, stop, 1);
 326 }
 327 #ifdef _LIBC
 328 weak_alias (__re_match_2, re_match_2)
 329 #endif
 330
 331 int
 332 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 333     struct re_pattern_buffer *bufp;
 334     const char *string1, *string2;
 335     int length1, length2, start, range, stop;
 336     struct re_registers *regs;
 337 {
 338   return re_search_2_stub (bufp, string1, length1, string2, length2,
 339                            start, range, regs, stop, 0);
 340 }
 341 #ifdef _LIBC
 342 weak_alias (__re_search_2, re_search_2)
 343 #endif
 344
 345 static int
 346 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 347                   stop, ret_len)
 348     struct re_pattern_buffer *bufp;
 349     const char *string1, *string2;
 350     int length1, length2, start, range, stop, ret_len;
 351     struct re_registers *regs;
 352 {
 353   const char *str;
 354   int rval;
 355   int len = length1 + length2;
 356   int free_str = 0;
 357
 358   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 359     return -2;
 360
 361   /* Concatenate the strings.  */
 362   if (length2 > 0)
 363     if (length1 > 0)
 364       {
 365         char *s = re_malloc (char, len);
 366
 367         if (BE (s == NULL, 0))
 368           return -2;
 369         memcpy (s, string1, length1);
 370         memcpy (s + length1, string2, length2);
 371         str = s;
 372         free_str = 1;
 373       }
 374     else
 375       str = string2;
 376   else
 377     str = string1;
 378
 379   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 380                          ret_len);
 381   if (free_str)
 382     re_free ((char *) str);
 383   return rval;
 384 }
 385
 386 /* The parameters have the same meaning as those of re_search.
 387    Additional parameters:
 388    If RET_LEN is nonzero the length of the match is returned (re_match style);
 389    otherwise the position of the match is returned.  */
 390
 391 static int
 392 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 393     struct re_pattern_buffer *bufp;
 394     const char *string;
 395     int length, start, range, stop, ret_len;
 396     struct re_registers *regs;
 397 {
 398   reg_errcode_t result;
 399   regmatch_t *pmatch;
 400   int nregs, rval;
 401   int eflags = 0;
 402
 403   /* Check for out-of-range.  */
 404   if (BE (start < 0 || start > length, 0))
 405     return -1;
 406   if (BE (start + range > length, 0))
 407     range = length - start;
 408   else if (BE (start + range < 0, 0))
 409     range = -start;
 410
 411   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 412   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 413
 414   /* Compile fastmap if we haven't yet.  */
 415   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 416     re_compile_fastmap (bufp);
 417
 418   if (BE (bufp->no_sub, 0))
 419     regs = NULL;
 420
 421   /* We need at least 1 register.  */
 422   if (regs == NULL)
 423     nregs = 1;
 424   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 425                regs->num_regs < bufp->re_nsub + 1, 0))
 426     {
 427       nregs = regs->num_regs;
 428       if (BE (nregs < 1, 0))
 429         {
 430           /* Nothing can be copied to regs.  */
 431           regs = NULL;
 432           nregs = 1;
 433         }
 434     }
 435   else
 436     nregs = bufp->re_nsub + 1;
 437   pmatch = re_malloc (regmatch_t, nregs);
 438   if (BE (pmatch == NULL, 0))
 439     return -2;
 440
 441   result = re_search_internal (bufp, string, length, start, range, stop,
 442                                nregs, pmatch, eflags);
 443
 444   rval = 0;
 445
 446   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 447   if (result != REG_NOERROR)
 448     rval = -1;
 449   else if (regs != NULL)
 450     {
 451       /* If caller wants register contents data back, copy them.  */
 452       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 453                                            bufp->regs_allocated);
 454       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 455         rval = -2;
 456     }
 457
 458   if (BE (rval == 0, 1))
 459     {
 460       if (ret_len)
 461         {
 462           assert (pmatch[0].rm_so == start);
 463           rval = pmatch[0].rm_eo - start;
 464         }
 465       else
 466         rval = pmatch[0].rm_so;
 467     }
 468   re_free (pmatch);
 469   return rval;
 470 }
 471
 472 static unsigned
 473 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 474     struct re_registers *regs;
 475     regmatch_t *pmatch;
 476     int nregs, regs_allocated;
 477 {
 478   int rval = REGS_REALLOCATE;
 479   int i;
 480   int need_regs = nregs + 1;
 481   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 482      uses.  */
 483
 484   /* Have the register data arrays been allocated?  */
 485   if (regs_allocated == REGS_UNALLOCATED)
 486     { /* No.  So allocate them with malloc.  */
 487       regs->start = re_malloc (regoff_t, need_regs);
 488       regs->end = re_malloc (regoff_t, need_regs);
 489       if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
 490         return REGS_UNALLOCATED;
 491       regs->num_regs = need_regs;
 492     }
 493   else if (regs_allocated == REGS_REALLOCATE)
 494     { /* Yes.  If we need more elements than were already
 495          allocated, reallocate them.  If we need fewer, just
 496          leave it alone.  */
 497       if (BE (need_regs > regs->num_regs, 0))
 498         {
 499           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 500           regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
 501           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 502             return REGS_UNALLOCATED;
 503           regs->start = new_start;
 504           regs->end = new_end;
 505           regs->num_regs = need_regs;
 506         }
 507     }
 508   else
 509     {
 510       assert (regs_allocated == REGS_FIXED);
 511       /* This function may not be called with REGS_FIXED and nregs too big.  */
 512       assert (regs->num_regs >= nregs);
 513       rval = REGS_FIXED;
 514     }
 515
 516   /* Copy the regs.  */
 517   for (i = 0; i < nregs; ++i)
 518     {
 519       regs->start[i] = pmatch[i].rm_so;
 520       regs->end[i] = pmatch[i].rm_eo;
 521     }
 522   for ( ; i < regs->num_regs; ++i)
 523     regs->start[i] = regs->end[i] = -1;
 524
 525   return rval;
 526 }
 527
 528 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 529    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 530    this memory for recording register information.  STARTS and ENDS
 531    must be allocated using the malloc library routine, and must each
 532    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 533
 534    If NUM_REGS == 0, then subsequent matches should allocate their own
 535    register data.
 536
 537    Unless this function is called, the first search or match using
 538    PATTERN_BUFFER will allocate its own register data, without
 539    freeing the old data.  */
 540
 541 void
 542 re_set_registers (bufp, regs, num_regs, starts, ends)
 543     struct re_pattern_buffer *bufp;
 544     struct re_registers *regs;
 545     unsigned num_regs;
 546     regoff_t *starts, *ends;
 547 {
 548   if (num_regs)
 549     {
 550       bufp->regs_allocated = REGS_REALLOCATE;
 551       regs->num_regs = num_regs;
 552       regs->start = starts;
 553       regs->end = ends;
 554     }
 555   else
 556     {
 557       bufp->regs_allocated = REGS_UNALLOCATED;
 558       regs->num_regs = 0;
 559       regs->start = regs->end = (regoff_t *) 0;
 560     }
 561 }
 562 #ifdef _LIBC
 563 weak_alias (__re_set_registers, re_set_registers)
 564 #endif
 565 \f
 566 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 567    them unless specifically requested.  */
 568
 569 #if defined _REGEX_RE_COMP || defined _LIBC
 570 int
 571 # ifdef _LIBC
 572 weak_function
 573 # endif
 574 re_exec (s)
 575      const char *s;
 576 {
 577   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 578 }
 579 #endif /* _REGEX_RE_COMP */
 580 \f
 581 static re_node_set empty_set;
 582
 583 /* Internal entry point.  */
 584
 585 /* Searches for a compiled pattern PREG in the string STRING, whose
 586    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 587    mingings with regexec.  START, and RANGE have the same meanings
 588    with re_search.
 589    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 590    otherwise return the error code.
 591    Note: We assume front end functions already check ranges.
 592    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 593
 594 static reg_errcode_t
 595 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 596                     eflags)
 597     const regex_t *preg;
 598     const char *string;
 599     int length, start, range, stop, eflags;
 600     size_t nmatch;
 601     regmatch_t pmatch[];
 602 {
 603   reg_errcode_t err;
 604   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
 605   int left_lim, right_lim, incr;
 606   int fl_longest_match, match_first, match_last = -1;
 607   int fast_translate, sb;
 608 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 609   re_match_context_t mctx = { .dfa = dfa };
 610 #else
 611   re_match_context_t mctx;
 612 #endif
 613   char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
 614                     && range && !preg->can_be_null) ? preg->fastmap : NULL);
 615
 616 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 617   memset (&mctx, '\0', sizeof (re_match_context_t));
 618   mctx.dfa = dfa;
 619 #endif
 620
 621   /* Check if the DFA haven't been compiled.  */
 622   if (BE (preg->used == 0 || dfa->init_state == NULL
 623           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 624           || dfa->init_state_begbuf == NULL, 0))
 625     return REG_NOMATCH;
 626
 627 #ifdef DEBUG
 628   /* We assume front-end functions already check them.  */
 629   assert (start + range >= 0 && start + range <= length);
 630 #endif
 631
 632   /* If initial states with non-begbuf contexts have no elements,
 633      the regex must be anchored.  If preg->newline_anchor is set,
 634      we'll never use init_state_nl, so do not check it.  */
 635   if (dfa->init_state->nodes.nelem == 0
 636       && dfa->init_state_word->nodes.nelem == 0
 637       && (dfa->init_state_nl->nodes.nelem == 0
 638           || !preg->newline_anchor))
 639     {
 640       if (start != 0 && start + range != 0)
 641         return REG_NOMATCH;
 642       start = range = 0;
 643     }
 644
 645   re_node_set_init_empty (&empty_set);
 646
 647   /* We must check the longest matching, if nmatch > 0.  */
 648   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 649
 650   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 651                             preg->translate, preg->syntax & RE_ICASE, dfa);
 652   if (BE (err != REG_NOERROR, 0))
 653     goto free_return;
 654   mctx.input.stop = stop;
 655   mctx.input.raw_stop = stop;
 656   mctx.input.newline_anchor = preg->newline_anchor;
 657
 658   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 659   if (BE (err != REG_NOERROR, 0))
 660     goto free_return;
 661
 662   /* We will log all the DFA states through which the dfa pass,
 663      if nmatch > 1, or this dfa has "multibyte node", which is a
 664      back-reference or a node which can accept multibyte character or
 665      multi character collating element.  */
 666   if (nmatch > 1 || dfa->has_mb_node)
 667     {
 668       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 669       if (BE (mctx.state_log == NULL, 0))
 670         {
 671           err = REG_ESPACE;
 672           goto free_return;
 673         }
 674     }
 675   else
 676     mctx.state_log = NULL;
 677
 678   match_first = start;
 679   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 680                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 681
 682   /* Check incrementally whether of not the input string match.  */
 683   incr = (range < 0) ? -1 : 1;
 684   left_lim = (range < 0) ? start + range : start;
 685   right_lim = (range < 0) ? start : start + range;
 686   sb = dfa->mb_cur_max == 1;
 687   fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate);
 688
 689   for (;;)
 690     {
 691       /* At first get the current byte from input string.  */
 692       if (fastmap)
 693         {
 694           if (BE (fast_translate, 1))
 695             {
 696               unsigned RE_TRANSLATE_TYPE t
 697                 = (unsigned RE_TRANSLATE_TYPE) preg->translate;
 698               if (BE (range >= 0, 1))
 699                 {
 700                   if (BE (t != NULL, 0))
 701                     {
 702                       while (BE (match_first < right_lim, 1)
 703                              && !fastmap[t[(unsigned char) string[match_first]]])
 704                         ++match_first;
 705                     }
 706                   else
 707                     {
 708                       while (BE (match_first < right_lim, 1)
 709                              && !fastmap[(unsigned char) string[match_first]])
 710                         ++match_first;
 711                     }
 712                   if (BE (match_first == right_lim, 0))
 713                     {
 714                       int ch = match_first >= length
 715                                ? 0 : (unsigned char) string[match_first];
 716                       if (!fastmap[t ? t[ch] : ch])
 717                         break;
 718                     }
 719                 }
 720               else
 721                 {
 722                   while (match_first >= left_lim)
 723                     {
 724                       int ch = match_first >= length
 725                                ? 0 : (unsigned char) string[match_first];
 726                       if (fastmap[t ? t[ch] : ch])
 727                         break;
 728                       --match_first;
 729                     }
 730                   if (match_first < left_lim)
 731                     break;
 732                 }
 733             }
 734           else
 735             {
 736               int ch;
 737
 738               do
 739                 {
 740                   /* In this case, we can't determine easily the current byte,
 741                      since it might be a component byte of a multibyte
 742                      character.  Then we use the constructed buffer
 743                      instead.  */
 744                   /* If MATCH_FIRST is out of the valid range, reconstruct the
 745                      buffers.  */
 746                   if (mctx.input.raw_mbs_idx + mctx.input.valid_raw_len
 747                       <= match_first
 748                       || match_first < mctx.input.raw_mbs_idx)
 749                     {
 750                       err = re_string_reconstruct (&mctx.input, match_first,
 751                                                    eflags);
 752                       if (BE (err != REG_NOERROR, 0))
 753                         goto free_return;
 754                     }
 755                   /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 756                      Note that MATCH_FIRST must not be smaller than 0.  */
 757                   ch = ((match_first >= length) ? 0
 758                        : re_string_byte_at (&mctx.input,
 759                                             match_first
 760                                             - mctx.input.raw_mbs_idx));
 761                   if (fastmap[ch])
 762                     break;
 763                   match_first += incr;
 764                 }
 765               while (match_first >= left_lim && match_first <= right_lim);
 766               if (! fastmap[ch])
 767                 break;
 768             }
 769         }
 770
 771       /* Reconstruct the buffers so that the matcher can assume that
 772          the matching starts from the beginning of the buffer.  */
 773       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 774       if (BE (err != REG_NOERROR, 0))
 775         goto free_return;
 776 #ifdef RE_ENABLE_I18N
 777      /* Eliminate it when it is a component of a multibyte character
 778          and isn't the head of a multibyte character.  */
 779       if (sb || re_string_first_byte (&mctx.input, 0))
 780 #endif
 781         {
 782           /* It seems to be appropriate one, then use the matcher.  */
 783           /* We assume that the matching starts from 0.  */
 784           mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 785           match_last = check_matching (&mctx, fl_longest_match,
 786                                        range >= 0 ? &match_first : NULL);
 787           if (match_last != -1)
 788             {
 789               if (BE (match_last == -2, 0))
 790                 {
 791                   err = REG_ESPACE;
 792                   goto free_return;
 793                 }
 794               else
 795                 {
 796                   mctx.match_last = match_last;
 797                   if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 798                     {
 799                       re_dfastate_t *pstate = mctx.state_log[match_last];
 800                       mctx.last_node = check_halt_state_context (&mctx, pstate,
 801                                                                  match_last);
 802                     }
 803                   if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 804                       || dfa->nbackref)
 805                     {
 806                       err = prune_impossible_nodes (&mctx);
 807                       if (err == REG_NOERROR)
 808                         break;
 809                       if (BE (err != REG_NOMATCH, 0))
 810                         goto free_return;
 811                       match_last = -1;
 812                     }
 813                   else
 814                     break; /* We found a match.  */
 815                 }
 816             }
 817           match_ctx_clean (&mctx);
 818         }
 819       /* Update counter.  */
 820       match_first += incr;
 821       if (match_first < left_lim || right_lim < match_first)
 822         break;
 823     }
 824
 825   /* Set pmatch[] if we need.  */
 826   if (match_last != -1 && nmatch > 0)
 827     {
 828       int reg_idx;
 829
 830       /* Initialize registers.  */
 831       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 832         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 833
 834       /* Set the points where matching start/end.  */
 835       pmatch[0].rm_so = 0;
 836       pmatch[0].rm_eo = mctx.match_last;
 837
 838       if (!preg->no_sub && nmatch > 1)
 839         {
 840           err = set_regs (preg, &mctx, nmatch, pmatch,
 841                           dfa->has_plural_match && dfa->nbackref > 0);
 842           if (BE (err != REG_NOERROR, 0))
 843             goto free_return;
 844         }
 845
 846       /* At last, add the offset to the each registers, since we slided
 847          the buffers so that we could assume that the matching starts
 848          from 0.  */
 849       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 850         if (pmatch[reg_idx].rm_so != -1)
 851           {
 852 #ifdef RE_ENABLE_I18N
 853             if (BE (mctx.input.offsets_needed != 0, 0))
 854               {
 855                 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
 856                   pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
 857                 else
 858                   pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
 859                 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
 860                   pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
 861                 else
 862                   pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
 863               }
 864 #else
 865             assert (mctx.input.offsets_needed == 0);
 866 #endif
 867             pmatch[reg_idx].rm_so += match_first;
 868             pmatch[reg_idx].rm_eo += match_first;
 869           }
 870     }
 871   err = (match_last == -1) ? REG_NOMATCH : REG_NOERROR;
 872  free_return:
 873   re_free (mctx.state_log);
 874   if (dfa->nbackref)
 875     match_ctx_free (&mctx);
 876   re_string_destruct (&mctx.input);
 877   return err;
 878 }
 879
 880 static reg_errcode_t
 881 prune_impossible_nodes (mctx)
 882      re_match_context_t *mctx;
 883 {
 884   re_dfa_t *const dfa = mctx->dfa;
 885   int halt_node, match_last;
 886   reg_errcode_t ret;
 887   re_dfastate_t **sifted_states;
 888   re_dfastate_t **lim_states = NULL;
 889   re_sift_context_t sctx;
 890 #ifdef DEBUG
 891   assert (mctx->state_log != NULL);
 892 #endif
 893   match_last = mctx->match_last;
 894   halt_node = mctx->last_node;
 895   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 896   if (BE (sifted_states == NULL, 0))
 897     {
 898       ret = REG_ESPACE;
 899       goto free_return;
 900     }
 901   if (dfa->nbackref)
 902     {
 903       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 904       if (BE (lim_states == NULL, 0))
 905         {
 906           ret = REG_ESPACE;
 907           goto free_return;
 908         }
 909       while (1)
 910         {
 911           memset (lim_states, '\0',
 912                   sizeof (re_dfastate_t *) * (match_last + 1));
 913           match_ctx_clear_flag (mctx);
 914           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 915                          match_last, 0);
 916           ret = sift_states_backward (mctx, &sctx);
 917           re_node_set_free (&sctx.limits);
 918           if (BE (ret != REG_NOERROR, 0))
 919               goto free_return;
 920           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 921             break;
 922           do
 923             {
 924               --match_last;
 925               if (match_last < 0)
 926                 {
 927                   ret = REG_NOMATCH;
 928                   goto free_return;
 929                 }
 930             } while (mctx->state_log[match_last] == NULL
 931                      || !mctx->state_log[match_last]->halt);
 932           halt_node = check_halt_state_context (mctx,
 933                                                 mctx->state_log[match_last],
 934                                                 match_last);
 935         }
 936       ret = merge_state_array (dfa, sifted_states, lim_states,
 937                                match_last + 1);
 938       re_free (lim_states);
 939       lim_states = NULL;
 940       if (BE (ret != REG_NOERROR, 0))
 941         goto free_return;
 942     }
 943   else
 944     {
 945       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 946                      match_last, 0);
 947       ret = sift_states_backward (mctx, &sctx);
 948       re_node_set_free (&sctx.limits);
 949       if (BE (ret != REG_NOERROR, 0))
 950         goto free_return;
 951     }
 952   re_free (mctx->state_log);
 953   mctx->state_log = sifted_states;
 954   sifted_states = NULL;
 955   mctx->last_node = halt_node;
 956   mctx->match_last = match_last;
 957   ret = REG_NOERROR;
 958  free_return:
 959   re_free (sifted_states);
 960   re_free (lim_states);
 961   return ret;
 962 }
 963
 964 /* Acquire an initial state and return it.
 965    We must select appropriate initial state depending on the context,
 966    since initial states may have constraints like "\<", "^", etc..  */
 967
 968 static inline re_dfastate_t *
 969 acquire_init_state_context (err, mctx, idx)
 970      reg_errcode_t *err;
 971      const re_match_context_t *mctx;
 972      int idx;
 973 {
 974   re_dfa_t *const dfa = mctx->dfa;
 975   if (dfa->init_state->has_constraint)
 976     {
 977       unsigned int context;
 978       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
 979       if (IS_WORD_CONTEXT (context))
 980         return dfa->init_state_word;
 981       else if (IS_ORDINARY_CONTEXT (context))
 982         return dfa->init_state;
 983       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
 984         return dfa->init_state_begbuf;
 985       else if (IS_NEWLINE_CONTEXT (context))
 986         return dfa->init_state_nl;
 987       else if (IS_BEGBUF_CONTEXT (context))
 988         {
 989           /* It is relatively rare case, then calculate on demand.  */
 990           return re_acquire_state_context (err, dfa,
 991                                            dfa->init_state->entrance_nodes,
 992                                            context);
 993         }
 994       else
 995         /* Must not happen?  */
 996         return dfa->init_state;
 997     }
 998   else
 999     return dfa->init_state;
1000 }
1001
1002 /* Check whether the regular expression match input string INPUT or not,
1003    and return the index where the matching end, return -1 if not match,
1004    or return -2 in case of an error.
1005    FL_LONGEST_MATCH means we want the POSIX longest matching.
1006    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1007    next place where we may want to try matching.
1008    Note that the matcher assume that the maching starts from the current
1009    index of the buffer.  */
1010
1011 static int
1012 check_matching (mctx, fl_longest_match, p_match_first)
1013     re_match_context_t *mctx;
1014     int fl_longest_match;
1015     int *p_match_first;
1016 {
1017   re_dfa_t *const dfa = mctx->dfa;
1018   reg_errcode_t err;
1019   int match = 0;
1020   int match_last = -1;
1021   int cur_str_idx = re_string_cur_idx (&mctx->input);
1022   re_dfastate_t *cur_state;
1023   int at_init_state = p_match_first != NULL;
1024   int next_start_idx = cur_str_idx;
1025
1026   err = REG_NOERROR;
1027   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1028   /* An initial state must not be NULL (invalid).  */
1029   if (BE (cur_state == NULL, 0))
1030     {
1031       assert (err == REG_ESPACE);
1032       return -2;
1033     }
1034
1035   if (mctx->state_log != NULL)
1036     {
1037       mctx->state_log[cur_str_idx] = cur_state;
1038
1039       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1040          later.  E.g. Processing back references.  */
1041       if (BE (dfa->nbackref, 0))
1042         {
1043           at_init_state = 0;
1044           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1045           if (BE (err != REG_NOERROR, 0))
1046             return err;
1047
1048           if (cur_state->has_backref)
1049             {
1050               err = transit_state_bkref (mctx, &cur_state->nodes);
1051               if (BE (err != REG_NOERROR, 0))
1052                 return err;
1053             }
1054         }
1055     }
1056
1057   /* If the RE accepts NULL string.  */
1058   if (BE (cur_state->halt, 0))
1059     {
1060       if (!cur_state->has_constraint
1061           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1062         {
1063           if (!fl_longest_match)
1064             return cur_str_idx;
1065           else
1066             {
1067               match_last = cur_str_idx;
1068               match = 1;
1069             }
1070         }
1071     }
1072
1073   while (!re_string_eoi (&mctx->input))
1074     {
1075       re_dfastate_t *old_state = cur_state;
1076       cur_state = transit_state (&err, mctx, cur_state);
1077       if (mctx->state_log != NULL)
1078         cur_state = merge_state_with_log (&err, mctx, cur_state);
1079
1080       if (cur_state == NULL)
1081         {
1082           /* Reached the invalid state or an error.  Try to recover a valid
1083              state using the state log, if available and if we have not
1084              already found a valid (even if not the longest) match.  */
1085           if (BE (err != REG_NOERROR, 0))
1086             return -2;
1087
1088           if (mctx->state_log == NULL
1089               || (match && !fl_longest_match)
1090               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1091             break;
1092         }
1093
1094       if (at_init_state)
1095         {
1096           if (old_state == cur_state)
1097             next_start_idx = re_string_cur_idx (&mctx->input);
1098           else
1099             at_init_state = 0;
1100         }
1101
1102       if (cur_state->halt)
1103         {
1104           /* Reached a halt state.
1105              Check the halt state can satisfy the current context.  */
1106           if (!cur_state->has_constraint
1107               || check_halt_state_context (mctx, cur_state,
1108                                            re_string_cur_idx (&mctx->input)))
1109             {
1110               /* We found an appropriate halt state.  */
1111               match_last = re_string_cur_idx (&mctx->input);
1112               match = 1;
1113               if (!fl_longest_match)
1114                 break;
1115             }
1116         }
1117    }
1118
1119   if (match_last == -1 && p_match_first)
1120     *p_match_first += next_start_idx;
1121
1122   return match_last;
1123 }
1124
1125 /* Check NODE match the current context.  */
1126
1127 static int check_halt_node_context (dfa, node, context)
1128     const re_dfa_t *dfa;
1129     int node;
1130     unsigned int context;
1131 {
1132   re_token_type_t type = dfa->nodes[node].type;
1133   unsigned int constraint = dfa->nodes[node].constraint;
1134   if (type != END_OF_RE)
1135     return 0;
1136   if (!constraint)
1137     return 1;
1138   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1139     return 0;
1140   return 1;
1141 }
1142
1143 /* Check the halt state STATE match the current context.
1144    Return 0 if not match, if the node, STATE has, is a halt node and
1145    match the context, return the node.  */
1146
1147 static int
1148 check_halt_state_context (mctx, state, idx)
1149     const re_match_context_t *mctx;
1150     const re_dfastate_t *state;
1151     int idx;
1152 {
1153   int i;
1154   unsigned int context;
1155 #ifdef DEBUG
1156   assert (state->halt);
1157 #endif
1158   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1159   for (i = 0; i < state->nodes.nelem; ++i)
1160     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1161       return state->nodes.elems[i];
1162   return 0;
1163 }
1164
1165 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1166    corresponding to the DFA).
1167    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1168    of errors.  */
1169
1170 static int
1171 proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
1172     const re_match_context_t *mctx;
1173     regmatch_t *regs;
1174     int nregs, *pidx, node;
1175     re_node_set *eps_via_nodes;
1176     struct re_fail_stack_t *fs;
1177 {
1178   re_dfa_t *const dfa = mctx->dfa;
1179   int i, err, dest_node;
1180   dest_node = -1;
1181   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1182     {
1183       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1184       int ndest, dest_nodes[2];
1185       err = re_node_set_insert (eps_via_nodes, node);
1186       if (BE (err < 0, 0))
1187         return -2;
1188       /* Pick up valid destinations.  */
1189       for (ndest = 0, i = 0; i < dfa->edests[node].nelem; ++i)
1190         {
1191           int candidate = dfa->edests[node].elems[i];
1192           if (!re_node_set_contains (cur_nodes, candidate))
1193             continue;
1194           dest_nodes[0] = (ndest == 0) ? candidate : dest_nodes[0];
1195           dest_nodes[1] = (ndest == 1) ? candidate : dest_nodes[1];
1196           ++ndest;
1197         }
1198       if (ndest <= 1)
1199         return ndest == 0 ? -1 : (ndest == 1 ? dest_nodes[0] : 0);
1200       /* In order to avoid infinite loop like "(a*)*".  */
1201       if (re_node_set_contains (eps_via_nodes, dest_nodes[0]))
1202         return dest_nodes[1];
1203       if (fs != NULL
1204           && push_fail_stack (fs, *pidx, dest_nodes, nregs, regs,
1205                               eps_via_nodes))
1206         return -2;
1207       return dest_nodes[0];
1208     }
1209   else
1210     {
1211       int naccepted = 0;
1212       re_token_type_t type = dfa->nodes[node].type;
1213
1214 #ifdef RE_ENABLE_I18N
1215       if (ACCEPT_MB_NODE (type))
1216         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1217       else
1218 #endif /* RE_ENABLE_I18N */
1219       if (type == OP_BACK_REF)
1220         {
1221           int subexp_idx = dfa->nodes[node].opr.idx;
1222           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1223           if (fs != NULL)
1224             {
1225               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1226                 return -1;
1227               else if (naccepted)
1228                 {
1229                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1230                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1231                               naccepted) != 0)
1232                     return -1;
1233                 }
1234             }
1235
1236           if (naccepted == 0)
1237             {
1238               err = re_node_set_insert (eps_via_nodes, node);
1239               if (BE (err < 0, 0))
1240                 return -2;
1241               dest_node = dfa->edests[node].elems[0];
1242               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1243                                         dest_node))
1244                 return dest_node;
1245             }
1246         }
1247
1248       if (naccepted != 0
1249           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1250         {
1251           dest_node = dfa->nexts[node];
1252           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1253           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1254                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1255                                                dest_node)))
1256             return -1;
1257           re_node_set_empty (eps_via_nodes);
1258           return dest_node;
1259         }
1260     }
1261   return -1;
1262 }
1263
1264 static reg_errcode_t
1265 push_fail_stack (fs, str_idx, dests, nregs, regs, eps_via_nodes)
1266      struct re_fail_stack_t *fs;
1267      int str_idx, *dests, nregs;
1268      regmatch_t *regs;
1269      re_node_set *eps_via_nodes;
1270 {
1271   reg_errcode_t err;
1272   int num = fs->num++;
1273   if (fs->num == fs->alloc)
1274     {
1275       struct re_fail_stack_ent_t *new_array;
1276       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1277                                        * fs->alloc * 2));
1278       if (new_array == NULL)
1279         return REG_ESPACE;
1280       fs->alloc *= 2;
1281       fs->stack = new_array;
1282     }
1283   fs->stack[num].idx = str_idx;
1284   fs->stack[num].node = dests[1];
1285   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1286   if (fs->stack[num].regs == NULL)
1287     return REG_ESPACE;
1288   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1289   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1290   return err;
1291 }
1292
1293 static int
1294 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1295      struct re_fail_stack_t *fs;
1296      int *pidx, nregs;
1297      regmatch_t *regs;
1298      re_node_set *eps_via_nodes;
1299 {
1300   int num = --fs->num;
1301   assert (num >= 0);
1302   *pidx = fs->stack[num].idx;
1303   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1304   re_node_set_free (eps_via_nodes);
1305   re_free (fs->stack[num].regs);
1306   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1307   return fs->stack[num].node;
1308 }
1309
1310 /* Set the positions where the subexpressions are starts/ends to registers
1311    PMATCH.
1312    Note: We assume that pmatch[0] is already set, and
1313    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1314
1315 static reg_errcode_t
1316 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1317      const regex_t *preg;
1318      const re_match_context_t *mctx;
1319      size_t nmatch;
1320      regmatch_t *pmatch;
1321      int fl_backtrack;
1322 {
1323   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1324   int idx, cur_node, real_nmatch;
1325   re_node_set eps_via_nodes;
1326   struct re_fail_stack_t *fs;
1327   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1328   regmatch_t *prev_idx_match;
1329
1330 #ifdef DEBUG
1331   assert (nmatch > 1);
1332   assert (mctx->state_log != NULL);
1333 #endif
1334   if (fl_backtrack)
1335     {
1336       fs = &fs_body;
1337       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1338       if (fs->stack == NULL)
1339         return REG_ESPACE;
1340     }
1341   else
1342     fs = NULL;
1343
1344   cur_node = dfa->init_node;
1345   real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1346   re_node_set_init_empty (&eps_via_nodes);
1347
1348   prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * real_nmatch);
1349   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * real_nmatch);
1350
1351   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1352     {
1353       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, real_nmatch);
1354
1355       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1356         {
1357           int reg_idx;
1358           if (fs)
1359             {
1360               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1361                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1362                   break;
1363               if (reg_idx == nmatch)
1364                 {
1365                   re_node_set_free (&eps_via_nodes);
1366                   return free_fail_stack_return (fs);
1367                 }
1368               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1369                                          &eps_via_nodes);
1370             }
1371           else
1372             {
1373               re_node_set_free (&eps_via_nodes);
1374               return REG_NOERROR;
1375             }
1376         }
1377
1378       /* Proceed to next node.  */
1379       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1380                                     &eps_via_nodes, fs);
1381
1382       if (BE (cur_node < 0, 0))
1383         {
1384           if (BE (cur_node == -2, 0))
1385             {
1386               re_node_set_free (&eps_via_nodes);
1387               free_fail_stack_return (fs);
1388               return REG_ESPACE;
1389             }
1390           if (fs)
1391             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1392                                        &eps_via_nodes);
1393           else
1394             {
1395               re_node_set_free (&eps_via_nodes);
1396               return REG_NOMATCH;
1397             }
1398         }
1399     }
1400   re_node_set_free (&eps_via_nodes);
1401   return free_fail_stack_return (fs);
1402 }
1403
1404 static reg_errcode_t
1405 free_fail_stack_return (fs)
1406      struct re_fail_stack_t *fs;
1407 {
1408   if (fs)
1409     {
1410       int fs_idx;
1411       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1412         {
1413           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1414           re_free (fs->stack[fs_idx].regs);
1415         }
1416       re_free (fs->stack);
1417     }
1418   return REG_NOERROR;
1419 }
1420
1421 static void
1422 update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
1423      re_dfa_t *dfa;
1424      regmatch_t *pmatch, *prev_idx_match;
1425      int cur_node, cur_idx, nmatch;
1426 {
1427   int type = dfa->nodes[cur_node].type;
1428   if (type == OP_OPEN_SUBEXP)
1429     {
1430       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1431
1432       /* We are at the first node of this sub expression.  */
1433       if (reg_num < nmatch)
1434         {
1435           pmatch[reg_num].rm_so = cur_idx;
1436           pmatch[reg_num].rm_eo = -1;
1437         }
1438     }
1439   else if (type == OP_CLOSE_SUBEXP)
1440     {
1441       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1442       if (reg_num < nmatch)
1443         {
1444           /* We are at the last node of this sub expression.  */
1445           if (pmatch[reg_num].rm_so < cur_idx)
1446             {
1447               pmatch[reg_num].rm_eo = cur_idx;
1448               /* This is a non-empty match or we are not inside an optional
1449                  subexpression.  Accept this right away.  */
1450               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1451             }
1452           else
1453             {
1454               if (dfa->nodes[cur_node].opt_subexp
1455                   && prev_idx_match[reg_num].rm_so != -1)
1456                 /* We transited through an empty match for an optional
1457                    subexpression, like (a?)*, and this is not the subexp's
1458                    first match.  Copy back the old content of the registers
1459                    so that matches of an inner subexpression are undone as
1460                    well, like in ((a?))*.  */
1461                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1462               else
1463                 /* We completed a subexpression, but it may be part of
1464                    an optional one, so do not update PREV_IDX_MATCH.  */
1465                 pmatch[reg_num].rm_eo = cur_idx;
1466             }
1467         }
1468     }
1469 }
1470
1471 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1472    and sift the nodes in each states according to the following rules.
1473    Updated state_log will be wrote to STATE_LOG.
1474
1475    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1476      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1477         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1478         the LAST_NODE, we throw away the node `a'.
1479      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1480         string `s' and transit to `b':
1481         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1482            away the node `a'.
1483         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1484             thrown away, we throw away the node `a'.
1485      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1486         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1487            node `a'.
1488         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1489             we throw away the node `a'.  */
1490
1491 #define STATE_NODE_CONTAINS(state,node) \
1492   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1493
1494 static reg_errcode_t
1495 sift_states_backward (mctx, sctx)
1496      re_match_context_t *mctx;
1497      re_sift_context_t *sctx;
1498 {
1499   re_dfa_t *const dfa = mctx->dfa;
1500   reg_errcode_t err;
1501   int null_cnt = 0;
1502   int str_idx = sctx->last_str_idx;
1503   re_node_set cur_dest;
1504   re_node_set *cur_src; /* Points the state_log[str_idx]->nodes  */
1505
1506 #ifdef DEBUG
1507   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1508 #endif
1509   cur_src = &mctx->state_log[str_idx]->nodes;
1510
1511   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1512      transit to the last_node and the last_node itself.  */
1513   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1514   if (BE (err != REG_NOERROR, 0))
1515     return err;
1516   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1517   if (BE (err != REG_NOERROR, 0))
1518     goto free_return;
1519
1520   /* Then check each states in the state_log.  */
1521   while (str_idx > 0)
1522     {
1523       int i, ret;
1524       /* Update counters.  */
1525       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1526       if (null_cnt > mctx->max_mb_elem_len)
1527         {
1528           memset (sctx->sifted_states, '\0',
1529                   sizeof (re_dfastate_t *) * str_idx);
1530           re_node_set_free (&cur_dest);
1531           return REG_NOERROR;
1532         }
1533       re_node_set_empty (&cur_dest);
1534       --str_idx;
1535       cur_src = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1536                  : &mctx->state_log[str_idx]->nodes);
1537
1538       /* Then build the next sifted state.
1539          We build the next sifted state on `cur_dest', and update
1540          `sifted_states[str_idx]' with `cur_dest'.
1541          Note:
1542          `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1543          `cur_src' points the node_set of the old `state_log[str_idx]'.  */
1544       for (i = 0; i < cur_src->nelem; i++)
1545         {
1546           int prev_node = cur_src->elems[i];
1547           int naccepted = 0;
1548           re_token_type_t type = dfa->nodes[prev_node].type;
1549
1550           if (IS_EPSILON_NODE (type))
1551             continue;
1552 #ifdef RE_ENABLE_I18N
1553           /* If the node may accept `multi byte'.  */
1554           if (ACCEPT_MB_NODE (type))
1555             naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1556                                              str_idx, sctx->last_str_idx);
1557
1558 #endif /* RE_ENABLE_I18N */
1559           /* We don't check backreferences here.
1560              See update_cur_sifted_state().  */
1561
1562           if (!naccepted
1563               && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1564               && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1565                                       dfa->nexts[prev_node]))
1566             naccepted = 1;
1567
1568           if (naccepted == 0)
1569             continue;
1570
1571           if (sctx->limits.nelem)
1572             {
1573               int to_idx = str_idx + naccepted;
1574               if (check_dst_limits (mctx, &sctx->limits,
1575                                     dfa->nexts[prev_node], to_idx,
1576                                     prev_node, str_idx))
1577                 continue;
1578             }
1579           ret = re_node_set_insert (&cur_dest, prev_node);
1580           if (BE (ret == -1, 0))
1581             {
1582               err = REG_ESPACE;
1583               goto free_return;
1584             }
1585         }
1586
1587       /* Add all the nodes which satisfy the following conditions:
1588          - It can epsilon transit to a node in CUR_DEST.
1589          - It is in CUR_SRC.
1590          And update state_log.  */
1591       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1592       if (BE (err != REG_NOERROR, 0))
1593         goto free_return;
1594     }
1595   err = REG_NOERROR;
1596  free_return:
1597   re_node_set_free (&cur_dest);
1598   return err;
1599 }
1600
1601 /* Helper functions.  */
1602
1603 static reg_errcode_t
1604 clean_state_log_if_needed (mctx, next_state_log_idx)
1605     re_match_context_t *mctx;
1606     int next_state_log_idx;
1607 {
1608   int top = mctx->state_log_top;
1609
1610   if (next_state_log_idx >= mctx->input.bufs_len
1611       || (next_state_log_idx >= mctx->input.valid_len
1612           && mctx->input.valid_len < mctx->input.len))
1613     {
1614       reg_errcode_t err;
1615       err = extend_buffers (mctx);
1616       if (BE (err != REG_NOERROR, 0))
1617         return err;
1618     }
1619
1620   if (top < next_state_log_idx)
1621     {
1622       memset (mctx->state_log + top + 1, '\0',
1623               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1624       mctx->state_log_top = next_state_log_idx;
1625     }
1626   return REG_NOERROR;
1627 }
1628
1629 static reg_errcode_t
1630 merge_state_array (dfa, dst, src, num)
1631      re_dfa_t *dfa;
1632      re_dfastate_t **dst;
1633      re_dfastate_t **src;
1634      int num;
1635 {
1636   int st_idx;
1637   reg_errcode_t err;
1638   for (st_idx = 0; st_idx < num; ++st_idx)
1639     {
1640       if (dst[st_idx] == NULL)
1641         dst[st_idx] = src[st_idx];
1642       else if (src[st_idx] != NULL)
1643         {
1644           re_node_set merged_set;
1645           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1646                                         &src[st_idx]->nodes);
1647           if (BE (err != REG_NOERROR, 0))
1648             return err;
1649           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1650           re_node_set_free (&merged_set);
1651           if (BE (err != REG_NOERROR, 0))
1652             return err;
1653         }
1654     }
1655   return REG_NOERROR;
1656 }
1657
1658 static reg_errcode_t
1659 update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
1660      re_match_context_t *mctx;
1661      re_sift_context_t *sctx;
1662      int str_idx;
1663      re_node_set *dest_nodes;
1664 {
1665   re_dfa_t *const dfa = mctx->dfa;
1666   reg_errcode_t err;
1667   const re_node_set *candidates;
1668   candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1669                 : &mctx->state_log[str_idx]->nodes);
1670
1671   /* At first, add the nodes which can epsilon transit to a node in
1672      DEST_NODE.  */
1673   if (dest_nodes->nelem)
1674     {
1675       err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1676       if (BE (err != REG_NOERROR, 0))
1677         return err;
1678     }
1679
1680   /* Then, check the limitations in the current sift_context.  */
1681   if (dest_nodes->nelem && sctx->limits.nelem)
1682     {
1683       err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1684                                  mctx->bkref_ents, str_idx);
1685       if (BE (err != REG_NOERROR, 0))
1686         return err;
1687     }
1688
1689   /* Update state_log.  */
1690   sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1691   if (BE (sctx->sifted_states[str_idx] == NULL && err != REG_NOERROR, 0))
1692     return err;
1693
1694   if ((mctx->state_log[str_idx] != NULL
1695        && mctx->state_log[str_idx]->has_backref))
1696     {
1697       err = sift_states_bkref (mctx, sctx, str_idx, dest_nodes);
1698       if (BE (err != REG_NOERROR, 0))
1699         return err;
1700     }
1701   return REG_NOERROR;
1702 }
1703
1704 static reg_errcode_t
1705 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1706      re_dfa_t *dfa;
1707      re_node_set *dest_nodes;
1708      const re_node_set *candidates;
1709 {
1710   reg_errcode_t err;
1711   int src_idx;
1712   re_node_set src_copy;
1713
1714   err = re_node_set_init_copy (&src_copy, dest_nodes);
1715   if (BE (err != REG_NOERROR, 0))
1716     return err;
1717   for (src_idx = 0; src_idx < src_copy.nelem; ++src_idx)
1718     {
1719       err = re_node_set_add_intersect (dest_nodes, candidates,
1720                                        dfa->inveclosures
1721                                        + src_copy.elems[src_idx]);
1722       if (BE (err != REG_NOERROR, 0))
1723         {
1724           re_node_set_free (&src_copy);
1725           return err;
1726         }
1727     }
1728   re_node_set_free (&src_copy);
1729   return REG_NOERROR;
1730 }
1731
1732 static reg_errcode_t
1733 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1734      re_dfa_t *dfa;
1735      int node;
1736      re_node_set *dest_nodes;
1737      const re_node_set *candidates;
1738 {
1739     int ecl_idx;
1740     reg_errcode_t err;
1741     re_node_set *inv_eclosure = dfa->inveclosures + node;
1742     re_node_set except_nodes;
1743     re_node_set_init_empty (&except_nodes);
1744     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1745       {
1746         int cur_node = inv_eclosure->elems[ecl_idx];
1747         if (cur_node == node)
1748           continue;
1749         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1750           {
1751             int edst1 = dfa->edests[cur_node].elems[0];
1752             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1753                          ? dfa->edests[cur_node].elems[1] : -1);
1754             if ((!re_node_set_contains (inv_eclosure, edst1)
1755                  && re_node_set_contains (dest_nodes, edst1))
1756                 || (edst2 > 0
1757                     && !re_node_set_contains (inv_eclosure, edst2)
1758                     && re_node_set_contains (dest_nodes, edst2)))
1759               {
1760                 err = re_node_set_add_intersect (&except_nodes, candidates,
1761                                                  dfa->inveclosures + cur_node);
1762                 if (BE (err != REG_NOERROR, 0))
1763                   {
1764                     re_node_set_free (&except_nodes);
1765                     return err;
1766                   }
1767               }
1768           }
1769       }
1770     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1771       {
1772         int cur_node = inv_eclosure->elems[ecl_idx];
1773         if (!re_node_set_contains (&except_nodes, cur_node))
1774           {
1775             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1776             re_node_set_remove_at (dest_nodes, idx);
1777           }
1778       }
1779     re_node_set_free (&except_nodes);
1780     return REG_NOERROR;
1781 }
1782
1783 static int
1784 check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
1785      re_match_context_t *mctx;
1786      re_node_set *limits;
1787      int dst_node, dst_idx, src_node, src_idx;
1788 {
1789   re_dfa_t *const dfa = mctx->dfa;
1790   int lim_idx, src_pos, dst_pos;
1791
1792   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1793     {
1794       int subexp_idx;
1795       struct re_backref_cache_entry *ent;
1796       ent = mctx->bkref_ents + limits->elems[lim_idx];
1797       subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1798
1799       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1800                                            dfa->eclosures + dst_node,
1801                                            subexp_idx, dst_node, dst_idx);
1802       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1803                                            dfa->eclosures + src_node,
1804                                            subexp_idx, src_node, src_idx);
1805
1806       /* In case of:
1807          <src> <dst> ( <subexp> )
1808          ( <subexp> ) <src> <dst>
1809          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1810       if (src_pos == dst_pos)
1811         continue; /* This is unrelated limitation.  */
1812       else
1813         return 1;
1814     }
1815   return 0;
1816 }
1817
1818 static int
1819 check_dst_limits_calc_pos (mctx, limit, eclosures, subexp_idx, from_node,
1820                            str_idx)
1821      re_match_context_t *mctx;
1822      re_node_set *eclosures;
1823      int limit, subexp_idx, from_node, str_idx;
1824 {
1825   re_dfa_t *const dfa = mctx->dfa;
1826   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1827   int node_idx;
1828
1829   /* If we are outside the range of the subexpression, return -1 or 1.  */
1830   if (str_idx < lim->subexp_from)
1831     return -1;
1832
1833   if (lim->subexp_to < str_idx)
1834     return 1;
1835
1836   /* If we are within the subexpression, return 0.  */
1837   if (str_idx != lim->subexp_from && str_idx != lim->subexp_to)
1838     return 0;
1839
1840   /* Else, we are on the boundary: examine the nodes on the epsilon
1841      closure.  */
1842   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1843     {
1844       int node = eclosures->elems[node_idx];
1845       switch (dfa->nodes[node].type)
1846         {
1847         case OP_BACK_REF:
1848           {
1849             int bi = search_cur_bkref_entry (mctx, str_idx);
1850             for (; bi < mctx->nbkref_ents; ++bi)
1851               {
1852                 struct re_backref_cache_entry *ent = mctx->bkref_ents + bi;
1853                 int dst, cpos;
1854
1855                 /* If this backreference goes beyond the point we're
1856                    examining, don't go any further.  */
1857                 if (ent->str_idx > str_idx)
1858                   break;
1859
1860                 if (ent->node != node || ent->subexp_from != ent->subexp_to)
1861                   continue;
1862
1863                 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1864                    OP_CLOSE_SUBEXP cases below.  But, if the
1865                    destination node is the same node as the source
1866                    node, don't recurse because it would cause an
1867                    infinite loop: a regex that exhibits this behavior
1868                    is ()\1*\1*  */
1869                 dst = dfa->edests[node].elems[0];
1870                 if (dst == from_node)
1871                   {
1872                     if (str_idx == lim->subexp_from)
1873                       return -1;
1874                     else /* if (str_idx == lim->subexp_to) */
1875                       return 0;
1876                   }
1877
1878                 cpos = check_dst_limits_calc_pos (mctx, limit,
1879                                                   dfa->eclosures + dst,
1880                                                   subexp_idx, dst,
1881                                                   str_idx);
1882
1883                 if (cpos == -1 && str_idx == lim->subexp_from)
1884                   return -1;
1885
1886                 if (cpos == 0 /* && str_idx == lim->lim->subexp_to */)
1887                   return 0;
1888               }
1889               break;
1890             }
1891
1892         case OP_OPEN_SUBEXP:
1893           if (str_idx == lim->subexp_from && subexp_idx == dfa->nodes[node].opr.idx)
1894             return -1;
1895           break;
1896
1897         case OP_CLOSE_SUBEXP:
1898           if (str_idx == lim->subexp_to && subexp_idx == dfa->nodes[node].opr.idx)
1899             return 0;
1900           break;
1901
1902         default:
1903             break;
1904         }
1905     }
1906
1907   if (str_idx == lim->subexp_to)
1908     return 1;
1909   else
1910     return 0;
1911 }
1912
1913 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1914    which are against limitations from DEST_NODES. */
1915
1916 static reg_errcode_t
1917 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
1918      re_dfa_t *dfa;
1919      re_node_set *dest_nodes;
1920      const re_node_set *candidates;
1921      re_node_set *limits;
1922      struct re_backref_cache_entry *bkref_ents;
1923      int str_idx;
1924 {
1925   reg_errcode_t err;
1926   int node_idx, lim_idx;
1927
1928   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1929     {
1930       int subexp_idx;
1931       struct re_backref_cache_entry *ent;
1932       ent = bkref_ents + limits->elems[lim_idx];
1933
1934       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1935         continue; /* This is unrelated limitation.  */
1936
1937       subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1938       if (ent->subexp_to == str_idx)
1939         {
1940           int ops_node = -1;
1941           int cls_node = -1;
1942           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1943             {
1944               int node = dest_nodes->elems[node_idx];
1945               re_token_type_t type = dfa->nodes[node].type;
1946               if (type == OP_OPEN_SUBEXP
1947                   && subexp_idx == dfa->nodes[node].opr.idx)
1948                 ops_node = node;
1949               else if (type == OP_CLOSE_SUBEXP
1950                        && subexp_idx == dfa->nodes[node].opr.idx)
1951                 cls_node = node;
1952             }
1953
1954           /* Check the limitation of the open subexpression.  */
1955           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
1956           if (ops_node >= 0)
1957             {
1958               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
1959                                            candidates);
1960               if (BE (err != REG_NOERROR, 0))
1961                 return err;
1962             }
1963
1964           /* Check the limitation of the close subexpression.  */
1965           if (cls_node >= 0)
1966             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1967               {
1968                 int node = dest_nodes->elems[node_idx];
1969                 if (!re_node_set_contains (dfa->inveclosures + node,
1970                                            cls_node)
1971                     && !re_node_set_contains (dfa->eclosures + node,
1972                                               cls_node))
1973                   {
1974                     /* It is against this limitation.
1975                        Remove it form the current sifted state.  */
1976                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
1977                                                  candidates);
1978                     if (BE (err != REG_NOERROR, 0))
1979                       return err;
1980                     --node_idx;
1981                   }
1982               }
1983         }
1984       else /* (ent->subexp_to != str_idx)  */
1985         {
1986           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1987             {
1988               int node = dest_nodes->elems[node_idx];
1989               re_token_type_t type = dfa->nodes[node].type;
1990               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
1991                 {
1992                   if (subexp_idx != dfa->nodes[node].opr.idx)
1993                     continue;
1994                   if ((type == OP_CLOSE_SUBEXP && ent->subexp_to != str_idx)
1995                       || (type == OP_OPEN_SUBEXP))
1996                     {
1997                       /* It is against this limitation.
1998                          Remove it form the current sifted state.  */
1999                       err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2000                                                    candidates);
2001                       if (BE (err != REG_NOERROR, 0))
2002                         return err;
2003                     }
2004                 }
2005             }
2006         }
2007     }
2008   return REG_NOERROR;
2009 }
2010
2011 static reg_errcode_t
2012 sift_states_bkref (mctx, sctx, str_idx, dest_nodes)
2013      re_match_context_t *mctx;
2014      re_sift_context_t *sctx;
2015      int str_idx;
2016      re_node_set *dest_nodes;
2017 {
2018   re_dfa_t *const dfa = mctx->dfa;
2019   reg_errcode_t err;
2020   int node_idx, node;
2021   re_sift_context_t local_sctx;
2022   const re_node_set *candidates;
2023   candidates = ((mctx->state_log[str_idx] == NULL) ? &empty_set
2024                 : &mctx->state_log[str_idx]->nodes);
2025   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2026
2027   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2028     {
2029       int cur_bkref_idx = re_string_cur_idx (&mctx->input);
2030       re_token_type_t type;
2031       node = candidates->elems[node_idx];
2032       type = dfa->nodes[node].type;
2033       if (node == sctx->cur_bkref && str_idx == cur_bkref_idx)
2034         continue;
2035       /* Avoid infinite loop for the REs like "()\1+".  */
2036       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2037         continue;
2038       if (type == OP_BACK_REF)
2039         {
2040           int enabled_idx = search_cur_bkref_entry (mctx, str_idx);
2041           for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
2042             {
2043               int disabled_idx, subexp_len, to_idx, dst_node;
2044               struct re_backref_cache_entry *entry;
2045               entry = mctx->bkref_ents + enabled_idx;
2046               if (entry->str_idx > str_idx)
2047                 break;
2048               if (entry->node != node)
2049                   continue;
2050               subexp_len = entry->subexp_to - entry->subexp_from;
2051               to_idx = str_idx + subexp_len;
2052               dst_node = (subexp_len ? dfa->nexts[node]
2053                           : dfa->edests[node].elems[0]);
2054
2055               if (to_idx > sctx->last_str_idx
2056                   || sctx->sifted_states[to_idx] == NULL
2057                   || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx],
2058                                            dst_node)
2059                   || check_dst_limits (mctx, &sctx->limits, node,
2060                                        str_idx, dst_node, to_idx))
2061                 continue;
2062                 {
2063                   re_dfastate_t *cur_state;
2064                   entry->flag = 0;
2065                   for (disabled_idx = enabled_idx + 1;
2066                        disabled_idx < mctx->nbkref_ents; ++disabled_idx)
2067                     {
2068                       struct re_backref_cache_entry *entry2;
2069                       entry2 = mctx->bkref_ents + disabled_idx;
2070                       if (entry2->str_idx > str_idx)
2071                         break;
2072                       entry2->flag = (entry2->node == node) ? 1 : entry2->flag;
2073                     }
2074
2075                   if (local_sctx.sifted_states == NULL)
2076                     {
2077                       local_sctx = *sctx;
2078                       err = re_node_set_init_copy (&local_sctx.limits,
2079                                                    &sctx->limits);
2080                       if (BE (err != REG_NOERROR, 0))
2081                         goto free_return;
2082                     }
2083                   local_sctx.last_node = node;
2084                   local_sctx.last_str_idx = str_idx;
2085                   err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2086                   if (BE (err < 0, 0))
2087                     {
2088                       err = REG_ESPACE;
2089                       goto free_return;
2090                     }
2091                   cur_state = local_sctx.sifted_states[str_idx];
2092                   err = sift_states_backward (mctx, &local_sctx);
2093                   if (BE (err != REG_NOERROR, 0))
2094                     goto free_return;
2095                   if (sctx->limited_states != NULL)
2096                     {
2097                       err = merge_state_array (dfa, sctx->limited_states,
2098                                                local_sctx.sifted_states,
2099                                                str_idx + 1);
2100                       if (BE (err != REG_NOERROR, 0))
2101                         goto free_return;
2102                     }
2103                   local_sctx.sifted_states[str_idx] = cur_state;
2104                   re_node_set_remove (&local_sctx.limits, enabled_idx);
2105                   /* We must not use the variable entry here, since
2106                      mctx->bkref_ents might be realloced.  */
2107                   mctx->bkref_ents[enabled_idx].flag = 1;
2108                 }
2109             }
2110           enabled_idx = search_cur_bkref_entry (mctx, str_idx);
2111           for (; enabled_idx < mctx->nbkref_ents; ++enabled_idx)
2112             {
2113               struct re_backref_cache_entry *entry;
2114               entry = mctx->bkref_ents + enabled_idx;
2115               if (entry->str_idx > str_idx)
2116                 break;
2117               if (entry->node == node)
2118                 entry->flag = 0;
2119             }
2120         }
2121     }
2122   err = REG_NOERROR;
2123  free_return:
2124   if (local_sctx.sifted_states != NULL)
2125     {
2126       re_node_set_free (&local_sctx.limits);
2127     }
2128
2129   return err;
2130 }
2131
2132
2133 #ifdef RE_ENABLE_I18N
2134 static int
2135 sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
2136     const re_match_context_t *mctx;
2137     re_sift_context_t *sctx;
2138     int node_idx, str_idx, max_str_idx;
2139 {
2140   re_dfa_t *const dfa = mctx->dfa;
2141   int naccepted;
2142   /* Check the node can accept `multi byte'.  */
2143   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2144   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2145       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2146                             dfa->nexts[node_idx]))
2147     /* The node can't accept the `multi byte', or the
2148        destination was already thrown away, then the node
2149        could't accept the current input `multi byte'.   */
2150     naccepted = 0;
2151   /* Otherwise, it is sure that the node could accept
2152      `naccepted' bytes input.  */
2153   return naccepted;
2154 }
2155 #endif /* RE_ENABLE_I18N */
2156
2157 \f
2158 /* Functions for state transition.  */
2159
2160 /* Return the next state to which the current state STATE will transit by
2161    accepting the current input byte, and update STATE_LOG if necessary.
2162    If STATE can accept a multibyte char/collating element/back reference
2163    update the destination of STATE_LOG.  */
2164
2165 static re_dfastate_t *
2166 transit_state (err, mctx, state)
2167      reg_errcode_t *err;
2168      re_match_context_t *mctx;
2169      re_dfastate_t *state;
2170 {
2171   re_dfa_t *const dfa = mctx->dfa;
2172   re_dfastate_t **trtable;
2173   unsigned char ch;
2174
2175   if (re_string_cur_idx (&mctx->input) + 1 >= mctx->input.bufs_len
2176       || (re_string_cur_idx (&mctx->input) + 1 >= mctx->input.valid_len
2177           && mctx->input.valid_len < mctx->input.len))
2178     {
2179       *err = extend_buffers (mctx);
2180       if (BE (*err != REG_NOERROR, 0))
2181         return NULL;
2182     }
2183
2184 #ifdef RE_ENABLE_I18N
2185       /* If the current state can accept multibyte.  */
2186       if (state->accept_mb)
2187         {
2188           *err = transit_state_mb (mctx, state);
2189           if (BE (*err != REG_NOERROR, 0))
2190             return NULL;
2191         }
2192 #endif /* RE_ENABLE_I18N */
2193
2194   /* Then decide the next state with the single byte.  */
2195   if (1)
2196     {
2197       /* Use transition table  */
2198       ch = re_string_fetch_byte (&mctx->input);
2199       trtable = state->trtable;
2200       if (trtable == NULL)
2201         {
2202           trtable = build_trtable (dfa, state);
2203           if (trtable == NULL)
2204             {
2205               *err = REG_ESPACE;
2206               return NULL;
2207             }
2208         }
2209       if (BE (state->word_trtable, 0))
2210         {
2211           unsigned int context;
2212           context
2213             = re_string_context_at (&mctx->input,
2214                                     re_string_cur_idx (&mctx->input) - 1,
2215                                     mctx->eflags);
2216           if (IS_WORD_CONTEXT (context))
2217             return trtable[ch + SBC_MAX];
2218           else
2219             return trtable[ch];
2220         }
2221       else
2222         return trtable[ch];
2223     }
2224 #if 0
2225   else
2226     /* don't use transition table  */
2227     return transit_state_sb (err, mctx, state);
2228 #endif
2229 }
2230
2231 /* Update the state_log if we need */
2232 re_dfastate_t *
2233 merge_state_with_log (err, mctx, next_state)
2234      reg_errcode_t *err;
2235      re_match_context_t *mctx;
2236      re_dfastate_t *next_state;
2237 {
2238   re_dfa_t *const dfa = mctx->dfa;
2239   int cur_idx = re_string_cur_idx (&mctx->input);
2240
2241   if (cur_idx > mctx->state_log_top)
2242     {
2243       mctx->state_log[cur_idx] = next_state;
2244       mctx->state_log_top = cur_idx;
2245     }
2246   else if (mctx->state_log[cur_idx] == 0)
2247     {
2248       mctx->state_log[cur_idx] = next_state;
2249     }
2250   else
2251     {
2252       re_dfastate_t *pstate;
2253       unsigned int context;
2254       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2255       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2256          the destination of a multibyte char/collating element/
2257          back reference.  Then the next state is the union set of
2258          these destinations and the results of the transition table.  */
2259       pstate = mctx->state_log[cur_idx];
2260       log_nodes = pstate->entrance_nodes;
2261       if (next_state != NULL)
2262         {
2263           table_nodes = next_state->entrance_nodes;
2264           *err = re_node_set_init_union (&next_nodes, table_nodes,
2265                                              log_nodes);
2266           if (BE (*err != REG_NOERROR, 0))
2267             return NULL;
2268         }
2269       else
2270         next_nodes = *log_nodes;
2271       /* Note: We already add the nodes of the initial state,
2272          then we don't need to add them here.  */
2273
2274       context = re_string_context_at (&mctx->input,
2275                                       re_string_cur_idx (&mctx->input) - 1,
2276                                       mctx->eflags);
2277       next_state = mctx->state_log[cur_idx]
2278         = re_acquire_state_context (err, dfa, &next_nodes, context);
2279       /* We don't need to check errors here, since the return value of
2280          this function is next_state and ERR is already set.  */
2281
2282       if (table_nodes != NULL)
2283         re_node_set_free (&next_nodes);
2284     }
2285
2286   if (BE (dfa->nbackref, 0) && next_state != NULL)
2287     {
2288       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2289          later.  We must check them here, since the back references in the
2290          next state might use them.  */
2291       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2292                                         cur_idx);
2293       if (BE (*err != REG_NOERROR, 0))
2294         return NULL;
2295
2296       /* If the next state has back references.  */
2297       if (next_state->has_backref)
2298         {
2299           *err = transit_state_bkref (mctx, &next_state->nodes);
2300           if (BE (*err != REG_NOERROR, 0))
2301             return NULL;
2302           next_state = mctx->state_log[cur_idx];
2303         }
2304     }
2305
2306   return next_state;
2307 }
2308
2309 /* Skip bytes in the input that correspond to part of a
2310    multi-byte match, then look in the log for a state
2311    from which to restart matching.  */
2312 re_dfastate_t *
2313 find_recover_state (err, mctx)
2314      reg_errcode_t *err;
2315      re_match_context_t *mctx;
2316 {
2317   re_dfastate_t *cur_state = NULL;
2318   do
2319     {
2320       int max = mctx->state_log_top;
2321       int cur_str_idx = re_string_cur_idx (&mctx->input);
2322
2323       do
2324         {
2325           if (++cur_str_idx > max)
2326             return NULL;
2327           re_string_skip_bytes (&mctx->input, 1);
2328         }
2329       while (mctx->state_log[cur_str_idx] == NULL);
2330
2331       cur_state = merge_state_with_log (err, mctx, NULL);
2332     }
2333   while (err == REG_NOERROR && cur_state == NULL);
2334   return cur_state;
2335 }
2336
2337 /* Helper functions for transit_state.  */
2338
2339 /* From the node set CUR_NODES, pick up the nodes whose types are
2340    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2341    expression. And register them to use them later for evaluating the
2342    correspoding back references.  */
2343
2344 static reg_errcode_t
2345 check_subexp_matching_top (mctx, cur_nodes, str_idx)
2346      re_match_context_t *mctx;
2347      re_node_set *cur_nodes;
2348      int str_idx;
2349 {
2350   re_dfa_t *const dfa = mctx->dfa;
2351   int node_idx;
2352   reg_errcode_t err;
2353
2354   /* TODO: This isn't efficient.
2355            Because there might be more than one nodes whose types are
2356            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2357            nodes.
2358            E.g. RE: (a){2}  */
2359   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2360     {
2361       int node = cur_nodes->elems[node_idx];
2362       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2363           && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2364           && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2365         {
2366           err = match_ctx_add_subtop (mctx, node, str_idx);
2367           if (BE (err != REG_NOERROR, 0))
2368             return err;
2369         }
2370     }
2371   return REG_NOERROR;
2372 }
2373
2374 #if 0
2375 /* Return the next state to which the current state STATE will transit by
2376    accepting the current input byte.  */
2377
2378 static re_dfastate_t *
2379 transit_state_sb (err, mctx, state)
2380      reg_errcode_t *err;
2381      re_match_context_t *mctx;
2382      re_dfastate_t *state;
2383 {
2384   re_dfa_t *const dfa = mctx->dfa;
2385   re_node_set next_nodes;
2386   re_dfastate_t *next_state;
2387   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2388   unsigned int context;
2389
2390   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2391   if (BE (*err != REG_NOERROR, 0))
2392     return NULL;
2393   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2394     {
2395       int cur_node = state->nodes.elems[node_cnt];
2396       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2397         {
2398           *err = re_node_set_merge (&next_nodes,
2399                                     dfa->eclosures + dfa->nexts[cur_node]);
2400           if (BE (*err != REG_NOERROR, 0))
2401             {
2402               re_node_set_free (&next_nodes);
2403               return NULL;
2404             }
2405         }
2406     }
2407   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2408   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2409   /* We don't need to check errors here, since the return value of
2410      this function is next_state and ERR is already set.  */
2411
2412   re_node_set_free (&next_nodes);
2413   re_string_skip_bytes (&mctx->input, 1);
2414   return next_state;
2415 }
2416 #endif
2417
2418 #ifdef RE_ENABLE_I18N
2419 static reg_errcode_t
2420 transit_state_mb (mctx, pstate)
2421     re_match_context_t *mctx;
2422     re_dfastate_t *pstate;
2423 {
2424   re_dfa_t *const dfa = mctx->dfa;
2425   reg_errcode_t err;
2426   int i;
2427
2428   for (i = 0; i < pstate->nodes.nelem; ++i)
2429     {
2430       re_node_set dest_nodes, *new_nodes;
2431       int cur_node_idx = pstate->nodes.elems[i];
2432       int naccepted = 0, dest_idx;
2433       unsigned int context;
2434       re_dfastate_t *dest_state;
2435
2436       if (dfa->nodes[cur_node_idx].constraint)
2437         {
2438           context = re_string_context_at (&mctx->input,
2439                                           re_string_cur_idx (&mctx->input),
2440                                           mctx->eflags);
2441           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2442                                            context))
2443             continue;
2444         }
2445
2446       /* How many bytes the node can accept?  */
2447       if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
2448         naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2449                                              re_string_cur_idx (&mctx->input));
2450       if (naccepted == 0)
2451         continue;
2452
2453       /* The node can accepts `naccepted' bytes.  */
2454       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2455       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2456                                : mctx->max_mb_elem_len);
2457       err = clean_state_log_if_needed (mctx, dest_idx);
2458       if (BE (err != REG_NOERROR, 0))
2459         return err;
2460 #ifdef DEBUG
2461       assert (dfa->nexts[cur_node_idx] != -1);
2462 #endif
2463       /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2464          then we use pstate->nodes.elems[i] instead.  */
2465       new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
2466
2467       dest_state = mctx->state_log[dest_idx];
2468       if (dest_state == NULL)
2469         dest_nodes = *new_nodes;
2470       else
2471         {
2472           err = re_node_set_init_union (&dest_nodes,
2473                                         dest_state->entrance_nodes, new_nodes);
2474           if (BE (err != REG_NOERROR, 0))
2475             return err;
2476         }
2477       context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2478       mctx->state_log[dest_idx]
2479         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2480       if (dest_state != NULL)
2481         re_node_set_free (&dest_nodes);
2482       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2483         return err;
2484     }
2485   return REG_NOERROR;
2486 }
2487 #endif /* RE_ENABLE_I18N */
2488
2489 static reg_errcode_t
2490 transit_state_bkref (mctx, nodes)
2491     re_match_context_t *mctx;
2492     const re_node_set *nodes;
2493 {
2494   re_dfa_t *const dfa = mctx->dfa;
2495   reg_errcode_t err;
2496   int i;
2497   int cur_str_idx = re_string_cur_idx (&mctx->input);
2498
2499   for (i = 0; i < nodes->nelem; ++i)
2500     {
2501       int dest_str_idx, prev_nelem, bkc_idx;
2502       int node_idx = nodes->elems[i];
2503       unsigned int context;
2504       const re_token_t *node = dfa->nodes + node_idx;
2505       re_node_set *new_dest_nodes;
2506
2507       /* Check whether `node' is a backreference or not.  */
2508       if (node->type != OP_BACK_REF)
2509         continue;
2510
2511       if (node->constraint)
2512         {
2513           context = re_string_context_at (&mctx->input, cur_str_idx,
2514                                           mctx->eflags);
2515           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2516             continue;
2517         }
2518
2519       /* `node' is a backreference.
2520          Check the substring which the substring matched.  */
2521       bkc_idx = mctx->nbkref_ents;
2522       err = get_subexp (mctx, node_idx, cur_str_idx);
2523       if (BE (err != REG_NOERROR, 0))
2524         goto free_return;
2525
2526       /* And add the epsilon closures (which is `new_dest_nodes') of
2527          the backreference to appropriate state_log.  */
2528 #ifdef DEBUG
2529       assert (dfa->nexts[node_idx] != -1);
2530 #endif
2531       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2532         {
2533           int subexp_len;
2534           re_dfastate_t *dest_state;
2535           struct re_backref_cache_entry *bkref_ent;
2536           bkref_ent = mctx->bkref_ents + bkc_idx;
2537           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2538             continue;
2539           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2540           new_dest_nodes = (subexp_len == 0
2541                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2542                             : dfa->eclosures + dfa->nexts[node_idx]);
2543           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2544                           - bkref_ent->subexp_from);
2545           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2546                                           mctx->eflags);
2547           dest_state = mctx->state_log[dest_str_idx];
2548           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2549                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2550           /* Add `new_dest_node' to state_log.  */
2551           if (dest_state == NULL)
2552             {
2553               mctx->state_log[dest_str_idx]
2554                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2555                                             context);
2556               if (BE (mctx->state_log[dest_str_idx] == NULL
2557                       && err != REG_NOERROR, 0))
2558                 goto free_return;
2559             }
2560           else
2561             {
2562               re_node_set dest_nodes;
2563               err = re_node_set_init_union (&dest_nodes,
2564                                             dest_state->entrance_nodes,
2565                                             new_dest_nodes);
2566               if (BE (err != REG_NOERROR, 0))
2567                 {
2568                   re_node_set_free (&dest_nodes);
2569                   goto free_return;
2570                 }
2571               mctx->state_log[dest_str_idx]
2572                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2573               re_node_set_free (&dest_nodes);
2574               if (BE (mctx->state_log[dest_str_idx] == NULL
2575                       && err != REG_NOERROR, 0))
2576                 goto free_return;
2577             }
2578           /* We need to check recursively if the backreference can epsilon
2579              transit.  */
2580           if (subexp_len == 0
2581               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2582             {
2583               err = check_subexp_matching_top (mctx, new_dest_nodes,
2584                                                cur_str_idx);
2585               if (BE (err != REG_NOERROR, 0))
2586                 goto free_return;
2587               err = transit_state_bkref (mctx, new_dest_nodes);
2588               if (BE (err != REG_NOERROR, 0))
2589                 goto free_return;
2590             }
2591         }
2592     }
2593   err = REG_NOERROR;
2594  free_return:
2595   return err;
2596 }
2597
2598 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2599    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2600    Note that we might collect inappropriate candidates here.
2601    However, the cost of checking them strictly here is too high, then we
2602    delay these checking for prune_impossible_nodes().  */
2603
2604 static reg_errcode_t
2605 get_subexp (mctx, bkref_node, bkref_str_idx)
2606      re_match_context_t *mctx;
2607      int bkref_node, bkref_str_idx;
2608 {
2609   re_dfa_t *const dfa = mctx->dfa;
2610   int subexp_num, sub_top_idx;
2611   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2612   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2613   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2614   for (; cache_idx < mctx->nbkref_ents; ++cache_idx)
2615     {
2616       const struct re_backref_cache_entry *entry
2617         = &mctx->bkref_ents[cache_idx];
2618       if (entry->str_idx > bkref_str_idx)
2619         break;
2620       if (entry->node == bkref_node)
2621         return REG_NOERROR; /* We already checked it.  */
2622     }
2623   subexp_num = dfa->nodes[bkref_node].opr.idx - 1;
2624
2625   /* For each sub expression  */
2626   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2627     {
2628       reg_errcode_t err;
2629       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2630       re_sub_match_last_t *sub_last;
2631       int sub_last_idx, sl_str, bkref_str_off;
2632
2633       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2634         continue; /* It isn't related.  */
2635
2636       sl_str = sub_top->str_idx;
2637       bkref_str_off = bkref_str_idx;
2638       /* At first, check the last node of sub expressions we already
2639          evaluated.  */
2640       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2641         {
2642           int sl_str_diff;
2643           sub_last = sub_top->lasts[sub_last_idx];
2644           sl_str_diff = sub_last->str_idx - sl_str;
2645           /* The matched string by the sub expression match with the substring
2646              at the back reference?  */
2647           if (sl_str_diff > 0)
2648             {
2649               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2650                 {
2651                   /* Not enough chars for a successful match.  */
2652                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2653                     break;
2654
2655                   err = clean_state_log_if_needed (mctx,
2656                                                    bkref_str_off
2657                                                    + sl_str_diff);
2658                   if (BE (err != REG_NOERROR, 0))
2659                     return err;
2660                   buf = (const char *) re_string_get_buffer (&mctx->input);
2661                 }
2662               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2663                 break; /* We don't need to search this sub expression any more.  */
2664             }
2665           bkref_str_off += sl_str_diff;
2666           sl_str += sl_str_diff;
2667           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2668                                 bkref_str_idx);
2669
2670           /* Reload buf, since the preceding call might have reallocated
2671              the buffer.  */
2672           buf = (const char *) re_string_get_buffer (&mctx->input);
2673
2674           if (err == REG_NOMATCH)
2675             continue;
2676           if (BE (err != REG_NOERROR, 0))
2677             return err;
2678         }
2679
2680       if (sub_last_idx < sub_top->nlasts)
2681         continue;
2682       if (sub_last_idx > 0)
2683         ++sl_str;
2684       /* Then, search for the other last nodes of the sub expression.  */
2685       for (; sl_str <= bkref_str_idx; ++sl_str)
2686         {
2687           int cls_node, sl_str_off;
2688           const re_node_set *nodes;
2689           sl_str_off = sl_str - sub_top->str_idx;
2690           /* The matched string by the sub expression match with the substring
2691              at the back reference?  */
2692           if (sl_str_off > 0)
2693             {
2694               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2695                 {
2696                   /* If we are at the end of the input, we cannot match.  */
2697                   if (bkref_str_off >= mctx->input.len)
2698                     break;
2699
2700                   err = extend_buffers (mctx);
2701                   if (BE (err != REG_NOERROR, 0))
2702                     return err;
2703
2704                   buf = (const char *) re_string_get_buffer (&mctx->input);
2705                 }
2706               if (buf [bkref_str_off++] != buf[sl_str - 1])
2707                 break; /* We don't need to search this sub expression
2708                           any more.  */
2709             }
2710           if (mctx->state_log[sl_str] == NULL)
2711             continue;
2712           /* Does this state have a ')' of the sub expression?  */
2713           nodes = &mctx->state_log[sl_str]->nodes;
2714           cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2715           if (cls_node == -1)
2716             continue; /* No.  */
2717           if (sub_top->path == NULL)
2718             {
2719               sub_top->path = calloc (sizeof (state_array_t),
2720                                       sl_str - sub_top->str_idx + 1);
2721               if (sub_top->path == NULL)
2722                 return REG_ESPACE;
2723             }
2724           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2725              in the current context?  */
2726           err = check_arrival (mctx, sub_top->path, sub_top->node,
2727                                sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2728           if (err == REG_NOMATCH)
2729               continue;
2730           if (BE (err != REG_NOERROR, 0))
2731               return err;
2732           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2733           if (BE (sub_last == NULL, 0))
2734             return REG_ESPACE;
2735           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2736                                 bkref_str_idx);
2737           if (err == REG_NOMATCH)
2738             continue;
2739         }
2740     }
2741   return REG_NOERROR;
2742 }
2743
2744 /* Helper functions for get_subexp().  */
2745
2746 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2747    If it can arrive, register the sub expression expressed with SUB_TOP
2748    and SUB_LAST.  */
2749
2750 static reg_errcode_t
2751 get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
2752      re_match_context_t *mctx;
2753      const re_sub_match_top_t *sub_top;
2754      re_sub_match_last_t *sub_last;
2755      int bkref_node, bkref_str;
2756 {
2757   reg_errcode_t err;
2758   int to_idx;
2759   /* Can the subexpression arrive the back reference?  */
2760   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2761                        sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2762   if (err != REG_NOERROR)
2763     return err;
2764   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2765                              sub_last->str_idx);
2766   if (BE (err != REG_NOERROR, 0))
2767     return err;
2768   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2769   return clean_state_log_if_needed (mctx, to_idx);
2770 }
2771
2772 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2773    Search '(' if FL_OPEN, or search ')' otherwise.
2774    TODO: This function isn't efficient...
2775          Because there might be more than one nodes whose types are
2776          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2777          nodes.
2778          E.g. RE: (a){2}  */
2779
2780 static int
2781 find_subexp_node (dfa, nodes, subexp_idx, type)
2782      const re_dfa_t *dfa;
2783      const re_node_set *nodes;
2784      int subexp_idx, type;
2785 {
2786   int cls_idx;
2787   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2788     {
2789       int cls_node = nodes->elems[cls_idx];
2790       const re_token_t *node = dfa->nodes + cls_node;
2791       if (node->type == type
2792           && node->opr.idx == subexp_idx)
2793         return cls_node;
2794     }
2795   return -1;
2796 }
2797
2798 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2799    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2800    heavily reused.
2801    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2802
2803 static reg_errcode_t
2804 check_arrival (mctx, path, top_node, top_str, last_node, last_str,
2805                type)
2806      re_match_context_t *mctx;
2807      state_array_t *path;
2808      int top_node, top_str, last_node, last_str, type;
2809 {
2810   re_dfa_t *const dfa = mctx->dfa;
2811   reg_errcode_t err;
2812   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2813   re_dfastate_t *cur_state = NULL;
2814   re_node_set *cur_nodes, next_nodes;
2815   re_dfastate_t **backup_state_log;
2816   unsigned int context;
2817
2818   subexp_num = dfa->nodes[top_node].opr.idx;
2819   /* Extend the buffer if we need.  */
2820   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2821     {
2822       re_dfastate_t **new_array;
2823       int old_alloc = path->alloc;
2824       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2825       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2826       if (new_array == NULL)
2827         {
2828           path->alloc = old_alloc;
2829           return REG_ESPACE;
2830         }
2831       path->array = new_array;
2832       memset (new_array + old_alloc, '\0',
2833               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2834     }
2835
2836   str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2837
2838   /* Temporary modify MCTX.  */
2839   backup_state_log = mctx->state_log;
2840   backup_cur_idx = mctx->input.cur_idx;
2841   mctx->state_log = path->array;
2842   mctx->input.cur_idx = str_idx;
2843
2844   /* Setup initial node set.  */
2845   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2846   if (str_idx == top_str)
2847     {
2848       err = re_node_set_init_1 (&next_nodes, top_node);
2849       if (BE (err != REG_NOERROR, 0))
2850         return err;
2851       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2852       if (BE (err != REG_NOERROR, 0))
2853         {
2854           re_node_set_free (&next_nodes);
2855           return err;
2856         }
2857     }
2858   else
2859     {
2860       cur_state = mctx->state_log[str_idx];
2861       if (cur_state && cur_state->has_backref)
2862         {
2863           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2864           if (BE ( err != REG_NOERROR, 0))
2865             return err;
2866         }
2867       else
2868         re_node_set_init_empty (&next_nodes);
2869     }
2870   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2871     {
2872       if (next_nodes.nelem)
2873         {
2874           err = expand_bkref_cache (mctx, &next_nodes, str_idx, last_str,
2875                                     subexp_num, type);
2876           if (BE ( err != REG_NOERROR, 0))
2877             {
2878               re_node_set_free (&next_nodes);
2879               return err;
2880             }
2881         }
2882       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2883       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2884         {
2885           re_node_set_free (&next_nodes);
2886           return err;
2887         }
2888       mctx->state_log[str_idx] = cur_state;
2889     }
2890
2891   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2892     {
2893       re_node_set_empty (&next_nodes);
2894       if (mctx->state_log[str_idx + 1])
2895         {
2896           err = re_node_set_merge (&next_nodes,
2897                                    &mctx->state_log[str_idx + 1]->nodes);
2898           if (BE (err != REG_NOERROR, 0))
2899             {
2900               re_node_set_free (&next_nodes);
2901               return err;
2902             }
2903         }
2904       if (cur_state)
2905         {
2906           err = check_arrival_add_next_nodes (mctx, str_idx,
2907                                               &cur_state->nodes, &next_nodes);
2908           if (BE (err != REG_NOERROR, 0))
2909             {
2910               re_node_set_free (&next_nodes);
2911               return err;
2912             }
2913         }
2914       ++str_idx;
2915       if (next_nodes.nelem)
2916         {
2917           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2918           if (BE (err != REG_NOERROR, 0))
2919             {
2920               re_node_set_free (&next_nodes);
2921               return err;
2922             }
2923           err = expand_bkref_cache (mctx, &next_nodes, str_idx, last_str,
2924                                     subexp_num, type);
2925           if (BE ( err != REG_NOERROR, 0))
2926             {
2927               re_node_set_free (&next_nodes);
2928               return err;
2929             }
2930         }
2931       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2932       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2933       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2934         {
2935           re_node_set_free (&next_nodes);
2936           return err;
2937         }
2938       mctx->state_log[str_idx] = cur_state;
2939       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2940     }
2941   re_node_set_free (&next_nodes);
2942   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2943                : &mctx->state_log[last_str]->nodes);
2944   path->next_idx = str_idx;
2945
2946   /* Fix MCTX.  */
2947   mctx->state_log = backup_state_log;
2948   mctx->input.cur_idx = backup_cur_idx;
2949
2950   /* Then check the current node set has the node LAST_NODE.  */
2951   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2952     return REG_NOERROR;
2953
2954   return REG_NOMATCH;
2955 }
2956
2957 /* Helper functions for check_arrival.  */
2958
2959 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2960    to NEXT_NODES.
2961    TODO: This function is similar to the functions transit_state*(),
2962          however this function has many additional works.
2963          Can't we unify them?  */
2964
2965 static reg_errcode_t
2966 check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
2967      re_match_context_t *mctx;
2968      int str_idx;
2969      re_node_set *cur_nodes, *next_nodes;
2970 {
2971   re_dfa_t *const dfa = mctx->dfa;
2972   int cur_idx;
2973   reg_errcode_t err;
2974   re_node_set union_set;
2975   re_node_set_init_empty (&union_set);
2976   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2977     {
2978       int naccepted = 0;
2979       int cur_node = cur_nodes->elems[cur_idx];
2980       re_token_type_t type = dfa->nodes[cur_node].type;
2981       if (IS_EPSILON_NODE (type))
2982         continue;
2983 #ifdef RE_ENABLE_I18N
2984       /* If the node may accept `multi byte'.  */
2985       if (ACCEPT_MB_NODE (type))
2986         {
2987           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
2988                                                str_idx);
2989           if (naccepted > 1)
2990             {
2991               re_dfastate_t *dest_state;
2992               int next_node = dfa->nexts[cur_node];
2993               int next_idx = str_idx + naccepted;
2994               dest_state = mctx->state_log[next_idx];
2995               re_node_set_empty (&union_set);
2996               if (dest_state)
2997                 {
2998                   err = re_node_set_merge (&union_set, &dest_state->nodes);
2999                   if (BE (err != REG_NOERROR, 0))
3000                     {
3001                       re_node_set_free (&union_set);
3002                       return err;
3003                     }
3004                 }
3005               err = re_node_set_insert (&union_set, next_node);
3006               if (BE (err < 0, 0))
3007                 {
3008                   re_node_set_free (&union_set);
3009                   return REG_ESPACE;
3010                 }
3011               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3012                                                             &union_set);
3013               if (BE (mctx->state_log[next_idx] == NULL
3014                       && err != REG_NOERROR, 0))
3015                 {
3016                   re_node_set_free (&union_set);
3017                   return err;
3018                 }
3019             }
3020         }
3021 #endif /* RE_ENABLE_I18N */
3022       if (naccepted
3023           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3024         {
3025           err = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3026           if (BE (err < 0, 0))
3027             {
3028               re_node_set_free (&union_set);
3029               return REG_ESPACE;
3030             }
3031         }
3032     }
3033   re_node_set_free (&union_set);
3034   return REG_NOERROR;
3035 }
3036
3037 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3038    CUR_NODES, however exclude the nodes which are:
3039     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3040     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3041 */
3042
3043 static reg_errcode_t
3044 check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
3045      re_dfa_t *dfa;
3046      re_node_set *cur_nodes;
3047      int ex_subexp, type;
3048 {
3049   reg_errcode_t err;
3050   int idx, outside_node;
3051   re_node_set new_nodes;
3052 #ifdef DEBUG
3053   assert (cur_nodes->nelem);
3054 #endif
3055   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3056   if (BE (err != REG_NOERROR, 0))
3057     return err;
3058   /* Create a new node set NEW_NODES with the nodes which are epsilon
3059      closures of the node in CUR_NODES.  */
3060
3061   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3062     {
3063       int cur_node = cur_nodes->elems[idx];
3064       re_node_set *eclosure = dfa->eclosures + cur_node;
3065       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3066       if (outside_node == -1)
3067         {
3068           /* There are no problematic nodes, just merge them.  */
3069           err = re_node_set_merge (&new_nodes, eclosure);
3070           if (BE (err != REG_NOERROR, 0))
3071             {
3072               re_node_set_free (&new_nodes);
3073               return err;
3074             }
3075         }
3076       else
3077         {
3078           /* There are problematic nodes, re-calculate incrementally.  */
3079           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3080                                               ex_subexp, type);
3081           if (BE (err != REG_NOERROR, 0))
3082             {
3083               re_node_set_free (&new_nodes);
3084               return err;
3085             }
3086         }
3087     }
3088   re_node_set_free (cur_nodes);
3089   *cur_nodes = new_nodes;
3090   return REG_NOERROR;
3091 }
3092
3093 /* Helper function for check_arrival_expand_ecl.
3094    Check incrementally the epsilon closure of TARGET, and if it isn't
3095    problematic append it to DST_NODES.  */
3096
3097 static reg_errcode_t
3098 check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
3099      re_dfa_t *dfa;
3100      int target, ex_subexp, type;
3101      re_node_set *dst_nodes;
3102 {
3103   int cur_node;
3104   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3105     {
3106       int err;
3107
3108       if (dfa->nodes[cur_node].type == type
3109           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3110         {
3111           if (type == OP_CLOSE_SUBEXP)
3112             {
3113               err = re_node_set_insert (dst_nodes, cur_node);
3114               if (BE (err == -1, 0))
3115                 return REG_ESPACE;
3116             }
3117           break;
3118         }
3119       err = re_node_set_insert (dst_nodes, cur_node);
3120       if (BE (err == -1, 0))
3121         return REG_ESPACE;
3122       if (dfa->edests[cur_node].nelem == 0)
3123         break;
3124       if (dfa->edests[cur_node].nelem == 2)
3125         {
3126           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3127                                               dfa->edests[cur_node].elems[1],
3128                                               ex_subexp, type);
3129           if (BE (err != REG_NOERROR, 0))
3130             return err;
3131         }
3132       cur_node = dfa->edests[cur_node].elems[0];
3133     }
3134   return REG_NOERROR;
3135 }
3136
3137
3138 /* For all the back references in the current state, calculate the
3139    destination of the back references by the appropriate entry
3140    in MCTX->BKREF_ENTS.  */
3141
3142 static reg_errcode_t
3143 expand_bkref_cache (mctx, cur_nodes, cur_str, last_str, subexp_num,
3144                     type)
3145      re_match_context_t *mctx;
3146      int cur_str, last_str, subexp_num, type;
3147      re_node_set *cur_nodes;
3148 {
3149   re_dfa_t *const dfa = mctx->dfa;
3150   reg_errcode_t err;
3151   int cache_idx, cache_idx_start;
3152   /* The current state.  */
3153
3154   cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3155   for (cache_idx = cache_idx_start; cache_idx < mctx->nbkref_ents; ++cache_idx)
3156     {
3157       int to_idx, next_node;
3158       struct re_backref_cache_entry *ent = mctx->bkref_ents + cache_idx;
3159       if (ent->str_idx > cur_str)
3160         break;
3161       /* Is this entry ENT is appropriate?  */
3162       if (!re_node_set_contains (cur_nodes, ent->node))
3163         continue; /* No.  */
3164
3165       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3166       /* Calculate the destination of the back reference, and append it
3167          to MCTX->STATE_LOG.  */
3168       if (to_idx == cur_str)
3169         {
3170           /* The backreference did epsilon transit, we must re-check all the
3171              node in the current state.  */
3172           re_node_set new_dests;
3173           reg_errcode_t err2, err3;
3174           next_node = dfa->edests[ent->node].elems[0];
3175           if (re_node_set_contains (cur_nodes, next_node))
3176             continue;
3177           err = re_node_set_init_1 (&new_dests, next_node);
3178           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3179           err3 = re_node_set_merge (cur_nodes, &new_dests);
3180           re_node_set_free (&new_dests);
3181           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3182                   || err3 != REG_NOERROR, 0))
3183             {
3184               err = (err != REG_NOERROR ? err
3185                      : (err2 != REG_NOERROR ? err2 : err3));
3186               return err;
3187             }
3188           /* TODO: It is still inefficient...  */
3189           cache_idx = cache_idx_start - 1;
3190           continue;
3191         }
3192       else
3193         {
3194           re_node_set union_set;
3195           next_node = dfa->nexts[ent->node];
3196           if (mctx->state_log[to_idx])
3197             {
3198               int ret;
3199               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3200                                         next_node))
3201                 continue;
3202               err = re_node_set_init_copy (&union_set,
3203                                            &mctx->state_log[to_idx]->nodes);
3204               ret = re_node_set_insert (&union_set, next_node);
3205               if (BE (err != REG_NOERROR || ret < 0, 0))
3206                 {
3207                   re_node_set_free (&union_set);
3208                   err = err != REG_NOERROR ? err : REG_ESPACE;
3209                   return err;
3210                 }
3211             }
3212           else
3213             {
3214               err = re_node_set_init_1 (&union_set, next_node);
3215               if (BE (err != REG_NOERROR, 0))
3216                 return err;
3217             }
3218           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3219           re_node_set_free (&union_set);
3220           if (BE (mctx->state_log[to_idx] == NULL
3221                   && err != REG_NOERROR, 0))
3222             return err;
3223         }
3224     }
3225   return REG_NOERROR;
3226 }
3227
3228 /* Build transition table for the state.
3229    Return the new table if succeeded, otherwise return NULL.  */
3230
3231 static re_dfastate_t **
3232 build_trtable (dfa, state)
3233     re_dfa_t *dfa;
3234     re_dfastate_t *state;
3235 {
3236   reg_errcode_t err;
3237   int i, j, ch;
3238   unsigned int elem, mask;
3239   int dests_node_malloced = 0, dest_states_malloced = 0;
3240   int ndests; /* Number of the destination states from `state'.  */
3241   re_dfastate_t **trtable;
3242   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3243   re_node_set follows, *dests_node;
3244   bitset *dests_ch;
3245   bitset acceptable;
3246
3247   /* We build DFA states which corresponds to the destination nodes
3248      from `state'.  `dests_node[i]' represents the nodes which i-th
3249      destination state contains, and `dests_ch[i]' represents the
3250      characters which i-th destination state accepts.  */
3251 #ifdef _LIBC
3252   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3253     dests_node = (re_node_set *)
3254                  alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3255   else
3256 #endif
3257     {
3258       dests_node = (re_node_set *)
3259                    malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3260       if (BE (dests_node == NULL, 0))
3261         return NULL;
3262       dests_node_malloced = 1;
3263     }
3264   dests_ch = (bitset *) (dests_node + SBC_MAX);
3265
3266   /* Initialize transiton table.  */
3267   state->word_trtable = 0;
3268
3269   /* At first, group all nodes belonging to `state' into several
3270      destinations.  */
3271   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3272   if (BE (ndests <= 0, 0))
3273     {
3274       if (dests_node_malloced)
3275         free (dests_node);
3276       /* Return NULL in case of an error, trtable otherwise.  */
3277       if (ndests == 0)
3278         {
3279           state->trtable = (re_dfastate_t **)
3280             calloc (sizeof (re_dfastate_t *), SBC_MAX);;
3281           return state->trtable;
3282         }
3283       return NULL;
3284     }
3285
3286   err = re_node_set_alloc (&follows, ndests + 1);
3287   if (BE (err != REG_NOERROR, 0))
3288     goto out_free;
3289
3290 #ifdef _LIBC
3291   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3292                          + ndests * 3 * sizeof (re_dfastate_t *)))
3293     dest_states = (re_dfastate_t **)
3294                   alloca (ndests * 3 * sizeof (re_dfastate_t *));
3295   else
3296 #endif
3297     {
3298       dest_states = (re_dfastate_t **)
3299                     malloc (ndests * 3 * sizeof (re_dfastate_t *));
3300       if (BE (dest_states == NULL, 0))
3301         {
3302 out_free:
3303           if (dest_states_malloced)
3304             free (dest_states);
3305           re_node_set_free (&follows);
3306           for (i = 0; i < ndests; ++i)
3307             re_node_set_free (dests_node + i);
3308           if (dests_node_malloced)
3309             free (dests_node);
3310           return NULL;
3311         }
3312       dest_states_malloced = 1;
3313     }
3314   dest_states_word = dest_states + ndests;
3315   dest_states_nl = dest_states_word + ndests;
3316   bitset_empty (acceptable);
3317
3318   /* Then build the states for all destinations.  */
3319   for (i = 0; i < ndests; ++i)
3320     {
3321       int next_node;
3322       re_node_set_empty (&follows);
3323       /* Merge the follows of this destination states.  */
3324       for (j = 0; j < dests_node[i].nelem; ++j)
3325         {
3326           next_node = dfa->nexts[dests_node[i].elems[j]];
3327           if (next_node != -1)
3328             {
3329               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3330               if (BE (err != REG_NOERROR, 0))
3331                 goto out_free;
3332             }
3333         }
3334       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3335       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3336         goto out_free;
3337       /* If the new state has context constraint,
3338          build appropriate states for these contexts.  */
3339       if (dest_states[i]->has_constraint)
3340         {
3341           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3342                                                           CONTEXT_WORD);
3343           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3344             goto out_free;
3345
3346           if (dest_states[i] != dest_states_word[i]
3347               && dfa->mb_cur_max > 1)
3348             state->word_trtable = 1;
3349
3350           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3351                                                         CONTEXT_NEWLINE);
3352           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3353             goto out_free;
3354         }
3355       else
3356         {
3357           dest_states_word[i] = dest_states[i];
3358           dest_states_nl[i] = dest_states[i];
3359         }
3360       bitset_merge (acceptable, dests_ch[i]);
3361     }
3362
3363   if (!BE (state->word_trtable, 0))
3364     {
3365       /* We don't care about whether the following character is a word
3366          character, or we are in a single-byte character set so we can
3367          discern by looking at the character code: allocate a
3368          256-entry transition table.  */
3369       trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3370       if (BE (trtable == NULL, 0))
3371         goto out_free;
3372
3373       /* For all characters ch...:  */
3374       for (i = 0; i < BITSET_UINTS; ++i)
3375         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3376              elem;
3377              mask <<= 1, elem >>= 1, ++ch)
3378           if (BE (elem & 1, 0))
3379             {
3380               /* There must be exactly one destination which accepts
3381                  character ch.  See group_nodes_into_DFAstates.  */
3382               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3383                 ;
3384
3385               /* j-th destination accepts the word character ch.  */
3386               if (dfa->word_char[i] & mask)
3387                 trtable[ch] = dest_states_word[j];
3388               else
3389                 trtable[ch] = dest_states[j];
3390             }
3391     }
3392   else
3393     {
3394       /* We care about whether the following character is a word
3395          character, and we are in a multi-byte character set: discern
3396          by looking at the character code: build two 256-entry
3397          transition tables, one starting at trtable[0] and one
3398          starting at trtable[SBC_MAX].  */
3399       trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *),
3400                                            2 * SBC_MAX);
3401       if (BE (trtable == NULL, 0))
3402         goto out_free;
3403
3404       /* For all characters ch...:  */
3405       for (i = 0; i < BITSET_UINTS; ++i)
3406         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3407              elem;
3408              mask <<= 1, elem >>= 1, ++ch)
3409           if (BE (elem & 1, 0))
3410             {
3411               /* There must be exactly one destination which accepts
3412                  character ch.  See group_nodes_into_DFAstates.  */
3413               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3414                 ;
3415
3416               /* j-th destination accepts the word character ch.  */
3417               trtable[ch] = dest_states[j];
3418               trtable[ch + SBC_MAX] = dest_states_word[j];
3419             }
3420     }
3421
3422   /* new line */
3423   if (bitset_contain (acceptable, NEWLINE_CHAR))
3424     {
3425       /* The current state accepts newline character.  */
3426       for (j = 0; j < ndests; ++j)
3427         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3428           {
3429             /* k-th destination accepts newline character.  */
3430             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3431             if (state->word_trtable)
3432               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3433             /* There must be only one destination which accepts
3434                newline.  See group_nodes_into_DFAstates.  */
3435             break;
3436           }
3437     }
3438
3439   if (dest_states_malloced)
3440     free (dest_states);
3441
3442   re_node_set_free (&follows);
3443   for (i = 0; i < ndests; ++i)
3444     re_node_set_free (dests_node + i);
3445
3446   if (dests_node_malloced)
3447     free (dests_node);
3448
3449   state->trtable = trtable;
3450   return trtable;
3451 }
3452
3453 /* Group all nodes belonging to STATE into several destinations.
3454    Then for all destinations, set the nodes belonging to the destination
3455    to DESTS_NODE[i] and set the characters accepted by the destination
3456    to DEST_CH[i].  This function return the number of destinations.  */
3457
3458 static int
3459 group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3460     re_dfa_t *dfa;
3461     const re_dfastate_t *state;
3462     re_node_set *dests_node;
3463     bitset *dests_ch;
3464 {
3465   reg_errcode_t err;
3466   int i, j, k;
3467   int ndests; /* Number of the destinations from `state'.  */
3468   bitset accepts; /* Characters a node can accept.  */
3469   const re_node_set *cur_nodes = &state->nodes;
3470   bitset_empty (accepts);
3471   ndests = 0;
3472
3473   /* For all the nodes belonging to `state',  */
3474   for (i = 0; i < cur_nodes->nelem; ++i)
3475     {
3476       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3477       re_token_type_t type = node->type;
3478       unsigned int constraint = node->constraint;
3479
3480       /* Enumerate all single byte character this node can accept.  */
3481       if (type == CHARACTER)
3482         bitset_set (accepts, node->opr.c);
3483       else if (type == SIMPLE_BRACKET)
3484         {
3485           bitset_merge (accepts, node->opr.sbcset);
3486         }
3487       else if (type == OP_PERIOD)
3488         {
3489 #ifdef RE_ENABLE_I18N
3490           if (dfa->mb_cur_max > 1)
3491             bitset_merge (accepts, dfa->sb_char);
3492           else
3493 #endif
3494             bitset_set_all (accepts);
3495           if (!(dfa->syntax & RE_DOT_NEWLINE))
3496             bitset_clear (accepts, '\n');
3497           if (dfa->syntax & RE_DOT_NOT_NULL)
3498             bitset_clear (accepts, '\0');
3499         }
3500 #ifdef RE_ENABLE_I18N
3501       else if (type == OP_UTF8_PERIOD)
3502         {
3503           memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3504           if (!(dfa->syntax & RE_DOT_NEWLINE))
3505             bitset_clear (accepts, '\n');
3506           if (dfa->syntax & RE_DOT_NOT_NULL)
3507             bitset_clear (accepts, '\0');
3508         }
3509 #endif
3510       else
3511         continue;
3512
3513       /* Check the `accepts' and sift the characters which are not
3514          match it the context.  */
3515       if (constraint)
3516         {
3517           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3518             {
3519               int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3520               bitset_empty (accepts);
3521               if (accepts_newline)
3522                 bitset_set (accepts, NEWLINE_CHAR);
3523               else
3524                 continue;
3525             }
3526           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3527             {
3528               bitset_empty (accepts);
3529               continue;
3530             }
3531
3532           if (constraint & NEXT_WORD_CONSTRAINT)
3533             {
3534               unsigned int any_set = 0;
3535               if (type == CHARACTER && !node->word_char)
3536                 {
3537                   bitset_empty (accepts);
3538                   continue;
3539                 }
3540 #ifdef RE_ENABLE_I18N
3541               if (dfa->mb_cur_max > 1)
3542                 for (j = 0; j < BITSET_UINTS; ++j)
3543                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3544               else
3545 #endif
3546                 for (j = 0; j < BITSET_UINTS; ++j)
3547                   any_set |= (accepts[j] &= dfa->word_char[j]);
3548               if (!any_set)
3549                 continue;
3550             }
3551           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3552             {
3553               unsigned int any_set = 0;
3554               if (type == CHARACTER && node->word_char)
3555                 {
3556                   bitset_empty (accepts);
3557                   continue;
3558                 }
3559 #ifdef RE_ENABLE_I18N
3560               if (dfa->mb_cur_max > 1)
3561                 for (j = 0; j < BITSET_UINTS; ++j)
3562                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3563               else
3564 #endif
3565                 for (j = 0; j < BITSET_UINTS; ++j)
3566                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3567               if (!any_set)
3568                 continue;
3569             }
3570         }
3571
3572       /* Then divide `accepts' into DFA states, or create a new
3573          state.  Above, we make sure that accepts is not empty.  */
3574       for (j = 0; j < ndests; ++j)
3575         {
3576           bitset intersec; /* Intersection sets, see below.  */
3577           bitset remains;
3578           /* Flags, see below.  */
3579           int has_intersec, not_subset, not_consumed;
3580
3581           /* Optimization, skip if this state doesn't accept the character.  */
3582           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3583             continue;
3584
3585           /* Enumerate the intersection set of this state and `accepts'.  */
3586           has_intersec = 0;
3587           for (k = 0; k < BITSET_UINTS; ++k)
3588             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3589           /* And skip if the intersection set is empty.  */
3590           if (!has_intersec)
3591             continue;
3592
3593           /* Then check if this state is a subset of `accepts'.  */
3594           not_subset = not_consumed = 0;
3595           for (k = 0; k < BITSET_UINTS; ++k)
3596             {
3597               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3598               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3599             }
3600
3601           /* If this state isn't a subset of `accepts', create a
3602              new group state, which has the `remains'. */
3603           if (not_subset)
3604             {
3605               bitset_copy (dests_ch[ndests], remains);
3606               bitset_copy (dests_ch[j], intersec);
3607               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3608               if (BE (err != REG_NOERROR, 0))
3609                 goto error_return;
3610               ++ndests;
3611             }
3612
3613           /* Put the position in the current group. */
3614           err = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3615           if (BE (err < 0, 0))
3616             goto error_return;
3617
3618           /* If all characters are consumed, go to next node. */
3619           if (!not_consumed)
3620             break;
3621         }
3622       /* Some characters remain, create a new group. */
3623       if (j == ndests)
3624         {
3625           bitset_copy (dests_ch[ndests], accepts);
3626           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3627           if (BE (err != REG_NOERROR, 0))
3628             goto error_return;
3629           ++ndests;
3630           bitset_empty (accepts);
3631         }
3632     }
3633   return ndests;
3634  error_return:
3635   for (j = 0; j < ndests; ++j)
3636     re_node_set_free (dests_node + j);
3637   return -1;
3638 }
3639
3640 #ifdef RE_ENABLE_I18N
3641 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3642    Return the number of the bytes the node accepts.
3643    STR_IDX is the current index of the input string.
3644
3645    This function handles the nodes which can accept one character, or
3646    one collating element like '.', '[a-z]', opposite to the other nodes
3647    can only accept one byte.  */
3648
3649 static int
3650 check_node_accept_bytes (dfa, node_idx, input, str_idx)
3651     re_dfa_t *dfa;
3652     int node_idx, str_idx;
3653     const re_string_t *input;
3654 {
3655   const re_token_t *node = dfa->nodes + node_idx;
3656   int char_len, elem_len;
3657   int i;
3658
3659   if (BE (node->type == OP_UTF8_PERIOD, 0))
3660     {
3661       unsigned char c = re_string_byte_at (input, str_idx), d;
3662       if (BE (c < 0xc2, 1))
3663         return 0;
3664
3665       if (str_idx + 2 > input->len)
3666         return 0;
3667
3668       d = re_string_byte_at (input, str_idx + 1);
3669       if (c < 0xe0)
3670         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3671       else if (c < 0xf0)
3672         {
3673           char_len = 3;
3674           if (c == 0xe0 && d < 0xa0)
3675             return 0;
3676         }
3677       else if (c < 0xf8)
3678         {
3679           char_len = 4;
3680           if (c == 0xf0 && d < 0x90)
3681             return 0;
3682         }
3683       else if (c < 0xfc)
3684         {
3685           char_len = 5;
3686           if (c == 0xf8 && d < 0x88)
3687             return 0;
3688         }
3689       else if (c < 0xfe)
3690         {
3691           char_len = 6;
3692           if (c == 0xfc && d < 0x84)
3693             return 0;
3694         }
3695       else
3696         return 0;
3697
3698       if (str_idx + char_len > input->len)
3699         return 0;
3700
3701       for (i = 1; i < char_len; ++i)
3702         {
3703           d = re_string_byte_at (input, str_idx + i);
3704           if (d < 0x80 || d > 0xbf)
3705             return 0;
3706         }
3707       return char_len;
3708     }
3709
3710   char_len = re_string_char_size_at (input, str_idx);
3711   if (node->type == OP_PERIOD)
3712     {
3713       if (char_len <= 1)
3714         return 0;
3715       /* FIXME: I don't think this if is needed, as both '\n'
3716          and '\0' are char_len == 1.  */
3717       /* '.' accepts any one character except the following two cases.  */
3718       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3719            re_string_byte_at (input, str_idx) == '\n') ||
3720           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3721            re_string_byte_at (input, str_idx) == '\0'))
3722         return 0;
3723       return char_len;
3724     }
3725
3726   elem_len = re_string_elem_size_at (input, str_idx);
3727   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3728     return 0;
3729
3730   if (node->type == COMPLEX_BRACKET)
3731     {
3732       const re_charset_t *cset = node->opr.mbcset;
3733 # ifdef _LIBC
3734       const unsigned char *pin = ((char *) re_string_get_buffer (input)
3735                                   + str_idx);
3736       int j;
3737       uint32_t nrules;
3738 # endif /* _LIBC */
3739       int match_len = 0;
3740       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3741                     ? re_string_wchar_at (input, str_idx) : 0);
3742
3743       /* match with multibyte character?  */
3744       for (i = 0; i < cset->nmbchars; ++i)
3745         if (wc == cset->mbchars[i])
3746           {
3747             match_len = char_len;
3748             goto check_node_accept_bytes_match;
3749           }
3750       /* match with character_class?  */
3751       for (i = 0; i < cset->nchar_classes; ++i)
3752         {
3753           wctype_t wt = cset->char_classes[i];
3754           if (__iswctype (wc, wt))
3755             {
3756               match_len = char_len;
3757               goto check_node_accept_bytes_match;
3758             }
3759         }
3760
3761 # ifdef _LIBC
3762       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3763       if (nrules != 0)
3764         {
3765           unsigned int in_collseq = 0;
3766           const int32_t *table, *indirect;
3767           const unsigned char *weights, *extra;
3768           const char *collseqwc;
3769           int32_t idx;
3770           /* This #include defines a local function!  */
3771 #  include <locale/weight.h>
3772
3773           /* match with collating_symbol?  */
3774           if (cset->ncoll_syms)
3775             extra = (const unsigned char *)
3776               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3777           for (i = 0; i < cset->ncoll_syms; ++i)
3778             {
3779               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3780               /* Compare the length of input collating element and
3781                  the length of current collating element.  */
3782               if (*coll_sym != elem_len)
3783                 continue;
3784               /* Compare each bytes.  */
3785               for (j = 0; j < *coll_sym; j++)
3786                 if (pin[j] != coll_sym[1 + j])
3787                   break;
3788               if (j == *coll_sym)
3789                 {
3790                   /* Match if every bytes is equal.  */
3791                   match_len = j;
3792                   goto check_node_accept_bytes_match;
3793                 }
3794             }
3795
3796           if (cset->nranges)
3797             {
3798               if (elem_len <= char_len)
3799                 {
3800                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3801                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3802                 }
3803               else
3804                 in_collseq = find_collation_sequence_value (pin, elem_len);
3805             }
3806           /* match with range expression?  */
3807           for (i = 0; i < cset->nranges; ++i)
3808             if (cset->range_starts[i] <= in_collseq
3809                 && in_collseq <= cset->range_ends[i])
3810               {
3811                 match_len = elem_len;
3812                 goto check_node_accept_bytes_match;
3813               }
3814
3815           /* match with equivalence_class?  */
3816           if (cset->nequiv_classes)
3817             {
3818               const unsigned char *cp = pin;
3819               table = (const int32_t *)
3820                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3821               weights = (const unsigned char *)
3822                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3823               extra = (const unsigned char *)
3824                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3825               indirect = (const int32_t *)
3826                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3827               idx = findidx (&cp);
3828               if (idx > 0)
3829                 for (i = 0; i < cset->nequiv_classes; ++i)
3830                   {
3831                     int32_t equiv_class_idx = cset->equiv_classes[i];
3832                     size_t weight_len = weights[idx];
3833                     if (weight_len == weights[equiv_class_idx])
3834                       {
3835                         int cnt = 0;
3836                         while (cnt <= weight_len
3837                                && (weights[equiv_class_idx + 1 + cnt]
3838                                    == weights[idx + 1 + cnt]))
3839                           ++cnt;
3840                         if (cnt > weight_len)
3841                           {
3842                             match_len = elem_len;
3843                             goto check_node_accept_bytes_match;
3844                           }
3845                       }
3846                   }
3847             }
3848         }
3849       else
3850 # endif /* _LIBC */
3851         {
3852           /* match with range expression?  */
3853 #if __GNUC__ >= 2
3854           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3855 #else
3856           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3857           cmp_buf[2] = wc;
3858 #endif
3859           for (i = 0; i < cset->nranges; ++i)
3860             {
3861               cmp_buf[0] = cset->range_starts[i];
3862               cmp_buf[4] = cset->range_ends[i];
3863               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3864                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3865                 {
3866                   match_len = char_len;
3867                   goto check_node_accept_bytes_match;
3868                 }
3869             }
3870         }
3871     check_node_accept_bytes_match:
3872       if (!cset->non_match)
3873         return match_len;
3874       else
3875         {
3876           if (match_len > 0)
3877             return 0;
3878           else
3879             return (elem_len > char_len) ? elem_len : char_len;
3880         }
3881     }
3882   return 0;
3883 }
3884
3885 # ifdef _LIBC
3886 static unsigned int
3887 find_collation_sequence_value (mbs, mbs_len)
3888     const unsigned char *mbs;
3889     size_t mbs_len;
3890 {
3891   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3892   if (nrules == 0)
3893     {
3894       if (mbs_len == 1)
3895         {
3896           /* No valid character.  Match it as a single byte character.  */
3897           const unsigned char *collseq = (const unsigned char *)
3898             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3899           return collseq[mbs[0]];
3900         }
3901       return UINT_MAX;
3902     }
3903   else
3904     {
3905       int32_t idx;
3906       const unsigned char *extra = (const unsigned char *)
3907         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3908       int32_t extrasize = (const unsigned char *)
3909         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3910
3911       for (idx = 0; idx < extrasize;)
3912         {
3913           int mbs_cnt, found = 0;
3914           int32_t elem_mbs_len;
3915           /* Skip the name of collating element name.  */
3916           idx = idx + extra[idx] + 1;
3917           elem_mbs_len = extra[idx++];
3918           if (mbs_len == elem_mbs_len)
3919             {
3920               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3921                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3922                   break;
3923               if (mbs_cnt == elem_mbs_len)
3924                 /* Found the entry.  */
3925                 found = 1;
3926             }
3927           /* Skip the byte sequence of the collating element.  */
3928           idx += elem_mbs_len;
3929           /* Adjust for the alignment.  */
3930           idx = (idx + 3) & ~3;
3931           /* Skip the collation sequence value.  */
3932           idx += sizeof (uint32_t);
3933           /* Skip the wide char sequence of the collating element.  */
3934           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3935           /* If we found the entry, return the sequence value.  */
3936           if (found)
3937             return *(uint32_t *) (extra + idx);
3938           /* Skip the collation sequence value.  */
3939           idx += sizeof (uint32_t);
3940         }
3941       return UINT_MAX;
3942     }
3943 }
3944 # endif /* _LIBC */
3945 #endif /* RE_ENABLE_I18N */
3946
3947 /* Check whether the node accepts the byte which is IDX-th
3948    byte of the INPUT.  */
3949
3950 static int
3951 check_node_accept (mctx, node, idx)
3952     const re_match_context_t *mctx;
3953     const re_token_t *node;
3954     int idx;
3955 {
3956   re_dfa_t *const dfa = mctx->dfa;
3957   unsigned char ch;
3958   if (node->constraint)
3959     {
3960       /* The node has constraints.  Check whether the current context
3961          satisfies the constraints.  */
3962       unsigned int context = re_string_context_at (&mctx->input, idx,
3963                                                    mctx->eflags);
3964       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3965         return 0;
3966     }
3967   ch = re_string_byte_at (&mctx->input, idx);
3968   switch (node->type)
3969     {
3970     case CHARACTER:
3971       return node->opr.c == ch;
3972     case SIMPLE_BRACKET:
3973       return bitset_contain (node->opr.sbcset, ch);
3974 #ifdef RE_ENABLE_I18N
3975     case OP_UTF8_PERIOD:
3976       if (ch >= 0x80)
3977         return 0;
3978       /* FALLTHROUGH */
3979 #endif
3980     case OP_PERIOD:
3981       return !((ch == '\n' && !(dfa->syntax & RE_DOT_NEWLINE))
3982                || (ch == '\0' && (dfa->syntax & RE_DOT_NOT_NULL)));
3983     default:
3984       return 0;
3985     }
3986 }
3987
3988 /* Extend the buffers, if the buffers have run out.  */
3989
3990 static reg_errcode_t
3991 extend_buffers (mctx)
3992      re_match_context_t *mctx;
3993 {
3994   reg_errcode_t ret;
3995   re_string_t *pstr = &mctx->input;
3996
3997   /* Double the lengthes of the buffers.  */
3998   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
3999   if (BE (ret != REG_NOERROR, 0))
4000     return ret;
4001
4002   if (mctx->state_log != NULL)
4003     {
4004       /* And double the length of state_log.  */
4005       /* XXX We have no indication of the size of this buffer.  If this
4006          allocation fail we have no indication that the state_log array
4007          does not have the right size.  */
4008       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4009                                               pstr->bufs_len + 1);
4010       if (BE (new_array == NULL, 0))
4011         return REG_ESPACE;
4012       mctx->state_log = new_array;
4013     }
4014
4015   /* Then reconstruct the buffers.  */
4016   if (pstr->icase)
4017     {
4018 #ifdef RE_ENABLE_I18N
4019       if (pstr->mb_cur_max > 1)
4020         {
4021           ret = build_wcs_upper_buffer (pstr);
4022           if (BE (ret != REG_NOERROR, 0))
4023             return ret;
4024         }
4025       else
4026 #endif /* RE_ENABLE_I18N  */
4027         build_upper_buffer (pstr);
4028     }
4029   else
4030     {
4031 #ifdef RE_ENABLE_I18N
4032       if (pstr->mb_cur_max > 1)
4033         build_wcs_buffer (pstr);
4034       else
4035 #endif /* RE_ENABLE_I18N  */
4036         {
4037           if (pstr->trans != NULL)
4038             re_string_translate_buffer (pstr);
4039         }
4040     }
4041   return REG_NOERROR;
4042 }
4043
4044 \f
4045 /* Functions for matching context.  */
4046
4047 /* Initialize MCTX.  */
4048
4049 static reg_errcode_t
4050 match_ctx_init (mctx, eflags, n)
4051     re_match_context_t *mctx;
4052     int eflags, n;
4053 {
4054   mctx->eflags = eflags;
4055   mctx->match_last = -1;
4056   if (n > 0)
4057     {
4058       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4059       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4060       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4061         return REG_ESPACE;
4062     }
4063   /* Already zero-ed by the caller.
4064      else
4065        mctx->bkref_ents = NULL;
4066      mctx->nbkref_ents = 0;
4067      mctx->nsub_tops = 0;  */
4068   mctx->abkref_ents = n;
4069   mctx->max_mb_elem_len = 1;
4070   mctx->asub_tops = n;
4071   return REG_NOERROR;
4072 }
4073
4074 /* Clean the entries which depend on the current input in MCTX.
4075    This function must be invoked when the matcher changes the start index
4076    of the input, or changes the input string.  */
4077
4078 static void
4079 match_ctx_clean (mctx)
4080     re_match_context_t *mctx;
4081 {
4082   match_ctx_free_subtops (mctx);
4083   mctx->nsub_tops = 0;
4084   mctx->nbkref_ents = 0;
4085 }
4086
4087 /* Free all the memory associated with MCTX.  */
4088
4089 static void
4090 match_ctx_free (mctx)
4091     re_match_context_t *mctx;
4092 {
4093   match_ctx_free_subtops (mctx);
4094   re_free (mctx->sub_tops);
4095   re_free (mctx->bkref_ents);
4096 }
4097
4098 /* Free all the memory associated with MCTX->SUB_TOPS.  */
4099
4100 static void
4101 match_ctx_free_subtops (mctx)
4102      re_match_context_t *mctx;
4103 {
4104   int st_idx;
4105   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4106     {
4107       int sl_idx;
4108       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4109       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4110         {
4111           re_sub_match_last_t *last = top->lasts[sl_idx];
4112           re_free (last->path.array);
4113           re_free (last);
4114         }
4115       re_free (top->lasts);
4116       if (top->path)
4117         {
4118           re_free (top->path->array);
4119           re_free (top->path);
4120         }
4121       free (top);
4122     }
4123 }
4124
4125 /* Add a new backreference entry to MCTX.
4126    Note that we assume that caller never call this function with duplicate
4127    entry, and call with STR_IDX which isn't smaller than any existing entry.
4128 */
4129
4130 static reg_errcode_t
4131 match_ctx_add_entry (mctx, node, str_idx, from, to)
4132      re_match_context_t *mctx;
4133      int node, str_idx, from, to;
4134 {
4135   if (mctx->nbkref_ents >= mctx->abkref_ents)
4136     {
4137       struct re_backref_cache_entry* new_entry;
4138       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4139                               mctx->abkref_ents * 2);
4140       if (BE (new_entry == NULL, 0))
4141         {
4142           re_free (mctx->bkref_ents);
4143           return REG_ESPACE;
4144         }
4145       mctx->bkref_ents = new_entry;
4146       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4147               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4148       mctx->abkref_ents *= 2;
4149     }
4150   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4151   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4152   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4153   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4154   mctx->bkref_ents[mctx->nbkref_ents++].flag = 0;
4155   if (mctx->max_mb_elem_len < to - from)
4156     mctx->max_mb_elem_len = to - from;
4157   return REG_NOERROR;
4158 }
4159
4160 /* Search for the first entry which has the same str_idx.
4161    Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4162
4163 static int
4164 search_cur_bkref_entry (mctx, str_idx)
4165      re_match_context_t *mctx;
4166      int str_idx;
4167 {
4168   int left, right, mid;
4169   right = mctx->nbkref_ents;
4170   for (left = 0; left < right;)
4171     {
4172       mid = (left + right) / 2;
4173       if (mctx->bkref_ents[mid].str_idx < str_idx)
4174         left = mid + 1;
4175       else
4176         right = mid;
4177     }
4178   return left;
4179 }
4180
4181 static void
4182 match_ctx_clear_flag (mctx)
4183      re_match_context_t *mctx;
4184 {
4185   int i;
4186   for (i = 0; i < mctx->nbkref_ents; ++i)
4187     mctx->bkref_ents[i].flag = 0;
4188 }
4189
4190 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4191    at STR_IDX.  */
4192
4193 static reg_errcode_t
4194 match_ctx_add_subtop (mctx, node, str_idx)
4195      re_match_context_t *mctx;
4196      int node, str_idx;
4197 {
4198 #ifdef DEBUG
4199   assert (mctx->sub_tops != NULL);
4200   assert (mctx->asub_tops > 0);
4201 #endif
4202   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4203     {
4204       int new_asub_tops = mctx->asub_tops * 2;
4205       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4206                                                    re_sub_match_top_t *,
4207                                                    new_asub_tops);
4208       if (BE (new_array == NULL, 0))
4209         return REG_ESPACE;
4210       mctx->sub_tops = new_array;
4211       mctx->asub_tops = new_asub_tops;
4212     }
4213   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4214   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4215     return REG_ESPACE;
4216   mctx->sub_tops[mctx->nsub_tops]->node = node;
4217   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4218   return REG_NOERROR;
4219 }
4220
4221 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4222    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4223
4224 static re_sub_match_last_t *
4225 match_ctx_add_sublast (subtop, node, str_idx)
4226      re_sub_match_top_t *subtop;
4227      int node, str_idx;
4228 {
4229   re_sub_match_last_t *new_entry;
4230   if (BE (subtop->nlasts == subtop->alasts, 0))
4231     {
4232       int new_alasts = 2 * subtop->alasts + 1;
4233       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4234                                                     re_sub_match_last_t *,
4235                                                     new_alasts);
4236       if (BE (new_array == NULL, 0))
4237         return NULL;
4238       subtop->lasts = new_array;
4239       subtop->alasts = new_alasts;
4240     }
4241   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4242   if (BE (new_entry != NULL, 1))
4243     {
4244       subtop->lasts[subtop->nlasts] = new_entry;
4245       new_entry->node = node;
4246       new_entry->str_idx = str_idx;
4247       ++subtop->nlasts;
4248     }
4249   return new_entry;
4250 }
4251
4252 static void
4253 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx,
4254                check_subexp)
4255     re_sift_context_t *sctx;
4256     re_dfastate_t **sifted_sts, **limited_sts;
4257     int last_node, last_str_idx, check_subexp;
4258 {
4259   sctx->sifted_states = sifted_sts;
4260   sctx->limited_states = limited_sts;
4261   sctx->last_node = last_node;
4262   sctx->last_str_idx = last_str_idx;
4263   sctx->check_subexp = check_subexp;
4264   sctx->cur_bkref = -1;
4265   sctx->cls_subexp_idx = -1;
4266   re_node_set_init_empty (&sctx->limits);
4267 }