posix/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                      int n) internal_function;
  23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24 static void match_ctx_free (re_match_context_t *cache) internal_function;
  25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  26                                           int str_idx, int from, int to)
  27      internal_function;
  28 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
  29      internal_function;
  30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  31                                            int str_idx) internal_function;
  32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  33                                                    int node, int str_idx)
  34      internal_function;
  35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  36                            re_dfastate_t **limited_sts, int last_node,
  37                            int last_str_idx)
  38      internal_function;
  39 static reg_errcode_t re_search_internal (const regex_t *preg,
  40                                          const char *string, int length,
  41                                          int start, int range, int stop,
  42                                          size_t nmatch, regmatch_t pmatch[],
  43                                          int eflags) internal_function;
  44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  45                              const char *string1, int length1,
  46                              const char *string2, int length2,
  47                              int start, int range, struct re_registers *regs,
  48                              int stop, int ret_len) internal_function;
  49 static int re_search_stub (struct re_pattern_buffer *bufp,
  50                            const char *string, int length, int start,
  51                            int range, int stop, struct re_registers *regs,
  52                            int ret_len) internal_function;
  53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  54                               int nregs, int regs_allocated) internal_function;
  55 static inline re_dfastate_t *acquire_init_state_context
  56      (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
  57      __attribute ((always_inline)) internal_function;
  58 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  59      internal_function;
  60 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  61                            int *p_match_first)
  62      internal_function;
  63 static int check_halt_node_context (const re_dfa_t *dfa, int node,
  64                                     unsigned int context) internal_function;
  65 static int check_halt_state_context (const re_match_context_t *mctx,
  66                                      const re_dfastate_t *state, int idx)
  67      internal_function;
  68 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
  69                          regmatch_t *prev_idx_match, int cur_node,
  70                          int cur_idx, int nmatch) internal_function;
  71 static int proceed_next_node (const re_match_context_t *mctx,
  72                               int nregs, regmatch_t *regs,
  73                               int *pidx, int node, re_node_set *eps_via_nodes,
  74                               struct re_fail_stack_t *fs) internal_function;
  75 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  76                                       int str_idx, int *dests, int nregs,
  77                                       regmatch_t *regs,
  78                                       re_node_set *eps_via_nodes) internal_function;
  79 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
  80                            regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
  81 static reg_errcode_t set_regs (const regex_t *preg,
  82                                const re_match_context_t *mctx,
  83                                size_t nmatch, regmatch_t *pmatch,
  84                                int fl_backtrack) internal_function;
  85 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
  86
  87 #ifdef RE_ENABLE_I18N
  88 static int sift_states_iter_mb (const re_match_context_t *mctx,
  89                                 re_sift_context_t *sctx,
  90                                 int node_idx, int str_idx, int max_str_idx) internal_function;
  91 #endif /* RE_ENABLE_I18N */
  92 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
  93                                            re_sift_context_t *sctx) internal_function;
  94 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
  95                                           re_sift_context_t *sctx, int str_idx,
  96                                           re_node_set *cur_dest) internal_function;
  97 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
  98                                               re_sift_context_t *sctx,
  99                                               int str_idx,
 100                                               re_node_set *dest_nodes) internal_function;
 101 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
 102                                             re_node_set *dest_nodes,
 103                                             const re_node_set *candidates) internal_function;
 104 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
 105                                             re_node_set *dest_nodes,
 106                                             const re_node_set *and_nodes) internal_function;
 107 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
 108                              int dst_node, int dst_idx, int src_node,
 109                              int src_idx) internal_function;
 110 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
 111                                         int boundaries, int subexp_idx,
 112                                         int from_node, int bkref_idx) internal_function;
 113 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
 114                                       int limit, int subexp_idx,
 115                                       int node, int str_idx,
 116                                       int bkref_idx) internal_function;
 117 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
 118                                           re_node_set *dest_nodes,
 119                                           const re_node_set *candidates,
 120                                           re_node_set *limits,
 121                                           struct re_backref_cache_entry *bkref_ents,
 122                                           int str_idx) internal_function;
 123 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
 124                                         re_sift_context_t *sctx,
 125                                         int str_idx, const re_node_set *candidates) internal_function;
 126 static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
 127                                                 int next_state_log_idx) internal_function;
 128 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
 129                                         re_dfastate_t **src, int num) internal_function;
 130 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 131                                          re_match_context_t *mctx) internal_function;
 132 static re_dfastate_t *transit_state (reg_errcode_t *err,
 133                                      re_match_context_t *mctx,
 134                                      re_dfastate_t *state) internal_function;
 135 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 136                                             re_match_context_t *mctx,
 137                                             re_dfastate_t *next_state) internal_function;
 138 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 139                                                 re_node_set *cur_nodes,
 140                                                 int str_idx) internal_function;
 141 #if 0
 142 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 143                                         re_match_context_t *mctx,
 144                                         re_dfastate_t *pstate) internal_function;
 145 #endif
 146 #ifdef RE_ENABLE_I18N
 147 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 148                                        re_dfastate_t *pstate) internal_function;
 149 #endif /* RE_ENABLE_I18N */
 150 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 151                                           const re_node_set *nodes) internal_function;
 152 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 153                                  int bkref_node, int bkref_str_idx) internal_function;
 154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 155                                      const re_sub_match_top_t *sub_top,
 156                                      re_sub_match_last_t *sub_last,
 157                                      int bkref_node, int bkref_str) internal_function;
 158 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 159                              int subexp_idx, int type) internal_function;
 160 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 161                                     state_array_t *path, int top_node,
 162                                     int top_str, int last_node, int last_str,
 163                                     int type) internal_function;
 164 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 165                                                    int str_idx,
 166                                                    re_node_set *cur_nodes,
 167                                                    re_node_set *next_nodes) internal_function;
 168 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
 169                                                re_node_set *cur_nodes,
 170                                                int ex_subexp, int type) internal_function;
 171 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
 172                                                    re_node_set *dst_nodes,
 173                                                    int target, int ex_subexp,
 174                                                    int type) internal_function;
 175 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 176                                          re_node_set *cur_nodes, int cur_str,
 177                                          int subexp_num, int type) internal_function;
 178 static re_dfastate_t **build_trtable (re_dfa_t *dfa,
 179                                       re_dfastate_t *state) internal_function;
 180 #ifdef RE_ENABLE_I18N
 181 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
 182                                     const re_string_t *input, int idx) internal_function;
 183 # ifdef _LIBC
 184 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 185                                                    size_t name_len) internal_function;
 186 # endif /* _LIBC */
 187 #endif /* RE_ENABLE_I18N */
 188 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
 189                                        const re_dfastate_t *state,
 190                                        re_node_set *states_node,
 191                                        bitset *states_ch) internal_function;
 192 static int check_node_accept (const re_match_context_t *mctx,
 193                               const re_token_t *node, int idx) internal_function;
 194 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
 195 \f
 196 /* Entry point for POSIX code.  */
 197
 198 /* regexec searches for a given pattern, specified by PREG, in the
 199    string STRING.
 200
 201    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 202    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 203    least NMATCH elements, and we set them to the offsets of the
 204    corresponding matched substrings.
 205
 206    EFLAGS specifies `execution flags' which affect matching: if
 207    REG_NOTBOL is set, then ^ does not match at the beginning of the
 208    string; if REG_NOTEOL is set, then $ does not match at the end.
 209
 210    We return 0 if we find a match and REG_NOMATCH if not.  */
 211
 212 int
 213 regexec (preg, string, nmatch, pmatch, eflags)
 214     const regex_t *__restrict preg;
 215     const char *__restrict string;
 216     size_t nmatch;
 217     regmatch_t pmatch[];
 218     int eflags;
 219 {
 220   reg_errcode_t err;
 221   int start, length;
 222
 223   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 224     return REG_BADPAT;
 225
 226   if (eflags & REG_STARTEND)
 227     {
 228       start = pmatch[0].rm_so;
 229       length = pmatch[0].rm_eo;
 230     }
 231   else
 232     {
 233       start = 0;
 234       length = strlen (string);
 235     }
 236   if (preg->no_sub)
 237     err = re_search_internal (preg, string, length, start, length - start,
 238                               length, 0, NULL, eflags);
 239   else
 240     err = re_search_internal (preg, string, length, start, length - start,
 241                               length, nmatch, pmatch, eflags);
 242   return err != REG_NOERROR;
 243 }
 244
 245 #ifdef _LIBC
 246 # include <shlib-compat.h>
 247 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 248
 249 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 250 __typeof__ (__regexec) __compat_regexec;
 251
 252 int
 253 attribute_compat_text_section
 254 __compat_regexec (const regex_t *__restrict preg,
 255                   const char *__restrict string, size_t nmatch,
 256                   regmatch_t pmatch[], int eflags)
 257 {
 258   return regexec (preg, string, nmatch, pmatch,
 259                   eflags & (REG_NOTBOL | REG_NOTEOL));
 260 }
 261 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 262 # endif
 263 #endif
 264
 265 /* Entry points for GNU code.  */
 266
 267 /* re_match, re_search, re_match_2, re_search_2
 268
 269    The former two functions operate on STRING with length LENGTH,
 270    while the later two operate on concatenation of STRING1 and STRING2
 271    with lengths LENGTH1 and LENGTH2, respectively.
 272
 273    re_match() matches the compiled pattern in BUFP against the string,
 274    starting at index START.
 275
 276    re_search() first tries matching at index START, then it tries to match
 277    starting from index START + 1, and so on.  The last start position tried
 278    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 279    way as re_match().)
 280
 281    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 282    the first STOP characters of the concatenation of the strings should be
 283    concerned.
 284
 285    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 286    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 287    computed relative to the concatenation, not relative to the individual
 288    strings.)
 289
 290    On success, re_match* functions return the length of the match, re_search*
 291    return the position of the start of the match.  Return value -1 means no
 292    match was found and -2 indicates an internal error.  */
 293
 294 int
 295 re_match (bufp, string, length, start, regs)
 296     struct re_pattern_buffer *bufp;
 297     const char *string;
 298     int length, start;
 299     struct re_registers *regs;
 300 {
 301   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 302 }
 303 #ifdef _LIBC
 304 weak_alias (__re_match, re_match)
 305 #endif
 306
 307 int
 308 re_search (bufp, string, length, start, range, regs)
 309     struct re_pattern_buffer *bufp;
 310     const char *string;
 311     int length, start, range;
 312     struct re_registers *regs;
 313 {
 314   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 315 }
 316 #ifdef _LIBC
 317 weak_alias (__re_search, re_search)
 318 #endif
 319
 320 int
 321 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 322     struct re_pattern_buffer *bufp;
 323     const char *string1, *string2;
 324     int length1, length2, start, stop;
 325     struct re_registers *regs;
 326 {
 327   return re_search_2_stub (bufp, string1, length1, string2, length2,
 328                            start, 0, regs, stop, 1);
 329 }
 330 #ifdef _LIBC
 331 weak_alias (__re_match_2, re_match_2)
 332 #endif
 333
 334 int
 335 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 336     struct re_pattern_buffer *bufp;
 337     const char *string1, *string2;
 338     int length1, length2, start, range, stop;
 339     struct re_registers *regs;
 340 {
 341   return re_search_2_stub (bufp, string1, length1, string2, length2,
 342                            start, range, regs, stop, 0);
 343 }
 344 #ifdef _LIBC
 345 weak_alias (__re_search_2, re_search_2)
 346 #endif
 347
 348 static int
 349 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 350                   stop, ret_len)
 351     struct re_pattern_buffer *bufp;
 352     const char *string1, *string2;
 353     int length1, length2, start, range, stop, ret_len;
 354     struct re_registers *regs;
 355 {
 356   const char *str;
 357   int rval;
 358   int len = length1 + length2;
 359   int free_str = 0;
 360
 361   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 362     return -2;
 363
 364   /* Concatenate the strings.  */
 365   if (length2 > 0)
 366     if (length1 > 0)
 367       {
 368         char *s = re_malloc (char, len);
 369
 370         if (BE (s == NULL, 0))
 371           return -2;
 372         memcpy (s, string1, length1);
 373         memcpy (s + length1, string2, length2);
 374         str = s;
 375         free_str = 1;
 376       }
 377     else
 378       str = string2;
 379   else
 380     str = string1;
 381
 382   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 383                          ret_len);
 384   if (free_str)
 385     re_free ((char *) str);
 386   return rval;
 387 }
 388
 389 /* The parameters have the same meaning as those of re_search.
 390    Additional parameters:
 391    If RET_LEN is nonzero the length of the match is returned (re_match style);
 392    otherwise the position of the match is returned.  */
 393
 394 static int
 395 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 396     struct re_pattern_buffer *bufp;
 397     const char *string;
 398     int length, start, range, stop, ret_len;
 399     struct re_registers *regs;
 400 {
 401   reg_errcode_t result;
 402   regmatch_t *pmatch;
 403   int nregs, rval;
 404   int eflags = 0;
 405
 406   /* Check for out-of-range.  */
 407   if (BE (start < 0 || start > length, 0))
 408     return -1;
 409   if (BE (start + range > length, 0))
 410     range = length - start;
 411   else if (BE (start + range < 0, 0))
 412     range = -start;
 413
 414   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 415   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 416
 417   /* Compile fastmap if we haven't yet.  */
 418   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 419     re_compile_fastmap (bufp);
 420
 421   if (BE (bufp->no_sub, 0))
 422     regs = NULL;
 423
 424   /* We need at least 1 register.  */
 425   if (regs == NULL)
 426     nregs = 1;
 427   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 428                regs->num_regs < bufp->re_nsub + 1, 0))
 429     {
 430       nregs = regs->num_regs;
 431       if (BE (nregs < 1, 0))
 432         {
 433           /* Nothing can be copied to regs.  */
 434           regs = NULL;
 435           nregs = 1;
 436         }
 437     }
 438   else
 439     nregs = bufp->re_nsub + 1;
 440   pmatch = re_malloc (regmatch_t, nregs);
 441   if (BE (pmatch == NULL, 0))
 442     return -2;
 443
 444   result = re_search_internal (bufp, string, length, start, range, stop,
 445                                nregs, pmatch, eflags);
 446
 447   rval = 0;
 448
 449   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 450   if (result != REG_NOERROR)
 451     rval = -1;
 452   else if (regs != NULL)
 453     {
 454       /* If caller wants register contents data back, copy them.  */
 455       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 456                                            bufp->regs_allocated);
 457       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 458         rval = -2;
 459     }
 460
 461   if (BE (rval == 0, 1))
 462     {
 463       if (ret_len)
 464         {
 465           assert (pmatch[0].rm_so == start);
 466           rval = pmatch[0].rm_eo - start;
 467         }
 468       else
 469         rval = pmatch[0].rm_so;
 470     }
 471   re_free (pmatch);
 472   return rval;
 473 }
 474
 475 static unsigned
 476 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 477     struct re_registers *regs;
 478     regmatch_t *pmatch;
 479     int nregs, regs_allocated;
 480 {
 481   int rval = REGS_REALLOCATE;
 482   int i;
 483   int need_regs = nregs + 1;
 484   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 485      uses.  */
 486
 487   /* Have the register data arrays been allocated?  */
 488   if (regs_allocated == REGS_UNALLOCATED)
 489     { /* No.  So allocate them with malloc.  */
 490       regs->start = re_malloc (regoff_t, need_regs);
 491       regs->end = re_malloc (regoff_t, need_regs);
 492       if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
 493         return REGS_UNALLOCATED;
 494       regs->num_regs = need_regs;
 495     }
 496   else if (regs_allocated == REGS_REALLOCATE)
 497     { /* Yes.  If we need more elements than were already
 498          allocated, reallocate them.  If we need fewer, just
 499          leave it alone.  */
 500       if (BE (need_regs > regs->num_regs, 0))
 501         {
 502           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 503           regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
 504           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 505             return REGS_UNALLOCATED;
 506           regs->start = new_start;
 507           regs->end = new_end;
 508           regs->num_regs = need_regs;
 509         }
 510     }
 511   else
 512     {
 513       assert (regs_allocated == REGS_FIXED);
 514       /* This function may not be called with REGS_FIXED and nregs too big.  */
 515       assert (regs->num_regs >= nregs);
 516       rval = REGS_FIXED;
 517     }
 518
 519   /* Copy the regs.  */
 520   for (i = 0; i < nregs; ++i)
 521     {
 522       regs->start[i] = pmatch[i].rm_so;
 523       regs->end[i] = pmatch[i].rm_eo;
 524     }
 525   for ( ; i < regs->num_regs; ++i)
 526     regs->start[i] = regs->end[i] = -1;
 527
 528   return rval;
 529 }
 530
 531 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 532    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 533    this memory for recording register information.  STARTS and ENDS
 534    must be allocated using the malloc library routine, and must each
 535    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 536
 537    If NUM_REGS == 0, then subsequent matches should allocate their own
 538    register data.
 539
 540    Unless this function is called, the first search or match using
 541    PATTERN_BUFFER will allocate its own register data, without
 542    freeing the old data.  */
 543
 544 void
 545 re_set_registers (bufp, regs, num_regs, starts, ends)
 546     struct re_pattern_buffer *bufp;
 547     struct re_registers *regs;
 548     unsigned num_regs;
 549     regoff_t *starts, *ends;
 550 {
 551   if (num_regs)
 552     {
 553       bufp->regs_allocated = REGS_REALLOCATE;
 554       regs->num_regs = num_regs;
 555       regs->start = starts;
 556       regs->end = ends;
 557     }
 558   else
 559     {
 560       bufp->regs_allocated = REGS_UNALLOCATED;
 561       regs->num_regs = 0;
 562       regs->start = regs->end = (regoff_t *) 0;
 563     }
 564 }
 565 #ifdef _LIBC
 566 weak_alias (__re_set_registers, re_set_registers)
 567 #endif
 568 \f
 569 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 570    them unless specifically requested.  */
 571
 572 #if defined _REGEX_RE_COMP || defined _LIBC
 573 int
 574 # ifdef _LIBC
 575 weak_function
 576 # endif
 577 re_exec (s)
 578      const char *s;
 579 {
 580   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 581 }
 582 #endif /* _REGEX_RE_COMP */
 583 \f
 584 /* Internal entry point.  */
 585
 586 /* Searches for a compiled pattern PREG in the string STRING, whose
 587    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 588    mingings with regexec.  START, and RANGE have the same meanings
 589    with re_search.
 590    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 591    otherwise return the error code.
 592    Note: We assume front end functions already check ranges.
 593    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 594
 595 static reg_errcode_t
 596 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 597                     eflags)
 598     const regex_t *preg;
 599     const char *string;
 600     int length, start, range, stop, eflags;
 601     size_t nmatch;
 602     regmatch_t pmatch[];
 603 {
 604   reg_errcode_t err;
 605   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
 606   int left_lim, right_lim, incr;
 607   int fl_longest_match, match_first, match_kind, match_last = -1;
 608   int sb, ch;
 609 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 610   re_match_context_t mctx = { .dfa = dfa };
 611 #else
 612   re_match_context_t mctx;
 613 #endif
 614   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 615                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 616   unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate;
 617
 618 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 619   memset (&mctx, '\0', sizeof (re_match_context_t));
 620   mctx.dfa = dfa;
 621 #endif
 622
 623   /* Check if the DFA haven't been compiled.  */
 624   if (BE (preg->used == 0 || dfa->init_state == NULL
 625           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 626           || dfa->init_state_begbuf == NULL, 0))
 627     return REG_NOMATCH;
 628
 629 #ifdef DEBUG
 630   /* We assume front-end functions already check them.  */
 631   assert (start + range >= 0 && start + range <= length);
 632 #endif
 633
 634   /* If initial states with non-begbuf contexts have no elements,
 635      the regex must be anchored.  If preg->newline_anchor is set,
 636      we'll never use init_state_nl, so do not check it.  */
 637   if (dfa->init_state->nodes.nelem == 0
 638       && dfa->init_state_word->nodes.nelem == 0
 639       && (dfa->init_state_nl->nodes.nelem == 0
 640           || !preg->newline_anchor))
 641     {
 642       if (start != 0 && start + range != 0)
 643         return REG_NOMATCH;
 644       start = range = 0;
 645     }
 646
 647   /* We must check the longest matching, if nmatch > 0.  */
 648   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 649
 650   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 651                             preg->translate, preg->syntax & RE_ICASE, dfa);
 652   if (BE (err != REG_NOERROR, 0))
 653     goto free_return;
 654   mctx.input.stop = stop;
 655   mctx.input.raw_stop = stop;
 656   mctx.input.newline_anchor = preg->newline_anchor;
 657
 658   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 659   if (BE (err != REG_NOERROR, 0))
 660     goto free_return;
 661
 662   /* We will log all the DFA states through which the dfa pass,
 663      if nmatch > 1, or this dfa has "multibyte node", which is a
 664      back-reference or a node which can accept multibyte character or
 665      multi character collating element.  */
 666   if (nmatch > 1 || dfa->has_mb_node)
 667     {
 668       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 669       if (BE (mctx.state_log == NULL, 0))
 670         {
 671           err = REG_ESPACE;
 672           goto free_return;
 673         }
 674     }
 675   else
 676     mctx.state_log = NULL;
 677
 678   match_first = start;
 679   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 680                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 681
 682   /* Check incrementally whether of not the input string match.  */
 683   incr = (range < 0) ? -1 : 1;
 684   left_lim = (range < 0) ? start + range : start;
 685   right_lim = (range < 0) ? start : start + range;
 686   sb = dfa->mb_cur_max == 1;
 687   match_kind =
 688     (fastmap
 689      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 690         | (range >= 0 ? 2 : 0)
 691         | (t != NULL ? 1 : 0))
 692      : 8);
 693
 694   for (;; match_first += incr)
 695     {
 696       err = REG_NOMATCH;
 697       if (match_first < left_lim || right_lim < match_first)
 698         goto free_return;
 699
 700       /* Advance as rapidly as possible through the string, until we
 701          find a plausible place to start matching.  This may be done
 702          with varying efficiency, so there are various possibilities:
 703          only the most common of them are specialized, in order to
 704          save on code size.  We use a switch statement for speed.  */
 705       switch (match_kind)
 706         {
 707         case 8:
 708           /* No fastmap.  */
 709           break;
 710
 711         case 7:
 712           /* Fastmap with single-byte translation, match forward.  */
 713           while (BE (match_first < right_lim, 1)
 714                  && !fastmap[t[(unsigned char) string[match_first]]])
 715             ++match_first;
 716           goto forward_match_found_start_or_reached_end;
 717
 718         case 6:
 719           /* Fastmap without translation, match forward.  */
 720           while (BE (match_first < right_lim, 1)
 721                  && !fastmap[(unsigned char) string[match_first]])
 722             ++match_first;
 723
 724         forward_match_found_start_or_reached_end:
 725           if (BE (match_first == right_lim, 0))
 726             {
 727               ch = match_first >= length
 728                        ? 0 : (unsigned char) string[match_first];
 729               if (!fastmap[t ? t[ch] : ch])
 730                 goto free_return;
 731             }
 732           break;
 733
 734         case 4:
 735         case 5:
 736           /* Fastmap without multi-byte translation, match backwards.  */
 737           while (match_first >= left_lim)
 738             {
 739               ch = match_first >= length
 740                        ? 0 : (unsigned char) string[match_first];
 741               if (fastmap[t ? t[ch] : ch])
 742                 break;
 743               --match_first;
 744             }
 745           if (match_first < left_lim)
 746             goto free_return;
 747           break;
 748
 749         default:
 750           /* In this case, we can't determine easily the current byte,
 751              since it might be a component byte of a multibyte
 752              character.  Then we use the constructed buffer instead.  */
 753           for (;;)
 754             {
 755               /* If MATCH_FIRST is out of the valid range, reconstruct the
 756                  buffers.  */
 757               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 758               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 759                 {
 760                   err = re_string_reconstruct (&mctx.input, match_first,
 761                                                eflags);
 762                   if (BE (err != REG_NOERROR, 0))
 763                     goto free_return;
 764
 765                   offset = match_first - mctx.input.raw_mbs_idx;
 766                 }
 767               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 768                  Note that MATCH_FIRST must not be smaller than 0.  */
 769               ch = (match_first >= length
 770                     ? 0 : re_string_byte_at (&mctx.input, offset));
 771               if (fastmap[ch])
 772                 break;
 773               match_first += incr;
 774               if (match_first < left_lim || match_first > right_lim)
 775                 {
 776                   err = REG_NOMATCH;
 777                   goto free_return;
 778                 }
 779             }
 780           break;
 781         }
 782
 783       /* Reconstruct the buffers so that the matcher can assume that
 784          the matching starts from the beginning of the buffer.  */
 785       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 786       if (BE (err != REG_NOERROR, 0))
 787         goto free_return;
 788
 789 #ifdef RE_ENABLE_I18N
 790      /* Don't consider this char as a possible match start if it part,
 791         yet isn't the head, of a multibyte character.  */
 792       if (!sb && !re_string_first_byte (&mctx.input, 0))
 793         continue;
 794 #endif
 795
 796       /* It seems to be appropriate one, then use the matcher.  */
 797       /* We assume that the matching starts from 0.  */
 798       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 799       match_last = check_matching (&mctx, fl_longest_match,
 800                                    range >= 0 ? &match_first : NULL);
 801       if (match_last != -1)
 802         {
 803           if (BE (match_last == -2, 0))
 804             {
 805               err = REG_ESPACE;
 806               goto free_return;
 807             }
 808           else
 809             {
 810               mctx.match_last = match_last;
 811               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 812                 {
 813                   re_dfastate_t *pstate = mctx.state_log[match_last];
 814                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 815                                                              match_last);
 816                 }
 817               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 818                   || dfa->nbackref)
 819                 {
 820                   err = prune_impossible_nodes (&mctx);
 821                   if (err == REG_NOERROR)
 822                     break;
 823                   if (BE (err != REG_NOMATCH, 0))
 824                     goto free_return;
 825                   match_last = -1;
 826                 }
 827               else
 828                 break; /* We found a match.  */
 829             }
 830         }
 831
 832       match_ctx_clean (&mctx);
 833     }
 834
 835 #ifdef DEBUG
 836   assert (match_last != -1);
 837   assert (err == REG_NOERROR);
 838 #endif
 839
 840   /* Set pmatch[] if we need.  */
 841   if (nmatch > 0)
 842     {
 843       int reg_idx;
 844
 845       /* Initialize registers.  */
 846       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 847         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 848
 849       /* Set the points where matching start/end.  */
 850       pmatch[0].rm_so = 0;
 851       pmatch[0].rm_eo = mctx.match_last;
 852
 853       if (!preg->no_sub && nmatch > 1)
 854         {
 855           err = set_regs (preg, &mctx, nmatch, pmatch,
 856                           dfa->has_plural_match && dfa->nbackref > 0);
 857           if (BE (err != REG_NOERROR, 0))
 858             goto free_return;
 859         }
 860
 861       /* At last, add the offset to the each registers, since we slided
 862          the buffers so that we could assume that the matching starts
 863          from 0.  */
 864       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 865         if (pmatch[reg_idx].rm_so != -1)
 866           {
 867 #ifdef RE_ENABLE_I18N
 868             if (BE (mctx.input.offsets_needed != 0, 0))
 869               {
 870                 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
 871                   pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
 872                 else
 873                   pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
 874                 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
 875                   pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
 876                 else
 877                   pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
 878               }
 879 #else
 880             assert (mctx.input.offsets_needed == 0);
 881 #endif
 882             pmatch[reg_idx].rm_so += match_first;
 883             pmatch[reg_idx].rm_eo += match_first;
 884           }
 885
 886       if (dfa->subexp_map)
 887         for (reg_idx = 0;
 888              reg_idx + 1 < nmatch && reg_idx < preg->re_nsub;
 889              reg_idx++)
 890           if (dfa->subexp_map[reg_idx] != reg_idx)
 891             {
 892               pmatch[reg_idx + 1].rm_so
 893                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 894               pmatch[reg_idx + 1].rm_eo
 895                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 896             }
 897     }
 898
 899  free_return:
 900   re_free (mctx.state_log);
 901   if (dfa->nbackref)
 902     match_ctx_free (&mctx);
 903   re_string_destruct (&mctx.input);
 904   return err;
 905 }
 906
 907 static reg_errcode_t
 908 prune_impossible_nodes (mctx)
 909      re_match_context_t *mctx;
 910 {
 911   re_dfa_t *const dfa = mctx->dfa;
 912   int halt_node, match_last;
 913   reg_errcode_t ret;
 914   re_dfastate_t **sifted_states;
 915   re_dfastate_t **lim_states = NULL;
 916   re_sift_context_t sctx;
 917 #ifdef DEBUG
 918   assert (mctx->state_log != NULL);
 919 #endif
 920   match_last = mctx->match_last;
 921   halt_node = mctx->last_node;
 922   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 923   if (BE (sifted_states == NULL, 0))
 924     {
 925       ret = REG_ESPACE;
 926       goto free_return;
 927     }
 928   if (dfa->nbackref)
 929     {
 930       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 931       if (BE (lim_states == NULL, 0))
 932         {
 933           ret = REG_ESPACE;
 934           goto free_return;
 935         }
 936       while (1)
 937         {
 938           memset (lim_states, '\0',
 939                   sizeof (re_dfastate_t *) * (match_last + 1));
 940           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 941                          match_last);
 942           ret = sift_states_backward (mctx, &sctx);
 943           re_node_set_free (&sctx.limits);
 944           if (BE (ret != REG_NOERROR, 0))
 945               goto free_return;
 946           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 947             break;
 948           do
 949             {
 950               --match_last;
 951               if (match_last < 0)
 952                 {
 953                   ret = REG_NOMATCH;
 954                   goto free_return;
 955                 }
 956             } while (mctx->state_log[match_last] == NULL
 957                      || !mctx->state_log[match_last]->halt);
 958           halt_node = check_halt_state_context (mctx,
 959                                                 mctx->state_log[match_last],
 960                                                 match_last);
 961         }
 962       ret = merge_state_array (dfa, sifted_states, lim_states,
 963                                match_last + 1);
 964       re_free (lim_states);
 965       lim_states = NULL;
 966       if (BE (ret != REG_NOERROR, 0))
 967         goto free_return;
 968     }
 969   else
 970     {
 971       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
 972       ret = sift_states_backward (mctx, &sctx);
 973       re_node_set_free (&sctx.limits);
 974       if (BE (ret != REG_NOERROR, 0))
 975         goto free_return;
 976     }
 977   re_free (mctx->state_log);
 978   mctx->state_log = sifted_states;
 979   sifted_states = NULL;
 980   mctx->last_node = halt_node;
 981   mctx->match_last = match_last;
 982   ret = REG_NOERROR;
 983  free_return:
 984   re_free (sifted_states);
 985   re_free (lim_states);
 986   return ret;
 987 }
 988
 989 /* Acquire an initial state and return it.
 990    We must select appropriate initial state depending on the context,
 991    since initial states may have constraints like "\<", "^", etc..  */
 992
 993 static inline re_dfastate_t *
 994 acquire_init_state_context (err, mctx, idx)
 995      reg_errcode_t *err;
 996      const re_match_context_t *mctx;
 997      int idx;
 998 {
 999   re_dfa_t *const dfa = mctx->dfa;
1000   if (dfa->init_state->has_constraint)
1001     {
1002       unsigned int context;
1003       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1004       if (IS_WORD_CONTEXT (context))
1005         return dfa->init_state_word;
1006       else if (IS_ORDINARY_CONTEXT (context))
1007         return dfa->init_state;
1008       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1009         return dfa->init_state_begbuf;
1010       else if (IS_NEWLINE_CONTEXT (context))
1011         return dfa->init_state_nl;
1012       else if (IS_BEGBUF_CONTEXT (context))
1013         {
1014           /* It is relatively rare case, then calculate on demand.  */
1015           return re_acquire_state_context (err, dfa,
1016                                            dfa->init_state->entrance_nodes,
1017                                            context);
1018         }
1019       else
1020         /* Must not happen?  */
1021         return dfa->init_state;
1022     }
1023   else
1024     return dfa->init_state;
1025 }
1026
1027 /* Check whether the regular expression match input string INPUT or not,
1028    and return the index where the matching end, return -1 if not match,
1029    or return -2 in case of an error.
1030    FL_LONGEST_MATCH means we want the POSIX longest matching.
1031    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1032    next place where we may want to try matching.
1033    Note that the matcher assume that the maching starts from the current
1034    index of the buffer.  */
1035
1036 static int
1037 check_matching (mctx, fl_longest_match, p_match_first)
1038     re_match_context_t *mctx;
1039     int fl_longest_match;
1040     int *p_match_first;
1041 {
1042   re_dfa_t *const dfa = mctx->dfa;
1043   reg_errcode_t err;
1044   int match = 0;
1045   int match_last = -1;
1046   int cur_str_idx = re_string_cur_idx (&mctx->input);
1047   re_dfastate_t *cur_state;
1048   int at_init_state = p_match_first != NULL;
1049   int next_start_idx = cur_str_idx;
1050
1051   err = REG_NOERROR;
1052   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1053   /* An initial state must not be NULL (invalid).  */
1054   if (BE (cur_state == NULL, 0))
1055     {
1056       assert (err == REG_ESPACE);
1057       return -2;
1058     }
1059
1060   if (mctx->state_log != NULL)
1061     {
1062       mctx->state_log[cur_str_idx] = cur_state;
1063
1064       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1065          later.  E.g. Processing back references.  */
1066       if (BE (dfa->nbackref, 0))
1067         {
1068           at_init_state = 0;
1069           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1070           if (BE (err != REG_NOERROR, 0))
1071             return err;
1072
1073           if (cur_state->has_backref)
1074             {
1075               err = transit_state_bkref (mctx, &cur_state->nodes);
1076               if (BE (err != REG_NOERROR, 0))
1077                 return err;
1078             }
1079         }
1080     }
1081
1082   /* If the RE accepts NULL string.  */
1083   if (BE (cur_state->halt, 0))
1084     {
1085       if (!cur_state->has_constraint
1086           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1087         {
1088           if (!fl_longest_match)
1089             return cur_str_idx;
1090           else
1091             {
1092               match_last = cur_str_idx;
1093               match = 1;
1094             }
1095         }
1096     }
1097
1098   while (!re_string_eoi (&mctx->input))
1099     {
1100       re_dfastate_t *old_state = cur_state;
1101       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1102
1103       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1104           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1105               && mctx->input.valid_len < mctx->input.len))
1106         {
1107           err = extend_buffers (mctx);
1108           if (BE (err != REG_NOERROR, 0))
1109             {
1110               assert (err == REG_ESPACE);
1111               return -2;
1112             }
1113         }
1114
1115       cur_state = transit_state (&err, mctx, cur_state);
1116       if (mctx->state_log != NULL)
1117         cur_state = merge_state_with_log (&err, mctx, cur_state);
1118
1119       if (cur_state == NULL)
1120         {
1121           /* Reached the invalid state or an error.  Try to recover a valid
1122              state using the state log, if available and if we have not
1123              already found a valid (even if not the longest) match.  */
1124           if (BE (err != REG_NOERROR, 0))
1125             return -2;
1126
1127           if (mctx->state_log == NULL
1128               || (match && !fl_longest_match)
1129               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1130             break;
1131         }
1132
1133       if (BE (at_init_state, 0))
1134         {
1135           if (old_state == cur_state)
1136             next_start_idx = next_char_idx;
1137           else
1138             at_init_state = 0;
1139         }
1140
1141       if (cur_state->halt)
1142         {
1143           /* Reached a halt state.
1144              Check the halt state can satisfy the current context.  */
1145           if (!cur_state->has_constraint
1146               || check_halt_state_context (mctx, cur_state,
1147                                            re_string_cur_idx (&mctx->input)))
1148             {
1149               /* We found an appropriate halt state.  */
1150               match_last = re_string_cur_idx (&mctx->input);
1151               match = 1;
1152
1153               /* We found a match, do not modify match_first below.  */
1154               p_match_first = NULL;
1155               if (!fl_longest_match)
1156                 break;
1157             }
1158         }
1159     }
1160
1161   if (p_match_first)
1162     *p_match_first += next_start_idx;
1163
1164   return match_last;
1165 }
1166
1167 /* Check NODE match the current context.  */
1168
1169 static int check_halt_node_context (dfa, node, context)
1170     const re_dfa_t *dfa;
1171     int node;
1172     unsigned int context;
1173 {
1174   re_token_type_t type = dfa->nodes[node].type;
1175   unsigned int constraint = dfa->nodes[node].constraint;
1176   if (type != END_OF_RE)
1177     return 0;
1178   if (!constraint)
1179     return 1;
1180   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1181     return 0;
1182   return 1;
1183 }
1184
1185 /* Check the halt state STATE match the current context.
1186    Return 0 if not match, if the node, STATE has, is a halt node and
1187    match the context, return the node.  */
1188
1189 static int
1190 check_halt_state_context (mctx, state, idx)
1191     const re_match_context_t *mctx;
1192     const re_dfastate_t *state;
1193     int idx;
1194 {
1195   int i;
1196   unsigned int context;
1197 #ifdef DEBUG
1198   assert (state->halt);
1199 #endif
1200   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1201   for (i = 0; i < state->nodes.nelem; ++i)
1202     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1203       return state->nodes.elems[i];
1204   return 0;
1205 }
1206
1207 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1208    corresponding to the DFA).
1209    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1210    of errors.  */
1211
1212 static int
1213 proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
1214     const re_match_context_t *mctx;
1215     regmatch_t *regs;
1216     int nregs, *pidx, node;
1217     re_node_set *eps_via_nodes;
1218     struct re_fail_stack_t *fs;
1219 {
1220   re_dfa_t *const dfa = mctx->dfa;
1221   int i, err, dest_node;
1222   dest_node = -1;
1223   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1224     {
1225       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1226       int ndest, dest_nodes[2];
1227       err = re_node_set_insert (eps_via_nodes, node);
1228       if (BE (err < 0, 0))
1229         return -2;
1230       /* Pick up valid destinations.  */
1231       for (ndest = 0, i = 0; i < dfa->edests[node].nelem; ++i)
1232         {
1233           int candidate = dfa->edests[node].elems[i];
1234           if (!re_node_set_contains (cur_nodes, candidate))
1235             continue;
1236           dest_nodes[0] = (ndest == 0) ? candidate : dest_nodes[0];
1237           dest_nodes[1] = (ndest == 1) ? candidate : dest_nodes[1];
1238           ++ndest;
1239         }
1240       if (ndest <= 1)
1241         return ndest == 0 ? -1 : (ndest == 1 ? dest_nodes[0] : 0);
1242       /* In order to avoid infinite loop like "(a*)*".  */
1243       if (re_node_set_contains (eps_via_nodes, dest_nodes[0]))
1244         return dest_nodes[1];
1245       if (fs != NULL
1246           && push_fail_stack (fs, *pidx, dest_nodes, nregs, regs,
1247                               eps_via_nodes))
1248         return -2;
1249       return dest_nodes[0];
1250     }
1251   else
1252     {
1253       int naccepted = 0;
1254       re_token_type_t type = dfa->nodes[node].type;
1255
1256 #ifdef RE_ENABLE_I18N
1257       if (ACCEPT_MB_NODE (type))
1258         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1259       else
1260 #endif /* RE_ENABLE_I18N */
1261       if (type == OP_BACK_REF)
1262         {
1263           int subexp_idx = dfa->nodes[node].opr.idx;
1264           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1265           if (fs != NULL)
1266             {
1267               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1268                 return -1;
1269               else if (naccepted)
1270                 {
1271                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1272                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1273                               naccepted) != 0)
1274                     return -1;
1275                 }
1276             }
1277
1278           if (naccepted == 0)
1279             {
1280               err = re_node_set_insert (eps_via_nodes, node);
1281               if (BE (err < 0, 0))
1282                 return -2;
1283               dest_node = dfa->edests[node].elems[0];
1284               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1285                                         dest_node))
1286                 return dest_node;
1287             }
1288         }
1289
1290       if (naccepted != 0
1291           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1292         {
1293           dest_node = dfa->nexts[node];
1294           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1295           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1296                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1297                                                dest_node)))
1298             return -1;
1299           re_node_set_empty (eps_via_nodes);
1300           return dest_node;
1301         }
1302     }
1303   return -1;
1304 }
1305
1306 static reg_errcode_t
1307 push_fail_stack (fs, str_idx, dests, nregs, regs, eps_via_nodes)
1308      struct re_fail_stack_t *fs;
1309      int str_idx, *dests, nregs;
1310      regmatch_t *regs;
1311      re_node_set *eps_via_nodes;
1312 {
1313   reg_errcode_t err;
1314   int num = fs->num++;
1315   if (fs->num == fs->alloc)
1316     {
1317       struct re_fail_stack_ent_t *new_array;
1318       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1319                                        * fs->alloc * 2));
1320       if (new_array == NULL)
1321         return REG_ESPACE;
1322       fs->alloc *= 2;
1323       fs->stack = new_array;
1324     }
1325   fs->stack[num].idx = str_idx;
1326   fs->stack[num].node = dests[1];
1327   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1328   if (fs->stack[num].regs == NULL)
1329     return REG_ESPACE;
1330   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1331   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1332   return err;
1333 }
1334
1335 static int
1336 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1337      struct re_fail_stack_t *fs;
1338      int *pidx, nregs;
1339      regmatch_t *regs;
1340      re_node_set *eps_via_nodes;
1341 {
1342   int num = --fs->num;
1343   assert (num >= 0);
1344   *pidx = fs->stack[num].idx;
1345   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1346   re_node_set_free (eps_via_nodes);
1347   re_free (fs->stack[num].regs);
1348   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1349   return fs->stack[num].node;
1350 }
1351
1352 /* Set the positions where the subexpressions are starts/ends to registers
1353    PMATCH.
1354    Note: We assume that pmatch[0] is already set, and
1355    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1356
1357 static reg_errcode_t
1358 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1359      const regex_t *preg;
1360      const re_match_context_t *mctx;
1361      size_t nmatch;
1362      regmatch_t *pmatch;
1363      int fl_backtrack;
1364 {
1365   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1366   int idx, cur_node, real_nmatch;
1367   re_node_set eps_via_nodes;
1368   struct re_fail_stack_t *fs;
1369   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1370   regmatch_t *prev_idx_match;
1371
1372 #ifdef DEBUG
1373   assert (nmatch > 1);
1374   assert (mctx->state_log != NULL);
1375 #endif
1376   if (fl_backtrack)
1377     {
1378       fs = &fs_body;
1379       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1380       if (fs->stack == NULL)
1381         return REG_ESPACE;
1382     }
1383   else
1384     fs = NULL;
1385
1386   cur_node = dfa->init_node;
1387   real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1388   re_node_set_init_empty (&eps_via_nodes);
1389
1390   prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * real_nmatch);
1391   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * real_nmatch);
1392
1393   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1394     {
1395       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, real_nmatch);
1396
1397       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1398         {
1399           int reg_idx;
1400           if (fs)
1401             {
1402               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1403                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1404                   break;
1405               if (reg_idx == nmatch)
1406                 {
1407                   re_node_set_free (&eps_via_nodes);
1408                   return free_fail_stack_return (fs);
1409                 }
1410               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1411                                          &eps_via_nodes);
1412             }
1413           else
1414             {
1415               re_node_set_free (&eps_via_nodes);
1416               return REG_NOERROR;
1417             }
1418         }
1419
1420       /* Proceed to next node.  */
1421       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1422                                     &eps_via_nodes, fs);
1423
1424       if (BE (cur_node < 0, 0))
1425         {
1426           if (BE (cur_node == -2, 0))
1427             {
1428               re_node_set_free (&eps_via_nodes);
1429               free_fail_stack_return (fs);
1430               return REG_ESPACE;
1431             }
1432           if (fs)
1433             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1434                                        &eps_via_nodes);
1435           else
1436             {
1437               re_node_set_free (&eps_via_nodes);
1438               return REG_NOMATCH;
1439             }
1440         }
1441     }
1442   re_node_set_free (&eps_via_nodes);
1443   return free_fail_stack_return (fs);
1444 }
1445
1446 static reg_errcode_t
1447 free_fail_stack_return (fs)
1448      struct re_fail_stack_t *fs;
1449 {
1450   if (fs)
1451     {
1452       int fs_idx;
1453       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1454         {
1455           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1456           re_free (fs->stack[fs_idx].regs);
1457         }
1458       re_free (fs->stack);
1459     }
1460   return REG_NOERROR;
1461 }
1462
1463 static void
1464 update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
1465      re_dfa_t *dfa;
1466      regmatch_t *pmatch, *prev_idx_match;
1467      int cur_node, cur_idx, nmatch;
1468 {
1469   int type = dfa->nodes[cur_node].type;
1470   if (type == OP_OPEN_SUBEXP)
1471     {
1472       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1473
1474       /* We are at the first node of this sub expression.  */
1475       if (reg_num < nmatch)
1476         {
1477           pmatch[reg_num].rm_so = cur_idx;
1478           pmatch[reg_num].rm_eo = -1;
1479         }
1480     }
1481   else if (type == OP_CLOSE_SUBEXP)
1482     {
1483       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1484       if (reg_num < nmatch)
1485         {
1486           /* We are at the last node of this sub expression.  */
1487           if (pmatch[reg_num].rm_so < cur_idx)
1488             {
1489               pmatch[reg_num].rm_eo = cur_idx;
1490               /* This is a non-empty match or we are not inside an optional
1491                  subexpression.  Accept this right away.  */
1492               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1493             }
1494           else
1495             {
1496               if (dfa->nodes[cur_node].opt_subexp
1497                   && prev_idx_match[reg_num].rm_so != -1)
1498                 /* We transited through an empty match for an optional
1499                    subexpression, like (a?)*, and this is not the subexp's
1500                    first match.  Copy back the old content of the registers
1501                    so that matches of an inner subexpression are undone as
1502                    well, like in ((a?))*.  */
1503                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1504               else
1505                 /* We completed a subexpression, but it may be part of
1506                    an optional one, so do not update PREV_IDX_MATCH.  */
1507                 pmatch[reg_num].rm_eo = cur_idx;
1508             }
1509         }
1510     }
1511 }
1512
1513 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1514    and sift the nodes in each states according to the following rules.
1515    Updated state_log will be wrote to STATE_LOG.
1516
1517    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1518      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1519         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1520         the LAST_NODE, we throw away the node `a'.
1521      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1522         string `s' and transit to `b':
1523         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1524            away the node `a'.
1525         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1526             thrown away, we throw away the node `a'.
1527      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1528         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1529            node `a'.
1530         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1531             we throw away the node `a'.  */
1532
1533 #define STATE_NODE_CONTAINS(state,node) \
1534   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1535
1536 static reg_errcode_t
1537 sift_states_backward (mctx, sctx)
1538      re_match_context_t *mctx;
1539      re_sift_context_t *sctx;
1540 {
1541   reg_errcode_t err;
1542   int null_cnt = 0;
1543   int str_idx = sctx->last_str_idx;
1544   re_node_set cur_dest;
1545
1546 #ifdef DEBUG
1547   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1548 #endif
1549
1550   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1551      transit to the last_node and the last_node itself.  */
1552   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1553   if (BE (err != REG_NOERROR, 0))
1554     return err;
1555   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1556   if (BE (err != REG_NOERROR, 0))
1557     goto free_return;
1558
1559   /* Then check each states in the state_log.  */
1560   while (str_idx > 0)
1561     {
1562       /* Update counters.  */
1563       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1564       if (null_cnt > mctx->max_mb_elem_len)
1565         {
1566           memset (sctx->sifted_states, '\0',
1567                   sizeof (re_dfastate_t *) * str_idx);
1568           re_node_set_free (&cur_dest);
1569           return REG_NOERROR;
1570         }
1571       re_node_set_empty (&cur_dest);
1572       --str_idx;
1573
1574       if (mctx->state_log[str_idx])
1575         {
1576           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1577           if (BE (err != REG_NOERROR, 0))
1578             goto free_return;
1579         }
1580
1581       /* Add all the nodes which satisfy the following conditions:
1582          - It can epsilon transit to a node in CUR_DEST.
1583          - It is in CUR_SRC.
1584          And update state_log.  */
1585       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1586       if (BE (err != REG_NOERROR, 0))
1587         goto free_return;
1588     }
1589   err = REG_NOERROR;
1590  free_return:
1591   re_node_set_free (&cur_dest);
1592   return err;
1593 }
1594
1595 static reg_errcode_t
1596 build_sifted_states (mctx, sctx, str_idx, cur_dest)
1597      re_match_context_t *mctx;
1598      re_sift_context_t *sctx;
1599      int str_idx;
1600      re_node_set *cur_dest;
1601 {
1602   re_dfa_t *const dfa = mctx->dfa;
1603   re_node_set *cur_src = &mctx->state_log[str_idx]->nodes;
1604   int i;
1605
1606   /* Then build the next sifted state.
1607      We build the next sifted state on `cur_dest', and update
1608      `sifted_states[str_idx]' with `cur_dest'.
1609      Note:
1610      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1611      `cur_src' points the node_set of the old `state_log[str_idx]'.  */
1612   for (i = 0; i < cur_src->nelem; i++)
1613     {
1614       int prev_node = cur_src->elems[i];
1615       int naccepted = 0;
1616       re_token_type_t type = dfa->nodes[prev_node].type;
1617       int ret;
1618
1619       if (IS_EPSILON_NODE (type))
1620         continue;
1621 #ifdef RE_ENABLE_I18N
1622       /* If the node may accept `multi byte'.  */
1623       if (ACCEPT_MB_NODE (type))
1624         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1625                                          str_idx, sctx->last_str_idx);
1626 #endif /* RE_ENABLE_I18N */
1627
1628       /* We don't check backreferences here.
1629          See update_cur_sifted_state().  */
1630       if (!naccepted
1631           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1632           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1633                                   dfa->nexts[prev_node]))
1634         naccepted = 1;
1635
1636       if (naccepted == 0)
1637         continue;
1638
1639       if (sctx->limits.nelem)
1640         {
1641           int to_idx = str_idx + naccepted;
1642           if (check_dst_limits (mctx, &sctx->limits,
1643                                 dfa->nexts[prev_node], to_idx,
1644                                 prev_node, str_idx))
1645             continue;
1646         }
1647       ret = re_node_set_insert (cur_dest, prev_node);
1648       if (BE (ret == -1, 0))
1649         return REG_ESPACE;
1650     }
1651
1652   return REG_NOERROR;
1653 }
1654
1655 /* Helper functions.  */
1656
1657 static reg_errcode_t
1658 clean_state_log_if_needed (mctx, next_state_log_idx)
1659     re_match_context_t *mctx;
1660     int next_state_log_idx;
1661 {
1662   int top = mctx->state_log_top;
1663
1664   if (next_state_log_idx >= mctx->input.bufs_len
1665       || (next_state_log_idx >= mctx->input.valid_len
1666           && mctx->input.valid_len < mctx->input.len))
1667     {
1668       reg_errcode_t err;
1669       err = extend_buffers (mctx);
1670       if (BE (err != REG_NOERROR, 0))
1671         return err;
1672     }
1673
1674   if (top < next_state_log_idx)
1675     {
1676       memset (mctx->state_log + top + 1, '\0',
1677               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1678       mctx->state_log_top = next_state_log_idx;
1679     }
1680   return REG_NOERROR;
1681 }
1682
1683 static reg_errcode_t
1684 merge_state_array (dfa, dst, src, num)
1685      re_dfa_t *dfa;
1686      re_dfastate_t **dst;
1687      re_dfastate_t **src;
1688      int num;
1689 {
1690   int st_idx;
1691   reg_errcode_t err;
1692   for (st_idx = 0; st_idx < num; ++st_idx)
1693     {
1694       if (dst[st_idx] == NULL)
1695         dst[st_idx] = src[st_idx];
1696       else if (src[st_idx] != NULL)
1697         {
1698           re_node_set merged_set;
1699           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1700                                         &src[st_idx]->nodes);
1701           if (BE (err != REG_NOERROR, 0))
1702             return err;
1703           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1704           re_node_set_free (&merged_set);
1705           if (BE (err != REG_NOERROR, 0))
1706             return err;
1707         }
1708     }
1709   return REG_NOERROR;
1710 }
1711
1712 static reg_errcode_t
1713 update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
1714      re_match_context_t *mctx;
1715      re_sift_context_t *sctx;
1716      int str_idx;
1717      re_node_set *dest_nodes;
1718 {
1719   re_dfa_t *const dfa = mctx->dfa;
1720   reg_errcode_t err;
1721   const re_node_set *candidates;
1722   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1723                 : &mctx->state_log[str_idx]->nodes);
1724
1725   if (dest_nodes->nelem == 0)
1726     sctx->sifted_states[str_idx] = NULL;
1727   else
1728     {
1729       if (candidates)
1730         {
1731           /* At first, add the nodes which can epsilon transit to a node in
1732              DEST_NODE.  */
1733           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1734           if (BE (err != REG_NOERROR, 0))
1735             return err;
1736
1737           /* Then, check the limitations in the current sift_context.  */
1738           if (sctx->limits.nelem)
1739             {
1740               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1741                                          mctx->bkref_ents, str_idx);
1742               if (BE (err != REG_NOERROR, 0))
1743                 return err;
1744             }
1745         }
1746
1747       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1748       if (BE (err != REG_NOERROR, 0))
1749         return err;
1750     }
1751
1752   if (candidates && mctx->state_log[str_idx]->has_backref)
1753     {
1754       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1755       if (BE (err != REG_NOERROR, 0))
1756         return err;
1757     }
1758   return REG_NOERROR;
1759 }
1760
1761 static reg_errcode_t
1762 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1763      re_dfa_t *dfa;
1764      re_node_set *dest_nodes;
1765      const re_node_set *candidates;
1766 {
1767   reg_errcode_t err;
1768   int src_idx;
1769   re_node_set src_copy;
1770
1771   err = re_node_set_init_copy (&src_copy, dest_nodes);
1772   if (BE (err != REG_NOERROR, 0))
1773     return err;
1774   for (src_idx = 0; src_idx < src_copy.nelem; ++src_idx)
1775     {
1776       err = re_node_set_add_intersect (dest_nodes, candidates,
1777                                        dfa->inveclosures
1778                                        + src_copy.elems[src_idx]);
1779       if (BE (err != REG_NOERROR, 0))
1780         {
1781           re_node_set_free (&src_copy);
1782           return err;
1783         }
1784     }
1785   re_node_set_free (&src_copy);
1786   return REG_NOERROR;
1787 }
1788
1789 static reg_errcode_t
1790 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1791      re_dfa_t *dfa;
1792      int node;
1793      re_node_set *dest_nodes;
1794      const re_node_set *candidates;
1795 {
1796     int ecl_idx;
1797     reg_errcode_t err;
1798     re_node_set *inv_eclosure = dfa->inveclosures + node;
1799     re_node_set except_nodes;
1800     re_node_set_init_empty (&except_nodes);
1801     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1802       {
1803         int cur_node = inv_eclosure->elems[ecl_idx];
1804         if (cur_node == node)
1805           continue;
1806         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1807           {
1808             int edst1 = dfa->edests[cur_node].elems[0];
1809             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1810                          ? dfa->edests[cur_node].elems[1] : -1);
1811             if ((!re_node_set_contains (inv_eclosure, edst1)
1812                  && re_node_set_contains (dest_nodes, edst1))
1813                 || (edst2 > 0
1814                     && !re_node_set_contains (inv_eclosure, edst2)
1815                     && re_node_set_contains (dest_nodes, edst2)))
1816               {
1817                 err = re_node_set_add_intersect (&except_nodes, candidates,
1818                                                  dfa->inveclosures + cur_node);
1819                 if (BE (err != REG_NOERROR, 0))
1820                   {
1821                     re_node_set_free (&except_nodes);
1822                     return err;
1823                   }
1824               }
1825           }
1826       }
1827     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1828       {
1829         int cur_node = inv_eclosure->elems[ecl_idx];
1830         if (!re_node_set_contains (&except_nodes, cur_node))
1831           {
1832             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1833             re_node_set_remove_at (dest_nodes, idx);
1834           }
1835       }
1836     re_node_set_free (&except_nodes);
1837     return REG_NOERROR;
1838 }
1839
1840 static int
1841 check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
1842      re_match_context_t *mctx;
1843      re_node_set *limits;
1844      int dst_node, dst_idx, src_node, src_idx;
1845 {
1846   re_dfa_t *const dfa = mctx->dfa;
1847   int lim_idx, src_pos, dst_pos;
1848
1849   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1850   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1851   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1852     {
1853       int subexp_idx;
1854       struct re_backref_cache_entry *ent;
1855       ent = mctx->bkref_ents + limits->elems[lim_idx];
1856       subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1857
1858       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1859                                            subexp_idx, dst_node, dst_idx,
1860                                            dst_bkref_idx);
1861       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1862                                            subexp_idx, src_node, src_idx,
1863                                            src_bkref_idx);
1864
1865       /* In case of:
1866          <src> <dst> ( <subexp> )
1867          ( <subexp> ) <src> <dst>
1868          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1869       if (src_pos == dst_pos)
1870         continue; /* This is unrelated limitation.  */
1871       else
1872         return 1;
1873     }
1874   return 0;
1875 }
1876
1877 static int
1878 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
1879      re_match_context_t *mctx;
1880      int boundaries, subexp_idx, from_node, bkref_idx;
1881 {
1882   re_dfa_t *const dfa = mctx->dfa;
1883   re_node_set *eclosures = dfa->eclosures + from_node;
1884   int node_idx;
1885
1886   /* Else, we are on the boundary: examine the nodes on the epsilon
1887      closure.  */
1888   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1889     {
1890       int node = eclosures->elems[node_idx];
1891       switch (dfa->nodes[node].type)
1892         {
1893         case OP_BACK_REF:
1894           {
1895             struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1896             do
1897               {
1898                 int dst, cpos;
1899
1900                 if (ent->node != node)
1901                   continue;
1902
1903                 if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
1904                     && (ent->eps_reachable_subexps_map
1905                         & (1 << (subexp_idx - 1))) == 0)
1906                   continue;
1907
1908                 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1909                    OP_CLOSE_SUBEXP cases below.  But, if the
1910                    destination node is the same node as the source
1911                    node, don't recurse because it would cause an
1912                    infinite loop: a regex that exhibits this behavior
1913                    is ()\1*\1*  */
1914                 dst = dfa->edests[node].elems[0];
1915                 if (dst == from_node)
1916                   {
1917                     if (boundaries & 1)
1918                       return -1;
1919                     else /* if (boundaries & 2) */
1920                       return 0;
1921                   }
1922
1923                 cpos = check_dst_limits_calc_pos_1 (mctx, boundaries,
1924                                                     subexp_idx, dst, bkref_idx);
1925
1926                 if (cpos == -1 /* && (boundaries & 1) */)
1927                   return -1;
1928
1929                 if (cpos == 0 && (boundaries & 2))
1930                   return 0;
1931
1932                 ent->eps_reachable_subexps_map &= ~(1 << (subexp_idx - 1));
1933               }
1934             while (ent++->more);
1935             break;
1936           }
1937
1938         case OP_OPEN_SUBEXP:
1939           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1940             return -1;
1941           break;
1942
1943         case OP_CLOSE_SUBEXP:
1944           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1945             return 0;
1946           break;
1947
1948         default:
1949             break;
1950         }
1951     }
1952
1953   return (boundaries & 2) ? 1 : 0;
1954 }
1955
1956 static int
1957 check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx)
1958      re_match_context_t *mctx;
1959      int limit, subexp_idx, from_node, str_idx, bkref_idx;
1960 {
1961   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1962   int boundaries;
1963
1964   /* If we are outside the range of the subexpression, return -1 or 1.  */
1965   if (str_idx < lim->subexp_from)
1966     return -1;
1967
1968   if (lim->subexp_to < str_idx)
1969     return 1;
1970
1971   /* If we are within the subexpression, return 0.  */
1972   boundaries = (str_idx == lim->subexp_from);
1973   boundaries |= (str_idx == lim->subexp_to) << 1;
1974   if (boundaries == 0)
1975     return 0;
1976
1977   /* Else, examine epsilon closure.  */
1978   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1979                                       from_node, bkref_idx);
1980 }
1981
1982 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1983    which are against limitations from DEST_NODES. */
1984
1985 static reg_errcode_t
1986 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
1987      re_dfa_t *dfa;
1988      re_node_set *dest_nodes;
1989      const re_node_set *candidates;
1990      re_node_set *limits;
1991      struct re_backref_cache_entry *bkref_ents;
1992      int str_idx;
1993 {
1994   reg_errcode_t err;
1995   int node_idx, lim_idx;
1996
1997   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1998     {
1999       int subexp_idx;
2000       struct re_backref_cache_entry *ent;
2001       ent = bkref_ents + limits->elems[lim_idx];
2002
2003       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2004         continue; /* This is unrelated limitation.  */
2005
2006       subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
2007       if (ent->subexp_to == str_idx)
2008         {
2009           int ops_node = -1;
2010           int cls_node = -1;
2011           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2012             {
2013               int node = dest_nodes->elems[node_idx];
2014               re_token_type_t type = dfa->nodes[node].type;
2015               if (type == OP_OPEN_SUBEXP
2016                   && subexp_idx == dfa->nodes[node].opr.idx)
2017                 ops_node = node;
2018               else if (type == OP_CLOSE_SUBEXP
2019                        && subexp_idx == dfa->nodes[node].opr.idx)
2020                 cls_node = node;
2021             }
2022
2023           /* Check the limitation of the open subexpression.  */
2024           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2025           if (ops_node >= 0)
2026             {
2027               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2028                                            candidates);
2029               if (BE (err != REG_NOERROR, 0))
2030                 return err;
2031             }
2032
2033           /* Check the limitation of the close subexpression.  */
2034           if (cls_node >= 0)
2035             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2036               {
2037                 int node = dest_nodes->elems[node_idx];
2038                 if (!re_node_set_contains (dfa->inveclosures + node,
2039                                            cls_node)
2040                     && !re_node_set_contains (dfa->eclosures + node,
2041                                               cls_node))
2042                   {
2043                     /* It is against this limitation.
2044                        Remove it form the current sifted state.  */
2045                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2046                                                  candidates);
2047                     if (BE (err != REG_NOERROR, 0))
2048                       return err;
2049                     --node_idx;
2050                   }
2051               }
2052         }
2053       else /* (ent->subexp_to != str_idx)  */
2054         {
2055           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2056             {
2057               int node = dest_nodes->elems[node_idx];
2058               re_token_type_t type = dfa->nodes[node].type;
2059               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2060                 {
2061                   if (subexp_idx != dfa->nodes[node].opr.idx)
2062                     continue;
2063                   if ((type == OP_CLOSE_SUBEXP && ent->subexp_to != str_idx)
2064                       || (type == OP_OPEN_SUBEXP))
2065                     {
2066                       /* It is against this limitation.
2067                          Remove it form the current sifted state.  */
2068                       err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2069                                                    candidates);
2070                       if (BE (err != REG_NOERROR, 0))
2071                         return err;
2072                     }
2073                 }
2074             }
2075         }
2076     }
2077   return REG_NOERROR;
2078 }
2079
2080 static reg_errcode_t
2081 sift_states_bkref (mctx, sctx, str_idx, candidates)
2082      re_match_context_t *mctx;
2083      re_sift_context_t *sctx;
2084      int str_idx;
2085      const re_node_set *candidates;
2086 {
2087   re_dfa_t *const dfa = mctx->dfa;
2088   reg_errcode_t err;
2089   int node_idx, node;
2090   re_sift_context_t local_sctx;
2091   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2092
2093   if (first_idx == -1)
2094     return REG_NOERROR;
2095
2096   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2097
2098   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2099     {
2100       int enabled_idx;
2101       re_token_type_t type;
2102       struct re_backref_cache_entry *entry;
2103       node = candidates->elems[node_idx];
2104       type = dfa->nodes[node].type;
2105       /* Avoid infinite loop for the REs like "()\1+".  */
2106       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2107         continue;
2108       if (type != OP_BACK_REF)
2109         continue;
2110
2111       entry = mctx->bkref_ents + first_idx;
2112       enabled_idx = first_idx;
2113       do
2114         {
2115           int subexp_len, to_idx, dst_node;
2116           re_dfastate_t *cur_state;
2117
2118           if (entry->node != node)
2119             continue;
2120           subexp_len = entry->subexp_to - entry->subexp_from;
2121           to_idx = str_idx + subexp_len;
2122           dst_node = (subexp_len ? dfa->nexts[node]
2123                       : dfa->edests[node].elems[0]);
2124
2125           if (to_idx > sctx->last_str_idx
2126               || sctx->sifted_states[to_idx] == NULL
2127               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2128               || check_dst_limits (mctx, &sctx->limits, node,
2129                                    str_idx, dst_node, to_idx))
2130             continue;
2131
2132           if (local_sctx.sifted_states == NULL)
2133             {
2134               local_sctx = *sctx;
2135               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2136               if (BE (err != REG_NOERROR, 0))
2137                 goto free_return;
2138             }
2139           local_sctx.last_node = node;
2140           local_sctx.last_str_idx = str_idx;
2141           err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2142           if (BE (err < 0, 0))
2143             {
2144               err = REG_ESPACE;
2145               goto free_return;
2146             }
2147           cur_state = local_sctx.sifted_states[str_idx];
2148           err = sift_states_backward (mctx, &local_sctx);
2149           if (BE (err != REG_NOERROR, 0))
2150             goto free_return;
2151           if (sctx->limited_states != NULL)
2152             {
2153               err = merge_state_array (dfa, sctx->limited_states,
2154                                        local_sctx.sifted_states,
2155                                        str_idx + 1);
2156               if (BE (err != REG_NOERROR, 0))
2157                 goto free_return;
2158             }
2159           local_sctx.sifted_states[str_idx] = cur_state;
2160           re_node_set_remove (&local_sctx.limits, enabled_idx);
2161
2162           /* mctx->bkref_ents may have changed, reload the pointer.  */
2163           entry = mctx->bkref_ents + enabled_idx;
2164         }
2165       while (enabled_idx++, entry++->more);
2166     }
2167   err = REG_NOERROR;
2168  free_return:
2169   if (local_sctx.sifted_states != NULL)
2170     {
2171       re_node_set_free (&local_sctx.limits);
2172     }
2173
2174   return err;
2175 }
2176
2177
2178 #ifdef RE_ENABLE_I18N
2179 static int
2180 sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
2181     const re_match_context_t *mctx;
2182     re_sift_context_t *sctx;
2183     int node_idx, str_idx, max_str_idx;
2184 {
2185   re_dfa_t *const dfa = mctx->dfa;
2186   int naccepted;
2187   /* Check the node can accept `multi byte'.  */
2188   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2189   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2190       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2191                             dfa->nexts[node_idx]))
2192     /* The node can't accept the `multi byte', or the
2193        destination was already thrown away, then the node
2194        could't accept the current input `multi byte'.   */
2195     naccepted = 0;
2196   /* Otherwise, it is sure that the node could accept
2197      `naccepted' bytes input.  */
2198   return naccepted;
2199 }
2200 #endif /* RE_ENABLE_I18N */
2201
2202 \f
2203 /* Functions for state transition.  */
2204
2205 /* Return the next state to which the current state STATE will transit by
2206    accepting the current input byte, and update STATE_LOG if necessary.
2207    If STATE can accept a multibyte char/collating element/back reference
2208    update the destination of STATE_LOG.  */
2209
2210 static re_dfastate_t *
2211 transit_state (err, mctx, state)
2212      reg_errcode_t *err;
2213      re_match_context_t *mctx;
2214      re_dfastate_t *state;
2215 {
2216   re_dfa_t *const dfa = mctx->dfa;
2217   re_dfastate_t **trtable;
2218   unsigned char ch;
2219
2220 #ifdef RE_ENABLE_I18N
2221   /* If the current state can accept multibyte.  */
2222   if (BE (state->accept_mb, 0))
2223     {
2224       *err = transit_state_mb (mctx, state);
2225       if (BE (*err != REG_NOERROR, 0))
2226         return NULL;
2227     }
2228 #endif /* RE_ENABLE_I18N */
2229
2230   /* Then decide the next state with the single byte.  */
2231   if (1)
2232     {
2233       /* Use transition table  */
2234       ch = re_string_fetch_byte (&mctx->input);
2235       trtable = state->trtable;
2236       if (trtable == NULL)
2237         {
2238           trtable = build_trtable (dfa, state);
2239           if (trtable == NULL)
2240             {
2241               *err = REG_ESPACE;
2242               return NULL;
2243             }
2244         }
2245       if (BE (state->word_trtable, 0))
2246         {
2247           unsigned int context;
2248           context
2249             = re_string_context_at (&mctx->input,
2250                                     re_string_cur_idx (&mctx->input) - 1,
2251                                     mctx->eflags);
2252           if (IS_WORD_CONTEXT (context))
2253             return trtable[ch + SBC_MAX];
2254           else
2255             return trtable[ch];
2256         }
2257       else
2258         return trtable[ch];
2259     }
2260 #if 0
2261   else
2262     /* don't use transition table  */
2263     return transit_state_sb (err, mctx, state);
2264 #endif
2265 }
2266
2267 /* Update the state_log if we need */
2268 re_dfastate_t *
2269 merge_state_with_log (err, mctx, next_state)
2270      reg_errcode_t *err;
2271      re_match_context_t *mctx;
2272      re_dfastate_t *next_state;
2273 {
2274   re_dfa_t *const dfa = mctx->dfa;
2275   int cur_idx = re_string_cur_idx (&mctx->input);
2276
2277   if (cur_idx > mctx->state_log_top)
2278     {
2279       mctx->state_log[cur_idx] = next_state;
2280       mctx->state_log_top = cur_idx;
2281     }
2282   else if (mctx->state_log[cur_idx] == 0)
2283     {
2284       mctx->state_log[cur_idx] = next_state;
2285     }
2286   else
2287     {
2288       re_dfastate_t *pstate;
2289       unsigned int context;
2290       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2291       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2292          the destination of a multibyte char/collating element/
2293          back reference.  Then the next state is the union set of
2294          these destinations and the results of the transition table.  */
2295       pstate = mctx->state_log[cur_idx];
2296       log_nodes = pstate->entrance_nodes;
2297       if (next_state != NULL)
2298         {
2299           table_nodes = next_state->entrance_nodes;
2300           *err = re_node_set_init_union (&next_nodes, table_nodes,
2301                                              log_nodes);
2302           if (BE (*err != REG_NOERROR, 0))
2303             return NULL;
2304         }
2305       else
2306         next_nodes = *log_nodes;
2307       /* Note: We already add the nodes of the initial state,
2308          then we don't need to add them here.  */
2309
2310       context = re_string_context_at (&mctx->input,
2311                                       re_string_cur_idx (&mctx->input) - 1,
2312                                       mctx->eflags);
2313       next_state = mctx->state_log[cur_idx]
2314         = re_acquire_state_context (err, dfa, &next_nodes, context);
2315       /* We don't need to check errors here, since the return value of
2316          this function is next_state and ERR is already set.  */
2317
2318       if (table_nodes != NULL)
2319         re_node_set_free (&next_nodes);
2320     }
2321
2322   if (BE (dfa->nbackref, 0) && next_state != NULL)
2323     {
2324       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2325          later.  We must check them here, since the back references in the
2326          next state might use them.  */
2327       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2328                                         cur_idx);
2329       if (BE (*err != REG_NOERROR, 0))
2330         return NULL;
2331
2332       /* If the next state has back references.  */
2333       if (next_state->has_backref)
2334         {
2335           *err = transit_state_bkref (mctx, &next_state->nodes);
2336           if (BE (*err != REG_NOERROR, 0))
2337             return NULL;
2338           next_state = mctx->state_log[cur_idx];
2339         }
2340     }
2341
2342   return next_state;
2343 }
2344
2345 /* Skip bytes in the input that correspond to part of a
2346    multi-byte match, then look in the log for a state
2347    from which to restart matching.  */
2348 re_dfastate_t *
2349 find_recover_state (err, mctx)
2350      reg_errcode_t *err;
2351      re_match_context_t *mctx;
2352 {
2353   re_dfastate_t *cur_state = NULL;
2354   do
2355     {
2356       int max = mctx->state_log_top;
2357       int cur_str_idx = re_string_cur_idx (&mctx->input);
2358
2359       do
2360         {
2361           if (++cur_str_idx > max)
2362             return NULL;
2363           re_string_skip_bytes (&mctx->input, 1);
2364         }
2365       while (mctx->state_log[cur_str_idx] == NULL);
2366
2367       cur_state = merge_state_with_log (err, mctx, NULL);
2368     }
2369   while (err == REG_NOERROR && cur_state == NULL);
2370   return cur_state;
2371 }
2372
2373 /* Helper functions for transit_state.  */
2374
2375 /* From the node set CUR_NODES, pick up the nodes whose types are
2376    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2377    expression. And register them to use them later for evaluating the
2378    correspoding back references.  */
2379
2380 static reg_errcode_t
2381 check_subexp_matching_top (mctx, cur_nodes, str_idx)
2382      re_match_context_t *mctx;
2383      re_node_set *cur_nodes;
2384      int str_idx;
2385 {
2386   re_dfa_t *const dfa = mctx->dfa;
2387   int node_idx;
2388   reg_errcode_t err;
2389
2390   /* TODO: This isn't efficient.
2391            Because there might be more than one nodes whose types are
2392            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2393            nodes.
2394            E.g. RE: (a){2}  */
2395   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2396     {
2397       int node = cur_nodes->elems[node_idx];
2398       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2399           && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2400           && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2401         {
2402           err = match_ctx_add_subtop (mctx, node, str_idx);
2403           if (BE (err != REG_NOERROR, 0))
2404             return err;
2405         }
2406     }
2407   return REG_NOERROR;
2408 }
2409
2410 #if 0
2411 /* Return the next state to which the current state STATE will transit by
2412    accepting the current input byte.  */
2413
2414 static re_dfastate_t *
2415 transit_state_sb (err, mctx, state)
2416      reg_errcode_t *err;
2417      re_match_context_t *mctx;
2418      re_dfastate_t *state;
2419 {
2420   re_dfa_t *const dfa = mctx->dfa;
2421   re_node_set next_nodes;
2422   re_dfastate_t *next_state;
2423   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2424   unsigned int context;
2425
2426   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2427   if (BE (*err != REG_NOERROR, 0))
2428     return NULL;
2429   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2430     {
2431       int cur_node = state->nodes.elems[node_cnt];
2432       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2433         {
2434           *err = re_node_set_merge (&next_nodes,
2435                                     dfa->eclosures + dfa->nexts[cur_node]);
2436           if (BE (*err != REG_NOERROR, 0))
2437             {
2438               re_node_set_free (&next_nodes);
2439               return NULL;
2440             }
2441         }
2442     }
2443   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2444   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2445   /* We don't need to check errors here, since the return value of
2446      this function is next_state and ERR is already set.  */
2447
2448   re_node_set_free (&next_nodes);
2449   re_string_skip_bytes (&mctx->input, 1);
2450   return next_state;
2451 }
2452 #endif
2453
2454 #ifdef RE_ENABLE_I18N
2455 static reg_errcode_t
2456 transit_state_mb (mctx, pstate)
2457     re_match_context_t *mctx;
2458     re_dfastate_t *pstate;
2459 {
2460   re_dfa_t *const dfa = mctx->dfa;
2461   reg_errcode_t err;
2462   int i;
2463
2464   for (i = 0; i < pstate->nodes.nelem; ++i)
2465     {
2466       re_node_set dest_nodes, *new_nodes;
2467       int cur_node_idx = pstate->nodes.elems[i];
2468       int naccepted = 0, dest_idx;
2469       unsigned int context;
2470       re_dfastate_t *dest_state;
2471
2472       if (dfa->nodes[cur_node_idx].constraint)
2473         {
2474           context = re_string_context_at (&mctx->input,
2475                                           re_string_cur_idx (&mctx->input),
2476                                           mctx->eflags);
2477           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2478                                            context))
2479             continue;
2480         }
2481
2482       /* How many bytes the node can accept?  */
2483       if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
2484         naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2485                                              re_string_cur_idx (&mctx->input));
2486       if (naccepted == 0)
2487         continue;
2488
2489       /* The node can accepts `naccepted' bytes.  */
2490       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2491       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2492                                : mctx->max_mb_elem_len);
2493       err = clean_state_log_if_needed (mctx, dest_idx);
2494       if (BE (err != REG_NOERROR, 0))
2495         return err;
2496 #ifdef DEBUG
2497       assert (dfa->nexts[cur_node_idx] != -1);
2498 #endif
2499       /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2500          then we use pstate->nodes.elems[i] instead.  */
2501       new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
2502
2503       dest_state = mctx->state_log[dest_idx];
2504       if (dest_state == NULL)
2505         dest_nodes = *new_nodes;
2506       else
2507         {
2508           err = re_node_set_init_union (&dest_nodes,
2509                                         dest_state->entrance_nodes, new_nodes);
2510           if (BE (err != REG_NOERROR, 0))
2511             return err;
2512         }
2513       context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2514       mctx->state_log[dest_idx]
2515         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2516       if (dest_state != NULL)
2517         re_node_set_free (&dest_nodes);
2518       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2519         return err;
2520     }
2521   return REG_NOERROR;
2522 }
2523 #endif /* RE_ENABLE_I18N */
2524
2525 static reg_errcode_t
2526 transit_state_bkref (mctx, nodes)
2527     re_match_context_t *mctx;
2528     const re_node_set *nodes;
2529 {
2530   re_dfa_t *const dfa = mctx->dfa;
2531   reg_errcode_t err;
2532   int i;
2533   int cur_str_idx = re_string_cur_idx (&mctx->input);
2534
2535   for (i = 0; i < nodes->nelem; ++i)
2536     {
2537       int dest_str_idx, prev_nelem, bkc_idx;
2538       int node_idx = nodes->elems[i];
2539       unsigned int context;
2540       const re_token_t *node = dfa->nodes + node_idx;
2541       re_node_set *new_dest_nodes;
2542
2543       /* Check whether `node' is a backreference or not.  */
2544       if (node->type != OP_BACK_REF)
2545         continue;
2546
2547       if (node->constraint)
2548         {
2549           context = re_string_context_at (&mctx->input, cur_str_idx,
2550                                           mctx->eflags);
2551           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2552             continue;
2553         }
2554
2555       /* `node' is a backreference.
2556          Check the substring which the substring matched.  */
2557       bkc_idx = mctx->nbkref_ents;
2558       err = get_subexp (mctx, node_idx, cur_str_idx);
2559       if (BE (err != REG_NOERROR, 0))
2560         goto free_return;
2561
2562       /* And add the epsilon closures (which is `new_dest_nodes') of
2563          the backreference to appropriate state_log.  */
2564 #ifdef DEBUG
2565       assert (dfa->nexts[node_idx] != -1);
2566 #endif
2567       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2568         {
2569           int subexp_len;
2570           re_dfastate_t *dest_state;
2571           struct re_backref_cache_entry *bkref_ent;
2572           bkref_ent = mctx->bkref_ents + bkc_idx;
2573           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2574             continue;
2575           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2576           new_dest_nodes = (subexp_len == 0
2577                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2578                             : dfa->eclosures + dfa->nexts[node_idx]);
2579           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2580                           - bkref_ent->subexp_from);
2581           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2582                                           mctx->eflags);
2583           dest_state = mctx->state_log[dest_str_idx];
2584           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2585                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2586           /* Add `new_dest_node' to state_log.  */
2587           if (dest_state == NULL)
2588             {
2589               mctx->state_log[dest_str_idx]
2590                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2591                                             context);
2592               if (BE (mctx->state_log[dest_str_idx] == NULL
2593                       && err != REG_NOERROR, 0))
2594                 goto free_return;
2595             }
2596           else
2597             {
2598               re_node_set dest_nodes;
2599               err = re_node_set_init_union (&dest_nodes,
2600                                             dest_state->entrance_nodes,
2601                                             new_dest_nodes);
2602               if (BE (err != REG_NOERROR, 0))
2603                 {
2604                   re_node_set_free (&dest_nodes);
2605                   goto free_return;
2606                 }
2607               mctx->state_log[dest_str_idx]
2608                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2609               re_node_set_free (&dest_nodes);
2610               if (BE (mctx->state_log[dest_str_idx] == NULL
2611                       && err != REG_NOERROR, 0))
2612                 goto free_return;
2613             }
2614           /* We need to check recursively if the backreference can epsilon
2615              transit.  */
2616           if (subexp_len == 0
2617               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2618             {
2619               err = check_subexp_matching_top (mctx, new_dest_nodes,
2620                                                cur_str_idx);
2621               if (BE (err != REG_NOERROR, 0))
2622                 goto free_return;
2623               err = transit_state_bkref (mctx, new_dest_nodes);
2624               if (BE (err != REG_NOERROR, 0))
2625                 goto free_return;
2626             }
2627         }
2628     }
2629   err = REG_NOERROR;
2630  free_return:
2631   return err;
2632 }
2633
2634 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2635    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2636    Note that we might collect inappropriate candidates here.
2637    However, the cost of checking them strictly here is too high, then we
2638    delay these checking for prune_impossible_nodes().  */
2639
2640 static reg_errcode_t
2641 get_subexp (mctx, bkref_node, bkref_str_idx)
2642      re_match_context_t *mctx;
2643      int bkref_node, bkref_str_idx;
2644 {
2645   re_dfa_t *const dfa = mctx->dfa;
2646   int subexp_num, sub_top_idx;
2647   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2648   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2649   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2650   if (cache_idx != -1)
2651     {
2652       const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2653       do
2654         if (entry->node == bkref_node)
2655           return REG_NOERROR; /* We already checked it.  */
2656       while (entry++->more);
2657     }
2658
2659   subexp_num = dfa->nodes[bkref_node].opr.idx - 1;
2660
2661   /* For each sub expression  */
2662   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2663     {
2664       reg_errcode_t err;
2665       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2666       re_sub_match_last_t *sub_last;
2667       int sub_last_idx, sl_str, bkref_str_off;
2668
2669       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2670         continue; /* It isn't related.  */
2671
2672       sl_str = sub_top->str_idx;
2673       bkref_str_off = bkref_str_idx;
2674       /* At first, check the last node of sub expressions we already
2675          evaluated.  */
2676       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2677         {
2678           int sl_str_diff;
2679           sub_last = sub_top->lasts[sub_last_idx];
2680           sl_str_diff = sub_last->str_idx - sl_str;
2681           /* The matched string by the sub expression match with the substring
2682              at the back reference?  */
2683           if (sl_str_diff > 0)
2684             {
2685               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2686                 {
2687                   /* Not enough chars for a successful match.  */
2688                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2689                     break;
2690
2691                   err = clean_state_log_if_needed (mctx,
2692                                                    bkref_str_off
2693                                                    + sl_str_diff);
2694                   if (BE (err != REG_NOERROR, 0))
2695                     return err;
2696                   buf = (const char *) re_string_get_buffer (&mctx->input);
2697                 }
2698               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2699                 break; /* We don't need to search this sub expression any more.  */
2700             }
2701           bkref_str_off += sl_str_diff;
2702           sl_str += sl_str_diff;
2703           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2704                                 bkref_str_idx);
2705
2706           /* Reload buf, since the preceding call might have reallocated
2707              the buffer.  */
2708           buf = (const char *) re_string_get_buffer (&mctx->input);
2709
2710           if (err == REG_NOMATCH)
2711             continue;
2712           if (BE (err != REG_NOERROR, 0))
2713             return err;
2714         }
2715
2716       if (sub_last_idx < sub_top->nlasts)
2717         continue;
2718       if (sub_last_idx > 0)
2719         ++sl_str;
2720       /* Then, search for the other last nodes of the sub expression.  */
2721       for (; sl_str <= bkref_str_idx; ++sl_str)
2722         {
2723           int cls_node, sl_str_off;
2724           const re_node_set *nodes;
2725           sl_str_off = sl_str - sub_top->str_idx;
2726           /* The matched string by the sub expression match with the substring
2727              at the back reference?  */
2728           if (sl_str_off > 0)
2729             {
2730               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2731                 {
2732                   /* If we are at the end of the input, we cannot match.  */
2733                   if (bkref_str_off >= mctx->input.len)
2734                     break;
2735
2736                   err = extend_buffers (mctx);
2737                   if (BE (err != REG_NOERROR, 0))
2738                     return err;
2739
2740                   buf = (const char *) re_string_get_buffer (&mctx->input);
2741                 }
2742               if (buf [bkref_str_off++] != buf[sl_str - 1])
2743                 break; /* We don't need to search this sub expression
2744                           any more.  */
2745             }
2746           if (mctx->state_log[sl_str] == NULL)
2747             continue;
2748           /* Does this state have a ')' of the sub expression?  */
2749           nodes = &mctx->state_log[sl_str]->nodes;
2750           cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2751           if (cls_node == -1)
2752             continue; /* No.  */
2753           if (sub_top->path == NULL)
2754             {
2755               sub_top->path = calloc (sizeof (state_array_t),
2756                                       sl_str - sub_top->str_idx + 1);
2757               if (sub_top->path == NULL)
2758                 return REG_ESPACE;
2759             }
2760           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2761              in the current context?  */
2762           err = check_arrival (mctx, sub_top->path, sub_top->node,
2763                                sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2764           if (err == REG_NOMATCH)
2765               continue;
2766           if (BE (err != REG_NOERROR, 0))
2767               return err;
2768           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2769           if (BE (sub_last == NULL, 0))
2770             return REG_ESPACE;
2771           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2772                                 bkref_str_idx);
2773           if (err == REG_NOMATCH)
2774             continue;
2775         }
2776     }
2777   return REG_NOERROR;
2778 }
2779
2780 /* Helper functions for get_subexp().  */
2781
2782 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2783    If it can arrive, register the sub expression expressed with SUB_TOP
2784    and SUB_LAST.  */
2785
2786 static reg_errcode_t
2787 get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
2788      re_match_context_t *mctx;
2789      const re_sub_match_top_t *sub_top;
2790      re_sub_match_last_t *sub_last;
2791      int bkref_node, bkref_str;
2792 {
2793   reg_errcode_t err;
2794   int to_idx;
2795   /* Can the subexpression arrive the back reference?  */
2796   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2797                        sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2798   if (err != REG_NOERROR)
2799     return err;
2800   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2801                              sub_last->str_idx);
2802   if (BE (err != REG_NOERROR, 0))
2803     return err;
2804   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2805   return clean_state_log_if_needed (mctx, to_idx);
2806 }
2807
2808 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2809    Search '(' if FL_OPEN, or search ')' otherwise.
2810    TODO: This function isn't efficient...
2811          Because there might be more than one nodes whose types are
2812          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2813          nodes.
2814          E.g. RE: (a){2}  */
2815
2816 static int
2817 find_subexp_node (dfa, nodes, subexp_idx, type)
2818      const re_dfa_t *dfa;
2819      const re_node_set *nodes;
2820      int subexp_idx, type;
2821 {
2822   int cls_idx;
2823   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2824     {
2825       int cls_node = nodes->elems[cls_idx];
2826       const re_token_t *node = dfa->nodes + cls_node;
2827       if (node->type == type
2828           && node->opr.idx == subexp_idx)
2829         return cls_node;
2830     }
2831   return -1;
2832 }
2833
2834 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2835    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2836    heavily reused.
2837    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2838
2839 static reg_errcode_t
2840 check_arrival (mctx, path, top_node, top_str, last_node, last_str,
2841                type)
2842      re_match_context_t *mctx;
2843      state_array_t *path;
2844      int top_node, top_str, last_node, last_str, type;
2845 {
2846   re_dfa_t *const dfa = mctx->dfa;
2847   reg_errcode_t err;
2848   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2849   re_dfastate_t *cur_state = NULL;
2850   re_node_set *cur_nodes, next_nodes;
2851   re_dfastate_t **backup_state_log;
2852   unsigned int context;
2853
2854   subexp_num = dfa->nodes[top_node].opr.idx;
2855   /* Extend the buffer if we need.  */
2856   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2857     {
2858       re_dfastate_t **new_array;
2859       int old_alloc = path->alloc;
2860       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2861       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2862       if (new_array == NULL)
2863         {
2864           path->alloc = old_alloc;
2865           return REG_ESPACE;
2866         }
2867       path->array = new_array;
2868       memset (new_array + old_alloc, '\0',
2869               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2870     }
2871
2872   str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2873
2874   /* Temporary modify MCTX.  */
2875   backup_state_log = mctx->state_log;
2876   backup_cur_idx = mctx->input.cur_idx;
2877   mctx->state_log = path->array;
2878   mctx->input.cur_idx = str_idx;
2879
2880   /* Setup initial node set.  */
2881   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2882   if (str_idx == top_str)
2883     {
2884       err = re_node_set_init_1 (&next_nodes, top_node);
2885       if (BE (err != REG_NOERROR, 0))
2886         return err;
2887       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2888       if (BE (err != REG_NOERROR, 0))
2889         {
2890           re_node_set_free (&next_nodes);
2891           return err;
2892         }
2893     }
2894   else
2895     {
2896       cur_state = mctx->state_log[str_idx];
2897       if (cur_state && cur_state->has_backref)
2898         {
2899           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2900           if (BE ( err != REG_NOERROR, 0))
2901             return err;
2902         }
2903       else
2904         re_node_set_init_empty (&next_nodes);
2905     }
2906   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2907     {
2908       if (next_nodes.nelem)
2909         {
2910           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2911                                     subexp_num, type);
2912           if (BE ( err != REG_NOERROR, 0))
2913             {
2914               re_node_set_free (&next_nodes);
2915               return err;
2916             }
2917         }
2918       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2919       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2920         {
2921           re_node_set_free (&next_nodes);
2922           return err;
2923         }
2924       mctx->state_log[str_idx] = cur_state;
2925     }
2926
2927   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2928     {
2929       re_node_set_empty (&next_nodes);
2930       if (mctx->state_log[str_idx + 1])
2931         {
2932           err = re_node_set_merge (&next_nodes,
2933                                    &mctx->state_log[str_idx + 1]->nodes);
2934           if (BE (err != REG_NOERROR, 0))
2935             {
2936               re_node_set_free (&next_nodes);
2937               return err;
2938             }
2939         }
2940       if (cur_state)
2941         {
2942           err = check_arrival_add_next_nodes (mctx, str_idx,
2943                                               &cur_state->nodes, &next_nodes);
2944           if (BE (err != REG_NOERROR, 0))
2945             {
2946               re_node_set_free (&next_nodes);
2947               return err;
2948             }
2949         }
2950       ++str_idx;
2951       if (next_nodes.nelem)
2952         {
2953           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2954           if (BE (err != REG_NOERROR, 0))
2955             {
2956               re_node_set_free (&next_nodes);
2957               return err;
2958             }
2959           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2960                                     subexp_num, type);
2961           if (BE ( err != REG_NOERROR, 0))
2962             {
2963               re_node_set_free (&next_nodes);
2964               return err;
2965             }
2966         }
2967       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2968       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2969       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2970         {
2971           re_node_set_free (&next_nodes);
2972           return err;
2973         }
2974       mctx->state_log[str_idx] = cur_state;
2975       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2976     }
2977   re_node_set_free (&next_nodes);
2978   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2979                : &mctx->state_log[last_str]->nodes);
2980   path->next_idx = str_idx;
2981
2982   /* Fix MCTX.  */
2983   mctx->state_log = backup_state_log;
2984   mctx->input.cur_idx = backup_cur_idx;
2985
2986   /* Then check the current node set has the node LAST_NODE.  */
2987   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2988     return REG_NOERROR;
2989
2990   return REG_NOMATCH;
2991 }
2992
2993 /* Helper functions for check_arrival.  */
2994
2995 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2996    to NEXT_NODES.
2997    TODO: This function is similar to the functions transit_state*(),
2998          however this function has many additional works.
2999          Can't we unify them?  */
3000
3001 static reg_errcode_t
3002 check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
3003      re_match_context_t *mctx;
3004      int str_idx;
3005      re_node_set *cur_nodes, *next_nodes;
3006 {
3007   re_dfa_t *const dfa = mctx->dfa;
3008   int result;
3009   int cur_idx;
3010   reg_errcode_t err;
3011   re_node_set union_set;
3012   re_node_set_init_empty (&union_set);
3013   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3014     {
3015       int naccepted = 0;
3016       int cur_node = cur_nodes->elems[cur_idx];
3017       re_token_type_t type = dfa->nodes[cur_node].type;
3018       if (IS_EPSILON_NODE (type))
3019         continue;
3020 #ifdef RE_ENABLE_I18N
3021       /* If the node may accept `multi byte'.  */
3022       if (ACCEPT_MB_NODE (type))
3023         {
3024           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3025                                                str_idx);
3026           if (naccepted > 1)
3027             {
3028               re_dfastate_t *dest_state;
3029               int next_node = dfa->nexts[cur_node];
3030               int next_idx = str_idx + naccepted;
3031               dest_state = mctx->state_log[next_idx];
3032               re_node_set_empty (&union_set);
3033               if (dest_state)
3034                 {
3035                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3036                   if (BE (err != REG_NOERROR, 0))
3037                     {
3038                       re_node_set_free (&union_set);
3039                       return err;
3040                     }
3041                 }
3042               result = re_node_set_insert (&union_set, next_node);
3043               if (BE (result < 0, 0))
3044                 {
3045                   re_node_set_free (&union_set);
3046                   return REG_ESPACE;
3047                 }
3048               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3049                                                             &union_set);
3050               if (BE (mctx->state_log[next_idx] == NULL
3051                       && err != REG_NOERROR, 0))
3052                 {
3053                   re_node_set_free (&union_set);
3054                   return err;
3055                 }
3056             }
3057         }
3058 #endif /* RE_ENABLE_I18N */
3059       if (naccepted
3060           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3061         {
3062           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3063           if (BE (result < 0, 0))
3064             {
3065               re_node_set_free (&union_set);
3066               return REG_ESPACE;
3067             }
3068         }
3069     }
3070   re_node_set_free (&union_set);
3071   return REG_NOERROR;
3072 }
3073
3074 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3075    CUR_NODES, however exclude the nodes which are:
3076     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3077     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3078 */
3079
3080 static reg_errcode_t
3081 check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
3082      re_dfa_t *dfa;
3083      re_node_set *cur_nodes;
3084      int ex_subexp, type;
3085 {
3086   reg_errcode_t err;
3087   int idx, outside_node;
3088   re_node_set new_nodes;
3089 #ifdef DEBUG
3090   assert (cur_nodes->nelem);
3091 #endif
3092   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3093   if (BE (err != REG_NOERROR, 0))
3094     return err;
3095   /* Create a new node set NEW_NODES with the nodes which are epsilon
3096      closures of the node in CUR_NODES.  */
3097
3098   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3099     {
3100       int cur_node = cur_nodes->elems[idx];
3101       re_node_set *eclosure = dfa->eclosures + cur_node;
3102       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3103       if (outside_node == -1)
3104         {
3105           /* There are no problematic nodes, just merge them.  */
3106           err = re_node_set_merge (&new_nodes, eclosure);
3107           if (BE (err != REG_NOERROR, 0))
3108             {
3109               re_node_set_free (&new_nodes);
3110               return err;
3111             }
3112         }
3113       else
3114         {
3115           /* There are problematic nodes, re-calculate incrementally.  */
3116           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3117                                               ex_subexp, type);
3118           if (BE (err != REG_NOERROR, 0))
3119             {
3120               re_node_set_free (&new_nodes);
3121               return err;
3122             }
3123         }
3124     }
3125   re_node_set_free (cur_nodes);
3126   *cur_nodes = new_nodes;
3127   return REG_NOERROR;
3128 }
3129
3130 /* Helper function for check_arrival_expand_ecl.
3131    Check incrementally the epsilon closure of TARGET, and if it isn't
3132    problematic append it to DST_NODES.  */
3133
3134 static reg_errcode_t
3135 check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
3136      re_dfa_t *dfa;
3137      int target, ex_subexp, type;
3138      re_node_set *dst_nodes;
3139 {
3140   int cur_node;
3141   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3142     {
3143       int err;
3144
3145       if (dfa->nodes[cur_node].type == type
3146           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3147         {
3148           if (type == OP_CLOSE_SUBEXP)
3149             {
3150               err = re_node_set_insert (dst_nodes, cur_node);
3151               if (BE (err == -1, 0))
3152                 return REG_ESPACE;
3153             }
3154           break;
3155         }
3156       err = re_node_set_insert (dst_nodes, cur_node);
3157       if (BE (err == -1, 0))
3158         return REG_ESPACE;
3159       if (dfa->edests[cur_node].nelem == 0)
3160         break;
3161       if (dfa->edests[cur_node].nelem == 2)
3162         {
3163           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3164                                               dfa->edests[cur_node].elems[1],
3165                                               ex_subexp, type);
3166           if (BE (err != REG_NOERROR, 0))
3167             return err;
3168         }
3169       cur_node = dfa->edests[cur_node].elems[0];
3170     }
3171   return REG_NOERROR;
3172 }
3173
3174
3175 /* For all the back references in the current state, calculate the
3176    destination of the back references by the appropriate entry
3177    in MCTX->BKREF_ENTS.  */
3178
3179 static reg_errcode_t
3180 expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
3181                     type)
3182      re_match_context_t *mctx;
3183      int cur_str, subexp_num, type;
3184      re_node_set *cur_nodes;
3185 {
3186   re_dfa_t *const dfa = mctx->dfa;
3187   reg_errcode_t err;
3188   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3189   struct re_backref_cache_entry *ent;
3190
3191   if (cache_idx_start == -1)
3192     return REG_NOERROR;
3193
3194  restart:
3195   ent = mctx->bkref_ents + cache_idx_start;
3196   do
3197     {
3198       int to_idx, next_node;
3199
3200       /* Is this entry ENT is appropriate?  */
3201       if (!re_node_set_contains (cur_nodes, ent->node))
3202         continue; /* No.  */
3203
3204       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3205       /* Calculate the destination of the back reference, and append it
3206          to MCTX->STATE_LOG.  */
3207       if (to_idx == cur_str)
3208         {
3209           /* The backreference did epsilon transit, we must re-check all the
3210              node in the current state.  */
3211           re_node_set new_dests;
3212           reg_errcode_t err2, err3;
3213           next_node = dfa->edests[ent->node].elems[0];
3214           if (re_node_set_contains (cur_nodes, next_node))
3215             continue;
3216           err = re_node_set_init_1 (&new_dests, next_node);
3217           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3218           err3 = re_node_set_merge (cur_nodes, &new_dests);
3219           re_node_set_free (&new_dests);
3220           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3221                   || err3 != REG_NOERROR, 0))
3222             {
3223               err = (err != REG_NOERROR ? err
3224                      : (err2 != REG_NOERROR ? err2 : err3));
3225               return err;
3226             }
3227           /* TODO: It is still inefficient...  */
3228           goto restart;
3229         }
3230       else
3231         {
3232           re_node_set union_set;
3233           next_node = dfa->nexts[ent->node];
3234           if (mctx->state_log[to_idx])
3235             {
3236               int ret;
3237               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3238                                         next_node))
3239                 continue;
3240               err = re_node_set_init_copy (&union_set,
3241                                            &mctx->state_log[to_idx]->nodes);
3242               ret = re_node_set_insert (&union_set, next_node);
3243               if (BE (err != REG_NOERROR || ret < 0, 0))
3244                 {
3245                   re_node_set_free (&union_set);
3246                   err = err != REG_NOERROR ? err : REG_ESPACE;
3247                   return err;
3248                 }
3249             }
3250           else
3251             {
3252               err = re_node_set_init_1 (&union_set, next_node);
3253               if (BE (err != REG_NOERROR, 0))
3254                 return err;
3255             }
3256           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3257           re_node_set_free (&union_set);
3258           if (BE (mctx->state_log[to_idx] == NULL
3259                   && err != REG_NOERROR, 0))
3260             return err;
3261         }
3262     }
3263   while (ent++->more);
3264   return REG_NOERROR;
3265 }
3266
3267 /* Build transition table for the state.
3268    Return the new table if succeeded, otherwise return NULL.  */
3269
3270 static re_dfastate_t **
3271 build_trtable (dfa, state)
3272     re_dfa_t *dfa;
3273     re_dfastate_t *state;
3274 {
3275   reg_errcode_t err;
3276   int i, j, ch;
3277   unsigned int elem, mask;
3278   int dests_node_malloced = 0, dest_states_malloced = 0;
3279   int ndests; /* Number of the destination states from `state'.  */
3280   re_dfastate_t **trtable;
3281   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3282   re_node_set follows, *dests_node;
3283   bitset *dests_ch;
3284   bitset acceptable;
3285
3286   /* We build DFA states which corresponds to the destination nodes
3287      from `state'.  `dests_node[i]' represents the nodes which i-th
3288      destination state contains, and `dests_ch[i]' represents the
3289      characters which i-th destination state accepts.  */
3290 #ifdef _LIBC
3291   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3292     dests_node = (re_node_set *)
3293                  alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3294   else
3295 #endif
3296     {
3297       dests_node = (re_node_set *)
3298                    malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3299       if (BE (dests_node == NULL, 0))
3300         return NULL;
3301       dests_node_malloced = 1;
3302     }
3303   dests_ch = (bitset *) (dests_node + SBC_MAX);
3304
3305   /* Initialize transiton table.  */
3306   state->word_trtable = 0;
3307
3308   /* At first, group all nodes belonging to `state' into several
3309      destinations.  */
3310   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3311   if (BE (ndests <= 0, 0))
3312     {
3313       if (dests_node_malloced)
3314         free (dests_node);
3315       /* Return NULL in case of an error, trtable otherwise.  */
3316       if (ndests == 0)
3317         {
3318           state->trtable = (re_dfastate_t **)
3319             calloc (sizeof (re_dfastate_t *), SBC_MAX);;
3320           return state->trtable;
3321         }
3322       return NULL;
3323     }
3324
3325   err = re_node_set_alloc (&follows, ndests + 1);
3326   if (BE (err != REG_NOERROR, 0))
3327     goto out_free;
3328
3329 #ifdef _LIBC
3330   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3331                          + ndests * 3 * sizeof (re_dfastate_t *)))
3332     dest_states = (re_dfastate_t **)
3333                   alloca (ndests * 3 * sizeof (re_dfastate_t *));
3334   else
3335 #endif
3336     {
3337       dest_states = (re_dfastate_t **)
3338                     malloc (ndests * 3 * sizeof (re_dfastate_t *));
3339       if (BE (dest_states == NULL, 0))
3340         {
3341 out_free:
3342           if (dest_states_malloced)
3343             free (dest_states);
3344           re_node_set_free (&follows);
3345           for (i = 0; i < ndests; ++i)
3346             re_node_set_free (dests_node + i);
3347           if (dests_node_malloced)
3348             free (dests_node);
3349           return NULL;
3350         }
3351       dest_states_malloced = 1;
3352     }
3353   dest_states_word = dest_states + ndests;
3354   dest_states_nl = dest_states_word + ndests;
3355   bitset_empty (acceptable);
3356
3357   /* Then build the states for all destinations.  */
3358   for (i = 0; i < ndests; ++i)
3359     {
3360       int next_node;
3361       re_node_set_empty (&follows);
3362       /* Merge the follows of this destination states.  */
3363       for (j = 0; j < dests_node[i].nelem; ++j)
3364         {
3365           next_node = dfa->nexts[dests_node[i].elems[j]];
3366           if (next_node != -1)
3367             {
3368               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3369               if (BE (err != REG_NOERROR, 0))
3370                 goto out_free;
3371             }
3372         }
3373       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3374       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3375         goto out_free;
3376       /* If the new state has context constraint,
3377          build appropriate states for these contexts.  */
3378       if (dest_states[i]->has_constraint)
3379         {
3380           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3381                                                           CONTEXT_WORD);
3382           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3383             goto out_free;
3384
3385           if (dest_states[i] != dest_states_word[i]
3386               && dfa->mb_cur_max > 1)
3387             state->word_trtable = 1;
3388
3389           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3390                                                         CONTEXT_NEWLINE);
3391           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3392             goto out_free;
3393         }
3394       else
3395         {
3396           dest_states_word[i] = dest_states[i];
3397           dest_states_nl[i] = dest_states[i];
3398         }
3399       bitset_merge (acceptable, dests_ch[i]);
3400     }
3401
3402   if (!BE (state->word_trtable, 0))
3403     {
3404       /* We don't care about whether the following character is a word
3405          character, or we are in a single-byte character set so we can
3406          discern by looking at the character code: allocate a
3407          256-entry transition table.  */
3408       trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3409       if (BE (trtable == NULL, 0))
3410         goto out_free;
3411
3412       /* For all characters ch...:  */
3413       for (i = 0; i < BITSET_UINTS; ++i)
3414         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3415              elem;
3416              mask <<= 1, elem >>= 1, ++ch)
3417           if (BE (elem & 1, 0))
3418             {
3419               /* There must be exactly one destination which accepts
3420                  character ch.  See group_nodes_into_DFAstates.  */
3421               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3422                 ;
3423
3424               /* j-th destination accepts the word character ch.  */
3425               if (dfa->word_char[i] & mask)
3426                 trtable[ch] = dest_states_word[j];
3427               else
3428                 trtable[ch] = dest_states[j];
3429             }
3430     }
3431   else
3432     {
3433       /* We care about whether the following character is a word
3434          character, and we are in a multi-byte character set: discern
3435          by looking at the character code: build two 256-entry
3436          transition tables, one starting at trtable[0] and one
3437          starting at trtable[SBC_MAX].  */
3438       trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *),
3439                                            2 * SBC_MAX);
3440       if (BE (trtable == NULL, 0))
3441         goto out_free;
3442
3443       /* For all characters ch...:  */
3444       for (i = 0; i < BITSET_UINTS; ++i)
3445         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3446              elem;
3447              mask <<= 1, elem >>= 1, ++ch)
3448           if (BE (elem & 1, 0))
3449             {
3450               /* There must be exactly one destination which accepts
3451                  character ch.  See group_nodes_into_DFAstates.  */
3452               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3453                 ;
3454
3455               /* j-th destination accepts the word character ch.  */
3456               trtable[ch] = dest_states[j];
3457               trtable[ch + SBC_MAX] = dest_states_word[j];
3458             }
3459     }
3460
3461   /* new line */
3462   if (bitset_contain (acceptable, NEWLINE_CHAR))
3463     {
3464       /* The current state accepts newline character.  */
3465       for (j = 0; j < ndests; ++j)
3466         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3467           {
3468             /* k-th destination accepts newline character.  */
3469             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3470             if (state->word_trtable)
3471               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3472             /* There must be only one destination which accepts
3473                newline.  See group_nodes_into_DFAstates.  */
3474             break;
3475           }
3476     }
3477
3478   if (dest_states_malloced)
3479     free (dest_states);
3480
3481   re_node_set_free (&follows);
3482   for (i = 0; i < ndests; ++i)
3483     re_node_set_free (dests_node + i);
3484
3485   if (dests_node_malloced)
3486     free (dests_node);
3487
3488   state->trtable = trtable;
3489   return trtable;
3490 }
3491
3492 /* Group all nodes belonging to STATE into several destinations.
3493    Then for all destinations, set the nodes belonging to the destination
3494    to DESTS_NODE[i] and set the characters accepted by the destination
3495    to DEST_CH[i].  This function return the number of destinations.  */
3496
3497 static int
3498 group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3499     re_dfa_t *dfa;
3500     const re_dfastate_t *state;
3501     re_node_set *dests_node;
3502     bitset *dests_ch;
3503 {
3504   reg_errcode_t err;
3505   int result;
3506   int i, j, k;
3507   int ndests; /* Number of the destinations from `state'.  */
3508   bitset accepts; /* Characters a node can accept.  */
3509   const re_node_set *cur_nodes = &state->nodes;
3510   bitset_empty (accepts);
3511   ndests = 0;
3512
3513   /* For all the nodes belonging to `state',  */
3514   for (i = 0; i < cur_nodes->nelem; ++i)
3515     {
3516       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3517       re_token_type_t type = node->type;
3518       unsigned int constraint = node->constraint;
3519
3520       /* Enumerate all single byte character this node can accept.  */
3521       if (type == CHARACTER)
3522         bitset_set (accepts, node->opr.c);
3523       else if (type == SIMPLE_BRACKET)
3524         {
3525           bitset_merge (accepts, node->opr.sbcset);
3526         }
3527       else if (type == OP_PERIOD)
3528         {
3529 #ifdef RE_ENABLE_I18N
3530           if (dfa->mb_cur_max > 1)
3531             bitset_merge (accepts, dfa->sb_char);
3532           else
3533 #endif
3534             bitset_set_all (accepts);
3535           if (!(dfa->syntax & RE_DOT_NEWLINE))
3536             bitset_clear (accepts, '\n');
3537           if (dfa->syntax & RE_DOT_NOT_NULL)
3538             bitset_clear (accepts, '\0');
3539         }
3540 #ifdef RE_ENABLE_I18N
3541       else if (type == OP_UTF8_PERIOD)
3542         {
3543           memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3544           if (!(dfa->syntax & RE_DOT_NEWLINE))
3545             bitset_clear (accepts, '\n');
3546           if (dfa->syntax & RE_DOT_NOT_NULL)
3547             bitset_clear (accepts, '\0');
3548         }
3549 #endif
3550       else
3551         continue;
3552
3553       /* Check the `accepts' and sift the characters which are not
3554          match it the context.  */
3555       if (constraint)
3556         {
3557           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3558             {
3559               int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3560               bitset_empty (accepts);
3561               if (accepts_newline)
3562                 bitset_set (accepts, NEWLINE_CHAR);
3563               else
3564                 continue;
3565             }
3566           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3567             {
3568               bitset_empty (accepts);
3569               continue;
3570             }
3571
3572           if (constraint & NEXT_WORD_CONSTRAINT)
3573             {
3574               unsigned int any_set = 0;
3575               if (type == CHARACTER && !node->word_char)
3576                 {
3577                   bitset_empty (accepts);
3578                   continue;
3579                 }
3580 #ifdef RE_ENABLE_I18N
3581               if (dfa->mb_cur_max > 1)
3582                 for (j = 0; j < BITSET_UINTS; ++j)
3583                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3584               else
3585 #endif
3586                 for (j = 0; j < BITSET_UINTS; ++j)
3587                   any_set |= (accepts[j] &= dfa->word_char[j]);
3588               if (!any_set)
3589                 continue;
3590             }
3591           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3592             {
3593               unsigned int any_set = 0;
3594               if (type == CHARACTER && node->word_char)
3595                 {
3596                   bitset_empty (accepts);
3597                   continue;
3598                 }
3599 #ifdef RE_ENABLE_I18N
3600               if (dfa->mb_cur_max > 1)
3601                 for (j = 0; j < BITSET_UINTS; ++j)
3602                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3603               else
3604 #endif
3605                 for (j = 0; j < BITSET_UINTS; ++j)
3606                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3607               if (!any_set)
3608                 continue;
3609             }
3610         }
3611
3612       /* Then divide `accepts' into DFA states, or create a new
3613          state.  Above, we make sure that accepts is not empty.  */
3614       for (j = 0; j < ndests; ++j)
3615         {
3616           bitset intersec; /* Intersection sets, see below.  */
3617           bitset remains;
3618           /* Flags, see below.  */
3619           int has_intersec, not_subset, not_consumed;
3620
3621           /* Optimization, skip if this state doesn't accept the character.  */
3622           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3623             continue;
3624
3625           /* Enumerate the intersection set of this state and `accepts'.  */
3626           has_intersec = 0;
3627           for (k = 0; k < BITSET_UINTS; ++k)
3628             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3629           /* And skip if the intersection set is empty.  */
3630           if (!has_intersec)
3631             continue;
3632
3633           /* Then check if this state is a subset of `accepts'.  */
3634           not_subset = not_consumed = 0;
3635           for (k = 0; k < BITSET_UINTS; ++k)
3636             {
3637               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3638               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3639             }
3640
3641           /* If this state isn't a subset of `accepts', create a
3642              new group state, which has the `remains'. */
3643           if (not_subset)
3644             {
3645               bitset_copy (dests_ch[ndests], remains);
3646               bitset_copy (dests_ch[j], intersec);
3647               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3648               if (BE (err != REG_NOERROR, 0))
3649                 goto error_return;
3650               ++ndests;
3651             }
3652
3653           /* Put the position in the current group. */
3654           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3655           if (BE (result < 0, 0))
3656             goto error_return;
3657
3658           /* If all characters are consumed, go to next node. */
3659           if (!not_consumed)
3660             break;
3661         }
3662       /* Some characters remain, create a new group. */
3663       if (j == ndests)
3664         {
3665           bitset_copy (dests_ch[ndests], accepts);
3666           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3667           if (BE (err != REG_NOERROR, 0))
3668             goto error_return;
3669           ++ndests;
3670           bitset_empty (accepts);
3671         }
3672     }
3673   return ndests;
3674  error_return:
3675   for (j = 0; j < ndests; ++j)
3676     re_node_set_free (dests_node + j);
3677   return -1;
3678 }
3679
3680 #ifdef RE_ENABLE_I18N
3681 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3682    Return the number of the bytes the node accepts.
3683    STR_IDX is the current index of the input string.
3684
3685    This function handles the nodes which can accept one character, or
3686    one collating element like '.', '[a-z]', opposite to the other nodes
3687    can only accept one byte.  */
3688
3689 static int
3690 check_node_accept_bytes (dfa, node_idx, input, str_idx)
3691     re_dfa_t *dfa;
3692     int node_idx, str_idx;
3693     const re_string_t *input;
3694 {
3695   const re_token_t *node = dfa->nodes + node_idx;
3696   int char_len, elem_len;
3697   int i;
3698
3699   if (BE (node->type == OP_UTF8_PERIOD, 0))
3700     {
3701       unsigned char c = re_string_byte_at (input, str_idx), d;
3702       if (BE (c < 0xc2, 1))
3703         return 0;
3704
3705       if (str_idx + 2 > input->len)
3706         return 0;
3707
3708       d = re_string_byte_at (input, str_idx + 1);
3709       if (c < 0xe0)
3710         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3711       else if (c < 0xf0)
3712         {
3713           char_len = 3;
3714           if (c == 0xe0 && d < 0xa0)
3715             return 0;
3716         }
3717       else if (c < 0xf8)
3718         {
3719           char_len = 4;
3720           if (c == 0xf0 && d < 0x90)
3721             return 0;
3722         }
3723       else if (c < 0xfc)
3724         {
3725           char_len = 5;
3726           if (c == 0xf8 && d < 0x88)
3727             return 0;
3728         }
3729       else if (c < 0xfe)
3730         {
3731           char_len = 6;
3732           if (c == 0xfc && d < 0x84)
3733             return 0;
3734         }
3735       else
3736         return 0;
3737
3738       if (str_idx + char_len > input->len)
3739         return 0;
3740
3741       for (i = 1; i < char_len; ++i)
3742         {
3743           d = re_string_byte_at (input, str_idx + i);
3744           if (d < 0x80 || d > 0xbf)
3745             return 0;
3746         }
3747       return char_len;
3748     }
3749
3750   char_len = re_string_char_size_at (input, str_idx);
3751   if (node->type == OP_PERIOD)
3752     {
3753       if (char_len <= 1)
3754         return 0;
3755       /* FIXME: I don't think this if is needed, as both '\n'
3756          and '\0' are char_len == 1.  */
3757       /* '.' accepts any one character except the following two cases.  */
3758       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3759            re_string_byte_at (input, str_idx) == '\n') ||
3760           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3761            re_string_byte_at (input, str_idx) == '\0'))
3762         return 0;
3763       return char_len;
3764     }
3765
3766   elem_len = re_string_elem_size_at (input, str_idx);
3767   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3768     return 0;
3769
3770   if (node->type == COMPLEX_BRACKET)
3771     {
3772       const re_charset_t *cset = node->opr.mbcset;
3773 # ifdef _LIBC
3774       const unsigned char *pin = ((char *) re_string_get_buffer (input)
3775                                   + str_idx);
3776       int j;
3777       uint32_t nrules;
3778 # endif /* _LIBC */
3779       int match_len = 0;
3780       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3781                     ? re_string_wchar_at (input, str_idx) : 0);
3782
3783       /* match with multibyte character?  */
3784       for (i = 0; i < cset->nmbchars; ++i)
3785         if (wc == cset->mbchars[i])
3786           {
3787             match_len = char_len;
3788             goto check_node_accept_bytes_match;
3789           }
3790       /* match with character_class?  */
3791       for (i = 0; i < cset->nchar_classes; ++i)
3792         {
3793           wctype_t wt = cset->char_classes[i];
3794           if (__iswctype (wc, wt))
3795             {
3796               match_len = char_len;
3797               goto check_node_accept_bytes_match;
3798             }
3799         }
3800
3801 # ifdef _LIBC
3802       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3803       if (nrules != 0)
3804         {
3805           unsigned int in_collseq = 0;
3806           const int32_t *table, *indirect;
3807           const unsigned char *weights, *extra;
3808           const char *collseqwc;
3809           int32_t idx;
3810           /* This #include defines a local function!  */
3811 #  include <locale/weight.h>
3812
3813           /* match with collating_symbol?  */
3814           if (cset->ncoll_syms)
3815             extra = (const unsigned char *)
3816               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3817           for (i = 0; i < cset->ncoll_syms; ++i)
3818             {
3819               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3820               /* Compare the length of input collating element and
3821                  the length of current collating element.  */
3822               if (*coll_sym != elem_len)
3823                 continue;
3824               /* Compare each bytes.  */
3825               for (j = 0; j < *coll_sym; j++)
3826                 if (pin[j] != coll_sym[1 + j])
3827                   break;
3828               if (j == *coll_sym)
3829                 {
3830                   /* Match if every bytes is equal.  */
3831                   match_len = j;
3832                   goto check_node_accept_bytes_match;
3833                 }
3834             }
3835
3836           if (cset->nranges)
3837             {
3838               if (elem_len <= char_len)
3839                 {
3840                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3841                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3842                 }
3843               else
3844                 in_collseq = find_collation_sequence_value (pin, elem_len);
3845             }
3846           /* match with range expression?  */
3847           for (i = 0; i < cset->nranges; ++i)
3848             if (cset->range_starts[i] <= in_collseq
3849                 && in_collseq <= cset->range_ends[i])
3850               {
3851                 match_len = elem_len;
3852                 goto check_node_accept_bytes_match;
3853               }
3854
3855           /* match with equivalence_class?  */
3856           if (cset->nequiv_classes)
3857             {
3858               const unsigned char *cp = pin;
3859               table = (const int32_t *)
3860                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3861               weights = (const unsigned char *)
3862                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3863               extra = (const unsigned char *)
3864                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3865               indirect = (const int32_t *)
3866                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3867               idx = findidx (&cp);
3868               if (idx > 0)
3869                 for (i = 0; i < cset->nequiv_classes; ++i)
3870                   {
3871                     int32_t equiv_class_idx = cset->equiv_classes[i];
3872                     size_t weight_len = weights[idx];
3873                     if (weight_len == weights[equiv_class_idx])
3874                       {
3875                         int cnt = 0;
3876                         while (cnt <= weight_len
3877                                && (weights[equiv_class_idx + 1 + cnt]
3878                                    == weights[idx + 1 + cnt]))
3879                           ++cnt;
3880                         if (cnt > weight_len)
3881                           {
3882                             match_len = elem_len;
3883                             goto check_node_accept_bytes_match;
3884                           }
3885                       }
3886                   }
3887             }
3888         }
3889       else
3890 # endif /* _LIBC */
3891         {
3892           /* match with range expression?  */
3893 #if __GNUC__ >= 2
3894           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3895 #else
3896           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3897           cmp_buf[2] = wc;
3898 #endif
3899           for (i = 0; i < cset->nranges; ++i)
3900             {
3901               cmp_buf[0] = cset->range_starts[i];
3902               cmp_buf[4] = cset->range_ends[i];
3903               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3904                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3905                 {
3906                   match_len = char_len;
3907                   goto check_node_accept_bytes_match;
3908                 }
3909             }
3910         }
3911     check_node_accept_bytes_match:
3912       if (!cset->non_match)
3913         return match_len;
3914       else
3915         {
3916           if (match_len > 0)
3917             return 0;
3918           else
3919             return (elem_len > char_len) ? elem_len : char_len;
3920         }
3921     }
3922   return 0;
3923 }
3924
3925 # ifdef _LIBC
3926 static unsigned int
3927 find_collation_sequence_value (mbs, mbs_len)
3928     const unsigned char *mbs;
3929     size_t mbs_len;
3930 {
3931   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3932   if (nrules == 0)
3933     {
3934       if (mbs_len == 1)
3935         {
3936           /* No valid character.  Match it as a single byte character.  */
3937           const unsigned char *collseq = (const unsigned char *)
3938             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3939           return collseq[mbs[0]];
3940         }
3941       return UINT_MAX;
3942     }
3943   else
3944     {
3945       int32_t idx;
3946       const unsigned char *extra = (const unsigned char *)
3947         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3948       int32_t extrasize = (const unsigned char *)
3949         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3950
3951       for (idx = 0; idx < extrasize;)
3952         {
3953           int mbs_cnt, found = 0;
3954           int32_t elem_mbs_len;
3955           /* Skip the name of collating element name.  */
3956           idx = idx + extra[idx] + 1;
3957           elem_mbs_len = extra[idx++];
3958           if (mbs_len == elem_mbs_len)
3959             {
3960               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3961                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3962                   break;
3963               if (mbs_cnt == elem_mbs_len)
3964                 /* Found the entry.  */
3965                 found = 1;
3966             }
3967           /* Skip the byte sequence of the collating element.  */
3968           idx += elem_mbs_len;
3969           /* Adjust for the alignment.  */
3970           idx = (idx + 3) & ~3;
3971           /* Skip the collation sequence value.  */
3972           idx += sizeof (uint32_t);
3973           /* Skip the wide char sequence of the collating element.  */
3974           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3975           /* If we found the entry, return the sequence value.  */
3976           if (found)
3977             return *(uint32_t *) (extra + idx);
3978           /* Skip the collation sequence value.  */
3979           idx += sizeof (uint32_t);
3980         }
3981       return UINT_MAX;
3982     }
3983 }
3984 # endif /* _LIBC */
3985 #endif /* RE_ENABLE_I18N */
3986
3987 /* Check whether the node accepts the byte which is IDX-th
3988    byte of the INPUT.  */
3989
3990 static int
3991 check_node_accept (mctx, node, idx)
3992     const re_match_context_t *mctx;
3993     const re_token_t *node;
3994     int idx;
3995 {
3996   re_dfa_t *const dfa = mctx->dfa;
3997   unsigned char ch;
3998   if (node->constraint)
3999     {
4000       /* The node has constraints.  Check whether the current context
4001          satisfies the constraints.  */
4002       unsigned int context = re_string_context_at (&mctx->input, idx,
4003                                                    mctx->eflags);
4004       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4005         return 0;
4006     }
4007   ch = re_string_byte_at (&mctx->input, idx);
4008   switch (node->type)
4009     {
4010     case CHARACTER:
4011       return node->opr.c == ch;
4012     case SIMPLE_BRACKET:
4013       return bitset_contain (node->opr.sbcset, ch);
4014 #ifdef RE_ENABLE_I18N
4015     case OP_UTF8_PERIOD:
4016       if (ch >= 0x80)
4017         return 0;
4018       /* FALLTHROUGH */
4019 #endif
4020     case OP_PERIOD:
4021       return !((ch == '\n' && !(dfa->syntax & RE_DOT_NEWLINE))
4022                || (ch == '\0' && (dfa->syntax & RE_DOT_NOT_NULL)));
4023     default:
4024       return 0;
4025     }
4026 }
4027
4028 /* Extend the buffers, if the buffers have run out.  */
4029
4030 static reg_errcode_t
4031 extend_buffers (mctx)
4032      re_match_context_t *mctx;
4033 {
4034   reg_errcode_t ret;
4035   re_string_t *pstr = &mctx->input;
4036
4037   /* Double the lengthes of the buffers.  */
4038   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4039   if (BE (ret != REG_NOERROR, 0))
4040     return ret;
4041
4042   if (mctx->state_log != NULL)
4043     {
4044       /* And double the length of state_log.  */
4045       /* XXX We have no indication of the size of this buffer.  If this
4046          allocation fail we have no indication that the state_log array
4047          does not have the right size.  */
4048       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4049                                               pstr->bufs_len + 1);
4050       if (BE (new_array == NULL, 0))
4051         return REG_ESPACE;
4052       mctx->state_log = new_array;
4053     }
4054
4055   /* Then reconstruct the buffers.  */
4056   if (pstr->icase)
4057     {
4058 #ifdef RE_ENABLE_I18N
4059       if (pstr->mb_cur_max > 1)
4060         {
4061           ret = build_wcs_upper_buffer (pstr);
4062           if (BE (ret != REG_NOERROR, 0))
4063             return ret;
4064         }
4065       else
4066 #endif /* RE_ENABLE_I18N  */
4067         build_upper_buffer (pstr);
4068     }
4069   else
4070     {
4071 #ifdef RE_ENABLE_I18N
4072       if (pstr->mb_cur_max > 1)
4073         build_wcs_buffer (pstr);
4074       else
4075 #endif /* RE_ENABLE_I18N  */
4076         {
4077           if (pstr->trans != NULL)
4078             re_string_translate_buffer (pstr);
4079         }
4080     }
4081   return REG_NOERROR;
4082 }
4083
4084 \f
4085 /* Functions for matching context.  */
4086
4087 /* Initialize MCTX.  */
4088
4089 static reg_errcode_t
4090 match_ctx_init (mctx, eflags, n)
4091     re_match_context_t *mctx;
4092     int eflags, n;
4093 {
4094   mctx->eflags = eflags;
4095   mctx->match_last = -1;
4096   if (n > 0)
4097     {
4098       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4099       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4100       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4101         return REG_ESPACE;
4102     }
4103   /* Already zero-ed by the caller.
4104      else
4105        mctx->bkref_ents = NULL;
4106      mctx->nbkref_ents = 0;
4107      mctx->nsub_tops = 0;  */
4108   mctx->abkref_ents = n;
4109   mctx->max_mb_elem_len = 1;
4110   mctx->asub_tops = n;
4111   return REG_NOERROR;
4112 }
4113
4114 /* Clean the entries which depend on the current input in MCTX.
4115    This function must be invoked when the matcher changes the start index
4116    of the input, or changes the input string.  */
4117
4118 static void
4119 match_ctx_clean (mctx)
4120     re_match_context_t *mctx;
4121 {
4122   int st_idx;
4123   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4124     {
4125       int sl_idx;
4126       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4127       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4128         {
4129           re_sub_match_last_t *last = top->lasts[sl_idx];
4130           re_free (last->path.array);
4131           re_free (last);
4132         }
4133       re_free (top->lasts);
4134       if (top->path)
4135         {
4136           re_free (top->path->array);
4137           re_free (top->path);
4138         }
4139       free (top);
4140     }
4141
4142   mctx->nsub_tops = 0;
4143   mctx->nbkref_ents = 0;
4144 }
4145
4146 /* Free all the memory associated with MCTX.  */
4147
4148 static void
4149 match_ctx_free (mctx)
4150     re_match_context_t *mctx;
4151 {
4152   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4153   match_ctx_clean (mctx);
4154   re_free (mctx->sub_tops);
4155   re_free (mctx->bkref_ents);
4156 }
4157
4158 /* Add a new backreference entry to MCTX.
4159    Note that we assume that caller never call this function with duplicate
4160    entry, and call with STR_IDX which isn't smaller than any existing entry.
4161 */
4162
4163 static reg_errcode_t
4164 match_ctx_add_entry (mctx, node, str_idx, from, to)
4165      re_match_context_t *mctx;
4166      int node, str_idx, from, to;
4167 {
4168   if (mctx->nbkref_ents >= mctx->abkref_ents)
4169     {
4170       struct re_backref_cache_entry* new_entry;
4171       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4172                               mctx->abkref_ents * 2);
4173       if (BE (new_entry == NULL, 0))
4174         {
4175           re_free (mctx->bkref_ents);
4176           return REG_ESPACE;
4177         }
4178       mctx->bkref_ents = new_entry;
4179       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4180               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4181       mctx->abkref_ents *= 2;
4182     }
4183   if (mctx->nbkref_ents > 0
4184       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4185     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4186
4187   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4188   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4189   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4190   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4191
4192   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4193      If bit N is clear, means that this entry won't epsilon-transition to
4194      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4195      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4196      such node.
4197
4198      A backreference does not epsilon-transition unless it is empty, so set
4199      to all zeros if FROM != TO.  */
4200   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4201     = (from == to ? ~0 : 0);
4202
4203   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4204   if (mctx->max_mb_elem_len < to - from)
4205     mctx->max_mb_elem_len = to - from;
4206   return REG_NOERROR;
4207 }
4208
4209 /* Search for the first entry which has the same str_idx, or -1 if none is
4210    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4211
4212 static int
4213 search_cur_bkref_entry (mctx, str_idx)
4214      re_match_context_t *mctx;
4215      int str_idx;
4216 {
4217   int left, right, mid, last;
4218   last = right = mctx->nbkref_ents;
4219   for (left = 0; left < right;)
4220     {
4221       mid = (left + right) / 2;
4222       if (mctx->bkref_ents[mid].str_idx < str_idx)
4223         left = mid + 1;
4224       else
4225         right = mid;
4226     }
4227   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4228     return left;
4229   else
4230     return -1;
4231 }
4232
4233 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4234    at STR_IDX.  */
4235
4236 static reg_errcode_t
4237 match_ctx_add_subtop (mctx, node, str_idx)
4238      re_match_context_t *mctx;
4239      int node, str_idx;
4240 {
4241 #ifdef DEBUG
4242   assert (mctx->sub_tops != NULL);
4243   assert (mctx->asub_tops > 0);
4244 #endif
4245   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4246     {
4247       int new_asub_tops = mctx->asub_tops * 2;
4248       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4249                                                    re_sub_match_top_t *,
4250                                                    new_asub_tops);
4251       if (BE (new_array == NULL, 0))
4252         return REG_ESPACE;
4253       mctx->sub_tops = new_array;
4254       mctx->asub_tops = new_asub_tops;
4255     }
4256   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4257   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4258     return REG_ESPACE;
4259   mctx->sub_tops[mctx->nsub_tops]->node = node;
4260   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4261   return REG_NOERROR;
4262 }
4263
4264 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4265    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4266
4267 static re_sub_match_last_t *
4268 match_ctx_add_sublast (subtop, node, str_idx)
4269      re_sub_match_top_t *subtop;
4270      int node, str_idx;
4271 {
4272   re_sub_match_last_t *new_entry;
4273   if (BE (subtop->nlasts == subtop->alasts, 0))
4274     {
4275       int new_alasts = 2 * subtop->alasts + 1;
4276       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4277                                                     re_sub_match_last_t *,
4278                                                     new_alasts);
4279       if (BE (new_array == NULL, 0))
4280         return NULL;
4281       subtop->lasts = new_array;
4282       subtop->alasts = new_alasts;
4283     }
4284   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4285   if (BE (new_entry != NULL, 1))
4286     {
4287       subtop->lasts[subtop->nlasts] = new_entry;
4288       new_entry->node = node;
4289       new_entry->str_idx = str_idx;
4290       ++subtop->nlasts;
4291     }
4292   return new_entry;
4293 }
4294
4295 static void
4296 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx)
4297     re_sift_context_t *sctx;
4298     re_dfastate_t **sifted_sts, **limited_sts;
4299     int last_node, last_str_idx;
4300 {
4301   sctx->sifted_states = sifted_sts;
4302   sctx->limited_states = limited_sts;
4303   sctx->last_node = last_node;
4304   sctx->last_str_idx = last_str_idx;
4305   re_node_set_init_empty (&sctx->limits);
4306 }