posix/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                      int n) internal_function;
  23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24 static void match_ctx_free (re_match_context_t *cache) internal_function;
  25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  26                                           int str_idx, int from, int to)
  27      internal_function;
  28 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
  29      internal_function;
  30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  31                                            int str_idx) internal_function;
  32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  33                                                    int node, int str_idx)
  34      internal_function;
  35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  36                            re_dfastate_t **limited_sts, int last_node,
  37                            int last_str_idx)
  38      internal_function;
  39 static reg_errcode_t re_search_internal (const regex_t *preg,
  40                                          const char *string, int length,
  41                                          int start, int range, int stop,
  42                                          size_t nmatch, regmatch_t pmatch[],
  43                                          int eflags) internal_function;
  44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  45                              const char *string1, int length1,
  46                              const char *string2, int length2,
  47                              int start, int range, struct re_registers *regs,
  48                              int stop, int ret_len) internal_function;
  49 static int re_search_stub (struct re_pattern_buffer *bufp,
  50                            const char *string, int length, int start,
  51                            int range, int stop, struct re_registers *regs,
  52                            int ret_len) internal_function;
  53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  54                               int nregs, int regs_allocated) internal_function;
  55 static inline re_dfastate_t *acquire_init_state_context
  56      (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
  57      __attribute ((always_inline)) internal_function;
  58 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  59      internal_function;
  60 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  61                            int *p_match_first)
  62      internal_function;
  63 static int check_halt_node_context (const re_dfa_t *dfa, int node,
  64                                     unsigned int context) internal_function;
  65 static int check_halt_state_context (const re_match_context_t *mctx,
  66                                      const re_dfastate_t *state, int idx)
  67      internal_function;
  68 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
  69                          regmatch_t *prev_idx_match, int cur_node,
  70                          int cur_idx, int nmatch) internal_function;
  71 static int proceed_next_node (const re_match_context_t *mctx,
  72                               int nregs, regmatch_t *regs,
  73                               int *pidx, int node, re_node_set *eps_via_nodes,
  74                               struct re_fail_stack_t *fs) internal_function;
  75 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  76                                       int str_idx, int dest_node, int nregs,
  77                                       regmatch_t *regs,
  78                                       re_node_set *eps_via_nodes) internal_function;
  79 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
  80                            regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
  81 static reg_errcode_t set_regs (const regex_t *preg,
  82                                const re_match_context_t *mctx,
  83                                size_t nmatch, regmatch_t *pmatch,
  84                                int fl_backtrack) internal_function;
  85 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
  86
  87 #ifdef RE_ENABLE_I18N
  88 static int sift_states_iter_mb (const re_match_context_t *mctx,
  89                                 re_sift_context_t *sctx,
  90                                 int node_idx, int str_idx, int max_str_idx) internal_function;
  91 #endif /* RE_ENABLE_I18N */
  92 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
  93                                            re_sift_context_t *sctx) internal_function;
  94 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
  95                                           re_sift_context_t *sctx, int str_idx,
  96                                           re_node_set *cur_dest) internal_function;
  97 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
  98                                               re_sift_context_t *sctx,
  99                                               int str_idx,
 100                                               re_node_set *dest_nodes) internal_function;
 101 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
 102                                             re_node_set *dest_nodes,
 103                                             const re_node_set *candidates) internal_function;
 104 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
 105                                             re_node_set *dest_nodes,
 106                                             const re_node_set *and_nodes) internal_function;
 107 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
 108                              int dst_node, int dst_idx, int src_node,
 109                              int src_idx) internal_function;
 110 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
 111                                         int boundaries, int subexp_idx,
 112                                         int from_node, int bkref_idx) internal_function;
 113 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
 114                                       int limit, int subexp_idx,
 115                                       int node, int str_idx,
 116                                       int bkref_idx) internal_function;
 117 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
 118                                           re_node_set *dest_nodes,
 119                                           const re_node_set *candidates,
 120                                           re_node_set *limits,
 121                                           struct re_backref_cache_entry *bkref_ents,
 122                                           int str_idx) internal_function;
 123 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
 124                                         re_sift_context_t *sctx,
 125                                         int str_idx, const re_node_set *candidates) internal_function;
 126 static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
 127                                                 int next_state_log_idx) internal_function;
 128 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
 129                                         re_dfastate_t **src, int num) internal_function;
 130 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 131                                          re_match_context_t *mctx) internal_function;
 132 static re_dfastate_t *transit_state (reg_errcode_t *err,
 133                                      re_match_context_t *mctx,
 134                                      re_dfastate_t *state) internal_function;
 135 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 136                                             re_match_context_t *mctx,
 137                                             re_dfastate_t *next_state) internal_function;
 138 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 139                                                 re_node_set *cur_nodes,
 140                                                 int str_idx) internal_function;
 141 #if 0
 142 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 143                                         re_match_context_t *mctx,
 144                                         re_dfastate_t *pstate) internal_function;
 145 #endif
 146 #ifdef RE_ENABLE_I18N
 147 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 148                                        re_dfastate_t *pstate) internal_function;
 149 #endif /* RE_ENABLE_I18N */
 150 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 151                                           const re_node_set *nodes) internal_function;
 152 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 153                                  int bkref_node, int bkref_str_idx) internal_function;
 154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 155                                      const re_sub_match_top_t *sub_top,
 156                                      re_sub_match_last_t *sub_last,
 157                                      int bkref_node, int bkref_str) internal_function;
 158 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 159                              int subexp_idx, int type) internal_function;
 160 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 161                                     state_array_t *path, int top_node,
 162                                     int top_str, int last_node, int last_str,
 163                                     int type) internal_function;
 164 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 165                                                    int str_idx,
 166                                                    re_node_set *cur_nodes,
 167                                                    re_node_set *next_nodes) internal_function;
 168 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
 169                                                re_node_set *cur_nodes,
 170                                                int ex_subexp, int type) internal_function;
 171 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
 172                                                    re_node_set *dst_nodes,
 173                                                    int target, int ex_subexp,
 174                                                    int type) internal_function;
 175 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 176                                          re_node_set *cur_nodes, int cur_str,
 177                                          int subexp_num, int type) internal_function;
 178 static re_dfastate_t **build_trtable (re_dfa_t *dfa,
 179                                       re_dfastate_t *state) internal_function;
 180 #ifdef RE_ENABLE_I18N
 181 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
 182                                     const re_string_t *input, int idx) internal_function;
 183 # ifdef _LIBC
 184 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 185                                                    size_t name_len) internal_function;
 186 # endif /* _LIBC */
 187 #endif /* RE_ENABLE_I18N */
 188 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
 189                                        const re_dfastate_t *state,
 190                                        re_node_set *states_node,
 191                                        bitset *states_ch) internal_function;
 192 static int check_node_accept (const re_match_context_t *mctx,
 193                               const re_token_t *node, int idx) internal_function;
 194 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
 195 \f
 196 /* Entry point for POSIX code.  */
 197
 198 /* regexec searches for a given pattern, specified by PREG, in the
 199    string STRING.
 200
 201    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 202    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 203    least NMATCH elements, and we set them to the offsets of the
 204    corresponding matched substrings.
 205
 206    EFLAGS specifies `execution flags' which affect matching: if
 207    REG_NOTBOL is set, then ^ does not match at the beginning of the
 208    string; if REG_NOTEOL is set, then $ does not match at the end.
 209
 210    We return 0 if we find a match and REG_NOMATCH if not.  */
 211
 212 int
 213 regexec (preg, string, nmatch, pmatch, eflags)
 214     const regex_t *__restrict preg;
 215     const char *__restrict string;
 216     size_t nmatch;
 217     regmatch_t pmatch[];
 218     int eflags;
 219 {
 220   reg_errcode_t err;
 221   int start, length;
 222   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
 223
 224   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 225     return REG_BADPAT;
 226
 227   if (eflags & REG_STARTEND)
 228     {
 229       start = pmatch[0].rm_so;
 230       length = pmatch[0].rm_eo;
 231     }
 232   else
 233     {
 234       start = 0;
 235       length = strlen (string);
 236     }
 237
 238   __libc_lock_lock (dfa->lock);
 239   if (preg->no_sub)
 240     err = re_search_internal (preg, string, length, start, length - start,
 241                               length, 0, NULL, eflags);
 242   else
 243     err = re_search_internal (preg, string, length, start, length - start,
 244                               length, nmatch, pmatch, eflags);
 245   __libc_lock_unlock (dfa->lock);
 246   return err != REG_NOERROR;
 247 }
 248
 249 #ifdef _LIBC
 250 # include <shlib-compat.h>
 251 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 252
 253 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 254 __typeof__ (__regexec) __compat_regexec;
 255
 256 int
 257 attribute_compat_text_section
 258 __compat_regexec (const regex_t *__restrict preg,
 259                   const char *__restrict string, size_t nmatch,
 260                   regmatch_t pmatch[], int eflags)
 261 {
 262   return regexec (preg, string, nmatch, pmatch,
 263                   eflags & (REG_NOTBOL | REG_NOTEOL));
 264 }
 265 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 266 # endif
 267 #endif
 268
 269 /* Entry points for GNU code.  */
 270
 271 /* re_match, re_search, re_match_2, re_search_2
 272
  273    The former two functions operate on STRING with length LENGTH,
  274    while the latter two operate on the concatenation of STRING1 and STRING2
  275    with lengths LENGTH1 and LENGTH2, respectively.
 276
 277    re_match() matches the compiled pattern in BUFP against the string,
 278    starting at index START.
 279
 280    re_search() first tries matching at index START, then it tries to match
 281    starting from index START + 1, and so on.  The last start position tried
 282    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 283    way as re_match().)
 284
  285    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
  286    the first STOP characters of the concatenation of the strings should be
  287    considered.
 288
  289    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
  290    and all groups are stored in REGS.  (For the "_2" variants, the offsets are
  291    computed relative to the concatenation, not relative to the individual
  292    strings.)
 293
 294    On success, re_match* functions return the length of the match, re_search*
 295    return the position of the start of the match.  Return value -1 means no
 296    match was found and -2 indicates an internal error.  */
 297
 298 int
 299 re_match (bufp, string, length, start, regs)
 300     struct re_pattern_buffer *bufp;
 301     const char *string;
 302     int length, start;
 303     struct re_registers *regs;
 304 {
 305   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 306 }
 307 #ifdef _LIBC
 308 weak_alias (__re_match, re_match)
 309 #endif
 310
 311 int
 312 re_search (bufp, string, length, start, range, regs)
 313     struct re_pattern_buffer *bufp;
 314     const char *string;
 315     int length, start, range;
 316     struct re_registers *regs;
 317 {
 318   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 319 }
 320 #ifdef _LIBC
 321 weak_alias (__re_search, re_search)
 322 #endif
 323
 324 int
 325 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 326     struct re_pattern_buffer *bufp;
 327     const char *string1, *string2;
 328     int length1, length2, start, stop;
 329     struct re_registers *regs;
 330 {
 331   return re_search_2_stub (bufp, string1, length1, string2, length2,
 332                            start, 0, regs, stop, 1);
 333 }
 334 #ifdef _LIBC
 335 weak_alias (__re_match_2, re_match_2)
 336 #endif
 337
 338 int
 339 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 340     struct re_pattern_buffer *bufp;
 341     const char *string1, *string2;
 342     int length1, length2, start, range, stop;
 343     struct re_registers *regs;
 344 {
 345   return re_search_2_stub (bufp, string1, length1, string2, length2,
 346                            start, range, regs, stop, 0);
 347 }
 348 #ifdef _LIBC
 349 weak_alias (__re_search_2, re_search_2)
 350 #endif
 351
 352 static int
 353 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 354                   stop, ret_len)
 355     struct re_pattern_buffer *bufp;
 356     const char *string1, *string2;
 357     int length1, length2, start, range, stop, ret_len;
 358     struct re_registers *regs;
 359 {
 360   const char *str;
 361   int rval;
 362   int len = length1 + length2;
 363   int free_str = 0;
 364
 365   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 366     return -2;
 367
 368   /* Concatenate the strings.  */
 369   if (length2 > 0)
 370     if (length1 > 0)
 371       {
 372         char *s = re_malloc (char, len);
 373
 374         if (BE (s == NULL, 0))
 375           return -2;
 376         memcpy (s, string1, length1);
 377         memcpy (s + length1, string2, length2);
 378         str = s;
 379         free_str = 1;
 380       }
 381     else
 382       str = string2;
 383   else
 384     str = string1;
 385
 386   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 387                          ret_len);
 388   if (free_str)
 389     re_free ((char *) str);
 390   return rval;
 391 }
 392
 393 /* The parameters have the same meaning as those of re_search.
 394    Additional parameters:
 395    If RET_LEN is nonzero the length of the match is returned (re_match style);
 396    otherwise the position of the match is returned.  */
 397
 398 static int
 399 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 400     struct re_pattern_buffer *bufp;
 401     const char *string;
 402     int length, start, range, stop, ret_len;
 403     struct re_registers *regs;
 404 {
 405   reg_errcode_t result;
 406   regmatch_t *pmatch;
 407   int nregs, rval;
 408   int eflags = 0;
 409   re_dfa_t *dfa = (re_dfa_t *)bufp->buffer;
 410
 411   /* Check for out-of-range.  */
 412   if (BE (start < 0 || start > length, 0))
 413     return -1;
 414   if (BE (start + range > length, 0))
 415     range = length - start;
 416   else if (BE (start + range < 0, 0))
 417     range = -start;
 418
 419   __libc_lock_lock (dfa->lock);
 420
 421   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 422   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 423
 424   /* Compile fastmap if we haven't yet.  */
 425   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 426     re_compile_fastmap (bufp);
 427
 428   if (BE (bufp->no_sub, 0))
 429     regs = NULL;
 430
 431   /* We need at least 1 register.  */
 432   if (regs == NULL)
 433     nregs = 1;
 434   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 435                regs->num_regs < bufp->re_nsub + 1, 0))
 436     {
 437       nregs = regs->num_regs;
 438       if (BE (nregs < 1, 0))
 439         {
 440           /* Nothing can be copied to regs.  */
 441           regs = NULL;
 442           nregs = 1;
 443         }
 444     }
 445   else
 446     nregs = bufp->re_nsub + 1;
 447   pmatch = re_malloc (regmatch_t, nregs);
 448   if (BE (pmatch == NULL, 0))
 449     {
 450       rval = -2;
 451       goto out;
 452     }
 453
 454   result = re_search_internal (bufp, string, length, start, range, stop,
 455                                nregs, pmatch, eflags);
 456
 457   rval = 0;
 458
 459   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 460   if (result != REG_NOERROR)
 461     rval = -1;
 462   else if (regs != NULL)
 463     {
 464       /* If caller wants register contents data back, copy them.  */
 465       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 466                                            bufp->regs_allocated);
 467       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 468         rval = -2;
 469     }
 470
 471   if (BE (rval == 0, 1))
 472     {
 473       if (ret_len)
 474         {
 475           assert (pmatch[0].rm_so == start);
 476           rval = pmatch[0].rm_eo - start;
 477         }
 478       else
 479         rval = pmatch[0].rm_so;
 480     }
 481   re_free (pmatch);
 482  out:
 483   __libc_lock_unlock (dfa->lock);
 484   return rval;
 485 }
 486
 487 static unsigned
 488 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 489     struct re_registers *regs;
 490     regmatch_t *pmatch;
 491     int nregs, regs_allocated;
 492 {
 493   int rval = REGS_REALLOCATE;
 494   int i;
 495   int need_regs = nregs + 1;
 496   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 497      uses.  */
 498
 499   /* Have the register data arrays been allocated?  */
 500   if (regs_allocated == REGS_UNALLOCATED)
 501     { /* No.  So allocate them with malloc.  */
 502       regs->start = re_malloc (regoff_t, need_regs);
 503       regs->end = re_malloc (regoff_t, need_regs);
 504       if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
 505         return REGS_UNALLOCATED;
 506       regs->num_regs = need_regs;
 507     }
 508   else if (regs_allocated == REGS_REALLOCATE)
 509     { /* Yes.  If we need more elements than were already
 510          allocated, reallocate them.  If we need fewer, just
 511          leave it alone.  */
 512       if (BE (need_regs > regs->num_regs, 0))
 513         {
 514           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 515           regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
 516           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 517             return REGS_UNALLOCATED;
 518           regs->start = new_start;
 519           regs->end = new_end;
 520           regs->num_regs = need_regs;
 521         }
 522     }
 523   else
 524     {
 525       assert (regs_allocated == REGS_FIXED);
 526       /* This function may not be called with REGS_FIXED and nregs too big.  */
 527       assert (regs->num_regs >= nregs);
 528       rval = REGS_FIXED;
 529     }
 530
 531   /* Copy the regs.  */
 532   for (i = 0; i < nregs; ++i)
 533     {
 534       regs->start[i] = pmatch[i].rm_so;
 535       regs->end[i] = pmatch[i].rm_eo;
 536     }
 537   for ( ; i < regs->num_regs; ++i)
 538     regs->start[i] = regs->end[i] = -1;
 539
 540   return rval;
 541 }
 542
 543 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 544    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 545    this memory for recording register information.  STARTS and ENDS
 546    must be allocated using the malloc library routine, and must each
 547    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 548
 549    If NUM_REGS == 0, then subsequent matches should allocate their own
 550    register data.
 551
 552    Unless this function is called, the first search or match using
 553    PATTERN_BUFFER will allocate its own register data, without
 554    freeing the old data.  */
 555
 556 void
 557 re_set_registers (bufp, regs, num_regs, starts, ends)
 558     struct re_pattern_buffer *bufp;
 559     struct re_registers *regs;
 560     unsigned num_regs;
 561     regoff_t *starts, *ends;
 562 {
 563   if (num_regs)
 564     {
 565       bufp->regs_allocated = REGS_REALLOCATE;
 566       regs->num_regs = num_regs;
 567       regs->start = starts;
 568       regs->end = ends;
 569     }
 570   else
 571     {
 572       bufp->regs_allocated = REGS_UNALLOCATED;
 573       regs->num_regs = 0;
 574       regs->start = regs->end = (regoff_t *) 0;
 575     }
 576 }
 577 #ifdef _LIBC
 578 weak_alias (__re_set_registers, re_set_registers)
 579 #endif
 580 \f
 581 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 582    them unless specifically requested.  */
 583
 584 #if defined _REGEX_RE_COMP || defined _LIBC
 585 int
 586 # ifdef _LIBC
 587 weak_function
 588 # endif
 589 re_exec (s)
 590      const char *s;
 591 {
 592   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 593 }
 594 #endif /* _REGEX_RE_COMP */
 595 \f
 596 /* Internal entry point.  */
 597
  598 /* Searches for a compiled pattern PREG in the string STRING, whose
  599    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
  600    meanings as in regexec.  START and RANGE have the same meanings
  601    as in re_search.
 602    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 603    otherwise return the error code.
 604    Note: We assume front end functions already check ranges.
 605    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 606
 607 static reg_errcode_t
 608 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 609                     eflags)
 610     const regex_t *preg;
 611     const char *string;
 612     int length, start, range, stop, eflags;
 613     size_t nmatch;
 614     regmatch_t pmatch[];
 615 {
 616   reg_errcode_t err;
 617   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
 618   int left_lim, right_lim, incr;
 619   int fl_longest_match, match_first, match_kind, match_last = -1;
 620   int sb, ch;
 621 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 622   re_match_context_t mctx = { .dfa = dfa };
 623 #else
 624   re_match_context_t mctx;
 625 #endif
 626   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 627                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 628   unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate;
 629
 630 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 631   memset (&mctx, '\0', sizeof (re_match_context_t));
 632   mctx.dfa = dfa;
 633 #endif
 634
  635   /* Check if the DFA hasn't been compiled.  */
 636   if (BE (preg->used == 0 || dfa->init_state == NULL
 637           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 638           || dfa->init_state_begbuf == NULL, 0))
 639     return REG_NOMATCH;
 640
 641 #ifdef DEBUG
 642   /* We assume front-end functions already check them.  */
 643   assert (start + range >= 0 && start + range <= length);
 644 #endif
 645
 646   /* If initial states with non-begbuf contexts have no elements,
 647      the regex must be anchored.  If preg->newline_anchor is set,
 648      we'll never use init_state_nl, so do not check it.  */
 649   if (dfa->init_state->nodes.nelem == 0
 650       && dfa->init_state_word->nodes.nelem == 0
 651       && (dfa->init_state_nl->nodes.nelem == 0
 652           || !preg->newline_anchor))
 653     {
 654       if (start != 0 && start + range != 0)
 655         return REG_NOMATCH;
 656       start = range = 0;
 657     }
 658
 659   /* We must check the longest matching, if nmatch > 0.  */
 660   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 661
 662   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 663                             preg->translate, preg->syntax & RE_ICASE, dfa);
 664   if (BE (err != REG_NOERROR, 0))
 665     goto free_return;
 666   mctx.input.stop = stop;
 667   mctx.input.raw_stop = stop;
 668   mctx.input.newline_anchor = preg->newline_anchor;
 669
 670   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 671   if (BE (err != REG_NOERROR, 0))
 672     goto free_return;
 673
 674   /* We will log all the DFA states through which the dfa pass,
 675      if nmatch > 1, or this dfa has "multibyte node", which is a
 676      back-reference or a node which can accept multibyte character or
 677      multi character collating element.  */
 678   if (nmatch > 1 || dfa->has_mb_node)
 679     {
 680       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 681       if (BE (mctx.state_log == NULL, 0))
 682         {
 683           err = REG_ESPACE;
 684           goto free_return;
 685         }
 686     }
 687   else
 688     mctx.state_log = NULL;
 689
 690   match_first = start;
 691   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 692                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 693
  694   /* Check incrementally whether or not the input string matches.  */
 695   incr = (range < 0) ? -1 : 1;
 696   left_lim = (range < 0) ? start + range : start;
 697   right_lim = (range < 0) ? start : start + range;
 698   sb = dfa->mb_cur_max == 1;
 699   match_kind =
 700     (fastmap
 701      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 702         | (range >= 0 ? 2 : 0)
 703         | (t != NULL ? 1 : 0))
 704      : 8);
 705
 706   for (;; match_first += incr)
 707     {
 708       err = REG_NOMATCH;
 709       if (match_first < left_lim || right_lim < match_first)
 710         goto free_return;
 711
 712       /* Advance as rapidly as possible through the string, until we
 713          find a plausible place to start matching.  This may be done
 714          with varying efficiency, so there are various possibilities:
 715          only the most common of them are specialized, in order to
 716          save on code size.  We use a switch statement for speed.  */
 717       switch (match_kind)
 718         {
 719         case 8:
 720           /* No fastmap.  */
 721           break;
 722
 723         case 7:
 724           /* Fastmap with single-byte translation, match forward.  */
 725           while (BE (match_first < right_lim, 1)
 726                  && !fastmap[t[(unsigned char) string[match_first]]])
 727             ++match_first;
 728           goto forward_match_found_start_or_reached_end;
 729
 730         case 6:
 731           /* Fastmap without translation, match forward.  */
 732           while (BE (match_first < right_lim, 1)
 733                  && !fastmap[(unsigned char) string[match_first]])
 734             ++match_first;
 735
 736         forward_match_found_start_or_reached_end:
 737           if (BE (match_first == right_lim, 0))
 738             {
 739               ch = match_first >= length
 740                        ? 0 : (unsigned char) string[match_first];
 741               if (!fastmap[t ? t[ch] : ch])
 742                 goto free_return;
 743             }
 744           break;
 745
 746         case 4:
 747         case 5:
 748           /* Fastmap without multi-byte translation, match backwards.  */
 749           while (match_first >= left_lim)
 750             {
 751               ch = match_first >= length
 752                        ? 0 : (unsigned char) string[match_first];
 753               if (fastmap[t ? t[ch] : ch])
 754                 break;
 755               --match_first;
 756             }
 757           if (match_first < left_lim)
 758             goto free_return;
 759           break;
 760
 761         default:
 762           /* In this case, we can't determine easily the current byte,
 763              since it might be a component byte of a multibyte
 764              character.  Then we use the constructed buffer instead.  */
 765           for (;;)
 766             {
 767               /* If MATCH_FIRST is out of the valid range, reconstruct the
 768                  buffers.  */
 769               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 770               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 771                 {
 772                   err = re_string_reconstruct (&mctx.input, match_first,
 773                                                eflags);
 774                   if (BE (err != REG_NOERROR, 0))
 775                     goto free_return;
 776
 777                   offset = match_first - mctx.input.raw_mbs_idx;
 778                 }
 779               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 780                  Note that MATCH_FIRST must not be smaller than 0.  */
 781               ch = (match_first >= length
 782                     ? 0 : re_string_byte_at (&mctx.input, offset));
 783               if (fastmap[ch])
 784                 break;
 785               match_first += incr;
 786               if (match_first < left_lim || match_first > right_lim)
 787                 {
 788                   err = REG_NOMATCH;
 789                   goto free_return;
 790                 }
 791             }
 792           break;
 793         }
 794
 795       /* Reconstruct the buffers so that the matcher can assume that
 796          the matching starts from the beginning of the buffer.  */
 797       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 798       if (BE (err != REG_NOERROR, 0))
 799         goto free_return;
 800
 801 #ifdef RE_ENABLE_I18N
 802      /* Don't consider this char as a possible match start if it part,
 803         yet isn't the head, of a multibyte character.  */
 804       if (!sb && !re_string_first_byte (&mctx.input, 0))
 805         continue;
 806 #endif
 807
 808       /* It seems to be appropriate one, then use the matcher.  */
 809       /* We assume that the matching starts from 0.  */
 810       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 811       match_last = check_matching (&mctx, fl_longest_match,
 812                                    range >= 0 ? &match_first : NULL);
 813       if (match_last != -1)
 814         {
 815           if (BE (match_last == -2, 0))
 816             {
 817               err = REG_ESPACE;
 818               goto free_return;
 819             }
 820           else
 821             {
 822               mctx.match_last = match_last;
 823               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 824                 {
 825                   re_dfastate_t *pstate = mctx.state_log[match_last];
 826                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 827                                                              match_last);
 828                 }
 829               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 830                   || dfa->nbackref)
 831                 {
 832                   err = prune_impossible_nodes (&mctx);
 833                   if (err == REG_NOERROR)
 834                     break;
 835                   if (BE (err != REG_NOMATCH, 0))
 836                     goto free_return;
 837                   match_last = -1;
 838                 }
 839               else
 840                 break; /* We found a match.  */
 841             }
 842         }
 843
 844       match_ctx_clean (&mctx);
 845     }
 846
 847 #ifdef DEBUG
 848   assert (match_last != -1);
 849   assert (err == REG_NOERROR);
 850 #endif
 851
 852   /* Set pmatch[] if we need.  */
 853   if (nmatch > 0)
 854     {
 855       int reg_idx;
 856
 857       /* Initialize registers.  */
 858       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 859         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 860
 861       /* Set the points where matching start/end.  */
 862       pmatch[0].rm_so = 0;
 863       pmatch[0].rm_eo = mctx.match_last;
 864
 865       if (!preg->no_sub && nmatch > 1)
 866         {
 867           err = set_regs (preg, &mctx, nmatch, pmatch,
 868                           dfa->has_plural_match && dfa->nbackref > 0);
 869           if (BE (err != REG_NOERROR, 0))
 870             goto free_return;
 871         }
 872
 873       /* At last, add the offset to the each registers, since we slided
 874          the buffers so that we could assume that the matching starts
 875          from 0.  */
 876       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 877         if (pmatch[reg_idx].rm_so != -1)
 878           {
 879 #ifdef RE_ENABLE_I18N
 880             if (BE (mctx.input.offsets_needed != 0, 0))
 881               {
 882                 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
 883                   pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
 884                 else
 885                   pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
 886                 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
 887                   pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
 888                 else
 889                   pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
 890               }
 891 #else
 892             assert (mctx.input.offsets_needed == 0);
 893 #endif
 894             pmatch[reg_idx].rm_so += match_first;
 895             pmatch[reg_idx].rm_eo += match_first;
 896           }
 897
 898       if (dfa->subexp_map)
 899         for (reg_idx = 0;
 900              reg_idx + 1 < nmatch && reg_idx < preg->re_nsub;
 901              reg_idx++)
 902           if (dfa->subexp_map[reg_idx] != reg_idx)
 903             {
 904               pmatch[reg_idx + 1].rm_so
 905                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 906               pmatch[reg_idx + 1].rm_eo
 907                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 908             }
 909     }
 910
 911  free_return:
 912   re_free (mctx.state_log);
 913   if (dfa->nbackref)
 914     match_ctx_free (&mctx);
 915   re_string_destruct (&mctx.input);
 916   return err;
 917 }
 918
 919 static reg_errcode_t
 920 prune_impossible_nodes (mctx)
 921      re_match_context_t *mctx;
 922 {
 923   re_dfa_t *const dfa = mctx->dfa;
 924   int halt_node, match_last;
 925   reg_errcode_t ret;
 926   re_dfastate_t **sifted_states;
 927   re_dfastate_t **lim_states = NULL;
 928   re_sift_context_t sctx;
 929 #ifdef DEBUG
 930   assert (mctx->state_log != NULL);
 931 #endif
 932   match_last = mctx->match_last;
 933   halt_node = mctx->last_node;
 934   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 935   if (BE (sifted_states == NULL, 0))
 936     {
 937       ret = REG_ESPACE;
 938       goto free_return;
 939     }
 940   if (dfa->nbackref)
 941     {
 942       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 943       if (BE (lim_states == NULL, 0))
 944         {
 945           ret = REG_ESPACE;
 946           goto free_return;
 947         }
 948       while (1)
 949         {
 950           memset (lim_states, '\0',
 951                   sizeof (re_dfastate_t *) * (match_last + 1));
 952           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 953                          match_last);
 954           ret = sift_states_backward (mctx, &sctx);
 955           re_node_set_free (&sctx.limits);
 956           if (BE (ret != REG_NOERROR, 0))
 957               goto free_return;
 958           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 959             break;
 960           do
 961             {
 962               --match_last;
 963               if (match_last < 0)
 964                 {
 965                   ret = REG_NOMATCH;
 966                   goto free_return;
 967                 }
 968             } while (mctx->state_log[match_last] == NULL
 969                      || !mctx->state_log[match_last]->halt);
 970           halt_node = check_halt_state_context (mctx,
 971                                                 mctx->state_log[match_last],
 972                                                 match_last);
 973         }
 974       ret = merge_state_array (dfa, sifted_states, lim_states,
 975                                match_last + 1);
 976       re_free (lim_states);
 977       lim_states = NULL;
 978       if (BE (ret != REG_NOERROR, 0))
 979         goto free_return;
 980     }
 981   else
 982     {
 983       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
 984       ret = sift_states_backward (mctx, &sctx);
 985       re_node_set_free (&sctx.limits);
 986       if (BE (ret != REG_NOERROR, 0))
 987         goto free_return;
 988     }
 989   re_free (mctx->state_log);
 990   mctx->state_log = sifted_states;
 991   sifted_states = NULL;
 992   mctx->last_node = halt_node;
 993   mctx->match_last = match_last;
 994   ret = REG_NOERROR;
 995  free_return:
 996   re_free (sifted_states);
 997   re_free (lim_states);
 998   return ret;
 999 }
1000
1001 /* Acquire an initial state and return it.
1002    We must select appropriate initial state depending on the context,
1003    since initial states may have constraints like "\<", "^", etc..  */
1004
1005 static inline re_dfastate_t *
1006 acquire_init_state_context (err, mctx, idx)
1007      reg_errcode_t *err;
1008      const re_match_context_t *mctx;
1009      int idx;
1010 {
1011   re_dfa_t *const dfa = mctx->dfa;
1012   if (dfa->init_state->has_constraint)
1013     {
1014       unsigned int context;
1015       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1016       if (IS_WORD_CONTEXT (context))
1017         return dfa->init_state_word;
1018       else if (IS_ORDINARY_CONTEXT (context))
1019         return dfa->init_state;
1020       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1021         return dfa->init_state_begbuf;
1022       else if (IS_NEWLINE_CONTEXT (context))
1023         return dfa->init_state_nl;
1024       else if (IS_BEGBUF_CONTEXT (context))
1025         {
1026           /* It is relatively rare case, then calculate on demand.  */
1027           return re_acquire_state_context (err, dfa,
1028                                            dfa->init_state->entrance_nodes,
1029                                            context);
1030         }
1031       else
1032         /* Must not happen?  */
1033         return dfa->init_state;
1034     }
1035   else
1036     return dfa->init_state;
1037 }
1038
1039 /* Check whether the regular expression match input string INPUT or not,
1040    and return the index where the matching end, return -1 if not match,
1041    or return -2 in case of an error.
1042    FL_LONGEST_MATCH means we want the POSIX longest matching.
1043    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1044    next place where we may want to try matching.
1045    Note that the matcher assume that the maching starts from the current
1046    index of the buffer.  */
1047
1048 static int
1049 check_matching (mctx, fl_longest_match, p_match_first)
1050     re_match_context_t *mctx;
1051     int fl_longest_match;
1052     int *p_match_first;
1053 {
1054   re_dfa_t *const dfa = mctx->dfa;
1055   reg_errcode_t err;
1056   int match = 0;
1057   int match_last = -1;
1058   int cur_str_idx = re_string_cur_idx (&mctx->input);
1059   re_dfastate_t *cur_state;
1060   int at_init_state = p_match_first != NULL;
1061   int next_start_idx = cur_str_idx;
1062
1063   err = REG_NOERROR;
1064   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1065   /* An initial state must not be NULL (invalid).  */
1066   if (BE (cur_state == NULL, 0))
1067     {
1068       assert (err == REG_ESPACE);
1069       return -2;
1070     }
1071
1072   if (mctx->state_log != NULL)
1073     {
1074       mctx->state_log[cur_str_idx] = cur_state;
1075
1076       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1077          later.  E.g. Processing back references.  */
1078       if (BE (dfa->nbackref, 0))
1079         {
1080           at_init_state = 0;
1081           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1082           if (BE (err != REG_NOERROR, 0))
1083             return err;
1084
1085           if (cur_state->has_backref)
1086             {
1087               err = transit_state_bkref (mctx, &cur_state->nodes);
1088               if (BE (err != REG_NOERROR, 0))
1089                 return err;
1090             }
1091         }
1092     }
1093
1094   /* If the RE accepts NULL string.  */
1095   if (BE (cur_state->halt, 0))
1096     {
1097       if (!cur_state->has_constraint
1098           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1099         {
1100           if (!fl_longest_match)
1101             return cur_str_idx;
1102           else
1103             {
1104               match_last = cur_str_idx;
1105               match = 1;
1106             }
1107         }
1108     }
1109
1110   while (!re_string_eoi (&mctx->input))
1111     {
1112       re_dfastate_t *old_state = cur_state;
1113       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1114
1115       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1116           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1117               && mctx->input.valid_len < mctx->input.len))
1118         {
1119           err = extend_buffers (mctx);
1120           if (BE (err != REG_NOERROR, 0))
1121             {
1122               assert (err == REG_ESPACE);
1123               return -2;
1124             }
1125         }
1126
1127       cur_state = transit_state (&err, mctx, cur_state);
1128       if (mctx->state_log != NULL)
1129         cur_state = merge_state_with_log (&err, mctx, cur_state);
1130
1131       if (cur_state == NULL)
1132         {
1133           /* Reached the invalid state or an error.  Try to recover a valid
1134              state using the state log, if available and if we have not
1135              already found a valid (even if not the longest) match.  */
1136           if (BE (err != REG_NOERROR, 0))
1137             return -2;
1138
1139           if (mctx->state_log == NULL
1140               || (match && !fl_longest_match)
1141               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1142             break;
1143         }
1144
1145       if (BE (at_init_state, 0))
1146         {
1147           if (old_state == cur_state)
1148             next_start_idx = next_char_idx;
1149           else
1150             at_init_state = 0;
1151         }
1152
1153       if (cur_state->halt)
1154         {
1155           /* Reached a halt state.
1156              Check the halt state can satisfy the current context.  */
1157           if (!cur_state->has_constraint
1158               || check_halt_state_context (mctx, cur_state,
1159                                            re_string_cur_idx (&mctx->input)))
1160             {
1161               /* We found an appropriate halt state.  */
1162               match_last = re_string_cur_idx (&mctx->input);
1163               match = 1;
1164
1165               /* We found a match, do not modify match_first below.  */
1166               p_match_first = NULL;
1167               if (!fl_longest_match)
1168                 break;
1169             }
1170         }
1171     }
1172
1173   if (p_match_first)
1174     *p_match_first += next_start_idx;
1175
1176   return match_last;
1177 }
1178
1179 /* Check NODE match the current context.  */
1180
1181 static int check_halt_node_context (dfa, node, context)
1182     const re_dfa_t *dfa;
1183     int node;
1184     unsigned int context;
1185 {
1186   re_token_type_t type = dfa->nodes[node].type;
1187   unsigned int constraint = dfa->nodes[node].constraint;
1188   if (type != END_OF_RE)
1189     return 0;
1190   if (!constraint)
1191     return 1;
1192   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1193     return 0;
1194   return 1;
1195 }
1196
1197 /* Check the halt state STATE match the current context.
1198    Return 0 if not match, if the node, STATE has, is a halt node and
1199    match the context, return the node.  */
1200
1201 static int
1202 check_halt_state_context (mctx, state, idx)
1203     const re_match_context_t *mctx;
1204     const re_dfastate_t *state;
1205     int idx;
1206 {
1207   int i;
1208   unsigned int context;
1209 #ifdef DEBUG
1210   assert (state->halt);
1211 #endif
1212   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1213   for (i = 0; i < state->nodes.nelem; ++i)
1214     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1215       return state->nodes.elems[i];
1216   return 0;
1217 }
1218
1219 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1220    corresponding to the DFA).
1221    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1222    of errors.  */
1223
1224 static int
1225 proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
1226     const re_match_context_t *mctx;
1227     regmatch_t *regs;
1228     int nregs, *pidx, node;
1229     re_node_set *eps_via_nodes;
1230     struct re_fail_stack_t *fs;
1231 {
1232   re_dfa_t *const dfa = mctx->dfa;
1233   int i, err, dest_node;
1234   dest_node = -1;
1235   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1236     {
1237       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1238       re_node_set *edests = &dfa->edests[node];
1239       int dest_node;
1240       err = re_node_set_insert (eps_via_nodes, node);
1241       if (BE (err < 0, 0))
1242         return -2;
1243       /* Pick up a valid destination, or return -1 if none is found.  */
1244       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1245         {
1246           int candidate = edests->elems[i];
1247           if (!re_node_set_contains (cur_nodes, candidate))
1248             continue;
1249           if (dest_node == -1)
1250             dest_node = candidate;
1251
1252           else
1253             {
1254               /* In order to avoid infinite loop like "(a*)*", return the second
1255                  epsilon-transition if the first was already considered.  */
1256               if (re_node_set_contains (eps_via_nodes, dest_node))
1257                 return candidate;
1258
1259               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1260               else if (fs != NULL
1261                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1262                                            eps_via_nodes))
1263                 return -2;
1264
1265               /* We know we are going to exit.  */
1266               break;
1267             }
1268         }
1269       return dest_node;
1270     }
1271   else
1272     {
1273       int naccepted = 0;
1274       re_token_type_t type = dfa->nodes[node].type;
1275
1276 #ifdef RE_ENABLE_I18N
1277       if (ACCEPT_MB_NODE (type))
1278         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1279       else
1280 #endif /* RE_ENABLE_I18N */
1281       if (type == OP_BACK_REF)
1282         {
1283           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1284           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1285           if (fs != NULL)
1286             {
1287               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1288                 return -1;
1289               else if (naccepted)
1290                 {
1291                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1292                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1293                               naccepted) != 0)
1294                     return -1;
1295                 }
1296             }
1297
1298           if (naccepted == 0)
1299             {
1300               err = re_node_set_insert (eps_via_nodes, node);
1301               if (BE (err < 0, 0))
1302                 return -2;
1303               dest_node = dfa->edests[node].elems[0];
1304               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1305                                         dest_node))
1306                 return dest_node;
1307             }
1308         }
1309
1310       if (naccepted != 0
1311           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1312         {
1313           dest_node = dfa->nexts[node];
1314           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1315           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1316                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1317                                                dest_node)))
1318             return -1;
1319           re_node_set_empty (eps_via_nodes);
1320           return dest_node;
1321         }
1322     }
1323   return -1;
1324 }
1325
1326 static reg_errcode_t
1327 push_fail_stack (fs, str_idx, dest_node, nregs, regs, eps_via_nodes)
1328      struct re_fail_stack_t *fs;
1329      int str_idx, dest_node, nregs;
1330      regmatch_t *regs;
1331      re_node_set *eps_via_nodes;
1332 {
1333   reg_errcode_t err;
1334   int num = fs->num++;
1335   if (fs->num == fs->alloc)
1336     {
1337       struct re_fail_stack_ent_t *new_array;
1338       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1339                                        * fs->alloc * 2));
1340       if (new_array == NULL)
1341         return REG_ESPACE;
1342       fs->alloc *= 2;
1343       fs->stack = new_array;
1344     }
1345   fs->stack[num].idx = str_idx;
1346   fs->stack[num].node = dest_node;
1347   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1348   if (fs->stack[num].regs == NULL)
1349     return REG_ESPACE;
1350   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1351   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1352   return err;
1353 }
1354
1355 static int
1356 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1357      struct re_fail_stack_t *fs;
1358      int *pidx, nregs;
1359      regmatch_t *regs;
1360      re_node_set *eps_via_nodes;
1361 {
1362   int num = --fs->num;
1363   assert (num >= 0);
1364   *pidx = fs->stack[num].idx;
1365   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1366   re_node_set_free (eps_via_nodes);
1367   re_free (fs->stack[num].regs);
1368   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1369   return fs->stack[num].node;
1370 }
1371
1372 /* Set the positions where the subexpressions are starts/ends to registers
1373    PMATCH.
1374    Note: We assume that pmatch[0] is already set, and
1375    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1376
1377 static reg_errcode_t
1378 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1379      const regex_t *preg;
1380      const re_match_context_t *mctx;
1381      size_t nmatch;
1382      regmatch_t *pmatch;
1383      int fl_backtrack;
1384 {
1385   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1386   int idx, cur_node, real_nmatch;
1387   re_node_set eps_via_nodes;
1388   struct re_fail_stack_t *fs;
1389   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1390   regmatch_t *prev_idx_match;
1391
1392 #ifdef DEBUG
1393   assert (nmatch > 1);
1394   assert (mctx->state_log != NULL);
1395 #endif
1396   if (fl_backtrack)
1397     {
1398       fs = &fs_body;
1399       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1400       if (fs->stack == NULL)
1401         return REG_ESPACE;
1402     }
1403   else
1404     fs = NULL;
1405
1406   cur_node = dfa->init_node;
1407   real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1408   re_node_set_init_empty (&eps_via_nodes);
1409
1410   prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * real_nmatch);
1411   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * real_nmatch);
1412
1413   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1414     {
1415       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, real_nmatch);
1416
1417       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1418         {
1419           int reg_idx;
1420           if (fs)
1421             {
1422               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1423                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1424                   break;
1425               if (reg_idx == nmatch)
1426                 {
1427                   re_node_set_free (&eps_via_nodes);
1428                   return free_fail_stack_return (fs);
1429                 }
1430               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1431                                          &eps_via_nodes);
1432             }
1433           else
1434             {
1435               re_node_set_free (&eps_via_nodes);
1436               return REG_NOERROR;
1437             }
1438         }
1439
1440       /* Proceed to next node.  */
1441       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1442                                     &eps_via_nodes, fs);
1443
1444       if (BE (cur_node < 0, 0))
1445         {
1446           if (BE (cur_node == -2, 0))
1447             {
1448               re_node_set_free (&eps_via_nodes);
1449               free_fail_stack_return (fs);
1450               return REG_ESPACE;
1451             }
1452           if (fs)
1453             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1454                                        &eps_via_nodes);
1455           else
1456             {
1457               re_node_set_free (&eps_via_nodes);
1458               return REG_NOMATCH;
1459             }
1460         }
1461     }
1462   re_node_set_free (&eps_via_nodes);
1463   return free_fail_stack_return (fs);
1464 }
1465
1466 static reg_errcode_t
1467 free_fail_stack_return (fs)
1468      struct re_fail_stack_t *fs;
1469 {
1470   if (fs)
1471     {
1472       int fs_idx;
1473       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1474         {
1475           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1476           re_free (fs->stack[fs_idx].regs);
1477         }
1478       re_free (fs->stack);
1479     }
1480   return REG_NOERROR;
1481 }
1482
1483 static void
1484 update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
1485      re_dfa_t *dfa;
1486      regmatch_t *pmatch, *prev_idx_match;
1487      int cur_node, cur_idx, nmatch;
1488 {
1489   int type = dfa->nodes[cur_node].type;
1490   if (type == OP_OPEN_SUBEXP)
1491     {
1492       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1493
1494       /* We are at the first node of this sub expression.  */
1495       if (reg_num < nmatch)
1496         {
1497           pmatch[reg_num].rm_so = cur_idx;
1498           pmatch[reg_num].rm_eo = -1;
1499         }
1500     }
1501   else if (type == OP_CLOSE_SUBEXP)
1502     {
1503       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1504       if (reg_num < nmatch)
1505         {
1506           /* We are at the last node of this sub expression.  */
1507           if (pmatch[reg_num].rm_so < cur_idx)
1508             {
1509               pmatch[reg_num].rm_eo = cur_idx;
1510               /* This is a non-empty match or we are not inside an optional
1511                  subexpression.  Accept this right away.  */
1512               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1513             }
1514           else
1515             {
1516               if (dfa->nodes[cur_node].opt_subexp
1517                   && prev_idx_match[reg_num].rm_so != -1)
1518                 /* We transited through an empty match for an optional
1519                    subexpression, like (a?)*, and this is not the subexp's
1520                    first match.  Copy back the old content of the registers
1521                    so that matches of an inner subexpression are undone as
1522                    well, like in ((a?))*.  */
1523                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1524               else
1525                 /* We completed a subexpression, but it may be part of
1526                    an optional one, so do not update PREV_IDX_MATCH.  */
1527                 pmatch[reg_num].rm_eo = cur_idx;
1528             }
1529         }
1530     }
1531 }
1532
1533 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1534    and sift the nodes in each states according to the following rules.
1535    Updated state_log will be wrote to STATE_LOG.
1536
1537    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1538      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1539         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1540         the LAST_NODE, we throw away the node `a'.
1541      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1542         string `s' and transit to `b':
1543         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1544            away the node `a'.
1545         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1546             thrown away, we throw away the node `a'.
1547      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1548         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1549            node `a'.
1550         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1551             we throw away the node `a'.  */
1552
1553 #define STATE_NODE_CONTAINS(state,node) \
1554   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1555
1556 static reg_errcode_t
1557 sift_states_backward (mctx, sctx)
1558      re_match_context_t *mctx;
1559      re_sift_context_t *sctx;
1560 {
1561   reg_errcode_t err;
1562   int null_cnt = 0;
1563   int str_idx = sctx->last_str_idx;
1564   re_node_set cur_dest;
1565
1566 #ifdef DEBUG
1567   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1568 #endif
1569
1570   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1571      transit to the last_node and the last_node itself.  */
1572   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1573   if (BE (err != REG_NOERROR, 0))
1574     return err;
1575   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1576   if (BE (err != REG_NOERROR, 0))
1577     goto free_return;
1578
1579   /* Then check each states in the state_log.  */
1580   while (str_idx > 0)
1581     {
1582       /* Update counters.  */
1583       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1584       if (null_cnt > mctx->max_mb_elem_len)
1585         {
1586           memset (sctx->sifted_states, '\0',
1587                   sizeof (re_dfastate_t *) * str_idx);
1588           re_node_set_free (&cur_dest);
1589           return REG_NOERROR;
1590         }
1591       re_node_set_empty (&cur_dest);
1592       --str_idx;
1593
1594       if (mctx->state_log[str_idx])
1595         {
1596           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1597           if (BE (err != REG_NOERROR, 0))
1598             goto free_return;
1599         }
1600
1601       /* Add all the nodes which satisfy the following conditions:
1602          - It can epsilon transit to a node in CUR_DEST.
1603          - It is in CUR_SRC.
1604          And update state_log.  */
1605       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1606       if (BE (err != REG_NOERROR, 0))
1607         goto free_return;
1608     }
1609   err = REG_NOERROR;
1610  free_return:
1611   re_node_set_free (&cur_dest);
1612   return err;
1613 }
1614
1615 static reg_errcode_t
1616 build_sifted_states (mctx, sctx, str_idx, cur_dest)
1617      re_match_context_t *mctx;
1618      re_sift_context_t *sctx;
1619      int str_idx;
1620      re_node_set *cur_dest;
1621 {
1622   re_dfa_t *const dfa = mctx->dfa;
1623   re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1624   int i;
1625
1626   /* Then build the next sifted state.
1627      We build the next sifted state on `cur_dest', and update
1628      `sifted_states[str_idx]' with `cur_dest'.
1629      Note:
1630      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1631      `cur_src' points the node_set of the old `state_log[str_idx]'
1632      (with the epsilon nodes pre-filtered out).  */
1633   for (i = 0; i < cur_src->nelem; i++)
1634     {
1635       int prev_node = cur_src->elems[i];
1636       int naccepted = 0;
1637       int ret;
1638
1639 #if defined DEBUG || defined RE_ENABLE_I18N
1640       re_token_type_t type = dfa->nodes[prev_node].type;
1641 #endif
1642 #ifdef DEBUG
1643       assert (!IS_EPSILON_NODE (type));
1644 #endif
1645 #ifdef RE_ENABLE_I18N
1646       /* If the node may accept `multi byte'.  */
1647       if (ACCEPT_MB_NODE (type))
1648         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1649                                          str_idx, sctx->last_str_idx);
1650 #endif /* RE_ENABLE_I18N */
1651
1652       /* We don't check backreferences here.
1653          See update_cur_sifted_state().  */
1654       if (!naccepted
1655           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1656           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1657                                   dfa->nexts[prev_node]))
1658         naccepted = 1;
1659
1660       if (naccepted == 0)
1661         continue;
1662
1663       if (sctx->limits.nelem)
1664         {
1665           int to_idx = str_idx + naccepted;
1666           if (check_dst_limits (mctx, &sctx->limits,
1667                                 dfa->nexts[prev_node], to_idx,
1668                                 prev_node, str_idx))
1669             continue;
1670         }
1671       ret = re_node_set_insert (cur_dest, prev_node);
1672       if (BE (ret == -1, 0))
1673         return REG_ESPACE;
1674     }
1675
1676   return REG_NOERROR;
1677 }
1678
1679 /* Helper functions.  */
1680
1681 static reg_errcode_t
1682 clean_state_log_if_needed (mctx, next_state_log_idx)
1683     re_match_context_t *mctx;
1684     int next_state_log_idx;
1685 {
1686   int top = mctx->state_log_top;
1687
1688   if (next_state_log_idx >= mctx->input.bufs_len
1689       || (next_state_log_idx >= mctx->input.valid_len
1690           && mctx->input.valid_len < mctx->input.len))
1691     {
1692       reg_errcode_t err;
1693       err = extend_buffers (mctx);
1694       if (BE (err != REG_NOERROR, 0))
1695         return err;
1696     }
1697
1698   if (top < next_state_log_idx)
1699     {
1700       memset (mctx->state_log + top + 1, '\0',
1701               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1702       mctx->state_log_top = next_state_log_idx;
1703     }
1704   return REG_NOERROR;
1705 }
1706
1707 static reg_errcode_t
1708 merge_state_array (dfa, dst, src, num)
1709      re_dfa_t *dfa;
1710      re_dfastate_t **dst;
1711      re_dfastate_t **src;
1712      int num;
1713 {
1714   int st_idx;
1715   reg_errcode_t err;
1716   for (st_idx = 0; st_idx < num; ++st_idx)
1717     {
1718       if (dst[st_idx] == NULL)
1719         dst[st_idx] = src[st_idx];
1720       else if (src[st_idx] != NULL)
1721         {
1722           re_node_set merged_set;
1723           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1724                                         &src[st_idx]->nodes);
1725           if (BE (err != REG_NOERROR, 0))
1726             return err;
1727           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1728           re_node_set_free (&merged_set);
1729           if (BE (err != REG_NOERROR, 0))
1730             return err;
1731         }
1732     }
1733   return REG_NOERROR;
1734 }
1735
1736 static reg_errcode_t
1737 update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
1738      re_match_context_t *mctx;
1739      re_sift_context_t *sctx;
1740      int str_idx;
1741      re_node_set *dest_nodes;
1742 {
1743   re_dfa_t *const dfa = mctx->dfa;
1744   reg_errcode_t err;
1745   const re_node_set *candidates;
1746   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1747                 : &mctx->state_log[str_idx]->nodes);
1748
1749   if (dest_nodes->nelem == 0)
1750     sctx->sifted_states[str_idx] = NULL;
1751   else
1752     {
1753       if (candidates)
1754         {
1755           /* At first, add the nodes which can epsilon transit to a node in
1756              DEST_NODE.  */
1757           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1758           if (BE (err != REG_NOERROR, 0))
1759             return err;
1760
1761           /* Then, check the limitations in the current sift_context.  */
1762           if (sctx->limits.nelem)
1763             {
1764               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1765                                          mctx->bkref_ents, str_idx);
1766               if (BE (err != REG_NOERROR, 0))
1767                 return err;
1768             }
1769         }
1770
1771       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1772       if (BE (err != REG_NOERROR, 0))
1773         return err;
1774     }
1775
1776   if (candidates && mctx->state_log[str_idx]->has_backref)
1777     {
1778       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1779       if (BE (err != REG_NOERROR, 0))
1780         return err;
1781     }
1782   return REG_NOERROR;
1783 }
1784
1785 static reg_errcode_t
1786 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1787      re_dfa_t *dfa;
1788      re_node_set *dest_nodes;
1789      const re_node_set *candidates;
1790 {
1791   reg_errcode_t err = REG_NOERROR;
1792   int i;
1793
1794   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1795   if (BE (err != REG_NOERROR, 0))
1796     return err;
1797
1798   if (!state->inveclosure.alloc)
1799     {
1800       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1801       if (BE (err != REG_NOERROR, 0))
1802         return REG_ESPACE;
1803       for (i = 0; i < dest_nodes->nelem; i++)
1804         re_node_set_merge (&state->inveclosure,
1805                            dfa->inveclosures + dest_nodes->elems[i]);
1806     }
1807   return re_node_set_add_intersect (dest_nodes, candidates,
1808                                     &state->inveclosure);
1809 }
1810
1811 static reg_errcode_t
1812 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1813      re_dfa_t *dfa;
1814      int node;
1815      re_node_set *dest_nodes;
1816      const re_node_set *candidates;
1817 {
1818     int ecl_idx;
1819     reg_errcode_t err;
1820     re_node_set *inv_eclosure = dfa->inveclosures + node;
1821     re_node_set except_nodes;
1822     re_node_set_init_empty (&except_nodes);
1823     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1824       {
1825         int cur_node = inv_eclosure->elems[ecl_idx];
1826         if (cur_node == node)
1827           continue;
1828         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1829           {
1830             int edst1 = dfa->edests[cur_node].elems[0];
1831             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1832                          ? dfa->edests[cur_node].elems[1] : -1);
1833             if ((!re_node_set_contains (inv_eclosure, edst1)
1834                  && re_node_set_contains (dest_nodes, edst1))
1835                 || (edst2 > 0
1836                     && !re_node_set_contains (inv_eclosure, edst2)
1837                     && re_node_set_contains (dest_nodes, edst2)))
1838               {
1839                 err = re_node_set_add_intersect (&except_nodes, candidates,
1840                                                  dfa->inveclosures + cur_node);
1841                 if (BE (err != REG_NOERROR, 0))
1842                   {
1843                     re_node_set_free (&except_nodes);
1844                     return err;
1845                   }
1846               }
1847           }
1848       }
1849     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1850       {
1851         int cur_node = inv_eclosure->elems[ecl_idx];
1852         if (!re_node_set_contains (&except_nodes, cur_node))
1853           {
1854             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1855             re_node_set_remove_at (dest_nodes, idx);
1856           }
1857       }
1858     re_node_set_free (&except_nodes);
1859     return REG_NOERROR;
1860 }
1861
1862 static int
1863 check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
1864      re_match_context_t *mctx;
1865      re_node_set *limits;
1866      int dst_node, dst_idx, src_node, src_idx;
1867 {
1868   re_dfa_t *const dfa = mctx->dfa;
1869   int lim_idx, src_pos, dst_pos;
1870
1871   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1872   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1873   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1874     {
1875       int subexp_idx;
1876       struct re_backref_cache_entry *ent;
1877       ent = mctx->bkref_ents + limits->elems[lim_idx];
1878       subexp_idx = dfa->nodes[ent->node].opr.idx;
1879
1880       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1881                                            subexp_idx, dst_node, dst_idx,
1882                                            dst_bkref_idx);
1883       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1884                                            subexp_idx, src_node, src_idx,
1885                                            src_bkref_idx);
1886
1887       /* In case of:
1888          <src> <dst> ( <subexp> )
1889          ( <subexp> ) <src> <dst>
1890          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1891       if (src_pos == dst_pos)
1892         continue; /* This is unrelated limitation.  */
1893       else
1894         return 1;
1895     }
1896   return 0;
1897 }
1898
1899 static int
1900 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
1901      re_match_context_t *mctx;
1902      int boundaries, subexp_idx, from_node, bkref_idx;
1903 {
1904   re_dfa_t *const dfa = mctx->dfa;
1905   re_node_set *eclosures = dfa->eclosures + from_node;
1906   int node_idx;
1907
1908   /* Else, we are on the boundary: examine the nodes on the epsilon
1909      closure.  */
1910   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1911     {
1912       int node = eclosures->elems[node_idx];
1913       switch (dfa->nodes[node].type)
1914         {
1915         case OP_BACK_REF:
1916           if (bkref_idx != -1)
1917             {
1918               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1919               do
1920                 {
1921                   int dst, cpos;
1922
1923                   if (ent->node != node)
1924                     continue;
1925
1926                   if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
1927                       && !(ent->eps_reachable_subexps_map & (1 << subexp_idx)))
1928                     continue;
1929
1930                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1931                      OP_CLOSE_SUBEXP cases below.  But, if the
1932                      destination node is the same node as the source
1933                      node, don't recurse because it would cause an
1934                      infinite loop: a regex that exhibits this behavior
1935                      is ()\1*\1*  */
1936                   dst = dfa->edests[node].elems[0];
1937                   if (dst == from_node)
1938                     {
1939                       if (boundaries & 1)
1940                         return -1;
1941                       else /* if (boundaries & 2) */
1942                         return 0;
1943                     }
1944
1945                   cpos =
1946                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1947                                                  dst, bkref_idx);
1948                   if (cpos == -1 /* && (boundaries & 1) */)
1949                     return -1;
1950                   if (cpos == 0 && (boundaries & 2))
1951                     return 0;
1952
1953                   ent->eps_reachable_subexps_map &= ~(1 << subexp_idx);
1954                 }
1955               while (ent++->more);
1956             }
1957           break;
1958
1959         case OP_OPEN_SUBEXP:
1960           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1961             return -1;
1962           break;
1963
1964         case OP_CLOSE_SUBEXP:
1965           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1966             return 0;
1967           break;
1968
1969         default:
1970             break;
1971         }
1972     }
1973
1974   return (boundaries & 2) ? 1 : 0;
1975 }
1976
1977 static int
1978 check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx)
1979      re_match_context_t *mctx;
1980      int limit, subexp_idx, from_node, str_idx, bkref_idx;
1981 {
1982   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1983   int boundaries;
1984
1985   /* If we are outside the range of the subexpression, return -1 or 1.  */
1986   if (str_idx < lim->subexp_from)
1987     return -1;
1988
1989   if (lim->subexp_to < str_idx)
1990     return 1;
1991
1992   /* If we are within the subexpression, return 0.  */
1993   boundaries = (str_idx == lim->subexp_from);
1994   boundaries |= (str_idx == lim->subexp_to) << 1;
1995   if (boundaries == 0)
1996     return 0;
1997
1998   /* Else, examine epsilon closure.  */
1999   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2000                                       from_node, bkref_idx);
2001 }
2002
2003 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2004    which are against limitations from DEST_NODES. */
2005
2006 static reg_errcode_t
2007 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
2008      re_dfa_t *dfa;
2009      re_node_set *dest_nodes;
2010      const re_node_set *candidates;
2011      re_node_set *limits;
2012      struct re_backref_cache_entry *bkref_ents;
2013      int str_idx;
2014 {
2015   reg_errcode_t err;
2016   int node_idx, lim_idx;
2017
2018   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2019     {
2020       int subexp_idx;
2021       struct re_backref_cache_entry *ent;
2022       ent = bkref_ents + limits->elems[lim_idx];
2023
2024       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2025         continue; /* This is unrelated limitation.  */
2026
2027       subexp_idx = dfa->nodes[ent->node].opr.idx;
2028       if (ent->subexp_to == str_idx)
2029         {
2030           int ops_node = -1;
2031           int cls_node = -1;
2032           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2033             {
2034               int node = dest_nodes->elems[node_idx];
2035               re_token_type_t type = dfa->nodes[node].type;
2036               if (type == OP_OPEN_SUBEXP
2037                   && subexp_idx == dfa->nodes[node].opr.idx)
2038                 ops_node = node;
2039               else if (type == OP_CLOSE_SUBEXP
2040                        && subexp_idx == dfa->nodes[node].opr.idx)
2041                 cls_node = node;
2042             }
2043
2044           /* Check the limitation of the open subexpression.  */
2045           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2046           if (ops_node >= 0)
2047             {
2048               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2049                                            candidates);
2050               if (BE (err != REG_NOERROR, 0))
2051                 return err;
2052             }
2053
2054           /* Check the limitation of the close subexpression.  */
2055           if (cls_node >= 0)
2056             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2057               {
2058                 int node = dest_nodes->elems[node_idx];
2059                 if (!re_node_set_contains (dfa->inveclosures + node,
2060                                            cls_node)
2061                     && !re_node_set_contains (dfa->eclosures + node,
2062                                               cls_node))
2063                   {
2064                     /* It is against this limitation.
2065                        Remove it form the current sifted state.  */
2066                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2067                                                  candidates);
2068                     if (BE (err != REG_NOERROR, 0))
2069                       return err;
2070                     --node_idx;
2071                   }
2072               }
2073         }
2074       else /* (ent->subexp_to != str_idx)  */
2075         {
2076           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2077             {
2078               int node = dest_nodes->elems[node_idx];
2079               re_token_type_t type = dfa->nodes[node].type;
2080               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2081                 {
2082                   if (subexp_idx != dfa->nodes[node].opr.idx)
2083                     continue;
2084                   /* It is against this limitation.
2085                      Remove it form the current sifted state.  */
2086                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2087                                                candidates);
2088                   if (BE (err != REG_NOERROR, 0))
2089                     return err;
2090                 }
2091             }
2092         }
2093     }
2094   return REG_NOERROR;
2095 }
2096
2097 static reg_errcode_t
2098 sift_states_bkref (mctx, sctx, str_idx, candidates)
2099      re_match_context_t *mctx;
2100      re_sift_context_t *sctx;
2101      int str_idx;
2102      const re_node_set *candidates;
2103 {
2104   re_dfa_t *const dfa = mctx->dfa;
2105   reg_errcode_t err;
2106   int node_idx, node;
2107   re_sift_context_t local_sctx;
2108   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2109
2110   if (first_idx == -1)
2111     return REG_NOERROR;
2112
2113   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2114
2115   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2116     {
2117       int enabled_idx;
2118       re_token_type_t type;
2119       struct re_backref_cache_entry *entry;
2120       node = candidates->elems[node_idx];
2121       type = dfa->nodes[node].type;
2122       /* Avoid infinite loop for the REs like "()\1+".  */
2123       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2124         continue;
2125       if (type != OP_BACK_REF)
2126         continue;
2127
2128       entry = mctx->bkref_ents + first_idx;
2129       enabled_idx = first_idx;
2130       do
2131         {
2132           int subexp_len, to_idx, dst_node;
2133           re_dfastate_t *cur_state;
2134
2135           if (entry->node != node)
2136             continue;
2137           subexp_len = entry->subexp_to - entry->subexp_from;
2138           to_idx = str_idx + subexp_len;
2139           dst_node = (subexp_len ? dfa->nexts[node]
2140                       : dfa->edests[node].elems[0]);
2141
2142           if (to_idx > sctx->last_str_idx
2143               || sctx->sifted_states[to_idx] == NULL
2144               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2145               || check_dst_limits (mctx, &sctx->limits, node,
2146                                    str_idx, dst_node, to_idx))
2147             continue;
2148
2149           if (local_sctx.sifted_states == NULL)
2150             {
2151               local_sctx = *sctx;
2152               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2153               if (BE (err != REG_NOERROR, 0))
2154                 goto free_return;
2155             }
2156           local_sctx.last_node = node;
2157           local_sctx.last_str_idx = str_idx;
2158           err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2159           if (BE (err < 0, 0))
2160             {
2161               err = REG_ESPACE;
2162               goto free_return;
2163             }
2164           cur_state = local_sctx.sifted_states[str_idx];
2165           err = sift_states_backward (mctx, &local_sctx);
2166           if (BE (err != REG_NOERROR, 0))
2167             goto free_return;
2168           if (sctx->limited_states != NULL)
2169             {
2170               err = merge_state_array (dfa, sctx->limited_states,
2171                                        local_sctx.sifted_states,
2172                                        str_idx + 1);
2173               if (BE (err != REG_NOERROR, 0))
2174                 goto free_return;
2175             }
2176           local_sctx.sifted_states[str_idx] = cur_state;
2177           re_node_set_remove (&local_sctx.limits, enabled_idx);
2178
2179           /* mctx->bkref_ents may have changed, reload the pointer.  */
2180           entry = mctx->bkref_ents + enabled_idx;
2181         }
2182       while (enabled_idx++, entry++->more);
2183     }
2184   err = REG_NOERROR;
2185  free_return:
2186   if (local_sctx.sifted_states != NULL)
2187     {
2188       re_node_set_free (&local_sctx.limits);
2189     }
2190
2191   return err;
2192 }
2193
2194
2195 #ifdef RE_ENABLE_I18N
2196 static int
2197 sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
2198     const re_match_context_t *mctx;
2199     re_sift_context_t *sctx;
2200     int node_idx, str_idx, max_str_idx;
2201 {
2202   re_dfa_t *const dfa = mctx->dfa;
2203   int naccepted;
2204   /* Check the node can accept `multi byte'.  */
2205   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2206   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2207       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2208                             dfa->nexts[node_idx]))
2209     /* The node can't accept the `multi byte', or the
2210        destination was already thrown away, then the node
2211        could't accept the current input `multi byte'.   */
2212     naccepted = 0;
2213   /* Otherwise, it is sure that the node could accept
2214      `naccepted' bytes input.  */
2215   return naccepted;
2216 }
2217 #endif /* RE_ENABLE_I18N */
2218
2219 \f
2220 /* Functions for state transition.  */
2221
2222 /* Return the next state to which the current state STATE will transit by
2223    accepting the current input byte, and update STATE_LOG if necessary.
2224    If STATE can accept a multibyte char/collating element/back reference
2225    update the destination of STATE_LOG.  */
2226
2227 static re_dfastate_t *
2228 transit_state (err, mctx, state)
2229      reg_errcode_t *err;
2230      re_match_context_t *mctx;
2231      re_dfastate_t *state;
2232 {
2233   re_dfa_t *const dfa = mctx->dfa;
2234   re_dfastate_t **trtable;
2235   unsigned char ch;
2236
2237 #ifdef RE_ENABLE_I18N
2238   /* If the current state can accept multibyte.  */
2239   if (BE (state->accept_mb, 0))
2240     {
2241       *err = transit_state_mb (mctx, state);
2242       if (BE (*err != REG_NOERROR, 0))
2243         return NULL;
2244     }
2245 #endif /* RE_ENABLE_I18N */
2246
2247   /* Then decide the next state with the single byte.  */
2248   if (1)
2249     {
2250       /* Use transition table  */
2251       ch = re_string_fetch_byte (&mctx->input);
2252       trtable = state->trtable;
2253       if (trtable == NULL)
2254         {
2255           trtable = build_trtable (dfa, state);
2256           if (trtable == NULL)
2257             {
2258               *err = REG_ESPACE;
2259               return NULL;
2260             }
2261         }
2262       if (BE (state->word_trtable, 0))
2263         {
2264           unsigned int context;
2265           context
2266             = re_string_context_at (&mctx->input,
2267                                     re_string_cur_idx (&mctx->input) - 1,
2268                                     mctx->eflags);
2269           if (IS_WORD_CONTEXT (context))
2270             return trtable[ch + SBC_MAX];
2271           else
2272             return trtable[ch];
2273         }
2274       else
2275         return trtable[ch];
2276     }
2277 #if 0
2278   else
2279     /* don't use transition table  */
2280     return transit_state_sb (err, mctx, state);
2281 #endif
2282 }
2283
2284 /* Update the state_log if we need */
2285 re_dfastate_t *
2286 merge_state_with_log (err, mctx, next_state)
2287      reg_errcode_t *err;
2288      re_match_context_t *mctx;
2289      re_dfastate_t *next_state;
2290 {
2291   re_dfa_t *const dfa = mctx->dfa;
2292   int cur_idx = re_string_cur_idx (&mctx->input);
2293
2294   if (cur_idx > mctx->state_log_top)
2295     {
2296       mctx->state_log[cur_idx] = next_state;
2297       mctx->state_log_top = cur_idx;
2298     }
2299   else if (mctx->state_log[cur_idx] == 0)
2300     {
2301       mctx->state_log[cur_idx] = next_state;
2302     }
2303   else
2304     {
2305       re_dfastate_t *pstate;
2306       unsigned int context;
2307       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2308       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2309          the destination of a multibyte char/collating element/
2310          back reference.  Then the next state is the union set of
2311          these destinations and the results of the transition table.  */
2312       pstate = mctx->state_log[cur_idx];
2313       log_nodes = pstate->entrance_nodes;
2314       if (next_state != NULL)
2315         {
2316           table_nodes = next_state->entrance_nodes;
2317           *err = re_node_set_init_union (&next_nodes, table_nodes,
2318                                              log_nodes);
2319           if (BE (*err != REG_NOERROR, 0))
2320             return NULL;
2321         }
2322       else
2323         next_nodes = *log_nodes;
2324       /* Note: We already add the nodes of the initial state,
2325          then we don't need to add them here.  */
2326
2327       context = re_string_context_at (&mctx->input,
2328                                       re_string_cur_idx (&mctx->input) - 1,
2329                                       mctx->eflags);
2330       next_state = mctx->state_log[cur_idx]
2331         = re_acquire_state_context (err, dfa, &next_nodes, context);
2332       /* We don't need to check errors here, since the return value of
2333          this function is next_state and ERR is already set.  */
2334
2335       if (table_nodes != NULL)
2336         re_node_set_free (&next_nodes);
2337     }
2338
2339   if (BE (dfa->nbackref, 0) && next_state != NULL)
2340     {
2341       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2342          later.  We must check them here, since the back references in the
2343          next state might use them.  */
2344       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2345                                         cur_idx);
2346       if (BE (*err != REG_NOERROR, 0))
2347         return NULL;
2348
2349       /* If the next state has back references.  */
2350       if (next_state->has_backref)
2351         {
2352           *err = transit_state_bkref (mctx, &next_state->nodes);
2353           if (BE (*err != REG_NOERROR, 0))
2354             return NULL;
2355           next_state = mctx->state_log[cur_idx];
2356         }
2357     }
2358
2359   return next_state;
2360 }
2361
2362 /* Skip bytes in the input that correspond to part of a
2363    multi-byte match, then look in the log for a state
2364    from which to restart matching.  */
2365 re_dfastate_t *
2366 find_recover_state (err, mctx)
2367      reg_errcode_t *err;
2368      re_match_context_t *mctx;
2369 {
2370   re_dfastate_t *cur_state = NULL;
2371   do
2372     {
2373       int max = mctx->state_log_top;
2374       int cur_str_idx = re_string_cur_idx (&mctx->input);
2375
2376       do
2377         {
2378           if (++cur_str_idx > max)
2379             return NULL;
2380           re_string_skip_bytes (&mctx->input, 1);
2381         }
2382       while (mctx->state_log[cur_str_idx] == NULL);
2383
2384       cur_state = merge_state_with_log (err, mctx, NULL);
2385     }
2386   while (err == REG_NOERROR && cur_state == NULL);
2387   return cur_state;
2388 }
2389
2390 /* Helper functions for transit_state.  */
2391
2392 /* From the node set CUR_NODES, pick up the nodes whose types are
2393    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2394    expression. And register them to use them later for evaluating the
2395    correspoding back references.  */
2396
2397 static reg_errcode_t
2398 check_subexp_matching_top (mctx, cur_nodes, str_idx)
2399      re_match_context_t *mctx;
2400      re_node_set *cur_nodes;
2401      int str_idx;
2402 {
2403   re_dfa_t *const dfa = mctx->dfa;
2404   int node_idx;
2405   reg_errcode_t err;
2406
2407   /* TODO: This isn't efficient.
2408            Because there might be more than one nodes whose types are
2409            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2410            nodes.
2411            E.g. RE: (a){2}  */
2412   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2413     {
2414       int node = cur_nodes->elems[node_idx];
2415       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2416           && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2417           && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2418         {
2419           err = match_ctx_add_subtop (mctx, node, str_idx);
2420           if (BE (err != REG_NOERROR, 0))
2421             return err;
2422         }
2423     }
2424   return REG_NOERROR;
2425 }
2426
#if 0
/* Compute the state reached from STATE by consuming the single byte at the
   current position of MCTX's input, then advance the input by one byte.
   The result of re_acquire_state_context is returned as is; *ERR carries
   the error code of whichever callee ran last.  NULL is returned if
   building the node set fails.  */

static re_dfastate_t *
transit_state_sb (err, mctx, state)
     reg_errcode_t *err;
     re_match_context_t *mctx;
     re_dfastate_t *state;
{
  re_dfa_t *const dfa = mctx->dfa;
  const int str_pos = re_string_cur_idx (&mctx->input);
  re_node_set reachable;
  re_dfastate_t *result;
  unsigned int ctx;
  int i;

  *err = re_node_set_alloc (&reachable, state->nodes.nelem + 1);
  if (BE (*err != REG_NOERROR, 0))
    return NULL;

  /* Collect the epsilon closures of the successors of every node that
     accepts the byte at STR_POS.  */
  for (i = 0; i < state->nodes.nelem; ++i)
    {
      int node = state->nodes.elems[i];

      if (!check_node_accept (mctx, dfa->nodes + node, str_pos))
        continue;

      *err = re_node_set_merge (&reachable,
                                dfa->eclosures + dfa->nexts[node]);
      if (BE (*err != REG_NOERROR, 0))
        {
          re_node_set_free (&reachable);
          return NULL;
        }
    }

  ctx = re_string_context_at (&mctx->input, str_pos, mctx->eflags);
  /* No error check is needed here: the caller receives RESULT and *ERR
     exactly as re_acquire_state_context left them.  */
  result = re_acquire_state_context (err, dfa, &reachable, ctx);

  re_node_set_free (&reachable);
  re_string_skip_bytes (&mctx->input, 1);
  return result;
}
#endif
2470
#ifdef RE_ENABLE_I18N
/* For every node of PSTATE which can accept a multi byte sequence at the
   current position of MCTX's input, merge the epsilon closure of its
   successor into the state log entry located just past the accepted bytes.
   MCTX->MAX_MB_ELEM_LEN is raised to the longest sequence seen.
   Return REG_NOERROR, or the first error reported by a callee.  */
static reg_errcode_t
transit_state_mb (mctx, pstate)
    re_match_context_t *mctx;
    re_dfastate_t *pstate;
{
  re_dfa_t *const dfa = mctx->dfa;
  reg_errcode_t err;
  int i;

  for (i = 0; i < pstate->nodes.nelem; ++i)
    {
      const int node_idx = pstate->nodes.elems[i];
      const re_token_t *tok = dfa->nodes + node_idx;
      re_node_set merged, *succ_closure;
      re_dfastate_t *prev_dest;
      unsigned int ctx;
      int nbytes = 0, dest_idx;

      /* Skip nodes whose constraint is not met at the current position.  */
      if (tok->constraint)
        {
          ctx = re_string_context_at (&mctx->input,
                                      re_string_cur_idx (&mctx->input),
                                      mctx->eflags);
          if (NOT_SATISFY_NEXT_CONSTRAINT (tok->constraint, ctx))
            continue;
        }

      /* Ask how many bytes this node consumes; nodes which cannot accept
         multi byte input, or which accept nothing here, are skipped.  */
      if (ACCEPT_MB_NODE (tok->type))
        nbytes = check_node_accept_bytes (dfa, node_idx, &mctx->input,
                                          re_string_cur_idx (&mctx->input));
      if (nbytes == 0)
        continue;

      /* The node consumes NBYTES bytes, so its successors become live at
         DEST_IDX.  */
      dest_idx = re_string_cur_idx (&mctx->input) + nbytes;
      if (mctx->max_mb_elem_len < nbytes)
        mctx->max_mb_elem_len = nbytes;
      err = clean_state_log_if_needed (mctx, dest_idx);
      if (BE (err != REG_NOERROR, 0))
        return err;
#ifdef DEBUG
      assert (dfa->nexts[node_idx] != -1);
#endif
      succ_closure = dfa->eclosures + dfa->nexts[node_idx];

      /* MERGED is only a shallow copy of the closure when the log slot is
         still empty; it owns storage (and must be freed) only when it was
         built as a union with the nodes already logged there.  */
      prev_dest = mctx->state_log[dest_idx];
      if (prev_dest != NULL)
        {
          err = re_node_set_init_union (&merged, prev_dest->entrance_nodes,
                                        succ_closure);
          if (BE (err != REG_NOERROR, 0))
            return err;
        }
      else
        merged = *succ_closure;

      ctx = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
      mctx->state_log[dest_idx]
        = re_acquire_state_context (&err, dfa, &merged, ctx);
      if (prev_dest != NULL)
        re_node_set_free (&merged);
      if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
        return err;
    }
  return REG_NOERROR;
}
#endif /* RE_ENABLE_I18N */
2541
2542 static reg_errcode_t
2543 transit_state_bkref (mctx, nodes)
2544     re_match_context_t *mctx;
2545     const re_node_set *nodes;
2546 {
2547   re_dfa_t *const dfa = mctx->dfa;
2548   reg_errcode_t err;
2549   int i;
2550   int cur_str_idx = re_string_cur_idx (&mctx->input);
2551
2552   for (i = 0; i < nodes->nelem; ++i)
2553     {
2554       int dest_str_idx, prev_nelem, bkc_idx;
2555       int node_idx = nodes->elems[i];
2556       unsigned int context;
2557       const re_token_t *node = dfa->nodes + node_idx;
2558       re_node_set *new_dest_nodes;
2559
2560       /* Check whether `node' is a backreference or not.  */
2561       if (node->type != OP_BACK_REF)
2562         continue;
2563
2564       if (node->constraint)
2565         {
2566           context = re_string_context_at (&mctx->input, cur_str_idx,
2567                                           mctx->eflags);
2568           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2569             continue;
2570         }
2571
2572       /* `node' is a backreference.
2573          Check the substring which the substring matched.  */
2574       bkc_idx = mctx->nbkref_ents;
2575       err = get_subexp (mctx, node_idx, cur_str_idx);
2576       if (BE (err != REG_NOERROR, 0))
2577         goto free_return;
2578
2579       /* And add the epsilon closures (which is `new_dest_nodes') of
2580          the backreference to appropriate state_log.  */
2581 #ifdef DEBUG
2582       assert (dfa->nexts[node_idx] != -1);
2583 #endif
2584       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2585         {
2586           int subexp_len;
2587           re_dfastate_t *dest_state;
2588           struct re_backref_cache_entry *bkref_ent;
2589           bkref_ent = mctx->bkref_ents + bkc_idx;
2590           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2591             continue;
2592           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2593           new_dest_nodes = (subexp_len == 0
2594                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2595                             : dfa->eclosures + dfa->nexts[node_idx]);
2596           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2597                           - bkref_ent->subexp_from);
2598           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2599                                           mctx->eflags);
2600           dest_state = mctx->state_log[dest_str_idx];
2601           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2602                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2603           /* Add `new_dest_node' to state_log.  */
2604           if (dest_state == NULL)
2605             {
2606               mctx->state_log[dest_str_idx]
2607                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2608                                             context);
2609               if (BE (mctx->state_log[dest_str_idx] == NULL
2610                       && err != REG_NOERROR, 0))
2611                 goto free_return;
2612             }
2613           else
2614             {
2615               re_node_set dest_nodes;
2616               err = re_node_set_init_union (&dest_nodes,
2617                                             dest_state->entrance_nodes,
2618                                             new_dest_nodes);
2619               if (BE (err != REG_NOERROR, 0))
2620                 {
2621                   re_node_set_free (&dest_nodes);
2622                   goto free_return;
2623                 }
2624               mctx->state_log[dest_str_idx]
2625                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2626               re_node_set_free (&dest_nodes);
2627               if (BE (mctx->state_log[dest_str_idx] == NULL
2628                       && err != REG_NOERROR, 0))
2629                 goto free_return;
2630             }
2631           /* We need to check recursively if the backreference can epsilon
2632              transit.  */
2633           if (subexp_len == 0
2634               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2635             {
2636               err = check_subexp_matching_top (mctx, new_dest_nodes,
2637                                                cur_str_idx);
2638               if (BE (err != REG_NOERROR, 0))
2639                 goto free_return;
2640               err = transit_state_bkref (mctx, new_dest_nodes);
2641               if (BE (err != REG_NOERROR, 0))
2642                 goto free_return;
2643             }
2644         }
2645     }
2646   err = REG_NOERROR;
2647  free_return:
2648   return err;
2649 }
2650
2651 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2652    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2653    Note that we might collect inappropriate candidates here.
2654    However, the cost of checking them strictly here is too high, then we
2655    delay these checking for prune_impossible_nodes().  */
2656
2657 static reg_errcode_t
2658 get_subexp (mctx, bkref_node, bkref_str_idx)
2659      re_match_context_t *mctx;
2660      int bkref_node, bkref_str_idx;
2661 {
2662   re_dfa_t *const dfa = mctx->dfa;
2663   int subexp_num, sub_top_idx;
2664   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2665   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2666   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2667   if (cache_idx != -1)
2668     {
2669       const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2670       do
2671         if (entry->node == bkref_node)
2672           return REG_NOERROR; /* We already checked it.  */
2673       while (entry++->more);
2674     }
2675
2676   subexp_num = dfa->nodes[bkref_node].opr.idx;
2677
2678   /* For each sub expression  */
2679   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2680     {
2681       reg_errcode_t err;
2682       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2683       re_sub_match_last_t *sub_last;
2684       int sub_last_idx, sl_str, bkref_str_off;
2685
2686       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2687         continue; /* It isn't related.  */
2688
2689       sl_str = sub_top->str_idx;
2690       bkref_str_off = bkref_str_idx;
2691       /* At first, check the last node of sub expressions we already
2692          evaluated.  */
2693       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2694         {
2695           int sl_str_diff;
2696           sub_last = sub_top->lasts[sub_last_idx];
2697           sl_str_diff = sub_last->str_idx - sl_str;
2698           /* The matched string by the sub expression match with the substring
2699              at the back reference?  */
2700           if (sl_str_diff > 0)
2701             {
2702               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2703                 {
2704                   /* Not enough chars for a successful match.  */
2705                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2706                     break;
2707
2708                   err = clean_state_log_if_needed (mctx,
2709                                                    bkref_str_off
2710                                                    + sl_str_diff);
2711                   if (BE (err != REG_NOERROR, 0))
2712                     return err;
2713                   buf = (const char *) re_string_get_buffer (&mctx->input);
2714                 }
2715               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2716                 break; /* We don't need to search this sub expression any more.  */
2717             }
2718           bkref_str_off += sl_str_diff;
2719           sl_str += sl_str_diff;
2720           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2721                                 bkref_str_idx);
2722
2723           /* Reload buf, since the preceding call might have reallocated
2724              the buffer.  */
2725           buf = (const char *) re_string_get_buffer (&mctx->input);
2726
2727           if (err == REG_NOMATCH)
2728             continue;
2729           if (BE (err != REG_NOERROR, 0))
2730             return err;
2731         }
2732
2733       if (sub_last_idx < sub_top->nlasts)
2734         continue;
2735       if (sub_last_idx > 0)
2736         ++sl_str;
2737       /* Then, search for the other last nodes of the sub expression.  */
2738       for (; sl_str <= bkref_str_idx; ++sl_str)
2739         {
2740           int cls_node, sl_str_off;
2741           const re_node_set *nodes;
2742           sl_str_off = sl_str - sub_top->str_idx;
2743           /* The matched string by the sub expression match with the substring
2744              at the back reference?  */
2745           if (sl_str_off > 0)
2746             {
2747               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2748                 {
2749                   /* If we are at the end of the input, we cannot match.  */
2750                   if (bkref_str_off >= mctx->input.len)
2751                     break;
2752
2753                   err = extend_buffers (mctx);
2754                   if (BE (err != REG_NOERROR, 0))
2755                     return err;
2756
2757                   buf = (const char *) re_string_get_buffer (&mctx->input);
2758                 }
2759               if (buf [bkref_str_off++] != buf[sl_str - 1])
2760                 break; /* We don't need to search this sub expression
2761                           any more.  */
2762             }
2763           if (mctx->state_log[sl_str] == NULL)
2764             continue;
2765           /* Does this state have a ')' of the sub expression?  */
2766           nodes = &mctx->state_log[sl_str]->nodes;
2767           cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2768           if (cls_node == -1)
2769             continue; /* No.  */
2770           if (sub_top->path == NULL)
2771             {
2772               sub_top->path = calloc (sizeof (state_array_t),
2773                                       sl_str - sub_top->str_idx + 1);
2774               if (sub_top->path == NULL)
2775                 return REG_ESPACE;
2776             }
2777           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2778              in the current context?  */
2779           err = check_arrival (mctx, sub_top->path, sub_top->node,
2780                                sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2781           if (err == REG_NOMATCH)
2782               continue;
2783           if (BE (err != REG_NOERROR, 0))
2784               return err;
2785           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2786           if (BE (sub_last == NULL, 0))
2787             return REG_ESPACE;
2788           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2789                                 bkref_str_idx);
2790           if (err == REG_NOMATCH)
2791             continue;
2792         }
2793     }
2794   return REG_NOERROR;
2795 }
2796
2797 /* Helper functions for get_subexp().  */
2798
2799 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2800    If it can arrive, register the sub expression expressed with SUB_TOP
2801    and SUB_LAST.  */
2802
2803 static reg_errcode_t
2804 get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
2805      re_match_context_t *mctx;
2806      const re_sub_match_top_t *sub_top;
2807      re_sub_match_last_t *sub_last;
2808      int bkref_node, bkref_str;
2809 {
2810   reg_errcode_t err;
2811   int to_idx;
2812   /* Can the subexpression arrive the back reference?  */
2813   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2814                        sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2815   if (err != REG_NOERROR)
2816     return err;
2817   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2818                              sub_last->str_idx);
2819   if (BE (err != REG_NOERROR, 0))
2820     return err;
2821   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2822   return clean_state_log_if_needed (mctx, to_idx);
2823 }
2824
2825 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2826    Search '(' if FL_OPEN, or search ')' otherwise.
2827    TODO: This function isn't efficient...
2828          Because there might be more than one nodes whose types are
2829          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2830          nodes.
2831          E.g. RE: (a){2}  */
2832
2833 static int
2834 find_subexp_node (dfa, nodes, subexp_idx, type)
2835      const re_dfa_t *dfa;
2836      const re_node_set *nodes;
2837      int subexp_idx, type;
2838 {
2839   int cls_idx;
2840   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2841     {
2842       int cls_node = nodes->elems[cls_idx];
2843       const re_token_t *node = dfa->nodes + cls_node;
2844       if (node->type == type
2845           && node->opr.idx == subexp_idx)
2846         return cls_node;
2847     }
2848   return -1;
2849 }
2850
2851 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2852    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2853    heavily reused.
2854    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2855
2856 static reg_errcode_t
2857 check_arrival (mctx, path, top_node, top_str, last_node, last_str,
2858                type)
2859      re_match_context_t *mctx;
2860      state_array_t *path;
2861      int top_node, top_str, last_node, last_str, type;
2862 {
2863   re_dfa_t *const dfa = mctx->dfa;
2864   reg_errcode_t err;
2865   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2866   re_dfastate_t *cur_state = NULL;
2867   re_node_set *cur_nodes, next_nodes;
2868   re_dfastate_t **backup_state_log;
2869   unsigned int context;
2870
2871   subexp_num = dfa->nodes[top_node].opr.idx;
2872   /* Extend the buffer if we need.  */
2873   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2874     {
2875       re_dfastate_t **new_array;
2876       int old_alloc = path->alloc;
2877       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2878       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2879       if (new_array == NULL)
2880         {
2881           path->alloc = old_alloc;
2882           return REG_ESPACE;
2883         }
2884       path->array = new_array;
2885       memset (new_array + old_alloc, '\0',
2886               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2887     }
2888
2889   str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2890
2891   /* Temporary modify MCTX.  */
2892   backup_state_log = mctx->state_log;
2893   backup_cur_idx = mctx->input.cur_idx;
2894   mctx->state_log = path->array;
2895   mctx->input.cur_idx = str_idx;
2896
2897   /* Setup initial node set.  */
2898   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2899   if (str_idx == top_str)
2900     {
2901       err = re_node_set_init_1 (&next_nodes, top_node);
2902       if (BE (err != REG_NOERROR, 0))
2903         return err;
2904       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2905       if (BE (err != REG_NOERROR, 0))
2906         {
2907           re_node_set_free (&next_nodes);
2908           return err;
2909         }
2910     }
2911   else
2912     {
2913       cur_state = mctx->state_log[str_idx];
2914       if (cur_state && cur_state->has_backref)
2915         {
2916           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2917           if (BE ( err != REG_NOERROR, 0))
2918             return err;
2919         }
2920       else
2921         re_node_set_init_empty (&next_nodes);
2922     }
2923   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2924     {
2925       if (next_nodes.nelem)
2926         {
2927           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2928                                     subexp_num, type);
2929           if (BE ( err != REG_NOERROR, 0))
2930             {
2931               re_node_set_free (&next_nodes);
2932               return err;
2933             }
2934         }
2935       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2936       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2937         {
2938           re_node_set_free (&next_nodes);
2939           return err;
2940         }
2941       mctx->state_log[str_idx] = cur_state;
2942     }
2943
2944   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2945     {
2946       re_node_set_empty (&next_nodes);
2947       if (mctx->state_log[str_idx + 1])
2948         {
2949           err = re_node_set_merge (&next_nodes,
2950                                    &mctx->state_log[str_idx + 1]->nodes);
2951           if (BE (err != REG_NOERROR, 0))
2952             {
2953               re_node_set_free (&next_nodes);
2954               return err;
2955             }
2956         }
2957       if (cur_state)
2958         {
2959           err = check_arrival_add_next_nodes (mctx, str_idx,
2960                                               &cur_state->non_eps_nodes, &next_nodes);
2961           if (BE (err != REG_NOERROR, 0))
2962             {
2963               re_node_set_free (&next_nodes);
2964               return err;
2965             }
2966         }
2967       ++str_idx;
2968       if (next_nodes.nelem)
2969         {
2970           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2971           if (BE (err != REG_NOERROR, 0))
2972             {
2973               re_node_set_free (&next_nodes);
2974               return err;
2975             }
2976           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2977                                     subexp_num, type);
2978           if (BE ( err != REG_NOERROR, 0))
2979             {
2980               re_node_set_free (&next_nodes);
2981               return err;
2982             }
2983         }
2984       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2985       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2986       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2987         {
2988           re_node_set_free (&next_nodes);
2989           return err;
2990         }
2991       mctx->state_log[str_idx] = cur_state;
2992       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2993     }
2994   re_node_set_free (&next_nodes);
2995   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2996                : &mctx->state_log[last_str]->nodes);
2997   path->next_idx = str_idx;
2998
2999   /* Fix MCTX.  */
3000   mctx->state_log = backup_state_log;
3001   mctx->input.cur_idx = backup_cur_idx;
3002
3003   /* Then check the current node set has the node LAST_NODE.  */
3004   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3005     return REG_NOERROR;
3006
3007   return REG_NOMATCH;
3008 }
3009
3010 /* Helper functions for check_arrival.  */
3011
3012 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3013    to NEXT_NODES.
3014    TODO: This function is similar to the functions transit_state*(),
3015          however this function has many additional works.
3016          Can't we unify them?  */
3017
3018 static reg_errcode_t
3019 check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
3020      re_match_context_t *mctx;
3021      int str_idx;
3022      re_node_set *cur_nodes, *next_nodes;
3023 {
3024   re_dfa_t *const dfa = mctx->dfa;
3025   int result;
3026   int cur_idx;
3027   reg_errcode_t err;
3028   re_node_set union_set;
3029   re_node_set_init_empty (&union_set);
3030   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3031     {
3032       int naccepted = 0;
3033       int cur_node = cur_nodes->elems[cur_idx];
3034 #if defined DEBUG || defined RE_ENABLE_I18N
3035       re_token_type_t type = dfa->nodes[cur_node].type;
3036 #endif
3037 #ifdef DEBUG
3038       assert (!IS_EPSILON_NODE (type));
3039 #endif
3040 #ifdef RE_ENABLE_I18N
3041       /* If the node may accept `multi byte'.  */
3042       if (ACCEPT_MB_NODE (type))
3043         {
3044           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3045                                                str_idx);
3046           if (naccepted > 1)
3047             {
3048               re_dfastate_t *dest_state;
3049               int next_node = dfa->nexts[cur_node];
3050               int next_idx = str_idx + naccepted;
3051               dest_state = mctx->state_log[next_idx];
3052               re_node_set_empty (&union_set);
3053               if (dest_state)
3054                 {
3055                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3056                   if (BE (err != REG_NOERROR, 0))
3057                     {
3058                       re_node_set_free (&union_set);
3059                       return err;
3060                     }
3061                 }
3062               result = re_node_set_insert (&union_set, next_node);
3063               if (BE (result < 0, 0))
3064                 {
3065                   re_node_set_free (&union_set);
3066                   return REG_ESPACE;
3067                 }
3068               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3069                                                             &union_set);
3070               if (BE (mctx->state_log[next_idx] == NULL
3071                       && err != REG_NOERROR, 0))
3072                 {
3073                   re_node_set_free (&union_set);
3074                   return err;
3075                 }
3076             }
3077         }
3078 #endif /* RE_ENABLE_I18N */
3079       if (naccepted
3080           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3081         {
3082           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3083           if (BE (result < 0, 0))
3084             {
3085               re_node_set_free (&union_set);
3086               return REG_ESPACE;
3087             }
3088         }
3089     }
3090   re_node_set_free (&union_set);
3091   return REG_NOERROR;
3092 }
3093
3094 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3095    CUR_NODES, however exclude the nodes which are:
3096     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3097     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3098 */
3099
3100 static reg_errcode_t
3101 check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
3102      re_dfa_t *dfa;
3103      re_node_set *cur_nodes;
3104      int ex_subexp, type;
3105 {
3106   reg_errcode_t err;
3107   int idx, outside_node;
3108   re_node_set new_nodes;
3109 #ifdef DEBUG
3110   assert (cur_nodes->nelem);
3111 #endif
3112   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3113   if (BE (err != REG_NOERROR, 0))
3114     return err;
3115   /* Create a new node set NEW_NODES with the nodes which are epsilon
3116      closures of the node in CUR_NODES.  */
3117
3118   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3119     {
3120       int cur_node = cur_nodes->elems[idx];
3121       re_node_set *eclosure = dfa->eclosures + cur_node;
3122       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3123       if (outside_node == -1)
3124         {
3125           /* There are no problematic nodes, just merge them.  */
3126           err = re_node_set_merge (&new_nodes, eclosure);
3127           if (BE (err != REG_NOERROR, 0))
3128             {
3129               re_node_set_free (&new_nodes);
3130               return err;
3131             }
3132         }
3133       else
3134         {
3135           /* There are problematic nodes, re-calculate incrementally.  */
3136           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3137                                               ex_subexp, type);
3138           if (BE (err != REG_NOERROR, 0))
3139             {
3140               re_node_set_free (&new_nodes);
3141               return err;
3142             }
3143         }
3144     }
3145   re_node_set_free (cur_nodes);
3146   *cur_nodes = new_nodes;
3147   return REG_NOERROR;
3148 }
3149
3150 /* Helper function for check_arrival_expand_ecl.
3151    Check incrementally the epsilon closure of TARGET, and if it isn't
3152    problematic append it to DST_NODES.  */
3153
3154 static reg_errcode_t
3155 check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
3156      re_dfa_t *dfa;
3157      int target, ex_subexp, type;
3158      re_node_set *dst_nodes;
3159 {
3160   int cur_node;
3161   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3162     {
3163       int err;
3164
3165       if (dfa->nodes[cur_node].type == type
3166           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3167         {
3168           if (type == OP_CLOSE_SUBEXP)
3169             {
3170               err = re_node_set_insert (dst_nodes, cur_node);
3171               if (BE (err == -1, 0))
3172                 return REG_ESPACE;
3173             }
3174           break;
3175         }
3176       err = re_node_set_insert (dst_nodes, cur_node);
3177       if (BE (err == -1, 0))
3178         return REG_ESPACE;
3179       if (dfa->edests[cur_node].nelem == 0)
3180         break;
3181       if (dfa->edests[cur_node].nelem == 2)
3182         {
3183           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3184                                               dfa->edests[cur_node].elems[1],
3185                                               ex_subexp, type);
3186           if (BE (err != REG_NOERROR, 0))
3187             return err;
3188         }
3189       cur_node = dfa->edests[cur_node].elems[0];
3190     }
3191   return REG_NOERROR;
3192 }
3193
3194
3195 /* For all the back references in the current state, calculate the
3196    destination of the back references by the appropriate entry
3197    in MCTX->BKREF_ENTS.  */
3198
3199 static reg_errcode_t
3200 expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
3201                     type)
3202      re_match_context_t *mctx;
3203      int cur_str, subexp_num, type;
3204      re_node_set *cur_nodes;
3205 {
3206   re_dfa_t *const dfa = mctx->dfa;
3207   reg_errcode_t err;
3208   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3209   struct re_backref_cache_entry *ent;
3210
3211   if (cache_idx_start == -1)
3212     return REG_NOERROR;
3213
3214  restart:
3215   ent = mctx->bkref_ents + cache_idx_start;
3216   do
3217     {
3218       int to_idx, next_node;
3219
3220       /* Is this entry ENT is appropriate?  */
3221       if (!re_node_set_contains (cur_nodes, ent->node))
3222         continue; /* No.  */
3223
3224       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3225       /* Calculate the destination of the back reference, and append it
3226          to MCTX->STATE_LOG.  */
3227       if (to_idx == cur_str)
3228         {
3229           /* The backreference did epsilon transit, we must re-check all the
3230              node in the current state.  */
3231           re_node_set new_dests;
3232           reg_errcode_t err2, err3;
3233           next_node = dfa->edests[ent->node].elems[0];
3234           if (re_node_set_contains (cur_nodes, next_node))
3235             continue;
3236           err = re_node_set_init_1 (&new_dests, next_node);
3237           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3238           err3 = re_node_set_merge (cur_nodes, &new_dests);
3239           re_node_set_free (&new_dests);
3240           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3241                   || err3 != REG_NOERROR, 0))
3242             {
3243               err = (err != REG_NOERROR ? err
3244                      : (err2 != REG_NOERROR ? err2 : err3));
3245               return err;
3246             }
3247           /* TODO: It is still inefficient...  */
3248           goto restart;
3249         }
3250       else
3251         {
3252           re_node_set union_set;
3253           next_node = dfa->nexts[ent->node];
3254           if (mctx->state_log[to_idx])
3255             {
3256               int ret;
3257               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3258                                         next_node))
3259                 continue;
3260               err = re_node_set_init_copy (&union_set,
3261                                            &mctx->state_log[to_idx]->nodes);
3262               ret = re_node_set_insert (&union_set, next_node);
3263               if (BE (err != REG_NOERROR || ret < 0, 0))
3264                 {
3265                   re_node_set_free (&union_set);
3266                   err = err != REG_NOERROR ? err : REG_ESPACE;
3267                   return err;
3268                 }
3269             }
3270           else
3271             {
3272               err = re_node_set_init_1 (&union_set, next_node);
3273               if (BE (err != REG_NOERROR, 0))
3274                 return err;
3275             }
3276           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3277           re_node_set_free (&union_set);
3278           if (BE (mctx->state_log[to_idx] == NULL
3279                   && err != REG_NOERROR, 0))
3280             return err;
3281         }
3282     }
3283   while (ent++->more);
3284   return REG_NOERROR;
3285 }
3286
3287 /* Build transition table for the state.
3288    Return the new table if succeeded, otherwise return NULL.  */
3289
3290 static re_dfastate_t **
3291 build_trtable (dfa, state)
3292     re_dfa_t *dfa;
3293     re_dfastate_t *state;
3294 {
3295   reg_errcode_t err;
3296   int i, j, ch;
3297   unsigned int elem, mask;
3298   int dests_node_malloced = 0, dest_states_malloced = 0;
3299   int ndests; /* Number of the destination states from `state'.  */
3300   re_dfastate_t **trtable;
3301   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3302   re_node_set follows, *dests_node;
3303   bitset *dests_ch;
3304   bitset acceptable;
3305
3306   /* We build DFA states which corresponds to the destination nodes
3307      from `state'.  `dests_node[i]' represents the nodes which i-th
3308      destination state contains, and `dests_ch[i]' represents the
3309      characters which i-th destination state accepts.  */
3310 #ifdef _LIBC
3311   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3312     dests_node = (re_node_set *)
3313                  alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3314   else
3315 #endif
3316     {
3317       dests_node = (re_node_set *)
3318                    malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3319       if (BE (dests_node == NULL, 0))
3320         return NULL;
3321       dests_node_malloced = 1;
3322     }
3323   dests_ch = (bitset *) (dests_node + SBC_MAX);
3324
3325   /* Initialize transiton table.  */
3326   state->word_trtable = 0;
3327
3328   /* At first, group all nodes belonging to `state' into several
3329      destinations.  */
3330   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3331   if (BE (ndests <= 0, 0))
3332     {
3333       if (dests_node_malloced)
3334         free (dests_node);
3335       /* Return NULL in case of an error, trtable otherwise.  */
3336       if (ndests == 0)
3337         {
3338           state->trtable = (re_dfastate_t **)
3339             calloc (sizeof (re_dfastate_t *), SBC_MAX);;
3340           return state->trtable;
3341         }
3342       return NULL;
3343     }
3344
3345   err = re_node_set_alloc (&follows, ndests + 1);
3346   if (BE (err != REG_NOERROR, 0))
3347     goto out_free;
3348
3349 #ifdef _LIBC
3350   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3351                          + ndests * 3 * sizeof (re_dfastate_t *)))
3352     dest_states = (re_dfastate_t **)
3353                   alloca (ndests * 3 * sizeof (re_dfastate_t *));
3354   else
3355 #endif
3356     {
3357       dest_states = (re_dfastate_t **)
3358                     malloc (ndests * 3 * sizeof (re_dfastate_t *));
3359       if (BE (dest_states == NULL, 0))
3360         {
3361 out_free:
3362           if (dest_states_malloced)
3363             free (dest_states);
3364           re_node_set_free (&follows);
3365           for (i = 0; i < ndests; ++i)
3366             re_node_set_free (dests_node + i);
3367           if (dests_node_malloced)
3368             free (dests_node);
3369           return NULL;
3370         }
3371       dest_states_malloced = 1;
3372     }
3373   dest_states_word = dest_states + ndests;
3374   dest_states_nl = dest_states_word + ndests;
3375   bitset_empty (acceptable);
3376
3377   /* Then build the states for all destinations.  */
3378   for (i = 0; i < ndests; ++i)
3379     {
3380       int next_node;
3381       re_node_set_empty (&follows);
3382       /* Merge the follows of this destination states.  */
3383       for (j = 0; j < dests_node[i].nelem; ++j)
3384         {
3385           next_node = dfa->nexts[dests_node[i].elems[j]];
3386           if (next_node != -1)
3387             {
3388               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3389               if (BE (err != REG_NOERROR, 0))
3390                 goto out_free;
3391             }
3392         }
3393       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3394       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3395         goto out_free;
3396       /* If the new state has context constraint,
3397          build appropriate states for these contexts.  */
3398       if (dest_states[i]->has_constraint)
3399         {
3400           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3401                                                           CONTEXT_WORD);
3402           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3403             goto out_free;
3404
3405           if (dest_states[i] != dest_states_word[i]
3406               && dfa->mb_cur_max > 1)
3407             state->word_trtable = 1;
3408
3409           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3410                                                         CONTEXT_NEWLINE);
3411           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3412             goto out_free;
3413         }
3414       else
3415         {
3416           dest_states_word[i] = dest_states[i];
3417           dest_states_nl[i] = dest_states[i];
3418         }
3419       bitset_merge (acceptable, dests_ch[i]);
3420     }
3421
3422   if (!BE (state->word_trtable, 0))
3423     {
3424       /* We don't care about whether the following character is a word
3425          character, or we are in a single-byte character set so we can
3426          discern by looking at the character code: allocate a
3427          256-entry transition table.  */
3428       trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3429       if (BE (trtable == NULL, 0))
3430         goto out_free;
3431
3432       /* For all characters ch...:  */
3433       for (i = 0; i < BITSET_UINTS; ++i)
3434         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3435              elem;
3436              mask <<= 1, elem >>= 1, ++ch)
3437           if (BE (elem & 1, 0))
3438             {
3439               /* There must be exactly one destination which accepts
3440                  character ch.  See group_nodes_into_DFAstates.  */
3441               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3442                 ;
3443
3444               /* j-th destination accepts the word character ch.  */
3445               if (dfa->word_char[i] & mask)
3446                 trtable[ch] = dest_states_word[j];
3447               else
3448                 trtable[ch] = dest_states[j];
3449             }
3450     }
3451   else
3452     {
3453       /* We care about whether the following character is a word
3454          character, and we are in a multi-byte character set: discern
3455          by looking at the character code: build two 256-entry
3456          transition tables, one starting at trtable[0] and one
3457          starting at trtable[SBC_MAX].  */
3458       trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *),
3459                                            2 * SBC_MAX);
3460       if (BE (trtable == NULL, 0))
3461         goto out_free;
3462
3463       /* For all characters ch...:  */
3464       for (i = 0; i < BITSET_UINTS; ++i)
3465         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3466              elem;
3467              mask <<= 1, elem >>= 1, ++ch)
3468           if (BE (elem & 1, 0))
3469             {
3470               /* There must be exactly one destination which accepts
3471                  character ch.  See group_nodes_into_DFAstates.  */
3472               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3473                 ;
3474
3475               /* j-th destination accepts the word character ch.  */
3476               trtable[ch] = dest_states[j];
3477               trtable[ch + SBC_MAX] = dest_states_word[j];
3478             }
3479     }
3480
3481   /* new line */
3482   if (bitset_contain (acceptable, NEWLINE_CHAR))
3483     {
3484       /* The current state accepts newline character.  */
3485       for (j = 0; j < ndests; ++j)
3486         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3487           {
3488             /* k-th destination accepts newline character.  */
3489             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3490             if (state->word_trtable)
3491               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3492             /* There must be only one destination which accepts
3493                newline.  See group_nodes_into_DFAstates.  */
3494             break;
3495           }
3496     }
3497
3498   if (dest_states_malloced)
3499     free (dest_states);
3500
3501   re_node_set_free (&follows);
3502   for (i = 0; i < ndests; ++i)
3503     re_node_set_free (dests_node + i);
3504
3505   if (dests_node_malloced)
3506     free (dests_node);
3507
3508   state->trtable = trtable;
3509   return trtable;
3510 }
3511
3512 /* Group all nodes belonging to STATE into several destinations.
3513    Then for all destinations, set the nodes belonging to the destination
3514    to DESTS_NODE[i] and set the characters accepted by the destination
3515    to DEST_CH[i].  This function return the number of destinations.  */
3516
3517 static int
3518 group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3519     re_dfa_t *dfa;
3520     const re_dfastate_t *state;
3521     re_node_set *dests_node;
3522     bitset *dests_ch;
3523 {
3524   reg_errcode_t err;
3525   int result;
3526   int i, j, k;
3527   int ndests; /* Number of the destinations from `state'.  */
3528   bitset accepts; /* Characters a node can accept.  */
3529   const re_node_set *cur_nodes = &state->nodes;
3530   bitset_empty (accepts);
3531   ndests = 0;
3532
3533   /* For all the nodes belonging to `state',  */
3534   for (i = 0; i < cur_nodes->nelem; ++i)
3535     {
3536       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3537       re_token_type_t type = node->type;
3538       unsigned int constraint = node->constraint;
3539
3540       /* Enumerate all single byte character this node can accept.  */
3541       if (type == CHARACTER)
3542         bitset_set (accepts, node->opr.c);
3543       else if (type == SIMPLE_BRACKET)
3544         {
3545           bitset_merge (accepts, node->opr.sbcset);
3546         }
3547       else if (type == OP_PERIOD)
3548         {
3549 #ifdef RE_ENABLE_I18N
3550           if (dfa->mb_cur_max > 1)
3551             bitset_merge (accepts, dfa->sb_char);
3552           else
3553 #endif
3554             bitset_set_all (accepts);
3555           if (!(dfa->syntax & RE_DOT_NEWLINE))
3556             bitset_clear (accepts, '\n');
3557           if (dfa->syntax & RE_DOT_NOT_NULL)
3558             bitset_clear (accepts, '\0');
3559         }
3560 #ifdef RE_ENABLE_I18N
3561       else if (type == OP_UTF8_PERIOD)
3562         {
3563           memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3564           if (!(dfa->syntax & RE_DOT_NEWLINE))
3565             bitset_clear (accepts, '\n');
3566           if (dfa->syntax & RE_DOT_NOT_NULL)
3567             bitset_clear (accepts, '\0');
3568         }
3569 #endif
3570       else
3571         continue;
3572
3573       /* Check the `accepts' and sift the characters which are not
3574          match it the context.  */
3575       if (constraint)
3576         {
3577           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3578             {
3579               int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3580               bitset_empty (accepts);
3581               if (accepts_newline)
3582                 bitset_set (accepts, NEWLINE_CHAR);
3583               else
3584                 continue;
3585             }
3586           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3587             {
3588               bitset_empty (accepts);
3589               continue;
3590             }
3591
3592           if (constraint & NEXT_WORD_CONSTRAINT)
3593             {
3594               unsigned int any_set = 0;
3595               if (type == CHARACTER && !node->word_char)
3596                 {
3597                   bitset_empty (accepts);
3598                   continue;
3599                 }
3600 #ifdef RE_ENABLE_I18N
3601               if (dfa->mb_cur_max > 1)
3602                 for (j = 0; j < BITSET_UINTS; ++j)
3603                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3604               else
3605 #endif
3606                 for (j = 0; j < BITSET_UINTS; ++j)
3607                   any_set |= (accepts[j] &= dfa->word_char[j]);
3608               if (!any_set)
3609                 continue;
3610             }
3611           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3612             {
3613               unsigned int any_set = 0;
3614               if (type == CHARACTER && node->word_char)
3615                 {
3616                   bitset_empty (accepts);
3617                   continue;
3618                 }
3619 #ifdef RE_ENABLE_I18N
3620               if (dfa->mb_cur_max > 1)
3621                 for (j = 0; j < BITSET_UINTS; ++j)
3622                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3623               else
3624 #endif
3625                 for (j = 0; j < BITSET_UINTS; ++j)
3626                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3627               if (!any_set)
3628                 continue;
3629             }
3630         }
3631
3632       /* Then divide `accepts' into DFA states, or create a new
3633          state.  Above, we make sure that accepts is not empty.  */
3634       for (j = 0; j < ndests; ++j)
3635         {
3636           bitset intersec; /* Intersection sets, see below.  */
3637           bitset remains;
3638           /* Flags, see below.  */
3639           int has_intersec, not_subset, not_consumed;
3640
3641           /* Optimization, skip if this state doesn't accept the character.  */
3642           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3643             continue;
3644
3645           /* Enumerate the intersection set of this state and `accepts'.  */
3646           has_intersec = 0;
3647           for (k = 0; k < BITSET_UINTS; ++k)
3648             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3649           /* And skip if the intersection set is empty.  */
3650           if (!has_intersec)
3651             continue;
3652
3653           /* Then check if this state is a subset of `accepts'.  */
3654           not_subset = not_consumed = 0;
3655           for (k = 0; k < BITSET_UINTS; ++k)
3656             {
3657               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3658               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3659             }
3660
3661           /* If this state isn't a subset of `accepts', create a
3662              new group state, which has the `remains'. */
3663           if (not_subset)
3664             {
3665               bitset_copy (dests_ch[ndests], remains);
3666               bitset_copy (dests_ch[j], intersec);
3667               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3668               if (BE (err != REG_NOERROR, 0))
3669                 goto error_return;
3670               ++ndests;
3671             }
3672
3673           /* Put the position in the current group. */
3674           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3675           if (BE (result < 0, 0))
3676             goto error_return;
3677
3678           /* If all characters are consumed, go to next node. */
3679           if (!not_consumed)
3680             break;
3681         }
3682       /* Some characters remain, create a new group. */
3683       if (j == ndests)
3684         {
3685           bitset_copy (dests_ch[ndests], accepts);
3686           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3687           if (BE (err != REG_NOERROR, 0))
3688             goto error_return;
3689           ++ndests;
3690           bitset_empty (accepts);
3691         }
3692     }
3693   return ndests;
3694  error_return:
3695   for (j = 0; j < ndests; ++j)
3696     re_node_set_free (dests_node + j);
3697   return -1;
3698 }
3699
3700 #ifdef RE_ENABLE_I18N
3701 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3702    Return the number of the bytes the node accepts.
3703    STR_IDX is the current index of the input string.
3704
3705    This function handles the nodes which can accept one character, or
3706    one collating element like '.', '[a-z]', opposite to the other nodes
3707    can only accept one byte.  */
3708
3709 static int
3710 check_node_accept_bytes (dfa, node_idx, input, str_idx)
3711     re_dfa_t *dfa;
3712     int node_idx, str_idx;
3713     const re_string_t *input;
3714 {
3715   const re_token_t *node = dfa->nodes + node_idx;
3716   int char_len, elem_len;
3717   int i;
3718
3719   if (BE (node->type == OP_UTF8_PERIOD, 0))
3720     {
3721       unsigned char c = re_string_byte_at (input, str_idx), d;
3722       if (BE (c < 0xc2, 1))
3723         return 0;
3724
3725       if (str_idx + 2 > input->len)
3726         return 0;
3727
3728       d = re_string_byte_at (input, str_idx + 1);
3729       if (c < 0xe0)
3730         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3731       else if (c < 0xf0)
3732         {
3733           char_len = 3;
3734           if (c == 0xe0 && d < 0xa0)
3735             return 0;
3736         }
3737       else if (c < 0xf8)
3738         {
3739           char_len = 4;
3740           if (c == 0xf0 && d < 0x90)
3741             return 0;
3742         }
3743       else if (c < 0xfc)
3744         {
3745           char_len = 5;
3746           if (c == 0xf8 && d < 0x88)
3747             return 0;
3748         }
3749       else if (c < 0xfe)
3750         {
3751           char_len = 6;
3752           if (c == 0xfc && d < 0x84)
3753             return 0;
3754         }
3755       else
3756         return 0;
3757
3758       if (str_idx + char_len > input->len)
3759         return 0;
3760
3761       for (i = 1; i < char_len; ++i)
3762         {
3763           d = re_string_byte_at (input, str_idx + i);
3764           if (d < 0x80 || d > 0xbf)
3765             return 0;
3766         }
3767       return char_len;
3768     }
3769
3770   char_len = re_string_char_size_at (input, str_idx);
3771   if (node->type == OP_PERIOD)
3772     {
3773       if (char_len <= 1)
3774         return 0;
3775       /* FIXME: I don't think this if is needed, as both '\n'
3776          and '\0' are char_len == 1.  */
3777       /* '.' accepts any one character except the following two cases.  */
3778       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3779            re_string_byte_at (input, str_idx) == '\n') ||
3780           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3781            re_string_byte_at (input, str_idx) == '\0'))
3782         return 0;
3783       return char_len;
3784     }
3785
3786   elem_len = re_string_elem_size_at (input, str_idx);
3787   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3788     return 0;
3789
3790   if (node->type == COMPLEX_BRACKET)
3791     {
3792       const re_charset_t *cset = node->opr.mbcset;
3793 # ifdef _LIBC
3794       const unsigned char *pin = ((char *) re_string_get_buffer (input)
3795                                   + str_idx);
3796       int j;
3797       uint32_t nrules;
3798 # endif /* _LIBC */
3799       int match_len = 0;
3800       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3801                     ? re_string_wchar_at (input, str_idx) : 0);
3802
3803       /* match with multibyte character?  */
3804       for (i = 0; i < cset->nmbchars; ++i)
3805         if (wc == cset->mbchars[i])
3806           {
3807             match_len = char_len;
3808             goto check_node_accept_bytes_match;
3809           }
3810       /* match with character_class?  */
3811       for (i = 0; i < cset->nchar_classes; ++i)
3812         {
3813           wctype_t wt = cset->char_classes[i];
3814           if (__iswctype (wc, wt))
3815             {
3816               match_len = char_len;
3817               goto check_node_accept_bytes_match;
3818             }
3819         }
3820
3821 # ifdef _LIBC
3822       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3823       if (nrules != 0)
3824         {
3825           unsigned int in_collseq = 0;
3826           const int32_t *table, *indirect;
3827           const unsigned char *weights, *extra;
3828           const char *collseqwc;
3829           int32_t idx;
3830           /* This #include defines a local function!  */
3831 #  include <locale/weight.h>
3832
3833           /* match with collating_symbol?  */
3834           if (cset->ncoll_syms)
3835             extra = (const unsigned char *)
3836               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3837           for (i = 0; i < cset->ncoll_syms; ++i)
3838             {
3839               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3840               /* Compare the length of input collating element and
3841                  the length of current collating element.  */
3842               if (*coll_sym != elem_len)
3843                 continue;
3844               /* Compare each bytes.  */
3845               for (j = 0; j < *coll_sym; j++)
3846                 if (pin[j] != coll_sym[1 + j])
3847                   break;
3848               if (j == *coll_sym)
3849                 {
3850                   /* Match if every bytes is equal.  */
3851                   match_len = j;
3852                   goto check_node_accept_bytes_match;
3853                 }
3854             }
3855
3856           if (cset->nranges)
3857             {
3858               if (elem_len <= char_len)
3859                 {
3860                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3861                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3862                 }
3863               else
3864                 in_collseq = find_collation_sequence_value (pin, elem_len);
3865             }
3866           /* match with range expression?  */
3867           for (i = 0; i < cset->nranges; ++i)
3868             if (cset->range_starts[i] <= in_collseq
3869                 && in_collseq <= cset->range_ends[i])
3870               {
3871                 match_len = elem_len;
3872                 goto check_node_accept_bytes_match;
3873               }
3874
3875           /* match with equivalence_class?  */
3876           if (cset->nequiv_classes)
3877             {
3878               const unsigned char *cp = pin;
3879               table = (const int32_t *)
3880                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3881               weights = (const unsigned char *)
3882                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3883               extra = (const unsigned char *)
3884                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3885               indirect = (const int32_t *)
3886                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3887               idx = findidx (&cp);
3888               if (idx > 0)
3889                 for (i = 0; i < cset->nequiv_classes; ++i)
3890                   {
3891                     int32_t equiv_class_idx = cset->equiv_classes[i];
3892                     size_t weight_len = weights[idx];
3893                     if (weight_len == weights[equiv_class_idx])
3894                       {
3895                         int cnt = 0;
3896                         while (cnt <= weight_len
3897                                && (weights[equiv_class_idx + 1 + cnt]
3898                                    == weights[idx + 1 + cnt]))
3899                           ++cnt;
3900                         if (cnt > weight_len)
3901                           {
3902                             match_len = elem_len;
3903                             goto check_node_accept_bytes_match;
3904                           }
3905                       }
3906                   }
3907             }
3908         }
3909       else
3910 # endif /* _LIBC */
3911         {
3912           /* match with range expression?  */
3913 #if __GNUC__ >= 2
3914           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3915 #else
3916           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3917           cmp_buf[2] = wc;
3918 #endif
3919           for (i = 0; i < cset->nranges; ++i)
3920             {
3921               cmp_buf[0] = cset->range_starts[i];
3922               cmp_buf[4] = cset->range_ends[i];
3923               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3924                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3925                 {
3926                   match_len = char_len;
3927                   goto check_node_accept_bytes_match;
3928                 }
3929             }
3930         }
3931     check_node_accept_bytes_match:
3932       if (!cset->non_match)
3933         return match_len;
3934       else
3935         {
3936           if (match_len > 0)
3937             return 0;
3938           else
3939             return (elem_len > char_len) ? elem_len : char_len;
3940         }
3941     }
3942   return 0;
3943 }
3944
3945 # ifdef _LIBC
3946 static unsigned int
3947 find_collation_sequence_value (mbs, mbs_len)
3948     const unsigned char *mbs;
3949     size_t mbs_len;
3950 {
3951   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3952   if (nrules == 0)
3953     {
3954       if (mbs_len == 1)
3955         {
3956           /* No valid character.  Match it as a single byte character.  */
3957           const unsigned char *collseq = (const unsigned char *)
3958             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3959           return collseq[mbs[0]];
3960         }
3961       return UINT_MAX;
3962     }
3963   else
3964     {
3965       int32_t idx;
3966       const unsigned char *extra = (const unsigned char *)
3967         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3968       int32_t extrasize = (const unsigned char *)
3969         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3970
3971       for (idx = 0; idx < extrasize;)
3972         {
3973           int mbs_cnt, found = 0;
3974           int32_t elem_mbs_len;
3975           /* Skip the name of collating element name.  */
3976           idx = idx + extra[idx] + 1;
3977           elem_mbs_len = extra[idx++];
3978           if (mbs_len == elem_mbs_len)
3979             {
3980               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3981                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3982                   break;
3983               if (mbs_cnt == elem_mbs_len)
3984                 /* Found the entry.  */
3985                 found = 1;
3986             }
3987           /* Skip the byte sequence of the collating element.  */
3988           idx += elem_mbs_len;
3989           /* Adjust for the alignment.  */
3990           idx = (idx + 3) & ~3;
3991           /* Skip the collation sequence value.  */
3992           idx += sizeof (uint32_t);
3993           /* Skip the wide char sequence of the collating element.  */
3994           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3995           /* If we found the entry, return the sequence value.  */
3996           if (found)
3997             return *(uint32_t *) (extra + idx);
3998           /* Skip the collation sequence value.  */
3999           idx += sizeof (uint32_t);
4000         }
4001       return UINT_MAX;
4002     }
4003 }
4004 # endif /* _LIBC */
4005 #endif /* RE_ENABLE_I18N */
4006
4007 /* Check whether the node accepts the byte which is IDX-th
4008    byte of the INPUT.  */
4009
4010 static int
4011 check_node_accept (mctx, node, idx)
4012     const re_match_context_t *mctx;
4013     const re_token_t *node;
4014     int idx;
4015 {
4016   unsigned char ch;
4017   ch = re_string_byte_at (&mctx->input, idx);
4018   switch (node->type)
4019     {
4020     case CHARACTER:
4021       if (node->opr.c != ch)
4022         return 0;
4023       break;
4024
4025     case SIMPLE_BRACKET:
4026       if (!bitset_contain (node->opr.sbcset, ch))
4027         return 0;
4028       break;
4029
4030 #ifdef RE_ENABLE_I18N
4031     case OP_UTF8_PERIOD:
4032       if (ch >= 0x80)
4033         return 0;
4034       /* FALLTHROUGH */
4035 #endif
4036     case OP_PERIOD:
4037       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4038           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4039         return 0;
4040       break;
4041
4042     default:
4043       return 0;
4044     }
4045
4046   if (node->constraint)
4047     {
4048       /* The node has constraints.  Check whether the current context
4049          satisfies the constraints.  */
4050       unsigned int context = re_string_context_at (&mctx->input, idx,
4051                                                    mctx->eflags);
4052       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4053         return 0;
4054     }
4055
4056   return 1;
4057 }
4058
4059 /* Extend the buffers, if the buffers have run out.  */
4060
4061 static reg_errcode_t
4062 extend_buffers (mctx)
4063      re_match_context_t *mctx;
4064 {
4065   reg_errcode_t ret;
4066   re_string_t *pstr = &mctx->input;
4067
4068   /* Double the lengthes of the buffers.  */
4069   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4070   if (BE (ret != REG_NOERROR, 0))
4071     return ret;
4072
4073   if (mctx->state_log != NULL)
4074     {
4075       /* And double the length of state_log.  */
4076       /* XXX We have no indication of the size of this buffer.  If this
4077          allocation fail we have no indication that the state_log array
4078          does not have the right size.  */
4079       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4080                                               pstr->bufs_len + 1);
4081       if (BE (new_array == NULL, 0))
4082         return REG_ESPACE;
4083       mctx->state_log = new_array;
4084     }
4085
4086   /* Then reconstruct the buffers.  */
4087   if (pstr->icase)
4088     {
4089 #ifdef RE_ENABLE_I18N
4090       if (pstr->mb_cur_max > 1)
4091         {
4092           ret = build_wcs_upper_buffer (pstr);
4093           if (BE (ret != REG_NOERROR, 0))
4094             return ret;
4095         }
4096       else
4097 #endif /* RE_ENABLE_I18N  */
4098         build_upper_buffer (pstr);
4099     }
4100   else
4101     {
4102 #ifdef RE_ENABLE_I18N
4103       if (pstr->mb_cur_max > 1)
4104         build_wcs_buffer (pstr);
4105       else
4106 #endif /* RE_ENABLE_I18N  */
4107         {
4108           if (pstr->trans != NULL)
4109             re_string_translate_buffer (pstr);
4110         }
4111     }
4112   return REG_NOERROR;
4113 }
4114
4115 \f
4116 /* Functions for matching context.  */
4117
4118 /* Initialize MCTX.  */
4119
4120 static reg_errcode_t
4121 match_ctx_init (mctx, eflags, n)
4122     re_match_context_t *mctx;
4123     int eflags, n;
4124 {
4125   mctx->eflags = eflags;
4126   mctx->match_last = -1;
4127   if (n > 0)
4128     {
4129       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4130       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4131       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4132         return REG_ESPACE;
4133     }
4134   /* Already zero-ed by the caller.
4135      else
4136        mctx->bkref_ents = NULL;
4137      mctx->nbkref_ents = 0;
4138      mctx->nsub_tops = 0;  */
4139   mctx->abkref_ents = n;
4140   mctx->max_mb_elem_len = 1;
4141   mctx->asub_tops = n;
4142   return REG_NOERROR;
4143 }
4144
4145 /* Clean the entries which depend on the current input in MCTX.
4146    This function must be invoked when the matcher changes the start index
4147    of the input, or changes the input string.  */
4148
4149 static void
4150 match_ctx_clean (mctx)
4151     re_match_context_t *mctx;
4152 {
4153   int st_idx;
4154   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4155     {
4156       int sl_idx;
4157       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4158       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4159         {
4160           re_sub_match_last_t *last = top->lasts[sl_idx];
4161           re_free (last->path.array);
4162           re_free (last);
4163         }
4164       re_free (top->lasts);
4165       if (top->path)
4166         {
4167           re_free (top->path->array);
4168           re_free (top->path);
4169         }
4170       free (top);
4171     }
4172
4173   mctx->nsub_tops = 0;
4174   mctx->nbkref_ents = 0;
4175 }
4176
4177 /* Free all the memory associated with MCTX.  */
4178
4179 static void
4180 match_ctx_free (mctx)
4181     re_match_context_t *mctx;
4182 {
4183   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4184   match_ctx_clean (mctx);
4185   re_free (mctx->sub_tops);
4186   re_free (mctx->bkref_ents);
4187 }
4188
4189 /* Add a new backreference entry to MCTX.
4190    Note that we assume that caller never call this function with duplicate
4191    entry, and call with STR_IDX which isn't smaller than any existing entry.
4192 */
4193
4194 static reg_errcode_t
4195 match_ctx_add_entry (mctx, node, str_idx, from, to)
4196      re_match_context_t *mctx;
4197      int node, str_idx, from, to;
4198 {
4199   if (mctx->nbkref_ents >= mctx->abkref_ents)
4200     {
4201       struct re_backref_cache_entry* new_entry;
4202       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4203                               mctx->abkref_ents * 2);
4204       if (BE (new_entry == NULL, 0))
4205         {
4206           re_free (mctx->bkref_ents);
4207           return REG_ESPACE;
4208         }
4209       mctx->bkref_ents = new_entry;
4210       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4211               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4212       mctx->abkref_ents *= 2;
4213     }
4214   if (mctx->nbkref_ents > 0
4215       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4216     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4217
4218   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4219   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4220   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4221   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4222
4223   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4224      If bit N is clear, means that this entry won't epsilon-transition to
4225      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4226      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4227      such node.
4228
4229      A backreference does not epsilon-transition unless it is empty, so set
4230      to all zeros if FROM != TO.  */
4231   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4232     = (from == to ? ~0 : 0);
4233
4234   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4235   if (mctx->max_mb_elem_len < to - from)
4236     mctx->max_mb_elem_len = to - from;
4237   return REG_NOERROR;
4238 }
4239
4240 /* Search for the first entry which has the same str_idx, or -1 if none is
4241    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4242
4243 static int
4244 search_cur_bkref_entry (mctx, str_idx)
4245      re_match_context_t *mctx;
4246      int str_idx;
4247 {
4248   int left, right, mid, last;
4249   last = right = mctx->nbkref_ents;
4250   for (left = 0; left < right;)
4251     {
4252       mid = (left + right) / 2;
4253       if (mctx->bkref_ents[mid].str_idx < str_idx)
4254         left = mid + 1;
4255       else
4256         right = mid;
4257     }
4258   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4259     return left;
4260   else
4261     return -1;
4262 }
4263
4264 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4265    at STR_IDX.  */
4266
4267 static reg_errcode_t
4268 match_ctx_add_subtop (mctx, node, str_idx)
4269      re_match_context_t *mctx;
4270      int node, str_idx;
4271 {
4272 #ifdef DEBUG
4273   assert (mctx->sub_tops != NULL);
4274   assert (mctx->asub_tops > 0);
4275 #endif
4276   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4277     {
4278       int new_asub_tops = mctx->asub_tops * 2;
4279       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4280                                                    re_sub_match_top_t *,
4281                                                    new_asub_tops);
4282       if (BE (new_array == NULL, 0))
4283         return REG_ESPACE;
4284       mctx->sub_tops = new_array;
4285       mctx->asub_tops = new_asub_tops;
4286     }
4287   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4288   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4289     return REG_ESPACE;
4290   mctx->sub_tops[mctx->nsub_tops]->node = node;
4291   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4292   return REG_NOERROR;
4293 }
4294
4295 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4296    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4297
4298 static re_sub_match_last_t *
4299 match_ctx_add_sublast (subtop, node, str_idx)
4300      re_sub_match_top_t *subtop;
4301      int node, str_idx;
4302 {
4303   re_sub_match_last_t *new_entry;
4304   if (BE (subtop->nlasts == subtop->alasts, 0))
4305     {
4306       int new_alasts = 2 * subtop->alasts + 1;
4307       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4308                                                     re_sub_match_last_t *,
4309                                                     new_alasts);
4310       if (BE (new_array == NULL, 0))
4311         return NULL;
4312       subtop->lasts = new_array;
4313       subtop->alasts = new_alasts;
4314     }
4315   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4316   if (BE (new_entry != NULL, 1))
4317     {
4318       subtop->lasts[subtop->nlasts] = new_entry;
4319       new_entry->node = node;
4320       new_entry->str_idx = str_idx;
4321       ++subtop->nlasts;
4322     }
4323   return new_entry;
4324 }
4325
4326 static void
4327 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx)
4328     re_sift_context_t *sctx;
4329     re_dfastate_t **sifted_sts, **limited_sts;
4330     int last_node, last_str_idx;
4331 {
4332   sctx->sifted_states = sifted_sts;
4333   sctx->limited_states = limited_sts;
4334   sctx->last_node = last_node;
4335   sctx->last_str_idx = last_str_idx;
4336   re_node_set_init_empty (&sctx->limits);
4337 }