posix/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                      int n) internal_function;
  23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24 static void match_ctx_free (re_match_context_t *cache) internal_function;
  25 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  26                                           int str_idx, int from, int to)
  27      internal_function;
  28 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
  29      internal_function;
  30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  31                                            int str_idx) internal_function;
  32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  33                                                    int node, int str_idx)
  34      internal_function;
  35 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  36                            re_dfastate_t **limited_sts, int last_node,
  37                            int last_str_idx)
  38      internal_function;
  39 static reg_errcode_t re_search_internal (const regex_t *preg,
  40                                          const char *string, int length,
  41                                          int start, int range, int stop,
  42                                          size_t nmatch, regmatch_t pmatch[],
  43                                          int eflags) internal_function;
  44 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  45                              const char *string1, int length1,
  46                              const char *string2, int length2,
  47                              int start, int range, struct re_registers *regs,
  48                              int stop, int ret_len) internal_function;
  49 static int re_search_stub (struct re_pattern_buffer *bufp,
  50                            const char *string, int length, int start,
  51                            int range, int stop, struct re_registers *regs,
  52                            int ret_len) internal_function;
  53 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  54                               int nregs, int regs_allocated) internal_function;
  55 static inline re_dfastate_t *acquire_init_state_context
  56      (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
  57      __attribute ((always_inline)) internal_function;
  58 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  59      internal_function;
  60 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  61                            int *p_match_first)
  62      internal_function;
  63 static int check_halt_node_context (const re_dfa_t *dfa, int node,
  64                                     unsigned int context) internal_function;
  65 static int check_halt_state_context (const re_match_context_t *mctx,
  66                                      const re_dfastate_t *state, int idx)
  67      internal_function;
  68 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
  69                          regmatch_t *prev_idx_match, int cur_node,
  70                          int cur_idx, int nmatch) internal_function;
  71 static int proceed_next_node (const re_match_context_t *mctx,
  72                               int nregs, regmatch_t *regs,
  73                               int *pidx, int node, re_node_set *eps_via_nodes,
  74                               struct re_fail_stack_t *fs) internal_function;
  75 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  76                                       int str_idx, int dest_node, int nregs,
  77                                       regmatch_t *regs,
  78                                       re_node_set *eps_via_nodes) internal_function;
  79 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
  80                            regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
  81 static reg_errcode_t set_regs (const regex_t *preg,
  82                                const re_match_context_t *mctx,
  83                                size_t nmatch, regmatch_t *pmatch,
  84                                int fl_backtrack) internal_function;
  85 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
  86
  87 #ifdef RE_ENABLE_I18N
  88 static int sift_states_iter_mb (const re_match_context_t *mctx,
  89                                 re_sift_context_t *sctx,
  90                                 int node_idx, int str_idx, int max_str_idx) internal_function;
  91 #endif /* RE_ENABLE_I18N */
  92 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
  93                                            re_sift_context_t *sctx) internal_function;
  94 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
  95                                           re_sift_context_t *sctx, int str_idx,
  96                                           re_node_set *cur_dest) internal_function;
  97 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
  98                                               re_sift_context_t *sctx,
  99                                               int str_idx,
 100                                               re_node_set *dest_nodes) internal_function;
 101 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
 102                                             re_node_set *dest_nodes,
 103                                             const re_node_set *candidates) internal_function;
 104 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
 105                                             re_node_set *dest_nodes,
 106                                             const re_node_set *and_nodes) internal_function;
 107 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
 108                              int dst_node, int dst_idx, int src_node,
 109                              int src_idx) internal_function;
 110 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
 111                                         int boundaries, int subexp_idx,
 112                                         int from_node, int bkref_idx) internal_function;
 113 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
 114                                       int limit, int subexp_idx,
 115                                       int node, int str_idx,
 116                                       int bkref_idx) internal_function;
 117 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
 118                                           re_node_set *dest_nodes,
 119                                           const re_node_set *candidates,
 120                                           re_node_set *limits,
 121                                           struct re_backref_cache_entry *bkref_ents,
 122                                           int str_idx) internal_function;
 123 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
 124                                         re_sift_context_t *sctx,
 125                                         int str_idx, const re_node_set *candidates) internal_function;
 126 static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
 127                                                 int next_state_log_idx) internal_function;
 128 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
 129                                         re_dfastate_t **src, int num) internal_function;
 130 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 131                                          re_match_context_t *mctx) internal_function;
 132 static re_dfastate_t *transit_state (reg_errcode_t *err,
 133                                      re_match_context_t *mctx,
 134                                      re_dfastate_t *state) internal_function;
 135 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 136                                             re_match_context_t *mctx,
 137                                             re_dfastate_t *next_state) internal_function;
 138 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 139                                                 re_node_set *cur_nodes,
 140                                                 int str_idx) internal_function;
 141 #if 0
 142 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 143                                         re_match_context_t *mctx,
 144                                         re_dfastate_t *pstate) internal_function;
 145 #endif
 146 #ifdef RE_ENABLE_I18N
 147 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 148                                        re_dfastate_t *pstate) internal_function;
 149 #endif /* RE_ENABLE_I18N */
 150 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 151                                           const re_node_set *nodes) internal_function;
 152 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 153                                  int bkref_node, int bkref_str_idx) internal_function;
 154 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 155                                      const re_sub_match_top_t *sub_top,
 156                                      re_sub_match_last_t *sub_last,
 157                                      int bkref_node, int bkref_str) internal_function;
 158 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 159                              int subexp_idx, int type) internal_function;
 160 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 161                                     state_array_t *path, int top_node,
 162                                     int top_str, int last_node, int last_str,
 163                                     int type) internal_function;
 164 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 165                                                    int str_idx,
 166                                                    re_node_set *cur_nodes,
 167                                                    re_node_set *next_nodes) internal_function;
 168 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
 169                                                re_node_set *cur_nodes,
 170                                                int ex_subexp, int type) internal_function;
 171 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
 172                                                    re_node_set *dst_nodes,
 173                                                    int target, int ex_subexp,
 174                                                    int type) internal_function;
 175 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 176                                          re_node_set *cur_nodes, int cur_str,
 177                                          int subexp_num, int type) internal_function;
 178 static re_dfastate_t **build_trtable (re_dfa_t *dfa,
 179                                       re_dfastate_t *state) internal_function;
 180 #ifdef RE_ENABLE_I18N
 181 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
 182                                     const re_string_t *input, int idx) internal_function;
 183 # ifdef _LIBC
 184 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 185                                                    size_t name_len) internal_function;
 186 # endif /* _LIBC */
 187 #endif /* RE_ENABLE_I18N */
 188 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
 189                                        const re_dfastate_t *state,
 190                                        re_node_set *states_node,
 191                                        bitset *states_ch) internal_function;
 192 static int check_node_accept (const re_match_context_t *mctx,
 193                               const re_token_t *node, int idx) internal_function;
 194 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
 195 \f
 196 /* Entry point for POSIX code.  */
 197
 198 /* regexec searches for a given pattern, specified by PREG, in the
 199    string STRING.
 200
 201    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 202    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 203    least NMATCH elements, and we set them to the offsets of the
 204    corresponding matched substrings.
 205
 206    EFLAGS specifies `execution flags' which affect matching: if
 207    REG_NOTBOL is set, then ^ does not match at the beginning of the
 208    string; if REG_NOTEOL is set, then $ does not match at the end.
 209
 210    We return 0 if we find a match and REG_NOMATCH if not.  */
 211
 212 int
 213 regexec (preg, string, nmatch, pmatch, eflags)
 214     const regex_t *__restrict preg;
 215     const char *__restrict string;
 216     size_t nmatch;
 217     regmatch_t pmatch[];
 218     int eflags;
 219 {
 220   reg_errcode_t err;
 221   int start, length;
 222
 223   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 224     return REG_BADPAT;
 225
 226   if (eflags & REG_STARTEND)
 227     {
 228       start = pmatch[0].rm_so;
 229       length = pmatch[0].rm_eo;
 230     }
 231   else
 232     {
 233       start = 0;
 234       length = strlen (string);
 235     }
 236   if (preg->no_sub)
 237     err = re_search_internal (preg, string, length, start, length - start,
 238                               length, 0, NULL, eflags);
 239   else
 240     err = re_search_internal (preg, string, length, start, length - start,
 241                               length, nmatch, pmatch, eflags);
 242   return err != REG_NOERROR;
 243 }
 244
 245 #ifdef _LIBC
 246 # include <shlib-compat.h>
 247 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 248
 249 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 250 __typeof__ (__regexec) __compat_regexec;
 251
 252 int
 253 attribute_compat_text_section
 254 __compat_regexec (const regex_t *__restrict preg,
 255                   const char *__restrict string, size_t nmatch,
 256                   regmatch_t pmatch[], int eflags)
 257 {
 258   return regexec (preg, string, nmatch, pmatch,
 259                   eflags & (REG_NOTBOL | REG_NOTEOL));
 260 }
 261 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 262 # endif
 263 #endif
 264
 265 /* Entry points for GNU code.  */
 266
 267 /* re_match, re_search, re_match_2, re_search_2
 268
 269    The former two functions operate on STRING with length LENGTH,
 270    while the later two operate on concatenation of STRING1 and STRING2
 271    with lengths LENGTH1 and LENGTH2, respectively.
 272
 273    re_match() matches the compiled pattern in BUFP against the string,
 274    starting at index START.
 275
 276    re_search() first tries matching at index START, then it tries to match
 277    starting from index START + 1, and so on.  The last start position tried
 278    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 279    way as re_match().)
 280
 281    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 282    the first STOP characters of the concatenation of the strings should be
 283    concerned.
 284
 285    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 286    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 287    computed relative to the concatenation, not relative to the individual
 288    strings.)
 289
 290    On success, re_match* functions return the length of the match, re_search*
 291    return the position of the start of the match.  Return value -1 means no
 292    match was found and -2 indicates an internal error.  */
 293
 294 int
 295 re_match (bufp, string, length, start, regs)
 296     struct re_pattern_buffer *bufp;
 297     const char *string;
 298     int length, start;
 299     struct re_registers *regs;
 300 {
 301   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 302 }
 303 #ifdef _LIBC
 304 weak_alias (__re_match, re_match)
 305 #endif
 306
 307 int
 308 re_search (bufp, string, length, start, range, regs)
 309     struct re_pattern_buffer *bufp;
 310     const char *string;
 311     int length, start, range;
 312     struct re_registers *regs;
 313 {
 314   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 315 }
 316 #ifdef _LIBC
 317 weak_alias (__re_search, re_search)
 318 #endif
 319
 320 int
 321 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 322     struct re_pattern_buffer *bufp;
 323     const char *string1, *string2;
 324     int length1, length2, start, stop;
 325     struct re_registers *regs;
 326 {
 327   return re_search_2_stub (bufp, string1, length1, string2, length2,
 328                            start, 0, regs, stop, 1);
 329 }
 330 #ifdef _LIBC
 331 weak_alias (__re_match_2, re_match_2)
 332 #endif
 333
 334 int
 335 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 336     struct re_pattern_buffer *bufp;
 337     const char *string1, *string2;
 338     int length1, length2, start, range, stop;
 339     struct re_registers *regs;
 340 {
 341   return re_search_2_stub (bufp, string1, length1, string2, length2,
 342                            start, range, regs, stop, 0);
 343 }
 344 #ifdef _LIBC
 345 weak_alias (__re_search_2, re_search_2)
 346 #endif
 347
 348 static int
 349 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 350                   stop, ret_len)
 351     struct re_pattern_buffer *bufp;
 352     const char *string1, *string2;
 353     int length1, length2, start, range, stop, ret_len;
 354     struct re_registers *regs;
 355 {
 356   const char *str;
 357   int rval;
 358   int len = length1 + length2;
 359   int free_str = 0;
 360
 361   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 362     return -2;
 363
 364   /* Concatenate the strings.  */
 365   if (length2 > 0)
 366     if (length1 > 0)
 367       {
 368         char *s = re_malloc (char, len);
 369
 370         if (BE (s == NULL, 0))
 371           return -2;
 372         memcpy (s, string1, length1);
 373         memcpy (s + length1, string2, length2);
 374         str = s;
 375         free_str = 1;
 376       }
 377     else
 378       str = string2;
 379   else
 380     str = string1;
 381
 382   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 383                          ret_len);
 384   if (free_str)
 385     re_free ((char *) str);
 386   return rval;
 387 }
 388
 389 /* The parameters have the same meaning as those of re_search.
 390    Additional parameters:
 391    If RET_LEN is nonzero the length of the match is returned (re_match style);
 392    otherwise the position of the match is returned.  */
 393
 394 static int
 395 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 396     struct re_pattern_buffer *bufp;
 397     const char *string;
 398     int length, start, range, stop, ret_len;
 399     struct re_registers *regs;
 400 {
 401   reg_errcode_t result;
 402   regmatch_t *pmatch;
 403   int nregs, rval;
 404   int eflags = 0;
 405
 406   /* Check for out-of-range.  */
 407   if (BE (start < 0 || start > length, 0))
 408     return -1;
 409   if (BE (start + range > length, 0))
 410     range = length - start;
 411   else if (BE (start + range < 0, 0))
 412     range = -start;
 413
 414   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 415   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 416
 417   /* Compile fastmap if we haven't yet.  */
 418   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 419     re_compile_fastmap (bufp);
 420
 421   if (BE (bufp->no_sub, 0))
 422     regs = NULL;
 423
 424   /* We need at least 1 register.  */
 425   if (regs == NULL)
 426     nregs = 1;
 427   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 428                regs->num_regs < bufp->re_nsub + 1, 0))
 429     {
 430       nregs = regs->num_regs;
 431       if (BE (nregs < 1, 0))
 432         {
 433           /* Nothing can be copied to regs.  */
 434           regs = NULL;
 435           nregs = 1;
 436         }
 437     }
 438   else
 439     nregs = bufp->re_nsub + 1;
 440   pmatch = re_malloc (regmatch_t, nregs);
 441   if (BE (pmatch == NULL, 0))
 442     return -2;
 443
 444   result = re_search_internal (bufp, string, length, start, range, stop,
 445                                nregs, pmatch, eflags);
 446
 447   rval = 0;
 448
 449   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 450   if (result != REG_NOERROR)
 451     rval = -1;
 452   else if (regs != NULL)
 453     {
 454       /* If caller wants register contents data back, copy them.  */
 455       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 456                                            bufp->regs_allocated);
 457       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 458         rval = -2;
 459     }
 460
 461   if (BE (rval == 0, 1))
 462     {
 463       if (ret_len)
 464         {
 465           assert (pmatch[0].rm_so == start);
 466           rval = pmatch[0].rm_eo - start;
 467         }
 468       else
 469         rval = pmatch[0].rm_so;
 470     }
 471   re_free (pmatch);
 472   return rval;
 473 }
 474
 475 static unsigned
 476 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 477     struct re_registers *regs;
 478     regmatch_t *pmatch;
 479     int nregs, regs_allocated;
 480 {
 481   int rval = REGS_REALLOCATE;
 482   int i;
 483   int need_regs = nregs + 1;
 484   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 485      uses.  */
 486
 487   /* Have the register data arrays been allocated?  */
 488   if (regs_allocated == REGS_UNALLOCATED)
 489     { /* No.  So allocate them with malloc.  */
 490       regs->start = re_malloc (regoff_t, need_regs);
 491       regs->end = re_malloc (regoff_t, need_regs);
 492       if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
 493         return REGS_UNALLOCATED;
 494       regs->num_regs = need_regs;
 495     }
 496   else if (regs_allocated == REGS_REALLOCATE)
 497     { /* Yes.  If we need more elements than were already
 498          allocated, reallocate them.  If we need fewer, just
 499          leave it alone.  */
 500       if (BE (need_regs > regs->num_regs, 0))
 501         {
 502           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 503           regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
 504           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 505             return REGS_UNALLOCATED;
 506           regs->start = new_start;
 507           regs->end = new_end;
 508           regs->num_regs = need_regs;
 509         }
 510     }
 511   else
 512     {
 513       assert (regs_allocated == REGS_FIXED);
 514       /* This function may not be called with REGS_FIXED and nregs too big.  */
 515       assert (regs->num_regs >= nregs);
 516       rval = REGS_FIXED;
 517     }
 518
 519   /* Copy the regs.  */
 520   for (i = 0; i < nregs; ++i)
 521     {
 522       regs->start[i] = pmatch[i].rm_so;
 523       regs->end[i] = pmatch[i].rm_eo;
 524     }
 525   for ( ; i < regs->num_regs; ++i)
 526     regs->start[i] = regs->end[i] = -1;
 527
 528   return rval;
 529 }
 530
 531 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 532    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 533    this memory for recording register information.  STARTS and ENDS
 534    must be allocated using the malloc library routine, and must each
 535    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 536
 537    If NUM_REGS == 0, then subsequent matches should allocate their own
 538    register data.
 539
 540    Unless this function is called, the first search or match using
 541    PATTERN_BUFFER will allocate its own register data, without
 542    freeing the old data.  */
 543
 544 void
 545 re_set_registers (bufp, regs, num_regs, starts, ends)
 546     struct re_pattern_buffer *bufp;
 547     struct re_registers *regs;
 548     unsigned num_regs;
 549     regoff_t *starts, *ends;
 550 {
 551   if (num_regs)
 552     {
 553       bufp->regs_allocated = REGS_REALLOCATE;
 554       regs->num_regs = num_regs;
 555       regs->start = starts;
 556       regs->end = ends;
 557     }
 558   else
 559     {
 560       bufp->regs_allocated = REGS_UNALLOCATED;
 561       regs->num_regs = 0;
 562       regs->start = regs->end = (regoff_t *) 0;
 563     }
 564 }
 565 #ifdef _LIBC
 566 weak_alias (__re_set_registers, re_set_registers)
 567 #endif
 568 \f
 569 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 570    them unless specifically requested.  */
 571
 572 #if defined _REGEX_RE_COMP || defined _LIBC
 573 int
 574 # ifdef _LIBC
 575 weak_function
 576 # endif
 577 re_exec (s)
 578      const char *s;
 579 {
 580   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 581 }
 582 #endif /* _REGEX_RE_COMP */
 583 \f
 584 /* Internal entry point.  */
 585
 586 /* Searches for a compiled pattern PREG in the string STRING, whose
 587    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 588    mingings with regexec.  START, and RANGE have the same meanings
 589    with re_search.
 590    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 591    otherwise return the error code.
 592    Note: We assume front end functions already check ranges.
 593    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 594
 595 static reg_errcode_t
 596 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 597                     eflags)
 598     const regex_t *preg;
 599     const char *string;
 600     int length, start, range, stop, eflags;
 601     size_t nmatch;
 602     regmatch_t pmatch[];
 603 {
 604   reg_errcode_t err;
 605   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
 606   int left_lim, right_lim, incr;
 607   int fl_longest_match, match_first, match_kind, match_last = -1;
 608   int sb, ch;
 609 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 610   re_match_context_t mctx = { .dfa = dfa };
 611 #else
 612   re_match_context_t mctx;
 613 #endif
 614   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 615                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 616   unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate;
 617
 618 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 619   memset (&mctx, '\0', sizeof (re_match_context_t));
 620   mctx.dfa = dfa;
 621 #endif
 622
 623   /* Check if the DFA haven't been compiled.  */
 624   if (BE (preg->used == 0 || dfa->init_state == NULL
 625           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 626           || dfa->init_state_begbuf == NULL, 0))
 627     return REG_NOMATCH;
 628
 629 #ifdef DEBUG
 630   /* We assume front-end functions already check them.  */
 631   assert (start + range >= 0 && start + range <= length);
 632 #endif
 633
 634   /* If initial states with non-begbuf contexts have no elements,
 635      the regex must be anchored.  If preg->newline_anchor is set,
 636      we'll never use init_state_nl, so do not check it.  */
 637   if (dfa->init_state->nodes.nelem == 0
 638       && dfa->init_state_word->nodes.nelem == 0
 639       && (dfa->init_state_nl->nodes.nelem == 0
 640           || !preg->newline_anchor))
 641     {
 642       if (start != 0 && start + range != 0)
 643         return REG_NOMATCH;
 644       start = range = 0;
 645     }
 646
 647   /* We must check the longest matching, if nmatch > 0.  */
 648   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 649
 650   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 651                             preg->translate, preg->syntax & RE_ICASE, dfa);
 652   if (BE (err != REG_NOERROR, 0))
 653     goto free_return;
 654   mctx.input.stop = stop;
 655   mctx.input.raw_stop = stop;
 656   mctx.input.newline_anchor = preg->newline_anchor;
 657
 658   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 659   if (BE (err != REG_NOERROR, 0))
 660     goto free_return;
 661
 662   /* We will log all the DFA states through which the dfa pass,
 663      if nmatch > 1, or this dfa has "multibyte node", which is a
 664      back-reference or a node which can accept multibyte character or
 665      multi character collating element.  */
 666   if (nmatch > 1 || dfa->has_mb_node)
 667     {
 668       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 669       if (BE (mctx.state_log == NULL, 0))
 670         {
 671           err = REG_ESPACE;
 672           goto free_return;
 673         }
 674     }
 675   else
 676     mctx.state_log = NULL;
 677
 678   match_first = start;
 679   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 680                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 681
 682   /* Check incrementally whether of not the input string match.  */
 683   incr = (range < 0) ? -1 : 1;
 684   left_lim = (range < 0) ? start + range : start;
 685   right_lim = (range < 0) ? start : start + range;
 686   sb = dfa->mb_cur_max == 1;
 687   match_kind =
 688     (fastmap
 689      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 690         | (range >= 0 ? 2 : 0)
 691         | (t != NULL ? 1 : 0))
 692      : 8);
 693
 694   for (;; match_first += incr)
 695     {
 696       err = REG_NOMATCH;
 697       if (match_first < left_lim || right_lim < match_first)
 698         goto free_return;
 699
 700       /* Advance as rapidly as possible through the string, until we
 701          find a plausible place to start matching.  This may be done
 702          with varying efficiency, so there are various possibilities:
 703          only the most common of them are specialized, in order to
 704          save on code size.  We use a switch statement for speed.  */
 705       switch (match_kind)
 706         {
 707         case 8:
 708           /* No fastmap.  */
 709           break;
 710
 711         case 7:
 712           /* Fastmap with single-byte translation, match forward.  */
 713           while (BE (match_first < right_lim, 1)
 714                  && !fastmap[t[(unsigned char) string[match_first]]])
 715             ++match_first;
 716           goto forward_match_found_start_or_reached_end;
 717
 718         case 6:
 719           /* Fastmap without translation, match forward.  */
 720           while (BE (match_first < right_lim, 1)
 721                  && !fastmap[(unsigned char) string[match_first]])
 722             ++match_first;
 723
 724         forward_match_found_start_or_reached_end:
 725           if (BE (match_first == right_lim, 0))
 726             {
 727               ch = match_first >= length
 728                        ? 0 : (unsigned char) string[match_first];
 729               if (!fastmap[t ? t[ch] : ch])
 730                 goto free_return;
 731             }
 732           break;
 733
 734         case 4:
 735         case 5:
 736           /* Fastmap without multi-byte translation, match backwards.  */
 737           while (match_first >= left_lim)
 738             {
 739               ch = match_first >= length
 740                        ? 0 : (unsigned char) string[match_first];
 741               if (fastmap[t ? t[ch] : ch])
 742                 break;
 743               --match_first;
 744             }
 745           if (match_first < left_lim)
 746             goto free_return;
 747           break;
 748
 749         default:
 750           /* In this case, we can't determine easily the current byte,
 751              since it might be a component byte of a multibyte
 752              character.  Then we use the constructed buffer instead.  */
 753           for (;;)
 754             {
 755               /* If MATCH_FIRST is out of the valid range, reconstruct the
 756                  buffers.  */
 757               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 758               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 759                 {
 760                   err = re_string_reconstruct (&mctx.input, match_first,
 761                                                eflags);
 762                   if (BE (err != REG_NOERROR, 0))
 763                     goto free_return;
 764
 765                   offset = match_first - mctx.input.raw_mbs_idx;
 766                 }
 767               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 768                  Note that MATCH_FIRST must not be smaller than 0.  */
 769               ch = (match_first >= length
 770                     ? 0 : re_string_byte_at (&mctx.input, offset));
 771               if (fastmap[ch])
 772                 break;
 773               match_first += incr;
 774               if (match_first < left_lim || match_first > right_lim)
 775                 {
 776                   err = REG_NOMATCH;
 777                   goto free_return;
 778                 }
 779             }
 780           break;
 781         }
 782
 783       /* Reconstruct the buffers so that the matcher can assume that
 784          the matching starts from the beginning of the buffer.  */
 785       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 786       if (BE (err != REG_NOERROR, 0))
 787         goto free_return;
 788
 789 #ifdef RE_ENABLE_I18N
 790      /* Don't consider this char as a possible match start if it part,
 791         yet isn't the head, of a multibyte character.  */
 792       if (!sb && !re_string_first_byte (&mctx.input, 0))
 793         continue;
 794 #endif
 795
 796       /* It seems to be appropriate one, then use the matcher.  */
 797       /* We assume that the matching starts from 0.  */
 798       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 799       match_last = check_matching (&mctx, fl_longest_match,
 800                                    range >= 0 ? &match_first : NULL);
 801       if (match_last != -1)
 802         {
 803           if (BE (match_last == -2, 0))
 804             {
 805               err = REG_ESPACE;
 806               goto free_return;
 807             }
 808           else
 809             {
 810               mctx.match_last = match_last;
 811               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 812                 {
 813                   re_dfastate_t *pstate = mctx.state_log[match_last];
 814                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 815                                                              match_last);
 816                 }
 817               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 818                   || dfa->nbackref)
 819                 {
 820                   err = prune_impossible_nodes (&mctx);
 821                   if (err == REG_NOERROR)
 822                     break;
 823                   if (BE (err != REG_NOMATCH, 0))
 824                     goto free_return;
 825                   match_last = -1;
 826                 }
 827               else
 828                 break; /* We found a match.  */
 829             }
 830         }
 831
 832       match_ctx_clean (&mctx);
 833     }
 834
 835 #ifdef DEBUG
 836   assert (match_last != -1);
 837   assert (err == REG_NOERROR);
 838 #endif
 839
 840   /* Set pmatch[] if we need.  */
 841   if (nmatch > 0)
 842     {
 843       int reg_idx;
 844
 845       /* Initialize registers.  */
 846       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 847         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 848
 849       /* Set the points where matching start/end.  */
 850       pmatch[0].rm_so = 0;
 851       pmatch[0].rm_eo = mctx.match_last;
 852
 853       if (!preg->no_sub && nmatch > 1)
 854         {
 855           err = set_regs (preg, &mctx, nmatch, pmatch,
 856                           dfa->has_plural_match && dfa->nbackref > 0);
 857           if (BE (err != REG_NOERROR, 0))
 858             goto free_return;
 859         }
 860
 861       /* At last, add the offset to the each registers, since we slided
 862          the buffers so that we could assume that the matching starts
 863          from 0.  */
 864       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 865         if (pmatch[reg_idx].rm_so != -1)
 866           {
 867 #ifdef RE_ENABLE_I18N
 868             if (BE (mctx.input.offsets_needed != 0, 0))
 869               {
 870                 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
 871                   pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
 872                 else
 873                   pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
 874                 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
 875                   pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
 876                 else
 877                   pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
 878               }
 879 #else
 880             assert (mctx.input.offsets_needed == 0);
 881 #endif
 882             pmatch[reg_idx].rm_so += match_first;
 883             pmatch[reg_idx].rm_eo += match_first;
 884           }
 885
 886       if (dfa->subexp_map)
 887         for (reg_idx = 0;
 888              reg_idx + 1 < nmatch && reg_idx < preg->re_nsub;
 889              reg_idx++)
 890           if (dfa->subexp_map[reg_idx] != reg_idx)
 891             {
 892               pmatch[reg_idx + 1].rm_so
 893                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 894               pmatch[reg_idx + 1].rm_eo
 895                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 896             }
 897     }
 898
 899  free_return:
 900   re_free (mctx.state_log);
 901   if (dfa->nbackref)
 902     match_ctx_free (&mctx);
 903   re_string_destruct (&mctx.input);
 904   return err;
 905 }
 906
 907 static reg_errcode_t
 908 prune_impossible_nodes (mctx)
 909      re_match_context_t *mctx;
 910 {
 911   re_dfa_t *const dfa = mctx->dfa;
 912   int halt_node, match_last;
 913   reg_errcode_t ret;
 914   re_dfastate_t **sifted_states;
 915   re_dfastate_t **lim_states = NULL;
 916   re_sift_context_t sctx;
 917 #ifdef DEBUG
 918   assert (mctx->state_log != NULL);
 919 #endif
 920   match_last = mctx->match_last;
 921   halt_node = mctx->last_node;
 922   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 923   if (BE (sifted_states == NULL, 0))
 924     {
 925       ret = REG_ESPACE;
 926       goto free_return;
 927     }
 928   if (dfa->nbackref)
 929     {
 930       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 931       if (BE (lim_states == NULL, 0))
 932         {
 933           ret = REG_ESPACE;
 934           goto free_return;
 935         }
 936       while (1)
 937         {
 938           memset (lim_states, '\0',
 939                   sizeof (re_dfastate_t *) * (match_last + 1));
 940           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 941                          match_last);
 942           ret = sift_states_backward (mctx, &sctx);
 943           re_node_set_free (&sctx.limits);
 944           if (BE (ret != REG_NOERROR, 0))
 945               goto free_return;
 946           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 947             break;
 948           do
 949             {
 950               --match_last;
 951               if (match_last < 0)
 952                 {
 953                   ret = REG_NOMATCH;
 954                   goto free_return;
 955                 }
 956             } while (mctx->state_log[match_last] == NULL
 957                      || !mctx->state_log[match_last]->halt);
 958           halt_node = check_halt_state_context (mctx,
 959                                                 mctx->state_log[match_last],
 960                                                 match_last);
 961         }
 962       ret = merge_state_array (dfa, sifted_states, lim_states,
 963                                match_last + 1);
 964       re_free (lim_states);
 965       lim_states = NULL;
 966       if (BE (ret != REG_NOERROR, 0))
 967         goto free_return;
 968     }
 969   else
 970     {
 971       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
 972       ret = sift_states_backward (mctx, &sctx);
 973       re_node_set_free (&sctx.limits);
 974       if (BE (ret != REG_NOERROR, 0))
 975         goto free_return;
 976     }
 977   re_free (mctx->state_log);
 978   mctx->state_log = sifted_states;
 979   sifted_states = NULL;
 980   mctx->last_node = halt_node;
 981   mctx->match_last = match_last;
 982   ret = REG_NOERROR;
 983  free_return:
 984   re_free (sifted_states);
 985   re_free (lim_states);
 986   return ret;
 987 }
 988
 989 /* Acquire an initial state and return it.
 990    We must select appropriate initial state depending on the context,
 991    since initial states may have constraints like "\<", "^", etc..  */
 992
 993 static inline re_dfastate_t *
 994 acquire_init_state_context (err, mctx, idx)
 995      reg_errcode_t *err;
 996      const re_match_context_t *mctx;
 997      int idx;
 998 {
 999   re_dfa_t *const dfa = mctx->dfa;
1000   if (dfa->init_state->has_constraint)
1001     {
1002       unsigned int context;
1003       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1004       if (IS_WORD_CONTEXT (context))
1005         return dfa->init_state_word;
1006       else if (IS_ORDINARY_CONTEXT (context))
1007         return dfa->init_state;
1008       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1009         return dfa->init_state_begbuf;
1010       else if (IS_NEWLINE_CONTEXT (context))
1011         return dfa->init_state_nl;
1012       else if (IS_BEGBUF_CONTEXT (context))
1013         {
1014           /* It is relatively rare case, then calculate on demand.  */
1015           return re_acquire_state_context (err, dfa,
1016                                            dfa->init_state->entrance_nodes,
1017                                            context);
1018         }
1019       else
1020         /* Must not happen?  */
1021         return dfa->init_state;
1022     }
1023   else
1024     return dfa->init_state;
1025 }
1026
1027 /* Check whether the regular expression match input string INPUT or not,
1028    and return the index where the matching end, return -1 if not match,
1029    or return -2 in case of an error.
1030    FL_LONGEST_MATCH means we want the POSIX longest matching.
1031    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1032    next place where we may want to try matching.
1033    Note that the matcher assume that the maching starts from the current
1034    index of the buffer.  */
1035
1036 static int
1037 check_matching (mctx, fl_longest_match, p_match_first)
1038     re_match_context_t *mctx;
1039     int fl_longest_match;
1040     int *p_match_first;
1041 {
1042   re_dfa_t *const dfa = mctx->dfa;
1043   reg_errcode_t err;
1044   int match = 0;
1045   int match_last = -1;
1046   int cur_str_idx = re_string_cur_idx (&mctx->input);
1047   re_dfastate_t *cur_state;
1048   int at_init_state = p_match_first != NULL;
1049   int next_start_idx = cur_str_idx;
1050
1051   err = REG_NOERROR;
1052   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1053   /* An initial state must not be NULL (invalid).  */
1054   if (BE (cur_state == NULL, 0))
1055     {
1056       assert (err == REG_ESPACE);
1057       return -2;
1058     }
1059
1060   if (mctx->state_log != NULL)
1061     {
1062       mctx->state_log[cur_str_idx] = cur_state;
1063
1064       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1065          later.  E.g. Processing back references.  */
1066       if (BE (dfa->nbackref, 0))
1067         {
1068           at_init_state = 0;
1069           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1070           if (BE (err != REG_NOERROR, 0))
1071             return err;
1072
1073           if (cur_state->has_backref)
1074             {
1075               err = transit_state_bkref (mctx, &cur_state->nodes);
1076               if (BE (err != REG_NOERROR, 0))
1077                 return err;
1078             }
1079         }
1080     }
1081
1082   /* If the RE accepts NULL string.  */
1083   if (BE (cur_state->halt, 0))
1084     {
1085       if (!cur_state->has_constraint
1086           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1087         {
1088           if (!fl_longest_match)
1089             return cur_str_idx;
1090           else
1091             {
1092               match_last = cur_str_idx;
1093               match = 1;
1094             }
1095         }
1096     }
1097
1098   while (!re_string_eoi (&mctx->input))
1099     {
1100       re_dfastate_t *old_state = cur_state;
1101       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1102
1103       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
1104           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1105               && mctx->input.valid_len < mctx->input.len))
1106         {
1107           err = extend_buffers (mctx);
1108           if (BE (err != REG_NOERROR, 0))
1109             {
1110               assert (err == REG_ESPACE);
1111               return -2;
1112             }
1113         }
1114
1115       cur_state = transit_state (&err, mctx, cur_state);
1116       if (mctx->state_log != NULL)
1117         cur_state = merge_state_with_log (&err, mctx, cur_state);
1118
1119       if (cur_state == NULL)
1120         {
1121           /* Reached the invalid state or an error.  Try to recover a valid
1122              state using the state log, if available and if we have not
1123              already found a valid (even if not the longest) match.  */
1124           if (BE (err != REG_NOERROR, 0))
1125             return -2;
1126
1127           if (mctx->state_log == NULL
1128               || (match && !fl_longest_match)
1129               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1130             break;
1131         }
1132
1133       if (BE (at_init_state, 0))
1134         {
1135           if (old_state == cur_state)
1136             next_start_idx = next_char_idx;
1137           else
1138             at_init_state = 0;
1139         }
1140
1141       if (cur_state->halt)
1142         {
1143           /* Reached a halt state.
1144              Check the halt state can satisfy the current context.  */
1145           if (!cur_state->has_constraint
1146               || check_halt_state_context (mctx, cur_state,
1147                                            re_string_cur_idx (&mctx->input)))
1148             {
1149               /* We found an appropriate halt state.  */
1150               match_last = re_string_cur_idx (&mctx->input);
1151               match = 1;
1152
1153               /* We found a match, do not modify match_first below.  */
1154               p_match_first = NULL;
1155               if (!fl_longest_match)
1156                 break;
1157             }
1158         }
1159     }
1160
1161   if (p_match_first)
1162     *p_match_first += next_start_idx;
1163
1164   return match_last;
1165 }
1166
1167 /* Check NODE match the current context.  */
1168
1169 static int check_halt_node_context (dfa, node, context)
1170     const re_dfa_t *dfa;
1171     int node;
1172     unsigned int context;
1173 {
1174   re_token_type_t type = dfa->nodes[node].type;
1175   unsigned int constraint = dfa->nodes[node].constraint;
1176   if (type != END_OF_RE)
1177     return 0;
1178   if (!constraint)
1179     return 1;
1180   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1181     return 0;
1182   return 1;
1183 }
1184
1185 /* Check the halt state STATE match the current context.
1186    Return 0 if not match, if the node, STATE has, is a halt node and
1187    match the context, return the node.  */
1188
1189 static int
1190 check_halt_state_context (mctx, state, idx)
1191     const re_match_context_t *mctx;
1192     const re_dfastate_t *state;
1193     int idx;
1194 {
1195   int i;
1196   unsigned int context;
1197 #ifdef DEBUG
1198   assert (state->halt);
1199 #endif
1200   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1201   for (i = 0; i < state->nodes.nelem; ++i)
1202     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1203       return state->nodes.elems[i];
1204   return 0;
1205 }
1206
1207 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1208    corresponding to the DFA).
1209    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1210    of errors.  */
1211
1212 static int
1213 proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
1214     const re_match_context_t *mctx;
1215     regmatch_t *regs;
1216     int nregs, *pidx, node;
1217     re_node_set *eps_via_nodes;
1218     struct re_fail_stack_t *fs;
1219 {
1220   re_dfa_t *const dfa = mctx->dfa;
1221   int i, err, dest_node;
1222   dest_node = -1;
1223   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1224     {
1225       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1226       re_node_set *edests = &dfa->edests[node];
1227       int dest_node;
1228       err = re_node_set_insert (eps_via_nodes, node);
1229       if (BE (err < 0, 0))
1230         return -2;
1231       /* Pick up a valid destination, or return -1 if none is found.  */
1232       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1233         {
1234           int candidate = edests->elems[i];
1235           if (!re_node_set_contains (cur_nodes, candidate))
1236             continue;
1237           if (dest_node == -1)
1238             dest_node = candidate;
1239
1240           else
1241             {
1242               /* In order to avoid infinite loop like "(a*)*", return the second
1243                  epsilon-transition if the first was already considered.  */
1244               if (re_node_set_contains (eps_via_nodes, dest_node))
1245                 return candidate;
1246
1247               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1248               else if (fs != NULL
1249                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1250                                            eps_via_nodes))
1251                 return -2;
1252
1253               /* We know we are going to exit.  */
1254               break;
1255             }
1256         }
1257       return dest_node;
1258     }
1259   else
1260     {
1261       int naccepted = 0;
1262       re_token_type_t type = dfa->nodes[node].type;
1263
1264 #ifdef RE_ENABLE_I18N
1265       if (ACCEPT_MB_NODE (type))
1266         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1267       else
1268 #endif /* RE_ENABLE_I18N */
1269       if (type == OP_BACK_REF)
1270         {
1271           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1272           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1273           if (fs != NULL)
1274             {
1275               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1276                 return -1;
1277               else if (naccepted)
1278                 {
1279                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1280                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1281                               naccepted) != 0)
1282                     return -1;
1283                 }
1284             }
1285
1286           if (naccepted == 0)
1287             {
1288               err = re_node_set_insert (eps_via_nodes, node);
1289               if (BE (err < 0, 0))
1290                 return -2;
1291               dest_node = dfa->edests[node].elems[0];
1292               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1293                                         dest_node))
1294                 return dest_node;
1295             }
1296         }
1297
1298       if (naccepted != 0
1299           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1300         {
1301           dest_node = dfa->nexts[node];
1302           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1303           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1304                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1305                                                dest_node)))
1306             return -1;
1307           re_node_set_empty (eps_via_nodes);
1308           return dest_node;
1309         }
1310     }
1311   return -1;
1312 }
1313
1314 static reg_errcode_t
1315 push_fail_stack (fs, str_idx, dest_node, nregs, regs, eps_via_nodes)
1316      struct re_fail_stack_t *fs;
1317      int str_idx, dest_node, nregs;
1318      regmatch_t *regs;
1319      re_node_set *eps_via_nodes;
1320 {
1321   reg_errcode_t err;
1322   int num = fs->num++;
1323   if (fs->num == fs->alloc)
1324     {
1325       struct re_fail_stack_ent_t *new_array;
1326       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1327                                        * fs->alloc * 2));
1328       if (new_array == NULL)
1329         return REG_ESPACE;
1330       fs->alloc *= 2;
1331       fs->stack = new_array;
1332     }
1333   fs->stack[num].idx = str_idx;
1334   fs->stack[num].node = dest_node;
1335   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1336   if (fs->stack[num].regs == NULL)
1337     return REG_ESPACE;
1338   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1339   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1340   return err;
1341 }
1342
1343 static int
1344 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1345      struct re_fail_stack_t *fs;
1346      int *pidx, nregs;
1347      regmatch_t *regs;
1348      re_node_set *eps_via_nodes;
1349 {
1350   int num = --fs->num;
1351   assert (num >= 0);
1352   *pidx = fs->stack[num].idx;
1353   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1354   re_node_set_free (eps_via_nodes);
1355   re_free (fs->stack[num].regs);
1356   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1357   return fs->stack[num].node;
1358 }
1359
1360 /* Set the positions where the subexpressions are starts/ends to registers
1361    PMATCH.
1362    Note: We assume that pmatch[0] is already set, and
1363    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1364
1365 static reg_errcode_t
1366 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1367      const regex_t *preg;
1368      const re_match_context_t *mctx;
1369      size_t nmatch;
1370      regmatch_t *pmatch;
1371      int fl_backtrack;
1372 {
1373   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1374   int idx, cur_node, real_nmatch;
1375   re_node_set eps_via_nodes;
1376   struct re_fail_stack_t *fs;
1377   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1378   regmatch_t *prev_idx_match;
1379
1380 #ifdef DEBUG
1381   assert (nmatch > 1);
1382   assert (mctx->state_log != NULL);
1383 #endif
1384   if (fl_backtrack)
1385     {
1386       fs = &fs_body;
1387       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1388       if (fs->stack == NULL)
1389         return REG_ESPACE;
1390     }
1391   else
1392     fs = NULL;
1393
1394   cur_node = dfa->init_node;
1395   real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1396   re_node_set_init_empty (&eps_via_nodes);
1397
1398   prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * real_nmatch);
1399   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * real_nmatch);
1400
1401   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1402     {
1403       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, real_nmatch);
1404
1405       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1406         {
1407           int reg_idx;
1408           if (fs)
1409             {
1410               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1411                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1412                   break;
1413               if (reg_idx == nmatch)
1414                 {
1415                   re_node_set_free (&eps_via_nodes);
1416                   return free_fail_stack_return (fs);
1417                 }
1418               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1419                                          &eps_via_nodes);
1420             }
1421           else
1422             {
1423               re_node_set_free (&eps_via_nodes);
1424               return REG_NOERROR;
1425             }
1426         }
1427
1428       /* Proceed to next node.  */
1429       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1430                                     &eps_via_nodes, fs);
1431
1432       if (BE (cur_node < 0, 0))
1433         {
1434           if (BE (cur_node == -2, 0))
1435             {
1436               re_node_set_free (&eps_via_nodes);
1437               free_fail_stack_return (fs);
1438               return REG_ESPACE;
1439             }
1440           if (fs)
1441             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1442                                        &eps_via_nodes);
1443           else
1444             {
1445               re_node_set_free (&eps_via_nodes);
1446               return REG_NOMATCH;
1447             }
1448         }
1449     }
1450   re_node_set_free (&eps_via_nodes);
1451   return free_fail_stack_return (fs);
1452 }
1453
1454 static reg_errcode_t
1455 free_fail_stack_return (fs)
1456      struct re_fail_stack_t *fs;
1457 {
1458   if (fs)
1459     {
1460       int fs_idx;
1461       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1462         {
1463           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1464           re_free (fs->stack[fs_idx].regs);
1465         }
1466       re_free (fs->stack);
1467     }
1468   return REG_NOERROR;
1469 }
1470
1471 static void
1472 update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
1473      re_dfa_t *dfa;
1474      regmatch_t *pmatch, *prev_idx_match;
1475      int cur_node, cur_idx, nmatch;
1476 {
1477   int type = dfa->nodes[cur_node].type;
1478   if (type == OP_OPEN_SUBEXP)
1479     {
1480       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1481
1482       /* We are at the first node of this sub expression.  */
1483       if (reg_num < nmatch)
1484         {
1485           pmatch[reg_num].rm_so = cur_idx;
1486           pmatch[reg_num].rm_eo = -1;
1487         }
1488     }
1489   else if (type == OP_CLOSE_SUBEXP)
1490     {
1491       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1492       if (reg_num < nmatch)
1493         {
1494           /* We are at the last node of this sub expression.  */
1495           if (pmatch[reg_num].rm_so < cur_idx)
1496             {
1497               pmatch[reg_num].rm_eo = cur_idx;
1498               /* This is a non-empty match or we are not inside an optional
1499                  subexpression.  Accept this right away.  */
1500               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1501             }
1502           else
1503             {
1504               if (dfa->nodes[cur_node].opt_subexp
1505                   && prev_idx_match[reg_num].rm_so != -1)
1506                 /* We transited through an empty match for an optional
1507                    subexpression, like (a?)*, and this is not the subexp's
1508                    first match.  Copy back the old content of the registers
1509                    so that matches of an inner subexpression are undone as
1510                    well, like in ((a?))*.  */
1511                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1512               else
1513                 /* We completed a subexpression, but it may be part of
1514                    an optional one, so do not update PREV_IDX_MATCH.  */
1515                 pmatch[reg_num].rm_eo = cur_idx;
1516             }
1517         }
1518     }
1519 }
1520
1521 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1522    and sift the nodes in each states according to the following rules.
1523    Updated state_log will be wrote to STATE_LOG.
1524
1525    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1526      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1527         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1528         the LAST_NODE, we throw away the node `a'.
1529      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1530         string `s' and transit to `b':
1531         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1532            away the node `a'.
1533         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1534             thrown away, we throw away the node `a'.
1535      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1536         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1537            node `a'.
1538         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1539             we throw away the node `a'.  */
1540
1541 #define STATE_NODE_CONTAINS(state,node) \
1542   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1543
1544 static reg_errcode_t
1545 sift_states_backward (mctx, sctx)
1546      re_match_context_t *mctx;
1547      re_sift_context_t *sctx;
1548 {
1549   reg_errcode_t err;
1550   int null_cnt = 0;
1551   int str_idx = sctx->last_str_idx;
1552   re_node_set cur_dest;
1553
1554 #ifdef DEBUG
1555   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1556 #endif
1557
1558   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1559      transit to the last_node and the last_node itself.  */
1560   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1561   if (BE (err != REG_NOERROR, 0))
1562     return err;
1563   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1564   if (BE (err != REG_NOERROR, 0))
1565     goto free_return;
1566
1567   /* Then check each states in the state_log.  */
1568   while (str_idx > 0)
1569     {
1570       /* Update counters.  */
1571       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1572       if (null_cnt > mctx->max_mb_elem_len)
1573         {
1574           memset (sctx->sifted_states, '\0',
1575                   sizeof (re_dfastate_t *) * str_idx);
1576           re_node_set_free (&cur_dest);
1577           return REG_NOERROR;
1578         }
1579       re_node_set_empty (&cur_dest);
1580       --str_idx;
1581
1582       if (mctx->state_log[str_idx])
1583         {
1584           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1585           if (BE (err != REG_NOERROR, 0))
1586             goto free_return;
1587         }
1588
1589       /* Add all the nodes which satisfy the following conditions:
1590          - It can epsilon transit to a node in CUR_DEST.
1591          - It is in CUR_SRC.
1592          And update state_log.  */
1593       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1594       if (BE (err != REG_NOERROR, 0))
1595         goto free_return;
1596     }
1597   err = REG_NOERROR;
1598  free_return:
1599   re_node_set_free (&cur_dest);
1600   return err;
1601 }
1602
1603 static reg_errcode_t
1604 build_sifted_states (mctx, sctx, str_idx, cur_dest)
1605      re_match_context_t *mctx;
1606      re_sift_context_t *sctx;
1607      int str_idx;
1608      re_node_set *cur_dest;
1609 {
1610   re_dfa_t *const dfa = mctx->dfa;
1611   re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1612   int i;
1613
1614   /* Then build the next sifted state.
1615      We build the next sifted state on `cur_dest', and update
1616      `sifted_states[str_idx]' with `cur_dest'.
1617      Note:
1618      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1619      `cur_src' points the node_set of the old `state_log[str_idx]'
1620      (with the epsilon nodes pre-filtered out).  */
1621   for (i = 0; i < cur_src->nelem; i++)
1622     {
1623       int prev_node = cur_src->elems[i];
1624       int naccepted = 0;
1625       int ret;
1626
1627 #if defined DEBUG || defined RE_ENABLE_I18N
1628       re_token_type_t type = dfa->nodes[prev_node].type;
1629 #endif
1630 #ifdef DEBUG
1631       assert (!IS_EPSILON_NODE (type));
1632 #endif
1633 #ifdef RE_ENABLE_I18N
1634       /* If the node may accept `multi byte'.  */
1635       if (ACCEPT_MB_NODE (type))
1636         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1637                                          str_idx, sctx->last_str_idx);
1638 #endif /* RE_ENABLE_I18N */
1639
1640       /* We don't check backreferences here.
1641          See update_cur_sifted_state().  */
1642       if (!naccepted
1643           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1644           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1645                                   dfa->nexts[prev_node]))
1646         naccepted = 1;
1647
1648       if (naccepted == 0)
1649         continue;
1650
1651       if (sctx->limits.nelem)
1652         {
1653           int to_idx = str_idx + naccepted;
1654           if (check_dst_limits (mctx, &sctx->limits,
1655                                 dfa->nexts[prev_node], to_idx,
1656                                 prev_node, str_idx))
1657             continue;
1658         }
1659       ret = re_node_set_insert (cur_dest, prev_node);
1660       if (BE (ret == -1, 0))
1661         return REG_ESPACE;
1662     }
1663
1664   return REG_NOERROR;
1665 }
1666
1667 /* Helper functions.  */
1668
1669 static reg_errcode_t
1670 clean_state_log_if_needed (mctx, next_state_log_idx)
1671     re_match_context_t *mctx;
1672     int next_state_log_idx;
1673 {
1674   int top = mctx->state_log_top;
1675
1676   if (next_state_log_idx >= mctx->input.bufs_len
1677       || (next_state_log_idx >= mctx->input.valid_len
1678           && mctx->input.valid_len < mctx->input.len))
1679     {
1680       reg_errcode_t err;
1681       err = extend_buffers (mctx);
1682       if (BE (err != REG_NOERROR, 0))
1683         return err;
1684     }
1685
1686   if (top < next_state_log_idx)
1687     {
1688       memset (mctx->state_log + top + 1, '\0',
1689               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1690       mctx->state_log_top = next_state_log_idx;
1691     }
1692   return REG_NOERROR;
1693 }
1694
1695 static reg_errcode_t
1696 merge_state_array (dfa, dst, src, num)
1697      re_dfa_t *dfa;
1698      re_dfastate_t **dst;
1699      re_dfastate_t **src;
1700      int num;
1701 {
1702   int st_idx;
1703   reg_errcode_t err;
1704   for (st_idx = 0; st_idx < num; ++st_idx)
1705     {
1706       if (dst[st_idx] == NULL)
1707         dst[st_idx] = src[st_idx];
1708       else if (src[st_idx] != NULL)
1709         {
1710           re_node_set merged_set;
1711           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1712                                         &src[st_idx]->nodes);
1713           if (BE (err != REG_NOERROR, 0))
1714             return err;
1715           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1716           re_node_set_free (&merged_set);
1717           if (BE (err != REG_NOERROR, 0))
1718             return err;
1719         }
1720     }
1721   return REG_NOERROR;
1722 }
1723
1724 static reg_errcode_t
1725 update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
1726      re_match_context_t *mctx;
1727      re_sift_context_t *sctx;
1728      int str_idx;
1729      re_node_set *dest_nodes;
1730 {
1731   re_dfa_t *const dfa = mctx->dfa;
1732   reg_errcode_t err;
1733   const re_node_set *candidates;
1734   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1735                 : &mctx->state_log[str_idx]->nodes);
1736
1737   if (dest_nodes->nelem == 0)
1738     sctx->sifted_states[str_idx] = NULL;
1739   else
1740     {
1741       if (candidates)
1742         {
1743           /* At first, add the nodes which can epsilon transit to a node in
1744              DEST_NODE.  */
1745           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1746           if (BE (err != REG_NOERROR, 0))
1747             return err;
1748
1749           /* Then, check the limitations in the current sift_context.  */
1750           if (sctx->limits.nelem)
1751             {
1752               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1753                                          mctx->bkref_ents, str_idx);
1754               if (BE (err != REG_NOERROR, 0))
1755                 return err;
1756             }
1757         }
1758
1759       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1760       if (BE (err != REG_NOERROR, 0))
1761         return err;
1762     }
1763
1764   if (candidates && mctx->state_log[str_idx]->has_backref)
1765     {
1766       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1767       if (BE (err != REG_NOERROR, 0))
1768         return err;
1769     }
1770   return REG_NOERROR;
1771 }
1772
1773 static reg_errcode_t
1774 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1775      re_dfa_t *dfa;
1776      re_node_set *dest_nodes;
1777      const re_node_set *candidates;
1778 {
1779   reg_errcode_t err = REG_NOERROR;
1780   int i;
1781
1782   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1783   if (BE (err != REG_NOERROR, 0))
1784     return err;
1785
1786   if (!state->inveclosure.alloc)
1787     {
1788       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1789       if (BE (err != REG_NOERROR, 0))
1790         return REG_ESPACE;
1791       for (i = 0; i < dest_nodes->nelem; i++)
1792         re_node_set_merge (&state->inveclosure,
1793                            dfa->inveclosures + dest_nodes->elems[i]);
1794     }
1795   return re_node_set_add_intersect (dest_nodes, candidates,
1796                                     &state->inveclosure);
1797 }
1798
1799 static reg_errcode_t
1800 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1801      re_dfa_t *dfa;
1802      int node;
1803      re_node_set *dest_nodes;
1804      const re_node_set *candidates;
1805 {
1806     int ecl_idx;
1807     reg_errcode_t err;
1808     re_node_set *inv_eclosure = dfa->inveclosures + node;
1809     re_node_set except_nodes;
1810     re_node_set_init_empty (&except_nodes);
1811     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1812       {
1813         int cur_node = inv_eclosure->elems[ecl_idx];
1814         if (cur_node == node)
1815           continue;
1816         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1817           {
1818             int edst1 = dfa->edests[cur_node].elems[0];
1819             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1820                          ? dfa->edests[cur_node].elems[1] : -1);
1821             if ((!re_node_set_contains (inv_eclosure, edst1)
1822                  && re_node_set_contains (dest_nodes, edst1))
1823                 || (edst2 > 0
1824                     && !re_node_set_contains (inv_eclosure, edst2)
1825                     && re_node_set_contains (dest_nodes, edst2)))
1826               {
1827                 err = re_node_set_add_intersect (&except_nodes, candidates,
1828                                                  dfa->inveclosures + cur_node);
1829                 if (BE (err != REG_NOERROR, 0))
1830                   {
1831                     re_node_set_free (&except_nodes);
1832                     return err;
1833                   }
1834               }
1835           }
1836       }
1837     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1838       {
1839         int cur_node = inv_eclosure->elems[ecl_idx];
1840         if (!re_node_set_contains (&except_nodes, cur_node))
1841           {
1842             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1843             re_node_set_remove_at (dest_nodes, idx);
1844           }
1845       }
1846     re_node_set_free (&except_nodes);
1847     return REG_NOERROR;
1848 }
1849
1850 static int
1851 check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
1852      re_match_context_t *mctx;
1853      re_node_set *limits;
1854      int dst_node, dst_idx, src_node, src_idx;
1855 {
1856   re_dfa_t *const dfa = mctx->dfa;
1857   int lim_idx, src_pos, dst_pos;
1858
1859   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1860   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1861   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1862     {
1863       int subexp_idx;
1864       struct re_backref_cache_entry *ent;
1865       ent = mctx->bkref_ents + limits->elems[lim_idx];
1866       subexp_idx = dfa->nodes[ent->node].opr.idx;
1867
1868       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1869                                            subexp_idx, dst_node, dst_idx,
1870                                            dst_bkref_idx);
1871       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1872                                            subexp_idx, src_node, src_idx,
1873                                            src_bkref_idx);
1874
1875       /* In case of:
1876          <src> <dst> ( <subexp> )
1877          ( <subexp> ) <src> <dst>
1878          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1879       if (src_pos == dst_pos)
1880         continue; /* This is unrelated limitation.  */
1881       else
1882         return 1;
1883     }
1884   return 0;
1885 }
1886
1887 static int
1888 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
1889      re_match_context_t *mctx;
1890      int boundaries, subexp_idx, from_node, bkref_idx;
1891 {
1892   re_dfa_t *const dfa = mctx->dfa;
1893   re_node_set *eclosures = dfa->eclosures + from_node;
1894   int node_idx;
1895
1896   /* Else, we are on the boundary: examine the nodes on the epsilon
1897      closure.  */
1898   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1899     {
1900       int node = eclosures->elems[node_idx];
1901       switch (dfa->nodes[node].type)
1902         {
1903         case OP_BACK_REF:
1904           if (bkref_idx != -1)
1905             {
1906               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1907               do
1908                 {
1909                   int dst, cpos;
1910
1911                   if (ent->node != node)
1912                     continue;
1913
1914                   if (subexp_idx <= 8 * sizeof (ent->eps_reachable_subexps_map)
1915                       && !(ent->eps_reachable_subexps_map & (1 << subexp_idx)))
1916                     continue;
1917
1918                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1919                      OP_CLOSE_SUBEXP cases below.  But, if the
1920                      destination node is the same node as the source
1921                      node, don't recurse because it would cause an
1922                      infinite loop: a regex that exhibits this behavior
1923                      is ()\1*\1*  */
1924                   dst = dfa->edests[node].elems[0];
1925                   if (dst == from_node)
1926                     {
1927                       if (boundaries & 1)
1928                         return -1;
1929                       else /* if (boundaries & 2) */
1930                         return 0;
1931                     }
1932
1933                   cpos =
1934                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1935                                                  dst, bkref_idx);
1936                   if (cpos == -1 /* && (boundaries & 1) */)
1937                     return -1;
1938                   if (cpos == 0 && (boundaries & 2))
1939                     return 0;
1940
1941                   ent->eps_reachable_subexps_map &= ~(1 << subexp_idx);
1942                 }
1943               while (ent++->more);
1944             }
1945           break;
1946
1947         case OP_OPEN_SUBEXP:
1948           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1949             return -1;
1950           break;
1951
1952         case OP_CLOSE_SUBEXP:
1953           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1954             return 0;
1955           break;
1956
1957         default:
1958             break;
1959         }
1960     }
1961
1962   return (boundaries & 2) ? 1 : 0;
1963 }
1964
1965 static int
1966 check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx)
1967      re_match_context_t *mctx;
1968      int limit, subexp_idx, from_node, str_idx, bkref_idx;
1969 {
1970   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1971   int boundaries;
1972
1973   /* If we are outside the range of the subexpression, return -1 or 1.  */
1974   if (str_idx < lim->subexp_from)
1975     return -1;
1976
1977   if (lim->subexp_to < str_idx)
1978     return 1;
1979
1980   /* If we are within the subexpression, return 0.  */
1981   boundaries = (str_idx == lim->subexp_from);
1982   boundaries |= (str_idx == lim->subexp_to) << 1;
1983   if (boundaries == 0)
1984     return 0;
1985
1986   /* Else, examine epsilon closure.  */
1987   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1988                                       from_node, bkref_idx);
1989 }
1990
1991 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1992    which are against limitations from DEST_NODES. */
1993
1994 static reg_errcode_t
1995 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
1996      re_dfa_t *dfa;
1997      re_node_set *dest_nodes;
1998      const re_node_set *candidates;
1999      re_node_set *limits;
2000      struct re_backref_cache_entry *bkref_ents;
2001      int str_idx;
2002 {
2003   reg_errcode_t err;
2004   int node_idx, lim_idx;
2005
2006   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2007     {
2008       int subexp_idx;
2009       struct re_backref_cache_entry *ent;
2010       ent = bkref_ents + limits->elems[lim_idx];
2011
2012       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2013         continue; /* This is unrelated limitation.  */
2014
2015       subexp_idx = dfa->nodes[ent->node].opr.idx;
2016       if (ent->subexp_to == str_idx)
2017         {
2018           int ops_node = -1;
2019           int cls_node = -1;
2020           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2021             {
2022               int node = dest_nodes->elems[node_idx];
2023               re_token_type_t type = dfa->nodes[node].type;
2024               if (type == OP_OPEN_SUBEXP
2025                   && subexp_idx == dfa->nodes[node].opr.idx)
2026                 ops_node = node;
2027               else if (type == OP_CLOSE_SUBEXP
2028                        && subexp_idx == dfa->nodes[node].opr.idx)
2029                 cls_node = node;
2030             }
2031
2032           /* Check the limitation of the open subexpression.  */
2033           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2034           if (ops_node >= 0)
2035             {
2036               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2037                                            candidates);
2038               if (BE (err != REG_NOERROR, 0))
2039                 return err;
2040             }
2041
2042           /* Check the limitation of the close subexpression.  */
2043           if (cls_node >= 0)
2044             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2045               {
2046                 int node = dest_nodes->elems[node_idx];
2047                 if (!re_node_set_contains (dfa->inveclosures + node,
2048                                            cls_node)
2049                     && !re_node_set_contains (dfa->eclosures + node,
2050                                               cls_node))
2051                   {
2052                     /* It is against this limitation.
2053                        Remove it form the current sifted state.  */
2054                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2055                                                  candidates);
2056                     if (BE (err != REG_NOERROR, 0))
2057                       return err;
2058                     --node_idx;
2059                   }
2060               }
2061         }
2062       else /* (ent->subexp_to != str_idx)  */
2063         {
2064           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2065             {
2066               int node = dest_nodes->elems[node_idx];
2067               re_token_type_t type = dfa->nodes[node].type;
2068               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2069                 {
2070                   if (subexp_idx != dfa->nodes[node].opr.idx)
2071                     continue;
2072                   /* It is against this limitation.
2073                      Remove it form the current sifted state.  */
2074                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2075                                                candidates);
2076                   if (BE (err != REG_NOERROR, 0))
2077                     return err;
2078                 }
2079             }
2080         }
2081     }
2082   return REG_NOERROR;
2083 }
2084
2085 static reg_errcode_t
2086 sift_states_bkref (mctx, sctx, str_idx, candidates)
2087      re_match_context_t *mctx;
2088      re_sift_context_t *sctx;
2089      int str_idx;
2090      const re_node_set *candidates;
2091 {
2092   re_dfa_t *const dfa = mctx->dfa;
2093   reg_errcode_t err;
2094   int node_idx, node;
2095   re_sift_context_t local_sctx;
2096   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2097
2098   if (first_idx == -1)
2099     return REG_NOERROR;
2100
2101   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2102
2103   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2104     {
2105       int enabled_idx;
2106       re_token_type_t type;
2107       struct re_backref_cache_entry *entry;
2108       node = candidates->elems[node_idx];
2109       type = dfa->nodes[node].type;
2110       /* Avoid infinite loop for the REs like "()\1+".  */
2111       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2112         continue;
2113       if (type != OP_BACK_REF)
2114         continue;
2115
2116       entry = mctx->bkref_ents + first_idx;
2117       enabled_idx = first_idx;
2118       do
2119         {
2120           int subexp_len, to_idx, dst_node;
2121           re_dfastate_t *cur_state;
2122
2123           if (entry->node != node)
2124             continue;
2125           subexp_len = entry->subexp_to - entry->subexp_from;
2126           to_idx = str_idx + subexp_len;
2127           dst_node = (subexp_len ? dfa->nexts[node]
2128                       : dfa->edests[node].elems[0]);
2129
2130           if (to_idx > sctx->last_str_idx
2131               || sctx->sifted_states[to_idx] == NULL
2132               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2133               || check_dst_limits (mctx, &sctx->limits, node,
2134                                    str_idx, dst_node, to_idx))
2135             continue;
2136
2137           if (local_sctx.sifted_states == NULL)
2138             {
2139               local_sctx = *sctx;
2140               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2141               if (BE (err != REG_NOERROR, 0))
2142                 goto free_return;
2143             }
2144           local_sctx.last_node = node;
2145           local_sctx.last_str_idx = str_idx;
2146           err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2147           if (BE (err < 0, 0))
2148             {
2149               err = REG_ESPACE;
2150               goto free_return;
2151             }
2152           cur_state = local_sctx.sifted_states[str_idx];
2153           err = sift_states_backward (mctx, &local_sctx);
2154           if (BE (err != REG_NOERROR, 0))
2155             goto free_return;
2156           if (sctx->limited_states != NULL)
2157             {
2158               err = merge_state_array (dfa, sctx->limited_states,
2159                                        local_sctx.sifted_states,
2160                                        str_idx + 1);
2161               if (BE (err != REG_NOERROR, 0))
2162                 goto free_return;
2163             }
2164           local_sctx.sifted_states[str_idx] = cur_state;
2165           re_node_set_remove (&local_sctx.limits, enabled_idx);
2166
2167           /* mctx->bkref_ents may have changed, reload the pointer.  */
2168           entry = mctx->bkref_ents + enabled_idx;
2169         }
2170       while (enabled_idx++, entry++->more);
2171     }
2172   err = REG_NOERROR;
2173  free_return:
2174   if (local_sctx.sifted_states != NULL)
2175     {
2176       re_node_set_free (&local_sctx.limits);
2177     }
2178
2179   return err;
2180 }
2181
2182
2183 #ifdef RE_ENABLE_I18N
2184 static int
2185 sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
2186     const re_match_context_t *mctx;
2187     re_sift_context_t *sctx;
2188     int node_idx, str_idx, max_str_idx;
2189 {
2190   re_dfa_t *const dfa = mctx->dfa;
2191   int naccepted;
2192   /* Check the node can accept `multi byte'.  */
2193   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2194   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2195       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2196                             dfa->nexts[node_idx]))
2197     /* The node can't accept the `multi byte', or the
2198        destination was already thrown away, then the node
2199        could't accept the current input `multi byte'.   */
2200     naccepted = 0;
2201   /* Otherwise, it is sure that the node could accept
2202      `naccepted' bytes input.  */
2203   return naccepted;
2204 }
2205 #endif /* RE_ENABLE_I18N */
2206
2207 \f
2208 /* Functions for state transition.  */
2209
2210 /* Return the next state to which the current state STATE will transit by
2211    accepting the current input byte, and update STATE_LOG if necessary.
2212    If STATE can accept a multibyte char/collating element/back reference
2213    update the destination of STATE_LOG.  */
2214
2215 static re_dfastate_t *
2216 transit_state (err, mctx, state)
2217      reg_errcode_t *err;
2218      re_match_context_t *mctx;
2219      re_dfastate_t *state;
2220 {
2221   re_dfa_t *const dfa = mctx->dfa;
2222   re_dfastate_t **trtable;
2223   unsigned char ch;
2224
2225 #ifdef RE_ENABLE_I18N
2226   /* If the current state can accept multibyte.  */
2227   if (BE (state->accept_mb, 0))
2228     {
2229       *err = transit_state_mb (mctx, state);
2230       if (BE (*err != REG_NOERROR, 0))
2231         return NULL;
2232     }
2233 #endif /* RE_ENABLE_I18N */
2234
2235   /* Then decide the next state with the single byte.  */
2236   if (1)
2237     {
2238       /* Use transition table  */
2239       ch = re_string_fetch_byte (&mctx->input);
2240       trtable = state->trtable;
2241       if (trtable == NULL)
2242         {
2243           trtable = build_trtable (dfa, state);
2244           if (trtable == NULL)
2245             {
2246               *err = REG_ESPACE;
2247               return NULL;
2248             }
2249         }
2250       if (BE (state->word_trtable, 0))
2251         {
2252           unsigned int context;
2253           context
2254             = re_string_context_at (&mctx->input,
2255                                     re_string_cur_idx (&mctx->input) - 1,
2256                                     mctx->eflags);
2257           if (IS_WORD_CONTEXT (context))
2258             return trtable[ch + SBC_MAX];
2259           else
2260             return trtable[ch];
2261         }
2262       else
2263         return trtable[ch];
2264     }
2265 #if 0
2266   else
2267     /* don't use transition table  */
2268     return transit_state_sb (err, mctx, state);
2269 #endif
2270 }
2271
2272 /* Update the state_log if we need */
2273 re_dfastate_t *
2274 merge_state_with_log (err, mctx, next_state)
2275      reg_errcode_t *err;
2276      re_match_context_t *mctx;
2277      re_dfastate_t *next_state;
2278 {
2279   re_dfa_t *const dfa = mctx->dfa;
2280   int cur_idx = re_string_cur_idx (&mctx->input);
2281
2282   if (cur_idx > mctx->state_log_top)
2283     {
2284       mctx->state_log[cur_idx] = next_state;
2285       mctx->state_log_top = cur_idx;
2286     }
2287   else if (mctx->state_log[cur_idx] == 0)
2288     {
2289       mctx->state_log[cur_idx] = next_state;
2290     }
2291   else
2292     {
2293       re_dfastate_t *pstate;
2294       unsigned int context;
2295       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2296       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2297          the destination of a multibyte char/collating element/
2298          back reference.  Then the next state is the union set of
2299          these destinations and the results of the transition table.  */
2300       pstate = mctx->state_log[cur_idx];
2301       log_nodes = pstate->entrance_nodes;
2302       if (next_state != NULL)
2303         {
2304           table_nodes = next_state->entrance_nodes;
2305           *err = re_node_set_init_union (&next_nodes, table_nodes,
2306                                              log_nodes);
2307           if (BE (*err != REG_NOERROR, 0))
2308             return NULL;
2309         }
2310       else
2311         next_nodes = *log_nodes;
2312       /* Note: We already add the nodes of the initial state,
2313          then we don't need to add them here.  */
2314
2315       context = re_string_context_at (&mctx->input,
2316                                       re_string_cur_idx (&mctx->input) - 1,
2317                                       mctx->eflags);
2318       next_state = mctx->state_log[cur_idx]
2319         = re_acquire_state_context (err, dfa, &next_nodes, context);
2320       /* We don't need to check errors here, since the return value of
2321          this function is next_state and ERR is already set.  */
2322
2323       if (table_nodes != NULL)
2324         re_node_set_free (&next_nodes);
2325     }
2326
2327   if (BE (dfa->nbackref, 0) && next_state != NULL)
2328     {
2329       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2330          later.  We must check them here, since the back references in the
2331          next state might use them.  */
2332       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2333                                         cur_idx);
2334       if (BE (*err != REG_NOERROR, 0))
2335         return NULL;
2336
2337       /* If the next state has back references.  */
2338       if (next_state->has_backref)
2339         {
2340           *err = transit_state_bkref (mctx, &next_state->nodes);
2341           if (BE (*err != REG_NOERROR, 0))
2342             return NULL;
2343           next_state = mctx->state_log[cur_idx];
2344         }
2345     }
2346
2347   return next_state;
2348 }
2349
2350 /* Skip bytes in the input that correspond to part of a
2351    multi-byte match, then look in the log for a state
2352    from which to restart matching.  */
2353 re_dfastate_t *
2354 find_recover_state (err, mctx)
2355      reg_errcode_t *err;
2356      re_match_context_t *mctx;
2357 {
2358   re_dfastate_t *cur_state = NULL;
2359   do
2360     {
2361       int max = mctx->state_log_top;
2362       int cur_str_idx = re_string_cur_idx (&mctx->input);
2363
2364       do
2365         {
2366           if (++cur_str_idx > max)
2367             return NULL;
2368           re_string_skip_bytes (&mctx->input, 1);
2369         }
2370       while (mctx->state_log[cur_str_idx] == NULL);
2371
2372       cur_state = merge_state_with_log (err, mctx, NULL);
2373     }
2374   while (err == REG_NOERROR && cur_state == NULL);
2375   return cur_state;
2376 }
2377
2378 /* Helper functions for transit_state.  */
2379
2380 /* From the node set CUR_NODES, pick up the nodes whose types are
2381    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2382    expression. And register them to use them later for evaluating the
2383    correspoding back references.  */
2384
2385 static reg_errcode_t
2386 check_subexp_matching_top (mctx, cur_nodes, str_idx)
2387      re_match_context_t *mctx;
2388      re_node_set *cur_nodes;
2389      int str_idx;
2390 {
2391   re_dfa_t *const dfa = mctx->dfa;
2392   int node_idx;
2393   reg_errcode_t err;
2394
2395   /* TODO: This isn't efficient.
2396            Because there might be more than one nodes whose types are
2397            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2398            nodes.
2399            E.g. RE: (a){2}  */
2400   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2401     {
2402       int node = cur_nodes->elems[node_idx];
2403       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2404           && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2405           && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2406         {
2407           err = match_ctx_add_subtop (mctx, node, str_idx);
2408           if (BE (err != REG_NOERROR, 0))
2409             return err;
2410         }
2411     }
2412   return REG_NOERROR;
2413 }
2414
2415 #if 0
2416 /* Return the next state to which the current state STATE will transit by
2417    accepting the current input byte.  */
2418
2419 static re_dfastate_t *
2420 transit_state_sb (err, mctx, state)
2421      reg_errcode_t *err;
2422      re_match_context_t *mctx;
2423      re_dfastate_t *state;
2424 {
2425   re_dfa_t *const dfa = mctx->dfa;
2426   re_node_set next_nodes;
2427   re_dfastate_t *next_state;
2428   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2429   unsigned int context;
2430
2431   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2432   if (BE (*err != REG_NOERROR, 0))
2433     return NULL;
2434   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2435     {
2436       int cur_node = state->nodes.elems[node_cnt];
2437       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2438         {
2439           *err = re_node_set_merge (&next_nodes,
2440                                     dfa->eclosures + dfa->nexts[cur_node]);
2441           if (BE (*err != REG_NOERROR, 0))
2442             {
2443               re_node_set_free (&next_nodes);
2444               return NULL;
2445             }
2446         }
2447     }
2448   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2449   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2450   /* We don't need to check errors here, since the return value of
2451      this function is next_state and ERR is already set.  */
2452
2453   re_node_set_free (&next_nodes);
2454   re_string_skip_bytes (&mctx->input, 1);
2455   return next_state;
2456 }
2457 #endif
2458
2459 #ifdef RE_ENABLE_I18N
2460 static reg_errcode_t
2461 transit_state_mb (mctx, pstate)
2462     re_match_context_t *mctx;
2463     re_dfastate_t *pstate;
2464 {
2465   re_dfa_t *const dfa = mctx->dfa;
2466   reg_errcode_t err;
2467   int i;
2468
2469   for (i = 0; i < pstate->nodes.nelem; ++i)
2470     {
2471       re_node_set dest_nodes, *new_nodes;
2472       int cur_node_idx = pstate->nodes.elems[i];
2473       int naccepted = 0, dest_idx;
2474       unsigned int context;
2475       re_dfastate_t *dest_state;
2476
2477       if (dfa->nodes[cur_node_idx].constraint)
2478         {
2479           context = re_string_context_at (&mctx->input,
2480                                           re_string_cur_idx (&mctx->input),
2481                                           mctx->eflags);
2482           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2483                                            context))
2484             continue;
2485         }
2486
2487       /* How many bytes the node can accept?  */
2488       if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
2489         naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2490                                              re_string_cur_idx (&mctx->input));
2491       if (naccepted == 0)
2492         continue;
2493
2494       /* The node can accepts `naccepted' bytes.  */
2495       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2496       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2497                                : mctx->max_mb_elem_len);
2498       err = clean_state_log_if_needed (mctx, dest_idx);
2499       if (BE (err != REG_NOERROR, 0))
2500         return err;
2501 #ifdef DEBUG
2502       assert (dfa->nexts[cur_node_idx] != -1);
2503 #endif
2504       /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2505          then we use pstate->nodes.elems[i] instead.  */
2506       new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
2507
2508       dest_state = mctx->state_log[dest_idx];
2509       if (dest_state == NULL)
2510         dest_nodes = *new_nodes;
2511       else
2512         {
2513           err = re_node_set_init_union (&dest_nodes,
2514                                         dest_state->entrance_nodes, new_nodes);
2515           if (BE (err != REG_NOERROR, 0))
2516             return err;
2517         }
2518       context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2519       mctx->state_log[dest_idx]
2520         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2521       if (dest_state != NULL)
2522         re_node_set_free (&dest_nodes);
2523       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2524         return err;
2525     }
2526   return REG_NOERROR;
2527 }
2528 #endif /* RE_ENABLE_I18N */
2529
2530 static reg_errcode_t
2531 transit_state_bkref (mctx, nodes)
2532     re_match_context_t *mctx;
2533     const re_node_set *nodes;
2534 {
2535   re_dfa_t *const dfa = mctx->dfa;
2536   reg_errcode_t err;
2537   int i;
2538   int cur_str_idx = re_string_cur_idx (&mctx->input);
2539
2540   for (i = 0; i < nodes->nelem; ++i)
2541     {
2542       int dest_str_idx, prev_nelem, bkc_idx;
2543       int node_idx = nodes->elems[i];
2544       unsigned int context;
2545       const re_token_t *node = dfa->nodes + node_idx;
2546       re_node_set *new_dest_nodes;
2547
2548       /* Check whether `node' is a backreference or not.  */
2549       if (node->type != OP_BACK_REF)
2550         continue;
2551
2552       if (node->constraint)
2553         {
2554           context = re_string_context_at (&mctx->input, cur_str_idx,
2555                                           mctx->eflags);
2556           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2557             continue;
2558         }
2559
2560       /* `node' is a backreference.
2561          Check the substring which the substring matched.  */
2562       bkc_idx = mctx->nbkref_ents;
2563       err = get_subexp (mctx, node_idx, cur_str_idx);
2564       if (BE (err != REG_NOERROR, 0))
2565         goto free_return;
2566
2567       /* And add the epsilon closures (which is `new_dest_nodes') of
2568          the backreference to appropriate state_log.  */
2569 #ifdef DEBUG
2570       assert (dfa->nexts[node_idx] != -1);
2571 #endif
2572       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2573         {
2574           int subexp_len;
2575           re_dfastate_t *dest_state;
2576           struct re_backref_cache_entry *bkref_ent;
2577           bkref_ent = mctx->bkref_ents + bkc_idx;
2578           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2579             continue;
2580           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2581           new_dest_nodes = (subexp_len == 0
2582                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2583                             : dfa->eclosures + dfa->nexts[node_idx]);
2584           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2585                           - bkref_ent->subexp_from);
2586           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2587                                           mctx->eflags);
2588           dest_state = mctx->state_log[dest_str_idx];
2589           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2590                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2591           /* Add `new_dest_node' to state_log.  */
2592           if (dest_state == NULL)
2593             {
2594               mctx->state_log[dest_str_idx]
2595                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2596                                             context);
2597               if (BE (mctx->state_log[dest_str_idx] == NULL
2598                       && err != REG_NOERROR, 0))
2599                 goto free_return;
2600             }
2601           else
2602             {
2603               re_node_set dest_nodes;
2604               err = re_node_set_init_union (&dest_nodes,
2605                                             dest_state->entrance_nodes,
2606                                             new_dest_nodes);
2607               if (BE (err != REG_NOERROR, 0))
2608                 {
2609                   re_node_set_free (&dest_nodes);
2610                   goto free_return;
2611                 }
2612               mctx->state_log[dest_str_idx]
2613                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2614               re_node_set_free (&dest_nodes);
2615               if (BE (mctx->state_log[dest_str_idx] == NULL
2616                       && err != REG_NOERROR, 0))
2617                 goto free_return;
2618             }
2619           /* We need to check recursively if the backreference can epsilon
2620              transit.  */
2621           if (subexp_len == 0
2622               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2623             {
2624               err = check_subexp_matching_top (mctx, new_dest_nodes,
2625                                                cur_str_idx);
2626               if (BE (err != REG_NOERROR, 0))
2627                 goto free_return;
2628               err = transit_state_bkref (mctx, new_dest_nodes);
2629               if (BE (err != REG_NOERROR, 0))
2630                 goto free_return;
2631             }
2632         }
2633     }
2634   err = REG_NOERROR;
2635  free_return:
2636   return err;
2637 }
2638
2639 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2640    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2641    Note that we might collect inappropriate candidates here.
2642    However, the cost of checking them strictly here is too high, then we
2643    delay these checking for prune_impossible_nodes().  */
2644
2645 static reg_errcode_t
2646 get_subexp (mctx, bkref_node, bkref_str_idx)
2647      re_match_context_t *mctx;
2648      int bkref_node, bkref_str_idx;
2649 {
2650   re_dfa_t *const dfa = mctx->dfa;
2651   int subexp_num, sub_top_idx;
2652   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2653   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2654   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2655   if (cache_idx != -1)
2656     {
2657       const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2658       do
2659         if (entry->node == bkref_node)
2660           return REG_NOERROR; /* We already checked it.  */
2661       while (entry++->more);
2662     }
2663
2664   subexp_num = dfa->nodes[bkref_node].opr.idx;
2665
2666   /* For each sub expression  */
2667   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2668     {
2669       reg_errcode_t err;
2670       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2671       re_sub_match_last_t *sub_last;
2672       int sub_last_idx, sl_str, bkref_str_off;
2673
2674       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2675         continue; /* It isn't related.  */
2676
2677       sl_str = sub_top->str_idx;
2678       bkref_str_off = bkref_str_idx;
2679       /* At first, check the last node of sub expressions we already
2680          evaluated.  */
2681       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2682         {
2683           int sl_str_diff;
2684           sub_last = sub_top->lasts[sub_last_idx];
2685           sl_str_diff = sub_last->str_idx - sl_str;
2686           /* The matched string by the sub expression match with the substring
2687              at the back reference?  */
2688           if (sl_str_diff > 0)
2689             {
2690               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2691                 {
2692                   /* Not enough chars for a successful match.  */
2693                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2694                     break;
2695
2696                   err = clean_state_log_if_needed (mctx,
2697                                                    bkref_str_off
2698                                                    + sl_str_diff);
2699                   if (BE (err != REG_NOERROR, 0))
2700                     return err;
2701                   buf = (const char *) re_string_get_buffer (&mctx->input);
2702                 }
2703               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2704                 break; /* We don't need to search this sub expression any more.  */
2705             }
2706           bkref_str_off += sl_str_diff;
2707           sl_str += sl_str_diff;
2708           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2709                                 bkref_str_idx);
2710
2711           /* Reload buf, since the preceding call might have reallocated
2712              the buffer.  */
2713           buf = (const char *) re_string_get_buffer (&mctx->input);
2714
2715           if (err == REG_NOMATCH)
2716             continue;
2717           if (BE (err != REG_NOERROR, 0))
2718             return err;
2719         }
2720
2721       if (sub_last_idx < sub_top->nlasts)
2722         continue;
2723       if (sub_last_idx > 0)
2724         ++sl_str;
2725       /* Then, search for the other last nodes of the sub expression.  */
2726       for (; sl_str <= bkref_str_idx; ++sl_str)
2727         {
2728           int cls_node, sl_str_off;
2729           const re_node_set *nodes;
2730           sl_str_off = sl_str - sub_top->str_idx;
2731           /* The matched string by the sub expression match with the substring
2732              at the back reference?  */
2733           if (sl_str_off > 0)
2734             {
2735               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2736                 {
2737                   /* If we are at the end of the input, we cannot match.  */
2738                   if (bkref_str_off >= mctx->input.len)
2739                     break;
2740
2741                   err = extend_buffers (mctx);
2742                   if (BE (err != REG_NOERROR, 0))
2743                     return err;
2744
2745                   buf = (const char *) re_string_get_buffer (&mctx->input);
2746                 }
2747               if (buf [bkref_str_off++] != buf[sl_str - 1])
2748                 break; /* We don't need to search this sub expression
2749                           any more.  */
2750             }
2751           if (mctx->state_log[sl_str] == NULL)
2752             continue;
2753           /* Does this state have a ')' of the sub expression?  */
2754           nodes = &mctx->state_log[sl_str]->nodes;
2755           cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2756           if (cls_node == -1)
2757             continue; /* No.  */
2758           if (sub_top->path == NULL)
2759             {
2760               sub_top->path = calloc (sizeof (state_array_t),
2761                                       sl_str - sub_top->str_idx + 1);
2762               if (sub_top->path == NULL)
2763                 return REG_ESPACE;
2764             }
2765           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2766              in the current context?  */
2767           err = check_arrival (mctx, sub_top->path, sub_top->node,
2768                                sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2769           if (err == REG_NOMATCH)
2770               continue;
2771           if (BE (err != REG_NOERROR, 0))
2772               return err;
2773           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2774           if (BE (sub_last == NULL, 0))
2775             return REG_ESPACE;
2776           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2777                                 bkref_str_idx);
2778           if (err == REG_NOMATCH)
2779             continue;
2780         }
2781     }
2782   return REG_NOERROR;
2783 }
2784
2785 /* Helper functions for get_subexp().  */
2786
2787 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2788    If it can arrive, register the sub expression expressed with SUB_TOP
2789    and SUB_LAST.  */
2790
2791 static reg_errcode_t
2792 get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
2793      re_match_context_t *mctx;
2794      const re_sub_match_top_t *sub_top;
2795      re_sub_match_last_t *sub_last;
2796      int bkref_node, bkref_str;
2797 {
2798   reg_errcode_t err;
2799   int to_idx;
2800   /* Can the subexpression arrive the back reference?  */
2801   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2802                        sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2803   if (err != REG_NOERROR)
2804     return err;
2805   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2806                              sub_last->str_idx);
2807   if (BE (err != REG_NOERROR, 0))
2808     return err;
2809   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2810   return clean_state_log_if_needed (mctx, to_idx);
2811 }
2812
2813 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2814    Search '(' if FL_OPEN, or search ')' otherwise.
2815    TODO: This function isn't efficient...
2816          Because there might be more than one nodes whose types are
2817          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2818          nodes.
2819          E.g. RE: (a){2}  */
2820
2821 static int
2822 find_subexp_node (dfa, nodes, subexp_idx, type)
2823      const re_dfa_t *dfa;
2824      const re_node_set *nodes;
2825      int subexp_idx, type;
2826 {
2827   int cls_idx;
2828   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2829     {
2830       int cls_node = nodes->elems[cls_idx];
2831       const re_token_t *node = dfa->nodes + cls_node;
2832       if (node->type == type
2833           && node->opr.idx == subexp_idx)
2834         return cls_node;
2835     }
2836   return -1;
2837 }
2838
2839 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2840    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2841    heavily reused.
2842    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2843
2844 static reg_errcode_t
2845 check_arrival (mctx, path, top_node, top_str, last_node, last_str,
2846                type)
2847      re_match_context_t *mctx;
2848      state_array_t *path;
2849      int top_node, top_str, last_node, last_str, type;
2850 {
2851   re_dfa_t *const dfa = mctx->dfa;
2852   reg_errcode_t err;
2853   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2854   re_dfastate_t *cur_state = NULL;
2855   re_node_set *cur_nodes, next_nodes;
2856   re_dfastate_t **backup_state_log;
2857   unsigned int context;
2858
2859   subexp_num = dfa->nodes[top_node].opr.idx;
2860   /* Extend the buffer if we need.  */
2861   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2862     {
2863       re_dfastate_t **new_array;
2864       int old_alloc = path->alloc;
2865       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2866       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2867       if (new_array == NULL)
2868         {
2869           path->alloc = old_alloc;
2870           return REG_ESPACE;
2871         }
2872       path->array = new_array;
2873       memset (new_array + old_alloc, '\0',
2874               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2875     }
2876
2877   str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2878
2879   /* Temporary modify MCTX.  */
2880   backup_state_log = mctx->state_log;
2881   backup_cur_idx = mctx->input.cur_idx;
2882   mctx->state_log = path->array;
2883   mctx->input.cur_idx = str_idx;
2884
2885   /* Setup initial node set.  */
2886   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2887   if (str_idx == top_str)
2888     {
2889       err = re_node_set_init_1 (&next_nodes, top_node);
2890       if (BE (err != REG_NOERROR, 0))
2891         return err;
2892       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2893       if (BE (err != REG_NOERROR, 0))
2894         {
2895           re_node_set_free (&next_nodes);
2896           return err;
2897         }
2898     }
2899   else
2900     {
2901       cur_state = mctx->state_log[str_idx];
2902       if (cur_state && cur_state->has_backref)
2903         {
2904           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2905           if (BE ( err != REG_NOERROR, 0))
2906             return err;
2907         }
2908       else
2909         re_node_set_init_empty (&next_nodes);
2910     }
2911   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2912     {
2913       if (next_nodes.nelem)
2914         {
2915           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2916                                     subexp_num, type);
2917           if (BE ( err != REG_NOERROR, 0))
2918             {
2919               re_node_set_free (&next_nodes);
2920               return err;
2921             }
2922         }
2923       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2924       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2925         {
2926           re_node_set_free (&next_nodes);
2927           return err;
2928         }
2929       mctx->state_log[str_idx] = cur_state;
2930     }
2931
2932   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2933     {
2934       re_node_set_empty (&next_nodes);
2935       if (mctx->state_log[str_idx + 1])
2936         {
2937           err = re_node_set_merge (&next_nodes,
2938                                    &mctx->state_log[str_idx + 1]->nodes);
2939           if (BE (err != REG_NOERROR, 0))
2940             {
2941               re_node_set_free (&next_nodes);
2942               return err;
2943             }
2944         }
2945       if (cur_state)
2946         {
2947           err = check_arrival_add_next_nodes (mctx, str_idx,
2948                                               &cur_state->non_eps_nodes, &next_nodes);
2949           if (BE (err != REG_NOERROR, 0))
2950             {
2951               re_node_set_free (&next_nodes);
2952               return err;
2953             }
2954         }
2955       ++str_idx;
2956       if (next_nodes.nelem)
2957         {
2958           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2959           if (BE (err != REG_NOERROR, 0))
2960             {
2961               re_node_set_free (&next_nodes);
2962               return err;
2963             }
2964           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2965                                     subexp_num, type);
2966           if (BE ( err != REG_NOERROR, 0))
2967             {
2968               re_node_set_free (&next_nodes);
2969               return err;
2970             }
2971         }
2972       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2973       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2974       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2975         {
2976           re_node_set_free (&next_nodes);
2977           return err;
2978         }
2979       mctx->state_log[str_idx] = cur_state;
2980       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2981     }
2982   re_node_set_free (&next_nodes);
2983   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2984                : &mctx->state_log[last_str]->nodes);
2985   path->next_idx = str_idx;
2986
2987   /* Fix MCTX.  */
2988   mctx->state_log = backup_state_log;
2989   mctx->input.cur_idx = backup_cur_idx;
2990
2991   /* Then check the current node set has the node LAST_NODE.  */
2992   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2993     return REG_NOERROR;
2994
2995   return REG_NOMATCH;
2996 }
2997
2998 /* Helper functions for check_arrival.  */
2999
3000 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3001    to NEXT_NODES.
3002    TODO: This function is similar to the functions transit_state*(),
3003          however this function has many additional works.
3004          Can't we unify them?  */
3005
3006 static reg_errcode_t
3007 check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
3008      re_match_context_t *mctx;
3009      int str_idx;
3010      re_node_set *cur_nodes, *next_nodes;
3011 {
3012   re_dfa_t *const dfa = mctx->dfa;
3013   int result;
3014   int cur_idx;
3015   reg_errcode_t err;
3016   re_node_set union_set;
3017   re_node_set_init_empty (&union_set);
3018   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3019     {
3020       int naccepted = 0;
3021       int cur_node = cur_nodes->elems[cur_idx];
3022 #if defined DEBUG || defined RE_ENABLE_I18N
3023       re_token_type_t type = dfa->nodes[cur_node].type;
3024 #endif
3025 #ifdef DEBUG
3026       assert (!IS_EPSILON_NODE (type));
3027 #endif
3028 #ifdef RE_ENABLE_I18N
3029       /* If the node may accept `multi byte'.  */
3030       if (ACCEPT_MB_NODE (type))
3031         {
3032           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3033                                                str_idx);
3034           if (naccepted > 1)
3035             {
3036               re_dfastate_t *dest_state;
3037               int next_node = dfa->nexts[cur_node];
3038               int next_idx = str_idx + naccepted;
3039               dest_state = mctx->state_log[next_idx];
3040               re_node_set_empty (&union_set);
3041               if (dest_state)
3042                 {
3043                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3044                   if (BE (err != REG_NOERROR, 0))
3045                     {
3046                       re_node_set_free (&union_set);
3047                       return err;
3048                     }
3049                 }
3050               result = re_node_set_insert (&union_set, next_node);
3051               if (BE (result < 0, 0))
3052                 {
3053                   re_node_set_free (&union_set);
3054                   return REG_ESPACE;
3055                 }
3056               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3057                                                             &union_set);
3058               if (BE (mctx->state_log[next_idx] == NULL
3059                       && err != REG_NOERROR, 0))
3060                 {
3061                   re_node_set_free (&union_set);
3062                   return err;
3063                 }
3064             }
3065         }
3066 #endif /* RE_ENABLE_I18N */
3067       if (naccepted
3068           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3069         {
3070           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3071           if (BE (result < 0, 0))
3072             {
3073               re_node_set_free (&union_set);
3074               return REG_ESPACE;
3075             }
3076         }
3077     }
3078   re_node_set_free (&union_set);
3079   return REG_NOERROR;
3080 }
3081
3082 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3083    CUR_NODES, however exclude the nodes which are:
3084     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3085     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3086 */
3087
3088 static reg_errcode_t
3089 check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
3090      re_dfa_t *dfa;
3091      re_node_set *cur_nodes;
3092      int ex_subexp, type;
3093 {
3094   reg_errcode_t err;
3095   int idx, outside_node;
3096   re_node_set new_nodes;
3097 #ifdef DEBUG
3098   assert (cur_nodes->nelem);
3099 #endif
3100   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3101   if (BE (err != REG_NOERROR, 0))
3102     return err;
3103   /* Create a new node set NEW_NODES with the nodes which are epsilon
3104      closures of the node in CUR_NODES.  */
3105
3106   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3107     {
3108       int cur_node = cur_nodes->elems[idx];
3109       re_node_set *eclosure = dfa->eclosures + cur_node;
3110       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3111       if (outside_node == -1)
3112         {
3113           /* There are no problematic nodes, just merge them.  */
3114           err = re_node_set_merge (&new_nodes, eclosure);
3115           if (BE (err != REG_NOERROR, 0))
3116             {
3117               re_node_set_free (&new_nodes);
3118               return err;
3119             }
3120         }
3121       else
3122         {
3123           /* There are problematic nodes, re-calculate incrementally.  */
3124           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3125                                               ex_subexp, type);
3126           if (BE (err != REG_NOERROR, 0))
3127             {
3128               re_node_set_free (&new_nodes);
3129               return err;
3130             }
3131         }
3132     }
3133   re_node_set_free (cur_nodes);
3134   *cur_nodes = new_nodes;
3135   return REG_NOERROR;
3136 }
3137
3138 /* Helper function for check_arrival_expand_ecl.
3139    Check incrementally the epsilon closure of TARGET, and if it isn't
3140    problematic append it to DST_NODES.  */
3141
3142 static reg_errcode_t
3143 check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
3144      re_dfa_t *dfa;
3145      int target, ex_subexp, type;
3146      re_node_set *dst_nodes;
3147 {
3148   int cur_node;
3149   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3150     {
3151       int err;
3152
3153       if (dfa->nodes[cur_node].type == type
3154           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3155         {
3156           if (type == OP_CLOSE_SUBEXP)
3157             {
3158               err = re_node_set_insert (dst_nodes, cur_node);
3159               if (BE (err == -1, 0))
3160                 return REG_ESPACE;
3161             }
3162           break;
3163         }
3164       err = re_node_set_insert (dst_nodes, cur_node);
3165       if (BE (err == -1, 0))
3166         return REG_ESPACE;
3167       if (dfa->edests[cur_node].nelem == 0)
3168         break;
3169       if (dfa->edests[cur_node].nelem == 2)
3170         {
3171           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3172                                               dfa->edests[cur_node].elems[1],
3173                                               ex_subexp, type);
3174           if (BE (err != REG_NOERROR, 0))
3175             return err;
3176         }
3177       cur_node = dfa->edests[cur_node].elems[0];
3178     }
3179   return REG_NOERROR;
3180 }
3181
3182
3183 /* For all the back references in the current state, calculate the
3184    destination of the back references by the appropriate entry
3185    in MCTX->BKREF_ENTS.  */
3186
3187 static reg_errcode_t
3188 expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
3189                     type)
3190      re_match_context_t *mctx;
3191      int cur_str, subexp_num, type;
3192      re_node_set *cur_nodes;
3193 {
3194   re_dfa_t *const dfa = mctx->dfa;
3195   reg_errcode_t err;
3196   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3197   struct re_backref_cache_entry *ent;
3198
3199   if (cache_idx_start == -1)
3200     return REG_NOERROR;
3201
3202  restart:
3203   ent = mctx->bkref_ents + cache_idx_start;
3204   do
3205     {
3206       int to_idx, next_node;
3207
3208       /* Is this entry ENT is appropriate?  */
3209       if (!re_node_set_contains (cur_nodes, ent->node))
3210         continue; /* No.  */
3211
3212       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3213       /* Calculate the destination of the back reference, and append it
3214          to MCTX->STATE_LOG.  */
3215       if (to_idx == cur_str)
3216         {
3217           /* The backreference did epsilon transit, we must re-check all the
3218              node in the current state.  */
3219           re_node_set new_dests;
3220           reg_errcode_t err2, err3;
3221           next_node = dfa->edests[ent->node].elems[0];
3222           if (re_node_set_contains (cur_nodes, next_node))
3223             continue;
3224           err = re_node_set_init_1 (&new_dests, next_node);
3225           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3226           err3 = re_node_set_merge (cur_nodes, &new_dests);
3227           re_node_set_free (&new_dests);
3228           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3229                   || err3 != REG_NOERROR, 0))
3230             {
3231               err = (err != REG_NOERROR ? err
3232                      : (err2 != REG_NOERROR ? err2 : err3));
3233               return err;
3234             }
3235           /* TODO: It is still inefficient...  */
3236           goto restart;
3237         }
3238       else
3239         {
3240           re_node_set union_set;
3241           next_node = dfa->nexts[ent->node];
3242           if (mctx->state_log[to_idx])
3243             {
3244               int ret;
3245               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3246                                         next_node))
3247                 continue;
3248               err = re_node_set_init_copy (&union_set,
3249                                            &mctx->state_log[to_idx]->nodes);
3250               ret = re_node_set_insert (&union_set, next_node);
3251               if (BE (err != REG_NOERROR || ret < 0, 0))
3252                 {
3253                   re_node_set_free (&union_set);
3254                   err = err != REG_NOERROR ? err : REG_ESPACE;
3255                   return err;
3256                 }
3257             }
3258           else
3259             {
3260               err = re_node_set_init_1 (&union_set, next_node);
3261               if (BE (err != REG_NOERROR, 0))
3262                 return err;
3263             }
3264           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3265           re_node_set_free (&union_set);
3266           if (BE (mctx->state_log[to_idx] == NULL
3267                   && err != REG_NOERROR, 0))
3268             return err;
3269         }
3270     }
3271   while (ent++->more);
3272   return REG_NOERROR;
3273 }
3274
3275 /* Build transition table for the state.
3276    Return the new table if succeeded, otherwise return NULL.  */
3277
3278 static re_dfastate_t **
3279 build_trtable (dfa, state)
3280     re_dfa_t *dfa;
3281     re_dfastate_t *state;
3282 {
3283   reg_errcode_t err;
3284   int i, j, ch;
3285   unsigned int elem, mask;
3286   int dests_node_malloced = 0, dest_states_malloced = 0;
3287   int ndests; /* Number of the destination states from `state'.  */
3288   re_dfastate_t **trtable;
3289   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3290   re_node_set follows, *dests_node;
3291   bitset *dests_ch;
3292   bitset acceptable;
3293
3294   /* We build DFA states which corresponds to the destination nodes
3295      from `state'.  `dests_node[i]' represents the nodes which i-th
3296      destination state contains, and `dests_ch[i]' represents the
3297      characters which i-th destination state accepts.  */
3298 #ifdef _LIBC
3299   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3300     dests_node = (re_node_set *)
3301                  alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3302   else
3303 #endif
3304     {
3305       dests_node = (re_node_set *)
3306                    malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3307       if (BE (dests_node == NULL, 0))
3308         return NULL;
3309       dests_node_malloced = 1;
3310     }
3311   dests_ch = (bitset *) (dests_node + SBC_MAX);
3312
3313   /* Initialize transiton table.  */
3314   state->word_trtable = 0;
3315
3316   /* At first, group all nodes belonging to `state' into several
3317      destinations.  */
3318   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3319   if (BE (ndests <= 0, 0))
3320     {
3321       if (dests_node_malloced)
3322         free (dests_node);
3323       /* Return NULL in case of an error, trtable otherwise.  */
3324       if (ndests == 0)
3325         {
3326           state->trtable = (re_dfastate_t **)
3327             calloc (sizeof (re_dfastate_t *), SBC_MAX);;
3328           return state->trtable;
3329         }
3330       return NULL;
3331     }
3332
3333   err = re_node_set_alloc (&follows, ndests + 1);
3334   if (BE (err != REG_NOERROR, 0))
3335     goto out_free;
3336
3337 #ifdef _LIBC
3338   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3339                          + ndests * 3 * sizeof (re_dfastate_t *)))
3340     dest_states = (re_dfastate_t **)
3341                   alloca (ndests * 3 * sizeof (re_dfastate_t *));
3342   else
3343 #endif
3344     {
3345       dest_states = (re_dfastate_t **)
3346                     malloc (ndests * 3 * sizeof (re_dfastate_t *));
3347       if (BE (dest_states == NULL, 0))
3348         {
3349 out_free:
3350           if (dest_states_malloced)
3351             free (dest_states);
3352           re_node_set_free (&follows);
3353           for (i = 0; i < ndests; ++i)
3354             re_node_set_free (dests_node + i);
3355           if (dests_node_malloced)
3356             free (dests_node);
3357           return NULL;
3358         }
3359       dest_states_malloced = 1;
3360     }
3361   dest_states_word = dest_states + ndests;
3362   dest_states_nl = dest_states_word + ndests;
3363   bitset_empty (acceptable);
3364
3365   /* Then build the states for all destinations.  */
3366   for (i = 0; i < ndests; ++i)
3367     {
3368       int next_node;
3369       re_node_set_empty (&follows);
3370       /* Merge the follows of this destination states.  */
3371       for (j = 0; j < dests_node[i].nelem; ++j)
3372         {
3373           next_node = dfa->nexts[dests_node[i].elems[j]];
3374           if (next_node != -1)
3375             {
3376               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3377               if (BE (err != REG_NOERROR, 0))
3378                 goto out_free;
3379             }
3380         }
3381       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3382       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3383         goto out_free;
3384       /* If the new state has context constraint,
3385          build appropriate states for these contexts.  */
3386       if (dest_states[i]->has_constraint)
3387         {
3388           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3389                                                           CONTEXT_WORD);
3390           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3391             goto out_free;
3392
3393           if (dest_states[i] != dest_states_word[i]
3394               && dfa->mb_cur_max > 1)
3395             state->word_trtable = 1;
3396
3397           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3398                                                         CONTEXT_NEWLINE);
3399           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3400             goto out_free;
3401         }
3402       else
3403         {
3404           dest_states_word[i] = dest_states[i];
3405           dest_states_nl[i] = dest_states[i];
3406         }
3407       bitset_merge (acceptable, dests_ch[i]);
3408     }
3409
3410   if (!BE (state->word_trtable, 0))
3411     {
3412       /* We don't care about whether the following character is a word
3413          character, or we are in a single-byte character set so we can
3414          discern by looking at the character code: allocate a
3415          256-entry transition table.  */
3416       trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3417       if (BE (trtable == NULL, 0))
3418         goto out_free;
3419
3420       /* For all characters ch...:  */
3421       for (i = 0; i < BITSET_UINTS; ++i)
3422         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3423              elem;
3424              mask <<= 1, elem >>= 1, ++ch)
3425           if (BE (elem & 1, 0))
3426             {
3427               /* There must be exactly one destination which accepts
3428                  character ch.  See group_nodes_into_DFAstates.  */
3429               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3430                 ;
3431
3432               /* j-th destination accepts the word character ch.  */
3433               if (dfa->word_char[i] & mask)
3434                 trtable[ch] = dest_states_word[j];
3435               else
3436                 trtable[ch] = dest_states[j];
3437             }
3438     }
3439   else
3440     {
3441       /* We care about whether the following character is a word
3442          character, and we are in a multi-byte character set: discern
3443          by looking at the character code: build two 256-entry
3444          transition tables, one starting at trtable[0] and one
3445          starting at trtable[SBC_MAX].  */
3446       trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *),
3447                                            2 * SBC_MAX);
3448       if (BE (trtable == NULL, 0))
3449         goto out_free;
3450
3451       /* For all characters ch...:  */
3452       for (i = 0; i < BITSET_UINTS; ++i)
3453         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3454              elem;
3455              mask <<= 1, elem >>= 1, ++ch)
3456           if (BE (elem & 1, 0))
3457             {
3458               /* There must be exactly one destination which accepts
3459                  character ch.  See group_nodes_into_DFAstates.  */
3460               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3461                 ;
3462
3463               /* j-th destination accepts the word character ch.  */
3464               trtable[ch] = dest_states[j];
3465               trtable[ch + SBC_MAX] = dest_states_word[j];
3466             }
3467     }
3468
3469   /* new line */
3470   if (bitset_contain (acceptable, NEWLINE_CHAR))
3471     {
3472       /* The current state accepts newline character.  */
3473       for (j = 0; j < ndests; ++j)
3474         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3475           {
3476             /* k-th destination accepts newline character.  */
3477             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3478             if (state->word_trtable)
3479               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3480             /* There must be only one destination which accepts
3481                newline.  See group_nodes_into_DFAstates.  */
3482             break;
3483           }
3484     }
3485
3486   if (dest_states_malloced)
3487     free (dest_states);
3488
3489   re_node_set_free (&follows);
3490   for (i = 0; i < ndests; ++i)
3491     re_node_set_free (dests_node + i);
3492
3493   if (dests_node_malloced)
3494     free (dests_node);
3495
3496   state->trtable = trtable;
3497   return trtable;
3498 }
3499
3500 /* Group all nodes belonging to STATE into several destinations.
3501    Then for all destinations, set the nodes belonging to the destination
3502    to DESTS_NODE[i] and set the characters accepted by the destination
3503    to DEST_CH[i].  This function return the number of destinations.  */
3504
3505 static int
3506 group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3507     re_dfa_t *dfa;
3508     const re_dfastate_t *state;
3509     re_node_set *dests_node;
3510     bitset *dests_ch;
3511 {
3512   reg_errcode_t err;
3513   int result;
3514   int i, j, k;
3515   int ndests; /* Number of the destinations from `state'.  */
3516   bitset accepts; /* Characters a node can accept.  */
3517   const re_node_set *cur_nodes = &state->nodes;
3518   bitset_empty (accepts);
3519   ndests = 0;
3520
3521   /* For all the nodes belonging to `state',  */
3522   for (i = 0; i < cur_nodes->nelem; ++i)
3523     {
3524       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3525       re_token_type_t type = node->type;
3526       unsigned int constraint = node->constraint;
3527
3528       /* Enumerate all single byte character this node can accept.  */
3529       if (type == CHARACTER)
3530         bitset_set (accepts, node->opr.c);
3531       else if (type == SIMPLE_BRACKET)
3532         {
3533           bitset_merge (accepts, node->opr.sbcset);
3534         }
3535       else if (type == OP_PERIOD)
3536         {
3537 #ifdef RE_ENABLE_I18N
3538           if (dfa->mb_cur_max > 1)
3539             bitset_merge (accepts, dfa->sb_char);
3540           else
3541 #endif
3542             bitset_set_all (accepts);
3543           if (!(dfa->syntax & RE_DOT_NEWLINE))
3544             bitset_clear (accepts, '\n');
3545           if (dfa->syntax & RE_DOT_NOT_NULL)
3546             bitset_clear (accepts, '\0');
3547         }
3548 #ifdef RE_ENABLE_I18N
3549       else if (type == OP_UTF8_PERIOD)
3550         {
3551           memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3552           if (!(dfa->syntax & RE_DOT_NEWLINE))
3553             bitset_clear (accepts, '\n');
3554           if (dfa->syntax & RE_DOT_NOT_NULL)
3555             bitset_clear (accepts, '\0');
3556         }
3557 #endif
3558       else
3559         continue;
3560
3561       /* Check the `accepts' and sift the characters which are not
3562          match it the context.  */
3563       if (constraint)
3564         {
3565           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3566             {
3567               int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3568               bitset_empty (accepts);
3569               if (accepts_newline)
3570                 bitset_set (accepts, NEWLINE_CHAR);
3571               else
3572                 continue;
3573             }
3574           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3575             {
3576               bitset_empty (accepts);
3577               continue;
3578             }
3579
3580           if (constraint & NEXT_WORD_CONSTRAINT)
3581             {
3582               unsigned int any_set = 0;
3583               if (type == CHARACTER && !node->word_char)
3584                 {
3585                   bitset_empty (accepts);
3586                   continue;
3587                 }
3588 #ifdef RE_ENABLE_I18N
3589               if (dfa->mb_cur_max > 1)
3590                 for (j = 0; j < BITSET_UINTS; ++j)
3591                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3592               else
3593 #endif
3594                 for (j = 0; j < BITSET_UINTS; ++j)
3595                   any_set |= (accepts[j] &= dfa->word_char[j]);
3596               if (!any_set)
3597                 continue;
3598             }
3599           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3600             {
3601               unsigned int any_set = 0;
3602               if (type == CHARACTER && node->word_char)
3603                 {
3604                   bitset_empty (accepts);
3605                   continue;
3606                 }
3607 #ifdef RE_ENABLE_I18N
3608               if (dfa->mb_cur_max > 1)
3609                 for (j = 0; j < BITSET_UINTS; ++j)
3610                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3611               else
3612 #endif
3613                 for (j = 0; j < BITSET_UINTS; ++j)
3614                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3615               if (!any_set)
3616                 continue;
3617             }
3618         }
3619
3620       /* Then divide `accepts' into DFA states, or create a new
3621          state.  Above, we make sure that accepts is not empty.  */
3622       for (j = 0; j < ndests; ++j)
3623         {
3624           bitset intersec; /* Intersection sets, see below.  */
3625           bitset remains;
3626           /* Flags, see below.  */
3627           int has_intersec, not_subset, not_consumed;
3628
3629           /* Optimization, skip if this state doesn't accept the character.  */
3630           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3631             continue;
3632
3633           /* Enumerate the intersection set of this state and `accepts'.  */
3634           has_intersec = 0;
3635           for (k = 0; k < BITSET_UINTS; ++k)
3636             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3637           /* And skip if the intersection set is empty.  */
3638           if (!has_intersec)
3639             continue;
3640
3641           /* Then check if this state is a subset of `accepts'.  */
3642           not_subset = not_consumed = 0;
3643           for (k = 0; k < BITSET_UINTS; ++k)
3644             {
3645               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3646               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3647             }
3648
3649           /* If this state isn't a subset of `accepts', create a
3650              new group state, which has the `remains'. */
3651           if (not_subset)
3652             {
3653               bitset_copy (dests_ch[ndests], remains);
3654               bitset_copy (dests_ch[j], intersec);
3655               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3656               if (BE (err != REG_NOERROR, 0))
3657                 goto error_return;
3658               ++ndests;
3659             }
3660
3661           /* Put the position in the current group. */
3662           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3663           if (BE (result < 0, 0))
3664             goto error_return;
3665
3666           /* If all characters are consumed, go to next node. */
3667           if (!not_consumed)
3668             break;
3669         }
3670       /* Some characters remain, create a new group. */
3671       if (j == ndests)
3672         {
3673           bitset_copy (dests_ch[ndests], accepts);
3674           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3675           if (BE (err != REG_NOERROR, 0))
3676             goto error_return;
3677           ++ndests;
3678           bitset_empty (accepts);
3679         }
3680     }
3681   return ndests;
3682  error_return:
3683   for (j = 0; j < ndests; ++j)
3684     re_node_set_free (dests_node + j);
3685   return -1;
3686 }
3687
3688 #ifdef RE_ENABLE_I18N
3689 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3690    Return the number of the bytes the node accepts.
3691    STR_IDX is the current index of the input string.
3692
3693    This function handles the nodes which can accept one character, or
3694    one collating element like '.', '[a-z]', opposite to the other nodes
3695    can only accept one byte.  */
3696
3697 static int
3698 check_node_accept_bytes (dfa, node_idx, input, str_idx)
3699     re_dfa_t *dfa;
3700     int node_idx, str_idx;
3701     const re_string_t *input;
3702 {
3703   const re_token_t *node = dfa->nodes + node_idx;
3704   int char_len, elem_len;
3705   int i;
3706
3707   if (BE (node->type == OP_UTF8_PERIOD, 0))
3708     {
3709       unsigned char c = re_string_byte_at (input, str_idx), d;
3710       if (BE (c < 0xc2, 1))
3711         return 0;
3712
3713       if (str_idx + 2 > input->len)
3714         return 0;
3715
3716       d = re_string_byte_at (input, str_idx + 1);
3717       if (c < 0xe0)
3718         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3719       else if (c < 0xf0)
3720         {
3721           char_len = 3;
3722           if (c == 0xe0 && d < 0xa0)
3723             return 0;
3724         }
3725       else if (c < 0xf8)
3726         {
3727           char_len = 4;
3728           if (c == 0xf0 && d < 0x90)
3729             return 0;
3730         }
3731       else if (c < 0xfc)
3732         {
3733           char_len = 5;
3734           if (c == 0xf8 && d < 0x88)
3735             return 0;
3736         }
3737       else if (c < 0xfe)
3738         {
3739           char_len = 6;
3740           if (c == 0xfc && d < 0x84)
3741             return 0;
3742         }
3743       else
3744         return 0;
3745
3746       if (str_idx + char_len > input->len)
3747         return 0;
3748
3749       for (i = 1; i < char_len; ++i)
3750         {
3751           d = re_string_byte_at (input, str_idx + i);
3752           if (d < 0x80 || d > 0xbf)
3753             return 0;
3754         }
3755       return char_len;
3756     }
3757
3758   char_len = re_string_char_size_at (input, str_idx);
3759   if (node->type == OP_PERIOD)
3760     {
3761       if (char_len <= 1)
3762         return 0;
3763       /* FIXME: I don't think this if is needed, as both '\n'
3764          and '\0' are char_len == 1.  */
3765       /* '.' accepts any one character except the following two cases.  */
3766       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3767            re_string_byte_at (input, str_idx) == '\n') ||
3768           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3769            re_string_byte_at (input, str_idx) == '\0'))
3770         return 0;
3771       return char_len;
3772     }
3773
3774   elem_len = re_string_elem_size_at (input, str_idx);
3775   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3776     return 0;
3777
3778   if (node->type == COMPLEX_BRACKET)
3779     {
3780       const re_charset_t *cset = node->opr.mbcset;
3781 # ifdef _LIBC
3782       const unsigned char *pin = ((char *) re_string_get_buffer (input)
3783                                   + str_idx);
3784       int j;
3785       uint32_t nrules;
3786 # endif /* _LIBC */
3787       int match_len = 0;
3788       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3789                     ? re_string_wchar_at (input, str_idx) : 0);
3790
3791       /* match with multibyte character?  */
3792       for (i = 0; i < cset->nmbchars; ++i)
3793         if (wc == cset->mbchars[i])
3794           {
3795             match_len = char_len;
3796             goto check_node_accept_bytes_match;
3797           }
3798       /* match with character_class?  */
3799       for (i = 0; i < cset->nchar_classes; ++i)
3800         {
3801           wctype_t wt = cset->char_classes[i];
3802           if (__iswctype (wc, wt))
3803             {
3804               match_len = char_len;
3805               goto check_node_accept_bytes_match;
3806             }
3807         }
3808
3809 # ifdef _LIBC
3810       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3811       if (nrules != 0)
3812         {
3813           unsigned int in_collseq = 0;
3814           const int32_t *table, *indirect;
3815           const unsigned char *weights, *extra;
3816           const char *collseqwc;
3817           int32_t idx;
3818           /* This #include defines a local function!  */
3819 #  include <locale/weight.h>
3820
3821           /* match with collating_symbol?  */
3822           if (cset->ncoll_syms)
3823             extra = (const unsigned char *)
3824               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3825           for (i = 0; i < cset->ncoll_syms; ++i)
3826             {
3827               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3828               /* Compare the length of input collating element and
3829                  the length of current collating element.  */
3830               if (*coll_sym != elem_len)
3831                 continue;
3832               /* Compare each bytes.  */
3833               for (j = 0; j < *coll_sym; j++)
3834                 if (pin[j] != coll_sym[1 + j])
3835                   break;
3836               if (j == *coll_sym)
3837                 {
3838                   /* Match if every bytes is equal.  */
3839                   match_len = j;
3840                   goto check_node_accept_bytes_match;
3841                 }
3842             }
3843
3844           if (cset->nranges)
3845             {
3846               if (elem_len <= char_len)
3847                 {
3848                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3849                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3850                 }
3851               else
3852                 in_collseq = find_collation_sequence_value (pin, elem_len);
3853             }
3854           /* match with range expression?  */
3855           for (i = 0; i < cset->nranges; ++i)
3856             if (cset->range_starts[i] <= in_collseq
3857                 && in_collseq <= cset->range_ends[i])
3858               {
3859                 match_len = elem_len;
3860                 goto check_node_accept_bytes_match;
3861               }
3862
3863           /* match with equivalence_class?  */
3864           if (cset->nequiv_classes)
3865             {
3866               const unsigned char *cp = pin;
3867               table = (const int32_t *)
3868                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3869               weights = (const unsigned char *)
3870                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3871               extra = (const unsigned char *)
3872                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3873               indirect = (const int32_t *)
3874                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3875               idx = findidx (&cp);
3876               if (idx > 0)
3877                 for (i = 0; i < cset->nequiv_classes; ++i)
3878                   {
3879                     int32_t equiv_class_idx = cset->equiv_classes[i];
3880                     size_t weight_len = weights[idx];
3881                     if (weight_len == weights[equiv_class_idx])
3882                       {
3883                         int cnt = 0;
3884                         while (cnt <= weight_len
3885                                && (weights[equiv_class_idx + 1 + cnt]
3886                                    == weights[idx + 1 + cnt]))
3887                           ++cnt;
3888                         if (cnt > weight_len)
3889                           {
3890                             match_len = elem_len;
3891                             goto check_node_accept_bytes_match;
3892                           }
3893                       }
3894                   }
3895             }
3896         }
3897       else
3898 # endif /* _LIBC */
3899         {
3900           /* match with range expression?  */
3901 #if __GNUC__ >= 2
3902           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3903 #else
3904           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3905           cmp_buf[2] = wc;
3906 #endif
3907           for (i = 0; i < cset->nranges; ++i)
3908             {
3909               cmp_buf[0] = cset->range_starts[i];
3910               cmp_buf[4] = cset->range_ends[i];
3911               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3912                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3913                 {
3914                   match_len = char_len;
3915                   goto check_node_accept_bytes_match;
3916                 }
3917             }
3918         }
3919     check_node_accept_bytes_match:
3920       if (!cset->non_match)
3921         return match_len;
3922       else
3923         {
3924           if (match_len > 0)
3925             return 0;
3926           else
3927             return (elem_len > char_len) ? elem_len : char_len;
3928         }
3929     }
3930   return 0;
3931 }
3932
3933 # ifdef _LIBC
3934 static unsigned int
3935 find_collation_sequence_value (mbs, mbs_len)
3936     const unsigned char *mbs;
3937     size_t mbs_len;
3938 {
3939   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3940   if (nrules == 0)
3941     {
3942       if (mbs_len == 1)
3943         {
3944           /* No valid character.  Match it as a single byte character.  */
3945           const unsigned char *collseq = (const unsigned char *)
3946             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3947           return collseq[mbs[0]];
3948         }
3949       return UINT_MAX;
3950     }
3951   else
3952     {
3953       int32_t idx;
3954       const unsigned char *extra = (const unsigned char *)
3955         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3956       int32_t extrasize = (const unsigned char *)
3957         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3958
3959       for (idx = 0; idx < extrasize;)
3960         {
3961           int mbs_cnt, found = 0;
3962           int32_t elem_mbs_len;
3963           /* Skip the name of collating element name.  */
3964           idx = idx + extra[idx] + 1;
3965           elem_mbs_len = extra[idx++];
3966           if (mbs_len == elem_mbs_len)
3967             {
3968               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3969                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3970                   break;
3971               if (mbs_cnt == elem_mbs_len)
3972                 /* Found the entry.  */
3973                 found = 1;
3974             }
3975           /* Skip the byte sequence of the collating element.  */
3976           idx += elem_mbs_len;
3977           /* Adjust for the alignment.  */
3978           idx = (idx + 3) & ~3;
3979           /* Skip the collation sequence value.  */
3980           idx += sizeof (uint32_t);
3981           /* Skip the wide char sequence of the collating element.  */
3982           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3983           /* If we found the entry, return the sequence value.  */
3984           if (found)
3985             return *(uint32_t *) (extra + idx);
3986           /* Skip the collation sequence value.  */
3987           idx += sizeof (uint32_t);
3988         }
3989       return UINT_MAX;
3990     }
3991 }
3992 # endif /* _LIBC */
3993 #endif /* RE_ENABLE_I18N */
3994
3995 /* Check whether the node accepts the byte which is IDX-th
3996    byte of the INPUT.  */
3997
3998 static int
3999 check_node_accept (mctx, node, idx)
4000     const re_match_context_t *mctx;
4001     const re_token_t *node;
4002     int idx;
4003 {
4004   unsigned char ch;
4005   ch = re_string_byte_at (&mctx->input, idx);
4006   switch (node->type)
4007     {
4008     case CHARACTER:
4009       if (node->opr.c != ch)
4010         return 0;
4011       break;
4012
4013     case SIMPLE_BRACKET:
4014       if (!bitset_contain (node->opr.sbcset, ch))
4015         return 0;
4016       break;
4017
4018 #ifdef RE_ENABLE_I18N
4019     case OP_UTF8_PERIOD:
4020       if (ch >= 0x80)
4021         return 0;
4022       /* FALLTHROUGH */
4023 #endif
4024     case OP_PERIOD:
4025       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4026           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4027         return 0;
4028       break;
4029
4030     default:
4031       return 0;
4032     }
4033
4034   if (node->constraint)
4035     {
4036       /* The node has constraints.  Check whether the current context
4037          satisfies the constraints.  */
4038       unsigned int context = re_string_context_at (&mctx->input, idx,
4039                                                    mctx->eflags);
4040       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4041         return 0;
4042     }
4043
4044   return 1;
4045 }
4046
4047 /* Extend the buffers, if the buffers have run out.  */
4048
4049 static reg_errcode_t
4050 extend_buffers (mctx)
4051      re_match_context_t *mctx;
4052 {
4053   reg_errcode_t ret;
4054   re_string_t *pstr = &mctx->input;
4055
4056   /* Double the lengthes of the buffers.  */
4057   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
4058   if (BE (ret != REG_NOERROR, 0))
4059     return ret;
4060
4061   if (mctx->state_log != NULL)
4062     {
4063       /* And double the length of state_log.  */
4064       /* XXX We have no indication of the size of this buffer.  If this
4065          allocation fail we have no indication that the state_log array
4066          does not have the right size.  */
4067       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4068                                               pstr->bufs_len + 1);
4069       if (BE (new_array == NULL, 0))
4070         return REG_ESPACE;
4071       mctx->state_log = new_array;
4072     }
4073
4074   /* Then reconstruct the buffers.  */
4075   if (pstr->icase)
4076     {
4077 #ifdef RE_ENABLE_I18N
4078       if (pstr->mb_cur_max > 1)
4079         {
4080           ret = build_wcs_upper_buffer (pstr);
4081           if (BE (ret != REG_NOERROR, 0))
4082             return ret;
4083         }
4084       else
4085 #endif /* RE_ENABLE_I18N  */
4086         build_upper_buffer (pstr);
4087     }
4088   else
4089     {
4090 #ifdef RE_ENABLE_I18N
4091       if (pstr->mb_cur_max > 1)
4092         build_wcs_buffer (pstr);
4093       else
4094 #endif /* RE_ENABLE_I18N  */
4095         {
4096           if (pstr->trans != NULL)
4097             re_string_translate_buffer (pstr);
4098         }
4099     }
4100   return REG_NOERROR;
4101 }
4102
4103 \f
4104 /* Functions for matching context.  */
4105
4106 /* Initialize MCTX.  */
4107
4108 static reg_errcode_t
4109 match_ctx_init (mctx, eflags, n)
4110     re_match_context_t *mctx;
4111     int eflags, n;
4112 {
4113   mctx->eflags = eflags;
4114   mctx->match_last = -1;
4115   if (n > 0)
4116     {
4117       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4118       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4119       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4120         return REG_ESPACE;
4121     }
4122   /* Already zero-ed by the caller.
4123      else
4124        mctx->bkref_ents = NULL;
4125      mctx->nbkref_ents = 0;
4126      mctx->nsub_tops = 0;  */
4127   mctx->abkref_ents = n;
4128   mctx->max_mb_elem_len = 1;
4129   mctx->asub_tops = n;
4130   return REG_NOERROR;
4131 }
4132
4133 /* Clean the entries which depend on the current input in MCTX.
4134    This function must be invoked when the matcher changes the start index
4135    of the input, or changes the input string.  */
4136
4137 static void
4138 match_ctx_clean (mctx)
4139     re_match_context_t *mctx;
4140 {
4141   int st_idx;
4142   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4143     {
4144       int sl_idx;
4145       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4146       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4147         {
4148           re_sub_match_last_t *last = top->lasts[sl_idx];
4149           re_free (last->path.array);
4150           re_free (last);
4151         }
4152       re_free (top->lasts);
4153       if (top->path)
4154         {
4155           re_free (top->path->array);
4156           re_free (top->path);
4157         }
4158       free (top);
4159     }
4160
4161   mctx->nsub_tops = 0;
4162   mctx->nbkref_ents = 0;
4163 }
4164
4165 /* Free all the memory associated with MCTX.  */
4166
4167 static void
4168 match_ctx_free (mctx)
4169     re_match_context_t *mctx;
4170 {
4171   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4172   match_ctx_clean (mctx);
4173   re_free (mctx->sub_tops);
4174   re_free (mctx->bkref_ents);
4175 }
4176
4177 /* Add a new backreference entry to MCTX.
4178    Note that we assume that caller never call this function with duplicate
4179    entry, and call with STR_IDX which isn't smaller than any existing entry.
4180 */
4181
4182 static reg_errcode_t
4183 match_ctx_add_entry (mctx, node, str_idx, from, to)
4184      re_match_context_t *mctx;
4185      int node, str_idx, from, to;
4186 {
4187   if (mctx->nbkref_ents >= mctx->abkref_ents)
4188     {
4189       struct re_backref_cache_entry* new_entry;
4190       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4191                               mctx->abkref_ents * 2);
4192       if (BE (new_entry == NULL, 0))
4193         {
4194           re_free (mctx->bkref_ents);
4195           return REG_ESPACE;
4196         }
4197       mctx->bkref_ents = new_entry;
4198       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4199               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4200       mctx->abkref_ents *= 2;
4201     }
4202   if (mctx->nbkref_ents > 0
4203       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4204     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4205
4206   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4207   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4208   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4209   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4210
4211   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4212      If bit N is clear, means that this entry won't epsilon-transition to
4213      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4214      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4215      such node.
4216
4217      A backreference does not epsilon-transition unless it is empty, so set
4218      to all zeros if FROM != TO.  */
4219   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4220     = (from == to ? ~0 : 0);
4221
4222   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4223   if (mctx->max_mb_elem_len < to - from)
4224     mctx->max_mb_elem_len = to - from;
4225   return REG_NOERROR;
4226 }
4227
4228 /* Search for the first entry which has the same str_idx, or -1 if none is
4229    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4230
4231 static int
4232 search_cur_bkref_entry (mctx, str_idx)
4233      re_match_context_t *mctx;
4234      int str_idx;
4235 {
4236   int left, right, mid, last;
4237   last = right = mctx->nbkref_ents;
4238   for (left = 0; left < right;)
4239     {
4240       mid = (left + right) / 2;
4241       if (mctx->bkref_ents[mid].str_idx < str_idx)
4242         left = mid + 1;
4243       else
4244         right = mid;
4245     }
4246   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4247     return left;
4248   else
4249     return -1;
4250 }
4251
4252 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4253    at STR_IDX.  */
4254
4255 static reg_errcode_t
4256 match_ctx_add_subtop (mctx, node, str_idx)
4257      re_match_context_t *mctx;
4258      int node, str_idx;
4259 {
4260 #ifdef DEBUG
4261   assert (mctx->sub_tops != NULL);
4262   assert (mctx->asub_tops > 0);
4263 #endif
4264   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4265     {
4266       int new_asub_tops = mctx->asub_tops * 2;
4267       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4268                                                    re_sub_match_top_t *,
4269                                                    new_asub_tops);
4270       if (BE (new_array == NULL, 0))
4271         return REG_ESPACE;
4272       mctx->sub_tops = new_array;
4273       mctx->asub_tops = new_asub_tops;
4274     }
4275   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4276   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4277     return REG_ESPACE;
4278   mctx->sub_tops[mctx->nsub_tops]->node = node;
4279   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4280   return REG_NOERROR;
4281 }
4282
4283 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4284    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4285
4286 static re_sub_match_last_t *
4287 match_ctx_add_sublast (subtop, node, str_idx)
4288      re_sub_match_top_t *subtop;
4289      int node, str_idx;
4290 {
4291   re_sub_match_last_t *new_entry;
4292   if (BE (subtop->nlasts == subtop->alasts, 0))
4293     {
4294       int new_alasts = 2 * subtop->alasts + 1;
4295       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4296                                                     re_sub_match_last_t *,
4297                                                     new_alasts);
4298       if (BE (new_array == NULL, 0))
4299         return NULL;
4300       subtop->lasts = new_array;
4301       subtop->alasts = new_alasts;
4302     }
4303   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4304   if (BE (new_entry != NULL, 1))
4305     {
4306       subtop->lasts[subtop->nlasts] = new_entry;
4307       new_entry->node = node;
4308       new_entry->str_idx = str_idx;
4309       ++subtop->nlasts;
4310     }
4311   return new_entry;
4312 }
4313
4314 static void
4315 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx)
4316     re_sift_context_t *sctx;
4317     re_dfastate_t **sifted_sts, **limited_sts;
4318     int last_node, last_str_idx;
4319 {
4320   sctx->sifted_states = sifted_sts;
4321   sctx->limited_states = limited_sts;
4322   sctx->last_node = last_node;
4323   sctx->last_str_idx = last_str_idx;
4324   re_node_set_init_empty (&sctx->limits);
4325 }