posix/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  22                                      int n) internal_function;
  23 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  24 static void match_ctx_free (re_match_context_t *cache) internal_function;
  25 static void match_ctx_free_subtops (re_match_context_t *mctx)
  26      internal_function;
  27 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  28                                           int str_idx, int from, int to)
  29      internal_function;
  30 static int search_cur_bkref_entry (re_match_context_t *mctx, int str_idx)
  31      internal_function;
  32 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  33                                            int str_idx) internal_function;
  34 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  35                                                    int node, int str_idx)
  36      internal_function;
  37 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  38                            re_dfastate_t **limited_sts, int last_node,
  39                            int last_str_idx)
  40      internal_function;
  41 static reg_errcode_t re_search_internal (const regex_t *preg,
  42                                          const char *string, int length,
  43                                          int start, int range, int stop,
  44                                          size_t nmatch, regmatch_t pmatch[],
  45                                          int eflags) internal_function;
  46 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  47                              const char *string1, int length1,
  48                              const char *string2, int length2,
  49                              int start, int range, struct re_registers *regs,
  50                              int stop, int ret_len) internal_function;
  51 static int re_search_stub (struct re_pattern_buffer *bufp,
  52                            const char *string, int length, int start,
  53                            int range, int stop, struct re_registers *regs,
  54                            int ret_len) internal_function;
  55 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  56                               int nregs, int regs_allocated) internal_function;
  57 static inline re_dfastate_t *acquire_init_state_context
  58      (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
  59      __attribute ((always_inline)) internal_function;
  60 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  61      internal_function;
  62 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  63                            int *p_match_first)
  64      internal_function;
  65 static int check_halt_node_context (const re_dfa_t *dfa, int node,
  66                                     unsigned int context) internal_function;
  67 static int check_halt_state_context (const re_match_context_t *mctx,
  68                                      const re_dfastate_t *state, int idx)
  69      internal_function;
  70 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch,
  71                          regmatch_t *prev_idx_match, int cur_node,
  72                          int cur_idx, int nmatch) internal_function;
  73 static int proceed_next_node (const re_match_context_t *mctx,
  74                               int nregs, regmatch_t *regs,
  75                               int *pidx, int node, re_node_set *eps_via_nodes,
  76                               struct re_fail_stack_t *fs) internal_function;
  77 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  78                                       int str_idx, int *dests, int nregs,
  79                                       regmatch_t *regs,
  80                                       re_node_set *eps_via_nodes) internal_function;
  81 static int pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
  82                            regmatch_t *regs, re_node_set *eps_via_nodes) internal_function;
  83 static reg_errcode_t set_regs (const regex_t *preg,
  84                                const re_match_context_t *mctx,
  85                                size_t nmatch, regmatch_t *pmatch,
  86                                int fl_backtrack) internal_function;
  87 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs) internal_function;
  88
  89 #ifdef RE_ENABLE_I18N
  90 static int sift_states_iter_mb (const re_match_context_t *mctx,
  91                                 re_sift_context_t *sctx,
  92                                 int node_idx, int str_idx, int max_str_idx) internal_function;
  93 #endif /* RE_ENABLE_I18N */
  94 static reg_errcode_t sift_states_backward (re_match_context_t *mctx,
  95                                            re_sift_context_t *sctx) internal_function;
  96 static reg_errcode_t build_sifted_states (re_match_context_t *mctx,
  97                                           re_sift_context_t *sctx, int str_idx,
  98                                           re_node_set *cur_dest) internal_function;
  99 static reg_errcode_t update_cur_sifted_state (re_match_context_t *mctx,
 100                                               re_sift_context_t *sctx,
 101                                               int str_idx,
 102                                               re_node_set *dest_nodes) internal_function;
 103 static reg_errcode_t add_epsilon_src_nodes (re_dfa_t *dfa,
 104                                             re_node_set *dest_nodes,
 105                                             const re_node_set *candidates) internal_function;
 106 static reg_errcode_t sub_epsilon_src_nodes (re_dfa_t *dfa, int node,
 107                                             re_node_set *dest_nodes,
 108                                             const re_node_set *and_nodes) internal_function;
 109 static int check_dst_limits (re_match_context_t *mctx, re_node_set *limits,
 110                              int dst_node, int dst_idx, int src_node,
 111                              int src_idx) internal_function;
 112 static int check_dst_limits_calc_pos_1 (re_match_context_t *mctx,
 113                                         int boundaries, int subexp_idx,
 114                                         int from_node, int bkref_idx) internal_function;
 115 static int check_dst_limits_calc_pos (re_match_context_t *mctx,
 116                                       int limit, int subexp_idx,
 117                                       int node, int str_idx,
 118                                       int bkref_idx) internal_function;
 119 static reg_errcode_t check_subexp_limits (re_dfa_t *dfa,
 120                                           re_node_set *dest_nodes,
 121                                           const re_node_set *candidates,
 122                                           re_node_set *limits,
 123                                           struct re_backref_cache_entry *bkref_ents,
 124                                           int str_idx) internal_function;
 125 static reg_errcode_t sift_states_bkref (re_match_context_t *mctx,
 126                                         re_sift_context_t *sctx,
 127                                         int str_idx, const re_node_set *candidates) internal_function;
 128 static reg_errcode_t clean_state_log_if_needed (re_match_context_t *mctx,
 129                                                 int next_state_log_idx) internal_function;
 130 static reg_errcode_t merge_state_array (re_dfa_t *dfa, re_dfastate_t **dst,
 131                                         re_dfastate_t **src, int num) internal_function;
 132 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 133                                          re_match_context_t *mctx) internal_function;
 134 static re_dfastate_t *transit_state (reg_errcode_t *err,
 135                                      re_match_context_t *mctx,
 136                                      re_dfastate_t *state) internal_function;
 137 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 138                                             re_match_context_t *mctx,
 139                                             re_dfastate_t *next_state) internal_function;
 140 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 141                                                 re_node_set *cur_nodes,
 142                                                 int str_idx) internal_function;
 143 #if 0
 144 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 145                                         re_match_context_t *mctx,
 146                                         re_dfastate_t *pstate) internal_function;
 147 #endif
 148 #ifdef RE_ENABLE_I18N
 149 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 150                                        re_dfastate_t *pstate) internal_function;
 151 #endif /* RE_ENABLE_I18N */
 152 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 153                                           const re_node_set *nodes) internal_function;
 154 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 155                                  int bkref_node, int bkref_str_idx) internal_function;
 156 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 157                                      const re_sub_match_top_t *sub_top,
 158                                      re_sub_match_last_t *sub_last,
 159                                      int bkref_node, int bkref_str) internal_function;
 160 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 161                              int subexp_idx, int type) internal_function;
 162 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 163                                     state_array_t *path, int top_node,
 164                                     int top_str, int last_node, int last_str,
 165                                     int type) internal_function;
 166 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 167                                                    int str_idx,
 168                                                    re_node_set *cur_nodes,
 169                                                    re_node_set *next_nodes) internal_function;
 170 static reg_errcode_t check_arrival_expand_ecl (re_dfa_t *dfa,
 171                                                re_node_set *cur_nodes,
 172                                                int ex_subexp, int type) internal_function;
 173 static reg_errcode_t check_arrival_expand_ecl_sub (re_dfa_t *dfa,
 174                                                    re_node_set *dst_nodes,
 175                                                    int target, int ex_subexp,
 176                                                    int type) internal_function;
 177 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 178                                          re_node_set *cur_nodes, int cur_str,
 179                                          int subexp_num, int type) internal_function;
 180 static re_dfastate_t **build_trtable (re_dfa_t *dfa,
 181                                       re_dfastate_t *state) internal_function;
 182 #ifdef RE_ENABLE_I18N
 183 static int check_node_accept_bytes (re_dfa_t *dfa, int node_idx,
 184                                     const re_string_t *input, int idx) internal_function;
 185 # ifdef _LIBC
 186 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 187                                                    size_t name_len) internal_function;
 188 # endif /* _LIBC */
 189 #endif /* RE_ENABLE_I18N */
 190 static int group_nodes_into_DFAstates (re_dfa_t *dfa,
 191                                        const re_dfastate_t *state,
 192                                        re_node_set *states_node,
 193                                        bitset *states_ch) internal_function;
 194 static int check_node_accept (const re_match_context_t *mctx,
 195                               const re_token_t *node, int idx) internal_function;
 196 static reg_errcode_t extend_buffers (re_match_context_t *mctx) internal_function;
 197 \f
 198 /* Entry point for POSIX code.  */
 199
 200 /* regexec searches for a given pattern, specified by PREG, in the
 201    string STRING.
 202
 203    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 204    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 205    least NMATCH elements, and we set them to the offsets of the
 206    corresponding matched substrings.
 207
 208    EFLAGS specifies `execution flags' which affect matching: if
 209    REG_NOTBOL is set, then ^ does not match at the beginning of the
 210    string; if REG_NOTEOL is set, then $ does not match at the end.
 211
 212    We return 0 if we find a match and REG_NOMATCH if not.  */
 213
 214 int
 215 regexec (preg, string, nmatch, pmatch, eflags)
 216     const regex_t *__restrict preg;
 217     const char *__restrict string;
 218     size_t nmatch;
 219     regmatch_t pmatch[];
 220     int eflags;
 221 {
 222   reg_errcode_t err;
 223   int start, length;
 224
 225   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 226     return REG_BADPAT;
 227
 228   if (eflags & REG_STARTEND)
 229     {
 230       start = pmatch[0].rm_so;
 231       length = pmatch[0].rm_eo;
 232     }
 233   else
 234     {
 235       start = 0;
 236       length = strlen (string);
 237     }
 238   if (preg->no_sub)
 239     err = re_search_internal (preg, string, length, start, length - start,
 240                               length, 0, NULL, eflags);
 241   else
 242     err = re_search_internal (preg, string, length, start, length - start,
 243                               length, nmatch, pmatch, eflags);
 244   return err != REG_NOERROR;
 245 }
 246
 247 #ifdef _LIBC
 248 # include <shlib-compat.h>
 249 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 250
 251 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 252 __typeof__ (__regexec) __compat_regexec;
 253
 254 int
 255 attribute_compat_text_section
 256 __compat_regexec (const regex_t *__restrict preg,
 257                   const char *__restrict string, size_t nmatch,
 258                   regmatch_t pmatch[], int eflags)
 259 {
 260   return regexec (preg, string, nmatch, pmatch,
 261                   eflags & (REG_NOTBOL | REG_NOTEOL));
 262 }
 263 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 264 # endif
 265 #endif
 266
 267 /* Entry points for GNU code.  */
 268
 269 /* re_match, re_search, re_match_2, re_search_2
 270
 271    The former two functions operate on STRING with length LENGTH,
 272    while the later two operate on concatenation of STRING1 and STRING2
 273    with lengths LENGTH1 and LENGTH2, respectively.
 274
 275    re_match() matches the compiled pattern in BUFP against the string,
 276    starting at index START.
 277
 278    re_search() first tries matching at index START, then it tries to match
 279    starting from index START + 1, and so on.  The last start position tried
 280    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 281    way as re_match().)
 282
 283    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 284    the first STOP characters of the concatenation of the strings should be
 285    concerned.
 286
 287    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 288    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 289    computed relative to the concatenation, not relative to the individual
 290    strings.)
 291
 292    On success, re_match* functions return the length of the match, re_search*
 293    return the position of the start of the match.  Return value -1 means no
 294    match was found and -2 indicates an internal error.  */
 295
 296 int
 297 re_match (bufp, string, length, start, regs)
 298     struct re_pattern_buffer *bufp;
 299     const char *string;
 300     int length, start;
 301     struct re_registers *regs;
 302 {
 303   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 304 }
 305 #ifdef _LIBC
 306 weak_alias (__re_match, re_match)
 307 #endif
 308
 309 int
 310 re_search (bufp, string, length, start, range, regs)
 311     struct re_pattern_buffer *bufp;
 312     const char *string;
 313     int length, start, range;
 314     struct re_registers *regs;
 315 {
 316   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 317 }
 318 #ifdef _LIBC
 319 weak_alias (__re_search, re_search)
 320 #endif
 321
 322 int
 323 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 324     struct re_pattern_buffer *bufp;
 325     const char *string1, *string2;
 326     int length1, length2, start, stop;
 327     struct re_registers *regs;
 328 {
 329   return re_search_2_stub (bufp, string1, length1, string2, length2,
 330                            start, 0, regs, stop, 1);
 331 }
 332 #ifdef _LIBC
 333 weak_alias (__re_match_2, re_match_2)
 334 #endif
 335
 336 int
 337 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 338     struct re_pattern_buffer *bufp;
 339     const char *string1, *string2;
 340     int length1, length2, start, range, stop;
 341     struct re_registers *regs;
 342 {
 343   return re_search_2_stub (bufp, string1, length1, string2, length2,
 344                            start, range, regs, stop, 0);
 345 }
 346 #ifdef _LIBC
 347 weak_alias (__re_search_2, re_search_2)
 348 #endif
 349
 350 static int
 351 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 352                   stop, ret_len)
 353     struct re_pattern_buffer *bufp;
 354     const char *string1, *string2;
 355     int length1, length2, start, range, stop, ret_len;
 356     struct re_registers *regs;
 357 {
 358   const char *str;
 359   int rval;
 360   int len = length1 + length2;
 361   int free_str = 0;
 362
 363   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 364     return -2;
 365
 366   /* Concatenate the strings.  */
 367   if (length2 > 0)
 368     if (length1 > 0)
 369       {
 370         char *s = re_malloc (char, len);
 371
 372         if (BE (s == NULL, 0))
 373           return -2;
 374         memcpy (s, string1, length1);
 375         memcpy (s + length1, string2, length2);
 376         str = s;
 377         free_str = 1;
 378       }
 379     else
 380       str = string2;
 381   else
 382     str = string1;
 383
 384   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 385                          ret_len);
 386   if (free_str)
 387     re_free ((char *) str);
 388   return rval;
 389 }
 390
 391 /* The parameters have the same meaning as those of re_search.
 392    Additional parameters:
 393    If RET_LEN is nonzero the length of the match is returned (re_match style);
 394    otherwise the position of the match is returned.  */
 395
 396 static int
 397 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 398     struct re_pattern_buffer *bufp;
 399     const char *string;
 400     int length, start, range, stop, ret_len;
 401     struct re_registers *regs;
 402 {
 403   reg_errcode_t result;
 404   regmatch_t *pmatch;
 405   int nregs, rval;
 406   int eflags = 0;
 407
 408   /* Check for out-of-range.  */
 409   if (BE (start < 0 || start > length, 0))
 410     return -1;
 411   if (BE (start + range > length, 0))
 412     range = length - start;
 413   else if (BE (start + range < 0, 0))
 414     range = -start;
 415
 416   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 417   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 418
 419   /* Compile fastmap if we haven't yet.  */
 420   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 421     re_compile_fastmap (bufp);
 422
 423   if (BE (bufp->no_sub, 0))
 424     regs = NULL;
 425
 426   /* We need at least 1 register.  */
 427   if (regs == NULL)
 428     nregs = 1;
 429   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 430                regs->num_regs < bufp->re_nsub + 1, 0))
 431     {
 432       nregs = regs->num_regs;
 433       if (BE (nregs < 1, 0))
 434         {
 435           /* Nothing can be copied to regs.  */
 436           regs = NULL;
 437           nregs = 1;
 438         }
 439     }
 440   else
 441     nregs = bufp->re_nsub + 1;
 442   pmatch = re_malloc (regmatch_t, nregs);
 443   if (BE (pmatch == NULL, 0))
 444     return -2;
 445
 446   result = re_search_internal (bufp, string, length, start, range, stop,
 447                                nregs, pmatch, eflags);
 448
 449   rval = 0;
 450
 451   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 452   if (result != REG_NOERROR)
 453     rval = -1;
 454   else if (regs != NULL)
 455     {
 456       /* If caller wants register contents data back, copy them.  */
 457       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 458                                            bufp->regs_allocated);
 459       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 460         rval = -2;
 461     }
 462
 463   if (BE (rval == 0, 1))
 464     {
 465       if (ret_len)
 466         {
 467           assert (pmatch[0].rm_so == start);
 468           rval = pmatch[0].rm_eo - start;
 469         }
 470       else
 471         rval = pmatch[0].rm_so;
 472     }
 473   re_free (pmatch);
 474   return rval;
 475 }
 476
 477 static unsigned
 478 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 479     struct re_registers *regs;
 480     regmatch_t *pmatch;
 481     int nregs, regs_allocated;
 482 {
 483   int rval = REGS_REALLOCATE;
 484   int i;
 485   int need_regs = nregs + 1;
 486   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 487      uses.  */
 488
 489   /* Have the register data arrays been allocated?  */
 490   if (regs_allocated == REGS_UNALLOCATED)
 491     { /* No.  So allocate them with malloc.  */
 492       regs->start = re_malloc (regoff_t, need_regs);
 493       regs->end = re_malloc (regoff_t, need_regs);
 494       if (BE (regs->start == NULL, 0) || BE (regs->end == NULL, 0))
 495         return REGS_UNALLOCATED;
 496       regs->num_regs = need_regs;
 497     }
 498   else if (regs_allocated == REGS_REALLOCATE)
 499     { /* Yes.  If we need more elements than were already
 500          allocated, reallocate them.  If we need fewer, just
 501          leave it alone.  */
 502       if (BE (need_regs > regs->num_regs, 0))
 503         {
 504           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 505           regoff_t *new_end = re_realloc (regs->end, regoff_t, need_regs);
 506           if (BE (new_start == NULL, 0) || BE (new_end == NULL, 0))
 507             return REGS_UNALLOCATED;
 508           regs->start = new_start;
 509           regs->end = new_end;
 510           regs->num_regs = need_regs;
 511         }
 512     }
 513   else
 514     {
 515       assert (regs_allocated == REGS_FIXED);
 516       /* This function may not be called with REGS_FIXED and nregs too big.  */
 517       assert (regs->num_regs >= nregs);
 518       rval = REGS_FIXED;
 519     }
 520
 521   /* Copy the regs.  */
 522   for (i = 0; i < nregs; ++i)
 523     {
 524       regs->start[i] = pmatch[i].rm_so;
 525       regs->end[i] = pmatch[i].rm_eo;
 526     }
 527   for ( ; i < regs->num_regs; ++i)
 528     regs->start[i] = regs->end[i] = -1;
 529
 530   return rval;
 531 }
 532
 533 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 534    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 535    this memory for recording register information.  STARTS and ENDS
 536    must be allocated using the malloc library routine, and must each
 537    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 538
 539    If NUM_REGS == 0, then subsequent matches should allocate their own
 540    register data.
 541
 542    Unless this function is called, the first search or match using
 543    PATTERN_BUFFER will allocate its own register data, without
 544    freeing the old data.  */
 545
 546 void
 547 re_set_registers (bufp, regs, num_regs, starts, ends)
 548     struct re_pattern_buffer *bufp;
 549     struct re_registers *regs;
 550     unsigned num_regs;
 551     regoff_t *starts, *ends;
 552 {
 553   if (num_regs)
 554     {
 555       bufp->regs_allocated = REGS_REALLOCATE;
 556       regs->num_regs = num_regs;
 557       regs->start = starts;
 558       regs->end = ends;
 559     }
 560   else
 561     {
 562       bufp->regs_allocated = REGS_UNALLOCATED;
 563       regs->num_regs = 0;
 564       regs->start = regs->end = (regoff_t *) 0;
 565     }
 566 }
 567 #ifdef _LIBC
 568 weak_alias (__re_set_registers, re_set_registers)
 569 #endif
 570 \f
 571 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 572    them unless specifically requested.  */
 573
 574 #if defined _REGEX_RE_COMP || defined _LIBC
 575 int
 576 # ifdef _LIBC
 577 weak_function
 578 # endif
 579 re_exec (s)
 580      const char *s;
 581 {
 582   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 583 }
 584 #endif /* _REGEX_RE_COMP */
 585 \f
 586 /* Internal entry point.  */
 587
 588 /* Searches for a compiled pattern PREG in the string STRING, whose
 589    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 590    mingings with regexec.  START, and RANGE have the same meanings
 591    with re_search.
 592    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 593    otherwise return the error code.
 594    Note: We assume front end functions already check ranges.
 595    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 596
 597 static reg_errcode_t
 598 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 599                     eflags)
 600     const regex_t *preg;
 601     const char *string;
 602     int length, start, range, stop, eflags;
 603     size_t nmatch;
 604     regmatch_t pmatch[];
 605 {
 606   reg_errcode_t err;
 607   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
 608   int left_lim, right_lim, incr;
 609   int fl_longest_match, match_first, match_last = -1;
 610   int fast_translate, sb;
 611 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 612   re_match_context_t mctx = { .dfa = dfa };
 613 #else
 614   re_match_context_t mctx;
 615 #endif
 616   char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
 617                     && range && !preg->can_be_null) ? preg->fastmap : NULL);
 618
 619 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 620   memset (&mctx, '\0', sizeof (re_match_context_t));
 621   mctx.dfa = dfa;
 622 #endif
 623
 624   /* Check if the DFA haven't been compiled.  */
 625   if (BE (preg->used == 0 || dfa->init_state == NULL
 626           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 627           || dfa->init_state_begbuf == NULL, 0))
 628     return REG_NOMATCH;
 629
 630 #ifdef DEBUG
 631   /* We assume front-end functions already check them.  */
 632   assert (start + range >= 0 && start + range <= length);
 633 #endif
 634
 635   /* If initial states with non-begbuf contexts have no elements,
 636      the regex must be anchored.  If preg->newline_anchor is set,
 637      we'll never use init_state_nl, so do not check it.  */
 638   if (dfa->init_state->nodes.nelem == 0
 639       && dfa->init_state_word->nodes.nelem == 0
 640       && (dfa->init_state_nl->nodes.nelem == 0
 641           || !preg->newline_anchor))
 642     {
 643       if (start != 0 && start + range != 0)
 644         return REG_NOMATCH;
 645       start = range = 0;
 646     }
 647
 648   /* We must check the longest matching, if nmatch > 0.  */
 649   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 650
 651   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 652                             preg->translate, preg->syntax & RE_ICASE, dfa);
 653   if (BE (err != REG_NOERROR, 0))
 654     goto free_return;
 655   mctx.input.stop = stop;
 656   mctx.input.raw_stop = stop;
 657   mctx.input.newline_anchor = preg->newline_anchor;
 658
 659   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 660   if (BE (err != REG_NOERROR, 0))
 661     goto free_return;
 662
 663   /* We will log all the DFA states through which the dfa pass,
 664      if nmatch > 1, or this dfa has "multibyte node", which is a
 665      back-reference or a node which can accept multibyte character or
 666      multi character collating element.  */
 667   if (nmatch > 1 || dfa->has_mb_node)
 668     {
 669       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 670       if (BE (mctx.state_log == NULL, 0))
 671         {
 672           err = REG_ESPACE;
 673           goto free_return;
 674         }
 675     }
 676   else
 677     mctx.state_log = NULL;
 678
 679   match_first = start;
 680   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 681                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 682
 683   /* Check incrementally whether of not the input string match.  */
 684   incr = (range < 0) ? -1 : 1;
 685   left_lim = (range < 0) ? start + range : start;
 686   right_lim = (range < 0) ? start : start + range;
 687   sb = dfa->mb_cur_max == 1;
 688   fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate);
 689
 690   for (;;)
 691     {
 692       /* At first get the current byte from input string.  */
 693       if (fastmap)
 694         {
 695           if (BE (fast_translate, 1))
 696             {
 697               unsigned RE_TRANSLATE_TYPE t
 698                 = (unsigned RE_TRANSLATE_TYPE) preg->translate;
 699               if (BE (range >= 0, 1))
 700                 {
 701                   if (BE (t != NULL, 0))
 702                     {
 703                       while (BE (match_first < right_lim, 1)
 704                              && !fastmap[t[(unsigned char) string[match_first]]])
 705                         ++match_first;
 706                     }
 707                   else
 708                     {
 709                       while (BE (match_first < right_lim, 1)
 710                              && !fastmap[(unsigned char) string[match_first]])
 711                         ++match_first;
 712                     }
 713                   if (BE (match_first == right_lim, 0))
 714                     {
 715                       int ch = match_first >= length
 716                                ? 0 : (unsigned char) string[match_first];
 717                       if (!fastmap[t ? t[ch] : ch])
 718                         break;
 719                     }
 720                 }
 721               else
 722                 {
 723                   while (match_first >= left_lim)
 724                     {
 725                       int ch = match_first >= length
 726                                ? 0 : (unsigned char) string[match_first];
 727                       if (fastmap[t ? t[ch] : ch])
 728                         break;
 729                       --match_first;
 730                     }
 731                   if (match_first < left_lim)
 732                     break;
 733                 }
 734             }
 735           else
 736             {
 737               int ch;
 738
 739               do
 740                 {
 741                   /* In this case, we can't determine easily the current byte,
 742                      since it might be a component byte of a multibyte
 743                      character.  Then we use the constructed buffer
 744                      instead.  */
 745                   /* If MATCH_FIRST is out of the valid range, reconstruct the
 746                      buffers.  */
 747                   if (mctx.input.raw_mbs_idx + mctx.input.valid_raw_len
 748                       <= match_first
 749                       || match_first < mctx.input.raw_mbs_idx)
 750                     {
 751                       err = re_string_reconstruct (&mctx.input, match_first,
 752                                                    eflags);
 753                       if (BE (err != REG_NOERROR, 0))
 754                         goto free_return;
 755                     }
 756                   /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 757                      Note that MATCH_FIRST must not be smaller than 0.  */
 758                   ch = ((match_first >= length) ? 0
 759                        : re_string_byte_at (&mctx.input,
 760                                             match_first
 761                                             - mctx.input.raw_mbs_idx));
 762                   if (fastmap[ch])
 763                     break;
 764                   match_first += incr;
 765                 }
 766               while (match_first >= left_lim && match_first <= right_lim);
 767               if (! fastmap[ch])
 768                 break;
 769             }
 770         }
 771
 772       /* Reconstruct the buffers so that the matcher can assume that
 773          the matching starts from the beginning of the buffer.  */
 774       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 775       if (BE (err != REG_NOERROR, 0))
 776         goto free_return;
 777 #ifdef RE_ENABLE_I18N
 778      /* Eliminate it when it is a component of a multibyte character
 779          and isn't the head of a multibyte character.  */
 780       if (sb || re_string_first_byte (&mctx.input, 0))
 781 #endif
 782         {
 783           /* It seems to be appropriate one, then use the matcher.  */
 784           /* We assume that the matching starts from 0.  */
 785           mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 786           match_last = check_matching (&mctx, fl_longest_match,
 787                                        range >= 0 ? &match_first : NULL);
 788           if (match_last != -1)
 789             {
 790               if (BE (match_last == -2, 0))
 791                 {
 792                   err = REG_ESPACE;
 793                   goto free_return;
 794                 }
 795               else
 796                 {
 797                   mctx.match_last = match_last;
 798                   if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 799                     {
 800                       re_dfastate_t *pstate = mctx.state_log[match_last];
 801                       mctx.last_node = check_halt_state_context (&mctx, pstate,
 802                                                                  match_last);
 803                     }
 804                   if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 805                       || dfa->nbackref)
 806                     {
 807                       err = prune_impossible_nodes (&mctx);
 808                       if (err == REG_NOERROR)
 809                         break;
 810                       if (BE (err != REG_NOMATCH, 0))
 811                         goto free_return;
 812                       match_last = -1;
 813                     }
 814                   else
 815                     break; /* We found a match.  */
 816                 }
 817             }
 818           match_ctx_clean (&mctx);
 819         }
 820       /* Update counter.  */
 821       match_first += incr;
 822       if (match_first < left_lim || right_lim < match_first)
 823         break;
 824     }
 825
 826   /* Set pmatch[] if we need.  */
 827   if (match_last != -1 && nmatch > 0)
 828     {
 829       int reg_idx;
 830
 831       /* Initialize registers.  */
 832       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 833         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 834
 835       /* Set the points where matching start/end.  */
 836       pmatch[0].rm_so = 0;
 837       pmatch[0].rm_eo = mctx.match_last;
 838
 839       if (!preg->no_sub && nmatch > 1)
 840         {
 841           err = set_regs (preg, &mctx, nmatch, pmatch,
 842                           dfa->has_plural_match && dfa->nbackref > 0);
 843           if (BE (err != REG_NOERROR, 0))
 844             goto free_return;
 845         }
 846
 847       /* At last, add the offset to the each registers, since we slided
 848          the buffers so that we could assume that the matching starts
 849          from 0.  */
 850       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 851         if (pmatch[reg_idx].rm_so != -1)
 852           {
 853 #ifdef RE_ENABLE_I18N
 854             if (BE (mctx.input.offsets_needed != 0, 0))
 855               {
 856                 if (pmatch[reg_idx].rm_so == mctx.input.valid_len)
 857                   pmatch[reg_idx].rm_so += mctx.input.valid_raw_len - mctx.input.valid_len;
 858                 else
 859                   pmatch[reg_idx].rm_so = mctx.input.offsets[pmatch[reg_idx].rm_so];
 860                 if (pmatch[reg_idx].rm_eo == mctx.input.valid_len)
 861                   pmatch[reg_idx].rm_eo += mctx.input.valid_raw_len - mctx.input.valid_len;
 862                 else
 863                   pmatch[reg_idx].rm_eo = mctx.input.offsets[pmatch[reg_idx].rm_eo];
 864               }
 865 #else
 866             assert (mctx.input.offsets_needed == 0);
 867 #endif
 868             pmatch[reg_idx].rm_so += match_first;
 869             pmatch[reg_idx].rm_eo += match_first;
 870           }
 871     }
 872   err = (match_last == -1) ? REG_NOMATCH : REG_NOERROR;
 873  free_return:
 874   re_free (mctx.state_log);
 875   if (dfa->nbackref)
 876     match_ctx_free (&mctx);
 877   re_string_destruct (&mctx.input);
 878   return err;
 879 }
 880
 881 static reg_errcode_t
 882 prune_impossible_nodes (mctx)
 883      re_match_context_t *mctx;
 884 {
 885   re_dfa_t *const dfa = mctx->dfa;
 886   int halt_node, match_last;
 887   reg_errcode_t ret;
 888   re_dfastate_t **sifted_states;
 889   re_dfastate_t **lim_states = NULL;
 890   re_sift_context_t sctx;
 891 #ifdef DEBUG
 892   assert (mctx->state_log != NULL);
 893 #endif
 894   match_last = mctx->match_last;
 895   halt_node = mctx->last_node;
 896   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 897   if (BE (sifted_states == NULL, 0))
 898     {
 899       ret = REG_ESPACE;
 900       goto free_return;
 901     }
 902   if (dfa->nbackref)
 903     {
 904       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 905       if (BE (lim_states == NULL, 0))
 906         {
 907           ret = REG_ESPACE;
 908           goto free_return;
 909         }
 910       while (1)
 911         {
 912           memset (lim_states, '\0',
 913                   sizeof (re_dfastate_t *) * (match_last + 1));
 914           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 915                          match_last);
 916           ret = sift_states_backward (mctx, &sctx);
 917           re_node_set_free (&sctx.limits);
 918           if (BE (ret != REG_NOERROR, 0))
 919               goto free_return;
 920           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 921             break;
 922           do
 923             {
 924               --match_last;
 925               if (match_last < 0)
 926                 {
 927                   ret = REG_NOMATCH;
 928                   goto free_return;
 929                 }
 930             } while (mctx->state_log[match_last] == NULL
 931                      || !mctx->state_log[match_last]->halt);
 932           halt_node = check_halt_state_context (mctx,
 933                                                 mctx->state_log[match_last],
 934                                                 match_last);
 935         }
 936       ret = merge_state_array (dfa, sifted_states, lim_states,
 937                                match_last + 1);
 938       re_free (lim_states);
 939       lim_states = NULL;
 940       if (BE (ret != REG_NOERROR, 0))
 941         goto free_return;
 942     }
 943   else
 944     {
 945       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
 946       ret = sift_states_backward (mctx, &sctx);
 947       re_node_set_free (&sctx.limits);
 948       if (BE (ret != REG_NOERROR, 0))
 949         goto free_return;
 950     }
 951   re_free (mctx->state_log);
 952   mctx->state_log = sifted_states;
 953   sifted_states = NULL;
 954   mctx->last_node = halt_node;
 955   mctx->match_last = match_last;
 956   ret = REG_NOERROR;
 957  free_return:
 958   re_free (sifted_states);
 959   re_free (lim_states);
 960   return ret;
 961 }
 962
 963 /* Acquire an initial state and return it.
 964    We must select appropriate initial state depending on the context,
 965    since initial states may have constraints like "\<", "^", etc..  */
 966
 967 static inline re_dfastate_t *
 968 acquire_init_state_context (err, mctx, idx)
 969      reg_errcode_t *err;
 970      const re_match_context_t *mctx;
 971      int idx;
 972 {
 973   re_dfa_t *const dfa = mctx->dfa;
 974   if (dfa->init_state->has_constraint)
 975     {
 976       unsigned int context;
 977       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
 978       if (IS_WORD_CONTEXT (context))
 979         return dfa->init_state_word;
 980       else if (IS_ORDINARY_CONTEXT (context))
 981         return dfa->init_state;
 982       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
 983         return dfa->init_state_begbuf;
 984       else if (IS_NEWLINE_CONTEXT (context))
 985         return dfa->init_state_nl;
 986       else if (IS_BEGBUF_CONTEXT (context))
 987         {
 988           /* It is relatively rare case, then calculate on demand.  */
 989           return re_acquire_state_context (err, dfa,
 990                                            dfa->init_state->entrance_nodes,
 991                                            context);
 992         }
 993       else
 994         /* Must not happen?  */
 995         return dfa->init_state;
 996     }
 997   else
 998     return dfa->init_state;
 999 }
1000
1001 /* Check whether the regular expression match input string INPUT or not,
1002    and return the index where the matching end, return -1 if not match,
1003    or return -2 in case of an error.
1004    FL_LONGEST_MATCH means we want the POSIX longest matching.
1005    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1006    next place where we may want to try matching.
1007    Note that the matcher assume that the maching starts from the current
1008    index of the buffer.  */
1009
1010 static int
1011 check_matching (mctx, fl_longest_match, p_match_first)
1012     re_match_context_t *mctx;
1013     int fl_longest_match;
1014     int *p_match_first;
1015 {
1016   re_dfa_t *const dfa = mctx->dfa;
1017   reg_errcode_t err;
1018   int match = 0;
1019   int match_last = -1;
1020   int cur_str_idx = re_string_cur_idx (&mctx->input);
1021   re_dfastate_t *cur_state;
1022   int at_init_state = p_match_first != NULL;
1023   int next_start_idx = cur_str_idx;
1024
1025   err = REG_NOERROR;
1026   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1027   /* An initial state must not be NULL (invalid).  */
1028   if (BE (cur_state == NULL, 0))
1029     {
1030       assert (err == REG_ESPACE);
1031       return -2;
1032     }
1033
1034   if (mctx->state_log != NULL)
1035     {
1036       mctx->state_log[cur_str_idx] = cur_state;
1037
1038       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1039          later.  E.g. Processing back references.  */
1040       if (BE (dfa->nbackref, 0))
1041         {
1042           at_init_state = 0;
1043           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1044           if (BE (err != REG_NOERROR, 0))
1045             return err;
1046
1047           if (cur_state->has_backref)
1048             {
1049               err = transit_state_bkref (mctx, &cur_state->nodes);
1050               if (BE (err != REG_NOERROR, 0))
1051                 return err;
1052             }
1053         }
1054     }
1055
1056   /* If the RE accepts NULL string.  */
1057   if (BE (cur_state->halt, 0))
1058     {
1059       if (!cur_state->has_constraint
1060           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1061         {
1062           if (!fl_longest_match)
1063             return cur_str_idx;
1064           else
1065             {
1066               match_last = cur_str_idx;
1067               match = 1;
1068             }
1069         }
1070     }
1071
1072   while (!re_string_eoi (&mctx->input))
1073     {
1074       re_dfastate_t *old_state = cur_state;
1075       cur_state = transit_state (&err, mctx, cur_state);
1076       if (mctx->state_log != NULL)
1077         cur_state = merge_state_with_log (&err, mctx, cur_state);
1078
1079       if (cur_state == NULL)
1080         {
1081           /* Reached the invalid state or an error.  Try to recover a valid
1082              state using the state log, if available and if we have not
1083              already found a valid (even if not the longest) match.  */
1084           if (BE (err != REG_NOERROR, 0))
1085             return -2;
1086
1087           if (mctx->state_log == NULL
1088               || (match && !fl_longest_match)
1089               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1090             break;
1091         }
1092
1093       if (at_init_state)
1094         {
1095           if (old_state == cur_state)
1096             next_start_idx = re_string_cur_idx (&mctx->input);
1097           else
1098             at_init_state = 0;
1099         }
1100
1101       if (cur_state->halt)
1102         {
1103           /* Reached a halt state.
1104              Check the halt state can satisfy the current context.  */
1105           if (!cur_state->has_constraint
1106               || check_halt_state_context (mctx, cur_state,
1107                                            re_string_cur_idx (&mctx->input)))
1108             {
1109               /* We found an appropriate halt state.  */
1110               match_last = re_string_cur_idx (&mctx->input);
1111               match = 1;
1112               if (!fl_longest_match)
1113                 break;
1114             }
1115         }
1116    }
1117
1118   if (match_last == -1 && p_match_first)
1119     *p_match_first += next_start_idx;
1120
1121   return match_last;
1122 }
1123
1124 /* Check NODE match the current context.  */
1125
1126 static int check_halt_node_context (dfa, node, context)
1127     const re_dfa_t *dfa;
1128     int node;
1129     unsigned int context;
1130 {
1131   re_token_type_t type = dfa->nodes[node].type;
1132   unsigned int constraint = dfa->nodes[node].constraint;
1133   if (type != END_OF_RE)
1134     return 0;
1135   if (!constraint)
1136     return 1;
1137   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1138     return 0;
1139   return 1;
1140 }
1141
1142 /* Check the halt state STATE match the current context.
1143    Return 0 if not match, if the node, STATE has, is a halt node and
1144    match the context, return the node.  */
1145
1146 static int
1147 check_halt_state_context (mctx, state, idx)
1148     const re_match_context_t *mctx;
1149     const re_dfastate_t *state;
1150     int idx;
1151 {
1152   int i;
1153   unsigned int context;
1154 #ifdef DEBUG
1155   assert (state->halt);
1156 #endif
1157   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1158   for (i = 0; i < state->nodes.nelem; ++i)
1159     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1160       return state->nodes.elems[i];
1161   return 0;
1162 }
1163
1164 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1165    corresponding to the DFA).
1166    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1167    of errors.  */
1168
1169 static int
1170 proceed_next_node (mctx, nregs, regs, pidx, node, eps_via_nodes, fs)
1171     const re_match_context_t *mctx;
1172     regmatch_t *regs;
1173     int nregs, *pidx, node;
1174     re_node_set *eps_via_nodes;
1175     struct re_fail_stack_t *fs;
1176 {
1177   re_dfa_t *const dfa = mctx->dfa;
1178   int i, err, dest_node;
1179   dest_node = -1;
1180   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1181     {
1182       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1183       int ndest, dest_nodes[2];
1184       err = re_node_set_insert (eps_via_nodes, node);
1185       if (BE (err < 0, 0))
1186         return -2;
1187       /* Pick up valid destinations.  */
1188       for (ndest = 0, i = 0; i < dfa->edests[node].nelem; ++i)
1189         {
1190           int candidate = dfa->edests[node].elems[i];
1191           if (!re_node_set_contains (cur_nodes, candidate))
1192             continue;
1193           dest_nodes[0] = (ndest == 0) ? candidate : dest_nodes[0];
1194           dest_nodes[1] = (ndest == 1) ? candidate : dest_nodes[1];
1195           ++ndest;
1196         }
1197       if (ndest <= 1)
1198         return ndest == 0 ? -1 : (ndest == 1 ? dest_nodes[0] : 0);
1199       /* In order to avoid infinite loop like "(a*)*".  */
1200       if (re_node_set_contains (eps_via_nodes, dest_nodes[0]))
1201         return dest_nodes[1];
1202       if (fs != NULL
1203           && push_fail_stack (fs, *pidx, dest_nodes, nregs, regs,
1204                               eps_via_nodes))
1205         return -2;
1206       return dest_nodes[0];
1207     }
1208   else
1209     {
1210       int naccepted = 0;
1211       re_token_type_t type = dfa->nodes[node].type;
1212
1213 #ifdef RE_ENABLE_I18N
1214       if (ACCEPT_MB_NODE (type))
1215         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1216       else
1217 #endif /* RE_ENABLE_I18N */
1218       if (type == OP_BACK_REF)
1219         {
1220           int subexp_idx = dfa->nodes[node].opr.idx;
1221           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1222           if (fs != NULL)
1223             {
1224               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1225                 return -1;
1226               else if (naccepted)
1227                 {
1228                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1229                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1230                               naccepted) != 0)
1231                     return -1;
1232                 }
1233             }
1234
1235           if (naccepted == 0)
1236             {
1237               err = re_node_set_insert (eps_via_nodes, node);
1238               if (BE (err < 0, 0))
1239                 return -2;
1240               dest_node = dfa->edests[node].elems[0];
1241               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1242                                         dest_node))
1243                 return dest_node;
1244             }
1245         }
1246
1247       if (naccepted != 0
1248           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1249         {
1250           dest_node = dfa->nexts[node];
1251           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1252           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1253                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1254                                                dest_node)))
1255             return -1;
1256           re_node_set_empty (eps_via_nodes);
1257           return dest_node;
1258         }
1259     }
1260   return -1;
1261 }
1262
1263 static reg_errcode_t
1264 push_fail_stack (fs, str_idx, dests, nregs, regs, eps_via_nodes)
1265      struct re_fail_stack_t *fs;
1266      int str_idx, *dests, nregs;
1267      regmatch_t *regs;
1268      re_node_set *eps_via_nodes;
1269 {
1270   reg_errcode_t err;
1271   int num = fs->num++;
1272   if (fs->num == fs->alloc)
1273     {
1274       struct re_fail_stack_ent_t *new_array;
1275       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1276                                        * fs->alloc * 2));
1277       if (new_array == NULL)
1278         return REG_ESPACE;
1279       fs->alloc *= 2;
1280       fs->stack = new_array;
1281     }
1282   fs->stack[num].idx = str_idx;
1283   fs->stack[num].node = dests[1];
1284   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1285   if (fs->stack[num].regs == NULL)
1286     return REG_ESPACE;
1287   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1288   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1289   return err;
1290 }
1291
1292 static int
1293 pop_fail_stack (fs, pidx, nregs, regs, eps_via_nodes)
1294      struct re_fail_stack_t *fs;
1295      int *pidx, nregs;
1296      regmatch_t *regs;
1297      re_node_set *eps_via_nodes;
1298 {
1299   int num = --fs->num;
1300   assert (num >= 0);
1301   *pidx = fs->stack[num].idx;
1302   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1303   re_node_set_free (eps_via_nodes);
1304   re_free (fs->stack[num].regs);
1305   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1306   return fs->stack[num].node;
1307 }
1308
1309 /* Set the positions where the subexpressions are starts/ends to registers
1310    PMATCH.
1311    Note: We assume that pmatch[0] is already set, and
1312    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1313
1314 static reg_errcode_t
1315 set_regs (preg, mctx, nmatch, pmatch, fl_backtrack)
1316      const regex_t *preg;
1317      const re_match_context_t *mctx;
1318      size_t nmatch;
1319      regmatch_t *pmatch;
1320      int fl_backtrack;
1321 {
1322   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1323   int idx, cur_node, real_nmatch;
1324   re_node_set eps_via_nodes;
1325   struct re_fail_stack_t *fs;
1326   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1327   regmatch_t *prev_idx_match;
1328
1329 #ifdef DEBUG
1330   assert (nmatch > 1);
1331   assert (mctx->state_log != NULL);
1332 #endif
1333   if (fl_backtrack)
1334     {
1335       fs = &fs_body;
1336       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1337       if (fs->stack == NULL)
1338         return REG_ESPACE;
1339     }
1340   else
1341     fs = NULL;
1342
1343   cur_node = dfa->init_node;
1344   real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1345   re_node_set_init_empty (&eps_via_nodes);
1346
1347   prev_idx_match = (regmatch_t *) alloca (sizeof (regmatch_t) * real_nmatch);
1348   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * real_nmatch);
1349
1350   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1351     {
1352       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, real_nmatch);
1353
1354       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1355         {
1356           int reg_idx;
1357           if (fs)
1358             {
1359               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1360                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1361                   break;
1362               if (reg_idx == nmatch)
1363                 {
1364                   re_node_set_free (&eps_via_nodes);
1365                   return free_fail_stack_return (fs);
1366                 }
1367               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1368                                          &eps_via_nodes);
1369             }
1370           else
1371             {
1372               re_node_set_free (&eps_via_nodes);
1373               return REG_NOERROR;
1374             }
1375         }
1376
1377       /* Proceed to next node.  */
1378       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1379                                     &eps_via_nodes, fs);
1380
1381       if (BE (cur_node < 0, 0))
1382         {
1383           if (BE (cur_node == -2, 0))
1384             {
1385               re_node_set_free (&eps_via_nodes);
1386               free_fail_stack_return (fs);
1387               return REG_ESPACE;
1388             }
1389           if (fs)
1390             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1391                                        &eps_via_nodes);
1392           else
1393             {
1394               re_node_set_free (&eps_via_nodes);
1395               return REG_NOMATCH;
1396             }
1397         }
1398     }
1399   re_node_set_free (&eps_via_nodes);
1400   return free_fail_stack_return (fs);
1401 }
1402
1403 static reg_errcode_t
1404 free_fail_stack_return (fs)
1405      struct re_fail_stack_t *fs;
1406 {
1407   if (fs)
1408     {
1409       int fs_idx;
1410       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1411         {
1412           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1413           re_free (fs->stack[fs_idx].regs);
1414         }
1415       re_free (fs->stack);
1416     }
1417   return REG_NOERROR;
1418 }
1419
1420 static void
1421 update_regs (dfa, pmatch, prev_idx_match, cur_node, cur_idx, nmatch)
1422      re_dfa_t *dfa;
1423      regmatch_t *pmatch, *prev_idx_match;
1424      int cur_node, cur_idx, nmatch;
1425 {
1426   int type = dfa->nodes[cur_node].type;
1427   if (type == OP_OPEN_SUBEXP)
1428     {
1429       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1430
1431       /* We are at the first node of this sub expression.  */
1432       if (reg_num < nmatch)
1433         {
1434           pmatch[reg_num].rm_so = cur_idx;
1435           pmatch[reg_num].rm_eo = -1;
1436         }
1437     }
1438   else if (type == OP_CLOSE_SUBEXP)
1439     {
1440       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1441       if (reg_num < nmatch)
1442         {
1443           /* We are at the last node of this sub expression.  */
1444           if (pmatch[reg_num].rm_so < cur_idx)
1445             {
1446               pmatch[reg_num].rm_eo = cur_idx;
1447               /* This is a non-empty match or we are not inside an optional
1448                  subexpression.  Accept this right away.  */
1449               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1450             }
1451           else
1452             {
1453               if (dfa->nodes[cur_node].opt_subexp
1454                   && prev_idx_match[reg_num].rm_so != -1)
1455                 /* We transited through an empty match for an optional
1456                    subexpression, like (a?)*, and this is not the subexp's
1457                    first match.  Copy back the old content of the registers
1458                    so that matches of an inner subexpression are undone as
1459                    well, like in ((a?))*.  */
1460                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1461               else
1462                 /* We completed a subexpression, but it may be part of
1463                    an optional one, so do not update PREV_IDX_MATCH.  */
1464                 pmatch[reg_num].rm_eo = cur_idx;
1465             }
1466         }
1467     }
1468 }
1469
1470 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1471    and sift the nodes in each states according to the following rules.
1472    Updated state_log will be wrote to STATE_LOG.
1473
1474    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1475      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1476         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1477         the LAST_NODE, we throw away the node `a'.
1478      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1479         string `s' and transit to `b':
1480         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1481            away the node `a'.
1482         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1483             thrown away, we throw away the node `a'.
1484      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1485         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1486            node `a'.
1487         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1488             we throw away the node `a'.  */
1489
1490 #define STATE_NODE_CONTAINS(state,node) \
1491   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1492
1493 static reg_errcode_t
1494 sift_states_backward (mctx, sctx)
1495      re_match_context_t *mctx;
1496      re_sift_context_t *sctx;
1497 {
1498   reg_errcode_t err;
1499   int null_cnt = 0;
1500   int str_idx = sctx->last_str_idx;
1501   re_node_set cur_dest;
1502
1503 #ifdef DEBUG
1504   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1505 #endif
1506
1507   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1508      transit to the last_node and the last_node itself.  */
1509   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1510   if (BE (err != REG_NOERROR, 0))
1511     return err;
1512   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1513   if (BE (err != REG_NOERROR, 0))
1514     goto free_return;
1515
1516   /* Then check each states in the state_log.  */
1517   while (str_idx > 0)
1518     {
1519       /* Update counters.  */
1520       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1521       if (null_cnt > mctx->max_mb_elem_len)
1522         {
1523           memset (sctx->sifted_states, '\0',
1524                   sizeof (re_dfastate_t *) * str_idx);
1525           re_node_set_free (&cur_dest);
1526           return REG_NOERROR;
1527         }
1528       re_node_set_empty (&cur_dest);
1529       --str_idx;
1530
1531       if (mctx->state_log[str_idx])
1532         {
1533           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1534           if (BE (err != REG_NOERROR, 0))
1535             goto free_return;
1536         }
1537
1538       /* Add all the nodes which satisfy the following conditions:
1539          - It can epsilon transit to a node in CUR_DEST.
1540          - It is in CUR_SRC.
1541          And update state_log.  */
1542       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1543       if (BE (err != REG_NOERROR, 0))
1544         goto free_return;
1545     }
1546   err = REG_NOERROR;
1547  free_return:
1548   re_node_set_free (&cur_dest);
1549   return err;
1550 }
1551
1552 static reg_errcode_t
1553 build_sifted_states (mctx, sctx, str_idx, cur_dest)
1554      re_match_context_t *mctx;
1555      re_sift_context_t *sctx;
1556      int str_idx;
1557      re_node_set *cur_dest;
1558 {
1559   re_dfa_t *const dfa = mctx->dfa;
1560   re_node_set *cur_src = &mctx->state_log[str_idx]->nodes;
1561   int i;
1562
1563   /* Then build the next sifted state.
1564      We build the next sifted state on `cur_dest', and update
1565      `sifted_states[str_idx]' with `cur_dest'.
1566      Note:
1567      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1568      `cur_src' points the node_set of the old `state_log[str_idx]'.  */
1569   for (i = 0; i < cur_src->nelem; i++)
1570     {
1571       int prev_node = cur_src->elems[i];
1572       int naccepted = 0;
1573       re_token_type_t type = dfa->nodes[prev_node].type;
1574       int ret;
1575
1576       if (IS_EPSILON_NODE (type))
1577         continue;
1578 #ifdef RE_ENABLE_I18N
1579       /* If the node may accept `multi byte'.  */
1580       if (ACCEPT_MB_NODE (type))
1581         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1582                                          str_idx, sctx->last_str_idx);
1583 #endif /* RE_ENABLE_I18N */
1584
1585       /* We don't check backreferences here.
1586          See update_cur_sifted_state().  */
1587       if (!naccepted
1588           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1589           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1590                                   dfa->nexts[prev_node]))
1591         naccepted = 1;
1592
1593       if (naccepted == 0)
1594         continue;
1595
1596       if (sctx->limits.nelem)
1597         {
1598           int to_idx = str_idx + naccepted;
1599           if (check_dst_limits (mctx, &sctx->limits,
1600                                 dfa->nexts[prev_node], to_idx,
1601                                 prev_node, str_idx))
1602             continue;
1603         }
1604       ret = re_node_set_insert (cur_dest, prev_node);
1605       if (BE (ret == -1, 0))
1606         return REG_ESPACE;
1607     }
1608
1609   return REG_NOERROR;
1610 }
1611
1612 /* Helper functions.  */
1613
1614 static reg_errcode_t
1615 clean_state_log_if_needed (mctx, next_state_log_idx)
1616     re_match_context_t *mctx;
1617     int next_state_log_idx;
1618 {
1619   int top = mctx->state_log_top;
1620
1621   if (next_state_log_idx >= mctx->input.bufs_len
1622       || (next_state_log_idx >= mctx->input.valid_len
1623           && mctx->input.valid_len < mctx->input.len))
1624     {
1625       reg_errcode_t err;
1626       err = extend_buffers (mctx);
1627       if (BE (err != REG_NOERROR, 0))
1628         return err;
1629     }
1630
1631   if (top < next_state_log_idx)
1632     {
1633       memset (mctx->state_log + top + 1, '\0',
1634               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1635       mctx->state_log_top = next_state_log_idx;
1636     }
1637   return REG_NOERROR;
1638 }
1639
1640 static reg_errcode_t
1641 merge_state_array (dfa, dst, src, num)
1642      re_dfa_t *dfa;
1643      re_dfastate_t **dst;
1644      re_dfastate_t **src;
1645      int num;
1646 {
1647   int st_idx;
1648   reg_errcode_t err;
1649   for (st_idx = 0; st_idx < num; ++st_idx)
1650     {
1651       if (dst[st_idx] == NULL)
1652         dst[st_idx] = src[st_idx];
1653       else if (src[st_idx] != NULL)
1654         {
1655           re_node_set merged_set;
1656           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1657                                         &src[st_idx]->nodes);
1658           if (BE (err != REG_NOERROR, 0))
1659             return err;
1660           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1661           re_node_set_free (&merged_set);
1662           if (BE (err != REG_NOERROR, 0))
1663             return err;
1664         }
1665     }
1666   return REG_NOERROR;
1667 }
1668
1669 static reg_errcode_t
1670 update_cur_sifted_state (mctx, sctx, str_idx, dest_nodes)
1671      re_match_context_t *mctx;
1672      re_sift_context_t *sctx;
1673      int str_idx;
1674      re_node_set *dest_nodes;
1675 {
1676   re_dfa_t *const dfa = mctx->dfa;
1677   reg_errcode_t err;
1678   const re_node_set *candidates;
1679   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1680                 : &mctx->state_log[str_idx]->nodes);
1681
1682   if (dest_nodes->nelem == 0)
1683     sctx->sifted_states[str_idx] = NULL;
1684   else
1685     {
1686       if (candidates)
1687         {
1688           /* At first, add the nodes which can epsilon transit to a node in
1689              DEST_NODE.  */
1690           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1691           if (BE (err != REG_NOERROR, 0))
1692             return err;
1693
1694           /* Then, check the limitations in the current sift_context.  */
1695           if (sctx->limits.nelem)
1696             {
1697               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1698                                          mctx->bkref_ents, str_idx);
1699               if (BE (err != REG_NOERROR, 0))
1700                 return err;
1701             }
1702         }
1703
1704       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1705       if (BE (err != REG_NOERROR, 0))
1706         return err;
1707     }
1708
1709   if (candidates && mctx->state_log[str_idx]->has_backref)
1710     {
1711       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1712       if (BE (err != REG_NOERROR, 0))
1713         return err;
1714     }
1715   return REG_NOERROR;
1716 }
1717
1718 static reg_errcode_t
1719 add_epsilon_src_nodes (dfa, dest_nodes, candidates)
1720      re_dfa_t *dfa;
1721      re_node_set *dest_nodes;
1722      const re_node_set *candidates;
1723 {
1724   reg_errcode_t err;
1725   int src_idx;
1726   re_node_set src_copy;
1727
1728   err = re_node_set_init_copy (&src_copy, dest_nodes);
1729   if (BE (err != REG_NOERROR, 0))
1730     return err;
1731   for (src_idx = 0; src_idx < src_copy.nelem; ++src_idx)
1732     {
1733       err = re_node_set_add_intersect (dest_nodes, candidates,
1734                                        dfa->inveclosures
1735                                        + src_copy.elems[src_idx]);
1736       if (BE (err != REG_NOERROR, 0))
1737         {
1738           re_node_set_free (&src_copy);
1739           return err;
1740         }
1741     }
1742   re_node_set_free (&src_copy);
1743   return REG_NOERROR;
1744 }
1745
1746 static reg_errcode_t
1747 sub_epsilon_src_nodes (dfa, node, dest_nodes, candidates)
1748      re_dfa_t *dfa;
1749      int node;
1750      re_node_set *dest_nodes;
1751      const re_node_set *candidates;
1752 {
1753     int ecl_idx;
1754     reg_errcode_t err;
1755     re_node_set *inv_eclosure = dfa->inveclosures + node;
1756     re_node_set except_nodes;
1757     re_node_set_init_empty (&except_nodes);
1758     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1759       {
1760         int cur_node = inv_eclosure->elems[ecl_idx];
1761         if (cur_node == node)
1762           continue;
1763         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1764           {
1765             int edst1 = dfa->edests[cur_node].elems[0];
1766             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1767                          ? dfa->edests[cur_node].elems[1] : -1);
1768             if ((!re_node_set_contains (inv_eclosure, edst1)
1769                  && re_node_set_contains (dest_nodes, edst1))
1770                 || (edst2 > 0
1771                     && !re_node_set_contains (inv_eclosure, edst2)
1772                     && re_node_set_contains (dest_nodes, edst2)))
1773               {
1774                 err = re_node_set_add_intersect (&except_nodes, candidates,
1775                                                  dfa->inveclosures + cur_node);
1776                 if (BE (err != REG_NOERROR, 0))
1777                   {
1778                     re_node_set_free (&except_nodes);
1779                     return err;
1780                   }
1781               }
1782           }
1783       }
1784     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1785       {
1786         int cur_node = inv_eclosure->elems[ecl_idx];
1787         if (!re_node_set_contains (&except_nodes, cur_node))
1788           {
1789             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1790             re_node_set_remove_at (dest_nodes, idx);
1791           }
1792       }
1793     re_node_set_free (&except_nodes);
1794     return REG_NOERROR;
1795 }
1796
1797 static int
1798 check_dst_limits (mctx, limits, dst_node, dst_idx, src_node, src_idx)
1799      re_match_context_t *mctx;
1800      re_node_set *limits;
1801      int dst_node, dst_idx, src_node, src_idx;
1802 {
1803   re_dfa_t *const dfa = mctx->dfa;
1804   int lim_idx, src_pos, dst_pos;
1805
1806   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1807   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1808   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1809     {
1810       int subexp_idx;
1811       struct re_backref_cache_entry *ent;
1812       ent = mctx->bkref_ents + limits->elems[lim_idx];
1813       subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1814
1815       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1816                                            subexp_idx, dst_node, dst_idx,
1817                                            dst_bkref_idx);
1818       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1819                                            subexp_idx, src_node, src_idx,
1820                                            src_bkref_idx);
1821
1822       /* In case of:
1823          <src> <dst> ( <subexp> )
1824          ( <subexp> ) <src> <dst>
1825          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1826       if (src_pos == dst_pos)
1827         continue; /* This is unrelated limitation.  */
1828       else
1829         return 1;
1830     }
1831   return 0;
1832 }
1833
1834 static int
1835 check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx, from_node, bkref_idx)
1836      re_match_context_t *mctx;
1837      int boundaries, subexp_idx, from_node, bkref_idx;
1838 {
1839   re_dfa_t *const dfa = mctx->dfa;
1840   re_node_set *eclosures = dfa->eclosures + from_node;
1841   int node_idx;
1842
1843   /* Else, we are on the boundary: examine the nodes on the epsilon
1844      closure.  */
1845   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1846     {
1847       int node = eclosures->elems[node_idx];
1848       switch (dfa->nodes[node].type)
1849         {
1850         case OP_BACK_REF:
1851           {
1852             struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1853             do
1854               {
1855                 int dst, cpos;
1856
1857                 if (ent->node != node || ent->subexp_from != ent->subexp_to)
1858                   continue;
1859
1860                 /* Recurse trying to reach the OP_OPEN_SUBEXP and
1861                    OP_CLOSE_SUBEXP cases below.  But, if the
1862                    destination node is the same node as the source
1863                    node, don't recurse because it would cause an
1864                    infinite loop: a regex that exhibits this behavior
1865                    is ()\1*\1*  */
1866                 dst = dfa->edests[node].elems[0];
1867                 if (dst == from_node)
1868                   {
1869                     if (boundaries & 1)
1870                       return -1;
1871                     else /* if (boundaries & 2) */
1872                       return 0;
1873                   }
1874
1875                 cpos = check_dst_limits_calc_pos_1 (mctx, boundaries,
1876                                                     subexp_idx, dst, bkref_idx);
1877
1878                 if (cpos == -1 && (boundaries & 1))
1879                   return -1;
1880
1881                 if (cpos == 0 /* && (boundaries & 2) */)
1882                   return 0;
1883               }
1884             while (ent++->more);
1885             break;
1886           }
1887
1888         case OP_OPEN_SUBEXP:
1889           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1890             return -1;
1891           break;
1892
1893         case OP_CLOSE_SUBEXP:
1894           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1895             return 0;
1896           break;
1897
1898         default:
1899             break;
1900         }
1901     }
1902
1903   return (boundaries & 2) ? 1 : 0;
1904 }
1905
1906 static int
1907 check_dst_limits_calc_pos (mctx, limit, subexp_idx, from_node, str_idx, bkref_idx)
1908      re_match_context_t *mctx;
1909      int limit, subexp_idx, from_node, str_idx, bkref_idx;
1910 {
1911   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1912   int boundaries;
1913
1914   /* If we are outside the range of the subexpression, return -1 or 1.  */
1915   if (str_idx < lim->subexp_from)
1916     return -1;
1917
1918   if (lim->subexp_to < str_idx)
1919     return 1;
1920
1921   /* If we are within the subexpression, return 0.  */
1922   boundaries = (str_idx == lim->subexp_from);
1923   boundaries |= (str_idx == lim->subexp_to) << 1;
1924   if (boundaries == 0)
1925     return 0;
1926
1927   /* Else, examine epsilon closure.  */
1928   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1929                                       from_node, bkref_idx);
1930 }
1931
1932 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1933    which are against limitations from DEST_NODES. */
1934
1935 static reg_errcode_t
1936 check_subexp_limits (dfa, dest_nodes, candidates, limits, bkref_ents, str_idx)
1937      re_dfa_t *dfa;
1938      re_node_set *dest_nodes;
1939      const re_node_set *candidates;
1940      re_node_set *limits;
1941      struct re_backref_cache_entry *bkref_ents;
1942      int str_idx;
1943 {
1944   reg_errcode_t err;
1945   int node_idx, lim_idx;
1946
1947   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1948     {
1949       int subexp_idx;
1950       struct re_backref_cache_entry *ent;
1951       ent = bkref_ents + limits->elems[lim_idx];
1952
1953       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
1954         continue; /* This is unrelated limitation.  */
1955
1956       subexp_idx = dfa->nodes[ent->node].opr.idx - 1;
1957       if (ent->subexp_to == str_idx)
1958         {
1959           int ops_node = -1;
1960           int cls_node = -1;
1961           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1962             {
1963               int node = dest_nodes->elems[node_idx];
1964               re_token_type_t type = dfa->nodes[node].type;
1965               if (type == OP_OPEN_SUBEXP
1966                   && subexp_idx == dfa->nodes[node].opr.idx)
1967                 ops_node = node;
1968               else if (type == OP_CLOSE_SUBEXP
1969                        && subexp_idx == dfa->nodes[node].opr.idx)
1970                 cls_node = node;
1971             }
1972
1973           /* Check the limitation of the open subexpression.  */
1974           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
1975           if (ops_node >= 0)
1976             {
1977               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
1978                                            candidates);
1979               if (BE (err != REG_NOERROR, 0))
1980                 return err;
1981             }
1982
1983           /* Check the limitation of the close subexpression.  */
1984           if (cls_node >= 0)
1985             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
1986               {
1987                 int node = dest_nodes->elems[node_idx];
1988                 if (!re_node_set_contains (dfa->inveclosures + node,
1989                                            cls_node)
1990                     && !re_node_set_contains (dfa->eclosures + node,
1991                                               cls_node))
1992                   {
1993                     /* It is against this limitation.
1994                        Remove it form the current sifted state.  */
1995                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
1996                                                  candidates);
1997                     if (BE (err != REG_NOERROR, 0))
1998                       return err;
1999                     --node_idx;
2000                   }
2001               }
2002         }
2003       else /* (ent->subexp_to != str_idx)  */
2004         {
2005           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2006             {
2007               int node = dest_nodes->elems[node_idx];
2008               re_token_type_t type = dfa->nodes[node].type;
2009               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2010                 {
2011                   if (subexp_idx != dfa->nodes[node].opr.idx)
2012                     continue;
2013                   if ((type == OP_CLOSE_SUBEXP && ent->subexp_to != str_idx)
2014                       || (type == OP_OPEN_SUBEXP))
2015                     {
2016                       /* It is against this limitation.
2017                          Remove it form the current sifted state.  */
2018                       err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2019                                                    candidates);
2020                       if (BE (err != REG_NOERROR, 0))
2021                         return err;
2022                     }
2023                 }
2024             }
2025         }
2026     }
2027   return REG_NOERROR;
2028 }
2029
2030 static reg_errcode_t
2031 sift_states_bkref (mctx, sctx, str_idx, candidates)
2032      re_match_context_t *mctx;
2033      re_sift_context_t *sctx;
2034      int str_idx;
2035      const re_node_set *candidates;
2036 {
2037   re_dfa_t *const dfa = mctx->dfa;
2038   reg_errcode_t err;
2039   int node_idx, node;
2040   re_sift_context_t local_sctx;
2041   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2042
2043   if (first_idx == -1)
2044     return REG_NOERROR;
2045
2046   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2047
2048   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2049     {
2050       int enabled_idx;
2051       re_token_type_t type;
2052       struct re_backref_cache_entry *entry;
2053       node = candidates->elems[node_idx];
2054       type = dfa->nodes[node].type;
2055       /* Avoid infinite loop for the REs like "()\1+".  */
2056       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2057         continue;
2058       if (type != OP_BACK_REF)
2059         continue;
2060
2061       entry = mctx->bkref_ents + first_idx;
2062       enabled_idx = first_idx;
2063       do
2064         {
2065           int subexp_len, to_idx, dst_node;
2066           re_dfastate_t *cur_state;
2067
2068           if (entry->node != node)
2069             continue;
2070           subexp_len = entry->subexp_to - entry->subexp_from;
2071           to_idx = str_idx + subexp_len;
2072           dst_node = (subexp_len ? dfa->nexts[node]
2073                       : dfa->edests[node].elems[0]);
2074
2075           if (to_idx > sctx->last_str_idx
2076               || sctx->sifted_states[to_idx] == NULL
2077               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2078               || check_dst_limits (mctx, &sctx->limits, node,
2079                                    str_idx, dst_node, to_idx))
2080             continue;
2081
2082           if (local_sctx.sifted_states == NULL)
2083             {
2084               local_sctx = *sctx;
2085               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2086               if (BE (err != REG_NOERROR, 0))
2087                 goto free_return;
2088             }
2089           local_sctx.last_node = node;
2090           local_sctx.last_str_idx = str_idx;
2091           err = re_node_set_insert (&local_sctx.limits, enabled_idx);
2092           if (BE (err < 0, 0))
2093             {
2094               err = REG_ESPACE;
2095               goto free_return;
2096             }
2097           cur_state = local_sctx.sifted_states[str_idx];
2098           err = sift_states_backward (mctx, &local_sctx);
2099           if (BE (err != REG_NOERROR, 0))
2100             goto free_return;
2101           if (sctx->limited_states != NULL)
2102             {
2103               err = merge_state_array (dfa, sctx->limited_states,
2104                                        local_sctx.sifted_states,
2105                                        str_idx + 1);
2106               if (BE (err != REG_NOERROR, 0))
2107                 goto free_return;
2108             }
2109           local_sctx.sifted_states[str_idx] = cur_state;
2110           re_node_set_remove (&local_sctx.limits, enabled_idx);
2111
2112           /* mctx->bkref_ents may have changed, reload the pointer.  */
2113           entry = mctx->bkref_ents + enabled_idx;
2114         }
2115       while (enabled_idx++, entry++->more);
2116     }
2117   err = REG_NOERROR;
2118  free_return:
2119   if (local_sctx.sifted_states != NULL)
2120     {
2121       re_node_set_free (&local_sctx.limits);
2122     }
2123
2124   return err;
2125 }
2126
2127
2128 #ifdef RE_ENABLE_I18N
2129 static int
2130 sift_states_iter_mb (mctx, sctx, node_idx, str_idx, max_str_idx)
2131     const re_match_context_t *mctx;
2132     re_sift_context_t *sctx;
2133     int node_idx, str_idx, max_str_idx;
2134 {
2135   re_dfa_t *const dfa = mctx->dfa;
2136   int naccepted;
2137   /* Check the node can accept `multi byte'.  */
2138   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2139   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2140       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2141                             dfa->nexts[node_idx]))
2142     /* The node can't accept the `multi byte', or the
2143        destination was already thrown away, then the node
2144        could't accept the current input `multi byte'.   */
2145     naccepted = 0;
2146   /* Otherwise, it is sure that the node could accept
2147      `naccepted' bytes input.  */
2148   return naccepted;
2149 }
2150 #endif /* RE_ENABLE_I18N */
2151
2152 \f
2153 /* Functions for state transition.  */
2154
2155 /* Return the next state to which the current state STATE will transit by
2156    accepting the current input byte, and update STATE_LOG if necessary.
2157    If STATE can accept a multibyte char/collating element/back reference
2158    update the destination of STATE_LOG.  */
2159
2160 static re_dfastate_t *
2161 transit_state (err, mctx, state)
2162      reg_errcode_t *err;
2163      re_match_context_t *mctx;
2164      re_dfastate_t *state;
2165 {
2166   re_dfa_t *const dfa = mctx->dfa;
2167   re_dfastate_t **trtable;
2168   unsigned char ch;
2169
2170   if (re_string_cur_idx (&mctx->input) + 1 >= mctx->input.bufs_len
2171       || (re_string_cur_idx (&mctx->input) + 1 >= mctx->input.valid_len
2172           && mctx->input.valid_len < mctx->input.len))
2173     {
2174       *err = extend_buffers (mctx);
2175       if (BE (*err != REG_NOERROR, 0))
2176         return NULL;
2177     }
2178
2179 #ifdef RE_ENABLE_I18N
2180       /* If the current state can accept multibyte.  */
2181       if (state->accept_mb)
2182         {
2183           *err = transit_state_mb (mctx, state);
2184           if (BE (*err != REG_NOERROR, 0))
2185             return NULL;
2186         }
2187 #endif /* RE_ENABLE_I18N */
2188
2189   /* Then decide the next state with the single byte.  */
2190   if (1)
2191     {
2192       /* Use transition table  */
2193       ch = re_string_fetch_byte (&mctx->input);
2194       trtable = state->trtable;
2195       if (trtable == NULL)
2196         {
2197           trtable = build_trtable (dfa, state);
2198           if (trtable == NULL)
2199             {
2200               *err = REG_ESPACE;
2201               return NULL;
2202             }
2203         }
2204       if (BE (state->word_trtable, 0))
2205         {
2206           unsigned int context;
2207           context
2208             = re_string_context_at (&mctx->input,
2209                                     re_string_cur_idx (&mctx->input) - 1,
2210                                     mctx->eflags);
2211           if (IS_WORD_CONTEXT (context))
2212             return trtable[ch + SBC_MAX];
2213           else
2214             return trtable[ch];
2215         }
2216       else
2217         return trtable[ch];
2218     }
2219 #if 0
2220   else
2221     /* don't use transition table  */
2222     return transit_state_sb (err, mctx, state);
2223 #endif
2224 }
2225
2226 /* Update the state_log if we need */
2227 re_dfastate_t *
2228 merge_state_with_log (err, mctx, next_state)
2229      reg_errcode_t *err;
2230      re_match_context_t *mctx;
2231      re_dfastate_t *next_state;
2232 {
2233   re_dfa_t *const dfa = mctx->dfa;
2234   int cur_idx = re_string_cur_idx (&mctx->input);
2235
2236   if (cur_idx > mctx->state_log_top)
2237     {
2238       mctx->state_log[cur_idx] = next_state;
2239       mctx->state_log_top = cur_idx;
2240     }
2241   else if (mctx->state_log[cur_idx] == 0)
2242     {
2243       mctx->state_log[cur_idx] = next_state;
2244     }
2245   else
2246     {
2247       re_dfastate_t *pstate;
2248       unsigned int context;
2249       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2250       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2251          the destination of a multibyte char/collating element/
2252          back reference.  Then the next state is the union set of
2253          these destinations and the results of the transition table.  */
2254       pstate = mctx->state_log[cur_idx];
2255       log_nodes = pstate->entrance_nodes;
2256       if (next_state != NULL)
2257         {
2258           table_nodes = next_state->entrance_nodes;
2259           *err = re_node_set_init_union (&next_nodes, table_nodes,
2260                                              log_nodes);
2261           if (BE (*err != REG_NOERROR, 0))
2262             return NULL;
2263         }
2264       else
2265         next_nodes = *log_nodes;
2266       /* Note: We already add the nodes of the initial state,
2267          then we don't need to add them here.  */
2268
2269       context = re_string_context_at (&mctx->input,
2270                                       re_string_cur_idx (&mctx->input) - 1,
2271                                       mctx->eflags);
2272       next_state = mctx->state_log[cur_idx]
2273         = re_acquire_state_context (err, dfa, &next_nodes, context);
2274       /* We don't need to check errors here, since the return value of
2275          this function is next_state and ERR is already set.  */
2276
2277       if (table_nodes != NULL)
2278         re_node_set_free (&next_nodes);
2279     }
2280
2281   if (BE (dfa->nbackref, 0) && next_state != NULL)
2282     {
2283       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2284          later.  We must check them here, since the back references in the
2285          next state might use them.  */
2286       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2287                                         cur_idx);
2288       if (BE (*err != REG_NOERROR, 0))
2289         return NULL;
2290
2291       /* If the next state has back references.  */
2292       if (next_state->has_backref)
2293         {
2294           *err = transit_state_bkref (mctx, &next_state->nodes);
2295           if (BE (*err != REG_NOERROR, 0))
2296             return NULL;
2297           next_state = mctx->state_log[cur_idx];
2298         }
2299     }
2300
2301   return next_state;
2302 }
2303
2304 /* Skip bytes in the input that correspond to part of a
2305    multi-byte match, then look in the log for a state
2306    from which to restart matching.  */
2307 re_dfastate_t *
2308 find_recover_state (err, mctx)
2309      reg_errcode_t *err;
2310      re_match_context_t *mctx;
2311 {
2312   re_dfastate_t *cur_state = NULL;
2313   do
2314     {
2315       int max = mctx->state_log_top;
2316       int cur_str_idx = re_string_cur_idx (&mctx->input);
2317
2318       do
2319         {
2320           if (++cur_str_idx > max)
2321             return NULL;
2322           re_string_skip_bytes (&mctx->input, 1);
2323         }
2324       while (mctx->state_log[cur_str_idx] == NULL);
2325
2326       cur_state = merge_state_with_log (err, mctx, NULL);
2327     }
2328   while (err == REG_NOERROR && cur_state == NULL);
2329   return cur_state;
2330 }
2331
2332 /* Helper functions for transit_state.  */
2333
2334 /* From the node set CUR_NODES, pick up the nodes whose types are
2335    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2336    expression. And register them to use them later for evaluating the
2337    correspoding back references.  */
2338
2339 static reg_errcode_t
2340 check_subexp_matching_top (mctx, cur_nodes, str_idx)
2341      re_match_context_t *mctx;
2342      re_node_set *cur_nodes;
2343      int str_idx;
2344 {
2345   re_dfa_t *const dfa = mctx->dfa;
2346   int node_idx;
2347   reg_errcode_t err;
2348
2349   /* TODO: This isn't efficient.
2350            Because there might be more than one nodes whose types are
2351            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2352            nodes.
2353            E.g. RE: (a){2}  */
2354   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2355     {
2356       int node = cur_nodes->elems[node_idx];
2357       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2358           && dfa->nodes[node].opr.idx < (8 * sizeof (dfa->used_bkref_map))
2359           && dfa->used_bkref_map & (1 << dfa->nodes[node].opr.idx))
2360         {
2361           err = match_ctx_add_subtop (mctx, node, str_idx);
2362           if (BE (err != REG_NOERROR, 0))
2363             return err;
2364         }
2365     }
2366   return REG_NOERROR;
2367 }
2368
2369 #if 0
2370 /* Return the next state to which the current state STATE will transit by
2371    accepting the current input byte.  */
2372
2373 static re_dfastate_t *
2374 transit_state_sb (err, mctx, state)
2375      reg_errcode_t *err;
2376      re_match_context_t *mctx;
2377      re_dfastate_t *state;
2378 {
2379   re_dfa_t *const dfa = mctx->dfa;
2380   re_node_set next_nodes;
2381   re_dfastate_t *next_state;
2382   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2383   unsigned int context;
2384
2385   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2386   if (BE (*err != REG_NOERROR, 0))
2387     return NULL;
2388   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2389     {
2390       int cur_node = state->nodes.elems[node_cnt];
2391       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2392         {
2393           *err = re_node_set_merge (&next_nodes,
2394                                     dfa->eclosures + dfa->nexts[cur_node]);
2395           if (BE (*err != REG_NOERROR, 0))
2396             {
2397               re_node_set_free (&next_nodes);
2398               return NULL;
2399             }
2400         }
2401     }
2402   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2403   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2404   /* We don't need to check errors here, since the return value of
2405      this function is next_state and ERR is already set.  */
2406
2407   re_node_set_free (&next_nodes);
2408   re_string_skip_bytes (&mctx->input, 1);
2409   return next_state;
2410 }
2411 #endif
2412
2413 #ifdef RE_ENABLE_I18N
2414 static reg_errcode_t
2415 transit_state_mb (mctx, pstate)
2416     re_match_context_t *mctx;
2417     re_dfastate_t *pstate;
2418 {
2419   re_dfa_t *const dfa = mctx->dfa;
2420   reg_errcode_t err;
2421   int i;
2422
2423   for (i = 0; i < pstate->nodes.nelem; ++i)
2424     {
2425       re_node_set dest_nodes, *new_nodes;
2426       int cur_node_idx = pstate->nodes.elems[i];
2427       int naccepted = 0, dest_idx;
2428       unsigned int context;
2429       re_dfastate_t *dest_state;
2430
2431       if (dfa->nodes[cur_node_idx].constraint)
2432         {
2433           context = re_string_context_at (&mctx->input,
2434                                           re_string_cur_idx (&mctx->input),
2435                                           mctx->eflags);
2436           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2437                                            context))
2438             continue;
2439         }
2440
2441       /* How many bytes the node can accept?  */
2442       if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
2443         naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2444                                              re_string_cur_idx (&mctx->input));
2445       if (naccepted == 0)
2446         continue;
2447
2448       /* The node can accepts `naccepted' bytes.  */
2449       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2450       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2451                                : mctx->max_mb_elem_len);
2452       err = clean_state_log_if_needed (mctx, dest_idx);
2453       if (BE (err != REG_NOERROR, 0))
2454         return err;
2455 #ifdef DEBUG
2456       assert (dfa->nexts[cur_node_idx] != -1);
2457 #endif
2458       /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
2459          then we use pstate->nodes.elems[i] instead.  */
2460       new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
2461
2462       dest_state = mctx->state_log[dest_idx];
2463       if (dest_state == NULL)
2464         dest_nodes = *new_nodes;
2465       else
2466         {
2467           err = re_node_set_init_union (&dest_nodes,
2468                                         dest_state->entrance_nodes, new_nodes);
2469           if (BE (err != REG_NOERROR, 0))
2470             return err;
2471         }
2472       context = re_string_context_at (&mctx->input, dest_idx - 1, mctx->eflags);
2473       mctx->state_log[dest_idx]
2474         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2475       if (dest_state != NULL)
2476         re_node_set_free (&dest_nodes);
2477       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2478         return err;
2479     }
2480   return REG_NOERROR;
2481 }
2482 #endif /* RE_ENABLE_I18N */
2483
2484 static reg_errcode_t
2485 transit_state_bkref (mctx, nodes)
2486     re_match_context_t *mctx;
2487     const re_node_set *nodes;
2488 {
2489   re_dfa_t *const dfa = mctx->dfa;
2490   reg_errcode_t err;
2491   int i;
2492   int cur_str_idx = re_string_cur_idx (&mctx->input);
2493
2494   for (i = 0; i < nodes->nelem; ++i)
2495     {
2496       int dest_str_idx, prev_nelem, bkc_idx;
2497       int node_idx = nodes->elems[i];
2498       unsigned int context;
2499       const re_token_t *node = dfa->nodes + node_idx;
2500       re_node_set *new_dest_nodes;
2501
2502       /* Check whether `node' is a backreference or not.  */
2503       if (node->type != OP_BACK_REF)
2504         continue;
2505
2506       if (node->constraint)
2507         {
2508           context = re_string_context_at (&mctx->input, cur_str_idx,
2509                                           mctx->eflags);
2510           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2511             continue;
2512         }
2513
2514       /* `node' is a backreference.
2515          Check the substring which the substring matched.  */
2516       bkc_idx = mctx->nbkref_ents;
2517       err = get_subexp (mctx, node_idx, cur_str_idx);
2518       if (BE (err != REG_NOERROR, 0))
2519         goto free_return;
2520
2521       /* And add the epsilon closures (which is `new_dest_nodes') of
2522          the backreference to appropriate state_log.  */
2523 #ifdef DEBUG
2524       assert (dfa->nexts[node_idx] != -1);
2525 #endif
2526       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2527         {
2528           int subexp_len;
2529           re_dfastate_t *dest_state;
2530           struct re_backref_cache_entry *bkref_ent;
2531           bkref_ent = mctx->bkref_ents + bkc_idx;
2532           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2533             continue;
2534           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2535           new_dest_nodes = (subexp_len == 0
2536                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2537                             : dfa->eclosures + dfa->nexts[node_idx]);
2538           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2539                           - bkref_ent->subexp_from);
2540           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2541                                           mctx->eflags);
2542           dest_state = mctx->state_log[dest_str_idx];
2543           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2544                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2545           /* Add `new_dest_node' to state_log.  */
2546           if (dest_state == NULL)
2547             {
2548               mctx->state_log[dest_str_idx]
2549                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2550                                             context);
2551               if (BE (mctx->state_log[dest_str_idx] == NULL
2552                       && err != REG_NOERROR, 0))
2553                 goto free_return;
2554             }
2555           else
2556             {
2557               re_node_set dest_nodes;
2558               err = re_node_set_init_union (&dest_nodes,
2559                                             dest_state->entrance_nodes,
2560                                             new_dest_nodes);
2561               if (BE (err != REG_NOERROR, 0))
2562                 {
2563                   re_node_set_free (&dest_nodes);
2564                   goto free_return;
2565                 }
2566               mctx->state_log[dest_str_idx]
2567                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2568               re_node_set_free (&dest_nodes);
2569               if (BE (mctx->state_log[dest_str_idx] == NULL
2570                       && err != REG_NOERROR, 0))
2571                 goto free_return;
2572             }
2573           /* We need to check recursively if the backreference can epsilon
2574              transit.  */
2575           if (subexp_len == 0
2576               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2577             {
2578               err = check_subexp_matching_top (mctx, new_dest_nodes,
2579                                                cur_str_idx);
2580               if (BE (err != REG_NOERROR, 0))
2581                 goto free_return;
2582               err = transit_state_bkref (mctx, new_dest_nodes);
2583               if (BE (err != REG_NOERROR, 0))
2584                 goto free_return;
2585             }
2586         }
2587     }
2588   err = REG_NOERROR;
2589  free_return:
2590   return err;
2591 }
2592
2593 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2594    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2595    Note that we might collect inappropriate candidates here.
2596    However, the cost of checking them strictly here is too high, then we
2597    delay these checking for prune_impossible_nodes().  */
2598
2599 static reg_errcode_t
2600 get_subexp (mctx, bkref_node, bkref_str_idx)
2601      re_match_context_t *mctx;
2602      int bkref_node, bkref_str_idx;
2603 {
2604   re_dfa_t *const dfa = mctx->dfa;
2605   int subexp_num, sub_top_idx;
2606   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2607   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2608   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2609   if (cache_idx != -1)
2610     {
2611       const struct re_backref_cache_entry *entry = mctx->bkref_ents + cache_idx;
2612       do
2613         if (entry->node == bkref_node)
2614           return REG_NOERROR; /* We already checked it.  */
2615       while (entry++->more);
2616     }
2617
2618   subexp_num = dfa->nodes[bkref_node].opr.idx - 1;
2619
2620   /* For each sub expression  */
2621   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2622     {
2623       reg_errcode_t err;
2624       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2625       re_sub_match_last_t *sub_last;
2626       int sub_last_idx, sl_str, bkref_str_off;
2627
2628       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2629         continue; /* It isn't related.  */
2630
2631       sl_str = sub_top->str_idx;
2632       bkref_str_off = bkref_str_idx;
2633       /* At first, check the last node of sub expressions we already
2634          evaluated.  */
2635       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2636         {
2637           int sl_str_diff;
2638           sub_last = sub_top->lasts[sub_last_idx];
2639           sl_str_diff = sub_last->str_idx - sl_str;
2640           /* The matched string by the sub expression match with the substring
2641              at the back reference?  */
2642           if (sl_str_diff > 0)
2643             {
2644               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2645                 {
2646                   /* Not enough chars for a successful match.  */
2647                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2648                     break;
2649
2650                   err = clean_state_log_if_needed (mctx,
2651                                                    bkref_str_off
2652                                                    + sl_str_diff);
2653                   if (BE (err != REG_NOERROR, 0))
2654                     return err;
2655                   buf = (const char *) re_string_get_buffer (&mctx->input);
2656                 }
2657               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2658                 break; /* We don't need to search this sub expression any more.  */
2659             }
2660           bkref_str_off += sl_str_diff;
2661           sl_str += sl_str_diff;
2662           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2663                                 bkref_str_idx);
2664
2665           /* Reload buf, since the preceding call might have reallocated
2666              the buffer.  */
2667           buf = (const char *) re_string_get_buffer (&mctx->input);
2668
2669           if (err == REG_NOMATCH)
2670             continue;
2671           if (BE (err != REG_NOERROR, 0))
2672             return err;
2673         }
2674
2675       if (sub_last_idx < sub_top->nlasts)
2676         continue;
2677       if (sub_last_idx > 0)
2678         ++sl_str;
2679       /* Then, search for the other last nodes of the sub expression.  */
2680       for (; sl_str <= bkref_str_idx; ++sl_str)
2681         {
2682           int cls_node, sl_str_off;
2683           const re_node_set *nodes;
2684           sl_str_off = sl_str - sub_top->str_idx;
2685           /* The matched string by the sub expression match with the substring
2686              at the back reference?  */
2687           if (sl_str_off > 0)
2688             {
2689               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2690                 {
2691                   /* If we are at the end of the input, we cannot match.  */
2692                   if (bkref_str_off >= mctx->input.len)
2693                     break;
2694
2695                   err = extend_buffers (mctx);
2696                   if (BE (err != REG_NOERROR, 0))
2697                     return err;
2698
2699                   buf = (const char *) re_string_get_buffer (&mctx->input);
2700                 }
2701               if (buf [bkref_str_off++] != buf[sl_str - 1])
2702                 break; /* We don't need to search this sub expression
2703                           any more.  */
2704             }
2705           if (mctx->state_log[sl_str] == NULL)
2706             continue;
2707           /* Does this state have a ')' of the sub expression?  */
2708           nodes = &mctx->state_log[sl_str]->nodes;
2709           cls_node = find_subexp_node (dfa, nodes, subexp_num, OP_CLOSE_SUBEXP);
2710           if (cls_node == -1)
2711             continue; /* No.  */
2712           if (sub_top->path == NULL)
2713             {
2714               sub_top->path = calloc (sizeof (state_array_t),
2715                                       sl_str - sub_top->str_idx + 1);
2716               if (sub_top->path == NULL)
2717                 return REG_ESPACE;
2718             }
2719           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2720              in the current context?  */
2721           err = check_arrival (mctx, sub_top->path, sub_top->node,
2722                                sub_top->str_idx, cls_node, sl_str, OP_CLOSE_SUBEXP);
2723           if (err == REG_NOMATCH)
2724               continue;
2725           if (BE (err != REG_NOERROR, 0))
2726               return err;
2727           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2728           if (BE (sub_last == NULL, 0))
2729             return REG_ESPACE;
2730           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2731                                 bkref_str_idx);
2732           if (err == REG_NOMATCH)
2733             continue;
2734         }
2735     }
2736   return REG_NOERROR;
2737 }
2738
2739 /* Helper functions for get_subexp().  */
2740
2741 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2742    If it can arrive, register the sub expression expressed with SUB_TOP
2743    and SUB_LAST.  */
2744
2745 static reg_errcode_t
2746 get_subexp_sub (mctx, sub_top, sub_last, bkref_node, bkref_str)
2747      re_match_context_t *mctx;
2748      const re_sub_match_top_t *sub_top;
2749      re_sub_match_last_t *sub_last;
2750      int bkref_node, bkref_str;
2751 {
2752   reg_errcode_t err;
2753   int to_idx;
2754   /* Can the subexpression arrive the back reference?  */
2755   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2756                        sub_last->str_idx, bkref_node, bkref_str, OP_OPEN_SUBEXP);
2757   if (err != REG_NOERROR)
2758     return err;
2759   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2760                              sub_last->str_idx);
2761   if (BE (err != REG_NOERROR, 0))
2762     return err;
2763   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2764   return clean_state_log_if_needed (mctx, to_idx);
2765 }
2766
2767 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2768    Search '(' if FL_OPEN, or search ')' otherwise.
2769    TODO: This function isn't efficient...
2770          Because there might be more than one nodes whose types are
2771          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2772          nodes.
2773          E.g. RE: (a){2}  */
2774
2775 static int
2776 find_subexp_node (dfa, nodes, subexp_idx, type)
2777      const re_dfa_t *dfa;
2778      const re_node_set *nodes;
2779      int subexp_idx, type;
2780 {
2781   int cls_idx;
2782   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2783     {
2784       int cls_node = nodes->elems[cls_idx];
2785       const re_token_t *node = dfa->nodes + cls_node;
2786       if (node->type == type
2787           && node->opr.idx == subexp_idx)
2788         return cls_node;
2789     }
2790   return -1;
2791 }
2792
2793 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2794    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2795    heavily reused.
2796    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2797
2798 static reg_errcode_t
2799 check_arrival (mctx, path, top_node, top_str, last_node, last_str,
2800                type)
2801      re_match_context_t *mctx;
2802      state_array_t *path;
2803      int top_node, top_str, last_node, last_str, type;
2804 {
2805   re_dfa_t *const dfa = mctx->dfa;
2806   reg_errcode_t err;
2807   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2808   re_dfastate_t *cur_state = NULL;
2809   re_node_set *cur_nodes, next_nodes;
2810   re_dfastate_t **backup_state_log;
2811   unsigned int context;
2812
2813   subexp_num = dfa->nodes[top_node].opr.idx;
2814   /* Extend the buffer if we need.  */
2815   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2816     {
2817       re_dfastate_t **new_array;
2818       int old_alloc = path->alloc;
2819       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2820       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2821       if (new_array == NULL)
2822         {
2823           path->alloc = old_alloc;
2824           return REG_ESPACE;
2825         }
2826       path->array = new_array;
2827       memset (new_array + old_alloc, '\0',
2828               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2829     }
2830
2831   str_idx = path->next_idx == 0 ? top_str : path->next_idx;
2832
2833   /* Temporary modify MCTX.  */
2834   backup_state_log = mctx->state_log;
2835   backup_cur_idx = mctx->input.cur_idx;
2836   mctx->state_log = path->array;
2837   mctx->input.cur_idx = str_idx;
2838
2839   /* Setup initial node set.  */
2840   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2841   if (str_idx == top_str)
2842     {
2843       err = re_node_set_init_1 (&next_nodes, top_node);
2844       if (BE (err != REG_NOERROR, 0))
2845         return err;
2846       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2847       if (BE (err != REG_NOERROR, 0))
2848         {
2849           re_node_set_free (&next_nodes);
2850           return err;
2851         }
2852     }
2853   else
2854     {
2855       cur_state = mctx->state_log[str_idx];
2856       if (cur_state && cur_state->has_backref)
2857         {
2858           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2859           if (BE ( err != REG_NOERROR, 0))
2860             return err;
2861         }
2862       else
2863         re_node_set_init_empty (&next_nodes);
2864     }
2865   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2866     {
2867       if (next_nodes.nelem)
2868         {
2869           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2870                                     subexp_num, type);
2871           if (BE ( err != REG_NOERROR, 0))
2872             {
2873               re_node_set_free (&next_nodes);
2874               return err;
2875             }
2876         }
2877       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2878       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2879         {
2880           re_node_set_free (&next_nodes);
2881           return err;
2882         }
2883       mctx->state_log[str_idx] = cur_state;
2884     }
2885
2886   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2887     {
2888       re_node_set_empty (&next_nodes);
2889       if (mctx->state_log[str_idx + 1])
2890         {
2891           err = re_node_set_merge (&next_nodes,
2892                                    &mctx->state_log[str_idx + 1]->nodes);
2893           if (BE (err != REG_NOERROR, 0))
2894             {
2895               re_node_set_free (&next_nodes);
2896               return err;
2897             }
2898         }
2899       if (cur_state)
2900         {
2901           err = check_arrival_add_next_nodes (mctx, str_idx,
2902                                               &cur_state->nodes, &next_nodes);
2903           if (BE (err != REG_NOERROR, 0))
2904             {
2905               re_node_set_free (&next_nodes);
2906               return err;
2907             }
2908         }
2909       ++str_idx;
2910       if (next_nodes.nelem)
2911         {
2912           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2913           if (BE (err != REG_NOERROR, 0))
2914             {
2915               re_node_set_free (&next_nodes);
2916               return err;
2917             }
2918           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2919                                     subexp_num, type);
2920           if (BE ( err != REG_NOERROR, 0))
2921             {
2922               re_node_set_free (&next_nodes);
2923               return err;
2924             }
2925         }
2926       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2927       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2928       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2929         {
2930           re_node_set_free (&next_nodes);
2931           return err;
2932         }
2933       mctx->state_log[str_idx] = cur_state;
2934       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2935     }
2936   re_node_set_free (&next_nodes);
2937   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2938                : &mctx->state_log[last_str]->nodes);
2939   path->next_idx = str_idx;
2940
2941   /* Fix MCTX.  */
2942   mctx->state_log = backup_state_log;
2943   mctx->input.cur_idx = backup_cur_idx;
2944
2945   /* Then check the current node set has the node LAST_NODE.  */
2946   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2947     return REG_NOERROR;
2948
2949   return REG_NOMATCH;
2950 }
2951
2952 /* Helper functions for check_arrival.  */
2953
2954 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2955    to NEXT_NODES.
2956    TODO: This function is similar to the functions transit_state*(),
2957          however this function has many additional works.
2958          Can't we unify them?  */
2959
2960 static reg_errcode_t
2961 check_arrival_add_next_nodes (mctx, str_idx, cur_nodes, next_nodes)
2962      re_match_context_t *mctx;
2963      int str_idx;
2964      re_node_set *cur_nodes, *next_nodes;
2965 {
2966   re_dfa_t *const dfa = mctx->dfa;
2967   int result;
2968   int cur_idx;
2969   reg_errcode_t err;
2970   re_node_set union_set;
2971   re_node_set_init_empty (&union_set);
2972   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2973     {
2974       int naccepted = 0;
2975       int cur_node = cur_nodes->elems[cur_idx];
2976       re_token_type_t type = dfa->nodes[cur_node].type;
2977       if (IS_EPSILON_NODE (type))
2978         continue;
2979 #ifdef RE_ENABLE_I18N
2980       /* If the node may accept `multi byte'.  */
2981       if (ACCEPT_MB_NODE (type))
2982         {
2983           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
2984                                                str_idx);
2985           if (naccepted > 1)
2986             {
2987               re_dfastate_t *dest_state;
2988               int next_node = dfa->nexts[cur_node];
2989               int next_idx = str_idx + naccepted;
2990               dest_state = mctx->state_log[next_idx];
2991               re_node_set_empty (&union_set);
2992               if (dest_state)
2993                 {
2994                   err = re_node_set_merge (&union_set, &dest_state->nodes);
2995                   if (BE (err != REG_NOERROR, 0))
2996                     {
2997                       re_node_set_free (&union_set);
2998                       return err;
2999                     }
3000                 }
3001               result = re_node_set_insert (&union_set, next_node);
3002               if (BE (result < 0, 0))
3003                 {
3004                   re_node_set_free (&union_set);
3005                   return REG_ESPACE;
3006                 }
3007               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3008                                                             &union_set);
3009               if (BE (mctx->state_log[next_idx] == NULL
3010                       && err != REG_NOERROR, 0))
3011                 {
3012                   re_node_set_free (&union_set);
3013                   return err;
3014                 }
3015             }
3016         }
3017 #endif /* RE_ENABLE_I18N */
3018       if (naccepted
3019           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3020         {
3021           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3022           if (BE (result < 0, 0))
3023             {
3024               re_node_set_free (&union_set);
3025               return REG_ESPACE;
3026             }
3027         }
3028     }
3029   re_node_set_free (&union_set);
3030   return REG_NOERROR;
3031 }
3032
3033 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3034    CUR_NODES, however exclude the nodes which are:
3035     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3036     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3037 */
3038
3039 static reg_errcode_t
3040 check_arrival_expand_ecl (dfa, cur_nodes, ex_subexp, type)
3041      re_dfa_t *dfa;
3042      re_node_set *cur_nodes;
3043      int ex_subexp, type;
3044 {
3045   reg_errcode_t err;
3046   int idx, outside_node;
3047   re_node_set new_nodes;
3048 #ifdef DEBUG
3049   assert (cur_nodes->nelem);
3050 #endif
3051   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3052   if (BE (err != REG_NOERROR, 0))
3053     return err;
3054   /* Create a new node set NEW_NODES with the nodes which are epsilon
3055      closures of the node in CUR_NODES.  */
3056
3057   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3058     {
3059       int cur_node = cur_nodes->elems[idx];
3060       re_node_set *eclosure = dfa->eclosures + cur_node;
3061       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3062       if (outside_node == -1)
3063         {
3064           /* There are no problematic nodes, just merge them.  */
3065           err = re_node_set_merge (&new_nodes, eclosure);
3066           if (BE (err != REG_NOERROR, 0))
3067             {
3068               re_node_set_free (&new_nodes);
3069               return err;
3070             }
3071         }
3072       else
3073         {
3074           /* There are problematic nodes, re-calculate incrementally.  */
3075           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3076                                               ex_subexp, type);
3077           if (BE (err != REG_NOERROR, 0))
3078             {
3079               re_node_set_free (&new_nodes);
3080               return err;
3081             }
3082         }
3083     }
3084   re_node_set_free (cur_nodes);
3085   *cur_nodes = new_nodes;
3086   return REG_NOERROR;
3087 }
3088
3089 /* Helper function for check_arrival_expand_ecl.
3090    Check incrementally the epsilon closure of TARGET, and if it isn't
3091    problematic append it to DST_NODES.  */
3092
3093 static reg_errcode_t
3094 check_arrival_expand_ecl_sub (dfa, dst_nodes, target, ex_subexp, type)
3095      re_dfa_t *dfa;
3096      int target, ex_subexp, type;
3097      re_node_set *dst_nodes;
3098 {
3099   int cur_node;
3100   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3101     {
3102       int err;
3103
3104       if (dfa->nodes[cur_node].type == type
3105           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3106         {
3107           if (type == OP_CLOSE_SUBEXP)
3108             {
3109               err = re_node_set_insert (dst_nodes, cur_node);
3110               if (BE (err == -1, 0))
3111                 return REG_ESPACE;
3112             }
3113           break;
3114         }
3115       err = re_node_set_insert (dst_nodes, cur_node);
3116       if (BE (err == -1, 0))
3117         return REG_ESPACE;
3118       if (dfa->edests[cur_node].nelem == 0)
3119         break;
3120       if (dfa->edests[cur_node].nelem == 2)
3121         {
3122           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3123                                               dfa->edests[cur_node].elems[1],
3124                                               ex_subexp, type);
3125           if (BE (err != REG_NOERROR, 0))
3126             return err;
3127         }
3128       cur_node = dfa->edests[cur_node].elems[0];
3129     }
3130   return REG_NOERROR;
3131 }
3132
3133
3134 /* For all the back references in the current state, calculate the
3135    destination of the back references by the appropriate entry
3136    in MCTX->BKREF_ENTS.  */
3137
3138 static reg_errcode_t
3139 expand_bkref_cache (mctx, cur_nodes, cur_str, subexp_num,
3140                     type)
3141      re_match_context_t *mctx;
3142      int cur_str, subexp_num, type;
3143      re_node_set *cur_nodes;
3144 {
3145   re_dfa_t *const dfa = mctx->dfa;
3146   reg_errcode_t err;
3147   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3148   struct re_backref_cache_entry *ent;
3149
3150   if (cache_idx_start == -1)
3151     return REG_NOERROR;
3152
3153  restart:
3154   ent = mctx->bkref_ents + cache_idx_start;
3155   do
3156     {
3157       int to_idx, next_node;
3158
3159       /* Is this entry ENT is appropriate?  */
3160       if (!re_node_set_contains (cur_nodes, ent->node))
3161         continue; /* No.  */
3162
3163       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3164       /* Calculate the destination of the back reference, and append it
3165          to MCTX->STATE_LOG.  */
3166       if (to_idx == cur_str)
3167         {
3168           /* The backreference did epsilon transit, we must re-check all the
3169              node in the current state.  */
3170           re_node_set new_dests;
3171           reg_errcode_t err2, err3;
3172           next_node = dfa->edests[ent->node].elems[0];
3173           if (re_node_set_contains (cur_nodes, next_node))
3174             continue;
3175           err = re_node_set_init_1 (&new_dests, next_node);
3176           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3177           err3 = re_node_set_merge (cur_nodes, &new_dests);
3178           re_node_set_free (&new_dests);
3179           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3180                   || err3 != REG_NOERROR, 0))
3181             {
3182               err = (err != REG_NOERROR ? err
3183                      : (err2 != REG_NOERROR ? err2 : err3));
3184               return err;
3185             }
3186           /* TODO: It is still inefficient...  */
3187           goto restart;
3188         }
3189       else
3190         {
3191           re_node_set union_set;
3192           next_node = dfa->nexts[ent->node];
3193           if (mctx->state_log[to_idx])
3194             {
3195               int ret;
3196               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3197                                         next_node))
3198                 continue;
3199               err = re_node_set_init_copy (&union_set,
3200                                            &mctx->state_log[to_idx]->nodes);
3201               ret = re_node_set_insert (&union_set, next_node);
3202               if (BE (err != REG_NOERROR || ret < 0, 0))
3203                 {
3204                   re_node_set_free (&union_set);
3205                   err = err != REG_NOERROR ? err : REG_ESPACE;
3206                   return err;
3207                 }
3208             }
3209           else
3210             {
3211               err = re_node_set_init_1 (&union_set, next_node);
3212               if (BE (err != REG_NOERROR, 0))
3213                 return err;
3214             }
3215           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3216           re_node_set_free (&union_set);
3217           if (BE (mctx->state_log[to_idx] == NULL
3218                   && err != REG_NOERROR, 0))
3219             return err;
3220         }
3221     }
3222   while (ent++->more);
3223   return REG_NOERROR;
3224 }
3225
3226 /* Build transition table for the state.
3227    Return the new table if succeeded, otherwise return NULL.  */
3228
3229 static re_dfastate_t **
3230 build_trtable (dfa, state)
3231     re_dfa_t *dfa;
3232     re_dfastate_t *state;
3233 {
3234   reg_errcode_t err;
3235   int i, j, ch;
3236   unsigned int elem, mask;
3237   int dests_node_malloced = 0, dest_states_malloced = 0;
3238   int ndests; /* Number of the destination states from `state'.  */
3239   re_dfastate_t **trtable;
3240   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3241   re_node_set follows, *dests_node;
3242   bitset *dests_ch;
3243   bitset acceptable;
3244
3245   /* We build DFA states which corresponds to the destination nodes
3246      from `state'.  `dests_node[i]' represents the nodes which i-th
3247      destination state contains, and `dests_ch[i]' represents the
3248      characters which i-th destination state accepts.  */
3249 #ifdef _LIBC
3250   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX))
3251     dests_node = (re_node_set *)
3252                  alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3253   else
3254 #endif
3255     {
3256       dests_node = (re_node_set *)
3257                    malloc ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX);
3258       if (BE (dests_node == NULL, 0))
3259         return NULL;
3260       dests_node_malloced = 1;
3261     }
3262   dests_ch = (bitset *) (dests_node + SBC_MAX);
3263
3264   /* Initialize transiton table.  */
3265   state->word_trtable = 0;
3266
3267   /* At first, group all nodes belonging to `state' into several
3268      destinations.  */
3269   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3270   if (BE (ndests <= 0, 0))
3271     {
3272       if (dests_node_malloced)
3273         free (dests_node);
3274       /* Return NULL in case of an error, trtable otherwise.  */
3275       if (ndests == 0)
3276         {
3277           state->trtable = (re_dfastate_t **)
3278             calloc (sizeof (re_dfastate_t *), SBC_MAX);;
3279           return state->trtable;
3280         }
3281       return NULL;
3282     }
3283
3284   err = re_node_set_alloc (&follows, ndests + 1);
3285   if (BE (err != REG_NOERROR, 0))
3286     goto out_free;
3287
3288 #ifdef _LIBC
3289   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset)) * SBC_MAX
3290                          + ndests * 3 * sizeof (re_dfastate_t *)))
3291     dest_states = (re_dfastate_t **)
3292                   alloca (ndests * 3 * sizeof (re_dfastate_t *));
3293   else
3294 #endif
3295     {
3296       dest_states = (re_dfastate_t **)
3297                     malloc (ndests * 3 * sizeof (re_dfastate_t *));
3298       if (BE (dest_states == NULL, 0))
3299         {
3300 out_free:
3301           if (dest_states_malloced)
3302             free (dest_states);
3303           re_node_set_free (&follows);
3304           for (i = 0; i < ndests; ++i)
3305             re_node_set_free (dests_node + i);
3306           if (dests_node_malloced)
3307             free (dests_node);
3308           return NULL;
3309         }
3310       dest_states_malloced = 1;
3311     }
3312   dest_states_word = dest_states + ndests;
3313   dest_states_nl = dest_states_word + ndests;
3314   bitset_empty (acceptable);
3315
3316   /* Then build the states for all destinations.  */
3317   for (i = 0; i < ndests; ++i)
3318     {
3319       int next_node;
3320       re_node_set_empty (&follows);
3321       /* Merge the follows of this destination states.  */
3322       for (j = 0; j < dests_node[i].nelem; ++j)
3323         {
3324           next_node = dfa->nexts[dests_node[i].elems[j]];
3325           if (next_node != -1)
3326             {
3327               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3328               if (BE (err != REG_NOERROR, 0))
3329                 goto out_free;
3330             }
3331         }
3332       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3333       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3334         goto out_free;
3335       /* If the new state has context constraint,
3336          build appropriate states for these contexts.  */
3337       if (dest_states[i]->has_constraint)
3338         {
3339           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3340                                                           CONTEXT_WORD);
3341           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3342             goto out_free;
3343
3344           if (dest_states[i] != dest_states_word[i]
3345               && dfa->mb_cur_max > 1)
3346             state->word_trtable = 1;
3347
3348           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3349                                                         CONTEXT_NEWLINE);
3350           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3351             goto out_free;
3352         }
3353       else
3354         {
3355           dest_states_word[i] = dest_states[i];
3356           dest_states_nl[i] = dest_states[i];
3357         }
3358       bitset_merge (acceptable, dests_ch[i]);
3359     }
3360
3361   if (!BE (state->word_trtable, 0))
3362     {
3363       /* We don't care about whether the following character is a word
3364          character, or we are in a single-byte character set so we can
3365          discern by looking at the character code: allocate a
3366          256-entry transition table.  */
3367       trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3368       if (BE (trtable == NULL, 0))
3369         goto out_free;
3370
3371       /* For all characters ch...:  */
3372       for (i = 0; i < BITSET_UINTS; ++i)
3373         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3374              elem;
3375              mask <<= 1, elem >>= 1, ++ch)
3376           if (BE (elem & 1, 0))
3377             {
3378               /* There must be exactly one destination which accepts
3379                  character ch.  See group_nodes_into_DFAstates.  */
3380               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3381                 ;
3382
3383               /* j-th destination accepts the word character ch.  */
3384               if (dfa->word_char[i] & mask)
3385                 trtable[ch] = dest_states_word[j];
3386               else
3387                 trtable[ch] = dest_states[j];
3388             }
3389     }
3390   else
3391     {
3392       /* We care about whether the following character is a word
3393          character, and we are in a multi-byte character set: discern
3394          by looking at the character code: build two 256-entry
3395          transition tables, one starting at trtable[0] and one
3396          starting at trtable[SBC_MAX].  */
3397       trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *),
3398                                            2 * SBC_MAX);
3399       if (BE (trtable == NULL, 0))
3400         goto out_free;
3401
3402       /* For all characters ch...:  */
3403       for (i = 0; i < BITSET_UINTS; ++i)
3404         for (ch = i * UINT_BITS, elem = acceptable[i], mask = 1;
3405              elem;
3406              mask <<= 1, elem >>= 1, ++ch)
3407           if (BE (elem & 1, 0))
3408             {
3409               /* There must be exactly one destination which accepts
3410                  character ch.  See group_nodes_into_DFAstates.  */
3411               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3412                 ;
3413
3414               /* j-th destination accepts the word character ch.  */
3415               trtable[ch] = dest_states[j];
3416               trtable[ch + SBC_MAX] = dest_states_word[j];
3417             }
3418     }
3419
3420   /* new line */
3421   if (bitset_contain (acceptable, NEWLINE_CHAR))
3422     {
3423       /* The current state accepts newline character.  */
3424       for (j = 0; j < ndests; ++j)
3425         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3426           {
3427             /* k-th destination accepts newline character.  */
3428             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3429             if (state->word_trtable)
3430               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3431             /* There must be only one destination which accepts
3432                newline.  See group_nodes_into_DFAstates.  */
3433             break;
3434           }
3435     }
3436
3437   if (dest_states_malloced)
3438     free (dest_states);
3439
3440   re_node_set_free (&follows);
3441   for (i = 0; i < ndests; ++i)
3442     re_node_set_free (dests_node + i);
3443
3444   if (dests_node_malloced)
3445     free (dests_node);
3446
3447   state->trtable = trtable;
3448   return trtable;
3449 }
3450
3451 /* Group all nodes belonging to STATE into several destinations.
3452    Then for all destinations, set the nodes belonging to the destination
3453    to DESTS_NODE[i] and set the characters accepted by the destination
3454    to DEST_CH[i].  This function return the number of destinations.  */
3455
3456 static int
3457 group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch)
3458     re_dfa_t *dfa;
3459     const re_dfastate_t *state;
3460     re_node_set *dests_node;
3461     bitset *dests_ch;
3462 {
3463   reg_errcode_t err;
3464   int result;
3465   int i, j, k;
3466   int ndests; /* Number of the destinations from `state'.  */
3467   bitset accepts; /* Characters a node can accept.  */
3468   const re_node_set *cur_nodes = &state->nodes;
3469   bitset_empty (accepts);
3470   ndests = 0;
3471
3472   /* For all the nodes belonging to `state',  */
3473   for (i = 0; i < cur_nodes->nelem; ++i)
3474     {
3475       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3476       re_token_type_t type = node->type;
3477       unsigned int constraint = node->constraint;
3478
3479       /* Enumerate all single byte character this node can accept.  */
3480       if (type == CHARACTER)
3481         bitset_set (accepts, node->opr.c);
3482       else if (type == SIMPLE_BRACKET)
3483         {
3484           bitset_merge (accepts, node->opr.sbcset);
3485         }
3486       else if (type == OP_PERIOD)
3487         {
3488 #ifdef RE_ENABLE_I18N
3489           if (dfa->mb_cur_max > 1)
3490             bitset_merge (accepts, dfa->sb_char);
3491           else
3492 #endif
3493             bitset_set_all (accepts);
3494           if (!(dfa->syntax & RE_DOT_NEWLINE))
3495             bitset_clear (accepts, '\n');
3496           if (dfa->syntax & RE_DOT_NOT_NULL)
3497             bitset_clear (accepts, '\0');
3498         }
3499 #ifdef RE_ENABLE_I18N
3500       else if (type == OP_UTF8_PERIOD)
3501         {
3502           memset (accepts, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
3503           if (!(dfa->syntax & RE_DOT_NEWLINE))
3504             bitset_clear (accepts, '\n');
3505           if (dfa->syntax & RE_DOT_NOT_NULL)
3506             bitset_clear (accepts, '\0');
3507         }
3508 #endif
3509       else
3510         continue;
3511
3512       /* Check the `accepts' and sift the characters which are not
3513          match it the context.  */
3514       if (constraint)
3515         {
3516           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3517             {
3518               int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3519               bitset_empty (accepts);
3520               if (accepts_newline)
3521                 bitset_set (accepts, NEWLINE_CHAR);
3522               else
3523                 continue;
3524             }
3525           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3526             {
3527               bitset_empty (accepts);
3528               continue;
3529             }
3530
3531           if (constraint & NEXT_WORD_CONSTRAINT)
3532             {
3533               unsigned int any_set = 0;
3534               if (type == CHARACTER && !node->word_char)
3535                 {
3536                   bitset_empty (accepts);
3537                   continue;
3538                 }
3539 #ifdef RE_ENABLE_I18N
3540               if (dfa->mb_cur_max > 1)
3541                 for (j = 0; j < BITSET_UINTS; ++j)
3542                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3543               else
3544 #endif
3545                 for (j = 0; j < BITSET_UINTS; ++j)
3546                   any_set |= (accepts[j] &= dfa->word_char[j]);
3547               if (!any_set)
3548                 continue;
3549             }
3550           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3551             {
3552               unsigned int any_set = 0;
3553               if (type == CHARACTER && node->word_char)
3554                 {
3555                   bitset_empty (accepts);
3556                   continue;
3557                 }
3558 #ifdef RE_ENABLE_I18N
3559               if (dfa->mb_cur_max > 1)
3560                 for (j = 0; j < BITSET_UINTS; ++j)
3561                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3562               else
3563 #endif
3564                 for (j = 0; j < BITSET_UINTS; ++j)
3565                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3566               if (!any_set)
3567                 continue;
3568             }
3569         }
3570
3571       /* Then divide `accepts' into DFA states, or create a new
3572          state.  Above, we make sure that accepts is not empty.  */
3573       for (j = 0; j < ndests; ++j)
3574         {
3575           bitset intersec; /* Intersection sets, see below.  */
3576           bitset remains;
3577           /* Flags, see below.  */
3578           int has_intersec, not_subset, not_consumed;
3579
3580           /* Optimization, skip if this state doesn't accept the character.  */
3581           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3582             continue;
3583
3584           /* Enumerate the intersection set of this state and `accepts'.  */
3585           has_intersec = 0;
3586           for (k = 0; k < BITSET_UINTS; ++k)
3587             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3588           /* And skip if the intersection set is empty.  */
3589           if (!has_intersec)
3590             continue;
3591
3592           /* Then check if this state is a subset of `accepts'.  */
3593           not_subset = not_consumed = 0;
3594           for (k = 0; k < BITSET_UINTS; ++k)
3595             {
3596               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3597               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3598             }
3599
3600           /* If this state isn't a subset of `accepts', create a
3601              new group state, which has the `remains'. */
3602           if (not_subset)
3603             {
3604               bitset_copy (dests_ch[ndests], remains);
3605               bitset_copy (dests_ch[j], intersec);
3606               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3607               if (BE (err != REG_NOERROR, 0))
3608                 goto error_return;
3609               ++ndests;
3610             }
3611
3612           /* Put the position in the current group. */
3613           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3614           if (BE (result < 0, 0))
3615             goto error_return;
3616
3617           /* If all characters are consumed, go to next node. */
3618           if (!not_consumed)
3619             break;
3620         }
3621       /* Some characters remain, create a new group. */
3622       if (j == ndests)
3623         {
3624           bitset_copy (dests_ch[ndests], accepts);
3625           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3626           if (BE (err != REG_NOERROR, 0))
3627             goto error_return;
3628           ++ndests;
3629           bitset_empty (accepts);
3630         }
3631     }
3632   return ndests;
3633  error_return:
3634   for (j = 0; j < ndests; ++j)
3635     re_node_set_free (dests_node + j);
3636   return -1;
3637 }
3638
3639 #ifdef RE_ENABLE_I18N
3640 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3641    Return the number of the bytes the node accepts.
3642    STR_IDX is the current index of the input string.
3643
3644    This function handles the nodes which can accept one character, or
3645    one collating element like '.', '[a-z]', opposite to the other nodes
3646    can only accept one byte.  */
3647
3648 static int
3649 check_node_accept_bytes (dfa, node_idx, input, str_idx)
3650     re_dfa_t *dfa;
3651     int node_idx, str_idx;
3652     const re_string_t *input;
3653 {
3654   const re_token_t *node = dfa->nodes + node_idx;
3655   int char_len, elem_len;
3656   int i;
3657
3658   if (BE (node->type == OP_UTF8_PERIOD, 0))
3659     {
3660       unsigned char c = re_string_byte_at (input, str_idx), d;
3661       if (BE (c < 0xc2, 1))
3662         return 0;
3663
3664       if (str_idx + 2 > input->len)
3665         return 0;
3666
3667       d = re_string_byte_at (input, str_idx + 1);
3668       if (c < 0xe0)
3669         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3670       else if (c < 0xf0)
3671         {
3672           char_len = 3;
3673           if (c == 0xe0 && d < 0xa0)
3674             return 0;
3675         }
3676       else if (c < 0xf8)
3677         {
3678           char_len = 4;
3679           if (c == 0xf0 && d < 0x90)
3680             return 0;
3681         }
3682       else if (c < 0xfc)
3683         {
3684           char_len = 5;
3685           if (c == 0xf8 && d < 0x88)
3686             return 0;
3687         }
3688       else if (c < 0xfe)
3689         {
3690           char_len = 6;
3691           if (c == 0xfc && d < 0x84)
3692             return 0;
3693         }
3694       else
3695         return 0;
3696
3697       if (str_idx + char_len > input->len)
3698         return 0;
3699
3700       for (i = 1; i < char_len; ++i)
3701         {
3702           d = re_string_byte_at (input, str_idx + i);
3703           if (d < 0x80 || d > 0xbf)
3704             return 0;
3705         }
3706       return char_len;
3707     }
3708
3709   char_len = re_string_char_size_at (input, str_idx);
3710   if (node->type == OP_PERIOD)
3711     {
3712       if (char_len <= 1)
3713         return 0;
3714       /* FIXME: I don't think this if is needed, as both '\n'
3715          and '\0' are char_len == 1.  */
3716       /* '.' accepts any one character except the following two cases.  */
3717       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3718            re_string_byte_at (input, str_idx) == '\n') ||
3719           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3720            re_string_byte_at (input, str_idx) == '\0'))
3721         return 0;
3722       return char_len;
3723     }
3724
3725   elem_len = re_string_elem_size_at (input, str_idx);
3726   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3727     return 0;
3728
3729   if (node->type == COMPLEX_BRACKET)
3730     {
3731       const re_charset_t *cset = node->opr.mbcset;
3732 # ifdef _LIBC
3733       const unsigned char *pin = ((char *) re_string_get_buffer (input)
3734                                   + str_idx);
3735       int j;
3736       uint32_t nrules;
3737 # endif /* _LIBC */
3738       int match_len = 0;
3739       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3740                     ? re_string_wchar_at (input, str_idx) : 0);
3741
3742       /* match with multibyte character?  */
3743       for (i = 0; i < cset->nmbchars; ++i)
3744         if (wc == cset->mbchars[i])
3745           {
3746             match_len = char_len;
3747             goto check_node_accept_bytes_match;
3748           }
3749       /* match with character_class?  */
3750       for (i = 0; i < cset->nchar_classes; ++i)
3751         {
3752           wctype_t wt = cset->char_classes[i];
3753           if (__iswctype (wc, wt))
3754             {
3755               match_len = char_len;
3756               goto check_node_accept_bytes_match;
3757             }
3758         }
3759
3760 # ifdef _LIBC
3761       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3762       if (nrules != 0)
3763         {
3764           unsigned int in_collseq = 0;
3765           const int32_t *table, *indirect;
3766           const unsigned char *weights, *extra;
3767           const char *collseqwc;
3768           int32_t idx;
3769           /* This #include defines a local function!  */
3770 #  include <locale/weight.h>
3771
3772           /* match with collating_symbol?  */
3773           if (cset->ncoll_syms)
3774             extra = (const unsigned char *)
3775               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3776           for (i = 0; i < cset->ncoll_syms; ++i)
3777             {
3778               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3779               /* Compare the length of input collating element and
3780                  the length of current collating element.  */
3781               if (*coll_sym != elem_len)
3782                 continue;
3783               /* Compare each bytes.  */
3784               for (j = 0; j < *coll_sym; j++)
3785                 if (pin[j] != coll_sym[1 + j])
3786                   break;
3787               if (j == *coll_sym)
3788                 {
3789                   /* Match if every bytes is equal.  */
3790                   match_len = j;
3791                   goto check_node_accept_bytes_match;
3792                 }
3793             }
3794
3795           if (cset->nranges)
3796             {
3797               if (elem_len <= char_len)
3798                 {
3799                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3800                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3801                 }
3802               else
3803                 in_collseq = find_collation_sequence_value (pin, elem_len);
3804             }
3805           /* match with range expression?  */
3806           for (i = 0; i < cset->nranges; ++i)
3807             if (cset->range_starts[i] <= in_collseq
3808                 && in_collseq <= cset->range_ends[i])
3809               {
3810                 match_len = elem_len;
3811                 goto check_node_accept_bytes_match;
3812               }
3813
3814           /* match with equivalence_class?  */
3815           if (cset->nequiv_classes)
3816             {
3817               const unsigned char *cp = pin;
3818               table = (const int32_t *)
3819                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3820               weights = (const unsigned char *)
3821                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3822               extra = (const unsigned char *)
3823                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3824               indirect = (const int32_t *)
3825                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3826               idx = findidx (&cp);
3827               if (idx > 0)
3828                 for (i = 0; i < cset->nequiv_classes; ++i)
3829                   {
3830                     int32_t equiv_class_idx = cset->equiv_classes[i];
3831                     size_t weight_len = weights[idx];
3832                     if (weight_len == weights[equiv_class_idx])
3833                       {
3834                         int cnt = 0;
3835                         while (cnt <= weight_len
3836                                && (weights[equiv_class_idx + 1 + cnt]
3837                                    == weights[idx + 1 + cnt]))
3838                           ++cnt;
3839                         if (cnt > weight_len)
3840                           {
3841                             match_len = elem_len;
3842                             goto check_node_accept_bytes_match;
3843                           }
3844                       }
3845                   }
3846             }
3847         }
3848       else
3849 # endif /* _LIBC */
3850         {
3851           /* match with range expression?  */
3852 #if __GNUC__ >= 2
3853           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3854 #else
3855           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3856           cmp_buf[2] = wc;
3857 #endif
3858           for (i = 0; i < cset->nranges; ++i)
3859             {
3860               cmp_buf[0] = cset->range_starts[i];
3861               cmp_buf[4] = cset->range_ends[i];
3862               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3863                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3864                 {
3865                   match_len = char_len;
3866                   goto check_node_accept_bytes_match;
3867                 }
3868             }
3869         }
3870     check_node_accept_bytes_match:
3871       if (!cset->non_match)
3872         return match_len;
3873       else
3874         {
3875           if (match_len > 0)
3876             return 0;
3877           else
3878             return (elem_len > char_len) ? elem_len : char_len;
3879         }
3880     }
3881   return 0;
3882 }
3883
3884 # ifdef _LIBC
3885 static unsigned int
3886 find_collation_sequence_value (mbs, mbs_len)
3887     const unsigned char *mbs;
3888     size_t mbs_len;
3889 {
3890   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3891   if (nrules == 0)
3892     {
3893       if (mbs_len == 1)
3894         {
3895           /* No valid character.  Match it as a single byte character.  */
3896           const unsigned char *collseq = (const unsigned char *)
3897             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3898           return collseq[mbs[0]];
3899         }
3900       return UINT_MAX;
3901     }
3902   else
3903     {
3904       int32_t idx;
3905       const unsigned char *extra = (const unsigned char *)
3906         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3907       int32_t extrasize = (const unsigned char *)
3908         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3909
3910       for (idx = 0; idx < extrasize;)
3911         {
3912           int mbs_cnt, found = 0;
3913           int32_t elem_mbs_len;
3914           /* Skip the name of collating element name.  */
3915           idx = idx + extra[idx] + 1;
3916           elem_mbs_len = extra[idx++];
3917           if (mbs_len == elem_mbs_len)
3918             {
3919               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3920                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3921                   break;
3922               if (mbs_cnt == elem_mbs_len)
3923                 /* Found the entry.  */
3924                 found = 1;
3925             }
3926           /* Skip the byte sequence of the collating element.  */
3927           idx += elem_mbs_len;
3928           /* Adjust for the alignment.  */
3929           idx = (idx + 3) & ~3;
3930           /* Skip the collation sequence value.  */
3931           idx += sizeof (uint32_t);
3932           /* Skip the wide char sequence of the collating element.  */
3933           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
3934           /* If we found the entry, return the sequence value.  */
3935           if (found)
3936             return *(uint32_t *) (extra + idx);
3937           /* Skip the collation sequence value.  */
3938           idx += sizeof (uint32_t);
3939         }
3940       return UINT_MAX;
3941     }
3942 }
3943 # endif /* _LIBC */
3944 #endif /* RE_ENABLE_I18N */
3945
3946 /* Check whether the node accepts the byte which is IDX-th
3947    byte of the INPUT.  */
3948
3949 static int
3950 check_node_accept (mctx, node, idx)
3951     const re_match_context_t *mctx;
3952     const re_token_t *node;
3953     int idx;
3954 {
3955   re_dfa_t *const dfa = mctx->dfa;
3956   unsigned char ch;
3957   if (node->constraint)
3958     {
3959       /* The node has constraints.  Check whether the current context
3960          satisfies the constraints.  */
3961       unsigned int context = re_string_context_at (&mctx->input, idx,
3962                                                    mctx->eflags);
3963       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
3964         return 0;
3965     }
3966   ch = re_string_byte_at (&mctx->input, idx);
3967   switch (node->type)
3968     {
3969     case CHARACTER:
3970       return node->opr.c == ch;
3971     case SIMPLE_BRACKET:
3972       return bitset_contain (node->opr.sbcset, ch);
3973 #ifdef RE_ENABLE_I18N
3974     case OP_UTF8_PERIOD:
3975       if (ch >= 0x80)
3976         return 0;
3977       /* FALLTHROUGH */
3978 #endif
3979     case OP_PERIOD:
3980       return !((ch == '\n' && !(dfa->syntax & RE_DOT_NEWLINE))
3981                || (ch == '\0' && (dfa->syntax & RE_DOT_NOT_NULL)));
3982     default:
3983       return 0;
3984     }
3985 }
3986
3987 /* Extend the buffers, if the buffers have run out.  */
3988
3989 static reg_errcode_t
3990 extend_buffers (mctx)
3991      re_match_context_t *mctx;
3992 {
3993   reg_errcode_t ret;
3994   re_string_t *pstr = &mctx->input;
3995
3996   /* Double the lengthes of the buffers.  */
3997   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
3998   if (BE (ret != REG_NOERROR, 0))
3999     return ret;
4000
4001   if (mctx->state_log != NULL)
4002     {
4003       /* And double the length of state_log.  */
4004       /* XXX We have no indication of the size of this buffer.  If this
4005          allocation fail we have no indication that the state_log array
4006          does not have the right size.  */
4007       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4008                                               pstr->bufs_len + 1);
4009       if (BE (new_array == NULL, 0))
4010         return REG_ESPACE;
4011       mctx->state_log = new_array;
4012     }
4013
4014   /* Then reconstruct the buffers.  */
4015   if (pstr->icase)
4016     {
4017 #ifdef RE_ENABLE_I18N
4018       if (pstr->mb_cur_max > 1)
4019         {
4020           ret = build_wcs_upper_buffer (pstr);
4021           if (BE (ret != REG_NOERROR, 0))
4022             return ret;
4023         }
4024       else
4025 #endif /* RE_ENABLE_I18N  */
4026         build_upper_buffer (pstr);
4027     }
4028   else
4029     {
4030 #ifdef RE_ENABLE_I18N
4031       if (pstr->mb_cur_max > 1)
4032         build_wcs_buffer (pstr);
4033       else
4034 #endif /* RE_ENABLE_I18N  */
4035         {
4036           if (pstr->trans != NULL)
4037             re_string_translate_buffer (pstr);
4038         }
4039     }
4040   return REG_NOERROR;
4041 }
4042
4043 \f
4044 /* Functions for matching context.  */
4045
4046 /* Initialize MCTX.  */
4047
4048 static reg_errcode_t
4049 match_ctx_init (mctx, eflags, n)
4050     re_match_context_t *mctx;
4051     int eflags, n;
4052 {
4053   mctx->eflags = eflags;
4054   mctx->match_last = -1;
4055   if (n > 0)
4056     {
4057       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4058       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4059       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4060         return REG_ESPACE;
4061     }
4062   /* Already zero-ed by the caller.
4063      else
4064        mctx->bkref_ents = NULL;
4065      mctx->nbkref_ents = 0;
4066      mctx->nsub_tops = 0;  */
4067   mctx->abkref_ents = n;
4068   mctx->max_mb_elem_len = 1;
4069   mctx->asub_tops = n;
4070   return REG_NOERROR;
4071 }
4072
4073 /* Clean the entries which depend on the current input in MCTX.
4074    This function must be invoked when the matcher changes the start index
4075    of the input, or changes the input string.  */
4076
4077 static void
4078 match_ctx_clean (mctx)
4079     re_match_context_t *mctx;
4080 {
4081   match_ctx_free_subtops (mctx);
4082   mctx->nsub_tops = 0;
4083   mctx->nbkref_ents = 0;
4084 }
4085
4086 /* Free all the memory associated with MCTX.  */
4087
4088 static void
4089 match_ctx_free (mctx)
4090     re_match_context_t *mctx;
4091 {
4092   match_ctx_free_subtops (mctx);
4093   re_free (mctx->sub_tops);
4094   re_free (mctx->bkref_ents);
4095 }
4096
4097 /* Free all the memory associated with MCTX->SUB_TOPS.  */
4098
4099 static void
4100 match_ctx_free_subtops (mctx)
4101      re_match_context_t *mctx;
4102 {
4103   int st_idx;
4104   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4105     {
4106       int sl_idx;
4107       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4108       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4109         {
4110           re_sub_match_last_t *last = top->lasts[sl_idx];
4111           re_free (last->path.array);
4112           re_free (last);
4113         }
4114       re_free (top->lasts);
4115       if (top->path)
4116         {
4117           re_free (top->path->array);
4118           re_free (top->path);
4119         }
4120       free (top);
4121     }
4122 }
4123
4124 /* Add a new backreference entry to MCTX.
4125    Note that we assume that caller never call this function with duplicate
4126    entry, and call with STR_IDX which isn't smaller than any existing entry.
4127 */
4128
4129 static reg_errcode_t
4130 match_ctx_add_entry (mctx, node, str_idx, from, to)
4131      re_match_context_t *mctx;
4132      int node, str_idx, from, to;
4133 {
4134   if (mctx->nbkref_ents >= mctx->abkref_ents)
4135     {
4136       struct re_backref_cache_entry* new_entry;
4137       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4138                               mctx->abkref_ents * 2);
4139       if (BE (new_entry == NULL, 0))
4140         {
4141           re_free (mctx->bkref_ents);
4142           return REG_ESPACE;
4143         }
4144       mctx->bkref_ents = new_entry;
4145       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4146               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4147       mctx->abkref_ents *= 2;
4148     }
4149   if (mctx->nbkref_ents > 0
4150       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4151     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4152
4153   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4154   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4155   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4156   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4157   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4158   if (mctx->max_mb_elem_len < to - from)
4159     mctx->max_mb_elem_len = to - from;
4160   return REG_NOERROR;
4161 }
4162
4163 /* Search for the first entry which has the same str_idx, or -1 if none is
4164    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4165
4166 static int
4167 search_cur_bkref_entry (mctx, str_idx)
4168      re_match_context_t *mctx;
4169      int str_idx;
4170 {
4171   int left, right, mid, last;
4172   last = right = mctx->nbkref_ents;
4173   for (left = 0; left < right;)
4174     {
4175       mid = (left + right) / 2;
4176       if (mctx->bkref_ents[mid].str_idx < str_idx)
4177         left = mid + 1;
4178       else
4179         right = mid;
4180     }
4181   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4182     return left;
4183   else
4184     return -1;
4185 }
4186
4187 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4188    at STR_IDX.  */
4189
4190 static reg_errcode_t
4191 match_ctx_add_subtop (mctx, node, str_idx)
4192      re_match_context_t *mctx;
4193      int node, str_idx;
4194 {
4195 #ifdef DEBUG
4196   assert (mctx->sub_tops != NULL);
4197   assert (mctx->asub_tops > 0);
4198 #endif
4199   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4200     {
4201       int new_asub_tops = mctx->asub_tops * 2;
4202       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4203                                                    re_sub_match_top_t *,
4204                                                    new_asub_tops);
4205       if (BE (new_array == NULL, 0))
4206         return REG_ESPACE;
4207       mctx->sub_tops = new_array;
4208       mctx->asub_tops = new_asub_tops;
4209     }
4210   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4211   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4212     return REG_ESPACE;
4213   mctx->sub_tops[mctx->nsub_tops]->node = node;
4214   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4215   return REG_NOERROR;
4216 }
4217
4218 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4219    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4220
4221 static re_sub_match_last_t *
4222 match_ctx_add_sublast (subtop, node, str_idx)
4223      re_sub_match_top_t *subtop;
4224      int node, str_idx;
4225 {
4226   re_sub_match_last_t *new_entry;
4227   if (BE (subtop->nlasts == subtop->alasts, 0))
4228     {
4229       int new_alasts = 2 * subtop->alasts + 1;
4230       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4231                                                     re_sub_match_last_t *,
4232                                                     new_alasts);
4233       if (BE (new_array == NULL, 0))
4234         return NULL;
4235       subtop->lasts = new_array;
4236       subtop->alasts = new_alasts;
4237     }
4238   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4239   if (BE (new_entry != NULL, 1))
4240     {
4241       subtop->lasts[subtop->nlasts] = new_entry;
4242       new_entry->node = node;
4243       new_entry->str_idx = str_idx;
4244       ++subtop->nlasts;
4245     }
4246   return new_entry;
4247 }
4248
4249 static void
4250 sift_ctx_init (sctx, sifted_sts, limited_sts, last_node, last_str_idx)
4251     re_sift_context_t *sctx;
4252     re_dfastate_t **sifted_sts, **limited_sts;
4253     int last_node, last_str_idx;
4254 {
4255   sctx->sifted_states = sifted_sts;
4256   sctx->limited_states = limited_sts;
4257   sctx->last_node = last_node;
4258   sctx->last_str_idx = last_str_idx;
4259   re_node_set_init_empty (&sctx->limits);
4260 }