posix/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2018 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
#include <limits.h>
#include <stdint.h>
  21
  22 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  23                                      int n);
  24 static void match_ctx_clean (re_match_context_t *mctx);
  25 static void match_ctx_free (re_match_context_t *cache);
  26 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  27                                           int str_idx, int from, int to);
  28 static int search_cur_bkref_entry (const re_match_context_t *mctx,
  29                                    int str_idx);
  30 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  31                                            int str_idx);
  32 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  33                                                    int node, int str_idx);
  34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  35                            re_dfastate_t **limited_sts, int last_node,
  36                            int last_str_idx);
  37 static reg_errcode_t re_search_internal (const regex_t *preg,
  38                                          const char *string, int length,
  39                                          int start, int range, int stop,
  40                                          size_t nmatch, regmatch_t pmatch[],
  41                                          int eflags);
  42 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  43                              const char *string1, int length1,
  44                              const char *string2, int length2,
  45                              int start, int range, struct re_registers *regs,
  46                              int stop, int ret_len);
  47 static int re_search_stub (struct re_pattern_buffer *bufp,
  48                            const char *string, int length, int start,
  49                            int range, int stop, struct re_registers *regs,
  50                            int ret_len);
  51 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  52                               int nregs, int regs_allocated);
  53 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
  54 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  55                            int *p_match_first);
  56 static int check_halt_state_context (const re_match_context_t *mctx,
  57                                      const re_dfastate_t *state, int idx);
  58 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  59                          regmatch_t *prev_idx_match, int cur_node,
  60                          int cur_idx, int nmatch);
  61 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  62                                       int str_idx, int dest_node, int nregs,
  63                                       regmatch_t *regs,
  64                                       re_node_set *eps_via_nodes);
  65 static reg_errcode_t set_regs (const regex_t *preg,
  66                                const re_match_context_t *mctx,
  67                                size_t nmatch, regmatch_t *pmatch,
  68                                int fl_backtrack);
  69 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
  70
  71 #ifdef RE_ENABLE_I18N
  72 static int sift_states_iter_mb (const re_match_context_t *mctx,
  73                                 re_sift_context_t *sctx,
  74                                 int node_idx, int str_idx, int max_str_idx);
  75 #endif /* RE_ENABLE_I18N */
  76 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  77                                            re_sift_context_t *sctx);
  78 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  79                                           re_sift_context_t *sctx, int str_idx,
  80                                           re_node_set *cur_dest);
  81 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  82                                               re_sift_context_t *sctx,
  83                                               int str_idx,
  84                                               re_node_set *dest_nodes);
  85 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  86                                             re_node_set *dest_nodes,
  87                                             const re_node_set *candidates);
  88 static int check_dst_limits (const re_match_context_t *mctx,
  89                              re_node_set *limits,
  90                              int dst_node, int dst_idx, int src_node,
  91                              int src_idx);
  92 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
  93                                         int boundaries, int subexp_idx,
  94                                         int from_node, int bkref_idx);
  95 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
  96                                       int limit, int subexp_idx,
  97                                       int node, int str_idx,
  98                                       int bkref_idx);
  99 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
 100                                           re_node_set *dest_nodes,
 101                                           const re_node_set *candidates,
 102                                           re_node_set *limits,
 103                                           struct re_backref_cache_entry *bkref_ents,
 104                                           int str_idx);
 105 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 106                                         re_sift_context_t *sctx,
 107                                         int str_idx,
 108                                         const re_node_set *candidates);
 109 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 110                                         re_dfastate_t **dst,
 111                                         re_dfastate_t **src, int num);
 112 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 113                                          re_match_context_t *mctx);
 114 static re_dfastate_t *transit_state (reg_errcode_t *err,
 115                                      re_match_context_t *mctx,
 116                                      re_dfastate_t *state);
 117 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 118                                             re_match_context_t *mctx,
 119                                             re_dfastate_t *next_state);
 120 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 121                                                 re_node_set *cur_nodes,
 122                                                 int str_idx);
 123 #if 0
 124 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 125                                         re_match_context_t *mctx,
 126                                         re_dfastate_t *pstate);
 127 #endif
 128 #ifdef RE_ENABLE_I18N
 129 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 130                                        re_dfastate_t *pstate);
 131 #endif /* RE_ENABLE_I18N */
 132 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 133                                           const re_node_set *nodes);
 134 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 135                                  int bkref_node, int bkref_str_idx);
 136 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 137                                      const re_sub_match_top_t *sub_top,
 138                                      re_sub_match_last_t *sub_last,
 139                                      int bkref_node, int bkref_str);
 140 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 141                              int subexp_idx, int type);
 142 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 143                                     state_array_t *path, int top_node,
 144                                     int top_str, int last_node, int last_str,
 145                                     int type);
 146 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 147                                                    int str_idx,
 148                                                    re_node_set *cur_nodes,
 149                                                    re_node_set *next_nodes);
 150 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 151                                                re_node_set *cur_nodes,
 152                                                int ex_subexp, int type);
 153 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 154                                                    re_node_set *dst_nodes,
 155                                                    int target, int ex_subexp,
 156                                                    int type);
 157 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 158                                          re_node_set *cur_nodes, int cur_str,
 159                                          int subexp_num, int type);
 160 static int build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
 161 #ifdef RE_ENABLE_I18N
 162 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 163                                     const re_string_t *input, int idx);
 164 # ifdef _LIBC
 165 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 166                                                    size_t name_len);
 167 # endif /* _LIBC */
 168 #endif /* RE_ENABLE_I18N */
 169 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
 170                                        const re_dfastate_t *state,
 171                                        re_node_set *states_node,
 172                                        bitset_t *states_ch);
 173 static int check_node_accept (const re_match_context_t *mctx,
 174                               const re_token_t *node, int idx);
 175 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
 176 \f
 177 /* Entry point for POSIX code.  */
 178
 179 /* regexec searches for a given pattern, specified by PREG, in the
 180    string STRING.
 181
 182    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 183    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 184    least NMATCH elements, and we set them to the offsets of the
 185    corresponding matched substrings.
 186
 187    EFLAGS specifies `execution flags' which affect matching: if
 188    REG_NOTBOL is set, then ^ does not match at the beginning of the
 189    string; if REG_NOTEOL is set, then $ does not match at the end.
 190
 191    We return 0 if we find a match and REG_NOMATCH if not.  */
 192
 193 int
 194 regexec (const regex_t *__restrict preg, const char *__restrict string,
 195          size_t nmatch, regmatch_t pmatch[], int eflags)
 196 {
 197   reg_errcode_t err;
 198   int start, length;
 199   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 200
 201   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 202     return REG_BADPAT;
 203
 204   if (eflags & REG_STARTEND)
 205     {
 206       start = pmatch[0].rm_so;
 207       length = pmatch[0].rm_eo;
 208     }
 209   else
 210     {
 211       start = 0;
 212       length = strlen (string);
 213     }
 214
 215   __libc_lock_lock (dfa->lock);
 216   if (preg->no_sub)
 217     err = re_search_internal (preg, string, length, start, length - start,
 218                               length, 0, NULL, eflags);
 219   else
 220     err = re_search_internal (preg, string, length, start, length - start,
 221                               length, nmatch, pmatch, eflags);
 222   __libc_lock_unlock (dfa->lock);
 223   return err != REG_NOERROR;
 224 }
 225
 226 #ifdef _LIBC
 227 libc_hidden_def (__regexec)
 228
 229 # include <shlib-compat.h>
 230 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 231
 232 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 233 __typeof__ (__regexec) __compat_regexec;
 234
 235 int
 236 attribute_compat_text_section
 237 __compat_regexec (const regex_t *__restrict preg,
 238                   const char *__restrict string, size_t nmatch,
 239                   regmatch_t pmatch[], int eflags)
 240 {
 241   return regexec (preg, string, nmatch, pmatch,
 242                   eflags & (REG_NOTBOL | REG_NOTEOL));
 243 }
 244 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 245 # endif
 246 #endif
 247
 248 /* Entry points for GNU code.  */
 249
/* re_match, re_search, re_match_2, re_search_2

   The former two functions operate on STRING with length LENGTH,
   while the latter two operate on the concatenation of STRING1 and
   STRING2 with lengths LENGTH1 and LENGTH2, respectively.

   re_match() matches the compiled pattern in BUFP against the string,
   starting at index START.

   re_search() first tries matching at index START, then it tries to match
   starting from index START + 1, and so on.  The last start position tried
   is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
   way as re_match().)

   The parameter STOP of re_{match,search}_2 specifies that no match
   extending beyond the first STOP characters of the concatenation of the
   strings should be considered.

   If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
   and of all groups are stored in REGS.  (For the "_2" variants, the offsets
   are computed relative to the concatenation, not relative to the individual
   strings.)

   On success, re_match* functions return the length of the match, re_search*
   return the position of the start of the match.  Return value -1 means no
   match was found and -2 indicates an internal error.  */
 276
 277 int
 278 re_match (struct re_pattern_buffer *bufp, const char *string, int length,
 279           int start, struct re_registers *regs)
 280 {
 281   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 282 }
 283 #ifdef _LIBC
 284 weak_alias (__re_match, re_match)
 285 #endif
 286
 287 int
 288 re_search (struct re_pattern_buffer *bufp, const char *string, int length,
 289            int start, int range, struct re_registers *regs)
 290 {
 291   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 292 }
 293 #ifdef _LIBC
 294 weak_alias (__re_search, re_search)
 295 #endif
 296
 297 int
 298 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, int length1,
 299             const char *string2, int length2, int start,
 300             struct re_registers *regs, int stop)
 301 {
 302   return re_search_2_stub (bufp, string1, length1, string2, length2,
 303                            start, 0, regs, stop, 1);
 304 }
 305 #ifdef _LIBC
 306 weak_alias (__re_match_2, re_match_2)
 307 #endif
 308
 309 int
 310 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, int length1,
 311              const char *string2, int length2, int start, int range,
 312              struct re_registers *regs, int stop)
 313 {
 314   return re_search_2_stub (bufp, string1, length1, string2, length2,
 315                            start, range, regs, stop, 0);
 316 }
 317 #ifdef _LIBC
 318 weak_alias (__re_search_2, re_search_2)
 319 #endif
 320
 321 static int
 322 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
 323                   int length1, const char *string2, int length2, int start,
 324                   int range, struct re_registers *regs,
 325                   int stop, int ret_len)
 326 {
 327   const char *str;
 328   int rval;
 329   int len = length1 + length2;
 330   char *s = NULL;
 331
 332   if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
 333     return -2;
 334
 335   /* Concatenate the strings.  */
 336   if (length2 > 0)
 337     if (length1 > 0)
 338       {
 339         s = re_malloc (char, len);
 340
 341         if (BE (s == NULL, 0))
 342           return -2;
 343 #ifdef _LIBC
 344         memcpy (__mempcpy (s, string1, length1), string2, length2);
 345 #else
 346         memcpy (s, string1, length1);
 347         memcpy (s + length1, string2, length2);
 348 #endif
 349         str = s;
 350       }
 351     else
 352       str = string2;
 353   else
 354     str = string1;
 355
 356   rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
 357   re_free (s);
 358   return rval;
 359 }
 360
 361 /* The parameters have the same meaning as those of re_search.
 362    Additional parameters:
 363    If RET_LEN is nonzero the length of the match is returned (re_match style);
 364    otherwise the position of the match is returned.  */
 365
 366 static int
 367 re_search_stub (struct re_pattern_buffer *bufp, const char *string, int length,
 368                 int start, int range, int stop, struct re_registers *regs,
 369                 int ret_len)
 370 {
 371   reg_errcode_t result;
 372   regmatch_t *pmatch;
 373   int nregs, rval;
 374   int eflags = 0;
 375   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 376
 377   /* Check for out-of-range.  */
 378   if (BE (start < 0 || start > length, 0))
 379     return -1;
 380   if (BE (start + range > length, 0))
 381     range = length - start;
 382   else if (BE (start + range < 0, 0))
 383     range = -start;
 384
 385   __libc_lock_lock (dfa->lock);
 386
 387   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 388   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 389
 390   /* Compile fastmap if we haven't yet.  */
 391   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 392     re_compile_fastmap (bufp);
 393
 394   if (BE (bufp->no_sub, 0))
 395     regs = NULL;
 396
 397   /* We need at least 1 register.  */
 398   if (regs == NULL)
 399     nregs = 1;
 400   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 401                regs->num_regs < bufp->re_nsub + 1, 0))
 402     {
 403       nregs = regs->num_regs;
 404       if (BE (nregs < 1, 0))
 405         {
 406           /* Nothing can be copied to regs.  */
 407           regs = NULL;
 408           nregs = 1;
 409         }
 410     }
 411   else
 412     nregs = bufp->re_nsub + 1;
 413   pmatch = re_malloc (regmatch_t, nregs);
 414   if (BE (pmatch == NULL, 0))
 415     {
 416       rval = -2;
 417       goto out;
 418     }
 419
 420   result = re_search_internal (bufp, string, length, start, range, stop,
 421                                nregs, pmatch, eflags);
 422
 423   rval = 0;
 424
 425   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 426   if (result != REG_NOERROR)
 427     rval = -1;
 428   else if (regs != NULL)
 429     {
 430       /* If caller wants register contents data back, copy them.  */
 431       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 432                                            bufp->regs_allocated);
 433       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 434         rval = -2;
 435     }
 436
 437   if (BE (rval == 0, 1))
 438     {
 439       if (ret_len)
 440         {
 441           assert (pmatch[0].rm_so == start);
 442           rval = pmatch[0].rm_eo - start;
 443         }
 444       else
 445         rval = pmatch[0].rm_so;
 446     }
 447   re_free (pmatch);
 448  out:
 449   __libc_lock_unlock (dfa->lock);
 450   return rval;
 451 }
 452
 453 static unsigned
 454 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, int nregs,
 455               int regs_allocated)
 456 {
 457   int rval = REGS_REALLOCATE;
 458   int i;
 459   int need_regs = nregs + 1;
 460   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 461      uses.  */
 462
 463   /* Have the register data arrays been allocated?  */
 464   if (regs_allocated == REGS_UNALLOCATED)
 465     { /* No.  So allocate them with malloc.  */
 466       regs->start = re_malloc (regoff_t, need_regs);
 467       if (BE (regs->start == NULL, 0))
 468         return REGS_UNALLOCATED;
 469       regs->end = re_malloc (regoff_t, need_regs);
 470       if (BE (regs->end == NULL, 0))
 471         {
 472           re_free (regs->start);
 473           return REGS_UNALLOCATED;
 474         }
 475       regs->num_regs = need_regs;
 476     }
 477   else if (regs_allocated == REGS_REALLOCATE)
 478     { /* Yes.  If we need more elements than were already
 479          allocated, reallocate them.  If we need fewer, just
 480          leave it alone.  */
 481       if (BE (need_regs > regs->num_regs, 0))
 482         {
 483           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 484           regoff_t *new_end;
 485           if (BE (new_start == NULL, 0))
 486             return REGS_UNALLOCATED;
 487           new_end = re_realloc (regs->end, regoff_t, need_regs);
 488           if (BE (new_end == NULL, 0))
 489             {
 490               re_free (new_start);
 491               return REGS_UNALLOCATED;
 492             }
 493           regs->start = new_start;
 494           regs->end = new_end;
 495           regs->num_regs = need_regs;
 496         }
 497     }
 498   else
 499     {
 500       assert (regs_allocated == REGS_FIXED);
 501       /* This function may not be called with REGS_FIXED and nregs too big.  */
 502       assert (regs->num_regs >= nregs);
 503       rval = REGS_FIXED;
 504     }
 505
 506   /* Copy the regs.  */
 507   for (i = 0; i < nregs; ++i)
 508     {
 509       regs->start[i] = pmatch[i].rm_so;
 510       regs->end[i] = pmatch[i].rm_eo;
 511     }
 512   for ( ; i < regs->num_regs; ++i)
 513     regs->start[i] = regs->end[i] = -1;
 514
 515   return rval;
 516 }
 517
 518 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 519    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 520    this memory for recording register information.  STARTS and ENDS
 521    must be allocated using the malloc library routine, and must each
 522    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 523
 524    If NUM_REGS == 0, then subsequent matches should allocate their own
 525    register data.
 526
 527    Unless this function is called, the first search or match using
 528    PATTERN_BUFFER will allocate its own register data, without
 529    freeing the old data.  */
 530
 531 void
 532 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
 533                   unsigned num_regs, regoff_t *starts, regoff_t *ends)
 534 {
 535   if (num_regs)
 536     {
 537       bufp->regs_allocated = REGS_REALLOCATE;
 538       regs->num_regs = num_regs;
 539       regs->start = starts;
 540       regs->end = ends;
 541     }
 542   else
 543     {
 544       bufp->regs_allocated = REGS_UNALLOCATED;
 545       regs->num_regs = 0;
 546       regs->start = regs->end = (regoff_t *) 0;
 547     }
 548 }
 549 #ifdef _LIBC
 550 weak_alias (__re_set_registers, re_set_registers)
 551 #endif
 552 \f
 553 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 554    them unless specifically requested.  */
 555
 556 #if defined _REGEX_RE_COMP || defined _LIBC
 557 int
 558 # ifdef _LIBC
 559 weak_function
 560 # endif
 561 re_exec (const char *s)
 562 {
 563   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 564 }
 565 #endif /* _REGEX_RE_COMP */
 566 \f
 567 /* Internal entry point.  */
 568
/* Searches for a compiled pattern PREG in the string STRING, whose
   length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
   meaning as with regexec.  START and RANGE have the same meaning
   as with re_search.
   Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
   otherwise return the error code.
   Note: We assume front end functions already check ranges.
   (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 577
 578 static reg_errcode_t
 579 __attribute_warn_unused_result__
 580 re_search_internal (const regex_t *preg, const char *string, int length,
 581                     int start, int range, int stop, size_t nmatch,
 582                     regmatch_t pmatch[], int eflags)
 583 {
 584   reg_errcode_t err;
 585   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
 586   int left_lim, right_lim, incr;
 587   int fl_longest_match, match_first, match_kind, match_last = -1;
 588   int extra_nmatch;
 589   int sb, ch;
 590 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 591   re_match_context_t mctx = { .dfa = dfa };
 592 #else
 593   re_match_context_t mctx;
 594 #endif
 595   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 596                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 597   RE_TRANSLATE_TYPE t = preg->translate;
 598
 599 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 600   memset (&mctx, '\0', sizeof (re_match_context_t));
 601   mctx.dfa = dfa;
 602 #endif
 603
 604   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 605   nmatch -= extra_nmatch;
 606
 607   /* Check if the DFA haven't been compiled.  */
 608   if (BE (preg->used == 0 || dfa->init_state == NULL
 609           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 610           || dfa->init_state_begbuf == NULL, 0))
 611     return REG_NOMATCH;
 612
 613 #ifdef DEBUG
 614   /* We assume front-end functions already check them.  */
 615   assert (start + range >= 0 && start + range <= length);
 616 #endif
 617
 618   /* If initial states with non-begbuf contexts have no elements,
 619      the regex must be anchored.  If preg->newline_anchor is set,
 620      we'll never use init_state_nl, so do not check it.  */
 621   if (dfa->init_state->nodes.nelem == 0
 622       && dfa->init_state_word->nodes.nelem == 0
 623       && (dfa->init_state_nl->nodes.nelem == 0
 624           || !preg->newline_anchor))
 625     {
 626       if (start != 0 && start + range != 0)
 627         return REG_NOMATCH;
 628       start = range = 0;
 629     }
 630
 631   /* We must check the longest matching, if nmatch > 0.  */
 632   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 633
 634   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 635                             preg->translate, preg->syntax & RE_ICASE, dfa);
 636   if (BE (err != REG_NOERROR, 0))
 637     goto free_return;
 638   mctx.input.stop = stop;
 639   mctx.input.raw_stop = stop;
 640   mctx.input.newline_anchor = preg->newline_anchor;
 641
 642   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 643   if (BE (err != REG_NOERROR, 0))
 644     goto free_return;
 645
 646   /* We will log all the DFA states through which the dfa pass,
 647      if nmatch > 1, or this dfa has "multibyte node", which is a
 648      back-reference or a node which can accept multibyte character or
 649      multi character collating element.  */
 650   if (nmatch > 1 || dfa->has_mb_node)
 651     {
 652       /* Avoid overflow.  */
 653       if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
 654         {
 655           err = REG_ESPACE;
 656           goto free_return;
 657         }
 658
 659       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 660       if (BE (mctx.state_log == NULL, 0))
 661         {
 662           err = REG_ESPACE;
 663           goto free_return;
 664         }
 665     }
 666   else
 667     mctx.state_log = NULL;
 668
 669   match_first = start;
 670   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 671                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 672
 673   /* Check incrementally whether of not the input string match.  */
 674   incr = (range < 0) ? -1 : 1;
 675   left_lim = (range < 0) ? start + range : start;
 676   right_lim = (range < 0) ? start : start + range;
 677   sb = dfa->mb_cur_max == 1;
 678   match_kind =
 679     (fastmap
 680      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 681         | (range >= 0 ? 2 : 0)
 682         | (t != NULL ? 1 : 0))
 683      : 8);
 684
 685   for (;; match_first += incr)
 686     {
 687       err = REG_NOMATCH;
 688       if (match_first < left_lim || right_lim < match_first)
 689         goto free_return;
 690
 691       /* Advance as rapidly as possible through the string, until we
 692          find a plausible place to start matching.  This may be done
 693          with varying efficiency, so there are various possibilities:
 694          only the most common of them are specialized, in order to
 695          save on code size.  We use a switch statement for speed.  */
 696       switch (match_kind)
 697         {
 698         case 8:
 699           /* No fastmap.  */
 700           break;
 701
 702         case 7:
 703           /* Fastmap with single-byte translation, match forward.  */
 704           while (BE (match_first < right_lim, 1)
 705                  && !fastmap[t[(unsigned char) string[match_first]]])
 706             ++match_first;
 707           goto forward_match_found_start_or_reached_end;
 708
 709         case 6:
 710           /* Fastmap without translation, match forward.  */
 711           while (BE (match_first < right_lim, 1)
 712                  && !fastmap[(unsigned char) string[match_first]])
 713             ++match_first;
 714
 715         forward_match_found_start_or_reached_end:
 716           if (BE (match_first == right_lim, 0))
 717             {
 718               ch = match_first >= length
 719                        ? 0 : (unsigned char) string[match_first];
 720               if (!fastmap[t ? t[ch] : ch])
 721                 goto free_return;
 722             }
 723           break;
 724
 725         case 4:
 726         case 5:
 727           /* Fastmap without multi-byte translation, match backwards.  */
 728           while (match_first >= left_lim)
 729             {
 730               ch = match_first >= length
 731                        ? 0 : (unsigned char) string[match_first];
 732               if (fastmap[t ? t[ch] : ch])
 733                 break;
 734               --match_first;
 735             }
 736           if (match_first < left_lim)
 737             goto free_return;
 738           break;
 739
 740         default:
 741           /* In this case, we can't determine easily the current byte,
 742              since it might be a component byte of a multibyte
 743              character.  Then we use the constructed buffer instead.  */
 744           for (;;)
 745             {
 746               /* If MATCH_FIRST is out of the valid range, reconstruct the
 747                  buffers.  */
 748               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 749               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 750                 {
 751                   err = re_string_reconstruct (&mctx.input, match_first,
 752                                                eflags);
 753                   if (BE (err != REG_NOERROR, 0))
 754                     goto free_return;
 755
 756                   offset = match_first - mctx.input.raw_mbs_idx;
 757                 }
 758               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 759                  Note that MATCH_FIRST must not be smaller than 0.  */
 760               ch = (match_first >= length
 761                     ? 0 : re_string_byte_at (&mctx.input, offset));
 762               if (fastmap[ch])
 763                 break;
 764               match_first += incr;
 765               if (match_first < left_lim || match_first > right_lim)
 766                 {
 767                   err = REG_NOMATCH;
 768                   goto free_return;
 769                 }
 770             }
 771           break;
 772         }
 773
 774       /* Reconstruct the buffers so that the matcher can assume that
 775          the matching starts from the beginning of the buffer.  */
 776       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 777       if (BE (err != REG_NOERROR, 0))
 778         goto free_return;
 779
 780 #ifdef RE_ENABLE_I18N
 781      /* Don't consider this char as a possible match start if it is part,
 782         yet isn't the head, of a multibyte character.  */
 783       if (!sb && !re_string_first_byte (&mctx.input, 0))
 784         continue;
 785 #endif
 786
 787       /* It seems to be appropriate one, then use the matcher.  */
 788       /* We assume that the matching starts from 0.  */
 789       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 790       match_last = check_matching (&mctx, fl_longest_match,
 791                                    range >= 0 ? &match_first : NULL);
 792       if (match_last != -1)
 793         {
 794           if (BE (match_last == -2, 0))
 795             {
 796               err = REG_ESPACE;
 797               goto free_return;
 798             }
 799           else
 800             {
 801               mctx.match_last = match_last;
 802               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 803                 {
 804                   re_dfastate_t *pstate = mctx.state_log[match_last];
 805                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 806                                                              match_last);
 807                 }
 808               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 809                   || dfa->nbackref)
 810                 {
 811                   err = prune_impossible_nodes (&mctx);
 812                   if (err == REG_NOERROR)
 813                     break;
 814                   if (BE (err != REG_NOMATCH, 0))
 815                     goto free_return;
 816                   match_last = -1;
 817                 }
 818               else
 819                 break; /* We found a match.  */
 820             }
 821         }
 822
 823       match_ctx_clean (&mctx);
 824     }
 825
 826 #ifdef DEBUG
 827   assert (match_last != -1);
 828   assert (err == REG_NOERROR);
 829 #endif
 830
 831   /* Set pmatch[] if we need.  */
 832   if (nmatch > 0)
 833     {
 834       int reg_idx;
 835
 836       /* Initialize registers.  */
 837       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 838         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 839
 840       /* Set the points where matching start/end.  */
 841       pmatch[0].rm_so = 0;
 842       pmatch[0].rm_eo = mctx.match_last;
 843
 844       if (!preg->no_sub && nmatch > 1)
 845         {
 846           err = set_regs (preg, &mctx, nmatch, pmatch,
 847                           dfa->has_plural_match && dfa->nbackref > 0);
 848           if (BE (err != REG_NOERROR, 0))
 849             goto free_return;
 850         }
 851
 852       /* At last, add the offset to each register, since we slid
 853          the buffers so that we could assume that the matching starts
 854          from 0.  */
 855       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 856         if (pmatch[reg_idx].rm_so != -1)
 857           {
 858 #ifdef RE_ENABLE_I18N
 859             if (BE (mctx.input.offsets_needed != 0, 0))
 860               {
 861                 pmatch[reg_idx].rm_so =
 862                   (pmatch[reg_idx].rm_so == mctx.input.valid_len
 863                    ? mctx.input.valid_raw_len
 864                    : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 865                 pmatch[reg_idx].rm_eo =
 866                   (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 867                    ? mctx.input.valid_raw_len
 868                    : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 869               }
 870 #else
 871             assert (mctx.input.offsets_needed == 0);
 872 #endif
 873             pmatch[reg_idx].rm_so += match_first;
 874             pmatch[reg_idx].rm_eo += match_first;
 875           }
 876       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 877         {
 878           pmatch[nmatch + reg_idx].rm_so = -1;
 879           pmatch[nmatch + reg_idx].rm_eo = -1;
 880         }
 881
 882       if (dfa->subexp_map)
 883         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 884           if (dfa->subexp_map[reg_idx] != reg_idx)
 885             {
 886               pmatch[reg_idx + 1].rm_so
 887                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 888               pmatch[reg_idx + 1].rm_eo
 889                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 890             }
 891     }
 892
 893  free_return:
 894   re_free (mctx.state_log);
 895   if (dfa->nbackref)
 896     match_ctx_free (&mctx);
 897   re_string_destruct (&mctx.input);
 898   return err;
 899 }
 900
     /* Sift the state log of MCTX backward from MCTX->match_last, keeping only
        what sift_states_backward retains for the halt node MCTX->last_node.
        On success the old MCTX->state_log is freed and replaced by the sifted
        array, MCTX->last_node and MCTX->match_last are updated, and
        REG_NOERROR is returned.
        Return REG_NOMATCH if no state survives at index 0 (with back
        references, only after every earlier halt state has been tried too),
        REG_ESPACE on allocation failure or size overflow, or the error
        reported by sift_states_backward or merge_state_array.  */
 901 static reg_errcode_t
 902 __attribute_warn_unused_result__
 903 prune_impossible_nodes (re_match_context_t *mctx)
 904 {
 905   const re_dfa_t *const dfa = mctx->dfa;
 906   int halt_node, match_last;
 907   reg_errcode_t ret;
 908   re_dfastate_t **sifted_states;
 909   re_dfastate_t **lim_states = NULL;
 910   re_sift_context_t sctx;
 911 #ifdef DEBUG
 912   assert (mctx->state_log != NULL);
 913 #endif
 914   match_last = mctx->match_last;
 915   halt_node = mctx->last_node;
 916
 917   /* Avoid overflow in the sizes of the arrays allocated below.  */
 918   if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
 919     return REG_ESPACE;
 920
 921   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 922   if (BE (sifted_states == NULL, 0))
 923     {
 924       ret = REG_ESPACE;
 925       goto free_return;
 926     }
       /* With back references a second array, LIM_STATES, is filled alongside
          SIFTED_STATES and merged into it once sifting has succeeded.  */
 927   if (dfa->nbackref)
 928     {
 929       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 930       if (BE (lim_states == NULL, 0))
 931         {
 932           ret = REG_ESPACE;
 933           goto free_return;
 934         }
 935       while (1)
 936         {
 937           memset (lim_states, '\0',
 938                   sizeof (re_dfastate_t *) * (match_last + 1));
 939           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 940                          match_last);
 941           ret = sift_states_backward (mctx, &sctx);
 942           re_node_set_free (&sctx.limits);
 943           if (BE (ret != REG_NOERROR, 0))
 944               goto free_return;
               /* Something survived at index 0: this match end is usable.  */
 945           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 946             break;
               /* Otherwise retry from the closest earlier index whose logged
                  state is a halt state; give up when there is none.  */
 947           do
 948             {
 949               --match_last;
 950               if (match_last < 0)
 951                 {
 952                   ret = REG_NOMATCH;
 953                   goto free_return;
 954                 }
 955             } while (mctx->state_log[match_last] == NULL
 956                      || !mctx->state_log[match_last]->halt);
 957           halt_node = check_halt_state_context (mctx,
 958                                                 mctx->state_log[match_last],
 959                                                 match_last);
 960         }
 961       ret = merge_state_array (dfa, sifted_states, lim_states,
 962                                match_last + 1);
 963       re_free (lim_states);
 964       lim_states = NULL;
 965       if (BE (ret != REG_NOERROR, 0))
 966         goto free_return;
 967     }
 968   else
 969     {
           /* Without back references a single sifting pass decides.  */
 970       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
 971       ret = sift_states_backward (mctx, &sctx);
 972       re_node_set_free (&sctx.limits);
 973       if (BE (ret != REG_NOERROR, 0))
 974         goto free_return;
 975       if (sifted_states[0] == NULL)
 976         {
 977           ret = REG_NOMATCH;
 978           goto free_return;
 979         }
 980     }
       /* Success: the sifted array replaces the state log.  The local pointer
          is cleared so that the cleanup below does not free it.  */
 981   re_free (mctx->state_log);
 982   mctx->state_log = sifted_states;
 983   sifted_states = NULL;
 984   mctx->last_node = halt_node;
 985   mctx->match_last = match_last;
 986   ret = REG_NOERROR;
 987  free_return:
 988   re_free (sifted_states);
 989   re_free (lim_states);
 990   return ret;
 991 }
 992
 993 /* Return the initial DFA state for a match attempt starting at IDX.
 994    Initial states may carry constraints such as "\<" or "^", so the state
 995    is selected from the context found at index IDX - 1.  */
 996
 997 static inline re_dfastate_t *
 998 __attribute ((always_inline))
 999 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1000                             int idx)
1001 {
1002   const re_dfa_t *const dfa = mctx->dfa;
1003   unsigned int prev_ctx;
1004
1005   /* Without constraints a single initial state fits every context.  */
1006   if (!dfa->init_state->has_constraint)
1007     return dfa->init_state;
1008
1009   /* Otherwise choose among the initial states kept in DFA.  */
1010   prev_ctx = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1011   if (IS_WORD_CONTEXT (prev_ctx))
1012     return dfa->init_state_word;
1013   if (IS_ORDINARY_CONTEXT (prev_ctx))
1014     return dfa->init_state;
1015   if (IS_NEWLINE_CONTEXT (prev_ctx))
1016     return (IS_BEGBUF_CONTEXT (prev_ctx)
1017             ? dfa->init_state_begbuf : dfa->init_state_nl);
1018   if (IS_BEGBUF_CONTEXT (prev_ctx))
1019     {
1020       /* This case is relatively rare, so its state is acquired on demand.  */
1021       return re_acquire_state_context (err, dfa,
1022                                        dfa->init_state->entrance_nodes,
1023                                        prev_ctx);
1024     }
1025
1026   /* No context test matched; fall back to the plain initial state.  */
1027   return dfa->init_state;
1028 }
1029
1030 /* Check whether the regular expression matches the input string of MCTX,
1031    and return the index where the match ends, -1 if there is no match,
1032    or -2 in case of an error.
1033    FL_LONGEST_MATCH means we want the POSIX longest match.
1034    If P_MATCH_FIRST is not NULL and the match fails, *P_MATCH_FIRST is
1035    advanced to the next place where we may want to try matching.
1036    Note that the matcher assumes that the matching starts from the current
1037    index of the buffer.  */
1038
1039 static int
1040 __attribute_warn_unused_result__
1041 check_matching (re_match_context_t *mctx, int fl_longest_match,
1042                 int *p_match_first)
1043 {
1044   const re_dfa_t *const dfa = mctx->dfa;
1045   reg_errcode_t err;
1046   int match = 0;
1047   int match_last = -1;
1048   int cur_str_idx = re_string_cur_idx (&mctx->input);
1049   re_dfastate_t *cur_state;
1050   int at_init_state = p_match_first != NULL;
1051   int next_start_idx = cur_str_idx;
1052
1053   err = REG_NOERROR;
1054   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1055   /* An initial state must not be NULL (invalid).  */
1056   if (BE (cur_state == NULL, 0))
1057     {
1058       assert (err == REG_ESPACE);
1059       return -2;
1060     }
1061
1062   if (mctx->state_log != NULL)
1063     {
1064       mctx->state_log[cur_str_idx] = cur_state;
1065
1066       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1067          later.  E.g. Processing back references.  */
1068       if (BE (dfa->nbackref, 0))
1069         {
1070           at_init_state = 0;
1071           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
               /* Report failures as -2, as documented above: an error code
                  other than -1 or -2 would be taken for a match end by the
                  caller.  */
1072           if (BE (err != REG_NOERROR, 0))
1073             return -2;
1074
1075           if (cur_state->has_backref)
1076             {
1077               err = transit_state_bkref (mctx, &cur_state->nodes);
1078               if (BE (err != REG_NOERROR, 0))
1079                 return -2;
1080             }
1081         }
1082     }
1083
1084   /* If the RE accepts the empty string.  */
1085   if (BE (cur_state->halt, 0))
1086     {
1087       if (!cur_state->has_constraint
1088           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1089         {
1090           if (!fl_longest_match)
1091             return cur_str_idx;
1092           else
1093             {
1094               match_last = cur_str_idx;
1095               match = 1;
1096             }
1097         }
1098     }
1099
1100   while (!re_string_eoi (&mctx->input))
1101     {
1102       re_dfastate_t *old_state = cur_state;
1103       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1104
           /* Extend the buffers when the next index is beyond what is
              currently available although more input exists.  */
1105       if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1106            && mctx->input.bufs_len < mctx->input.len)
1107           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1108               && mctx->input.valid_len < mctx->input.len))
1109         {
1110           err = extend_buffers (mctx, next_char_idx + 1);
1111           if (BE (err != REG_NOERROR, 0))
1112             {
1113               assert (err == REG_ESPACE);
1114               return -2;
1115             }
1116         }
1117
1118       cur_state = transit_state (&err, mctx, cur_state);
1119       if (mctx->state_log != NULL)
1120         cur_state = merge_state_with_log (&err, mctx, cur_state);
1121
1122       if (cur_state == NULL)
1123         {
1124           /* Reached the invalid state or an error.  Try to recover a valid
1125              state using the state log, if available and if we have not
1126              already found a valid (even if not the longest) match.  */
1127           if (BE (err != REG_NOERROR, 0))
1128             return -2;
1129
1130           if (mctx->state_log == NULL
1131               || (match && !fl_longest_match)
1132               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1133             break;
1134         }
1135
           /* While the state does not change from the initial one, the next
              match attempt may start after the characters consumed so far.  */
1136       if (BE (at_init_state, 0))
1137         {
1138           if (old_state == cur_state)
1139             next_start_idx = next_char_idx;
1140           else
1141             at_init_state = 0;
1142         }
1143
1144       if (cur_state->halt)
1145         {
1146           /* Reached a halt state.
1147              Check the halt state can satisfy the current context.  */
1148           if (!cur_state->has_constraint
1149               || check_halt_state_context (mctx, cur_state,
1150                                            re_string_cur_idx (&mctx->input)))
1151             {
1152               /* We found an appropriate halt state.  */
1153               match_last = re_string_cur_idx (&mctx->input);
1154               match = 1;
1155
1156               /* We found a match, do not modify match_first below.  */
1157               p_match_first = NULL;
1158               if (!fl_longest_match)
1159                 break;
1160             }
1161         }
1162     }
1163
1164   if (p_match_first)
1165     *p_match_first += next_start_idx;
1166
1167   return match_last;
1168 }
1169
1170 /* Return 1 if NODE is an END_OF_RE node whose constraint, if it has one,
1171    is satisfied by CONTEXT; return 0 otherwise.  */
1172
1173 static int
1174 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1175 {
1176   const unsigned int cond = dfa->nodes[node].constraint;
1177
1178   /* Only an END_OF_RE node can be a halt node.  */
1179   if (dfa->nodes[node].type != END_OF_RE)
1180     return 0;
1181   /* A node without constraint is accepted in every context.  */
1182   return (cond == 0
1183           || !NOT_SATISFY_NEXT_CONSTRAINT (cond, context)) ? 1 : 0;
1184 }
1185
1186 /* Return the first node of the halt state STATE that is a halt node
1187    matching the context at string index IDX, or 0 if there is none.  */
1188
1189 static int
1190 check_halt_state_context (const re_match_context_t *mctx,
1191                           const re_dfastate_t *state, int idx)
1192 {
1193   const re_node_set *candidates = &state->nodes;
1194   unsigned int ctx;
1195   int pos;
1196 #ifdef DEBUG
1197   assert (state->halt);
1198 #endif
1199   ctx = re_string_context_at (&mctx->input, idx, mctx->eflags);
1200   for (pos = 0; pos < candidates->nelem; pos++)
1201     if (check_halt_node_context (mctx->dfa, candidates->elems[pos], ctx))
1202       return candidates->elems[pos];
1203   return 0;
1204 }
1205
1206 /* Compute the next node to which the "NFA" transits from NODE (the "NFA"
1207    is the NFA corresponding to the DFA).
1208    Return the destination node and update EPS_VIA_NODES.  Return -1 when
1209    no valid destination is found, or -2 when inserting into EPS_VIA_NODES
        or pushing onto the fail stack FS fails.  *PIDX is advanced when NODE
        accepts input.  FS may be NULL, meaning no backtracking.  */
1210
1211 static int
1212 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1213                    int *pidx, int node, re_node_set *eps_via_nodes,
1214                    struct re_fail_stack_t *fs)
1215 {
1216   const re_dfa_t *const dfa = mctx->dfa;
1217   int i, err;
1218   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1219     {
1220       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1221       re_node_set *edests = &dfa->edests[node];
1222       int dest_node;
           /* Remember that NODE was passed through without consuming input.  */
1223       err = re_node_set_insert (eps_via_nodes, node);
1224       if (BE (err < 0, 0))
1225         return -2;
1226       /* Pick up a valid destination, or return -1 if none is found.  */
1227       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1228         {
1229           int candidate = edests->elems[i];
1230           if (!re_node_set_contains (cur_nodes, candidate))
1231             continue;
1232           if (dest_node == -1)
1233             dest_node = candidate;
1234
1235           else
1236             {
1237               /* In order to avoid infinite loop like "(a*)*", return the second
1238                  epsilon-transition if the first was already considered.  */
1239               if (re_node_set_contains (eps_via_nodes, dest_node))
1240                 return candidate;
1241
1242               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1243               else if (fs != NULL
1244                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1245                                            eps_via_nodes))
1246                 return -2;
1247
1248               /* We know we are going to exit.  */
1249               break;
1250             }
1251         }
1252       return dest_node;
1253     }
1254   else
1255     {
           /* NACCEPTED is the number of bytes NODE consumes; 0 means that it
              is decided by check_node_accept below.  */
1256       int naccepted = 0;
1257       re_token_type_t type = dfa->nodes[node].type;
1258
1259 #ifdef RE_ENABLE_I18N
1260       if (dfa->nodes[node].accept_mb)
1261         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1262       else
1263 #endif /* RE_ENABLE_I18N */
1264       if (type == OP_BACK_REF)
1265         {
               /* A back reference consumes as many bytes as the referenced
                  subexpression spans in REGS.  */
1266           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1267           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
               /* When backtracking, verify the back reference: the
                  subexpression must be set and its text must reappear at
                  *PIDX.  */
1268           if (fs != NULL)
1269             {
1270               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1271                 return -1;
1272               else if (naccepted)
1273                 {
1274                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1275                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1276                               naccepted) != 0)
1277                     return -1;
1278                 }
1279             }
1280
               /* An empty back reference is handled like an epsilon
                  transition to its first destination.  */
1281           if (naccepted == 0)
1282             {
1283               int dest_node;
1284               err = re_node_set_insert (eps_via_nodes, node);
1285               if (BE (err < 0, 0))
1286                 return -2;
1287               dest_node = dfa->edests[node].elems[0];
1288               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1289                                         dest_node))
1290                 return dest_node;
1291             }
1292         }
1293
1294       if (naccepted != 0
1295           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1296         {
1297           int dest_node = dfa->nexts[node];
1298           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
               /* When backtracking, the destination must be recorded in the
                  state log at the new index and not lie past the match end.  */
1299           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1300                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1301                                                dest_node)))
1302             return -1;
1303           re_node_set_empty (eps_via_nodes);
1304           return dest_node;
1305         }
1306     }
1307   return -1;
1308 }
1309
     /* Push a backtracking alternative onto the fail stack FS: the string
        index STR_IDX, the node DEST_NODE, a copy of the NREGS registers REGS
        and a copy of EPS_VIA_NODES.
        Return REG_NOERROR on success, or an error (REG_ESPACE when the stack
        cannot grow or the register copy cannot be allocated).  On failure
        FS->num is left unchanged, so no partially built entry is counted and
        the stack can still be released with free_fail_stack_return.  */
1310 static reg_errcode_t
1311 __attribute_warn_unused_result__
1312 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1313                  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1314 {
1315   reg_errcode_t err;
1316   int num = fs->num;
       /* Grow the stack while one free slot is still left, so that slot NUM
          is always valid below.  */
1317   if (num + 1 == fs->alloc)
1318     {
1319       struct re_fail_stack_ent_t *new_array;
1320       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1321                                        * fs->alloc * 2));
1322       if (new_array == NULL)
1323         return REG_ESPACE;
1324       fs->alloc *= 2;
1325       fs->stack = new_array;
1326     }
1327   fs->stack[num].idx = str_idx;
1328   fs->stack[num].node = dest_node;
1329   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1330   if (fs->stack[num].regs == NULL)
1331     return REG_ESPACE;
1332   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1333   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
       if (err != REG_NOERROR)
         {
           /* Do not leak the register copy of an entry that is not pushed.
              NOTE(review): assumes re_node_set_init_copy leaves nothing
              allocated when it fails -- confirm.  */
           re_free (fs->stack[num].regs);
           return err;
         }
       /* Count the entry only now that it is complete: FS->num is the number
          of entries that free_fail_stack_return will free.  */
       fs->num = num + 1;
1334   return REG_NOERROR;
1335 }
1336
     /* Pop the newest entry of the fail stack FS.
        Store its string index in *PIDX, copy its NREGS saved registers into
        REGS, replace *EPS_VIA_NODES (after freeing it) with the saved node
        set, and return the saved node.  The stack must not be empty.  */
1337 static int
1338 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1339                 regmatch_t *regs, re_node_set *eps_via_nodes)
1340 {
       struct re_fail_stack_ent_t *top;

1341   fs->num -= 1;
1342   assert (fs->num >= 0);
       top = &fs->stack[fs->num];

       /* Restore the string index and the registers saved with the entry.  */
1343   *pidx = top->idx;
1344   memcpy (regs, top->regs, nregs * sizeof (regmatch_t));
1345   re_free (top->regs);

       /* Hand the saved node set over to the caller in place of its own.  */
1346   re_node_set_free (eps_via_nodes);
1347   *eps_via_nodes = top->eps_via_nodes;
1348   return top->node;
1349 }
1350
1351 /* Set the positions where the subexpressions start and end in the
1352    registers PMATCH, which has NMATCH entries.
1353    Note: We assume that pmatch[0] is already set, and
1354    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.
        If FL_BACKTRACK is nonzero a fail stack is used so that alternative
        paths can be retried.
        Return REG_NOERROR when the walk finishes, REG_ESPACE on allocation
        failure, or REG_NOMATCH when no next node is found and there is no
        fail stack.  */
1355
1356 static reg_errcode_t
1357 __attribute_warn_unused_result__
1358 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1359           regmatch_t *pmatch, int fl_backtrack)
1360 {
1361   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1362   int idx, cur_node;
1363   re_node_set eps_via_nodes;
1364   struct re_fail_stack_t *fs;
1365   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1366   regmatch_t *prev_idx_match;
1367   int prev_idx_match_malloced = 0;
1368
1369 #ifdef DEBUG
1370   assert (nmatch > 1);
1371   assert (mctx->state_log != NULL);
1372 #endif
       /* FS stays NULL unless backtracking was requested.  */
1373   if (fl_backtrack)
1374     {
1375       fs = &fs_body;
1376       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1377       if (fs->stack == NULL)
1378         return REG_ESPACE;
1379     }
1380   else
1381     fs = NULL;
1382
1383   cur_node = dfa->init_node;
1384   re_node_set_init_empty (&eps_via_nodes);
1385
       /* PREV_IDX_MATCH is a snapshot of PMATCH handed to update_regs.  It is
          taken from the stack when __libc_use_alloca allows it, otherwise
          from the heap, which PREV_IDX_MATCH_MALLOCED records.  */
1386   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1387     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1388   else
1389     {
1390       prev_idx_match = re_malloc (regmatch_t, nmatch);
1391       if (prev_idx_match == NULL)
1392         {
1393           free_fail_stack_return (fs);
1394           return REG_ESPACE;
1395         }
1396       prev_idx_match_malloced = 1;
1397     }
1398   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1399
       /* Walk from the start of the match to its end; IDX is advanced by
          proceed_next_node (or reset by pop_fail_stack), not here.  */
1400   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1401     {
1402       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1403
           /* Reached the halt node at the end of the match.  */
1404       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1405         {
1406           int reg_idx;
1407           if (fs)
1408             {
                   /* With backtracking, accept only if no register is left
                      open (start set but end unset); otherwise retry from
                      the fail stack.  */
1409               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1410                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1411                   break;
1412               if (reg_idx == nmatch)
1413                 {
1414                   re_node_set_free (&eps_via_nodes);
1415                   if (prev_idx_match_malloced)
1416                     re_free (prev_idx_match);
1417                   return free_fail_stack_return (fs);
1418                 }
                   /* NOTE(review): pop_fail_stack asserts that the stack is
                      not empty, and FS->num is not checked here -- confirm
                      that an entry is always available.  */
1419               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1420                                          &eps_via_nodes);
1421             }
1422           else
1423             {
1424               re_node_set_free (&eps_via_nodes);
1425               if (prev_idx_match_malloced)
1426                 re_free (prev_idx_match);
1427               return REG_NOERROR;
1428             }
1429         }
1430
1431       /* Proceed to next node.  */
1432       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1433                                     &eps_via_nodes, fs);
1434
1435       if (BE (cur_node < 0, 0))
1436         {
               /* -2 reports an error in proceed_next_node.  */
1437           if (BE (cur_node == -2, 0))
1438             {
1439               re_node_set_free (&eps_via_nodes);
1440               if (prev_idx_match_malloced)
1441                 re_free (prev_idx_match);
1442               free_fail_stack_return (fs);
1443               return REG_ESPACE;
1444             }
               /* No destination: backtrack if possible, else give up.
                  NOTE(review): as above, the pop is not guarded against an
                  empty stack -- confirm.  */
1445           if (fs)
1446             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1447                                        &eps_via_nodes);
1448           else
1449             {
1450               re_node_set_free (&eps_via_nodes);
1451               if (prev_idx_match_malloced)
1452                 re_free (prev_idx_match);
1453               return REG_NOMATCH;
1454             }
1455         }
1456     }
1457   re_node_set_free (&eps_via_nodes);
1458   if (prev_idx_match_malloced)
1459     re_free (prev_idx_match);
1460   return free_fail_stack_return (fs);
1461 }
1462
     /* Release every entry counted in the fail stack FS together with the
        stack array itself.  FS may be NULL, in which case nothing is freed.
        Always return REG_NOERROR, so that callers can return the result of
        this call directly.  */
1463 static reg_errcode_t
1464 free_fail_stack_return (struct re_fail_stack_t *fs)
1465 {
1466   int pos;

       if (fs == NULL)
         return REG_NOERROR;

1469   for (pos = 0; pos < fs->num; pos++)
1470     {
           struct re_fail_stack_ent_t *ent = &fs->stack[pos];
1471       re_node_set_free (&ent->eps_via_nodes);
1472       re_free (ent->regs);
1473     }
1474   re_free (fs->stack);
1476   return REG_NOERROR;
1477 }
1478
     /* Update the registers PMATCH (NMATCH entries) for the node CUR_NODE
        reached at string index CUR_IDX.  Only OP_OPEN_SUBEXP and
        OP_CLOSE_SUBEXP nodes whose register number is below NMATCH have an
        effect.  PREV_IDX_MATCH is a copy of PMATCH that is refreshed on
        non-empty matches and used to undo empty matches of optional
        subexpressions.  */
1479 static void
1480 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1481              regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1482 {
1483   const int type = dfa->nodes[cur_node].type;
       int reg;

       /* Nodes that do not delimit a subexpression leave PMATCH alone.  */
       if (type != OP_OPEN_SUBEXP && type != OP_CLOSE_SUBEXP)
         return;

1486   reg = dfa->nodes[cur_node].opr.idx + 1;
1489   if (reg >= nmatch)
         return;

       if (type == OP_OPEN_SUBEXP)
1490     {
           /* We are at the first node of this subexpression.  */
1491       pmatch[reg].rm_so = cur_idx;
1492       pmatch[reg].rm_eo = -1;
           return;
1493     }

       /* We are at the last node of this subexpression.  */
1501   if (pmatch[reg].rm_so < cur_idx)
1502     {
           /* This is a non-empty match or we are not inside an optional
              subexpression.  Accept it right away.  */
1503       pmatch[reg].rm_eo = cur_idx;
1506       memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1507     }
1510   else if (dfa->nodes[cur_node].opt_subexp
1511            && prev_idx_match[reg].rm_so != -1)
         /* We went through an empty match of an optional subexpression,
            like (a?)*, and this is not its first match.  Restore the old
            registers so that matches of an inner subexpression are undone
            as well, like in ((a?))*.  */
1517     memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1518   else
         /* We completed a subexpression, but it may be part of an optional
            one, so PREV_IDX_MATCH is not updated.  */
1521     pmatch[reg].rm_eo = cur_idx;
1525 }
1526
1527 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1528    and sifts the nodes in each state according to the following rules.
1529    The updated state_log will be written to STATE_LOG.
1530
1531    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1532      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1533         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1534         the LAST_NODE, we throw away the node `a'.
1535      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1536         string `s' and transit to `b':
1537         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1538            away the node `a'.
1539         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1540             thrown away, we throw away the node `a'.
1541      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1542         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1543            node `a'.
1544         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1545             we throw away the node `a'.  */
1546
     /* Nonzero if STATE is not NULL and its node set contains NODE.  */
1547 #define STATE_NODE_CONTAINS(state,node) \
1548   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1549
     /* Sift the state log of MCTX from SCTX->last_str_idx down to index 0,
        following the rules described above, starting from the node
        SCTX->last_node.  If more than MCTX->max_mb_elem_len consecutive
        sifted states are NULL, the entries of SCTX->sifted_states below the
        current index are cleared and REG_NOERROR is returned early.
        Otherwise return REG_NOERROR, or the error reported by a helper.  */
1550 static reg_errcode_t
1551 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1552 {
1553   reg_errcode_t err;
1554   int null_cnt = 0;
1555   int str_idx = sctx->last_str_idx;
1556   re_node_set cur_dest;
1557
1558 #ifdef DEBUG
1559   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1560 #endif
1561
1562   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1563      transit to the last_node and the last_node itself.  */
1564   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1565   if (BE (err != REG_NOERROR, 0))
1566     return err;
1567   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1568   if (BE (err != REG_NOERROR, 0))
1569     goto free_return;
1570
1571   /* Then check each state in the state_log.  */
1572   while (str_idx > 0)
1573     {
1574       /* Count consecutive NULL sifted states.  Once the run is longer
              than max_mb_elem_len, clear the entries below STR_IDX and
              stop.  */
1575       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1576       if (null_cnt > mctx->max_mb_elem_len)
1577         {
1578           memset (sctx->sifted_states, '\0',
1579                   sizeof (re_dfastate_t *) * str_idx);
1580           re_node_set_free (&cur_dest);
1581           return REG_NOERROR;
1582         }
1583       re_node_set_empty (&cur_dest);
1584       --str_idx;
1585
           /* Indexes without a logged state contribute no nodes.  */
1586       if (mctx->state_log[str_idx])
1587         {
1588           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1589           if (BE (err != REG_NOERROR, 0))
1590             goto free_return;
1591         }
1592
1593       /* Add all the nodes which satisfy the following conditions:
1594          - It can epsilon transit to a node in CUR_DEST.
1595          - It is in CUR_SRC.
1596          And update state_log.  */
1597       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1598       if (BE (err != REG_NOERROR, 0))
1599         goto free_return;
1600     }
1601   err = REG_NOERROR;
1602  free_return:
1603   re_node_set_free (&cur_dest);
1604   return err;
1605 }
1606
1607 static reg_errcode_t
1608 __attribute_warn_unused_result__
1609 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1610                      int str_idx, re_node_set *cur_dest)
1611 {
1612   const re_dfa_t *const dfa = mctx->dfa;
1613   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1614   int i;
1615
1616   /* Then build the next sifted state.
1617      We build the next sifted state on `cur_dest', and update
1618      `sifted_states[str_idx]' with `cur_dest'.
1619      Note:
1620      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1621      `cur_src' points the node_set of the old `state_log[str_idx]'
1622      (with the epsilon nodes pre-filtered out).  */
1623   for (i = 0; i < cur_src->nelem; i++)
1624     {
1625       int prev_node = cur_src->elems[i];
1626       int naccepted = 0;
1627       int ret;
1628
1629 #ifdef DEBUG
1630       re_token_type_t type = dfa->nodes[prev_node].type;
1631       assert (!IS_EPSILON_NODE (type));
1632 #endif
1633 #ifdef RE_ENABLE_I18N
1634       /* If the node may accept `multi byte'.  */
1635       if (dfa->nodes[prev_node].accept_mb)
1636         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1637                                          str_idx, sctx->last_str_idx);
1638 #endif /* RE_ENABLE_I18N */
1639
1640       /* We don't check backreferences here.
1641          See update_cur_sifted_state().  */
1642       if (!naccepted
1643           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1644           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1645                                   dfa->nexts[prev_node]))
1646         naccepted = 1;
1647
1648       if (naccepted == 0)
1649         continue;
1650
1651       if (sctx->limits.nelem)
1652         {
1653           int to_idx = str_idx + naccepted;
1654           if (check_dst_limits (mctx, &sctx->limits,
1655                                 dfa->nexts[prev_node], to_idx,
1656                                 prev_node, str_idx))
1657             continue;
1658         }
1659       ret = re_node_set_insert (cur_dest, prev_node);
1660       if (BE (ret == -1, 0))
1661         return REG_ESPACE;
1662     }
1663
1664   return REG_NOERROR;
1665 }
1666
1667 /* Helper functions.  */
1668
1669 static reg_errcode_t
1670 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1671 {
1672   int top = mctx->state_log_top;
1673
1674   if ((next_state_log_idx >= mctx->input.bufs_len
1675        && mctx->input.bufs_len < mctx->input.len)
1676       || (next_state_log_idx >= mctx->input.valid_len
1677           && mctx->input.valid_len < mctx->input.len))
1678     {
1679       reg_errcode_t err;
1680       err = extend_buffers (mctx, next_state_log_idx + 1);
1681       if (BE (err != REG_NOERROR, 0))
1682         return err;
1683     }
1684
1685   if (top < next_state_log_idx)
1686     {
1687       memset (mctx->state_log + top + 1, '\0',
1688               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1689       mctx->state_log_top = next_state_log_idx;
1690     }
1691   return REG_NOERROR;
1692 }
1693
1694 static reg_errcode_t
1695 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1696                    re_dfastate_t **src, int num)
1697 {
1698   int st_idx;
1699   reg_errcode_t err;
1700   for (st_idx = 0; st_idx < num; ++st_idx)
1701     {
1702       if (dst[st_idx] == NULL)
1703         dst[st_idx] = src[st_idx];
1704       else if (src[st_idx] != NULL)
1705         {
1706           re_node_set merged_set;
1707           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1708                                         &src[st_idx]->nodes);
1709           if (BE (err != REG_NOERROR, 0))
1710             return err;
1711           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1712           re_node_set_free (&merged_set);
1713           if (BE (err != REG_NOERROR, 0))
1714             return err;
1715         }
1716     }
1717   return REG_NOERROR;
1718 }
1719
1720 static reg_errcode_t
1721 update_cur_sifted_state (const re_match_context_t *mctx,
1722                          re_sift_context_t *sctx, int str_idx,
1723                          re_node_set *dest_nodes)
1724 {
1725   const re_dfa_t *const dfa = mctx->dfa;
1726   reg_errcode_t err = REG_NOERROR;
1727   const re_node_set *candidates;
1728   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1729                 : &mctx->state_log[str_idx]->nodes);
1730
1731   if (dest_nodes->nelem == 0)
1732     sctx->sifted_states[str_idx] = NULL;
1733   else
1734     {
1735       if (candidates)
1736         {
1737           /* At first, add the nodes which can epsilon transit to a node in
1738              DEST_NODE.  */
1739           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1740           if (BE (err != REG_NOERROR, 0))
1741             return err;
1742
1743           /* Then, check the limitations in the current sift_context.  */
1744           if (sctx->limits.nelem)
1745             {
1746               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1747                                          mctx->bkref_ents, str_idx);
1748               if (BE (err != REG_NOERROR, 0))
1749                 return err;
1750             }
1751         }
1752
1753       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1754       if (BE (err != REG_NOERROR, 0))
1755         return err;
1756     }
1757
1758   if (candidates && mctx->state_log[str_idx]->has_backref)
1759     {
1760       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1761       if (BE (err != REG_NOERROR, 0))
1762         return err;
1763     }
1764   return REG_NOERROR;
1765 }
1766
1767 static reg_errcode_t
1768 __attribute_warn_unused_result__
1769 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1770                        const re_node_set *candidates)
1771 {
1772   reg_errcode_t err = REG_NOERROR;
1773   int i;
1774
1775   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1776   if (BE (err != REG_NOERROR, 0))
1777     return err;
1778
1779   if (!state->inveclosure.alloc)
1780     {
1781       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1782       if (BE (err != REG_NOERROR, 0))
1783         return REG_ESPACE;
1784       for (i = 0; i < dest_nodes->nelem; i++)
1785         {
1786           err = re_node_set_merge (&state->inveclosure,
1787                                    dfa->inveclosures + dest_nodes->elems[i]);
1788           if (BE (err != REG_NOERROR, 0))
1789             return REG_ESPACE;
1790         }
1791     }
1792   return re_node_set_add_intersect (dest_nodes, candidates,
1793                                     &state->inveclosure);
1794 }
1795
1796 static reg_errcode_t
1797 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1798                        const re_node_set *candidates)
1799 {
1800     int ecl_idx;
1801     reg_errcode_t err;
1802     re_node_set *inv_eclosure = dfa->inveclosures + node;
1803     re_node_set except_nodes;
1804     re_node_set_init_empty (&except_nodes);
1805     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1806       {
1807         int cur_node = inv_eclosure->elems[ecl_idx];
1808         if (cur_node == node)
1809           continue;
1810         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1811           {
1812             int edst1 = dfa->edests[cur_node].elems[0];
1813             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1814                          ? dfa->edests[cur_node].elems[1] : -1);
1815             if ((!re_node_set_contains (inv_eclosure, edst1)
1816                  && re_node_set_contains (dest_nodes, edst1))
1817                 || (edst2 > 0
1818                     && !re_node_set_contains (inv_eclosure, edst2)
1819                     && re_node_set_contains (dest_nodes, edst2)))
1820               {
1821                 err = re_node_set_add_intersect (&except_nodes, candidates,
1822                                                  dfa->inveclosures + cur_node);
1823                 if (BE (err != REG_NOERROR, 0))
1824                   {
1825                     re_node_set_free (&except_nodes);
1826                     return err;
1827                   }
1828               }
1829           }
1830       }
1831     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1832       {
1833         int cur_node = inv_eclosure->elems[ecl_idx];
1834         if (!re_node_set_contains (&except_nodes, cur_node))
1835           {
1836             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1837             re_node_set_remove_at (dest_nodes, idx);
1838           }
1839       }
1840     re_node_set_free (&except_nodes);
1841     return REG_NOERROR;
1842 }
1843
1844 static int
1845 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1846                   int dst_node, int dst_idx, int src_node, int src_idx)
1847 {
1848   const re_dfa_t *const dfa = mctx->dfa;
1849   int lim_idx, src_pos, dst_pos;
1850
1851   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1852   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1853   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1854     {
1855       int subexp_idx;
1856       struct re_backref_cache_entry *ent;
1857       ent = mctx->bkref_ents + limits->elems[lim_idx];
1858       subexp_idx = dfa->nodes[ent->node].opr.idx;
1859
1860       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1861                                            subexp_idx, dst_node, dst_idx,
1862                                            dst_bkref_idx);
1863       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1864                                            subexp_idx, src_node, src_idx,
1865                                            src_bkref_idx);
1866
1867       /* In case of:
1868          <src> <dst> ( <subexp> )
1869          ( <subexp> ) <src> <dst>
1870          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1871       if (src_pos == dst_pos)
1872         continue; /* This is unrelated limitation.  */
1873       else
1874         return 1;
1875     }
1876   return 0;
1877 }
1878
1879 static int
1880 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1881                              int subexp_idx, int from_node, int bkref_idx)
1882 {
1883   const re_dfa_t *const dfa = mctx->dfa;
1884   const re_node_set *eclosures = dfa->eclosures + from_node;
1885   int node_idx;
1886
1887   /* Else, we are on the boundary: examine the nodes on the epsilon
1888      closure.  */
1889   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1890     {
1891       int node = eclosures->elems[node_idx];
1892       switch (dfa->nodes[node].type)
1893         {
1894         case OP_BACK_REF:
1895           if (bkref_idx != -1)
1896             {
1897               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1898               do
1899                 {
1900                   int dst, cpos;
1901
1902                   if (ent->node != node)
1903                     continue;
1904
1905                   if (subexp_idx < BITSET_WORD_BITS
1906                       && !(ent->eps_reachable_subexps_map
1907                            & ((bitset_word_t) 1 << subexp_idx)))
1908                     continue;
1909
1910                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1911                      OP_CLOSE_SUBEXP cases below.  But, if the
1912                      destination node is the same node as the source
1913                      node, don't recurse because it would cause an
1914                      infinite loop: a regex that exhibits this behavior
1915                      is ()\1*\1*  */
1916                   dst = dfa->edests[node].elems[0];
1917                   if (dst == from_node)
1918                     {
1919                       if (boundaries & 1)
1920                         return -1;
1921                       else /* if (boundaries & 2) */
1922                         return 0;
1923                     }
1924
1925                   cpos =
1926                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1927                                                  dst, bkref_idx);
1928                   if (cpos == -1 /* && (boundaries & 1) */)
1929                     return -1;
1930                   if (cpos == 0 && (boundaries & 2))
1931                     return 0;
1932
1933                   if (subexp_idx < BITSET_WORD_BITS)
1934                     ent->eps_reachable_subexps_map
1935                       &= ~((bitset_word_t) 1 << subexp_idx);
1936                 }
1937               while (ent++->more);
1938             }
1939           break;
1940
1941         case OP_OPEN_SUBEXP:
1942           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1943             return -1;
1944           break;
1945
1946         case OP_CLOSE_SUBEXP:
1947           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1948             return 0;
1949           break;
1950
1951         default:
1952             break;
1953         }
1954     }
1955
1956   return (boundaries & 2) ? 1 : 0;
1957 }
1958
1959 static int
1960 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
1961                            int subexp_idx, int from_node, int str_idx,
1962                            int bkref_idx)
1963 {
1964   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1965   int boundaries;
1966
1967   /* If we are outside the range of the subexpression, return -1 or 1.  */
1968   if (str_idx < lim->subexp_from)
1969     return -1;
1970
1971   if (lim->subexp_to < str_idx)
1972     return 1;
1973
1974   /* If we are within the subexpression, return 0.  */
1975   boundaries = (str_idx == lim->subexp_from);
1976   boundaries |= (str_idx == lim->subexp_to) << 1;
1977   if (boundaries == 0)
1978     return 0;
1979
1980   /* Else, examine epsilon closure.  */
1981   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1982                                       from_node, bkref_idx);
1983 }
1984
1985 /* Check the limitations of sub expressions LIMITS, and remove the nodes
1986    which are against limitations from DEST_NODES. */
1987
1988 static reg_errcode_t
1989 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
1990                      const re_node_set *candidates, re_node_set *limits,
1991                      struct re_backref_cache_entry *bkref_ents, int str_idx)
1992 {
1993   reg_errcode_t err;
1994   int node_idx, lim_idx;
1995
1996   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1997     {
1998       int subexp_idx;
1999       struct re_backref_cache_entry *ent;
2000       ent = bkref_ents + limits->elems[lim_idx];
2001
2002       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2003         continue; /* This is unrelated limitation.  */
2004
2005       subexp_idx = dfa->nodes[ent->node].opr.idx;
2006       if (ent->subexp_to == str_idx)
2007         {
2008           int ops_node = -1;
2009           int cls_node = -1;
2010           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2011             {
2012               int node = dest_nodes->elems[node_idx];
2013               re_token_type_t type = dfa->nodes[node].type;
2014               if (type == OP_OPEN_SUBEXP
2015                   && subexp_idx == dfa->nodes[node].opr.idx)
2016                 ops_node = node;
2017               else if (type == OP_CLOSE_SUBEXP
2018                        && subexp_idx == dfa->nodes[node].opr.idx)
2019                 cls_node = node;
2020             }
2021
2022           /* Check the limitation of the open subexpression.  */
2023           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2024           if (ops_node >= 0)
2025             {
2026               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2027                                            candidates);
2028               if (BE (err != REG_NOERROR, 0))
2029                 return err;
2030             }
2031
2032           /* Check the limitation of the close subexpression.  */
2033           if (cls_node >= 0)
2034             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2035               {
2036                 int node = dest_nodes->elems[node_idx];
2037                 if (!re_node_set_contains (dfa->inveclosures + node,
2038                                            cls_node)
2039                     && !re_node_set_contains (dfa->eclosures + node,
2040                                               cls_node))
2041                   {
2042                     /* It is against this limitation.
2043                        Remove it form the current sifted state.  */
2044                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2045                                                  candidates);
2046                     if (BE (err != REG_NOERROR, 0))
2047                       return err;
2048                     --node_idx;
2049                   }
2050               }
2051         }
2052       else /* (ent->subexp_to != str_idx)  */
2053         {
2054           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2055             {
2056               int node = dest_nodes->elems[node_idx];
2057               re_token_type_t type = dfa->nodes[node].type;
2058               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2059                 {
2060                   if (subexp_idx != dfa->nodes[node].opr.idx)
2061                     continue;
2062                   /* It is against this limitation.
2063                      Remove it form the current sifted state.  */
2064                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2065                                                candidates);
2066                   if (BE (err != REG_NOERROR, 0))
2067                     return err;
2068                 }
2069             }
2070         }
2071     }
2072   return REG_NOERROR;
2073 }
2074
2075 static reg_errcode_t
2076 __attribute_warn_unused_result__
2077 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2078                    int str_idx, const re_node_set *candidates)
2079 {
2080   const re_dfa_t *const dfa = mctx->dfa;
2081   reg_errcode_t err;
2082   int node_idx, node;
2083   re_sift_context_t local_sctx;
2084   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2085
2086   if (first_idx == -1)
2087     return REG_NOERROR;
2088
2089   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2090
2091   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2092     {
2093       int enabled_idx;
2094       re_token_type_t type;
2095       struct re_backref_cache_entry *entry;
2096       node = candidates->elems[node_idx];
2097       type = dfa->nodes[node].type;
2098       /* Avoid infinite loop for the REs like "()\1+".  */
2099       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2100         continue;
2101       if (type != OP_BACK_REF)
2102         continue;
2103
2104       entry = mctx->bkref_ents + first_idx;
2105       enabled_idx = first_idx;
2106       do
2107         {
2108           int subexp_len;
2109           int to_idx;
2110           int dst_node;
2111           int ret;
2112           re_dfastate_t *cur_state;
2113
2114           if (entry->node != node)
2115             continue;
2116           subexp_len = entry->subexp_to - entry->subexp_from;
2117           to_idx = str_idx + subexp_len;
2118           dst_node = (subexp_len ? dfa->nexts[node]
2119                       : dfa->edests[node].elems[0]);
2120
2121           if (to_idx > sctx->last_str_idx
2122               || sctx->sifted_states[to_idx] == NULL
2123               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2124               || check_dst_limits (mctx, &sctx->limits, node,
2125                                    str_idx, dst_node, to_idx))
2126             continue;
2127
2128           if (local_sctx.sifted_states == NULL)
2129             {
2130               local_sctx = *sctx;
2131               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2132               if (BE (err != REG_NOERROR, 0))
2133                 goto free_return;
2134             }
2135           local_sctx.last_node = node;
2136           local_sctx.last_str_idx = str_idx;
2137           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2138           if (BE (ret < 0, 0))
2139             {
2140               err = REG_ESPACE;
2141               goto free_return;
2142             }
2143           cur_state = local_sctx.sifted_states[str_idx];
2144           err = sift_states_backward (mctx, &local_sctx);
2145           if (BE (err != REG_NOERROR, 0))
2146             goto free_return;
2147           if (sctx->limited_states != NULL)
2148             {
2149               err = merge_state_array (dfa, sctx->limited_states,
2150                                        local_sctx.sifted_states,
2151                                        str_idx + 1);
2152               if (BE (err != REG_NOERROR, 0))
2153                 goto free_return;
2154             }
2155           local_sctx.sifted_states[str_idx] = cur_state;
2156           re_node_set_remove (&local_sctx.limits, enabled_idx);
2157
2158           /* mctx->bkref_ents may have changed, reload the pointer.  */
2159           entry = mctx->bkref_ents + enabled_idx;
2160         }
2161       while (enabled_idx++, entry++->more);
2162     }
2163   err = REG_NOERROR;
2164  free_return:
2165   if (local_sctx.sifted_states != NULL)
2166     {
2167       re_node_set_free (&local_sctx.limits);
2168     }
2169
2170   return err;
2171 }
2172
2173
2174 #ifdef RE_ENABLE_I18N
2175 static int
2176 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2177                      int node_idx, int str_idx, int max_str_idx)
2178 {
2179   const re_dfa_t *const dfa = mctx->dfa;
2180   int naccepted;
2181   /* Check the node can accept `multi byte'.  */
2182   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2183   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2184       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2185                             dfa->nexts[node_idx]))
2186     /* The node can't accept the `multi byte', or the
2187        destination was already thrown away, then the node
2188        could't accept the current input `multi byte'.   */
2189     naccepted = 0;
2190   /* Otherwise, it is sure that the node could accept
2191      `naccepted' bytes input.  */
2192   return naccepted;
2193 }
2194 #endif /* RE_ENABLE_I18N */
2195
2196 \f
2197 /* Functions for state transition.  */
2198
2199 /* Return the next state to which the current state STATE will transit by
2200    accepting the current input byte, and update STATE_LOG if necessary.
2201    If STATE can accept a multibyte char/collating element/back reference
2202    update the destination of STATE_LOG.  */
2203
2204 static re_dfastate_t *
2205 __attribute_warn_unused_result__
2206 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2207                re_dfastate_t *state)
2208 {
2209   re_dfastate_t **trtable;
2210   unsigned char ch;
2211
2212 #ifdef RE_ENABLE_I18N
2213   /* If the current state can accept multibyte.  */
2214   if (BE (state->accept_mb, 0))
2215     {
2216       *err = transit_state_mb (mctx, state);
2217       if (BE (*err != REG_NOERROR, 0))
2218         return NULL;
2219     }
2220 #endif /* RE_ENABLE_I18N */
2221
2222   /* Then decide the next state with the single byte.  */
2223 #if 0
2224   if (0)
2225     /* don't use transition table  */
2226     return transit_state_sb (err, mctx, state);
2227 #endif
2228
2229   /* Use transition table  */
2230   ch = re_string_fetch_byte (&mctx->input);
2231   for (;;)
2232     {
2233       trtable = state->trtable;
2234       if (BE (trtable != NULL, 1))
2235         return trtable[ch];
2236
2237       trtable = state->word_trtable;
2238       if (BE (trtable != NULL, 1))
2239         {
2240           unsigned int context;
2241           context
2242             = re_string_context_at (&mctx->input,
2243                                     re_string_cur_idx (&mctx->input) - 1,
2244                                     mctx->eflags);
2245           if (IS_WORD_CONTEXT (context))
2246             return trtable[ch + SBC_MAX];
2247           else
2248             return trtable[ch];
2249         }
2250
2251       if (!build_trtable (mctx->dfa, state))
2252         {
2253           *err = REG_ESPACE;
2254           return NULL;
2255         }
2256
2257       /* Retry, we now have a transition table.  */
2258     }
2259 }
2260
2261 /* Update the state_log if we need */
2262 re_dfastate_t *
2263 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2264                       re_dfastate_t *next_state)
2265 {
2266   const re_dfa_t *const dfa = mctx->dfa;
2267   int cur_idx = re_string_cur_idx (&mctx->input);
2268
2269   if (cur_idx > mctx->state_log_top)
2270     {
2271       mctx->state_log[cur_idx] = next_state;
2272       mctx->state_log_top = cur_idx;
2273     }
2274   else if (mctx->state_log[cur_idx] == 0)
2275     {
2276       mctx->state_log[cur_idx] = next_state;
2277     }
2278   else
2279     {
2280       re_dfastate_t *pstate;
2281       unsigned int context;
2282       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2283       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2284          the destination of a multibyte char/collating element/
2285          back reference.  Then the next state is the union set of
2286          these destinations and the results of the transition table.  */
2287       pstate = mctx->state_log[cur_idx];
2288       log_nodes = pstate->entrance_nodes;
2289       if (next_state != NULL)
2290         {
2291           table_nodes = next_state->entrance_nodes;
2292           *err = re_node_set_init_union (&next_nodes, table_nodes,
2293                                              log_nodes);
2294           if (BE (*err != REG_NOERROR, 0))
2295             return NULL;
2296         }
2297       else
2298         next_nodes = *log_nodes;
2299       /* Note: We already add the nodes of the initial state,
2300          then we don't need to add them here.  */
2301
2302       context = re_string_context_at (&mctx->input,
2303                                       re_string_cur_idx (&mctx->input) - 1,
2304                                       mctx->eflags);
2305       next_state = mctx->state_log[cur_idx]
2306         = re_acquire_state_context (err, dfa, &next_nodes, context);
2307       /* We don't need to check errors here, since the return value of
2308          this function is next_state and ERR is already set.  */
2309
2310       if (table_nodes != NULL)
2311         re_node_set_free (&next_nodes);
2312     }
2313
2314   if (BE (dfa->nbackref, 0) && next_state != NULL)
2315     {
2316       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2317          later.  We must check them here, since the back references in the
2318          next state might use them.  */
2319       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2320                                         cur_idx);
2321       if (BE (*err != REG_NOERROR, 0))
2322         return NULL;
2323
2324       /* If the next state has back references.  */
2325       if (next_state->has_backref)
2326         {
2327           *err = transit_state_bkref (mctx, &next_state->nodes);
2328           if (BE (*err != REG_NOERROR, 0))
2329             return NULL;
2330           next_state = mctx->state_log[cur_idx];
2331         }
2332     }
2333
2334   return next_state;
2335 }
2336
2337 /* Skip bytes in the input that correspond to part of a
2338    multi-byte match, then look in the log for a state
2339    from which to restart matching.  */
2340 re_dfastate_t *
2341 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2342 {
2343   re_dfastate_t *cur_state;
2344   do
2345     {
2346       int max = mctx->state_log_top;
2347       int cur_str_idx = re_string_cur_idx (&mctx->input);
2348
2349       do
2350         {
2351           if (++cur_str_idx > max)
2352             return NULL;
2353           re_string_skip_bytes (&mctx->input, 1);
2354         }
2355       while (mctx->state_log[cur_str_idx] == NULL);
2356
2357       cur_state = merge_state_with_log (err, mctx, NULL);
2358     }
2359   while (*err == REG_NOERROR && cur_state == NULL);
2360   return cur_state;
2361 }
2362
2363 /* Helper functions for transit_state.  */
2364
2365 /* From the node set CUR_NODES, pick up the nodes whose types are
2366    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2367    expression. And register them to use them later for evaluating the
2368    corresponding back references.  */
2369
2370 static reg_errcode_t
2371 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2372                            int str_idx)
2373 {
2374   const re_dfa_t *const dfa = mctx->dfa;
2375   int node_idx;
2376   reg_errcode_t err;
2377
2378   /* TODO: This isn't efficient.
2379            Because there might be more than one nodes whose types are
2380            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2381            nodes.
2382            E.g. RE: (a){2}  */
2383   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2384     {
2385       int node = cur_nodes->elems[node_idx];
2386       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2387           && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2388           && (dfa->used_bkref_map
2389               & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2390         {
2391           err = match_ctx_add_subtop (mctx, node, str_idx);
2392           if (BE (err != REG_NOERROR, 0))
2393             return err;
2394         }
2395     }
2396   return REG_NOERROR;
2397 }
2398
#if 0
/* NOTE: everything up to the matching #endif is excluded from the build
   by the `#if 0' above.  */

/* Return the next state to which the current state STATE will transit by
   accepting the current input byte.  The next state is computed directly
   from the node set of STATE rather than from a transition table.  On
   failure NULL is returned and *ERR holds the error.  */

static re_dfastate_t *
transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
                  re_dfastate_t *state)
{
  const re_dfa_t *const dfa = mctx->dfa;
  re_node_set next_nodes;
  re_dfastate_t *next_state;
  int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
  unsigned int context;

  *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
  if (BE (*err != REG_NOERROR, 0))
    return NULL;
  /* Gather the epsilon closures of the successors of every node that
     accepts the input at the current index.  */
  for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
    {
      int cur_node = state->nodes.elems[node_cnt];
      if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
        {
          *err = re_node_set_merge (&next_nodes,
                                    dfa->eclosures + dfa->nexts[cur_node]);
          if (BE (*err != REG_NOERROR, 0))
            {
              re_node_set_free (&next_nodes);
              return NULL;
            }
        }
    }
  context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
  next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
  /* We don't need to check errors here, since the return value of
     this function is next_state and ERR is already set.  */

  re_node_set_free (&next_nodes);
  /* Advance the input past the byte just handled.  */
  re_string_skip_bytes (&mctx->input, 1);
  return next_state;
}
#endif
2440
#ifdef RE_ENABLE_I18N
/* For each node of PSTATE which is flagged as accepting multi-byte input,
   ask check_node_accept_bytes() how many bytes it accepts at the current
   input index, and merge the epsilon closure of its successor into the
   MCTX->state_log slot located that many bytes ahead.
   Return REG_NOERROR, or the error code of the first failing step.  */

static reg_errcode_t
transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
{
  const re_dfa_t *const dfa = mctx->dfa;
  reg_errcode_t err;
  int n;

  for (n = 0; n < pstate->nodes.nelem; ++n)
    {
      const int cur_node_idx = pstate->nodes.elems[n];
      re_node_set dest_nodes, *new_nodes;
      re_dfastate_t *dest_state;
      unsigned int context;
      int naccepted, dest_idx;

      if (!dfa->nodes[cur_node_idx].accept_mb)
        continue;

      /* Skip the node if its constraint is not satisfied by the context
         at the current position.  */
      if (dfa->nodes[cur_node_idx].constraint)
        {
          context = re_string_context_at (&mctx->input,
                                          re_string_cur_idx (&mctx->input),
                                          mctx->eflags);
          if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
                                           context))
            continue;
        }

      /* How many bytes can the node accept?  */
      naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
                                           re_string_cur_idx (&mctx->input));
      if (naccepted == 0)
        continue;

      /* The node accepts NACCEPTED bytes; remember the longest such
         length seen so far.  */
      dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
      if (mctx->max_mb_elem_len < naccepted)
        mctx->max_mb_elem_len = naccepted;

      err = clean_state_log_if_needed (mctx, dest_idx);
      if (BE (err != REG_NOERROR, 0))
        return err;
#ifdef DEBUG
      assert (dfa->nexts[cur_node_idx] != -1);
#endif
      new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];

      /* Combine with whatever is already logged at the destination.
         DEST_NODES owns memory only when a union had to be built.  */
      dest_state = mctx->state_log[dest_idx];
      if (dest_state != NULL)
        {
          err = re_node_set_init_union (&dest_nodes,
                                        dest_state->entrance_nodes, new_nodes);
          if (BE (err != REG_NOERROR, 0))
            return err;
        }
      else
        dest_nodes = *new_nodes;

      context = re_string_context_at (&mctx->input, dest_idx - 1,
                                      mctx->eflags);
      mctx->state_log[dest_idx]
        = re_acquire_state_context (&err, dfa, &dest_nodes, context);
      if (dest_state != NULL)
        re_node_set_free (&dest_nodes);
      if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
        return err;
    }
  return REG_NOERROR;
}
#endif /* RE_ENABLE_I18N */
2510
2511 static reg_errcode_t
2512 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2513 {
2514   const re_dfa_t *const dfa = mctx->dfa;
2515   reg_errcode_t err;
2516   int i;
2517   int cur_str_idx = re_string_cur_idx (&mctx->input);
2518
2519   for (i = 0; i < nodes->nelem; ++i)
2520     {
2521       int dest_str_idx, prev_nelem, bkc_idx;
2522       int node_idx = nodes->elems[i];
2523       unsigned int context;
2524       const re_token_t *node = dfa->nodes + node_idx;
2525       re_node_set *new_dest_nodes;
2526
2527       /* Check whether `node' is a backreference or not.  */
2528       if (node->type != OP_BACK_REF)
2529         continue;
2530
2531       if (node->constraint)
2532         {
2533           context = re_string_context_at (&mctx->input, cur_str_idx,
2534                                           mctx->eflags);
2535           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2536             continue;
2537         }
2538
2539       /* `node' is a backreference.
2540          Check the substring which the substring matched.  */
2541       bkc_idx = mctx->nbkref_ents;
2542       err = get_subexp (mctx, node_idx, cur_str_idx);
2543       if (BE (err != REG_NOERROR, 0))
2544         goto free_return;
2545
2546       /* And add the epsilon closures (which is `new_dest_nodes') of
2547          the backreference to appropriate state_log.  */
2548 #ifdef DEBUG
2549       assert (dfa->nexts[node_idx] != -1);
2550 #endif
2551       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2552         {
2553           int subexp_len;
2554           re_dfastate_t *dest_state;
2555           struct re_backref_cache_entry *bkref_ent;
2556           bkref_ent = mctx->bkref_ents + bkc_idx;
2557           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2558             continue;
2559           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2560           new_dest_nodes = (subexp_len == 0
2561                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2562                             : dfa->eclosures + dfa->nexts[node_idx]);
2563           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2564                           - bkref_ent->subexp_from);
2565           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2566                                           mctx->eflags);
2567           dest_state = mctx->state_log[dest_str_idx];
2568           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2569                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2570           /* Add `new_dest_node' to state_log.  */
2571           if (dest_state == NULL)
2572             {
2573               mctx->state_log[dest_str_idx]
2574                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2575                                             context);
2576               if (BE (mctx->state_log[dest_str_idx] == NULL
2577                       && err != REG_NOERROR, 0))
2578                 goto free_return;
2579             }
2580           else
2581             {
2582               re_node_set dest_nodes;
2583               err = re_node_set_init_union (&dest_nodes,
2584                                             dest_state->entrance_nodes,
2585                                             new_dest_nodes);
2586               if (BE (err != REG_NOERROR, 0))
2587                 {
2588                   re_node_set_free (&dest_nodes);
2589                   goto free_return;
2590                 }
2591               mctx->state_log[dest_str_idx]
2592                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2593               re_node_set_free (&dest_nodes);
2594               if (BE (mctx->state_log[dest_str_idx] == NULL
2595                       && err != REG_NOERROR, 0))
2596                 goto free_return;
2597             }
2598           /* We need to check recursively if the backreference can epsilon
2599              transit.  */
2600           if (subexp_len == 0
2601               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2602             {
2603               err = check_subexp_matching_top (mctx, new_dest_nodes,
2604                                                cur_str_idx);
2605               if (BE (err != REG_NOERROR, 0))
2606                 goto free_return;
2607               err = transit_state_bkref (mctx, new_dest_nodes);
2608               if (BE (err != REG_NOERROR, 0))
2609                 goto free_return;
2610             }
2611         }
2612     }
2613   err = REG_NOERROR;
2614  free_return:
2615   return err;
2616 }
2617
2618 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2619    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2620    Note that we might collect inappropriate candidates here.
2621    However, the cost of checking them strictly here is too high, then we
2622    delay these checking for prune_impossible_nodes().  */
2623
2624 static reg_errcode_t
2625 __attribute_warn_unused_result__
2626 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2627 {
2628   const re_dfa_t *const dfa = mctx->dfa;
2629   int subexp_num, sub_top_idx;
2630   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2631   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2632   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2633   if (cache_idx != -1)
2634     {
2635       const struct re_backref_cache_entry *entry
2636         = mctx->bkref_ents + cache_idx;
2637       do
2638         if (entry->node == bkref_node)
2639           return REG_NOERROR; /* We already checked it.  */
2640       while (entry++->more);
2641     }
2642
2643   subexp_num = dfa->nodes[bkref_node].opr.idx;
2644
2645   /* For each sub expression  */
2646   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2647     {
2648       reg_errcode_t err;
2649       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2650       re_sub_match_last_t *sub_last;
2651       int sub_last_idx, sl_str, bkref_str_off;
2652
2653       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2654         continue; /* It isn't related.  */
2655
2656       sl_str = sub_top->str_idx;
2657       bkref_str_off = bkref_str_idx;
2658       /* At first, check the last node of sub expressions we already
2659          evaluated.  */
2660       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2661         {
2662           int sl_str_diff;
2663           sub_last = sub_top->lasts[sub_last_idx];
2664           sl_str_diff = sub_last->str_idx - sl_str;
2665           /* The matched string by the sub expression match with the substring
2666              at the back reference?  */
2667           if (sl_str_diff > 0)
2668             {
2669               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2670                 {
2671                   /* Not enough chars for a successful match.  */
2672                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2673                     break;
2674
2675                   err = clean_state_log_if_needed (mctx,
2676                                                    bkref_str_off
2677                                                    + sl_str_diff);
2678                   if (BE (err != REG_NOERROR, 0))
2679                     return err;
2680                   buf = (const char *) re_string_get_buffer (&mctx->input);
2681                 }
2682               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2683                 /* We don't need to search this sub expression any more.  */
2684                 break;
2685             }
2686           bkref_str_off += sl_str_diff;
2687           sl_str += sl_str_diff;
2688           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2689                                 bkref_str_idx);
2690
2691           /* Reload buf, since the preceding call might have reallocated
2692              the buffer.  */
2693           buf = (const char *) re_string_get_buffer (&mctx->input);
2694
2695           if (err == REG_NOMATCH)
2696             continue;
2697           if (BE (err != REG_NOERROR, 0))
2698             return err;
2699         }
2700
2701       if (sub_last_idx < sub_top->nlasts)
2702         continue;
2703       if (sub_last_idx > 0)
2704         ++sl_str;
2705       /* Then, search for the other last nodes of the sub expression.  */
2706       for (; sl_str <= bkref_str_idx; ++sl_str)
2707         {
2708           int cls_node, sl_str_off;
2709           const re_node_set *nodes;
2710           sl_str_off = sl_str - sub_top->str_idx;
2711           /* The matched string by the sub expression match with the substring
2712              at the back reference?  */
2713           if (sl_str_off > 0)
2714             {
2715               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2716                 {
2717                   /* If we are at the end of the input, we cannot match.  */
2718                   if (bkref_str_off >= mctx->input.len)
2719                     break;
2720
2721                   err = extend_buffers (mctx, bkref_str_off + 1);
2722                   if (BE (err != REG_NOERROR, 0))
2723                     return err;
2724
2725                   buf = (const char *) re_string_get_buffer (&mctx->input);
2726                 }
2727               if (buf [bkref_str_off++] != buf[sl_str - 1])
2728                 break; /* We don't need to search this sub expression
2729                           any more.  */
2730             }
2731           if (mctx->state_log[sl_str] == NULL)
2732             continue;
2733           /* Does this state have a ')' of the sub expression?  */
2734           nodes = &mctx->state_log[sl_str]->nodes;
2735           cls_node = find_subexp_node (dfa, nodes, subexp_num,
2736                                        OP_CLOSE_SUBEXP);
2737           if (cls_node == -1)
2738             continue; /* No.  */
2739           if (sub_top->path == NULL)
2740             {
2741               sub_top->path = calloc (sizeof (state_array_t),
2742                                       sl_str - sub_top->str_idx + 1);
2743               if (sub_top->path == NULL)
2744                 return REG_ESPACE;
2745             }
2746           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2747              in the current context?  */
2748           err = check_arrival (mctx, sub_top->path, sub_top->node,
2749                                sub_top->str_idx, cls_node, sl_str,
2750                                OP_CLOSE_SUBEXP);
2751           if (err == REG_NOMATCH)
2752               continue;
2753           if (BE (err != REG_NOERROR, 0))
2754               return err;
2755           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2756           if (BE (sub_last == NULL, 0))
2757             return REG_ESPACE;
2758           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2759                                 bkref_str_idx);
2760           if (err == REG_NOMATCH)
2761             continue;
2762         }
2763     }
2764   return REG_NOERROR;
2765 }
2766
2767 /* Helper functions for get_subexp().  */
2768
2769 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2770    If it can arrive, register the sub expression expressed with SUB_TOP
2771    and SUB_LAST.  */
2772
2773 static reg_errcode_t
2774 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2775                 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2776 {
2777   reg_errcode_t err;
2778   int to_idx;
2779   /* Can the subexpression arrive the back reference?  */
2780   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2781                        sub_last->str_idx, bkref_node, bkref_str,
2782                        OP_OPEN_SUBEXP);
2783   if (err != REG_NOERROR)
2784     return err;
2785   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2786                              sub_last->str_idx);
2787   if (BE (err != REG_NOERROR, 0))
2788     return err;
2789   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2790   return clean_state_log_if_needed (mctx, to_idx);
2791 }
2792
2793 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2794    Search '(' if FL_OPEN, or search ')' otherwise.
2795    TODO: This function isn't efficient...
2796          Because there might be more than one nodes whose types are
2797          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2798          nodes.
2799          E.g. RE: (a){2}  */
2800
2801 static int
2802 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2803                   int subexp_idx, int type)
2804 {
2805   int cls_idx;
2806   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2807     {
2808       int cls_node = nodes->elems[cls_idx];
2809       const re_token_t *node = dfa->nodes + cls_node;
2810       if (node->type == type
2811           && node->opr.idx == subexp_idx)
2812         return cls_node;
2813     }
2814   return -1;
2815 }
2816
2817 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2818    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2819    heavily reused.
2820    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2821
2822 static reg_errcode_t
2823 __attribute_warn_unused_result__
2824 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2825                int top_str, int last_node, int last_str, int type)
2826 {
2827   const re_dfa_t *const dfa = mctx->dfa;
2828   reg_errcode_t err = REG_NOERROR;
2829   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2830   re_dfastate_t *cur_state = NULL;
2831   re_node_set *cur_nodes, next_nodes;
2832   re_dfastate_t **backup_state_log;
2833   unsigned int context;
2834
2835   subexp_num = dfa->nodes[top_node].opr.idx;
2836   /* Extend the buffer if we need.  */
2837   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2838     {
2839       re_dfastate_t **new_array;
2840       int old_alloc = path->alloc;
2841       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2842       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2843       if (BE (new_array == NULL, 0))
2844         {
2845           path->alloc = old_alloc;
2846           return REG_ESPACE;
2847         }
2848       path->array = new_array;
2849       memset (new_array + old_alloc, '\0',
2850               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2851     }
2852
2853   str_idx = path->next_idx ?: top_str;
2854
2855   /* Temporary modify MCTX.  */
2856   backup_state_log = mctx->state_log;
2857   backup_cur_idx = mctx->input.cur_idx;
2858   mctx->state_log = path->array;
2859   mctx->input.cur_idx = str_idx;
2860
2861   /* Setup initial node set.  */
2862   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2863   if (str_idx == top_str)
2864     {
2865       err = re_node_set_init_1 (&next_nodes, top_node);
2866       if (BE (err != REG_NOERROR, 0))
2867         return err;
2868       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2869       if (BE (err != REG_NOERROR, 0))
2870         {
2871           re_node_set_free (&next_nodes);
2872           return err;
2873         }
2874     }
2875   else
2876     {
2877       cur_state = mctx->state_log[str_idx];
2878       if (cur_state && cur_state->has_backref)
2879         {
2880           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2881           if (BE (err != REG_NOERROR, 0))
2882             return err;
2883         }
2884       else
2885         re_node_set_init_empty (&next_nodes);
2886     }
2887   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2888     {
2889       if (next_nodes.nelem)
2890         {
2891           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2892                                     subexp_num, type);
2893           if (BE (err != REG_NOERROR, 0))
2894             {
2895               re_node_set_free (&next_nodes);
2896               return err;
2897             }
2898         }
2899       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2900       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2901         {
2902           re_node_set_free (&next_nodes);
2903           return err;
2904         }
2905       mctx->state_log[str_idx] = cur_state;
2906     }
2907
2908   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2909     {
2910       re_node_set_empty (&next_nodes);
2911       if (mctx->state_log[str_idx + 1])
2912         {
2913           err = re_node_set_merge (&next_nodes,
2914                                    &mctx->state_log[str_idx + 1]->nodes);
2915           if (BE (err != REG_NOERROR, 0))
2916             {
2917               re_node_set_free (&next_nodes);
2918               return err;
2919             }
2920         }
2921       if (cur_state)
2922         {
2923           err = check_arrival_add_next_nodes (mctx, str_idx,
2924                                               &cur_state->non_eps_nodes,
2925                                               &next_nodes);
2926           if (BE (err != REG_NOERROR, 0))
2927             {
2928               re_node_set_free (&next_nodes);
2929               return err;
2930             }
2931         }
2932       ++str_idx;
2933       if (next_nodes.nelem)
2934         {
2935           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2936           if (BE (err != REG_NOERROR, 0))
2937             {
2938               re_node_set_free (&next_nodes);
2939               return err;
2940             }
2941           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2942                                     subexp_num, type);
2943           if (BE (err != REG_NOERROR, 0))
2944             {
2945               re_node_set_free (&next_nodes);
2946               return err;
2947             }
2948         }
2949       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2950       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2951       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2952         {
2953           re_node_set_free (&next_nodes);
2954           return err;
2955         }
2956       mctx->state_log[str_idx] = cur_state;
2957       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2958     }
2959   re_node_set_free (&next_nodes);
2960   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2961                : &mctx->state_log[last_str]->nodes);
2962   path->next_idx = str_idx;
2963
2964   /* Fix MCTX.  */
2965   mctx->state_log = backup_state_log;
2966   mctx->input.cur_idx = backup_cur_idx;
2967
2968   /* Then check the current node set has the node LAST_NODE.  */
2969   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2970     return REG_NOERROR;
2971
2972   return REG_NOMATCH;
2973 }
2974
2975 /* Helper functions for check_arrival.  */
2976
2977 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
2978    to NEXT_NODES.
2979    TODO: This function is similar to the functions transit_state*(),
2980          however this function has many additional works.
2981          Can't we unify them?  */
2982
2983 static reg_errcode_t
2984 __attribute_warn_unused_result__
2985 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
2986                               re_node_set *cur_nodes, re_node_set *next_nodes)
2987 {
2988   const re_dfa_t *const dfa = mctx->dfa;
2989   int result;
2990   int cur_idx;
2991 #ifdef RE_ENABLE_I18N
2992   reg_errcode_t err = REG_NOERROR;
2993 #endif
2994   re_node_set union_set;
2995   re_node_set_init_empty (&union_set);
2996   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
2997     {
2998       int naccepted = 0;
2999       int cur_node = cur_nodes->elems[cur_idx];
3000 #ifdef DEBUG
3001       re_token_type_t type = dfa->nodes[cur_node].type;
3002       assert (!IS_EPSILON_NODE (type));
3003 #endif
3004 #ifdef RE_ENABLE_I18N
3005       /* If the node may accept `multi byte'.  */
3006       if (dfa->nodes[cur_node].accept_mb)
3007         {
3008           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3009                                                str_idx);
3010           if (naccepted > 1)
3011             {
3012               re_dfastate_t *dest_state;
3013               int next_node = dfa->nexts[cur_node];
3014               int next_idx = str_idx + naccepted;
3015               dest_state = mctx->state_log[next_idx];
3016               re_node_set_empty (&union_set);
3017               if (dest_state)
3018                 {
3019                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3020                   if (BE (err != REG_NOERROR, 0))
3021                     {
3022                       re_node_set_free (&union_set);
3023                       return err;
3024                     }
3025                 }
3026               result = re_node_set_insert (&union_set, next_node);
3027               if (BE (result < 0, 0))
3028                 {
3029                   re_node_set_free (&union_set);
3030                   return REG_ESPACE;
3031                 }
3032               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3033                                                             &union_set);
3034               if (BE (mctx->state_log[next_idx] == NULL
3035                       && err != REG_NOERROR, 0))
3036                 {
3037                   re_node_set_free (&union_set);
3038                   return err;
3039                 }
3040             }
3041         }
3042 #endif /* RE_ENABLE_I18N */
3043       if (naccepted
3044           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3045         {
3046           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3047           if (BE (result < 0, 0))
3048             {
3049               re_node_set_free (&union_set);
3050               return REG_ESPACE;
3051             }
3052         }
3053     }
3054   re_node_set_free (&union_set);
3055   return REG_NOERROR;
3056 }
3057
3058 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3059    CUR_NODES, however exclude the nodes which are:
3060     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3061     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3062 */
3063
3064 static reg_errcode_t
3065 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3066                           int ex_subexp, int type)
3067 {
3068   reg_errcode_t err;
3069   int idx, outside_node;
3070   re_node_set new_nodes;
3071 #ifdef DEBUG
3072   assert (cur_nodes->nelem);
3073 #endif
3074   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3075   if (BE (err != REG_NOERROR, 0))
3076     return err;
3077   /* Create a new node set NEW_NODES with the nodes which are epsilon
3078      closures of the node in CUR_NODES.  */
3079
3080   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3081     {
3082       int cur_node = cur_nodes->elems[idx];
3083       const re_node_set *eclosure = dfa->eclosures + cur_node;
3084       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3085       if (outside_node == -1)
3086         {
3087           /* There are no problematic nodes, just merge them.  */
3088           err = re_node_set_merge (&new_nodes, eclosure);
3089           if (BE (err != REG_NOERROR, 0))
3090             {
3091               re_node_set_free (&new_nodes);
3092               return err;
3093             }
3094         }
3095       else
3096         {
3097           /* There are problematic nodes, re-calculate incrementally.  */
3098           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3099                                               ex_subexp, type);
3100           if (BE (err != REG_NOERROR, 0))
3101             {
3102               re_node_set_free (&new_nodes);
3103               return err;
3104             }
3105         }
3106     }
3107   re_node_set_free (cur_nodes);
3108   *cur_nodes = new_nodes;
3109   return REG_NOERROR;
3110 }
3111
3112 /* Helper function for check_arrival_expand_ecl.
3113    Check incrementally the epsilon closure of TARGET, and if it isn't
3114    problematic append it to DST_NODES.  */
3115
3116 static reg_errcode_t
3117 __attribute_warn_unused_result__
3118 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3119                               int target, int ex_subexp, int type)
3120 {
3121   int cur_node;
3122   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3123     {
3124       int err;
3125
3126       if (dfa->nodes[cur_node].type == type
3127           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3128         {
3129           if (type == OP_CLOSE_SUBEXP)
3130             {
3131               err = re_node_set_insert (dst_nodes, cur_node);
3132               if (BE (err == -1, 0))
3133                 return REG_ESPACE;
3134             }
3135           break;
3136         }
3137       err = re_node_set_insert (dst_nodes, cur_node);
3138       if (BE (err == -1, 0))
3139         return REG_ESPACE;
3140       if (dfa->edests[cur_node].nelem == 0)
3141         break;
3142       if (dfa->edests[cur_node].nelem == 2)
3143         {
3144           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3145                                               dfa->edests[cur_node].elems[1],
3146                                               ex_subexp, type);
3147           if (BE (err != REG_NOERROR, 0))
3148             return err;
3149         }
3150       cur_node = dfa->edests[cur_node].elems[0];
3151     }
3152   return REG_NOERROR;
3153 }
3154
3155
3156 /* For all the back references in the current state, calculate the
3157    destination of the back references by the appropriate entry
3158    in MCTX->BKREF_ENTS.  */
3159
3160 static reg_errcode_t
3161 __attribute_warn_unused_result__
3162 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3163                     int cur_str, int subexp_num, int type)
3164 {
3165   const re_dfa_t *const dfa = mctx->dfa;
3166   reg_errcode_t err;
3167   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3168   struct re_backref_cache_entry *ent;
3169
3170   if (cache_idx_start == -1)
3171     return REG_NOERROR; /* The search reported -1; nothing to expand.  */
3172
3173  restart: /* Rescan from the first entry whenever CUR_NODES has grown.  */
3174   ent = mctx->bkref_ents + cache_idx_start;
3175   do
3176     {
3177       int to_idx, next_node;
3178
3179       /* Is this entry ENT appropriate, i.e. is its node in CUR_NODES?  */
3180       if (!re_node_set_contains (cur_nodes, ent->node))
3181         continue; /* No.  */
3182
3183       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3184       /* Calculate the destination of the back reference, and record it:
3185          in CUR_NODES if TO_IDX == CUR_STR, else in MCTX->STATE_LOG.  */
3186       if (to_idx == cur_str)
3187         {
3188           /* The back reference made an epsilon transition (TO_IDX did not
3189              move), so we must re-check all the nodes in the current state.  */
3190           re_node_set new_dests;
3191           reg_errcode_t err2, err3;
3192           next_node = dfa->edests[ent->node].elems[0];
3193           if (re_node_set_contains (cur_nodes, next_node))
3194             continue; /* Already present; nothing to add.  */
3195           err = re_node_set_init_1 (&new_dests, next_node);
3196           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3197           err3 = re_node_set_merge (cur_nodes, &new_dests);
3198           re_node_set_free (&new_dests); /* Freed before the error check.  */
3199           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3200                   || err3 != REG_NOERROR, 0))
3201             {
3202               err = (err != REG_NOERROR ? err /* First failure wins.  */
3203                      : (err2 != REG_NOERROR ? err2 : err3));
3204               return err;
3205             }
3206           /* TODO: It is still inefficient...  */
3207           goto restart;
3208         }
3209       else
3210         {
3211           re_node_set union_set;
3212           next_node = dfa->nexts[ent->node];
3213           if (mctx->state_log[to_idx])
3214             {
3215               int ret;
3216               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3217                                         next_node))
3218                 continue; /* The logged state already has NEXT_NODE.  */
3219               err = re_node_set_init_copy (&union_set,
3220                                            &mctx->state_log[to_idx]->nodes);
3221               ret = re_node_set_insert (&union_set, next_node);
3222               if (BE (err != REG_NOERROR || ret < 0, 0))
3223                 {
3224                   re_node_set_free (&union_set);
3225                   err = err != REG_NOERROR ? err : REG_ESPACE;
3226                   return err;
3227                 }
3228             }
3229           else
3230             {
3231               err = re_node_set_init_1 (&union_set, next_node);
3232               if (BE (err != REG_NOERROR, 0))
3233                 return err;
3234             }
3235           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3236           re_node_set_free (&union_set);
3237           if (BE (mctx->state_log[to_idx] == NULL
3238                   && err != REG_NOERROR, 0))
3239             return err; /* NULL with REG_NOERROR is not a failure.  */
3240         }
3241     }
3242   while (ent++->more); /* `continue' lands here too, so ENT advances.  */
3243   return REG_NOERROR;
3244 }
3245
3246 /* Build the transition table for STATE.
3247    Return 1 if succeeded, otherwise return 0.  */
3248 /* The table is stored in STATE->trtable, or in STATE->word_trtable.  */
3249 static int
3250 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3251 {
3252   reg_errcode_t err;
3253   int i, j, ch, need_word_trtable = 0;
3254   bitset_word_t elem, mask;
3255   bool dests_node_malloced = false;
3256   bool dest_states_malloced = false;
3257   int ndests; /* Number of the destination states from `state'.  */
3258   re_dfastate_t **trtable;
3259   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3260   re_node_set follows, *dests_node;
3261   bitset_t *dests_ch;
3262   bitset_t acceptable;
3263
3264   struct dests_alloc
3265   {
3266     re_node_set dests_node[SBC_MAX];
3267     bitset_t dests_ch[SBC_MAX];
3268   } *dests_alloc;
3269
3270   /* We build DFA states which correspond to the destination nodes
3271      from `state'.  `dests_node[i]' represents the nodes which i-th
3272      destination state contains, and `dests_ch[i]' represents the
3273      characters which i-th destination state accepts.  */
3274   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3275     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3276   else
3277     {
3278       dests_alloc = re_malloc (struct dests_alloc, 1);
3279       if (BE (dests_alloc == NULL, 0))
3280         return 0;
3281       dests_node_malloced = true; /* Must be freed on every exit path.  */
3282     }
3283   dests_node = dests_alloc->dests_node;
3284   dests_ch = dests_alloc->dests_ch;
3285
3286   /* Initialize transition table.  */
3287   state->word_trtable = state->trtable = NULL;
3288
3289   /* At first, group all nodes belonging to `state' into several
3290      destinations.  */
3291   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3292   if (BE (ndests <= 0, 0))
3293     {
3294       if (dests_node_malloced)
3295         free (dests_alloc);
3296       /* NDESTS < 0 is an error (return 0); 0 gets an all-NULL table.  */
3297       if (ndests == 0)
3298         {
3299           state->trtable = (re_dfastate_t **)
3300             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3301           if (BE (state->trtable == NULL, 0))
3302             return 0;
3303           return 1;
3304         }
3305       return 0;
3306     }
3307
3308   err = re_node_set_alloc (&follows, ndests + 1);
3309   if (BE (err != REG_NOERROR, 0))
3310     goto out_free;
3311
3312   /* Avoid arithmetic overflow in size calculation.  */
3313   if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3314             / (3 * sizeof (re_dfastate_t *)))
3315            < ndests),
3316           0))
3317     goto out_free;
3318
3319   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3320                          + ndests * 3 * sizeof (re_dfastate_t *)))
3321     dest_states = (re_dfastate_t **)
3322       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3323   else
3324     {
3325       dest_states = (re_dfastate_t **)
3326         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3327       if (BE (dest_states == NULL, 0))
3328         {
3329 out_free: /* Shared failure exit; also entered by goto from outside.  */
3330           if (dest_states_malloced)
3331             free (dest_states);
3332           re_node_set_free (&follows);
3333           for (i = 0; i < ndests; ++i)
3334             re_node_set_free (dests_node + i);
3335           if (dests_node_malloced)
3336             free (dests_alloc);
3337           return 0;
3338         }
3339       dest_states_malloced = true;
3340     }
3341   dest_states_word = dest_states + ndests; /* Second third of the array.  */
3342   dest_states_nl = dest_states_word + ndests; /* Last third.  */
3343   bitset_empty (acceptable);
3344
3345   /* Then build the states for all destinations.  */
3346   for (i = 0; i < ndests; ++i)
3347     {
3348       int next_node;
3349       re_node_set_empty (&follows);
3350       /* Merge the follows of this destination states.  */
3351       for (j = 0; j < dests_node[i].nelem; ++j)
3352         {
3353           next_node = dfa->nexts[dests_node[i].elems[j]];
3354           if (next_node != -1)
3355             {
3356               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3357               if (BE (err != REG_NOERROR, 0))
3358                 goto out_free;
3359             }
3360         }
3361       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3362       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3363         goto out_free;
3364       /* If the new state has context constraint,
3365          build appropriate states for these contexts.  */
3366       if (dest_states[i]->has_constraint)
3367         {
3368           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3369                                                           CONTEXT_WORD);
3370           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3371             goto out_free;
3372
3373           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3374             need_word_trtable = 1;
3375
3376           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3377                                                         CONTEXT_NEWLINE);
3378           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3379             goto out_free;
3380         }
3381       else
3382         {
3383           dest_states_word[i] = dest_states[i];
3384           dest_states_nl[i] = dest_states[i];
3385         }
3386       bitset_merge (acceptable, dests_ch[i]);
3387     }
3388
3389   if (!BE (need_word_trtable, 0))
3390     {
3391       /* We don't care about whether the following character is a word
3392          character, or we are in a single-byte character set so we can
3393          discern by looking at the character code: allocate a
3394          256-entry transition table.  */
3395       trtable = state->trtable =
3396         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3397       if (BE (trtable == NULL, 0))
3398         goto out_free;
3399
3400       /* For all characters ch...:  */
3401       for (i = 0; i < BITSET_WORDS; ++i)
3402         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3403              elem;
3404              mask <<= 1, elem >>= 1, ++ch)
3405           if (BE (elem & 1, 0))
3406             {
3407               /* There must be exactly one destination which accepts
3408                  character ch.  See group_nodes_into_DFAstates.  */
3409               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3410                 ;
3411
3412               /* j-th destination accepts the word character ch.  */
3413               if (dfa->word_char[i] & mask)
3414                 trtable[ch] = dest_states_word[j];
3415               else
3416                 trtable[ch] = dest_states[j];
3417             }
3418     }
3419   else
3420     {
3421       /* We care about whether the following character is a word
3422          character, and we are in a multi-byte character set: discern
3423          by looking at the character code: build two 256-entry
3424          transition tables, one starting at trtable[0] and one
3425          starting at trtable[SBC_MAX].  */
3426       trtable = state->word_trtable =
3427         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3428       if (BE (trtable == NULL, 0))
3429         goto out_free;
3430
3431       /* For all characters ch...:  */
3432       for (i = 0; i < BITSET_WORDS; ++i)
3433         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3434              elem;
3435              mask <<= 1, elem >>= 1, ++ch)
3436           if (BE (elem & 1, 0))
3437             {
3438               /* There must be exactly one destination which accepts
3439                  character ch.  See group_nodes_into_DFAstates.  */
3440               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3441                 ;
3442
3443               /* j-th destination accepts the word character ch.  */
3444               trtable[ch] = dest_states[j];
3445               trtable[ch + SBC_MAX] = dest_states_word[j];
3446             }
3447     }
3448
3449   /* new line */
3450   if (bitset_contain (acceptable, NEWLINE_CHAR))
3451     {
3452       /* The current state accepts newline character.  */
3453       for (j = 0; j < ndests; ++j)
3454         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3455           {
3456             /* j-th destination accepts newline character.  */
3457             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3458             if (need_word_trtable)
3459               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3460             /* There must be only one destination which accepts
3461                newline.  See group_nodes_into_DFAstates.  */
3462             break;
3463           }
3464     }
3465
3466   if (dest_states_malloced)
3467     free (dest_states);
3468
3469   re_node_set_free (&follows);
3470   for (i = 0; i < ndests; ++i)
3471     re_node_set_free (dests_node + i);
3472
3473   if (dests_node_malloced)
3474     free (dests_alloc);
3475
3476   return 1;
3477 }
3478
3479 /* Group all nodes belonging to STATE into several destinations.
3480    Then for all destinations, set the nodes belonging to the destination
3481    to DESTS_NODE[i] and set the characters accepted by the destination
3482    to DESTS_CH[i].  Return the number of destinations, or -1 on error.  */
3483
3484 static int
3485 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3486                             re_node_set *dests_node, bitset_t *dests_ch)
3487 {
3488   reg_errcode_t err;
3489   int result;
3490   int i, j, k;
3491   int ndests; /* Number of the destinations from `state'.  */
3492   bitset_t accepts; /* Characters a node can accept.  */
3493   const re_node_set *cur_nodes = &state->nodes;
3494   bitset_empty (accepts);
3495   ndests = 0;
3496
3497   /* For all the nodes belonging to `state',  */
3498   for (i = 0; i < cur_nodes->nelem; ++i)
3499     {
3500       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3501       re_token_type_t type = node->type;
3502       unsigned int constraint = node->constraint;
3503
3504       /* Enumerate all single byte characters this node can accept.  */
3505       if (type == CHARACTER)
3506         bitset_set (accepts, node->opr.c);
3507       else if (type == SIMPLE_BRACKET)
3508         {
3509           bitset_merge (accepts, node->opr.sbcset);
3510         }
3511       else if (type == OP_PERIOD)
3512         {
3513 #ifdef RE_ENABLE_I18N
3514           if (dfa->mb_cur_max > 1)
3515             bitset_merge (accepts, dfa->sb_char);
3516           else
3517 #endif
3518             bitset_set_all (accepts);
3519           if (!(dfa->syntax & RE_DOT_NEWLINE))
3520             bitset_clear (accepts, '\n');
3521           if (dfa->syntax & RE_DOT_NOT_NULL)
3522             bitset_clear (accepts, '\0');
3523         }
3524 #ifdef RE_ENABLE_I18N
3525       else if (type == OP_UTF8_PERIOD)
3526         {
3527           memset (accepts, '\xff', sizeof (bitset_t) / 2); /* First half only.  */
3528           if (!(dfa->syntax & RE_DOT_NEWLINE))
3529             bitset_clear (accepts, '\n');
3530           if (dfa->syntax & RE_DOT_NOT_NULL)
3531             bitset_clear (accepts, '\0');
3532         }
3533 #endif
3534       else
3535         continue; /* Other node types accept no single byte here.  */
3536
3537       /* Check `accepts' and sift out the characters which cannot
3538          match in the node's context.  */
3539       if (constraint)
3540         {
3541           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3542             {
3543               bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3544               bitset_empty (accepts);
3545               if (accepts_newline)
3546                 bitset_set (accepts, NEWLINE_CHAR);
3547               else
3548                 continue;
3549             }
3550           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3551             {
3552               bitset_empty (accepts);
3553               continue;
3554             }
3555
3556           if (constraint & NEXT_WORD_CONSTRAINT)
3557             {
3558               bitset_word_t any_set = 0;
3559               if (type == CHARACTER && !node->word_char)
3560                 {
3561                   bitset_empty (accepts);
3562                   continue;
3563                 }
3564 #ifdef RE_ENABLE_I18N
3565               if (dfa->mb_cur_max > 1)
3566                 for (j = 0; j < BITSET_WORDS; ++j)
3567                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3568               else
3569 #endif
3570                 for (j = 0; j < BITSET_WORDS; ++j)
3571                   any_set |= (accepts[j] &= dfa->word_char[j]);
3572               if (!any_set)
3573                 continue; /* Nothing left to accept; ACCEPTS is already empty.  */
3574             }
3575           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3576             {
3577               bitset_word_t any_set = 0;
3578               if (type == CHARACTER && node->word_char)
3579                 {
3580                   bitset_empty (accepts);
3581                   continue;
3582                 }
3583 #ifdef RE_ENABLE_I18N
3584               if (dfa->mb_cur_max > 1)
3585                 for (j = 0; j < BITSET_WORDS; ++j)
3586                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3587               else
3588 #endif
3589                 for (j = 0; j < BITSET_WORDS; ++j)
3590                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3591               if (!any_set)
3592                 continue; /* Nothing left to accept; ACCEPTS is already empty.  */
3593             }
3594         }
3595
3596       /* Then divide `accepts' into DFA states, or create a new
3597          state.  Above, we make sure that accepts is not empty.  */
3598       for (j = 0; j < ndests; ++j)
3599         {
3600           bitset_t intersec; /* Intersection sets, see below.  */
3601           bitset_t remains;
3602           /* Flags, see below.  */
3603           bitset_word_t has_intersec, not_subset, not_consumed;
3604
3605           /* Optimization, skip if this state doesn't accept the character.  */
3606           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3607             continue;
3608
3609           /* Enumerate the intersection set of this state and `accepts'.  */
3610           has_intersec = 0;
3611           for (k = 0; k < BITSET_WORDS; ++k)
3612             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3613           /* And skip if the intersection set is empty.  */
3614           if (!has_intersec)
3615             continue;
3616
3617           /* Then check if this state is a subset of `accepts'.  */
3618           not_subset = not_consumed = 0;
3619           for (k = 0; k < BITSET_WORDS; ++k)
3620             {
3621               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3622               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3623             }
3624
3625           /* If this state isn't a subset of `accepts', create a
3626              new group state, which has the `remains'. */
3627           if (not_subset)
3628             {
3629               bitset_copy (dests_ch[ndests], remains);
3630               bitset_copy (dests_ch[j], intersec);
3631               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3632               if (BE (err != REG_NOERROR, 0))
3633                 goto error_return;
3634               ++ndests;
3635             }
3636
3637           /* Put the position in the current group. */
3638           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3639           if (BE (result < 0, 0))
3640             goto error_return;
3641
3642           /* If all characters are consumed, go to next node. */
3643           if (!not_consumed)
3644             break;
3645         }
3646       /* Some characters remain, create a new group. */
3647       if (j == ndests)
3648         {
3649           bitset_copy (dests_ch[ndests], accepts);
3650           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3651           if (BE (err != REG_NOERROR, 0))
3652             goto error_return;
3653           ++ndests;
3654           bitset_empty (accepts); /* Start the next node from an empty set.  */
3655         }
3656     }
3657   return ndests;
3658  error_return: /* Release every node set built so far.  */
3659   for (j = 0; j < ndests; ++j)
3660     re_node_set_free (dests_node + j);
3661   return -1;
3662 }
3663
3664 #ifdef RE_ENABLE_I18N
3665 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3666    Return the number of bytes the node accepts at STR_IDX, the current
3667    index into INPUT, or 0 if the node does not match there.
3668
3669    This function handles the nodes which can accept one multibyte
3670    character or one collating element, like '.' and '[a-z]', as opposed
3671    to the other nodes, which can only accept one byte.  */
3672
3673 # ifdef _LIBC
3674 #  include <locale/weight.h>
3675 # endif
3676
3677 static int
3678 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3679                          const re_string_t *input, int str_idx)
3680 {
3681   const re_token_t *node = dfa->nodes + node_idx;
3682   int char_len, elem_len;
3683   int i;
3684
3685   if (BE (node->type == OP_UTF8_PERIOD, 0))
3686     {
3687       unsigned char c = re_string_byte_at (input, str_idx), d;
3688       if (BE (c < 0xc2, 1)) /* Not accepted as a multibyte lead byte.  */
3689         return 0;
3690
3691       if (str_idx + 2 > input->len)
3692         return 0;
3693
3694       d = re_string_byte_at (input, str_idx + 1);
3695       if (c < 0xe0)
3696         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3697       else if (c < 0xf0)
3698         {
3699           char_len = 3;
3700           if (c == 0xe0 && d < 0xa0)
3701             return 0;
3702         }
3703       else if (c < 0xf8)
3704         {
3705           char_len = 4;
3706           if (c == 0xf0 && d < 0x90)
3707             return 0;
3708         }
3709       else if (c < 0xfc)
3710         {
3711           char_len = 5;
3712           if (c == 0xf8 && d < 0x88)
3713             return 0;
3714         }
3715       else if (c < 0xfe)
3716         {
3717           char_len = 6;
3718           if (c == 0xfc && d < 0x84)
3719             return 0;
3720         }
3721       else
3722         return 0;
3723
3724       if (str_idx + char_len > input->len)
3725         return 0;
3726
3727       for (i = 1; i < char_len; ++i) /* All trailing bytes: 0x80..0xbf.  */
3728         {
3729           d = re_string_byte_at (input, str_idx + i);
3730           if (d < 0x80 || d > 0xbf)
3731             return 0;
3732         }
3733       return char_len;
3734     }
3735
3736   char_len = re_string_char_size_at (input, str_idx);
3737   if (node->type == OP_PERIOD)
3738     {
3739       if (char_len <= 1)
3740         return 0;
3741       /* FIXME: I don't think this if is needed, as both '\n'
3742          and '\0' are char_len == 1.  */
3743       /* '.' accepts any one character except the following two cases.  */
3744       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3745            re_string_byte_at (input, str_idx) == '\n') ||
3746           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3747            re_string_byte_at (input, str_idx) == '\0'))
3748         return 0;
3749       return char_len;
3750     }
3751
3752   elem_len = re_string_elem_size_at (input, str_idx);
3753   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3754     return 0;
3755
3756   if (node->type == COMPLEX_BRACKET)
3757     {
3758       const re_charset_t *cset = node->opr.mbcset;
3759 # ifdef _LIBC
3760       const unsigned char *pin
3761         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3762       int j;
3763       uint32_t nrules;
3764 # endif /* _LIBC */
3765       int match_len = 0;
3766       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3767                     ? re_string_wchar_at (input, str_idx) : 0);
3768
3769       /* match with multibyte character?  */
3770       for (i = 0; i < cset->nmbchars; ++i)
3771         if (wc == cset->mbchars[i])
3772           {
3773             match_len = char_len;
3774             goto check_node_accept_bytes_match;
3775           }
3776       /* match with character_class?  */
3777       for (i = 0; i < cset->nchar_classes; ++i)
3778         {
3779           wctype_t wt = cset->char_classes[i];
3780           if (__iswctype (wc, wt))
3781             {
3782               match_len = char_len;
3783               goto check_node_accept_bytes_match;
3784             }
3785         }
3786
3787 # ifdef _LIBC
3788       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3789       if (nrules != 0)
3790         {
3791           unsigned int in_collseq = 0;
3792           const int32_t *table, *indirect;
3793           const unsigned char *weights, *extra;
3794           const char *collseqwc;
3795
3796           /* match with collating_symbol?  */
3797           if (cset->ncoll_syms)
3798             extra = (const unsigned char *)
3799               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3800           for (i = 0; i < cset->ncoll_syms; ++i)
3801             {
3802               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3803               /* Compare the length of input collating element and
3804                  the length of current collating element.  */
3805               if (*coll_sym != elem_len)
3806                 continue;
3807               /* Compare each bytes.  */
3808               for (j = 0; j < *coll_sym; j++)
3809                 if (pin[j] != coll_sym[1 + j])
3810                   break;
3811               if (j == *coll_sym)
3812                 {
3813                   /* Match if every bytes is equal.  */
3814                   match_len = j;
3815                   goto check_node_accept_bytes_match;
3816                 }
3817             }
3818
3819           if (cset->nranges)
3820             {
3821               if (elem_len <= char_len)
3822                 {
3823                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3824                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3825                 }
3826               else
3827                 in_collseq = find_collation_sequence_value (pin, elem_len);
3828             }
3829           /* match with range expression?  */
3830           for (i = 0; i < cset->nranges; ++i)
3831             if (cset->range_starts[i] <= in_collseq
3832                 && in_collseq <= cset->range_ends[i])
3833               {
3834                 match_len = elem_len;
3835                 goto check_node_accept_bytes_match;
3836               }
3837
3838           /* match with equivalence_class?  */
3839           if (cset->nequiv_classes)
3840             {
3841               const unsigned char *cp = pin;
3842               table = (const int32_t *)
3843                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3844               weights = (const unsigned char *)
3845                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3846               extra = (const unsigned char *)
3847                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3848               indirect = (const int32_t *)
3849                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3850               int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3851               if (idx > 0)
3852                 for (i = 0; i < cset->nequiv_classes; ++i)
3853                   {
3854                     int32_t equiv_class_idx = cset->equiv_classes[i];
3855                     size_t weight_len = weights[idx & 0xffffff];
3856                     if (weight_len == weights[equiv_class_idx & 0xffffff]
3857                         && (idx >> 24) == (equiv_class_idx >> 24))
3858                       {
3859                         /* Use masked copies: IDX must keep its rule byte
3860                            for the equivalence classes still to be tried.  */
3861                         const unsigned char *in_w
3862                           = weights + (idx & 0xffffff) + 1;
3863                         const unsigned char *cls_w
3864                           = weights + (equiv_class_idx & 0xffffff) + 1;
3865                         size_t cnt = 0; /* Compare exactly WEIGHT_LEN bytes.  */
3866                         while (cnt < weight_len && cls_w[cnt] == in_w[cnt])
3867                           ++cnt;
3868                         if (cnt == weight_len)
3869                           {
3870                             match_len = elem_len;
3871                             goto check_node_accept_bytes_match;
3872                           }
3873                       }
3874                   }
3875             }
3876         }
3877       else
3878 # endif /* _LIBC */
3879         {
3880           /* match with range expression?  */
3881 #if __GNUC__ >= 2
3882           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3883 #else
3884           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3885           cmp_buf[2] = wc;
3886 #endif
3887           for (i = 0; i < cset->nranges; ++i)
3888             {
3889               cmp_buf[0] = cset->range_starts[i];
3890               cmp_buf[4] = cset->range_ends[i];
3891               if (__wcscoll (cmp_buf, cmp_buf + 2) <= 0
3892                   && __wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3893                 {
3894                   match_len = char_len;
3895                   goto check_node_accept_bytes_match;
3896                 }
3897             }
3898         }
3899     check_node_accept_bytes_match:
3900       if (!cset->non_match)
3901         return match_len;
3902       else
3903         {
3904           if (match_len > 0) /* A negated set rejects whatever matched.  */
3905             return 0;
3906           else
3907             return (elem_len > char_len) ? elem_len : char_len;
3908         }
3909     }
3910   return 0;
3911 }
3912
3913 # ifdef _LIBC
3914 static unsigned int /* Sequence value for MBS, or UINT_MAX if not found.  */
3915 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3916 {
3917   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3918   if (nrules == 0)
3919     {
3920       if (mbs_len == 1)
3921         {
3922           /* No valid character.  Match it as a single byte character.  */
3923           const unsigned char *collseq = (const unsigned char *)
3924             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3925           return collseq[mbs[0]];
3926         }
3927       return UINT_MAX;
3928     }
3929   else
3930     {
3931       int32_t idx;
3932       const unsigned char *extra = (const unsigned char *)
3933         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3934       int32_t extrasize = (const unsigned char *)
3935         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3936
3937       for (idx = 0; idx < extrasize;) /* IDX is advanced inside the body.  */
3938         {
3939           int mbs_cnt, found = 0;
3940           int32_t elem_mbs_len;
3941           /* Skip the name of the collating element.  */
3942           idx = idx + extra[idx] + 1;
3943           elem_mbs_len = extra[idx++];
3944           if (mbs_len == elem_mbs_len)
3945             {
3946               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3947                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3948                   break;
3949               if (mbs_cnt == elem_mbs_len)
3950                 /* Found the entry.  */
3951                 found = 1;
3952             }
3953           /* Skip the byte sequence of the collating element.  */
3954           idx += elem_mbs_len;
3955           /* Adjust for the alignment.  */
3956           idx = (idx + 3) & ~3;
3957           /* Skip one 32-bit field.  NOTE(review): which field is unclear.  */
3958           idx += sizeof (uint32_t);
3959           /* Skip the wide char sequence of the collating element.  */
3960           idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3961           /* If we found the entry, return the sequence value.  */
3962           if (found)
3963             return *(uint32_t *) (extra + idx);
3964           /* Skip the collation sequence value.  */
3965           idx += sizeof (uint32_t);
3966         }
3967       return UINT_MAX;
3968     }
3969 }
3970 # endif /* _LIBC */
3971 #endif /* RE_ENABLE_I18N */
3972
3973 /* Decide whether NODE can consume the single byte at offset IDX of
3974    MCTX->input.  Return 1 if it can, 0 otherwise.  */
3975
3976 static int
3977 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3978                    int idx)
3979 {
3980   const unsigned char byte = re_string_byte_at (&mctx->input, idx);
3981
3982   /* Step 1: the node's own operand must admit BYTE.  */
3983   if (node->type == CHARACTER)
3984     {
3985       if (byte != node->opr.c)
3986         return 0;
3987     }
3988   else if (node->type == SIMPLE_BRACKET)
3989     {
3990       if (!bitset_contain (node->opr.sbcset, byte))
3991         return 0;
3992     }
3993   else
3994     {
3995       int is_dot = (node->type == OP_PERIOD);
3996 #ifdef RE_ENABLE_I18N
3997       if (node->type == OP_UTF8_PERIOD)
3998         {
3999           /* Bytes 0x80 and above are never accepted by this node here.  */
4000           if (byte >= 0x80)
4001             return 0;
4002           is_dot = 1;
4003         }
4004 #endif
4005       /* Every remaining node type accepts no byte at all.  */
4006       if (!is_dot)
4007         return 0;
4008       if (byte == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4009         return 0;
4010       if (byte == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL))
4011         return 0;
4012     }
4013
4014   /* Step 2: a node without constraints has nothing more to verify.  */
4015   if (!node->constraint)
4016     return 1;
4017   unsigned int context = re_string_context_at (&mctx->input, idx,
4018                                                mctx->eflags);
4019   /* Accept only if the context at IDX satisfies the constraints.  */
4020   return NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context) ? 0 : 1;
4021 }
4022
4023 /* Enlarge the input buffers of MCTX, and its state log, once they run out.  */
4024
4025 static reg_errcode_t
4026 __attribute_warn_unused_result__
4027 extend_buffers (re_match_context_t *mctx, int min_len)
4028 {
4029   re_string_t *const input = &mctx->input;
4030   reg_errcode_t status;
4031
4032   /* Refuse to grow when doubling BUFS_LEN could overflow.  */
4033   if (BE (input->bufs_len >= INT_MAX / 2 / sizeof (re_dfastate_t *), 0))
4034     return REG_ESPACE;
4035
4036   /* Grow to twice the current size, capped at INPUT->len, but never
4037      to less than MIN_LEN.  */
4038   status = re_string_realloc_buffers (input,
4039                                       MAX (min_len,
4040                                            MIN (input->len,
4041                                                 input->bufs_len * 2)));
4042   if (BE (status != REG_NOERROR, 0))
4043     return status;
4044
4045   /* The state log, when present, is reallocated for BUFS_LEN + 1 entries.  */
4046   /* XXX We have no indication of the size of this buffer.  If this
4047      allocation fail we have no indication that the state_log array
4048      does not have the right size.  */
4049   if (mctx->state_log != NULL)
4050     {
4051       re_dfastate_t **grown_log = re_realloc (mctx->state_log,
4052                                               re_dfastate_t *,
4053                                               input->bufs_len + 1);
4054       if (BE (grown_log == NULL, 0))
4055         return REG_ESPACE;
4056       mctx->state_log = grown_log;
4057     }
4058
4059   /* Finally rebuild the buffer contents.  Only the result of
4060      build_wcs_upper_buffer is propagated to the caller.  */
4061 #ifdef RE_ENABLE_I18N
4062   if (input->mb_cur_max > 1)
4063     {
4064       if (input->icase)
4065         {
4066           /* Its status is the function result, success or not.  */
4067           return build_wcs_upper_buffer (input);
4068         }
4069       build_wcs_buffer (input);
4070       return REG_NOERROR;
4071     }
4072 #endif /* RE_ENABLE_I18N  */
4073   /* Reached when MB_CUR_MAX <= 1, or always without RE_ENABLE_I18N.  */
4074   if (input->icase)
4075     {
4076       build_upper_buffer (input);
4077     }
4078   else if (input->trans != NULL)
4079     {
4080       re_string_translate_buffer (input);
4081     }
4082   return REG_NOERROR;
4083 }
4084
4085 \f
4086 /* Functions for matching context.  */
4087
4088 /* Initialize MCTX.  */
4089
4090 static reg_errcode_t
4091 __attribute_warn_unused_result__
4092 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4093 {
4094   mctx->eflags = eflags;
4095   mctx->match_last = -1;
4096   if (n > 0)
4097     {
4098       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4099       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4100       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4101         return REG_ESPACE;
4102     }
4103   /* Already zero-ed by the caller.
4104      else
4105        mctx->bkref_ents = NULL;
4106      mctx->nbkref_ents = 0;
4107      mctx->nsub_tops = 0;  */
4108   mctx->abkref_ents = n;
4109   mctx->max_mb_elem_len = 1;
4110   mctx->asub_tops = n;
4111   return REG_NOERROR;
4112 }
4113
4114 /* Clean the entries which depend on the current input in MCTX.
4115    This function must be invoked when the matcher changes the start index
4116    of the input, or changes the input string.  */
4117
4118 static void
4119 match_ctx_clean (re_match_context_t *mctx)
4120 {
4121   int st_idx;
4122   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4123     {
4124       int sl_idx;
4125       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4126       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4127         {
4128           re_sub_match_last_t *last = top->lasts[sl_idx];
4129           re_free (last->path.array);
4130           re_free (last);
4131         }
4132       re_free (top->lasts);
4133       if (top->path)
4134         {
4135           re_free (top->path->array);
4136           re_free (top->path);
4137         }
4138       free (top);
4139     }
4140
4141   mctx->nsub_tops = 0;
4142   mctx->nbkref_ents = 0;
4143 }
4144
4145 /* Free all the memory associated with MCTX.  */
4146
4147 static void
4148 match_ctx_free (re_match_context_t *mctx)
4149 {
4150   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4151   match_ctx_clean (mctx);
4152   re_free (mctx->sub_tops);
4153   re_free (mctx->bkref_ents);
4154 }
4155
4156 /* Add a new backreference entry to MCTX.
4157    Note that we assume that caller never call this function with duplicate
4158    entry, and call with STR_IDX which isn't smaller than any existing entry.
4159 */
4160
4161 static reg_errcode_t
4162 __attribute_warn_unused_result__
4163 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4164                      int to)
4165 {
4166   if (mctx->nbkref_ents >= mctx->abkref_ents)
4167     {
4168       struct re_backref_cache_entry* new_entry;
4169       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4170                               mctx->abkref_ents * 2);
4171       if (BE (new_entry == NULL, 0))
4172         {
4173           re_free (mctx->bkref_ents);
4174           return REG_ESPACE;
4175         }
4176       mctx->bkref_ents = new_entry;
4177       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4178               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4179       mctx->abkref_ents *= 2;
4180     }
4181   if (mctx->nbkref_ents > 0
4182       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4183     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4184
4185   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4186   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4187   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4188   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4189
4190   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4191      If bit N is clear, means that this entry won't epsilon-transition to
4192      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4193      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4194      such node.
4195
4196      A backreference does not epsilon-transition unless it is empty, so set
4197      to all zeros if FROM != TO.  */
4198   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4199     = (from == to ? ~0 : 0);
4200
4201   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4202   if (mctx->max_mb_elem_len < to - from)
4203     mctx->max_mb_elem_len = to - from;
4204   return REG_NOERROR;
4205 }
4206
4207 /* Search for the first entry which has the same str_idx, or -1 if none is
4208    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4209
4210 static int
4211 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4212 {
4213   int left, right, mid, last;
4214   last = right = mctx->nbkref_ents;
4215   for (left = 0; left < right;)
4216     {
4217       mid = (left + right) / 2;
4218       if (mctx->bkref_ents[mid].str_idx < str_idx)
4219         left = mid + 1;
4220       else
4221         right = mid;
4222     }
4223   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4224     return left;
4225   else
4226     return -1;
4227 }
4228
4229 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4230    at STR_IDX.  */
4231
4232 static reg_errcode_t
4233 __attribute_warn_unused_result__
4234 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4235 {
4236 #ifdef DEBUG
4237   assert (mctx->sub_tops != NULL);
4238   assert (mctx->asub_tops > 0);
4239 #endif
4240   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4241     {
4242       int new_asub_tops = mctx->asub_tops * 2;
4243       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4244                                                    re_sub_match_top_t *,
4245                                                    new_asub_tops);
4246       if (BE (new_array == NULL, 0))
4247         return REG_ESPACE;
4248       mctx->sub_tops = new_array;
4249       mctx->asub_tops = new_asub_tops;
4250     }
4251   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4252   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4253     return REG_ESPACE;
4254   mctx->sub_tops[mctx->nsub_tops]->node = node;
4255   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4256   return REG_NOERROR;
4257 }
4258
4259 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4260    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4261
4262 static re_sub_match_last_t *
4263 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4264 {
4265   re_sub_match_last_t *new_entry;
4266   if (BE (subtop->nlasts == subtop->alasts, 0))
4267     {
4268       int new_alasts = 2 * subtop->alasts + 1;
4269       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4270                                                     re_sub_match_last_t *,
4271                                                     new_alasts);
4272       if (BE (new_array == NULL, 0))
4273         return NULL;
4274       subtop->lasts = new_array;
4275       subtop->alasts = new_alasts;
4276     }
4277   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4278   if (BE (new_entry != NULL, 1))
4279     {
4280       subtop->lasts[subtop->nlasts] = new_entry;
4281       new_entry->node = node;
4282       new_entry->str_idx = str_idx;
4283       ++subtop->nlasts;
4284     }
4285   return new_entry;
4286 }
4287
4288 static void
4289 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4290                re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4291 {
4292   sctx->sifted_states = sifted_sts;
4293   sctx->limited_states = limited_sts;
4294   sctx->last_node = last_node;
4295   sctx->last_str_idx = last_str_idx;
4296   re_node_set_init_empty (&sctx->limits);
4297 }