posix/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2015 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 #include <stdint.h>
  21
  22 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  23                                      int n) internal_function;
  24 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  25 static void match_ctx_free (re_match_context_t *cache) internal_function;
  26 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  27                                           int str_idx, int from, int to)
  28      internal_function;
  29 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
  30      internal_function;
  31 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  32                                            int str_idx) internal_function;
  33 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  34                                                    int node, int str_idx)
  35      internal_function;
  36 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  37                            re_dfastate_t **limited_sts, int last_node,
  38                            int last_str_idx)
  39      internal_function;
  40 static reg_errcode_t re_search_internal (const regex_t *preg,
  41                                          const char *string, int length,
  42                                          int start, int range, int stop,
  43                                          size_t nmatch, regmatch_t pmatch[],
  44                                          int eflags) internal_function;
  45 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  46                              const char *string1, int length1,
  47                              const char *string2, int length2,
  48                              int start, int range, struct re_registers *regs,
  49                              int stop, int ret_len) internal_function;
  50 static int re_search_stub (struct re_pattern_buffer *bufp,
  51                            const char *string, int length, int start,
  52                            int range, int stop, struct re_registers *regs,
  53                            int ret_len) internal_function;
  54 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  55                               int nregs, int regs_allocated) internal_function;
  56 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  57      internal_function;
  58 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  59                            int *p_match_first) internal_function;
  60 static int check_halt_state_context (const re_match_context_t *mctx,
  61                                      const re_dfastate_t *state, int idx)
  62      internal_function;
  63 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  64                          regmatch_t *prev_idx_match, int cur_node,
  65                          int cur_idx, int nmatch) internal_function;
  66 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  67                                       int str_idx, int dest_node, int nregs,
  68                                       regmatch_t *regs,
  69                                       re_node_set *eps_via_nodes)
  70      internal_function;
  71 static reg_errcode_t set_regs (const regex_t *preg,
  72                                const re_match_context_t *mctx,
  73                                size_t nmatch, regmatch_t *pmatch,
  74                                int fl_backtrack) internal_function;
  75 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
  76      internal_function;
  77
  78 #ifdef RE_ENABLE_I18N
  79 static int sift_states_iter_mb (const re_match_context_t *mctx,
  80                                 re_sift_context_t *sctx,
  81                                 int node_idx, int str_idx, int max_str_idx)
  82      internal_function;
  83 #endif /* RE_ENABLE_I18N */
  84 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  85                                            re_sift_context_t *sctx)
  86      internal_function;
  87 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  88                                           re_sift_context_t *sctx, int str_idx,
  89                                           re_node_set *cur_dest)
  90      internal_function;
  91 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  92                                               re_sift_context_t *sctx,
  93                                               int str_idx,
  94                                               re_node_set *dest_nodes)
  95      internal_function;
  96 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  97                                             re_node_set *dest_nodes,
  98                                             const re_node_set *candidates)
  99      internal_function;
 100 static int check_dst_limits (const re_match_context_t *mctx,
 101                              re_node_set *limits,
 102                              int dst_node, int dst_idx, int src_node,
 103                              int src_idx) internal_function;
 104 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
 105                                         int boundaries, int subexp_idx,
 106                                         int from_node, int bkref_idx)
 107      internal_function;
 108 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
 109                                       int limit, int subexp_idx,
 110                                       int node, int str_idx,
 111                                       int bkref_idx) internal_function;
 112 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
 113                                           re_node_set *dest_nodes,
 114                                           const re_node_set *candidates,
 115                                           re_node_set *limits,
 116                                           struct re_backref_cache_entry *bkref_ents,
 117                                           int str_idx) internal_function;
 118 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 119                                         re_sift_context_t *sctx,
 120                                         int str_idx, const re_node_set *candidates)
 121      internal_function;
 122 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 123                                         re_dfastate_t **dst,
 124                                         re_dfastate_t **src, int num)
 125      internal_function;
 126 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 127                                          re_match_context_t *mctx) internal_function;
 128 static re_dfastate_t *transit_state (reg_errcode_t *err,
 129                                      re_match_context_t *mctx,
 130                                      re_dfastate_t *state) internal_function;
 131 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 132                                             re_match_context_t *mctx,
 133                                             re_dfastate_t *next_state)
 134      internal_function;
 135 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 136                                                 re_node_set *cur_nodes,
 137                                                 int str_idx) internal_function;
 138 #if 0
 139 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 140                                         re_match_context_t *mctx,
 141                                         re_dfastate_t *pstate)
 142      internal_function;
 143 #endif
 144 #ifdef RE_ENABLE_I18N
 145 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 146                                        re_dfastate_t *pstate)
 147      internal_function;
 148 #endif /* RE_ENABLE_I18N */
 149 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 150                                           const re_node_set *nodes)
 151      internal_function;
 152 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 153                                  int bkref_node, int bkref_str_idx)
 154      internal_function;
 155 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 156                                      const re_sub_match_top_t *sub_top,
 157                                      re_sub_match_last_t *sub_last,
 158                                      int bkref_node, int bkref_str)
 159      internal_function;
 160 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 161                              int subexp_idx, int type) internal_function;
 162 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 163                                     state_array_t *path, int top_node,
 164                                     int top_str, int last_node, int last_str,
 165                                     int type) internal_function;
 166 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 167                                                    int str_idx,
 168                                                    re_node_set *cur_nodes,
 169                                                    re_node_set *next_nodes)
 170      internal_function;
 171 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 172                                                re_node_set *cur_nodes,
 173                                                int ex_subexp, int type)
 174      internal_function;
 175 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 176                                                    re_node_set *dst_nodes,
 177                                                    int target, int ex_subexp,
 178                                                    int type) internal_function;
 179 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 180                                          re_node_set *cur_nodes, int cur_str,
 181                                          int subexp_num, int type)
 182      internal_function;
 183 static int build_trtable (const re_dfa_t *dfa,
 184                           re_dfastate_t *state) internal_function;
 185 #ifdef RE_ENABLE_I18N
 186 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 187                                     const re_string_t *input, int idx)
 188      internal_function;
 189 # ifdef _LIBC
 190 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 191                                                    size_t name_len)
 192      internal_function;
 193 # endif /* _LIBC */
 194 #endif /* RE_ENABLE_I18N */
 195 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
 196                                        const re_dfastate_t *state,
 197                                        re_node_set *states_node,
 198                                        bitset_t *states_ch) internal_function;
 199 static int check_node_accept (const re_match_context_t *mctx,
 200                               const re_token_t *node, int idx)
 201      internal_function;
 202 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len)
 203      internal_function;
 204 \f
 205 /* Entry point for POSIX code.  */
 206
 207 /* regexec searches for a given pattern, specified by PREG, in the
 208    string STRING.
 209
 210    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 211    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 212    least NMATCH elements, and we set them to the offsets of the
 213    corresponding matched substrings.
 214
 215    EFLAGS specifies `execution flags' which affect matching: if
 216    REG_NOTBOL is set, then ^ does not match at the beginning of the
 217    string; if REG_NOTEOL is set, then $ does not match at the end.
 218
 219    We return 0 if we find a match and REG_NOMATCH if not.  */
 220
 221 int
 222 regexec (preg, string, nmatch, pmatch, eflags)
 223     const regex_t *__restrict preg;
 224     const char *__restrict string;
 225     size_t nmatch;
 226     regmatch_t pmatch[];
 227     int eflags;
 228 {
 229   reg_errcode_t err;
 230   int start, length;
 231   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 232
 233   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 234     return REG_BADPAT;
 235
 236   if (eflags & REG_STARTEND)
 237     {
 238       start = pmatch[0].rm_so;
 239       length = pmatch[0].rm_eo;
 240     }
 241   else
 242     {
 243       start = 0;
 244       length = strlen (string);
 245     }
 246
 247   __libc_lock_lock (dfa->lock);
 248   if (preg->no_sub)
 249     err = re_search_internal (preg, string, length, start, length - start,
 250                               length, 0, NULL, eflags);
 251   else
 252     err = re_search_internal (preg, string, length, start, length - start,
 253                               length, nmatch, pmatch, eflags);
 254   __libc_lock_unlock (dfa->lock);
 255   return err != REG_NOERROR;
 256 }
 257
 258 #ifdef _LIBC
 259 # include <shlib-compat.h>
 260 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 261
 262 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 263 __typeof__ (__regexec) __compat_regexec;
 264
 265 int
 266 attribute_compat_text_section
 267 __compat_regexec (const regex_t *__restrict preg,
 268                   const char *__restrict string, size_t nmatch,
 269                   regmatch_t pmatch[], int eflags)
 270 {
 271   return regexec (preg, string, nmatch, pmatch,
 272                   eflags & (REG_NOTBOL | REG_NOTEOL));
 273 }
 274 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 275 # endif
 276 #endif
 277
 278 /* Entry points for GNU code.  */
 279
 280 /* re_match, re_search, re_match_2, re_search_2
 281
 282    The former two functions operate on STRING with length LENGTH,
 283    while the later two operate on concatenation of STRING1 and STRING2
 284    with lengths LENGTH1 and LENGTH2, respectively.
 285
 286    re_match() matches the compiled pattern in BUFP against the string,
 287    starting at index START.
 288
 289    re_search() first tries matching at index START, then it tries to match
 290    starting from index START + 1, and so on.  The last start position tried
 291    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 292    way as re_match().)
 293
 294    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 295    the first STOP characters of the concatenation of the strings should be
 296    concerned.
 297
 298    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 299    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 300    computed relative to the concatenation, not relative to the individual
 301    strings.)
 302
 303    On success, re_match* functions return the length of the match, re_search*
 304    return the position of the start of the match.  Return value -1 means no
 305    match was found and -2 indicates an internal error.  */
 306
 307 int
 308 re_match (bufp, string, length, start, regs)
 309     struct re_pattern_buffer *bufp;
 310     const char *string;
 311     int length, start;
 312     struct re_registers *regs;
 313 {
 314   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 315 }
 316 #ifdef _LIBC
 317 weak_alias (__re_match, re_match)
 318 #endif
 319
 320 int
 321 re_search (bufp, string, length, start, range, regs)
 322     struct re_pattern_buffer *bufp;
 323     const char *string;
 324     int length, start, range;
 325     struct re_registers *regs;
 326 {
 327   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 328 }
 329 #ifdef _LIBC
 330 weak_alias (__re_search, re_search)
 331 #endif
 332
 333 int
 334 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 335     struct re_pattern_buffer *bufp;
 336     const char *string1, *string2;
 337     int length1, length2, start, stop;
 338     struct re_registers *regs;
 339 {
 340   return re_search_2_stub (bufp, string1, length1, string2, length2,
 341                            start, 0, regs, stop, 1);
 342 }
 343 #ifdef _LIBC
 344 weak_alias (__re_match_2, re_match_2)
 345 #endif
 346
 347 int
 348 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 349     struct re_pattern_buffer *bufp;
 350     const char *string1, *string2;
 351     int length1, length2, start, range, stop;
 352     struct re_registers *regs;
 353 {
 354   return re_search_2_stub (bufp, string1, length1, string2, length2,
 355                            start, range, regs, stop, 0);
 356 }
 357 #ifdef _LIBC
 358 weak_alias (__re_search_2, re_search_2)
 359 #endif
 360
 361 static int
 362 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 363                   stop, ret_len)
 364     struct re_pattern_buffer *bufp;
 365     const char *string1, *string2;
 366     int length1, length2, start, range, stop, ret_len;
 367     struct re_registers *regs;
 368 {
 369   const char *str;
 370   int rval;
 371   int len = length1 + length2;
 372   char *s = NULL;
 373
 374   if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
 375     return -2;
 376
 377   /* Concatenate the strings.  */
 378   if (length2 > 0)
 379     if (length1 > 0)
 380       {
 381         s = re_malloc (char, len);
 382
 383         if (BE (s == NULL, 0))
 384           return -2;
 385 #ifdef _LIBC
 386         memcpy (__mempcpy (s, string1, length1), string2, length2);
 387 #else
 388         memcpy (s, string1, length1);
 389         memcpy (s + length1, string2, length2);
 390 #endif
 391         str = s;
 392       }
 393     else
 394       str = string2;
 395   else
 396     str = string1;
 397
 398   rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
 399   re_free (s);
 400   return rval;
 401 }
 402
 403 /* The parameters have the same meaning as those of re_search.
 404    Additional parameters:
 405    If RET_LEN is nonzero the length of the match is returned (re_match style);
 406    otherwise the position of the match is returned.  */
 407
 408 static int
 409 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 410     struct re_pattern_buffer *bufp;
 411     const char *string;
 412     int length, start, range, stop, ret_len;
 413     struct re_registers *regs;
 414 {
 415   reg_errcode_t result;
 416   regmatch_t *pmatch;
 417   int nregs, rval;
 418   int eflags = 0;
 419   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 420
 421   /* Check for out-of-range.  */
 422   if (BE (start < 0 || start > length, 0))
 423     return -1;
 424   if (BE (start + range > length, 0))
 425     range = length - start;
 426   else if (BE (start + range < 0, 0))
 427     range = -start;
 428
 429   __libc_lock_lock (dfa->lock);
 430
 431   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 432   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 433
 434   /* Compile fastmap if we haven't yet.  */
 435   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 436     re_compile_fastmap (bufp);
 437
 438   if (BE (bufp->no_sub, 0))
 439     regs = NULL;
 440
 441   /* We need at least 1 register.  */
 442   if (regs == NULL)
 443     nregs = 1;
 444   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 445                regs->num_regs < bufp->re_nsub + 1, 0))
 446     {
 447       nregs = regs->num_regs;
 448       if (BE (nregs < 1, 0))
 449         {
 450           /* Nothing can be copied to regs.  */
 451           regs = NULL;
 452           nregs = 1;
 453         }
 454     }
 455   else
 456     nregs = bufp->re_nsub + 1;
 457   pmatch = re_malloc (regmatch_t, nregs);
 458   if (BE (pmatch == NULL, 0))
 459     {
 460       rval = -2;
 461       goto out;
 462     }
 463
 464   result = re_search_internal (bufp, string, length, start, range, stop,
 465                                nregs, pmatch, eflags);
 466
 467   rval = 0;
 468
 469   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 470   if (result != REG_NOERROR)
 471     rval = -1;
 472   else if (regs != NULL)
 473     {
 474       /* If caller wants register contents data back, copy them.  */
 475       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 476                                            bufp->regs_allocated);
 477       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 478         rval = -2;
 479     }
 480
 481   if (BE (rval == 0, 1))
 482     {
 483       if (ret_len)
 484         {
 485           assert (pmatch[0].rm_so == start);
 486           rval = pmatch[0].rm_eo - start;
 487         }
 488       else
 489         rval = pmatch[0].rm_so;
 490     }
 491   re_free (pmatch);
 492  out:
 493   __libc_lock_unlock (dfa->lock);
 494   return rval;
 495 }
 496
 497 static unsigned
 498 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 499     struct re_registers *regs;
 500     regmatch_t *pmatch;
 501     int nregs, regs_allocated;
 502 {
 503   int rval = REGS_REALLOCATE;
 504   int i;
 505   int need_regs = nregs + 1;
 506   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 507      uses.  */
 508
 509   /* Have the register data arrays been allocated?  */
 510   if (regs_allocated == REGS_UNALLOCATED)
 511     { /* No.  So allocate them with malloc.  */
 512       regs->start = re_malloc (regoff_t, need_regs);
 513       if (BE (regs->start == NULL, 0))
 514         return REGS_UNALLOCATED;
 515       regs->end = re_malloc (regoff_t, need_regs);
 516       if (BE (regs->end == NULL, 0))
 517         {
 518           re_free (regs->start);
 519           return REGS_UNALLOCATED;
 520         }
 521       regs->num_regs = need_regs;
 522     }
 523   else if (regs_allocated == REGS_REALLOCATE)
 524     { /* Yes.  If we need more elements than were already
 525          allocated, reallocate them.  If we need fewer, just
 526          leave it alone.  */
 527       if (BE (need_regs > regs->num_regs, 0))
 528         {
 529           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 530           regoff_t *new_end;
 531           if (BE (new_start == NULL, 0))
 532             return REGS_UNALLOCATED;
 533           new_end = re_realloc (regs->end, regoff_t, need_regs);
 534           if (BE (new_end == NULL, 0))
 535             {
 536               re_free (new_start);
 537               return REGS_UNALLOCATED;
 538             }
 539           regs->start = new_start;
 540           regs->end = new_end;
 541           regs->num_regs = need_regs;
 542         }
 543     }
 544   else
 545     {
 546       assert (regs_allocated == REGS_FIXED);
 547       /* This function may not be called with REGS_FIXED and nregs too big.  */
 548       assert (regs->num_regs >= nregs);
 549       rval = REGS_FIXED;
 550     }
 551
 552   /* Copy the regs.  */
 553   for (i = 0; i < nregs; ++i)
 554     {
 555       regs->start[i] = pmatch[i].rm_so;
 556       regs->end[i] = pmatch[i].rm_eo;
 557     }
 558   for ( ; i < regs->num_regs; ++i)
 559     regs->start[i] = regs->end[i] = -1;
 560
 561   return rval;
 562 }
 563
 564 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 565    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 566    this memory for recording register information.  STARTS and ENDS
 567    must be allocated using the malloc library routine, and must each
 568    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 569
 570    If NUM_REGS == 0, then subsequent matches should allocate their own
 571    register data.
 572
 573    Unless this function is called, the first search or match using
 574    PATTERN_BUFFER will allocate its own register data, without
 575    freeing the old data.  */
 576
 577 void
 578 re_set_registers (bufp, regs, num_regs, starts, ends)
 579     struct re_pattern_buffer *bufp;
 580     struct re_registers *regs;
 581     unsigned num_regs;
 582     regoff_t *starts, *ends;
 583 {
 584   if (num_regs)
 585     {
 586       bufp->regs_allocated = REGS_REALLOCATE;
 587       regs->num_regs = num_regs;
 588       regs->start = starts;
 589       regs->end = ends;
 590     }
 591   else
 592     {
 593       bufp->regs_allocated = REGS_UNALLOCATED;
 594       regs->num_regs = 0;
 595       regs->start = regs->end = (regoff_t *) 0;
 596     }
 597 }
 598 #ifdef _LIBC
 599 weak_alias (__re_set_registers, re_set_registers)
 600 #endif
 601 \f
 602 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 603    them unless specifically requested.  */
 604
 605 #if defined _REGEX_RE_COMP || defined _LIBC
 606 int
 607 # ifdef _LIBC
 608 weak_function
 609 # endif
 610 re_exec (s)
 611      const char *s;
 612 {
 613   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 614 }
 615 #endif /* _REGEX_RE_COMP */
 616 \f
 617 /* Internal entry point.  */
 618
 619 /* Searches for a compiled pattern PREG in the string STRING, whose
 620    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 621    mingings with regexec.  START, and RANGE have the same meanings
 622    with re_search.
 623    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 624    otherwise return the error code.
 625    Note: We assume front end functions already check ranges.
 626    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 627
 628 static reg_errcode_t
 629 __attribute_warn_unused_result__
 630 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 631                     eflags)
 632     const regex_t *preg;
 633     const char *string;
 634     int length, start, range, stop, eflags;
 635     size_t nmatch;
 636     regmatch_t pmatch[];
 637 {
 638   reg_errcode_t err;
 639   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
 640   int left_lim, right_lim, incr;
 641   int fl_longest_match, match_first, match_kind, match_last = -1;
 642   int extra_nmatch;
 643   int sb, ch;
 644 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 645   re_match_context_t mctx = { .dfa = dfa };
 646 #else
 647   re_match_context_t mctx;
 648 #endif
 649   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 650                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 651   RE_TRANSLATE_TYPE t = preg->translate;
 652
 653 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 654   memset (&mctx, '\0', sizeof (re_match_context_t));
 655   mctx.dfa = dfa;
 656 #endif
 657
 658   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 659   nmatch -= extra_nmatch;
 660
 661   /* Check if the DFA haven't been compiled.  */
 662   if (BE (preg->used == 0 || dfa->init_state == NULL
 663           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 664           || dfa->init_state_begbuf == NULL, 0))
 665     return REG_NOMATCH;
 666
 667 #ifdef DEBUG
 668   /* We assume front-end functions already check them.  */
 669   assert (start + range >= 0 && start + range <= length);
 670 #endif
 671
 672   /* If initial states with non-begbuf contexts have no elements,
 673      the regex must be anchored.  If preg->newline_anchor is set,
 674      we'll never use init_state_nl, so do not check it.  */
 675   if (dfa->init_state->nodes.nelem == 0
 676       && dfa->init_state_word->nodes.nelem == 0
 677       && (dfa->init_state_nl->nodes.nelem == 0
 678           || !preg->newline_anchor))
 679     {
 680       if (start != 0 && start + range != 0)
 681         return REG_NOMATCH;
 682       start = range = 0;
 683     }
 684
 685   /* We must check the longest matching, if nmatch > 0.  */
 686   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 687
 688   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 689                             preg->translate, preg->syntax & RE_ICASE, dfa);
 690   if (BE (err != REG_NOERROR, 0))
 691     goto free_return;
 692   mctx.input.stop = stop;
 693   mctx.input.raw_stop = stop;
 694   mctx.input.newline_anchor = preg->newline_anchor;
 695
 696   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 697   if (BE (err != REG_NOERROR, 0))
 698     goto free_return;
 699
 700   /* We will log all the DFA states through which the dfa pass,
 701      if nmatch > 1, or this dfa has "multibyte node", which is a
 702      back-reference or a node which can accept multibyte character or
 703      multi character collating element.  */
 704   if (nmatch > 1 || dfa->has_mb_node)
 705     {
 706       /* Avoid overflow.  */
 707       if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
 708         {
 709           err = REG_ESPACE;
 710           goto free_return;
 711         }
 712
 713       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 714       if (BE (mctx.state_log == NULL, 0))
 715         {
 716           err = REG_ESPACE;
 717           goto free_return;
 718         }
 719     }
 720   else
 721     mctx.state_log = NULL;
 722
 723   match_first = start;
 724   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 725                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 726
 727   /* Check incrementally whether of not the input string match.  */
 728   incr = (range < 0) ? -1 : 1;
 729   left_lim = (range < 0) ? start + range : start;
 730   right_lim = (range < 0) ? start : start + range;
 731   sb = dfa->mb_cur_max == 1;
 732   match_kind =
 733     (fastmap
 734      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 735         | (range >= 0 ? 2 : 0)
 736         | (t != NULL ? 1 : 0))
 737      : 8);
 738
 739   for (;; match_first += incr)
 740     {
 741       err = REG_NOMATCH;
 742       if (match_first < left_lim || right_lim < match_first)
 743         goto free_return;
 744
 745       /* Advance as rapidly as possible through the string, until we
 746          find a plausible place to start matching.  This may be done
 747          with varying efficiency, so there are various possibilities:
 748          only the most common of them are specialized, in order to
 749          save on code size.  We use a switch statement for speed.  */
 750       switch (match_kind)
 751         {
 752         case 8:
 753           /* No fastmap.  */
 754           break;
 755
 756         case 7:
 757           /* Fastmap with single-byte translation, match forward.  */
 758           while (BE (match_first < right_lim, 1)
 759                  && !fastmap[t[(unsigned char) string[match_first]]])
 760             ++match_first;
 761           goto forward_match_found_start_or_reached_end;
 762
 763         case 6:
 764           /* Fastmap without translation, match forward.  */
 765           while (BE (match_first < right_lim, 1)
 766                  && !fastmap[(unsigned char) string[match_first]])
 767             ++match_first;
 768
 769         forward_match_found_start_or_reached_end:
 770           if (BE (match_first == right_lim, 0))
 771             {
 772               ch = match_first >= length
 773                        ? 0 : (unsigned char) string[match_first];
 774               if (!fastmap[t ? t[ch] : ch])
 775                 goto free_return;
 776             }
 777           break;
 778
 779         case 4:
 780         case 5:
 781           /* Fastmap without multi-byte translation, match backwards.  */
 782           while (match_first >= left_lim)
 783             {
 784               ch = match_first >= length
 785                        ? 0 : (unsigned char) string[match_first];
 786               if (fastmap[t ? t[ch] : ch])
 787                 break;
 788               --match_first;
 789             }
 790           if (match_first < left_lim)
 791             goto free_return;
 792           break;
 793
 794         default:
 795           /* In this case, we can't determine easily the current byte,
 796              since it might be a component byte of a multibyte
 797              character.  Then we use the constructed buffer instead.  */
 798           for (;;)
 799             {
 800               /* If MATCH_FIRST is out of the valid range, reconstruct the
 801                  buffers.  */
 802               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 803               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 804                 {
 805                   err = re_string_reconstruct (&mctx.input, match_first,
 806                                                eflags);
 807                   if (BE (err != REG_NOERROR, 0))
 808                     goto free_return;
 809
 810                   offset = match_first - mctx.input.raw_mbs_idx;
 811                 }
 812               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 813                  Note that MATCH_FIRST must not be smaller than 0.  */
 814               ch = (match_first >= length
 815                     ? 0 : re_string_byte_at (&mctx.input, offset));
 816               if (fastmap[ch])
 817                 break;
 818               match_first += incr;
 819               if (match_first < left_lim || match_first > right_lim)
 820                 {
 821                   err = REG_NOMATCH;
 822                   goto free_return;
 823                 }
 824             }
 825           break;
 826         }
 827
 828       /* Reconstruct the buffers so that the matcher can assume that
 829          the matching starts from the beginning of the buffer.  */
 830       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 831       if (BE (err != REG_NOERROR, 0))
 832         goto free_return;
 833
 834 #ifdef RE_ENABLE_I18N
 835      /* Don't consider this char as a possible match start if it part,
 836         yet isn't the head, of a multibyte character.  */
 837       if (!sb && !re_string_first_byte (&mctx.input, 0))
 838         continue;
 839 #endif
 840
 841       /* It seems to be appropriate one, then use the matcher.  */
 842       /* We assume that the matching starts from 0.  */
 843       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 844       match_last = check_matching (&mctx, fl_longest_match,
 845                                    range >= 0 ? &match_first : NULL);
 846       if (match_last != -1)
 847         {
 848           if (BE (match_last == -2, 0))
 849             {
 850               err = REG_ESPACE;
 851               goto free_return;
 852             }
 853           else
 854             {
 855               mctx.match_last = match_last;
 856               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 857                 {
 858                   re_dfastate_t *pstate = mctx.state_log[match_last];
 859                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 860                                                              match_last);
 861                 }
 862               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 863                   || dfa->nbackref)
 864                 {
 865                   err = prune_impossible_nodes (&mctx);
 866                   if (err == REG_NOERROR)
 867                     break;
 868                   if (BE (err != REG_NOMATCH, 0))
 869                     goto free_return;
 870                   match_last = -1;
 871                 }
 872               else
 873                 break; /* We found a match.  */
 874             }
 875         }
 876
 877       match_ctx_clean (&mctx);
 878     }
 879
 880 #ifdef DEBUG
 881   assert (match_last != -1);
 882   assert (err == REG_NOERROR);
 883 #endif
 884
 885   /* Set pmatch[] if we need.  */
 886   if (nmatch > 0)
 887     {
 888       int reg_idx;
 889
 890       /* Initialize registers.  */
 891       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 892         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 893
 894       /* Set the points where matching start/end.  */
 895       pmatch[0].rm_so = 0;
 896       pmatch[0].rm_eo = mctx.match_last;
 897
 898       if (!preg->no_sub && nmatch > 1)
 899         {
 900           err = set_regs (preg, &mctx, nmatch, pmatch,
 901                           dfa->has_plural_match && dfa->nbackref > 0);
 902           if (BE (err != REG_NOERROR, 0))
 903             goto free_return;
 904         }
 905
 906       /* At last, add the offset to the each registers, since we slided
 907          the buffers so that we could assume that the matching starts
 908          from 0.  */
 909       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 910         if (pmatch[reg_idx].rm_so != -1)
 911           {
 912 #ifdef RE_ENABLE_I18N
 913             if (BE (mctx.input.offsets_needed != 0, 0))
 914               {
 915                 pmatch[reg_idx].rm_so =
 916                   (pmatch[reg_idx].rm_so == mctx.input.valid_len
 917                    ? mctx.input.valid_raw_len
 918                    : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 919                 pmatch[reg_idx].rm_eo =
 920                   (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 921                    ? mctx.input.valid_raw_len
 922                    : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 923               }
 924 #else
 925             assert (mctx.input.offsets_needed == 0);
 926 #endif
 927             pmatch[reg_idx].rm_so += match_first;
 928             pmatch[reg_idx].rm_eo += match_first;
 929           }
 930       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 931         {
 932           pmatch[nmatch + reg_idx].rm_so = -1;
 933           pmatch[nmatch + reg_idx].rm_eo = -1;
 934         }
 935
 936       if (dfa->subexp_map)
 937         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 938           if (dfa->subexp_map[reg_idx] != reg_idx)
 939             {
 940               pmatch[reg_idx + 1].rm_so
 941                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 942               pmatch[reg_idx + 1].rm_eo
 943                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 944             }
 945     }
 946
 947  free_return:
 948   re_free (mctx.state_log);
 949   if (dfa->nbackref)
 950     match_ctx_free (&mctx);
 951   re_string_destruct (&mctx.input);
 952   return err;
 953 }
 954
 955 static reg_errcode_t
 956 __attribute_warn_unused_result__
 957 prune_impossible_nodes (mctx)
 958      re_match_context_t *mctx;
 959 {
 960   const re_dfa_t *const dfa = mctx->dfa;
 961   int halt_node, match_last;
 962   reg_errcode_t ret;
 963   re_dfastate_t **sifted_states;
 964   re_dfastate_t **lim_states = NULL;
 965   re_sift_context_t sctx;
 966 #ifdef DEBUG
 967   assert (mctx->state_log != NULL);
 968 #endif
 969   match_last = mctx->match_last;
 970   halt_node = mctx->last_node;
 971
 972   /* Avoid overflow.  */
 973   if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
 974     return REG_ESPACE;
 975
 976   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 977   if (BE (sifted_states == NULL, 0))
 978     {
 979       ret = REG_ESPACE;
 980       goto free_return;
 981     }
 982   if (dfa->nbackref)
 983     {
 984       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 985       if (BE (lim_states == NULL, 0))
 986         {
 987           ret = REG_ESPACE;
 988           goto free_return;
 989         }
 990       while (1)
 991         {
 992           memset (lim_states, '\0',
 993                   sizeof (re_dfastate_t *) * (match_last + 1));
 994           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 995                          match_last);
 996           ret = sift_states_backward (mctx, &sctx);
 997           re_node_set_free (&sctx.limits);
 998           if (BE (ret != REG_NOERROR, 0))
 999               goto free_return;
1000           if (sifted_states[0] != NULL || lim_states[0] != NULL)
1001             break;
1002           do
1003             {
1004               --match_last;
1005               if (match_last < 0)
1006                 {
1007                   ret = REG_NOMATCH;
1008                   goto free_return;
1009                 }
1010             } while (mctx->state_log[match_last] == NULL
1011                      || !mctx->state_log[match_last]->halt);
1012           halt_node = check_halt_state_context (mctx,
1013                                                 mctx->state_log[match_last],
1014                                                 match_last);
1015         }
1016       ret = merge_state_array (dfa, sifted_states, lim_states,
1017                                match_last + 1);
1018       re_free (lim_states);
1019       lim_states = NULL;
1020       if (BE (ret != REG_NOERROR, 0))
1021         goto free_return;
1022     }
1023   else
1024     {
1025       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1026       ret = sift_states_backward (mctx, &sctx);
1027       re_node_set_free (&sctx.limits);
1028       if (BE (ret != REG_NOERROR, 0))
1029         goto free_return;
1030       if (sifted_states[0] == NULL)
1031         {
1032           ret = REG_NOMATCH;
1033           goto free_return;
1034         }
1035     }
1036   re_free (mctx->state_log);
1037   mctx->state_log = sifted_states;
1038   sifted_states = NULL;
1039   mctx->last_node = halt_node;
1040   mctx->match_last = match_last;
1041   ret = REG_NOERROR;
1042  free_return:
1043   re_free (sifted_states);
1044   re_free (lim_states);
1045   return ret;
1046 }
1047
1048 /* Acquire an initial state and return it.
1049    We must select appropriate initial state depending on the context,
1050    since initial states may have constraints like "\<", "^", etc..  */
1051
1052 static inline re_dfastate_t *
1053 __attribute ((always_inline)) internal_function
1054 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1055                             int idx)
1056 {
1057   const re_dfa_t *const dfa = mctx->dfa;
1058   if (dfa->init_state->has_constraint)
1059     {
1060       unsigned int context;
1061       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1062       if (IS_WORD_CONTEXT (context))
1063         return dfa->init_state_word;
1064       else if (IS_ORDINARY_CONTEXT (context))
1065         return dfa->init_state;
1066       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1067         return dfa->init_state_begbuf;
1068       else if (IS_NEWLINE_CONTEXT (context))
1069         return dfa->init_state_nl;
1070       else if (IS_BEGBUF_CONTEXT (context))
1071         {
1072           /* It is relatively rare case, then calculate on demand.  */
1073           return re_acquire_state_context (err, dfa,
1074                                            dfa->init_state->entrance_nodes,
1075                                            context);
1076         }
1077       else
1078         /* Must not happen?  */
1079         return dfa->init_state;
1080     }
1081   else
1082     return dfa->init_state;
1083 }
1084
1085 /* Check whether the regular expression match input string INPUT or not,
1086    and return the index where the matching end, return -1 if not match,
1087    or return -2 in case of an error.
1088    FL_LONGEST_MATCH means we want the POSIX longest matching.
1089    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1090    next place where we may want to try matching.
1091    Note that the matcher assume that the maching starts from the current
1092    index of the buffer.  */
1093
1094 static int
1095 internal_function __attribute_warn_unused_result__
1096 check_matching (re_match_context_t *mctx, int fl_longest_match,
1097                 int *p_match_first)
1098 {
1099   const re_dfa_t *const dfa = mctx->dfa;
1100   reg_errcode_t err;
1101   int match = 0;
1102   int match_last = -1;
1103   int cur_str_idx = re_string_cur_idx (&mctx->input);
1104   re_dfastate_t *cur_state;
1105   int at_init_state = p_match_first != NULL;
1106   int next_start_idx = cur_str_idx;
1107
1108   err = REG_NOERROR;
1109   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1110   /* An initial state must not be NULL (invalid).  */
1111   if (BE (cur_state == NULL, 0))
1112     {
1113       assert (err == REG_ESPACE);
1114       return -2;
1115     }
1116
1117   if (mctx->state_log != NULL)
1118     {
1119       mctx->state_log[cur_str_idx] = cur_state;
1120
1121       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1122          later.  E.g. Processing back references.  */
1123       if (BE (dfa->nbackref, 0))
1124         {
1125           at_init_state = 0;
1126           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1127           if (BE (err != REG_NOERROR, 0))
1128             return err;
1129
1130           if (cur_state->has_backref)
1131             {
1132               err = transit_state_bkref (mctx, &cur_state->nodes);
1133               if (BE (err != REG_NOERROR, 0))
1134                 return err;
1135             }
1136         }
1137     }
1138
1139   /* If the RE accepts NULL string.  */
1140   if (BE (cur_state->halt, 0))
1141     {
1142       if (!cur_state->has_constraint
1143           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1144         {
1145           if (!fl_longest_match)
1146             return cur_str_idx;
1147           else
1148             {
1149               match_last = cur_str_idx;
1150               match = 1;
1151             }
1152         }
1153     }
1154
1155   while (!re_string_eoi (&mctx->input))
1156     {
1157       re_dfastate_t *old_state = cur_state;
1158       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1159
1160       if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1161            && mctx->input.bufs_len < mctx->input.len)
1162           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1163               && mctx->input.valid_len < mctx->input.len))
1164         {
1165           err = extend_buffers (mctx, next_char_idx + 1);
1166           if (BE (err != REG_NOERROR, 0))
1167             {
1168               assert (err == REG_ESPACE);
1169               return -2;
1170             }
1171         }
1172
1173       cur_state = transit_state (&err, mctx, cur_state);
1174       if (mctx->state_log != NULL)
1175         cur_state = merge_state_with_log (&err, mctx, cur_state);
1176
1177       if (cur_state == NULL)
1178         {
1179           /* Reached the invalid state or an error.  Try to recover a valid
1180              state using the state log, if available and if we have not
1181              already found a valid (even if not the longest) match.  */
1182           if (BE (err != REG_NOERROR, 0))
1183             return -2;
1184
1185           if (mctx->state_log == NULL
1186               || (match && !fl_longest_match)
1187               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1188             break;
1189         }
1190
1191       if (BE (at_init_state, 0))
1192         {
1193           if (old_state == cur_state)
1194             next_start_idx = next_char_idx;
1195           else
1196             at_init_state = 0;
1197         }
1198
1199       if (cur_state->halt)
1200         {
1201           /* Reached a halt state.
1202              Check the halt state can satisfy the current context.  */
1203           if (!cur_state->has_constraint
1204               || check_halt_state_context (mctx, cur_state,
1205                                            re_string_cur_idx (&mctx->input)))
1206             {
1207               /* We found an appropriate halt state.  */
1208               match_last = re_string_cur_idx (&mctx->input);
1209               match = 1;
1210
1211               /* We found a match, do not modify match_first below.  */
1212               p_match_first = NULL;
1213               if (!fl_longest_match)
1214                 break;
1215             }
1216         }
1217     }
1218
1219   if (p_match_first)
1220     *p_match_first += next_start_idx;
1221
1222   return match_last;
1223 }
1224
1225 /* Check NODE match the current context.  */
1226
1227 static int
1228 internal_function
1229 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1230 {
1231   re_token_type_t type = dfa->nodes[node].type;
1232   unsigned int constraint = dfa->nodes[node].constraint;
1233   if (type != END_OF_RE)
1234     return 0;
1235   if (!constraint)
1236     return 1;
1237   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1238     return 0;
1239   return 1;
1240 }
1241
1242 /* Check the halt state STATE match the current context.
1243    Return 0 if not match, if the node, STATE has, is a halt node and
1244    match the context, return the node.  */
1245
1246 static int
1247 internal_function
1248 check_halt_state_context (const re_match_context_t *mctx,
1249                           const re_dfastate_t *state, int idx)
1250 {
1251   int i;
1252   unsigned int context;
1253 #ifdef DEBUG
1254   assert (state->halt);
1255 #endif
1256   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1257   for (i = 0; i < state->nodes.nelem; ++i)
1258     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1259       return state->nodes.elems[i];
1260   return 0;
1261 }
1262
1263 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1264    corresponding to the DFA).
1265    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1266    of errors.  */
1267
1268 static int
1269 internal_function
1270 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1271                    int *pidx, int node, re_node_set *eps_via_nodes,
1272                    struct re_fail_stack_t *fs)
1273 {
1274   const re_dfa_t *const dfa = mctx->dfa;
1275   int i, err;
1276   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1277     {
1278       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1279       re_node_set *edests = &dfa->edests[node];
1280       int dest_node;
1281       err = re_node_set_insert (eps_via_nodes, node);
1282       if (BE (err < 0, 0))
1283         return -2;
1284       /* Pick up a valid destination, or return -1 if none is found.  */
1285       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1286         {
1287           int candidate = edests->elems[i];
1288           if (!re_node_set_contains (cur_nodes, candidate))
1289             continue;
1290           if (dest_node == -1)
1291             dest_node = candidate;
1292
1293           else
1294             {
1295               /* In order to avoid infinite loop like "(a*)*", return the second
1296                  epsilon-transition if the first was already considered.  */
1297               if (re_node_set_contains (eps_via_nodes, dest_node))
1298                 return candidate;
1299
1300               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1301               else if (fs != NULL
1302                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1303                                            eps_via_nodes))
1304                 return -2;
1305
1306               /* We know we are going to exit.  */
1307               break;
1308             }
1309         }
1310       return dest_node;
1311     }
1312   else
1313     {
1314       int naccepted = 0;
1315       re_token_type_t type = dfa->nodes[node].type;
1316
1317 #ifdef RE_ENABLE_I18N
1318       if (dfa->nodes[node].accept_mb)
1319         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1320       else
1321 #endif /* RE_ENABLE_I18N */
1322       if (type == OP_BACK_REF)
1323         {
1324           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1325           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1326           if (fs != NULL)
1327             {
1328               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1329                 return -1;
1330               else if (naccepted)
1331                 {
1332                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1333                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1334                               naccepted) != 0)
1335                     return -1;
1336                 }
1337             }
1338
1339           if (naccepted == 0)
1340             {
1341               int dest_node;
1342               err = re_node_set_insert (eps_via_nodes, node);
1343               if (BE (err < 0, 0))
1344                 return -2;
1345               dest_node = dfa->edests[node].elems[0];
1346               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1347                                         dest_node))
1348                 return dest_node;
1349             }
1350         }
1351
1352       if (naccepted != 0
1353           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1354         {
1355           int dest_node = dfa->nexts[node];
1356           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1357           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1358                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1359                                                dest_node)))
1360             return -1;
1361           re_node_set_empty (eps_via_nodes);
1362           return dest_node;
1363         }
1364     }
1365   return -1;
1366 }
1367
1368 static reg_errcode_t
1369 internal_function __attribute_warn_unused_result__
1370 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1371                  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1372 {
1373   reg_errcode_t err;
1374   int num = fs->num++;
1375   if (fs->num == fs->alloc)
1376     {
1377       struct re_fail_stack_ent_t *new_array;
1378       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1379                                        * fs->alloc * 2));
1380       if (new_array == NULL)
1381         return REG_ESPACE;
1382       fs->alloc *= 2;
1383       fs->stack = new_array;
1384     }
1385   fs->stack[num].idx = str_idx;
1386   fs->stack[num].node = dest_node;
1387   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1388   if (fs->stack[num].regs == NULL)
1389     return REG_ESPACE;
1390   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1391   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1392   return err;
1393 }
1394
1395 static int
1396 internal_function
1397 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1398                 regmatch_t *regs, re_node_set *eps_via_nodes)
1399 {
1400   int num = --fs->num;
1401   assert (num >= 0);
1402   *pidx = fs->stack[num].idx;
1403   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1404   re_node_set_free (eps_via_nodes);
1405   re_free (fs->stack[num].regs);
1406   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1407   return fs->stack[num].node;
1408 }
1409
1410 /* Set the positions where the subexpressions are starts/ends to registers
1411    PMATCH.
1412    Note: We assume that pmatch[0] is already set, and
1413    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1414
1415 static reg_errcode_t
1416 internal_function __attribute_warn_unused_result__
1417 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1418           regmatch_t *pmatch, int fl_backtrack)
1419 {
1420   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1421   int idx, cur_node;
1422   re_node_set eps_via_nodes;
1423   struct re_fail_stack_t *fs;
1424   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1425   regmatch_t *prev_idx_match;
1426   int prev_idx_match_malloced = 0;
1427
1428 #ifdef DEBUG
1429   assert (nmatch > 1);
1430   assert (mctx->state_log != NULL);
1431 #endif
1432   if (fl_backtrack)
1433     {
1434       fs = &fs_body;
1435       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1436       if (fs->stack == NULL)
1437         return REG_ESPACE;
1438     }
1439   else
1440     fs = NULL;
1441
1442   cur_node = dfa->init_node;
1443   re_node_set_init_empty (&eps_via_nodes);
1444
1445   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1446     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1447   else
1448     {
1449       prev_idx_match = re_malloc (regmatch_t, nmatch);
1450       if (prev_idx_match == NULL)
1451         {
1452           free_fail_stack_return (fs);
1453           return REG_ESPACE;
1454         }
1455       prev_idx_match_malloced = 1;
1456     }
1457   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1458
1459   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1460     {
1461       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1462
1463       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1464         {
1465           int reg_idx;
1466           if (fs)
1467             {
1468               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1469                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1470                   break;
1471               if (reg_idx == nmatch)
1472                 {
1473                   re_node_set_free (&eps_via_nodes);
1474                   if (prev_idx_match_malloced)
1475                     re_free (prev_idx_match);
1476                   return free_fail_stack_return (fs);
1477                 }
1478               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1479                                          &eps_via_nodes);
1480             }
1481           else
1482             {
1483               re_node_set_free (&eps_via_nodes);
1484               if (prev_idx_match_malloced)
1485                 re_free (prev_idx_match);
1486               return REG_NOERROR;
1487             }
1488         }
1489
1490       /* Proceed to next node.  */
1491       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1492                                     &eps_via_nodes, fs);
1493
1494       if (BE (cur_node < 0, 0))
1495         {
1496           if (BE (cur_node == -2, 0))
1497             {
1498               re_node_set_free (&eps_via_nodes);
1499               if (prev_idx_match_malloced)
1500                 re_free (prev_idx_match);
1501               free_fail_stack_return (fs);
1502               return REG_ESPACE;
1503             }
1504           if (fs)
1505             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1506                                        &eps_via_nodes);
1507           else
1508             {
1509               re_node_set_free (&eps_via_nodes);
1510               if (prev_idx_match_malloced)
1511                 re_free (prev_idx_match);
1512               return REG_NOMATCH;
1513             }
1514         }
1515     }
1516   re_node_set_free (&eps_via_nodes);
1517   if (prev_idx_match_malloced)
1518     re_free (prev_idx_match);
1519   return free_fail_stack_return (fs);
1520 }
1521
1522 static reg_errcode_t
1523 internal_function
1524 free_fail_stack_return (struct re_fail_stack_t *fs)
1525 {
1526   if (fs)
1527     {
1528       int fs_idx;
1529       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1530         {
1531           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1532           re_free (fs->stack[fs_idx].regs);
1533         }
1534       re_free (fs->stack);
1535     }
1536   return REG_NOERROR;
1537 }
1538
1539 static void
1540 internal_function
1541 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1542              regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1543 {
1544   int type = dfa->nodes[cur_node].type;
1545   if (type == OP_OPEN_SUBEXP)
1546     {
1547       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1548
1549       /* We are at the first node of this sub expression.  */
1550       if (reg_num < nmatch)
1551         {
1552           pmatch[reg_num].rm_so = cur_idx;
1553           pmatch[reg_num].rm_eo = -1;
1554         }
1555     }
1556   else if (type == OP_CLOSE_SUBEXP)
1557     {
1558       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1559       if (reg_num < nmatch)
1560         {
1561           /* We are at the last node of this sub expression.  */
1562           if (pmatch[reg_num].rm_so < cur_idx)
1563             {
1564               pmatch[reg_num].rm_eo = cur_idx;
1565               /* This is a non-empty match or we are not inside an optional
1566                  subexpression.  Accept this right away.  */
1567               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1568             }
1569           else
1570             {
1571               if (dfa->nodes[cur_node].opt_subexp
1572                   && prev_idx_match[reg_num].rm_so != -1)
1573                 /* We transited through an empty match for an optional
1574                    subexpression, like (a?)*, and this is not the subexp's
1575                    first match.  Copy back the old content of the registers
1576                    so that matches of an inner subexpression are undone as
1577                    well, like in ((a?))*.  */
1578                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1579               else
1580                 /* We completed a subexpression, but it may be part of
1581                    an optional one, so do not update PREV_IDX_MATCH.  */
1582                 pmatch[reg_num].rm_eo = cur_idx;
1583             }
1584         }
1585     }
1586 }
1587
1588 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1589    and sift the nodes in each states according to the following rules.
1590    Updated state_log will be wrote to STATE_LOG.
1591
1592    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1593      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1594         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1595         the LAST_NODE, we throw away the node `a'.
1596      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1597         string `s' and transit to `b':
1598         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1599            away the node `a'.
1600         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1601             thrown away, we throw away the node `a'.
1602      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1603         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1604            node `a'.
1605         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1606             we throw away the node `a'.  */
1607
1608 #define STATE_NODE_CONTAINS(state,node) \
1609   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1610
1611 static reg_errcode_t
1612 internal_function
1613 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1614 {
1615   reg_errcode_t err;
1616   int null_cnt = 0;
1617   int str_idx = sctx->last_str_idx;
1618   re_node_set cur_dest;
1619
1620 #ifdef DEBUG
1621   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1622 #endif
1623
1624   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1625      transit to the last_node and the last_node itself.  */
1626   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1627   if (BE (err != REG_NOERROR, 0))
1628     return err;
1629   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1630   if (BE (err != REG_NOERROR, 0))
1631     goto free_return;
1632
1633   /* Then check each states in the state_log.  */
1634   while (str_idx > 0)
1635     {
1636       /* Update counters.  */
1637       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1638       if (null_cnt > mctx->max_mb_elem_len)
1639         {
1640           memset (sctx->sifted_states, '\0',
1641                   sizeof (re_dfastate_t *) * str_idx);
1642           re_node_set_free (&cur_dest);
1643           return REG_NOERROR;
1644         }
1645       re_node_set_empty (&cur_dest);
1646       --str_idx;
1647
1648       if (mctx->state_log[str_idx])
1649         {
1650           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1651           if (BE (err != REG_NOERROR, 0))
1652             goto free_return;
1653         }
1654
1655       /* Add all the nodes which satisfy the following conditions:
1656          - It can epsilon transit to a node in CUR_DEST.
1657          - It is in CUR_SRC.
1658          And update state_log.  */
1659       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1660       if (BE (err != REG_NOERROR, 0))
1661         goto free_return;
1662     }
1663   err = REG_NOERROR;
1664  free_return:
1665   re_node_set_free (&cur_dest);
1666   return err;
1667 }
1668
1669 static reg_errcode_t
1670 internal_function __attribute_warn_unused_result__
1671 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1672                      int str_idx, re_node_set *cur_dest)
1673 {
1674   const re_dfa_t *const dfa = mctx->dfa;
1675   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1676   int i;
1677
1678   /* Then build the next sifted state.
1679      We build the next sifted state on `cur_dest', and update
1680      `sifted_states[str_idx]' with `cur_dest'.
1681      Note:
1682      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1683      `cur_src' points the node_set of the old `state_log[str_idx]'
1684      (with the epsilon nodes pre-filtered out).  */
1685   for (i = 0; i < cur_src->nelem; i++)
1686     {
1687       int prev_node = cur_src->elems[i];
1688       int naccepted = 0;
1689       int ret;
1690
1691 #ifdef DEBUG
1692       re_token_type_t type = dfa->nodes[prev_node].type;
1693       assert (!IS_EPSILON_NODE (type));
1694 #endif
1695 #ifdef RE_ENABLE_I18N
1696       /* If the node may accept `multi byte'.  */
1697       if (dfa->nodes[prev_node].accept_mb)
1698         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1699                                          str_idx, sctx->last_str_idx);
1700 #endif /* RE_ENABLE_I18N */
1701
1702       /* We don't check backreferences here.
1703          See update_cur_sifted_state().  */
1704       if (!naccepted
1705           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1706           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1707                                   dfa->nexts[prev_node]))
1708         naccepted = 1;
1709
1710       if (naccepted == 0)
1711         continue;
1712
1713       if (sctx->limits.nelem)
1714         {
1715           int to_idx = str_idx + naccepted;
1716           if (check_dst_limits (mctx, &sctx->limits,
1717                                 dfa->nexts[prev_node], to_idx,
1718                                 prev_node, str_idx))
1719             continue;
1720         }
1721       ret = re_node_set_insert (cur_dest, prev_node);
1722       if (BE (ret == -1, 0))
1723         return REG_ESPACE;
1724     }
1725
1726   return REG_NOERROR;
1727 }
1728
1729 /* Helper functions.  */
1730
1731 static reg_errcode_t
1732 internal_function
1733 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1734 {
1735   int top = mctx->state_log_top;
1736
1737   if ((next_state_log_idx >= mctx->input.bufs_len
1738        && mctx->input.bufs_len < mctx->input.len)
1739       || (next_state_log_idx >= mctx->input.valid_len
1740           && mctx->input.valid_len < mctx->input.len))
1741     {
1742       reg_errcode_t err;
1743       err = extend_buffers (mctx, next_state_log_idx + 1);
1744       if (BE (err != REG_NOERROR, 0))
1745         return err;
1746     }
1747
1748   if (top < next_state_log_idx)
1749     {
1750       memset (mctx->state_log + top + 1, '\0',
1751               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1752       mctx->state_log_top = next_state_log_idx;
1753     }
1754   return REG_NOERROR;
1755 }
1756
1757 static reg_errcode_t
1758 internal_function
1759 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1760                    re_dfastate_t **src, int num)
1761 {
1762   int st_idx;
1763   reg_errcode_t err;
1764   for (st_idx = 0; st_idx < num; ++st_idx)
1765     {
1766       if (dst[st_idx] == NULL)
1767         dst[st_idx] = src[st_idx];
1768       else if (src[st_idx] != NULL)
1769         {
1770           re_node_set merged_set;
1771           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1772                                         &src[st_idx]->nodes);
1773           if (BE (err != REG_NOERROR, 0))
1774             return err;
1775           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1776           re_node_set_free (&merged_set);
1777           if (BE (err != REG_NOERROR, 0))
1778             return err;
1779         }
1780     }
1781   return REG_NOERROR;
1782 }
1783
1784 static reg_errcode_t
1785 internal_function
1786 update_cur_sifted_state (const re_match_context_t *mctx,
1787                          re_sift_context_t *sctx, int str_idx,
1788                          re_node_set *dest_nodes)
1789 {
1790   const re_dfa_t *const dfa = mctx->dfa;
1791   reg_errcode_t err = REG_NOERROR;
1792   const re_node_set *candidates;
1793   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1794                 : &mctx->state_log[str_idx]->nodes);
1795
1796   if (dest_nodes->nelem == 0)
1797     sctx->sifted_states[str_idx] = NULL;
1798   else
1799     {
1800       if (candidates)
1801         {
1802           /* At first, add the nodes which can epsilon transit to a node in
1803              DEST_NODE.  */
1804           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1805           if (BE (err != REG_NOERROR, 0))
1806             return err;
1807
1808           /* Then, check the limitations in the current sift_context.  */
1809           if (sctx->limits.nelem)
1810             {
1811               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1812                                          mctx->bkref_ents, str_idx);
1813               if (BE (err != REG_NOERROR, 0))
1814                 return err;
1815             }
1816         }
1817
1818       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1819       if (BE (err != REG_NOERROR, 0))
1820         return err;
1821     }
1822
1823   if (candidates && mctx->state_log[str_idx]->has_backref)
1824     {
1825       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1826       if (BE (err != REG_NOERROR, 0))
1827         return err;
1828     }
1829   return REG_NOERROR;
1830 }
1831
1832 static reg_errcode_t
1833 internal_function __attribute_warn_unused_result__
1834 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1835                        const re_node_set *candidates)
1836 {
1837   reg_errcode_t err = REG_NOERROR;
1838   int i;
1839
1840   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1841   if (BE (err != REG_NOERROR, 0))
1842     return err;
1843
1844   if (!state->inveclosure.alloc)
1845     {
1846       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1847       if (BE (err != REG_NOERROR, 0))
1848         return REG_ESPACE;
1849       for (i = 0; i < dest_nodes->nelem; i++)
1850         {
1851           err = re_node_set_merge (&state->inveclosure,
1852                                    dfa->inveclosures + dest_nodes->elems[i]);
1853           if (BE (err != REG_NOERROR, 0))
1854             return REG_ESPACE;
1855         }
1856     }
1857   return re_node_set_add_intersect (dest_nodes, candidates,
1858                                     &state->inveclosure);
1859 }
1860
1861 static reg_errcode_t
1862 internal_function
1863 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1864                        const re_node_set *candidates)
1865 {
1866     int ecl_idx;
1867     reg_errcode_t err;
1868     re_node_set *inv_eclosure = dfa->inveclosures + node;
1869     re_node_set except_nodes;
1870     re_node_set_init_empty (&except_nodes);
1871     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1872       {
1873         int cur_node = inv_eclosure->elems[ecl_idx];
1874         if (cur_node == node)
1875           continue;
1876         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1877           {
1878             int edst1 = dfa->edests[cur_node].elems[0];
1879             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1880                          ? dfa->edests[cur_node].elems[1] : -1);
1881             if ((!re_node_set_contains (inv_eclosure, edst1)
1882                  && re_node_set_contains (dest_nodes, edst1))
1883                 || (edst2 > 0
1884                     && !re_node_set_contains (inv_eclosure, edst2)
1885                     && re_node_set_contains (dest_nodes, edst2)))
1886               {
1887                 err = re_node_set_add_intersect (&except_nodes, candidates,
1888                                                  dfa->inveclosures + cur_node);
1889                 if (BE (err != REG_NOERROR, 0))
1890                   {
1891                     re_node_set_free (&except_nodes);
1892                     return err;
1893                   }
1894               }
1895           }
1896       }
1897     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1898       {
1899         int cur_node = inv_eclosure->elems[ecl_idx];
1900         if (!re_node_set_contains (&except_nodes, cur_node))
1901           {
1902             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1903             re_node_set_remove_at (dest_nodes, idx);
1904           }
1905       }
1906     re_node_set_free (&except_nodes);
1907     return REG_NOERROR;
1908 }
1909
1910 static int
1911 internal_function
1912 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1913                   int dst_node, int dst_idx, int src_node, int src_idx)
1914 {
1915   const re_dfa_t *const dfa = mctx->dfa;
1916   int lim_idx, src_pos, dst_pos;
1917
1918   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1919   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1920   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1921     {
1922       int subexp_idx;
1923       struct re_backref_cache_entry *ent;
1924       ent = mctx->bkref_ents + limits->elems[lim_idx];
1925       subexp_idx = dfa->nodes[ent->node].opr.idx;
1926
1927       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1928                                            subexp_idx, dst_node, dst_idx,
1929                                            dst_bkref_idx);
1930       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1931                                            subexp_idx, src_node, src_idx,
1932                                            src_bkref_idx);
1933
1934       /* In case of:
1935          <src> <dst> ( <subexp> )
1936          ( <subexp> ) <src> <dst>
1937          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1938       if (src_pos == dst_pos)
1939         continue; /* This is unrelated limitation.  */
1940       else
1941         return 1;
1942     }
1943   return 0;
1944 }
1945
1946 static int
1947 internal_function
1948 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1949                              int subexp_idx, int from_node, int bkref_idx)
1950 {
1951   const re_dfa_t *const dfa = mctx->dfa;
1952   const re_node_set *eclosures = dfa->eclosures + from_node;
1953   int node_idx;
1954
1955   /* Else, we are on the boundary: examine the nodes on the epsilon
1956      closure.  */
1957   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1958     {
1959       int node = eclosures->elems[node_idx];
1960       switch (dfa->nodes[node].type)
1961         {
1962         case OP_BACK_REF:
1963           if (bkref_idx != -1)
1964             {
1965               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1966               do
1967                 {
1968                   int dst, cpos;
1969
1970                   if (ent->node != node)
1971                     continue;
1972
1973                   if (subexp_idx < BITSET_WORD_BITS
1974                       && !(ent->eps_reachable_subexps_map
1975                            & ((bitset_word_t) 1 << subexp_idx)))
1976                     continue;
1977
1978                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1979                      OP_CLOSE_SUBEXP cases below.  But, if the
1980                      destination node is the same node as the source
1981                      node, don't recurse because it would cause an
1982                      infinite loop: a regex that exhibits this behavior
1983                      is ()\1*\1*  */
1984                   dst = dfa->edests[node].elems[0];
1985                   if (dst == from_node)
1986                     {
1987                       if (boundaries & 1)
1988                         return -1;
1989                       else /* if (boundaries & 2) */
1990                         return 0;
1991                     }
1992
1993                   cpos =
1994                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1995                                                  dst, bkref_idx);
1996                   if (cpos == -1 /* && (boundaries & 1) */)
1997                     return -1;
1998                   if (cpos == 0 && (boundaries & 2))
1999                     return 0;
2000
2001                   if (subexp_idx < BITSET_WORD_BITS)
2002                     ent->eps_reachable_subexps_map
2003                       &= ~((bitset_word_t) 1 << subexp_idx);
2004                 }
2005               while (ent++->more);
2006             }
2007           break;
2008
2009         case OP_OPEN_SUBEXP:
2010           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2011             return -1;
2012           break;
2013
2014         case OP_CLOSE_SUBEXP:
2015           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2016             return 0;
2017           break;
2018
2019         default:
2020             break;
2021         }
2022     }
2023
2024   return (boundaries & 2) ? 1 : 0;
2025 }
2026
2027 static int
2028 internal_function
2029 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2030                            int subexp_idx, int from_node, int str_idx,
2031                            int bkref_idx)
2032 {
2033   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2034   int boundaries;
2035
2036   /* If we are outside the range of the subexpression, return -1 or 1.  */
2037   if (str_idx < lim->subexp_from)
2038     return -1;
2039
2040   if (lim->subexp_to < str_idx)
2041     return 1;
2042
2043   /* If we are within the subexpression, return 0.  */
2044   boundaries = (str_idx == lim->subexp_from);
2045   boundaries |= (str_idx == lim->subexp_to) << 1;
2046   if (boundaries == 0)
2047     return 0;
2048
2049   /* Else, examine epsilon closure.  */
2050   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2051                                       from_node, bkref_idx);
2052 }
2053
2054 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2055    which are against limitations from DEST_NODES. */
2056
2057 static reg_errcode_t
2058 internal_function
2059 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2060                      const re_node_set *candidates, re_node_set *limits,
2061                      struct re_backref_cache_entry *bkref_ents, int str_idx)
2062 {
2063   reg_errcode_t err;
2064   int node_idx, lim_idx;
2065
2066   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2067     {
2068       int subexp_idx;
2069       struct re_backref_cache_entry *ent;
2070       ent = bkref_ents + limits->elems[lim_idx];
2071
2072       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2073         continue; /* This is unrelated limitation.  */
2074
2075       subexp_idx = dfa->nodes[ent->node].opr.idx;
2076       if (ent->subexp_to == str_idx)
2077         {
2078           int ops_node = -1;
2079           int cls_node = -1;
2080           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2081             {
2082               int node = dest_nodes->elems[node_idx];
2083               re_token_type_t type = dfa->nodes[node].type;
2084               if (type == OP_OPEN_SUBEXP
2085                   && subexp_idx == dfa->nodes[node].opr.idx)
2086                 ops_node = node;
2087               else if (type == OP_CLOSE_SUBEXP
2088                        && subexp_idx == dfa->nodes[node].opr.idx)
2089                 cls_node = node;
2090             }
2091
2092           /* Check the limitation of the open subexpression.  */
2093           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2094           if (ops_node >= 0)
2095             {
2096               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2097                                            candidates);
2098               if (BE (err != REG_NOERROR, 0))
2099                 return err;
2100             }
2101
2102           /* Check the limitation of the close subexpression.  */
2103           if (cls_node >= 0)
2104             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2105               {
2106                 int node = dest_nodes->elems[node_idx];
2107                 if (!re_node_set_contains (dfa->inveclosures + node,
2108                                            cls_node)
2109                     && !re_node_set_contains (dfa->eclosures + node,
2110                                               cls_node))
2111                   {
2112                     /* It is against this limitation.
2113                        Remove it form the current sifted state.  */
2114                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2115                                                  candidates);
2116                     if (BE (err != REG_NOERROR, 0))
2117                       return err;
2118                     --node_idx;
2119                   }
2120               }
2121         }
2122       else /* (ent->subexp_to != str_idx)  */
2123         {
2124           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2125             {
2126               int node = dest_nodes->elems[node_idx];
2127               re_token_type_t type = dfa->nodes[node].type;
2128               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2129                 {
2130                   if (subexp_idx != dfa->nodes[node].opr.idx)
2131                     continue;
2132                   /* It is against this limitation.
2133                      Remove it form the current sifted state.  */
2134                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2135                                                candidates);
2136                   if (BE (err != REG_NOERROR, 0))
2137                     return err;
2138                 }
2139             }
2140         }
2141     }
2142   return REG_NOERROR;
2143 }
2144
2145 static reg_errcode_t
2146 internal_function __attribute_warn_unused_result__
2147 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2148                    int str_idx, const re_node_set *candidates)
2149 {
2150   const re_dfa_t *const dfa = mctx->dfa;
2151   reg_errcode_t err;
2152   int node_idx, node;
2153   re_sift_context_t local_sctx;
2154   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2155
2156   if (first_idx == -1)
2157     return REG_NOERROR;
2158
2159   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2160
2161   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2162     {
2163       int enabled_idx;
2164       re_token_type_t type;
2165       struct re_backref_cache_entry *entry;
2166       node = candidates->elems[node_idx];
2167       type = dfa->nodes[node].type;
2168       /* Avoid infinite loop for the REs like "()\1+".  */
2169       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2170         continue;
2171       if (type != OP_BACK_REF)
2172         continue;
2173
2174       entry = mctx->bkref_ents + first_idx;
2175       enabled_idx = first_idx;
2176       do
2177         {
2178           int subexp_len;
2179           int to_idx;
2180           int dst_node;
2181           int ret;
2182           re_dfastate_t *cur_state;
2183
2184           if (entry->node != node)
2185             continue;
2186           subexp_len = entry->subexp_to - entry->subexp_from;
2187           to_idx = str_idx + subexp_len;
2188           dst_node = (subexp_len ? dfa->nexts[node]
2189                       : dfa->edests[node].elems[0]);
2190
2191           if (to_idx > sctx->last_str_idx
2192               || sctx->sifted_states[to_idx] == NULL
2193               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2194               || check_dst_limits (mctx, &sctx->limits, node,
2195                                    str_idx, dst_node, to_idx))
2196             continue;
2197
2198           if (local_sctx.sifted_states == NULL)
2199             {
2200               local_sctx = *sctx;
2201               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2202               if (BE (err != REG_NOERROR, 0))
2203                 goto free_return;
2204             }
2205           local_sctx.last_node = node;
2206           local_sctx.last_str_idx = str_idx;
2207           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2208           if (BE (ret < 0, 0))
2209             {
2210               err = REG_ESPACE;
2211               goto free_return;
2212             }
2213           cur_state = local_sctx.sifted_states[str_idx];
2214           err = sift_states_backward (mctx, &local_sctx);
2215           if (BE (err != REG_NOERROR, 0))
2216             goto free_return;
2217           if (sctx->limited_states != NULL)
2218             {
2219               err = merge_state_array (dfa, sctx->limited_states,
2220                                        local_sctx.sifted_states,
2221                                        str_idx + 1);
2222               if (BE (err != REG_NOERROR, 0))
2223                 goto free_return;
2224             }
2225           local_sctx.sifted_states[str_idx] = cur_state;
2226           re_node_set_remove (&local_sctx.limits, enabled_idx);
2227
2228           /* mctx->bkref_ents may have changed, reload the pointer.  */
2229           entry = mctx->bkref_ents + enabled_idx;
2230         }
2231       while (enabled_idx++, entry++->more);
2232     }
2233   err = REG_NOERROR;
2234  free_return:
2235   if (local_sctx.sifted_states != NULL)
2236     {
2237       re_node_set_free (&local_sctx.limits);
2238     }
2239
2240   return err;
2241 }
2242
2243
2244 #ifdef RE_ENABLE_I18N
2245 static int
2246 internal_function
2247 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2248                      int node_idx, int str_idx, int max_str_idx)
2249 {
2250   const re_dfa_t *const dfa = mctx->dfa;
2251   int naccepted;
2252   /* Check the node can accept `multi byte'.  */
2253   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2254   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2255       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2256                             dfa->nexts[node_idx]))
2257     /* The node can't accept the `multi byte', or the
2258        destination was already thrown away, then the node
2259        could't accept the current input `multi byte'.   */
2260     naccepted = 0;
2261   /* Otherwise, it is sure that the node could accept
2262      `naccepted' bytes input.  */
2263   return naccepted;
2264 }
2265 #endif /* RE_ENABLE_I18N */
2266
2267 \f
2268 /* Functions for state transition.  */
2269
2270 /* Return the next state to which the current state STATE will transit by
2271    accepting the current input byte, and update STATE_LOG if necessary.
2272    If STATE can accept a multibyte char/collating element/back reference
2273    update the destination of STATE_LOG.  */
2274
2275 static re_dfastate_t *
2276 internal_function __attribute_warn_unused_result__
2277 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2278                re_dfastate_t *state)
2279 {
2280   re_dfastate_t **trtable;
2281   unsigned char ch;
2282
2283 #ifdef RE_ENABLE_I18N
2284   /* If the current state can accept multibyte.  */
2285   if (BE (state->accept_mb, 0))
2286     {
2287       *err = transit_state_mb (mctx, state);
2288       if (BE (*err != REG_NOERROR, 0))
2289         return NULL;
2290     }
2291 #endif /* RE_ENABLE_I18N */
2292
2293   /* Then decide the next state with the single byte.  */
2294 #if 0
2295   if (0)
2296     /* don't use transition table  */
2297     return transit_state_sb (err, mctx, state);
2298 #endif
2299
2300   /* Use transition table  */
2301   ch = re_string_fetch_byte (&mctx->input);
2302   for (;;)
2303     {
2304       trtable = state->trtable;
2305       if (BE (trtable != NULL, 1))
2306         return trtable[ch];
2307
2308       trtable = state->word_trtable;
2309       if (BE (trtable != NULL, 1))
2310         {
2311           unsigned int context;
2312           context
2313             = re_string_context_at (&mctx->input,
2314                                     re_string_cur_idx (&mctx->input) - 1,
2315                                     mctx->eflags);
2316           if (IS_WORD_CONTEXT (context))
2317             return trtable[ch + SBC_MAX];
2318           else
2319             return trtable[ch];
2320         }
2321
2322       if (!build_trtable (mctx->dfa, state))
2323         {
2324           *err = REG_ESPACE;
2325           return NULL;
2326         }
2327
2328       /* Retry, we now have a transition table.  */
2329     }
2330 }
2331
2332 /* Update the state_log if we need */
2333 re_dfastate_t *
2334 internal_function
2335 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2336                       re_dfastate_t *next_state)
2337 {
2338   const re_dfa_t *const dfa = mctx->dfa;
2339   int cur_idx = re_string_cur_idx (&mctx->input);
2340
2341   if (cur_idx > mctx->state_log_top)
2342     {
2343       mctx->state_log[cur_idx] = next_state;
2344       mctx->state_log_top = cur_idx;
2345     }
2346   else if (mctx->state_log[cur_idx] == 0)
2347     {
2348       mctx->state_log[cur_idx] = next_state;
2349     }
2350   else
2351     {
2352       re_dfastate_t *pstate;
2353       unsigned int context;
2354       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2355       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2356          the destination of a multibyte char/collating element/
2357          back reference.  Then the next state is the union set of
2358          these destinations and the results of the transition table.  */
2359       pstate = mctx->state_log[cur_idx];
2360       log_nodes = pstate->entrance_nodes;
2361       if (next_state != NULL)
2362         {
2363           table_nodes = next_state->entrance_nodes;
2364           *err = re_node_set_init_union (&next_nodes, table_nodes,
2365                                              log_nodes);
2366           if (BE (*err != REG_NOERROR, 0))
2367             return NULL;
2368         }
2369       else
2370         next_nodes = *log_nodes;
2371       /* Note: We already add the nodes of the initial state,
2372          then we don't need to add them here.  */
2373
2374       context = re_string_context_at (&mctx->input,
2375                                       re_string_cur_idx (&mctx->input) - 1,
2376                                       mctx->eflags);
2377       next_state = mctx->state_log[cur_idx]
2378         = re_acquire_state_context (err, dfa, &next_nodes, context);
2379       /* We don't need to check errors here, since the return value of
2380          this function is next_state and ERR is already set.  */
2381
2382       if (table_nodes != NULL)
2383         re_node_set_free (&next_nodes);
2384     }
2385
2386   if (BE (dfa->nbackref, 0) && next_state != NULL)
2387     {
2388       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2389          later.  We must check them here, since the back references in the
2390          next state might use them.  */
2391       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2392                                         cur_idx);
2393       if (BE (*err != REG_NOERROR, 0))
2394         return NULL;
2395
2396       /* If the next state has back references.  */
2397       if (next_state->has_backref)
2398         {
2399           *err = transit_state_bkref (mctx, &next_state->nodes);
2400           if (BE (*err != REG_NOERROR, 0))
2401             return NULL;
2402           next_state = mctx->state_log[cur_idx];
2403         }
2404     }
2405
2406   return next_state;
2407 }
2408
2409 /* Skip bytes in the input that correspond to part of a
2410    multi-byte match, then look in the log for a state
2411    from which to restart matching.  */
2412 re_dfastate_t *
2413 internal_function
2414 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2415 {
2416   re_dfastate_t *cur_state;
2417   do
2418     {
2419       int max = mctx->state_log_top;
2420       int cur_str_idx = re_string_cur_idx (&mctx->input);
2421
2422       do
2423         {
2424           if (++cur_str_idx > max)
2425             return NULL;
2426           re_string_skip_bytes (&mctx->input, 1);
2427         }
2428       while (mctx->state_log[cur_str_idx] == NULL);
2429
2430       cur_state = merge_state_with_log (err, mctx, NULL);
2431     }
2432   while (*err == REG_NOERROR && cur_state == NULL);
2433   return cur_state;
2434 }
2435
2436 /* Helper functions for transit_state.  */
2437
2438 /* From the node set CUR_NODES, pick up the nodes whose types are
2439    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2440    expression. And register them to use them later for evaluating the
2441    correspoding back references.  */
2442
2443 static reg_errcode_t
2444 internal_function
2445 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2446                            int str_idx)
2447 {
2448   const re_dfa_t *const dfa = mctx->dfa;
2449   int node_idx;
2450   reg_errcode_t err;
2451
2452   /* TODO: This isn't efficient.
2453            Because there might be more than one nodes whose types are
2454            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2455            nodes.
2456            E.g. RE: (a){2}  */
2457   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2458     {
2459       int node = cur_nodes->elems[node_idx];
2460       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2461           && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2462           && (dfa->used_bkref_map
2463               & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2464         {
2465           err = match_ctx_add_subtop (mctx, node, str_idx);
2466           if (BE (err != REG_NOERROR, 0))
2467             return err;
2468         }
2469     }
2470   return REG_NOERROR;
2471 }
2472
2473 #if 0
2474 /* Return the next state to which the current state STATE will transit by
2475    accepting the current input byte.  */
2476
2477 static re_dfastate_t *
2478 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2479                   re_dfastate_t *state)
2480 {
2481   const re_dfa_t *const dfa = mctx->dfa;
2482   re_node_set next_nodes;
2483   re_dfastate_t *next_state;
2484   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2485   unsigned int context;
2486
2487   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2488   if (BE (*err != REG_NOERROR, 0))
2489     return NULL;
2490   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2491     {
2492       int cur_node = state->nodes.elems[node_cnt];
2493       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2494         {
2495           *err = re_node_set_merge (&next_nodes,
2496                                     dfa->eclosures + dfa->nexts[cur_node]);
2497           if (BE (*err != REG_NOERROR, 0))
2498             {
2499               re_node_set_free (&next_nodes);
2500               return NULL;
2501             }
2502         }
2503     }
2504   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2505   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2506   /* We don't need to check errors here, since the return value of
2507      this function is next_state and ERR is already set.  */
2508
2509   re_node_set_free (&next_nodes);
2510   re_string_skip_bytes (&mctx->input, 1);
2511   return next_state;
2512 }
2513 #endif
2514
2515 #ifdef RE_ENABLE_I18N
2516 static reg_errcode_t
2517 internal_function
2518 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2519 {
2520   const re_dfa_t *const dfa = mctx->dfa;
2521   reg_errcode_t err;
2522   int i;
2523
2524   for (i = 0; i < pstate->nodes.nelem; ++i)
2525     {
2526       re_node_set dest_nodes, *new_nodes;
2527       int cur_node_idx = pstate->nodes.elems[i];
2528       int naccepted, dest_idx;
2529       unsigned int context;
2530       re_dfastate_t *dest_state;
2531
2532       if (!dfa->nodes[cur_node_idx].accept_mb)
2533         continue;
2534
2535       if (dfa->nodes[cur_node_idx].constraint)
2536         {
2537           context = re_string_context_at (&mctx->input,
2538                                           re_string_cur_idx (&mctx->input),
2539                                           mctx->eflags);
2540           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2541                                            context))
2542             continue;
2543         }
2544
2545       /* How many bytes the node can accept?  */
2546       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2547                                            re_string_cur_idx (&mctx->input));
2548       if (naccepted == 0)
2549         continue;
2550
2551       /* The node can accepts `naccepted' bytes.  */
2552       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2553       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2554                                : mctx->max_mb_elem_len);
2555       err = clean_state_log_if_needed (mctx, dest_idx);
2556       if (BE (err != REG_NOERROR, 0))
2557         return err;
2558 #ifdef DEBUG
2559       assert (dfa->nexts[cur_node_idx] != -1);
2560 #endif
2561       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2562
2563       dest_state = mctx->state_log[dest_idx];
2564       if (dest_state == NULL)
2565         dest_nodes = *new_nodes;
2566       else
2567         {
2568           err = re_node_set_init_union (&dest_nodes,
2569                                         dest_state->entrance_nodes, new_nodes);
2570           if (BE (err != REG_NOERROR, 0))
2571             return err;
2572         }
2573       context = re_string_context_at (&mctx->input, dest_idx - 1,
2574                                       mctx->eflags);
2575       mctx->state_log[dest_idx]
2576         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2577       if (dest_state != NULL)
2578         re_node_set_free (&dest_nodes);
2579       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2580         return err;
2581     }
2582   return REG_NOERROR;
2583 }
2584 #endif /* RE_ENABLE_I18N */
2585
2586 static reg_errcode_t
2587 internal_function
2588 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2589 {
2590   const re_dfa_t *const dfa = mctx->dfa;
2591   reg_errcode_t err;
2592   int i;
2593   int cur_str_idx = re_string_cur_idx (&mctx->input);
2594
2595   for (i = 0; i < nodes->nelem; ++i)
2596     {
2597       int dest_str_idx, prev_nelem, bkc_idx;
2598       int node_idx = nodes->elems[i];
2599       unsigned int context;
2600       const re_token_t *node = dfa->nodes + node_idx;
2601       re_node_set *new_dest_nodes;
2602
2603       /* Check whether `node' is a backreference or not.  */
2604       if (node->type != OP_BACK_REF)
2605         continue;
2606
2607       if (node->constraint)
2608         {
2609           context = re_string_context_at (&mctx->input, cur_str_idx,
2610                                           mctx->eflags);
2611           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2612             continue;
2613         }
2614
2615       /* `node' is a backreference.
2616          Check the substring which the substring matched.  */
2617       bkc_idx = mctx->nbkref_ents;
2618       err = get_subexp (mctx, node_idx, cur_str_idx);
2619       if (BE (err != REG_NOERROR, 0))
2620         goto free_return;
2621
2622       /* And add the epsilon closures (which is `new_dest_nodes') of
2623          the backreference to appropriate state_log.  */
2624 #ifdef DEBUG
2625       assert (dfa->nexts[node_idx] != -1);
2626 #endif
2627       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2628         {
2629           int subexp_len;
2630           re_dfastate_t *dest_state;
2631           struct re_backref_cache_entry *bkref_ent;
2632           bkref_ent = mctx->bkref_ents + bkc_idx;
2633           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2634             continue;
2635           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2636           new_dest_nodes = (subexp_len == 0
2637                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2638                             : dfa->eclosures + dfa->nexts[node_idx]);
2639           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2640                           - bkref_ent->subexp_from);
2641           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2642                                           mctx->eflags);
2643           dest_state = mctx->state_log[dest_str_idx];
2644           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2645                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2646           /* Add `new_dest_node' to state_log.  */
2647           if (dest_state == NULL)
2648             {
2649               mctx->state_log[dest_str_idx]
2650                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2651                                             context);
2652               if (BE (mctx->state_log[dest_str_idx] == NULL
2653                       && err != REG_NOERROR, 0))
2654                 goto free_return;
2655             }
2656           else
2657             {
2658               re_node_set dest_nodes;
2659               err = re_node_set_init_union (&dest_nodes,
2660                                             dest_state->entrance_nodes,
2661                                             new_dest_nodes);
2662               if (BE (err != REG_NOERROR, 0))
2663                 {
2664                   re_node_set_free (&dest_nodes);
2665                   goto free_return;
2666                 }
2667               mctx->state_log[dest_str_idx]
2668                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2669               re_node_set_free (&dest_nodes);
2670               if (BE (mctx->state_log[dest_str_idx] == NULL
2671                       && err != REG_NOERROR, 0))
2672                 goto free_return;
2673             }
2674           /* We need to check recursively if the backreference can epsilon
2675              transit.  */
2676           if (subexp_len == 0
2677               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2678             {
2679               err = check_subexp_matching_top (mctx, new_dest_nodes,
2680                                                cur_str_idx);
2681               if (BE (err != REG_NOERROR, 0))
2682                 goto free_return;
2683               err = transit_state_bkref (mctx, new_dest_nodes);
2684               if (BE (err != REG_NOERROR, 0))
2685                 goto free_return;
2686             }
2687         }
2688     }
2689   err = REG_NOERROR;
2690  free_return:
2691   return err;
2692 }
2693
2694 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2695    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2696    Note that we might collect inappropriate candidates here.
2697    However, the cost of checking them strictly here is too high, then we
2698    delay these checking for prune_impossible_nodes().  */
2699
2700 static reg_errcode_t
2701 internal_function __attribute_warn_unused_result__
2702 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2703 {
2704   const re_dfa_t *const dfa = mctx->dfa;
2705   int subexp_num, sub_top_idx;
2706   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2707   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2708   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2709   if (cache_idx != -1)
2710     {
2711       const struct re_backref_cache_entry *entry
2712         = mctx->bkref_ents + cache_idx;
2713       do
2714         if (entry->node == bkref_node)
2715           return REG_NOERROR; /* We already checked it.  */
2716       while (entry++->more);
2717     }
2718
2719   subexp_num = dfa->nodes[bkref_node].opr.idx;
2720
2721   /* For each sub expression  */
2722   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2723     {
2724       reg_errcode_t err;
2725       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2726       re_sub_match_last_t *sub_last;
2727       int sub_last_idx, sl_str, bkref_str_off;
2728
2729       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2730         continue; /* It isn't related.  */
2731
2732       sl_str = sub_top->str_idx;
2733       bkref_str_off = bkref_str_idx;
2734       /* At first, check the last node of sub expressions we already
2735          evaluated.  */
2736       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2737         {
2738           int sl_str_diff;
2739           sub_last = sub_top->lasts[sub_last_idx];
2740           sl_str_diff = sub_last->str_idx - sl_str;
2741           /* The matched string by the sub expression match with the substring
2742              at the back reference?  */
2743           if (sl_str_diff > 0)
2744             {
2745               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2746                 {
2747                   /* Not enough chars for a successful match.  */
2748                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2749                     break;
2750
2751                   err = clean_state_log_if_needed (mctx,
2752                                                    bkref_str_off
2753                                                    + sl_str_diff);
2754                   if (BE (err != REG_NOERROR, 0))
2755                     return err;
2756                   buf = (const char *) re_string_get_buffer (&mctx->input);
2757                 }
2758               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2759                 /* We don't need to search this sub expression any more.  */
2760                 break;
2761             }
2762           bkref_str_off += sl_str_diff;
2763           sl_str += sl_str_diff;
2764           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2765                                 bkref_str_idx);
2766
2767           /* Reload buf, since the preceding call might have reallocated
2768              the buffer.  */
2769           buf = (const char *) re_string_get_buffer (&mctx->input);
2770
2771           if (err == REG_NOMATCH)
2772             continue;
2773           if (BE (err != REG_NOERROR, 0))
2774             return err;
2775         }
2776
2777       if (sub_last_idx < sub_top->nlasts)
2778         continue;
2779       if (sub_last_idx > 0)
2780         ++sl_str;
2781       /* Then, search for the other last nodes of the sub expression.  */
2782       for (; sl_str <= bkref_str_idx; ++sl_str)
2783         {
2784           int cls_node, sl_str_off;
2785           const re_node_set *nodes;
2786           sl_str_off = sl_str - sub_top->str_idx;
2787           /* The matched string by the sub expression match with the substring
2788              at the back reference?  */
2789           if (sl_str_off > 0)
2790             {
2791               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2792                 {
2793                   /* If we are at the end of the input, we cannot match.  */
2794                   if (bkref_str_off >= mctx->input.len)
2795                     break;
2796
2797                   err = extend_buffers (mctx, bkref_str_off + 1);
2798                   if (BE (err != REG_NOERROR, 0))
2799                     return err;
2800
2801                   buf = (const char *) re_string_get_buffer (&mctx->input);
2802                 }
2803               if (buf [bkref_str_off++] != buf[sl_str - 1])
2804                 break; /* We don't need to search this sub expression
2805                           any more.  */
2806             }
2807           if (mctx->state_log[sl_str] == NULL)
2808             continue;
2809           /* Does this state have a ')' of the sub expression?  */
2810           nodes = &mctx->state_log[sl_str]->nodes;
2811           cls_node = find_subexp_node (dfa, nodes, subexp_num,
2812                                        OP_CLOSE_SUBEXP);
2813           if (cls_node == -1)
2814             continue; /* No.  */
2815           if (sub_top->path == NULL)
2816             {
2817               sub_top->path = calloc (sizeof (state_array_t),
2818                                       sl_str - sub_top->str_idx + 1);
2819               if (sub_top->path == NULL)
2820                 return REG_ESPACE;
2821             }
2822           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2823              in the current context?  */
2824           err = check_arrival (mctx, sub_top->path, sub_top->node,
2825                                sub_top->str_idx, cls_node, sl_str,
2826                                OP_CLOSE_SUBEXP);
2827           if (err == REG_NOMATCH)
2828               continue;
2829           if (BE (err != REG_NOERROR, 0))
2830               return err;
2831           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2832           if (BE (sub_last == NULL, 0))
2833             return REG_ESPACE;
2834           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2835                                 bkref_str_idx);
2836           if (err == REG_NOMATCH)
2837             continue;
2838         }
2839     }
2840   return REG_NOERROR;
2841 }
2842
2843 /* Helper functions for get_subexp().  */
2844
2845 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2846    If it can arrive, register the sub expression expressed with SUB_TOP
2847    and SUB_LAST.  */
2848
2849 static reg_errcode_t
2850 internal_function
2851 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2852                 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2853 {
2854   reg_errcode_t err;
2855   int to_idx;
2856   /* Can the subexpression arrive the back reference?  */
2857   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2858                        sub_last->str_idx, bkref_node, bkref_str,
2859                        OP_OPEN_SUBEXP);
2860   if (err != REG_NOERROR)
2861     return err;
2862   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2863                              sub_last->str_idx);
2864   if (BE (err != REG_NOERROR, 0))
2865     return err;
2866   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2867   return clean_state_log_if_needed (mctx, to_idx);
2868 }
2869
2870 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2871    Search '(' if FL_OPEN, or search ')' otherwise.
2872    TODO: This function isn't efficient...
2873          Because there might be more than one nodes whose types are
2874          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2875          nodes.
2876          E.g. RE: (a){2}  */
2877
2878 static int
2879 internal_function
2880 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2881                   int subexp_idx, int type)
2882 {
2883   int cls_idx;
2884   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2885     {
2886       int cls_node = nodes->elems[cls_idx];
2887       const re_token_t *node = dfa->nodes + cls_node;
2888       if (node->type == type
2889           && node->opr.idx == subexp_idx)
2890         return cls_node;
2891     }
2892   return -1;
2893 }
2894
2895 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2896    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2897    heavily reused.
2898    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2899
2900 static reg_errcode_t
2901 internal_function __attribute_warn_unused_result__
2902 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2903                int top_str, int last_node, int last_str, int type)
2904 {
2905   const re_dfa_t *const dfa = mctx->dfa;
2906   reg_errcode_t err = REG_NOERROR;
2907   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2908   re_dfastate_t *cur_state = NULL;
2909   re_node_set *cur_nodes, next_nodes;
2910   re_dfastate_t **backup_state_log;
2911   unsigned int context;
2912
2913   subexp_num = dfa->nodes[top_node].opr.idx;
2914   /* Extend the buffer if we need.  */
2915   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2916     {
2917       re_dfastate_t **new_array;
2918       int old_alloc = path->alloc;
2919       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2920       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2921       if (BE (new_array == NULL, 0))
2922         {
2923           path->alloc = old_alloc;
2924           return REG_ESPACE;
2925         }
2926       path->array = new_array;
2927       memset (new_array + old_alloc, '\0',
2928               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2929     }
2930
2931   str_idx = path->next_idx ?: top_str;
2932
2933   /* Temporary modify MCTX.  */
2934   backup_state_log = mctx->state_log;
2935   backup_cur_idx = mctx->input.cur_idx;
2936   mctx->state_log = path->array;
2937   mctx->input.cur_idx = str_idx;
2938
2939   /* Setup initial node set.  */
2940   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2941   if (str_idx == top_str)
2942     {
2943       err = re_node_set_init_1 (&next_nodes, top_node);
2944       if (BE (err != REG_NOERROR, 0))
2945         return err;
2946       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2947       if (BE (err != REG_NOERROR, 0))
2948         {
2949           re_node_set_free (&next_nodes);
2950           return err;
2951         }
2952     }
2953   else
2954     {
2955       cur_state = mctx->state_log[str_idx];
2956       if (cur_state && cur_state->has_backref)
2957         {
2958           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2959           if (BE (err != REG_NOERROR, 0))
2960             return err;
2961         }
2962       else
2963         re_node_set_init_empty (&next_nodes);
2964     }
2965   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2966     {
2967       if (next_nodes.nelem)
2968         {
2969           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2970                                     subexp_num, type);
2971           if (BE (err != REG_NOERROR, 0))
2972             {
2973               re_node_set_free (&next_nodes);
2974               return err;
2975             }
2976         }
2977       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2978       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2979         {
2980           re_node_set_free (&next_nodes);
2981           return err;
2982         }
2983       mctx->state_log[str_idx] = cur_state;
2984     }
2985
2986   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2987     {
2988       re_node_set_empty (&next_nodes);
2989       if (mctx->state_log[str_idx + 1])
2990         {
2991           err = re_node_set_merge (&next_nodes,
2992                                    &mctx->state_log[str_idx + 1]->nodes);
2993           if (BE (err != REG_NOERROR, 0))
2994             {
2995               re_node_set_free (&next_nodes);
2996               return err;
2997             }
2998         }
2999       if (cur_state)
3000         {
3001           err = check_arrival_add_next_nodes (mctx, str_idx,
3002                                               &cur_state->non_eps_nodes,
3003                                               &next_nodes);
3004           if (BE (err != REG_NOERROR, 0))
3005             {
3006               re_node_set_free (&next_nodes);
3007               return err;
3008             }
3009         }
3010       ++str_idx;
3011       if (next_nodes.nelem)
3012         {
3013           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3014           if (BE (err != REG_NOERROR, 0))
3015             {
3016               re_node_set_free (&next_nodes);
3017               return err;
3018             }
3019           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3020                                     subexp_num, type);
3021           if (BE (err != REG_NOERROR, 0))
3022             {
3023               re_node_set_free (&next_nodes);
3024               return err;
3025             }
3026         }
3027       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3028       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3029       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3030         {
3031           re_node_set_free (&next_nodes);
3032           return err;
3033         }
3034       mctx->state_log[str_idx] = cur_state;
3035       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3036     }
3037   re_node_set_free (&next_nodes);
3038   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3039                : &mctx->state_log[last_str]->nodes);
3040   path->next_idx = str_idx;
3041
3042   /* Fix MCTX.  */
3043   mctx->state_log = backup_state_log;
3044   mctx->input.cur_idx = backup_cur_idx;
3045
3046   /* Then check the current node set has the node LAST_NODE.  */
3047   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3048     return REG_NOERROR;
3049
3050   return REG_NOMATCH;
3051 }
3052
3053 /* Helper functions for check_arrival.  */
3054
3055 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3056    to NEXT_NODES.
3057    TODO: This function is similar to the functions transit_state*(),
3058          however this function has many additional works.
3059          Can't we unify them?  */
3060
3061 static reg_errcode_t
3062 internal_function __attribute_warn_unused_result__
3063 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3064                               re_node_set *cur_nodes, re_node_set *next_nodes)
3065 {
3066   const re_dfa_t *const dfa = mctx->dfa;
3067   int result;
3068   int cur_idx;
3069   reg_errcode_t err = REG_NOERROR;
3070   re_node_set union_set;
3071   re_node_set_init_empty (&union_set);
3072   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3073     {
3074       int naccepted = 0;
3075       int cur_node = cur_nodes->elems[cur_idx];
3076 #ifdef DEBUG
3077       re_token_type_t type = dfa->nodes[cur_node].type;
3078       assert (!IS_EPSILON_NODE (type));
3079 #endif
3080 #ifdef RE_ENABLE_I18N
3081       /* If the node may accept `multi byte'.  */
3082       if (dfa->nodes[cur_node].accept_mb)
3083         {
3084           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3085                                                str_idx);
3086           if (naccepted > 1)
3087             {
3088               re_dfastate_t *dest_state;
3089               int next_node = dfa->nexts[cur_node];
3090               int next_idx = str_idx + naccepted;
3091               dest_state = mctx->state_log[next_idx];
3092               re_node_set_empty (&union_set);
3093               if (dest_state)
3094                 {
3095                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3096                   if (BE (err != REG_NOERROR, 0))
3097                     {
3098                       re_node_set_free (&union_set);
3099                       return err;
3100                     }
3101                 }
3102               result = re_node_set_insert (&union_set, next_node);
3103               if (BE (result < 0, 0))
3104                 {
3105                   re_node_set_free (&union_set);
3106                   return REG_ESPACE;
3107                 }
3108               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3109                                                             &union_set);
3110               if (BE (mctx->state_log[next_idx] == NULL
3111                       && err != REG_NOERROR, 0))
3112                 {
3113                   re_node_set_free (&union_set);
3114                   return err;
3115                 }
3116             }
3117         }
3118 #endif /* RE_ENABLE_I18N */
3119       if (naccepted
3120           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3121         {
3122           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3123           if (BE (result < 0, 0))
3124             {
3125               re_node_set_free (&union_set);
3126               return REG_ESPACE;
3127             }
3128         }
3129     }
3130   re_node_set_free (&union_set);
3131   return REG_NOERROR;
3132 }
3133
3134 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3135    CUR_NODES, however exclude the nodes which are:
3136     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3137     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3138 */
3139
3140 static reg_errcode_t
3141 internal_function
3142 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3143                           int ex_subexp, int type)
3144 {
3145   reg_errcode_t err;
3146   int idx, outside_node;
3147   re_node_set new_nodes;
3148 #ifdef DEBUG
3149   assert (cur_nodes->nelem);
3150 #endif
3151   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3152   if (BE (err != REG_NOERROR, 0))
3153     return err;
3154   /* Create a new node set NEW_NODES with the nodes which are epsilon
3155      closures of the node in CUR_NODES.  */
3156
3157   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3158     {
3159       int cur_node = cur_nodes->elems[idx];
3160       const re_node_set *eclosure = dfa->eclosures + cur_node;
3161       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3162       if (outside_node == -1)
3163         {
3164           /* There are no problematic nodes, just merge them.  */
3165           err = re_node_set_merge (&new_nodes, eclosure);
3166           if (BE (err != REG_NOERROR, 0))
3167             {
3168               re_node_set_free (&new_nodes);
3169               return err;
3170             }
3171         }
3172       else
3173         {
3174           /* There are problematic nodes, re-calculate incrementally.  */
3175           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3176                                               ex_subexp, type);
3177           if (BE (err != REG_NOERROR, 0))
3178             {
3179               re_node_set_free (&new_nodes);
3180               return err;
3181             }
3182         }
3183     }
3184   re_node_set_free (cur_nodes);
3185   *cur_nodes = new_nodes;
3186   return REG_NOERROR;
3187 }
3188
3189 /* Helper function for check_arrival_expand_ecl.
3190    Check incrementally the epsilon closure of TARGET, and if it isn't
3191    problematic append it to DST_NODES.  */
3192
3193 static reg_errcode_t
3194 internal_function __attribute_warn_unused_result__
3195 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3196                               int target, int ex_subexp, int type)
3197 {
3198   int cur_node;
3199   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3200     {
3201       int err;
3202
3203       if (dfa->nodes[cur_node].type == type
3204           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3205         {
3206           if (type == OP_CLOSE_SUBEXP)
3207             {
3208               err = re_node_set_insert (dst_nodes, cur_node);
3209               if (BE (err == -1, 0))
3210                 return REG_ESPACE;
3211             }
3212           break;
3213         }
3214       err = re_node_set_insert (dst_nodes, cur_node);
3215       if (BE (err == -1, 0))
3216         return REG_ESPACE;
3217       if (dfa->edests[cur_node].nelem == 0)
3218         break;
3219       if (dfa->edests[cur_node].nelem == 2)
3220         {
3221           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3222                                               dfa->edests[cur_node].elems[1],
3223                                               ex_subexp, type);
3224           if (BE (err != REG_NOERROR, 0))
3225             return err;
3226         }
3227       cur_node = dfa->edests[cur_node].elems[0];
3228     }
3229   return REG_NOERROR;
3230 }
3231
3232
3233 /* For all the back references in the current state, calculate the
3234    destination of the back references by the appropriate entry
3235    in MCTX->BKREF_ENTS.  */
3236
3237 static reg_errcode_t
3238 internal_function __attribute_warn_unused_result__
3239 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3240                     int cur_str, int subexp_num, int type)
3241 {
3242   const re_dfa_t *const dfa = mctx->dfa;
3243   reg_errcode_t err;
3244   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3245   struct re_backref_cache_entry *ent;
3246
3247   if (cache_idx_start == -1)
3248     return REG_NOERROR;
3249
3250  restart:
3251   ent = mctx->bkref_ents + cache_idx_start;
3252   do
3253     {
3254       int to_idx, next_node;
3255
3256       /* Is this entry ENT is appropriate?  */
3257       if (!re_node_set_contains (cur_nodes, ent->node))
3258         continue; /* No.  */
3259
3260       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3261       /* Calculate the destination of the back reference, and append it
3262          to MCTX->STATE_LOG.  */
3263       if (to_idx == cur_str)
3264         {
3265           /* The backreference did epsilon transit, we must re-check all the
3266              node in the current state.  */
3267           re_node_set new_dests;
3268           reg_errcode_t err2, err3;
3269           next_node = dfa->edests[ent->node].elems[0];
3270           if (re_node_set_contains (cur_nodes, next_node))
3271             continue;
3272           err = re_node_set_init_1 (&new_dests, next_node);
3273           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3274           err3 = re_node_set_merge (cur_nodes, &new_dests);
3275           re_node_set_free (&new_dests);
3276           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3277                   || err3 != REG_NOERROR, 0))
3278             {
3279               err = (err != REG_NOERROR ? err
3280                      : (err2 != REG_NOERROR ? err2 : err3));
3281               return err;
3282             }
3283           /* TODO: It is still inefficient...  */
3284           goto restart;
3285         }
3286       else
3287         {
3288           re_node_set union_set;
3289           next_node = dfa->nexts[ent->node];
3290           if (mctx->state_log[to_idx])
3291             {
3292               int ret;
3293               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3294                                         next_node))
3295                 continue;
3296               err = re_node_set_init_copy (&union_set,
3297                                            &mctx->state_log[to_idx]->nodes);
3298               ret = re_node_set_insert (&union_set, next_node);
3299               if (BE (err != REG_NOERROR || ret < 0, 0))
3300                 {
3301                   re_node_set_free (&union_set);
3302                   err = err != REG_NOERROR ? err : REG_ESPACE;
3303                   return err;
3304                 }
3305             }
3306           else
3307             {
3308               err = re_node_set_init_1 (&union_set, next_node);
3309               if (BE (err != REG_NOERROR, 0))
3310                 return err;
3311             }
3312           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3313           re_node_set_free (&union_set);
3314           if (BE (mctx->state_log[to_idx] == NULL
3315                   && err != REG_NOERROR, 0))
3316             return err;
3317         }
3318     }
3319   while (ent++->more);
3320   return REG_NOERROR;
3321 }
3322
3323 /* Build transition table for the state.
3324    Return 1 if succeeded, otherwise return NULL.  */
3325
3326 static int
3327 internal_function
3328 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3329 {
3330   reg_errcode_t err;
3331   int i, j, ch, need_word_trtable = 0;
3332   bitset_word_t elem, mask;
3333   bool dests_node_malloced = false;
3334   bool dest_states_malloced = false;
3335   int ndests; /* Number of the destination states from `state'.  */
3336   re_dfastate_t **trtable;
3337   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3338   re_node_set follows, *dests_node;
3339   bitset_t *dests_ch;
3340   bitset_t acceptable;
3341
3342   struct dests_alloc
3343   {
3344     re_node_set dests_node[SBC_MAX];
3345     bitset_t dests_ch[SBC_MAX];
3346   } *dests_alloc;
3347
3348   /* We build DFA states which corresponds to the destination nodes
3349      from `state'.  `dests_node[i]' represents the nodes which i-th
3350      destination state contains, and `dests_ch[i]' represents the
3351      characters which i-th destination state accepts.  */
3352   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3353     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3354   else
3355     {
3356       dests_alloc = re_malloc (struct dests_alloc, 1);
3357       if (BE (dests_alloc == NULL, 0))
3358         return 0;
3359       dests_node_malloced = true;
3360     }
3361   dests_node = dests_alloc->dests_node;
3362   dests_ch = dests_alloc->dests_ch;
3363
3364   /* Initialize transiton table.  */
3365   state->word_trtable = state->trtable = NULL;
3366
3367   /* At first, group all nodes belonging to `state' into several
3368      destinations.  */
3369   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3370   if (BE (ndests <= 0, 0))
3371     {
3372       if (dests_node_malloced)
3373         free (dests_alloc);
3374       /* Return 0 in case of an error, 1 otherwise.  */
3375       if (ndests == 0)
3376         {
3377           state->trtable = (re_dfastate_t **)
3378             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3379           if (BE (state->trtable == NULL, 0))
3380             return 0;
3381           return 1;
3382         }
3383       return 0;
3384     }
3385
3386   err = re_node_set_alloc (&follows, ndests + 1);
3387   if (BE (err != REG_NOERROR, 0))
3388     goto out_free;
3389
3390   /* Avoid arithmetic overflow in size calculation.  */
3391   if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3392             / (3 * sizeof (re_dfastate_t *)))
3393            < ndests),
3394           0))
3395     goto out_free;
3396
3397   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3398                          + ndests * 3 * sizeof (re_dfastate_t *)))
3399     dest_states = (re_dfastate_t **)
3400       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3401   else
3402     {
3403       dest_states = (re_dfastate_t **)
3404         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3405       if (BE (dest_states == NULL, 0))
3406         {
3407 out_free:
3408           if (dest_states_malloced)
3409             free (dest_states);
3410           re_node_set_free (&follows);
3411           for (i = 0; i < ndests; ++i)
3412             re_node_set_free (dests_node + i);
3413           if (dests_node_malloced)
3414             free (dests_alloc);
3415           return 0;
3416         }
3417       dest_states_malloced = true;
3418     }
3419   dest_states_word = dest_states + ndests;
3420   dest_states_nl = dest_states_word + ndests;
3421   bitset_empty (acceptable);
3422
3423   /* Then build the states for all destinations.  */
3424   for (i = 0; i < ndests; ++i)
3425     {
3426       int next_node;
3427       re_node_set_empty (&follows);
3428       /* Merge the follows of this destination states.  */
3429       for (j = 0; j < dests_node[i].nelem; ++j)
3430         {
3431           next_node = dfa->nexts[dests_node[i].elems[j]];
3432           if (next_node != -1)
3433             {
3434               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3435               if (BE (err != REG_NOERROR, 0))
3436                 goto out_free;
3437             }
3438         }
3439       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3440       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3441         goto out_free;
3442       /* If the new state has context constraint,
3443          build appropriate states for these contexts.  */
3444       if (dest_states[i]->has_constraint)
3445         {
3446           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3447                                                           CONTEXT_WORD);
3448           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3449             goto out_free;
3450
3451           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3452             need_word_trtable = 1;
3453
3454           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3455                                                         CONTEXT_NEWLINE);
3456           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3457             goto out_free;
3458         }
3459       else
3460         {
3461           dest_states_word[i] = dest_states[i];
3462           dest_states_nl[i] = dest_states[i];
3463         }
3464       bitset_merge (acceptable, dests_ch[i]);
3465     }
3466
3467   if (!BE (need_word_trtable, 0))
3468     {
3469       /* We don't care about whether the following character is a word
3470          character, or we are in a single-byte character set so we can
3471          discern by looking at the character code: allocate a
3472          256-entry transition table.  */
3473       trtable = state->trtable =
3474         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3475       if (BE (trtable == NULL, 0))
3476         goto out_free;
3477
3478       /* For all characters ch...:  */
3479       for (i = 0; i < BITSET_WORDS; ++i)
3480         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3481              elem;
3482              mask <<= 1, elem >>= 1, ++ch)
3483           if (BE (elem & 1, 0))
3484             {
3485               /* There must be exactly one destination which accepts
3486                  character ch.  See group_nodes_into_DFAstates.  */
3487               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3488                 ;
3489
3490               /* j-th destination accepts the word character ch.  */
3491               if (dfa->word_char[i] & mask)
3492                 trtable[ch] = dest_states_word[j];
3493               else
3494                 trtable[ch] = dest_states[j];
3495             }
3496     }
3497   else
3498     {
3499       /* We care about whether the following character is a word
3500          character, and we are in a multi-byte character set: discern
3501          by looking at the character code: build two 256-entry
3502          transition tables, one starting at trtable[0] and one
3503          starting at trtable[SBC_MAX].  */
3504       trtable = state->word_trtable =
3505         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3506       if (BE (trtable == NULL, 0))
3507         goto out_free;
3508
3509       /* For all characters ch...:  */
3510       for (i = 0; i < BITSET_WORDS; ++i)
3511         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3512              elem;
3513              mask <<= 1, elem >>= 1, ++ch)
3514           if (BE (elem & 1, 0))
3515             {
3516               /* There must be exactly one destination which accepts
3517                  character ch.  See group_nodes_into_DFAstates.  */
3518               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3519                 ;
3520
3521               /* j-th destination accepts the word character ch.  */
3522               trtable[ch] = dest_states[j];
3523               trtable[ch + SBC_MAX] = dest_states_word[j];
3524             }
3525     }
3526
3527   /* new line */
3528   if (bitset_contain (acceptable, NEWLINE_CHAR))
3529     {
3530       /* The current state accepts newline character.  */
3531       for (j = 0; j < ndests; ++j)
3532         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3533           {
3534             /* k-th destination accepts newline character.  */
3535             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3536             if (need_word_trtable)
3537               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3538             /* There must be only one destination which accepts
3539                newline.  See group_nodes_into_DFAstates.  */
3540             break;
3541           }
3542     }
3543
3544   if (dest_states_malloced)
3545     free (dest_states);
3546
3547   re_node_set_free (&follows);
3548   for (i = 0; i < ndests; ++i)
3549     re_node_set_free (dests_node + i);
3550
3551   if (dests_node_malloced)
3552     free (dests_alloc);
3553
3554   return 1;
3555 }
3556
3557 /* Group all nodes belonging to STATE into several destinations.
3558    Then for all destinations, set the nodes belonging to the destination
3559    to DESTS_NODE[i] and set the characters accepted by the destination
3560    to DEST_CH[i].  This function return the number of destinations.  */
3561
3562 static int
3563 internal_function
3564 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3565                             re_node_set *dests_node, bitset_t *dests_ch)
3566 {
3567   reg_errcode_t err;
3568   int result;
3569   int i, j, k;
3570   int ndests; /* Number of the destinations from `state'.  */
3571   bitset_t accepts; /* Characters a node can accept.  */
3572   const re_node_set *cur_nodes = &state->nodes;
3573   bitset_empty (accepts);
3574   ndests = 0;
3575
3576   /* For all the nodes belonging to `state',  */
3577   for (i = 0; i < cur_nodes->nelem; ++i)
3578     {
3579       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3580       re_token_type_t type = node->type;
3581       unsigned int constraint = node->constraint;
3582
3583       /* Enumerate all single byte character this node can accept.  */
3584       if (type == CHARACTER)
3585         bitset_set (accepts, node->opr.c);
3586       else if (type == SIMPLE_BRACKET)
3587         {
3588           bitset_merge (accepts, node->opr.sbcset);
3589         }
3590       else if (type == OP_PERIOD)
3591         {
3592 #ifdef RE_ENABLE_I18N
3593           if (dfa->mb_cur_max > 1)
3594             bitset_merge (accepts, dfa->sb_char);
3595           else
3596 #endif
3597             bitset_set_all (accepts);
3598           if (!(dfa->syntax & RE_DOT_NEWLINE))
3599             bitset_clear (accepts, '\n');
3600           if (dfa->syntax & RE_DOT_NOT_NULL)
3601             bitset_clear (accepts, '\0');
3602         }
3603 #ifdef RE_ENABLE_I18N
3604       else if (type == OP_UTF8_PERIOD)
3605         {
3606           memset (accepts, '\xff', sizeof (bitset_t) / 2);
3607           if (!(dfa->syntax & RE_DOT_NEWLINE))
3608             bitset_clear (accepts, '\n');
3609           if (dfa->syntax & RE_DOT_NOT_NULL)
3610             bitset_clear (accepts, '\0');
3611         }
3612 #endif
3613       else
3614         continue;
3615
3616       /* Check the `accepts' and sift the characters which are not
3617          match it the context.  */
3618       if (constraint)
3619         {
3620           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3621             {
3622               bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3623               bitset_empty (accepts);
3624               if (accepts_newline)
3625                 bitset_set (accepts, NEWLINE_CHAR);
3626               else
3627                 continue;
3628             }
3629           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3630             {
3631               bitset_empty (accepts);
3632               continue;
3633             }
3634
3635           if (constraint & NEXT_WORD_CONSTRAINT)
3636             {
3637               bitset_word_t any_set = 0;
3638               if (type == CHARACTER && !node->word_char)
3639                 {
3640                   bitset_empty (accepts);
3641                   continue;
3642                 }
3643 #ifdef RE_ENABLE_I18N
3644               if (dfa->mb_cur_max > 1)
3645                 for (j = 0; j < BITSET_WORDS; ++j)
3646                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3647               else
3648 #endif
3649                 for (j = 0; j < BITSET_WORDS; ++j)
3650                   any_set |= (accepts[j] &= dfa->word_char[j]);
3651               if (!any_set)
3652                 continue;
3653             }
3654           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3655             {
3656               bitset_word_t any_set = 0;
3657               if (type == CHARACTER && node->word_char)
3658                 {
3659                   bitset_empty (accepts);
3660                   continue;
3661                 }
3662 #ifdef RE_ENABLE_I18N
3663               if (dfa->mb_cur_max > 1)
3664                 for (j = 0; j < BITSET_WORDS; ++j)
3665                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3666               else
3667 #endif
3668                 for (j = 0; j < BITSET_WORDS; ++j)
3669                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3670               if (!any_set)
3671                 continue;
3672             }
3673         }
3674
3675       /* Then divide `accepts' into DFA states, or create a new
3676          state.  Above, we make sure that accepts is not empty.  */
3677       for (j = 0; j < ndests; ++j)
3678         {
3679           bitset_t intersec; /* Intersection sets, see below.  */
3680           bitset_t remains;
3681           /* Flags, see below.  */
3682           bitset_word_t has_intersec, not_subset, not_consumed;
3683
3684           /* Optimization, skip if this state doesn't accept the character.  */
3685           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3686             continue;
3687
3688           /* Enumerate the intersection set of this state and `accepts'.  */
3689           has_intersec = 0;
3690           for (k = 0; k < BITSET_WORDS; ++k)
3691             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3692           /* And skip if the intersection set is empty.  */
3693           if (!has_intersec)
3694             continue;
3695
3696           /* Then check if this state is a subset of `accepts'.  */
3697           not_subset = not_consumed = 0;
3698           for (k = 0; k < BITSET_WORDS; ++k)
3699             {
3700               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3701               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3702             }
3703
3704           /* If this state isn't a subset of `accepts', create a
3705              new group state, which has the `remains'. */
3706           if (not_subset)
3707             {
3708               bitset_copy (dests_ch[ndests], remains);
3709               bitset_copy (dests_ch[j], intersec);
3710               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3711               if (BE (err != REG_NOERROR, 0))
3712                 goto error_return;
3713               ++ndests;
3714             }
3715
3716           /* Put the position in the current group. */
3717           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3718           if (BE (result < 0, 0))
3719             goto error_return;
3720
3721           /* If all characters are consumed, go to next node. */
3722           if (!not_consumed)
3723             break;
3724         }
3725       /* Some characters remain, create a new group. */
3726       if (j == ndests)
3727         {
3728           bitset_copy (dests_ch[ndests], accepts);
3729           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3730           if (BE (err != REG_NOERROR, 0))
3731             goto error_return;
3732           ++ndests;
3733           bitset_empty (accepts);
3734         }
3735     }
3736   return ndests;
3737  error_return:
3738   for (j = 0; j < ndests; ++j)
3739     re_node_set_free (dests_node + j);
3740   return -1;
3741 }
3742
3743 #ifdef RE_ENABLE_I18N
3744 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3745    Return the number of the bytes the node accepts.
3746    STR_IDX is the current index of the input string.
3747
3748    This function handles the nodes which can accept one character, or
3749    one collating element like '.', '[a-z]', opposite to the other nodes
3750    can only accept one byte.  */
3751
3752 # ifdef _LIBC
3753 #  include <locale/weight.h>
3754 # endif
3755
3756 static int
3757 internal_function
3758 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3759                          const re_string_t *input, int str_idx)
3760 {
3761   const re_token_t *node = dfa->nodes + node_idx;
3762   int char_len, elem_len;
3763   int i;
3764
3765   if (BE (node->type == OP_UTF8_PERIOD, 0))
3766     {
3767       unsigned char c = re_string_byte_at (input, str_idx), d;
3768       if (BE (c < 0xc2, 1))
3769         return 0;
3770
3771       if (str_idx + 2 > input->len)
3772         return 0;
3773
3774       d = re_string_byte_at (input, str_idx + 1);
3775       if (c < 0xe0)
3776         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3777       else if (c < 0xf0)
3778         {
3779           char_len = 3;
3780           if (c == 0xe0 && d < 0xa0)
3781             return 0;
3782         }
3783       else if (c < 0xf8)
3784         {
3785           char_len = 4;
3786           if (c == 0xf0 && d < 0x90)
3787             return 0;
3788         }
3789       else if (c < 0xfc)
3790         {
3791           char_len = 5;
3792           if (c == 0xf8 && d < 0x88)
3793             return 0;
3794         }
3795       else if (c < 0xfe)
3796         {
3797           char_len = 6;
3798           if (c == 0xfc && d < 0x84)
3799             return 0;
3800         }
3801       else
3802         return 0;
3803
3804       if (str_idx + char_len > input->len)
3805         return 0;
3806
3807       for (i = 1; i < char_len; ++i)
3808         {
3809           d = re_string_byte_at (input, str_idx + i);
3810           if (d < 0x80 || d > 0xbf)
3811             return 0;
3812         }
3813       return char_len;
3814     }
3815
3816   char_len = re_string_char_size_at (input, str_idx);
3817   if (node->type == OP_PERIOD)
3818     {
3819       if (char_len <= 1)
3820         return 0;
3821       /* FIXME: I don't think this if is needed, as both '\n'
3822          and '\0' are char_len == 1.  */
3823       /* '.' accepts any one character except the following two cases.  */
3824       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3825            re_string_byte_at (input, str_idx) == '\n') ||
3826           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3827            re_string_byte_at (input, str_idx) == '\0'))
3828         return 0;
3829       return char_len;
3830     }
3831
3832   elem_len = re_string_elem_size_at (input, str_idx);
3833   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3834     return 0;
3835
3836   if (node->type == COMPLEX_BRACKET)
3837     {
3838       const re_charset_t *cset = node->opr.mbcset;
3839 # ifdef _LIBC
3840       const unsigned char *pin
3841         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3842       int j;
3843       uint32_t nrules;
3844 # endif /* _LIBC */
3845       int match_len = 0;
3846       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3847                     ? re_string_wchar_at (input, str_idx) : 0);
3848
3849       /* match with multibyte character?  */
3850       for (i = 0; i < cset->nmbchars; ++i)
3851         if (wc == cset->mbchars[i])
3852           {
3853             match_len = char_len;
3854             goto check_node_accept_bytes_match;
3855           }
3856       /* match with character_class?  */
3857       for (i = 0; i < cset->nchar_classes; ++i)
3858         {
3859           wctype_t wt = cset->char_classes[i];
3860           if (__iswctype (wc, wt))
3861             {
3862               match_len = char_len;
3863               goto check_node_accept_bytes_match;
3864             }
3865         }
3866
3867 # ifdef _LIBC
3868       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3869       if (nrules != 0)
3870         {
3871           unsigned int in_collseq = 0;
3872           const int32_t *table, *indirect;
3873           const unsigned char *weights, *extra;
3874           const char *collseqwc;
3875
3876           /* match with collating_symbol?  */
3877           if (cset->ncoll_syms)
3878             extra = (const unsigned char *)
3879               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3880           for (i = 0; i < cset->ncoll_syms; ++i)
3881             {
3882               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3883               /* Compare the length of input collating element and
3884                  the length of current collating element.  */
3885               if (*coll_sym != elem_len)
3886                 continue;
3887               /* Compare each bytes.  */
3888               for (j = 0; j < *coll_sym; j++)
3889                 if (pin[j] != coll_sym[1 + j])
3890                   break;
3891               if (j == *coll_sym)
3892                 {
3893                   /* Match if every bytes is equal.  */
3894                   match_len = j;
3895                   goto check_node_accept_bytes_match;
3896                 }
3897             }
3898
3899           if (cset->nranges)
3900             {
3901               if (elem_len <= char_len)
3902                 {
3903                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3904                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3905                 }
3906               else
3907                 in_collseq = find_collation_sequence_value (pin, elem_len);
3908             }
3909           /* match with range expression?  */
3910           for (i = 0; i < cset->nranges; ++i)
3911             if (cset->range_starts[i] <= in_collseq
3912                 && in_collseq <= cset->range_ends[i])
3913               {
3914                 match_len = elem_len;
3915                 goto check_node_accept_bytes_match;
3916               }
3917
3918           /* match with equivalence_class?  */
3919           if (cset->nequiv_classes)
3920             {
3921               const unsigned char *cp = pin;
3922               table = (const int32_t *)
3923                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3924               weights = (const unsigned char *)
3925                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3926               extra = (const unsigned char *)
3927                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3928               indirect = (const int32_t *)
3929                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3930               int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3931               if (idx > 0)
3932                 for (i = 0; i < cset->nequiv_classes; ++i)
3933                   {
3934                     int32_t equiv_class_idx = cset->equiv_classes[i];
3935                     size_t weight_len = weights[idx & 0xffffff];
3936                     if (weight_len == weights[equiv_class_idx & 0xffffff]
3937                         && (idx >> 24) == (equiv_class_idx >> 24))
3938                       {
3939                         int cnt = 0;
3940
3941                         idx &= 0xffffff;
3942                         equiv_class_idx &= 0xffffff;
3943
3944                         while (cnt <= weight_len
3945                                && (weights[equiv_class_idx + 1 + cnt]
3946                                    == weights[idx + 1 + cnt]))
3947                           ++cnt;
3948                         if (cnt > weight_len)
3949                           {
3950                             match_len = elem_len;
3951                             goto check_node_accept_bytes_match;
3952                           }
3953                       }
3954                   }
3955             }
3956         }
3957       else
3958 # endif /* _LIBC */
3959         {
3960           /* match with range expression?  */
3961 #if __GNUC__ >= 2
3962           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3963 #else
3964           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3965           cmp_buf[2] = wc;
3966 #endif
3967           for (i = 0; i < cset->nranges; ++i)
3968             {
3969               cmp_buf[0] = cset->range_starts[i];
3970               cmp_buf[4] = cset->range_ends[i];
3971               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3972                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3973                 {
3974                   match_len = char_len;
3975                   goto check_node_accept_bytes_match;
3976                 }
3977             }
3978         }
3979     check_node_accept_bytes_match:
3980       if (!cset->non_match)
3981         return match_len;
3982       else
3983         {
3984           if (match_len > 0)
3985             return 0;
3986           else
3987             return (elem_len > char_len) ? elem_len : char_len;
3988         }
3989     }
3990   return 0;
3991 }
3992
3993 # ifdef _LIBC
3994 static unsigned int
3995 internal_function
3996 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3997 {
3998   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3999   if (nrules == 0)
4000     {
4001       if (mbs_len == 1)
4002         {
4003           /* No valid character.  Match it as a single byte character.  */
4004           const unsigned char *collseq = (const unsigned char *)
4005             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4006           return collseq[mbs[0]];
4007         }
4008       return UINT_MAX;
4009     }
4010   else
4011     {
4012       int32_t idx;
4013       const unsigned char *extra = (const unsigned char *)
4014         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4015       int32_t extrasize = (const unsigned char *)
4016         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4017
4018       for (idx = 0; idx < extrasize;)
4019         {
4020           int mbs_cnt, found = 0;
4021           int32_t elem_mbs_len;
4022           /* Skip the name of collating element name.  */
4023           idx = idx + extra[idx] + 1;
4024           elem_mbs_len = extra[idx++];
4025           if (mbs_len == elem_mbs_len)
4026             {
4027               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4028                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4029                   break;
4030               if (mbs_cnt == elem_mbs_len)
4031                 /* Found the entry.  */
4032                 found = 1;
4033             }
4034           /* Skip the byte sequence of the collating element.  */
4035           idx += elem_mbs_len;
4036           /* Adjust for the alignment.  */
4037           idx = (idx + 3) & ~3;
4038           /* Skip the collation sequence value.  */
4039           idx += sizeof (uint32_t);
4040           /* Skip the wide char sequence of the collating element.  */
4041           idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
4042           /* If we found the entry, return the sequence value.  */
4043           if (found)
4044             return *(uint32_t *) (extra + idx);
4045           /* Skip the collation sequence value.  */
4046           idx += sizeof (uint32_t);
4047         }
4048       return UINT_MAX;
4049     }
4050 }
4051 # endif /* _LIBC */
4052 #endif /* RE_ENABLE_I18N */
4053
4054 /* Check whether the node accepts the byte which is IDX-th
4055    byte of the INPUT.  */
4056
4057 static int
4058 internal_function
4059 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4060                    int idx)
4061 {
4062   unsigned char ch;
4063   ch = re_string_byte_at (&mctx->input, idx);
4064   switch (node->type)
4065     {
4066     case CHARACTER:
4067       if (node->opr.c != ch)
4068         return 0;
4069       break;
4070
4071     case SIMPLE_BRACKET:
4072       if (!bitset_contain (node->opr.sbcset, ch))
4073         return 0;
4074       break;
4075
4076 #ifdef RE_ENABLE_I18N
4077     case OP_UTF8_PERIOD:
4078       if (ch >= 0x80)
4079         return 0;
4080       /* FALLTHROUGH */
4081 #endif
4082     case OP_PERIOD:
4083       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4084           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4085         return 0;
4086       break;
4087
4088     default:
4089       return 0;
4090     }
4091
4092   if (node->constraint)
4093     {
4094       /* The node has constraints.  Check whether the current context
4095          satisfies the constraints.  */
4096       unsigned int context = re_string_context_at (&mctx->input, idx,
4097                                                    mctx->eflags);
4098       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4099         return 0;
4100     }
4101
4102   return 1;
4103 }
4104
4105 /* Extend the buffers, if the buffers have run out.  */
4106
4107 static reg_errcode_t
4108 internal_function __attribute_warn_unused_result__
4109 extend_buffers (re_match_context_t *mctx, int min_len)
4110 {
4111   reg_errcode_t ret;
4112   re_string_t *pstr = &mctx->input;
4113
4114   /* Avoid overflow.  */
4115   if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4116     return REG_ESPACE;
4117
4118   /* Double the lengthes of the buffers, but allocate at least MIN_LEN.  */
4119   ret = re_string_realloc_buffers (pstr,
4120                                    MAX (min_len,
4121                                         MIN (pstr->len, pstr->bufs_len * 2)));
4122   if (BE (ret != REG_NOERROR, 0))
4123     return ret;
4124
4125   if (mctx->state_log != NULL)
4126     {
4127       /* And double the length of state_log.  */
4128       /* XXX We have no indication of the size of this buffer.  If this
4129          allocation fail we have no indication that the state_log array
4130          does not have the right size.  */
4131       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4132                                               pstr->bufs_len + 1);
4133       if (BE (new_array == NULL, 0))
4134         return REG_ESPACE;
4135       mctx->state_log = new_array;
4136     }
4137
4138   /* Then reconstruct the buffers.  */
4139   if (pstr->icase)
4140     {
4141 #ifdef RE_ENABLE_I18N
4142       if (pstr->mb_cur_max > 1)
4143         {
4144           ret = build_wcs_upper_buffer (pstr);
4145           if (BE (ret != REG_NOERROR, 0))
4146             return ret;
4147         }
4148       else
4149 #endif /* RE_ENABLE_I18N  */
4150         build_upper_buffer (pstr);
4151     }
4152   else
4153     {
4154 #ifdef RE_ENABLE_I18N
4155       if (pstr->mb_cur_max > 1)
4156         build_wcs_buffer (pstr);
4157       else
4158 #endif /* RE_ENABLE_I18N  */
4159         {
4160           if (pstr->trans != NULL)
4161             re_string_translate_buffer (pstr);
4162         }
4163     }
4164   return REG_NOERROR;
4165 }
4166
4167 \f
4168 /* Functions for matching context.  */
4169
4170 /* Initialize MCTX.  */
4171
4172 static reg_errcode_t
4173 internal_function __attribute_warn_unused_result__
4174 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4175 {
4176   mctx->eflags = eflags;
4177   mctx->match_last = -1;
4178   if (n > 0)
4179     {
4180       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4181       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4182       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4183         return REG_ESPACE;
4184     }
4185   /* Already zero-ed by the caller.
4186      else
4187        mctx->bkref_ents = NULL;
4188      mctx->nbkref_ents = 0;
4189      mctx->nsub_tops = 0;  */
4190   mctx->abkref_ents = n;
4191   mctx->max_mb_elem_len = 1;
4192   mctx->asub_tops = n;
4193   return REG_NOERROR;
4194 }
4195
4196 /* Clean the entries which depend on the current input in MCTX.
4197    This function must be invoked when the matcher changes the start index
4198    of the input, or changes the input string.  */
4199
4200 static void
4201 internal_function
4202 match_ctx_clean (re_match_context_t *mctx)
4203 {
4204   int st_idx;
4205   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4206     {
4207       int sl_idx;
4208       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4209       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4210         {
4211           re_sub_match_last_t *last = top->lasts[sl_idx];
4212           re_free (last->path.array);
4213           re_free (last);
4214         }
4215       re_free (top->lasts);
4216       if (top->path)
4217         {
4218           re_free (top->path->array);
4219           re_free (top->path);
4220         }
4221       free (top);
4222     }
4223
4224   mctx->nsub_tops = 0;
4225   mctx->nbkref_ents = 0;
4226 }
4227
4228 /* Free all the memory associated with MCTX.  */
4229
4230 static void
4231 internal_function
4232 match_ctx_free (re_match_context_t *mctx)
4233 {
4234   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4235   match_ctx_clean (mctx);
4236   re_free (mctx->sub_tops);
4237   re_free (mctx->bkref_ents);
4238 }
4239
4240 /* Add a new backreference entry to MCTX.
4241    Note that we assume that caller never call this function with duplicate
4242    entry, and call with STR_IDX which isn't smaller than any existing entry.
4243 */
4244
4245 static reg_errcode_t
4246 internal_function __attribute_warn_unused_result__
4247 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4248                      int to)
4249 {
4250   if (mctx->nbkref_ents >= mctx->abkref_ents)
4251     {
4252       struct re_backref_cache_entry* new_entry;
4253       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4254                               mctx->abkref_ents * 2);
4255       if (BE (new_entry == NULL, 0))
4256         {
4257           re_free (mctx->bkref_ents);
4258           return REG_ESPACE;
4259         }
4260       mctx->bkref_ents = new_entry;
4261       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4262               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4263       mctx->abkref_ents *= 2;
4264     }
4265   if (mctx->nbkref_ents > 0
4266       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4267     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4268
4269   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4270   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4271   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4272   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4273
4274   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4275      If bit N is clear, means that this entry won't epsilon-transition to
4276      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4277      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4278      such node.
4279
4280      A backreference does not epsilon-transition unless it is empty, so set
4281      to all zeros if FROM != TO.  */
4282   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4283     = (from == to ? ~0 : 0);
4284
4285   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4286   if (mctx->max_mb_elem_len < to - from)
4287     mctx->max_mb_elem_len = to - from;
4288   return REG_NOERROR;
4289 }
4290
4291 /* Search for the first entry which has the same str_idx, or -1 if none is
4292    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4293
4294 static int
4295 internal_function
4296 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4297 {
4298   int left, right, mid, last;
4299   last = right = mctx->nbkref_ents;
4300   for (left = 0; left < right;)
4301     {
4302       mid = (left + right) / 2;
4303       if (mctx->bkref_ents[mid].str_idx < str_idx)
4304         left = mid + 1;
4305       else
4306         right = mid;
4307     }
4308   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4309     return left;
4310   else
4311     return -1;
4312 }
4313
4314 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4315    at STR_IDX.  */
4316
4317 static reg_errcode_t
4318 internal_function __attribute_warn_unused_result__
4319 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4320 {
4321 #ifdef DEBUG
4322   assert (mctx->sub_tops != NULL);
4323   assert (mctx->asub_tops > 0);
4324 #endif
4325   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4326     {
4327       int new_asub_tops = mctx->asub_tops * 2;
4328       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4329                                                    re_sub_match_top_t *,
4330                                                    new_asub_tops);
4331       if (BE (new_array == NULL, 0))
4332         return REG_ESPACE;
4333       mctx->sub_tops = new_array;
4334       mctx->asub_tops = new_asub_tops;
4335     }
4336   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4337   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4338     return REG_ESPACE;
4339   mctx->sub_tops[mctx->nsub_tops]->node = node;
4340   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4341   return REG_NOERROR;
4342 }
4343
4344 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4345    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4346
4347 static re_sub_match_last_t *
4348 internal_function
4349 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4350 {
4351   re_sub_match_last_t *new_entry;
4352   if (BE (subtop->nlasts == subtop->alasts, 0))
4353     {
4354       int new_alasts = 2 * subtop->alasts + 1;
4355       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4356                                                     re_sub_match_last_t *,
4357                                                     new_alasts);
4358       if (BE (new_array == NULL, 0))
4359         return NULL;
4360       subtop->lasts = new_array;
4361       subtop->alasts = new_alasts;
4362     }
4363   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4364   if (BE (new_entry != NULL, 1))
4365     {
4366       subtop->lasts[subtop->nlasts] = new_entry;
4367       new_entry->node = node;
4368       new_entry->str_idx = str_idx;
4369       ++subtop->nlasts;
4370     }
4371   return new_entry;
4372 }
4373
4374 static void
4375 internal_function
4376 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4377                re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4378 {
4379   sctx->sifted_states = sifted_sts;
4380   sctx->limited_states = limited_sts;
4381   sctx->last_node = last_node;
4382   sctx->last_str_idx = last_str_idx;
4383   re_node_set_init_empty (&sctx->limits);
4384 }