posix/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2018 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  21                                      Idx n);
  22 static void match_ctx_clean (re_match_context_t *mctx);
  23 static void match_ctx_free (re_match_context_t *cache);
  24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, Idx node,
  25                                           Idx str_idx, Idx from, Idx to);
  26 static Idx search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx);
  27 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, Idx node,
  28                                            Idx str_idx);
  29 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  30                                                     Idx node, Idx str_idx);
  31 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  32                            re_dfastate_t **limited_sts, Idx last_node,
  33                            Idx last_str_idx);
  34 static reg_errcode_t re_search_internal (const regex_t *preg,
  35                                          const char *string, Idx length,
  36                                          Idx start, Idx last_start, Idx stop,
  37                                          size_t nmatch, regmatch_t pmatch[],
  38                                          int eflags);
  39 static regoff_t re_search_2_stub (struct re_pattern_buffer *bufp,
  40                                   const char *string1, Idx length1,
  41                                   const char *string2, Idx length2,
  42                                   Idx start, regoff_t range,
  43                                   struct re_registers *regs,
  44                                   Idx stop, bool ret_len);
  45 static regoff_t re_search_stub (struct re_pattern_buffer *bufp,
  46                                 const char *string, Idx length, Idx start,
  47                                 regoff_t range, Idx stop,
  48                                 struct re_registers *regs,
  49                                 bool ret_len);
  50 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  51                               Idx nregs, int regs_allocated);
  52 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx);
  53 static Idx check_matching (re_match_context_t *mctx, bool fl_longest_match,
  54                            Idx *p_match_first);
  55 static Idx check_halt_state_context (const re_match_context_t *mctx,
  56                                      const re_dfastate_t *state, Idx idx);
  57 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  58                          regmatch_t *prev_idx_match, Idx cur_node,
  59                          Idx cur_idx, Idx nmatch);
  60 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  61                                       Idx str_idx, Idx dest_node, Idx nregs,
  62                                       regmatch_t *regs,
  63                                       re_node_set *eps_via_nodes);
  64 static reg_errcode_t set_regs (const regex_t *preg,
  65                                const re_match_context_t *mctx,
  66                                size_t nmatch, regmatch_t *pmatch,
  67                                bool fl_backtrack);
  68 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs);
  69
  70 #ifdef RE_ENABLE_I18N
  71 static int sift_states_iter_mb (const re_match_context_t *mctx,
  72                                 re_sift_context_t *sctx,
  73                                 Idx node_idx, Idx str_idx, Idx max_str_idx);
  74 #endif /* RE_ENABLE_I18N */
  75 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  76                                            re_sift_context_t *sctx);
  77 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  78                                           re_sift_context_t *sctx, Idx str_idx,
  79                                           re_node_set *cur_dest);
  80 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  81                                               re_sift_context_t *sctx,
  82                                               Idx str_idx,
  83                                               re_node_set *dest_nodes);
  84 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  85                                             re_node_set *dest_nodes,
  86                                             const re_node_set *candidates);
  87 static bool check_dst_limits (const re_match_context_t *mctx,
  88                               const re_node_set *limits,
  89                               Idx dst_node, Idx dst_idx, Idx src_node,
  90                               Idx src_idx);
  91 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
  92                                         int boundaries, Idx subexp_idx,
  93                                         Idx from_node, Idx bkref_idx);
  94 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
  95                                       Idx limit, Idx subexp_idx,
  96                                       Idx node, Idx str_idx,
  97                                       Idx bkref_idx);
  98 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
  99                                           re_node_set *dest_nodes,
 100                                           const re_node_set *candidates,
 101                                           re_node_set *limits,
 102                                           struct re_backref_cache_entry *bkref_ents,
 103                                           Idx str_idx);
 104 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 105                                         re_sift_context_t *sctx,
 106                                         Idx str_idx, const re_node_set *candidates);
 107 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 108                                         re_dfastate_t **dst,
 109                                         re_dfastate_t **src, Idx num);
 110 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 111                                          re_match_context_t *mctx);
 112 static re_dfastate_t *transit_state (reg_errcode_t *err,
 113                                      re_match_context_t *mctx,
 114                                      re_dfastate_t *state);
 115 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 116                                             re_match_context_t *mctx,
 117                                             re_dfastate_t *next_state);
 118 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 119                                                 re_node_set *cur_nodes,
 120                                                 Idx str_idx);
 121 #if 0
 122 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 123                                         re_match_context_t *mctx,
 124                                         re_dfastate_t *pstate);
 125 #endif
 126 #ifdef RE_ENABLE_I18N
 127 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 128                                        re_dfastate_t *pstate);
 129 #endif /* RE_ENABLE_I18N */
 130 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 131                                           const re_node_set *nodes);
 132 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 133                                  Idx bkref_node, Idx bkref_str_idx);
 134 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 135                                      const re_sub_match_top_t *sub_top,
 136                                      re_sub_match_last_t *sub_last,
 137                                      Idx bkref_node, Idx bkref_str);
 138 static Idx find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 139                              Idx subexp_idx, int type);
 140 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 141                                     state_array_t *path, Idx top_node,
 142                                     Idx top_str, Idx last_node, Idx last_str,
 143                                     int type);
 144 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 145                                                    Idx str_idx,
 146                                                    re_node_set *cur_nodes,
 147                                                    re_node_set *next_nodes);
 148 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 149                                                re_node_set *cur_nodes,
 150                                                Idx ex_subexp, int type);
 151 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 152                                                    re_node_set *dst_nodes,
 153                                                    Idx target, Idx ex_subexp,
 154                                                    int type);
 155 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 156                                          re_node_set *cur_nodes, Idx cur_str,
 157                                          Idx subexp_num, int type);
 158 static bool build_trtable (const re_dfa_t *dfa, re_dfastate_t *state);
 159 #ifdef RE_ENABLE_I18N
 160 static int check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
 161                                     const re_string_t *input, Idx idx);
 162 # ifdef _LIBC
 163 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 164                                                    size_t name_len);
 165 # endif /* _LIBC */
 166 #endif /* RE_ENABLE_I18N */
 167 static Idx group_nodes_into_DFAstates (const re_dfa_t *dfa,
 168                                        const re_dfastate_t *state,
 169                                        re_node_set *states_node,
 170                                        bitset_t *states_ch);
 171 static bool check_node_accept (const re_match_context_t *mctx,
 172                                const re_token_t *node, Idx idx);
 173 static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len);
 174 \f
 175 /* Entry point for POSIX code.  */
 176
 177 /* regexec searches for a given pattern, specified by PREG, in the
 178    string STRING.
 179
 180    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 181    'regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 182    least NMATCH elements, and we set them to the offsets of the
 183    corresponding matched substrings.
 184
 185    EFLAGS specifies "execution flags" which affect matching: if
 186    REG_NOTBOL is set, then ^ does not match at the beginning of the
 187    string; if REG_NOTEOL is set, then $ does not match at the end.
 188
 189    We return 0 if we find a match and REG_NOMATCH if not.  */
 190
 191 int
 192 regexec (const regex_t *_Restrict_ preg, const char *_Restrict_ string,
 193          size_t nmatch, regmatch_t pmatch[], int eflags)
 194 {
 195   reg_errcode_t err;
 196   Idx start, length;
 197   re_dfa_t *dfa = preg->buffer;
 198
 199   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 200     return REG_BADPAT;
 201
 202   if (eflags & REG_STARTEND)
 203     {
 204       start = pmatch[0].rm_so;
 205       length = pmatch[0].rm_eo;
 206     }
 207   else
 208     {
 209       start = 0;
 210       length = strlen (string);
 211     }
 212
 213   lock_lock (dfa->lock);
 214   if (preg->no_sub)
 215     err = re_search_internal (preg, string, length, start, length,
 216                               length, 0, NULL, eflags);
 217   else
 218     err = re_search_internal (preg, string, length, start, length,
 219                               length, nmatch, pmatch, eflags);
 220   lock_unlock (dfa->lock);
 221   return err != REG_NOERROR;
 222 }
 223
 224 #ifdef _LIBC
 225 libc_hidden_def (__regexec)
 226
 227 # include <shlib-compat.h>
 228 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 229
 230 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 231 __typeof__ (__regexec) __compat_regexec;
 232
 233 int
 234 attribute_compat_text_section
 235 __compat_regexec (const regex_t *_Restrict_ preg,
 236                   const char *_Restrict_ string, size_t nmatch,
 237                   regmatch_t pmatch[], int eflags)
 238 {
 239   return regexec (preg, string, nmatch, pmatch,
 240                   eflags & (REG_NOTBOL | REG_NOTEOL));
 241 }
 242 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 243 # endif
 244 #endif
 245
 246 /* Entry points for GNU code.  */
 247
 248 /* re_match, re_search, re_match_2, re_search_2
 249
 250    The former two functions operate on STRING with length LENGTH,
 251    while the later two operate on concatenation of STRING1 and STRING2
 252    with lengths LENGTH1 and LENGTH2, respectively.
 253
 254    re_match() matches the compiled pattern in BUFP against the string,
 255    starting at index START.
 256
 257    re_search() first tries matching at index START, then it tries to match
 258    starting from index START + 1, and so on.  The last start position tried
 259    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 260    way as re_match().)
 261
 262    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 263    the first STOP characters of the concatenation of the strings should be
 264    concerned.
 265
 266    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 267    and all groups is stored in REGS.  (For the "_2" variants, the offsets are
 268    computed relative to the concatenation, not relative to the individual
 269    strings.)
 270
 271    On success, re_match* functions return the length of the match, re_search*
 272    return the position of the start of the match.  Return value -1 means no
 273    match was found and -2 indicates an internal error.  */
 274
 275 regoff_t
 276 re_match (struct re_pattern_buffer *bufp, const char *string, Idx length,
 277           Idx start, struct re_registers *regs)
 278 {
 279   return re_search_stub (bufp, string, length, start, 0, length, regs, true);
 280 }
 281 #ifdef _LIBC
 282 weak_alias (__re_match, re_match)
 283 #endif
 284
 285 regoff_t
 286 re_search (struct re_pattern_buffer *bufp, const char *string, Idx length,
 287            Idx start, regoff_t range, struct re_registers *regs)
 288 {
 289   return re_search_stub (bufp, string, length, start, range, length, regs,
 290                          false);
 291 }
 292 #ifdef _LIBC
 293 weak_alias (__re_search, re_search)
 294 #endif
 295
 296 regoff_t
 297 re_match_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
 298             const char *string2, Idx length2, Idx start,
 299             struct re_registers *regs, Idx stop)
 300 {
 301   return re_search_2_stub (bufp, string1, length1, string2, length2,
 302                            start, 0, regs, stop, true);
 303 }
 304 #ifdef _LIBC
 305 weak_alias (__re_match_2, re_match_2)
 306 #endif
 307
 308 regoff_t
 309 re_search_2 (struct re_pattern_buffer *bufp, const char *string1, Idx length1,
 310              const char *string2, Idx length2, Idx start, regoff_t range,
 311              struct re_registers *regs, Idx stop)
 312 {
 313   return re_search_2_stub (bufp, string1, length1, string2, length2,
 314                            start, range, regs, stop, false);
 315 }
 316 #ifdef _LIBC
 317 weak_alias (__re_search_2, re_search_2)
 318 #endif
 319
 320 static regoff_t
 321 re_search_2_stub (struct re_pattern_buffer *bufp, const char *string1,
 322                   Idx length1, const char *string2, Idx length2, Idx start,
 323                   regoff_t range, struct re_registers *regs,
 324                   Idx stop, bool ret_len)
 325 {
 326   const char *str;
 327   regoff_t rval;
 328   Idx len;
 329   char *s = NULL;
 330
 331   if (BE ((length1 < 0 || length2 < 0 || stop < 0
 332            || INT_ADD_WRAPV (length1, length2, &len)),
 333           0))
 334     return -2;
 335
 336   /* Concatenate the strings.  */
 337   if (length2 > 0)
 338     if (length1 > 0)
 339       {
 340         s = re_malloc (char, len);
 341
 342         if (BE (s == NULL, 0))
 343           return -2;
 344 #ifdef _LIBC
 345         memcpy (__mempcpy (s, string1, length1), string2, length2);
 346 #else
 347         memcpy (s, string1, length1);
 348         memcpy (s + length1, string2, length2);
 349 #endif
 350         str = s;
 351       }
 352     else
 353       str = string2;
 354   else
 355     str = string1;
 356
 357   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 358                          ret_len);
 359   re_free (s);
 360   return rval;
 361 }
 362
 363 /* The parameters have the same meaning as those of re_search.
 364    Additional parameters:
 365    If RET_LEN is true the length of the match is returned (re_match style);
 366    otherwise the position of the match is returned.  */
 367
 368 static regoff_t
 369 re_search_stub (struct re_pattern_buffer *bufp, const char *string, Idx length,
 370                 Idx start, regoff_t range, Idx stop, struct re_registers *regs,
 371                 bool ret_len)
 372 {
 373   reg_errcode_t result;
 374   regmatch_t *pmatch;
 375   Idx nregs;
 376   regoff_t rval;
 377   int eflags = 0;
 378   re_dfa_t *dfa = bufp->buffer;
 379   Idx last_start = start + range;
 380
 381   /* Check for out-of-range.  */
 382   if (BE (start < 0 || start > length, 0))
 383     return -1;
 384   if (BE (length < last_start || (0 <= range && last_start < start), 0))
 385     last_start = length;
 386   else if (BE (last_start < 0 || (range < 0 && start <= last_start), 0))
 387     last_start = 0;
 388
 389   lock_lock (dfa->lock);
 390
 391   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 392   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 393
 394   /* Compile fastmap if we haven't yet.  */
 395   if (start < last_start && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 396     re_compile_fastmap (bufp);
 397
 398   if (BE (bufp->no_sub, 0))
 399     regs = NULL;
 400
 401   /* We need at least 1 register.  */
 402   if (regs == NULL)
 403     nregs = 1;
 404   else if (BE (bufp->regs_allocated == REGS_FIXED
 405                && regs->num_regs <= bufp->re_nsub, 0))
 406     {
 407       nregs = regs->num_regs;
 408       if (BE (nregs < 1, 0))
 409         {
 410           /* Nothing can be copied to regs.  */
 411           regs = NULL;
 412           nregs = 1;
 413         }
 414     }
 415   else
 416     nregs = bufp->re_nsub + 1;
 417   pmatch = re_malloc (regmatch_t, nregs);
 418   if (BE (pmatch == NULL, 0))
 419     {
 420       rval = -2;
 421       goto out;
 422     }
 423
 424   result = re_search_internal (bufp, string, length, start, last_start, stop,
 425                                nregs, pmatch, eflags);
 426
 427   rval = 0;
 428
 429   /* I hope we needn't fill their regs with -1's when no match was found.  */
 430   if (result != REG_NOERROR)
 431     rval = result == REG_NOMATCH ? -1 : -2;
 432   else if (regs != NULL)
 433     {
 434       /* If caller wants register contents data back, copy them.  */
 435       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 436                                            bufp->regs_allocated);
 437       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 438         rval = -2;
 439     }
 440
 441   if (BE (rval == 0, 1))
 442     {
 443       if (ret_len)
 444         {
 445           assert (pmatch[0].rm_so == start);
 446           rval = pmatch[0].rm_eo - start;
 447         }
 448       else
 449         rval = pmatch[0].rm_so;
 450     }
 451   re_free (pmatch);
 452  out:
 453   lock_unlock (dfa->lock);
 454   return rval;
 455 }
 456
 457 static unsigned
 458 re_copy_regs (struct re_registers *regs, regmatch_t *pmatch, Idx nregs,
 459               int regs_allocated)
 460 {
 461   int rval = REGS_REALLOCATE;
 462   Idx i;
 463   Idx need_regs = nregs + 1;
 464   /* We need one extra element beyond 'num_regs' for the '-1' marker GNU code
 465      uses.  */
 466
 467   /* Have the register data arrays been allocated?  */
 468   if (regs_allocated == REGS_UNALLOCATED)
 469     { /* No.  So allocate them with malloc.  */
 470       regs->start = re_malloc (regoff_t, need_regs);
 471       if (BE (regs->start == NULL, 0))
 472         return REGS_UNALLOCATED;
 473       regs->end = re_malloc (regoff_t, need_regs);
 474       if (BE (regs->end == NULL, 0))
 475         {
 476           re_free (regs->start);
 477           return REGS_UNALLOCATED;
 478         }
 479       regs->num_regs = need_regs;
 480     }
 481   else if (regs_allocated == REGS_REALLOCATE)
 482     { /* Yes.  If we need more elements than were already
 483          allocated, reallocate them.  If we need fewer, just
 484          leave it alone.  */
 485       if (BE (need_regs > regs->num_regs, 0))
 486         {
 487           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 488           regoff_t *new_end;
 489           if (BE (new_start == NULL, 0))
 490             return REGS_UNALLOCATED;
 491           new_end = re_realloc (regs->end, regoff_t, need_regs);
 492           if (BE (new_end == NULL, 0))
 493             {
 494               re_free (new_start);
 495               return REGS_UNALLOCATED;
 496             }
 497           regs->start = new_start;
 498           regs->end = new_end;
 499           regs->num_regs = need_regs;
 500         }
 501     }
 502   else
 503     {
 504       assert (regs_allocated == REGS_FIXED);
 505       /* This function may not be called with REGS_FIXED and nregs too big.  */
 506       assert (regs->num_regs >= nregs);
 507       rval = REGS_FIXED;
 508     }
 509
 510   /* Copy the regs.  */
 511   for (i = 0; i < nregs; ++i)
 512     {
 513       regs->start[i] = pmatch[i].rm_so;
 514       regs->end[i] = pmatch[i].rm_eo;
 515     }
 516   for ( ; i < regs->num_regs; ++i)
 517     regs->start[i] = regs->end[i] = -1;
 518
 519   return rval;
 520 }
 521
 522 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 523    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 524    this memory for recording register information.  STARTS and ENDS
 525    must be allocated using the malloc library routine, and must each
 526    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 527
 528    If NUM_REGS == 0, then subsequent matches should allocate their own
 529    register data.
 530
 531    Unless this function is called, the first search or match using
 532    PATTERN_BUFFER will allocate its own register data, without
 533    freeing the old data.  */
 534
 535 void
 536 re_set_registers (struct re_pattern_buffer *bufp, struct re_registers *regs,
 537                   __re_size_t num_regs, regoff_t *starts, regoff_t *ends)
 538 {
 539   if (num_regs)
 540     {
 541       bufp->regs_allocated = REGS_REALLOCATE;
 542       regs->num_regs = num_regs;
 543       regs->start = starts;
 544       regs->end = ends;
 545     }
 546   else
 547     {
 548       bufp->regs_allocated = REGS_UNALLOCATED;
 549       regs->num_regs = 0;
 550       regs->start = regs->end = NULL;
 551     }
 552 }
 553 #ifdef _LIBC
 554 weak_alias (__re_set_registers, re_set_registers)
 555 #endif
 556 \f
 557 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 558    them unless specifically requested.  */
 559
 560 #if defined _REGEX_RE_COMP || defined _LIBC
 561 int
 562 # ifdef _LIBC
 563 weak_function
 564 # endif
 565 re_exec (const char *s)
 566 {
 567   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 568 }
 569 #endif /* _REGEX_RE_COMP */
 570 \f
 571 /* Internal entry point.  */
 572
 573 /* Searches for a compiled pattern PREG in the string STRING, whose
 574    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 575    meaning as with regexec.  LAST_START is START + RANGE, where
 576    START and RANGE have the same meaning as with re_search.
 577    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 578    otherwise return the error code.
 579    Note: We assume front end functions already check ranges.
 580    (0 <= LAST_START && LAST_START <= LENGTH)  */
 581
 582 static reg_errcode_t
 583 __attribute_warn_unused_result__
 584 re_search_internal (const regex_t *preg, const char *string, Idx length,
 585                     Idx start, Idx last_start, Idx stop, size_t nmatch,
 586                     regmatch_t pmatch[], int eflags)
 587 {
 588   reg_errcode_t err;
 589   const re_dfa_t *dfa = preg->buffer;
 590   Idx left_lim, right_lim;
 591   int incr;
 592   bool fl_longest_match;
 593   int match_kind;
 594   Idx match_first;
 595   Idx match_last = -1;
 596   Idx extra_nmatch;
 597   bool sb;
 598   int ch;
 599 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 600   re_match_context_t mctx = { .dfa = dfa };
 601 #else
 602   re_match_context_t mctx;
 603 #endif
 604   char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
 605                     && start != last_start && !preg->can_be_null)
 606                    ? preg->fastmap : NULL);
 607   RE_TRANSLATE_TYPE t = preg->translate;
 608
 609 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 610   memset (&mctx, '\0', sizeof (re_match_context_t));
 611   mctx.dfa = dfa;
 612 #endif
 613
 614   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 615   nmatch -= extra_nmatch;
 616
 617   /* Check if the DFA haven't been compiled.  */
 618   if (BE (preg->used == 0 || dfa->init_state == NULL
 619           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 620           || dfa->init_state_begbuf == NULL, 0))
 621     return REG_NOMATCH;
 622
 623 #ifdef DEBUG
 624   /* We assume front-end functions already check them.  */
 625   assert (0 <= last_start && last_start <= length);
 626 #endif
 627
 628   /* If initial states with non-begbuf contexts have no elements,
 629      the regex must be anchored.  If preg->newline_anchor is set,
 630      we'll never use init_state_nl, so do not check it.  */
 631   if (dfa->init_state->nodes.nelem == 0
 632       && dfa->init_state_word->nodes.nelem == 0
 633       && (dfa->init_state_nl->nodes.nelem == 0
 634           || !preg->newline_anchor))
 635     {
 636       if (start != 0 && last_start != 0)
 637         return REG_NOMATCH;
 638       start = last_start = 0;
 639     }
 640
 641   /* We must check the longest matching, if nmatch > 0.  */
 642   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 643
 644   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 645                             preg->translate, (preg->syntax & RE_ICASE) != 0,
 646                             dfa);
 647   if (BE (err != REG_NOERROR, 0))
 648     goto free_return;
 649   mctx.input.stop = stop;
 650   mctx.input.raw_stop = stop;
 651   mctx.input.newline_anchor = preg->newline_anchor;
 652
 653   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 654   if (BE (err != REG_NOERROR, 0))
 655     goto free_return;
 656
 657   /* We will log all the DFA states through which the dfa pass,
 658      if nmatch > 1, or this dfa has "multibyte node", which is a
 659      back-reference or a node which can accept multibyte character or
 660      multi character collating element.  */
 661   if (nmatch > 1 || dfa->has_mb_node)
 662     {
 663       /* Avoid overflow.  */
 664       if (BE ((MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *))
 665                <= mctx.input.bufs_len), 0))
 666         {
 667           err = REG_ESPACE;
 668           goto free_return;
 669         }
 670
 671       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 672       if (BE (mctx.state_log == NULL, 0))
 673         {
 674           err = REG_ESPACE;
 675           goto free_return;
 676         }
 677     }
 678   else
 679     mctx.state_log = NULL;
 680
 681   match_first = start;
 682   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 683                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 684
 685   /* Check incrementally whether the input string matches.  */
 686   incr = (last_start < start) ? -1 : 1;
 687   left_lim = (last_start < start) ? last_start : start;
 688   right_lim = (last_start < start) ? start : last_start;
 689   sb = dfa->mb_cur_max == 1;
 690   match_kind =
 691     (fastmap
 692      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 693         | (start <= last_start ? 2 : 0)
 694         | (t != NULL ? 1 : 0))
 695      : 8);
 696
 697   for (;; match_first += incr)
 698     {
 699       err = REG_NOMATCH;
 700       if (match_first < left_lim || right_lim < match_first)
 701         goto free_return;
 702
 703       /* Advance as rapidly as possible through the string, until we
 704          find a plausible place to start matching.  This may be done
 705          with varying efficiency, so there are various possibilities:
 706          only the most common of them are specialized, in order to
 707          save on code size.  We use a switch statement for speed.  */
 708       switch (match_kind)
 709         {
 710         case 8:
 711           /* No fastmap.  */
 712           break;
 713
 714         case 7:
 715           /* Fastmap with single-byte translation, match forward.  */
 716           while (BE (match_first < right_lim, 1)
 717                  && !fastmap[t[(unsigned char) string[match_first]]])
 718             ++match_first;
 719           goto forward_match_found_start_or_reached_end;
 720
 721         case 6:
 722           /* Fastmap without translation, match forward.  */
 723           while (BE (match_first < right_lim, 1)
 724                  && !fastmap[(unsigned char) string[match_first]])
 725             ++match_first;
 726
 727         forward_match_found_start_or_reached_end:
 728           if (BE (match_first == right_lim, 0))
 729             {
 730               ch = match_first >= length
 731                        ? 0 : (unsigned char) string[match_first];
 732               if (!fastmap[t ? t[ch] : ch])
 733                 goto free_return;
 734             }
 735           break;
 736
 737         case 4:
 738         case 5:
 739           /* Fastmap without multi-byte translation, match backwards.  */
 740           while (match_first >= left_lim)
 741             {
 742               ch = match_first >= length
 743                        ? 0 : (unsigned char) string[match_first];
 744               if (fastmap[t ? t[ch] : ch])
 745                 break;
 746               --match_first;
 747             }
 748           if (match_first < left_lim)
 749             goto free_return;
 750           break;
 751
 752         default:
 753           /* In this case, we can't determine easily the current byte,
 754              since it might be a component byte of a multibyte
 755              character.  Then we use the constructed buffer instead.  */
 756           for (;;)
 757             {
 758               /* If MATCH_FIRST is out of the valid range, reconstruct the
 759                  buffers.  */
 760               __re_size_t offset = match_first - mctx.input.raw_mbs_idx;
 761               if (BE (offset >= (__re_size_t) mctx.input.valid_raw_len, 0))
 762                 {
 763                   err = re_string_reconstruct (&mctx.input, match_first,
 764                                                eflags);
 765                   if (BE (err != REG_NOERROR, 0))
 766                     goto free_return;
 767
 768                   offset = match_first - mctx.input.raw_mbs_idx;
 769                 }
 770               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 771                  Note that MATCH_FIRST must not be smaller than 0.  */
 772               ch = (match_first >= length
 773                     ? 0 : re_string_byte_at (&mctx.input, offset));
 774               if (fastmap[ch])
 775                 break;
 776               match_first += incr;
 777               if (match_first < left_lim || match_first > right_lim)
 778                 {
 779                   err = REG_NOMATCH;
 780                   goto free_return;
 781                 }
 782             }
 783           break;
 784         }
 785
 786       /* Reconstruct the buffers so that the matcher can assume that
 787          the matching starts from the beginning of the buffer.  */
 788       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 789       if (BE (err != REG_NOERROR, 0))
 790         goto free_return;
 791
 792 #ifdef RE_ENABLE_I18N
 793      /* Don't consider this char as a possible match start if it part,
 794         yet isn't the head, of a multibyte character.  */
 795       if (!sb && !re_string_first_byte (&mctx.input, 0))
 796         continue;
 797 #endif
 798
 799       /* It seems to be appropriate one, then use the matcher.  */
 800       /* We assume that the matching starts from 0.  */
 801       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 802       match_last = check_matching (&mctx, fl_longest_match,
 803                                    start <= last_start ? &match_first : NULL);
 804       if (match_last != -1)
 805         {
 806           if (BE (match_last == -2, 0))
 807             {
 808               err = REG_ESPACE;
 809               goto free_return;
 810             }
 811           else
 812             {
 813               mctx.match_last = match_last;
 814               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 815                 {
 816                   re_dfastate_t *pstate = mctx.state_log[match_last];
 817                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 818                                                              match_last);
 819                 }
 820               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 821                   || dfa->nbackref)
 822                 {
 823                   err = prune_impossible_nodes (&mctx);
 824                   if (err == REG_NOERROR)
 825                     break;
 826                   if (BE (err != REG_NOMATCH, 0))
 827                     goto free_return;
 828                   match_last = -1;
 829                 }
 830               else
 831                 break; /* We found a match.  */
 832             }
 833         }
 834
 835       match_ctx_clean (&mctx);
 836     }
 837
 838 #ifdef DEBUG
 839   assert (match_last != -1);
 840   assert (err == REG_NOERROR);
 841 #endif
 842
 843   /* Set pmatch[] if we need.  */
 844   if (nmatch > 0)
 845     {
 846       Idx reg_idx;
 847
 848       /* Initialize registers.  */
 849       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 850         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 851
 852       /* Set the points where matching start/end.  */
 853       pmatch[0].rm_so = 0;
 854       pmatch[0].rm_eo = mctx.match_last;
 855       /* FIXME: This function should fail if mctx.match_last exceeds
 856          the maximum possible regoff_t value.  We need a new error
 857          code REG_OVERFLOW.  */
 858
 859       if (!preg->no_sub && nmatch > 1)
 860         {
 861           err = set_regs (preg, &mctx, nmatch, pmatch,
 862                           dfa->has_plural_match && dfa->nbackref > 0);
 863           if (BE (err != REG_NOERROR, 0))
 864             goto free_return;
 865         }
 866
 867       /* At last, add the offset to each register, since we slid
 868          the buffers so that we could assume that the matching starts
 869          from 0.  */
 870       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 871         if (pmatch[reg_idx].rm_so != -1)
 872           {
 873 #ifdef RE_ENABLE_I18N
 874             if (BE (mctx.input.offsets_needed != 0, 0))
 875               {
 876                 pmatch[reg_idx].rm_so =
 877                   (pmatch[reg_idx].rm_so == mctx.input.valid_len
 878                    ? mctx.input.valid_raw_len
 879                    : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 880                 pmatch[reg_idx].rm_eo =
 881                   (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 882                    ? mctx.input.valid_raw_len
 883                    : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 884               }
 885 #else
 886             assert (mctx.input.offsets_needed == 0);
 887 #endif
 888             pmatch[reg_idx].rm_so += match_first;
 889             pmatch[reg_idx].rm_eo += match_first;
 890           }
 891       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 892         {
 893           pmatch[nmatch + reg_idx].rm_so = -1;
 894           pmatch[nmatch + reg_idx].rm_eo = -1;
 895         }
 896
 897       if (dfa->subexp_map)
 898         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 899           if (dfa->subexp_map[reg_idx] != reg_idx)
 900             {
 901               pmatch[reg_idx + 1].rm_so
 902                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 903               pmatch[reg_idx + 1].rm_eo
 904                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 905             }
 906     }
 907
 908  free_return:
 909   re_free (mctx.state_log);
 910   if (dfa->nbackref)
 911     match_ctx_free (&mctx);
 912   re_string_destruct (&mctx.input);
 913   return err;
 914 }
 915
 916 static reg_errcode_t
 917 __attribute_warn_unused_result__
 918 prune_impossible_nodes (re_match_context_t *mctx)
 919 {
 920   const re_dfa_t *const dfa = mctx->dfa;
 921   Idx halt_node, match_last;
 922   reg_errcode_t ret;
 923   re_dfastate_t **sifted_states;
 924   re_dfastate_t **lim_states = NULL;
 925   re_sift_context_t sctx;
 926 #ifdef DEBUG
 927   assert (mctx->state_log != NULL);
 928 #endif
 929   match_last = mctx->match_last;
 930   halt_node = mctx->last_node;
 931
 932   /* Avoid overflow.  */
 933   if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) <= match_last, 0))
 934     return REG_ESPACE;
 935
 936   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 937   if (BE (sifted_states == NULL, 0))
 938     {
 939       ret = REG_ESPACE;
 940       goto free_return;
 941     }
 942   if (dfa->nbackref)
 943     {
 944       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 945       if (BE (lim_states == NULL, 0))
 946         {
 947           ret = REG_ESPACE;
 948           goto free_return;
 949         }
 950       while (1)
 951         {
 952           memset (lim_states, '\0',
 953                   sizeof (re_dfastate_t *) * (match_last + 1));
 954           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 955                          match_last);
 956           ret = sift_states_backward (mctx, &sctx);
 957           re_node_set_free (&sctx.limits);
 958           if (BE (ret != REG_NOERROR, 0))
 959               goto free_return;
 960           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 961             break;
 962           do
 963             {
 964               --match_last;
 965               if (match_last < 0)
 966                 {
 967                   ret = REG_NOMATCH;
 968                   goto free_return;
 969                 }
 970             } while (mctx->state_log[match_last] == NULL
 971                      || !mctx->state_log[match_last]->halt);
 972           halt_node = check_halt_state_context (mctx,
 973                                                 mctx->state_log[match_last],
 974                                                 match_last);
 975         }
 976       ret = merge_state_array (dfa, sifted_states, lim_states,
 977                                match_last + 1);
 978       re_free (lim_states);
 979       lim_states = NULL;
 980       if (BE (ret != REG_NOERROR, 0))
 981         goto free_return;
 982     }
 983   else
 984     {
 985       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
 986       ret = sift_states_backward (mctx, &sctx);
 987       re_node_set_free (&sctx.limits);
 988       if (BE (ret != REG_NOERROR, 0))
 989         goto free_return;
 990       if (sifted_states[0] == NULL)
 991         {
 992           ret = REG_NOMATCH;
 993           goto free_return;
 994         }
 995     }
 996   re_free (mctx->state_log);
 997   mctx->state_log = sifted_states;
 998   sifted_states = NULL;
 999   mctx->last_node = halt_node;
1000   mctx->match_last = match_last;
1001   ret = REG_NOERROR;
1002  free_return:
1003   re_free (sifted_states);
1004   re_free (lim_states);
1005   return ret;
1006 }
1007
1008 /* Acquire an initial state and return it.
1009    We must select appropriate initial state depending on the context,
1010    since initial states may have constraints like "\<", "^", etc..  */
1011
1012 static inline re_dfastate_t *
1013 __attribute__ ((always_inline))
1014 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1015                             Idx idx)
1016 {
1017   const re_dfa_t *const dfa = mctx->dfa;
1018   if (dfa->init_state->has_constraint)
1019     {
1020       unsigned int context;
1021       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1022       if (IS_WORD_CONTEXT (context))
1023         return dfa->init_state_word;
1024       else if (IS_ORDINARY_CONTEXT (context))
1025         return dfa->init_state;
1026       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1027         return dfa->init_state_begbuf;
1028       else if (IS_NEWLINE_CONTEXT (context))
1029         return dfa->init_state_nl;
1030       else if (IS_BEGBUF_CONTEXT (context))
1031         {
1032           /* It is relatively rare case, then calculate on demand.  */
1033           return re_acquire_state_context (err, dfa,
1034                                            dfa->init_state->entrance_nodes,
1035                                            context);
1036         }
1037       else
1038         /* Must not happen?  */
1039         return dfa->init_state;
1040     }
1041   else
1042     return dfa->init_state;
1043 }
1044
1045 /* Check whether the regular expression match input string INPUT or not,
1046    and return the index where the matching end.  Return -1 if
1047    there is no match, and return -2 in case of an error.
1048    FL_LONGEST_MATCH means we want the POSIX longest matching.
1049    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1050    next place where we may want to try matching.
1051    Note that the matcher assumes that the matching starts from the current
1052    index of the buffer.  */
1053
1054 static Idx
1055 __attribute_warn_unused_result__
1056 check_matching (re_match_context_t *mctx, bool fl_longest_match,
1057                 Idx *p_match_first)
1058 {
1059   const re_dfa_t *const dfa = mctx->dfa;
1060   reg_errcode_t err;
1061   Idx match = 0;
1062   Idx match_last = -1;
1063   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
1064   re_dfastate_t *cur_state;
1065   bool at_init_state = p_match_first != NULL;
1066   Idx next_start_idx = cur_str_idx;
1067
1068   err = REG_NOERROR;
1069   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1070   /* An initial state must not be NULL (invalid).  */
1071   if (BE (cur_state == NULL, 0))
1072     {
1073       assert (err == REG_ESPACE);
1074       return -2;
1075     }
1076
1077   if (mctx->state_log != NULL)
1078     {
1079       mctx->state_log[cur_str_idx] = cur_state;
1080
1081       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1082          later.  E.g. Processing back references.  */
1083       if (BE (dfa->nbackref, 0))
1084         {
1085           at_init_state = false;
1086           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1087           if (BE (err != REG_NOERROR, 0))
1088             return err;
1089
1090           if (cur_state->has_backref)
1091             {
1092               err = transit_state_bkref (mctx, &cur_state->nodes);
1093               if (BE (err != REG_NOERROR, 0))
1094                 return err;
1095             }
1096         }
1097     }
1098
1099   /* If the RE accepts NULL string.  */
1100   if (BE (cur_state->halt, 0))
1101     {
1102       if (!cur_state->has_constraint
1103           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1104         {
1105           if (!fl_longest_match)
1106             return cur_str_idx;
1107           else
1108             {
1109               match_last = cur_str_idx;
1110               match = 1;
1111             }
1112         }
1113     }
1114
1115   while (!re_string_eoi (&mctx->input))
1116     {
1117       re_dfastate_t *old_state = cur_state;
1118       Idx next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1119
1120       if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1121            && mctx->input.bufs_len < mctx->input.len)
1122           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1123               && mctx->input.valid_len < mctx->input.len))
1124         {
1125           err = extend_buffers (mctx, next_char_idx + 1);
1126           if (BE (err != REG_NOERROR, 0))
1127             {
1128               assert (err == REG_ESPACE);
1129               return -2;
1130             }
1131         }
1132
1133       cur_state = transit_state (&err, mctx, cur_state);
1134       if (mctx->state_log != NULL)
1135         cur_state = merge_state_with_log (&err, mctx, cur_state);
1136
1137       if (cur_state == NULL)
1138         {
1139           /* Reached the invalid state or an error.  Try to recover a valid
1140              state using the state log, if available and if we have not
1141              already found a valid (even if not the longest) match.  */
1142           if (BE (err != REG_NOERROR, 0))
1143             return -2;
1144
1145           if (mctx->state_log == NULL
1146               || (match && !fl_longest_match)
1147               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1148             break;
1149         }
1150
1151       if (BE (at_init_state, 0))
1152         {
1153           if (old_state == cur_state)
1154             next_start_idx = next_char_idx;
1155           else
1156             at_init_state = false;
1157         }
1158
1159       if (cur_state->halt)
1160         {
1161           /* Reached a halt state.
1162              Check the halt state can satisfy the current context.  */
1163           if (!cur_state->has_constraint
1164               || check_halt_state_context (mctx, cur_state,
1165                                            re_string_cur_idx (&mctx->input)))
1166             {
1167               /* We found an appropriate halt state.  */
1168               match_last = re_string_cur_idx (&mctx->input);
1169               match = 1;
1170
1171               /* We found a match, do not modify match_first below.  */
1172               p_match_first = NULL;
1173               if (!fl_longest_match)
1174                 break;
1175             }
1176         }
1177     }
1178
1179   if (p_match_first)
1180     *p_match_first += next_start_idx;
1181
1182   return match_last;
1183 }
1184
1185 /* Check NODE match the current context.  */
1186
1187 static bool
1188 check_halt_node_context (const re_dfa_t *dfa, Idx node, unsigned int context)
1189 {
1190   re_token_type_t type = dfa->nodes[node].type;
1191   unsigned int constraint = dfa->nodes[node].constraint;
1192   if (type != END_OF_RE)
1193     return false;
1194   if (!constraint)
1195     return true;
1196   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1197     return false;
1198   return true;
1199 }
1200
1201 /* Check the halt state STATE match the current context.
1202    Return 0 if not match, if the node, STATE has, is a halt node and
1203    match the context, return the node.  */
1204
1205 static Idx
1206 check_halt_state_context (const re_match_context_t *mctx,
1207                           const re_dfastate_t *state, Idx idx)
1208 {
1209   Idx i;
1210   unsigned int context;
1211 #ifdef DEBUG
1212   assert (state->halt);
1213 #endif
1214   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1215   for (i = 0; i < state->nodes.nelem; ++i)
1216     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1217       return state->nodes.elems[i];
1218   return 0;
1219 }
1220
1221 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1222    corresponding to the DFA).
1223    Return the destination node, and update EPS_VIA_NODES;
1224    return -1 in case of errors.  */
1225
1226 static Idx
1227 proceed_next_node (const re_match_context_t *mctx, Idx nregs, regmatch_t *regs,
1228                    Idx *pidx, Idx node, re_node_set *eps_via_nodes,
1229                    struct re_fail_stack_t *fs)
1230 {
1231   const re_dfa_t *const dfa = mctx->dfa;
1232   Idx i;
1233   bool ok;
1234   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1235     {
1236       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1237       re_node_set *edests = &dfa->edests[node];
1238       Idx dest_node;
1239       ok = re_node_set_insert (eps_via_nodes, node);
1240       if (BE (! ok, 0))
1241         return -2;
1242       /* Pick up a valid destination, or return -1 if none
1243          is found.  */
1244       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1245         {
1246           Idx candidate = edests->elems[i];
1247           if (!re_node_set_contains (cur_nodes, candidate))
1248             continue;
1249           if (dest_node == -1)
1250             dest_node = candidate;
1251
1252           else
1253             {
1254               /* In order to avoid infinite loop like "(a*)*", return the second
1255                  epsilon-transition if the first was already considered.  */
1256               if (re_node_set_contains (eps_via_nodes, dest_node))
1257                 return candidate;
1258
1259               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1260               else if (fs != NULL
1261                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1262                                            eps_via_nodes))
1263                 return -2;
1264
1265               /* We know we are going to exit.  */
1266               break;
1267             }
1268         }
1269       return dest_node;
1270     }
1271   else
1272     {
1273       Idx naccepted = 0;
1274       re_token_type_t type = dfa->nodes[node].type;
1275
1276 #ifdef RE_ENABLE_I18N
1277       if (dfa->nodes[node].accept_mb)
1278         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1279       else
1280 #endif /* RE_ENABLE_I18N */
1281       if (type == OP_BACK_REF)
1282         {
1283           Idx subexp_idx = dfa->nodes[node].opr.idx + 1;
1284           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1285           if (fs != NULL)
1286             {
1287               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1288                 return -1;
1289               else if (naccepted)
1290                 {
1291                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1292                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1293                               naccepted) != 0)
1294                     return -1;
1295                 }
1296             }
1297
1298           if (naccepted == 0)
1299             {
1300               Idx dest_node;
1301               ok = re_node_set_insert (eps_via_nodes, node);
1302               if (BE (! ok, 0))
1303                 return -2;
1304               dest_node = dfa->edests[node].elems[0];
1305               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1306                                         dest_node))
1307                 return dest_node;
1308             }
1309         }
1310
1311       if (naccepted != 0
1312           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1313         {
1314           Idx dest_node = dfa->nexts[node];
1315           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1316           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1317                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1318                                                dest_node)))
1319             return -1;
1320           re_node_set_empty (eps_via_nodes);
1321           return dest_node;
1322         }
1323     }
1324   return -1;
1325 }
1326
1327 static reg_errcode_t
1328 __attribute_warn_unused_result__
1329 push_fail_stack (struct re_fail_stack_t *fs, Idx str_idx, Idx dest_node,
1330                  Idx nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1331 {
1332   reg_errcode_t err;
1333   Idx num = fs->num++;
1334   if (fs->num == fs->alloc)
1335     {
1336       struct re_fail_stack_ent_t *new_array;
1337       new_array = re_realloc (fs->stack, struct re_fail_stack_ent_t,
1338                               fs->alloc * 2);
1339       if (new_array == NULL)
1340         return REG_ESPACE;
1341       fs->alloc *= 2;
1342       fs->stack = new_array;
1343     }
1344   fs->stack[num].idx = str_idx;
1345   fs->stack[num].node = dest_node;
1346   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1347   if (fs->stack[num].regs == NULL)
1348     return REG_ESPACE;
1349   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1350   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1351   return err;
1352 }
1353
1354 static Idx
1355 pop_fail_stack (struct re_fail_stack_t *fs, Idx *pidx, Idx nregs,
1356                 regmatch_t *regs, re_node_set *eps_via_nodes)
1357 {
1358   Idx num = --fs->num;
1359   assert (num >= 0);
1360   *pidx = fs->stack[num].idx;
1361   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1362   re_node_set_free (eps_via_nodes);
1363   re_free (fs->stack[num].regs);
1364   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1365   return fs->stack[num].node;
1366 }
1367
1368 /* Set the positions where the subexpressions are starts/ends to registers
1369    PMATCH.
1370    Note: We assume that pmatch[0] is already set, and
1371    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1372
1373 static reg_errcode_t
1374 __attribute_warn_unused_result__
1375 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1376           regmatch_t *pmatch, bool fl_backtrack)
1377 {
1378   const re_dfa_t *dfa = preg->buffer;
1379   Idx idx, cur_node;
1380   re_node_set eps_via_nodes;
1381   struct re_fail_stack_t *fs;
1382   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1383   regmatch_t *prev_idx_match;
1384   bool prev_idx_match_malloced = false;
1385
1386 #ifdef DEBUG
1387   assert (nmatch > 1);
1388   assert (mctx->state_log != NULL);
1389 #endif
1390   if (fl_backtrack)
1391     {
1392       fs = &fs_body;
1393       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1394       if (fs->stack == NULL)
1395         return REG_ESPACE;
1396     }
1397   else
1398     fs = NULL;
1399
1400   cur_node = dfa->init_node;
1401   re_node_set_init_empty (&eps_via_nodes);
1402
1403   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1404     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1405   else
1406     {
1407       prev_idx_match = re_malloc (regmatch_t, nmatch);
1408       if (prev_idx_match == NULL)
1409         {
1410           free_fail_stack_return (fs);
1411           return REG_ESPACE;
1412         }
1413       prev_idx_match_malloced = true;
1414     }
1415   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1416
1417   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1418     {
1419       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1420
1421       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1422         {
1423           Idx reg_idx;
1424           if (fs)
1425             {
1426               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1427                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1428                   break;
1429               if (reg_idx == nmatch)
1430                 {
1431                   re_node_set_free (&eps_via_nodes);
1432                   if (prev_idx_match_malloced)
1433                     re_free (prev_idx_match);
1434                   return free_fail_stack_return (fs);
1435                 }
1436               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1437                                          &eps_via_nodes);
1438             }
1439           else
1440             {
1441               re_node_set_free (&eps_via_nodes);
1442               if (prev_idx_match_malloced)
1443                 re_free (prev_idx_match);
1444               return REG_NOERROR;
1445             }
1446         }
1447
1448       /* Proceed to next node.  */
1449       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1450                                     &eps_via_nodes, fs);
1451
1452       if (BE (cur_node < 0, 0))
1453         {
1454           if (BE (cur_node == -2, 0))
1455             {
1456               re_node_set_free (&eps_via_nodes);
1457               if (prev_idx_match_malloced)
1458                 re_free (prev_idx_match);
1459               free_fail_stack_return (fs);
1460               return REG_ESPACE;
1461             }
1462           if (fs)
1463             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1464                                        &eps_via_nodes);
1465           else
1466             {
1467               re_node_set_free (&eps_via_nodes);
1468               if (prev_idx_match_malloced)
1469                 re_free (prev_idx_match);
1470               return REG_NOMATCH;
1471             }
1472         }
1473     }
1474   re_node_set_free (&eps_via_nodes);
1475   if (prev_idx_match_malloced)
1476     re_free (prev_idx_match);
1477   return free_fail_stack_return (fs);
1478 }
1479
1480 static reg_errcode_t
1481 free_fail_stack_return (struct re_fail_stack_t *fs)
1482 {
1483   if (fs)
1484     {
1485       Idx fs_idx;
1486       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1487         {
1488           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1489           re_free (fs->stack[fs_idx].regs);
1490         }
1491       re_free (fs->stack);
1492     }
1493   return REG_NOERROR;
1494 }
1495
1496 static void
1497 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1498              regmatch_t *prev_idx_match, Idx cur_node, Idx cur_idx, Idx nmatch)
1499 {
1500   int type = dfa->nodes[cur_node].type;
1501   if (type == OP_OPEN_SUBEXP)
1502     {
1503       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1504
1505       /* We are at the first node of this sub expression.  */
1506       if (reg_num < nmatch)
1507         {
1508           pmatch[reg_num].rm_so = cur_idx;
1509           pmatch[reg_num].rm_eo = -1;
1510         }
1511     }
1512   else if (type == OP_CLOSE_SUBEXP)
1513     {
1514       Idx reg_num = dfa->nodes[cur_node].opr.idx + 1;
1515       if (reg_num < nmatch)
1516         {
1517           /* We are at the last node of this sub expression.  */
1518           if (pmatch[reg_num].rm_so < cur_idx)
1519             {
1520               pmatch[reg_num].rm_eo = cur_idx;
1521               /* This is a non-empty match or we are not inside an optional
1522                  subexpression.  Accept this right away.  */
1523               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1524             }
1525           else
1526             {
1527               if (dfa->nodes[cur_node].opt_subexp
1528                   && prev_idx_match[reg_num].rm_so != -1)
1529                 /* We transited through an empty match for an optional
1530                    subexpression, like (a?)*, and this is not the subexp's
1531                    first match.  Copy back the old content of the registers
1532                    so that matches of an inner subexpression are undone as
1533                    well, like in ((a?))*.  */
1534                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1535               else
1536                 /* We completed a subexpression, but it may be part of
1537                    an optional one, so do not update PREV_IDX_MATCH.  */
1538                 pmatch[reg_num].rm_eo = cur_idx;
1539             }
1540         }
1541     }
1542 }
1543
1544 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1545    and sift the nodes in each states according to the following rules.
1546    Updated state_log will be wrote to STATE_LOG.
1547
1548    Rules: We throw away the Node 'a' in the STATE_LOG[STR_IDX] if...
1549      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1550         If 'a' isn't the LAST_NODE and 'a' can't epsilon transit to
1551         the LAST_NODE, we throw away the node 'a'.
1552      2. When 0 <= STR_IDX < MATCH_LAST and 'a' accepts
1553         string 's' and transit to 'b':
1554         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1555            away the node 'a'.
1556         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1557             thrown away, we throw away the node 'a'.
1558      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1559         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1560            node 'a'.
1561         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1562             we throw away the node 'a'.  */
1563
1564 #define STATE_NODE_CONTAINS(state,node) \
1565   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1566
1567 static reg_errcode_t
1568 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1569 {
1570   reg_errcode_t err;
1571   int null_cnt = 0;
1572   Idx str_idx = sctx->last_str_idx;
1573   re_node_set cur_dest;
1574
1575 #ifdef DEBUG
1576   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1577 #endif
1578
1579   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1580      transit to the last_node and the last_node itself.  */
1581   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1582   if (BE (err != REG_NOERROR, 0))
1583     return err;
1584   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1585   if (BE (err != REG_NOERROR, 0))
1586     goto free_return;
1587
1588   /* Then check each states in the state_log.  */
1589   while (str_idx > 0)
1590     {
1591       /* Update counters.  */
1592       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1593       if (null_cnt > mctx->max_mb_elem_len)
1594         {
1595           memset (sctx->sifted_states, '\0',
1596                   sizeof (re_dfastate_t *) * str_idx);
1597           re_node_set_free (&cur_dest);
1598           return REG_NOERROR;
1599         }
1600       re_node_set_empty (&cur_dest);
1601       --str_idx;
1602
1603       if (mctx->state_log[str_idx])
1604         {
1605           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1606           if (BE (err != REG_NOERROR, 0))
1607             goto free_return;
1608         }
1609
1610       /* Add all the nodes which satisfy the following conditions:
1611          - It can epsilon transit to a node in CUR_DEST.
1612          - It is in CUR_SRC.
1613          And update state_log.  */
1614       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1615       if (BE (err != REG_NOERROR, 0))
1616         goto free_return;
1617     }
1618   err = REG_NOERROR;
1619  free_return:
1620   re_node_set_free (&cur_dest);
1621   return err;
1622 }
1623
1624 static reg_errcode_t
1625 __attribute_warn_unused_result__
1626 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1627                      Idx str_idx, re_node_set *cur_dest)
1628 {
1629   const re_dfa_t *const dfa = mctx->dfa;
1630   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1631   Idx i;
1632
1633   /* Then build the next sifted state.
1634      We build the next sifted state on 'cur_dest', and update
1635      'sifted_states[str_idx]' with 'cur_dest'.
1636      Note:
1637      'cur_dest' is the sifted state from 'state_log[str_idx + 1]'.
1638      'cur_src' points the node_set of the old 'state_log[str_idx]'
1639      (with the epsilon nodes pre-filtered out).  */
1640   for (i = 0; i < cur_src->nelem; i++)
1641     {
1642       Idx prev_node = cur_src->elems[i];
1643       int naccepted = 0;
1644       bool ok;
1645
1646 #ifdef DEBUG
1647       re_token_type_t type = dfa->nodes[prev_node].type;
1648       assert (!IS_EPSILON_NODE (type));
1649 #endif
1650 #ifdef RE_ENABLE_I18N
1651       /* If the node may accept "multi byte".  */
1652       if (dfa->nodes[prev_node].accept_mb)
1653         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1654                                          str_idx, sctx->last_str_idx);
1655 #endif /* RE_ENABLE_I18N */
1656
1657       /* We don't check backreferences here.
1658          See update_cur_sifted_state().  */
1659       if (!naccepted
1660           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1661           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1662                                   dfa->nexts[prev_node]))
1663         naccepted = 1;
1664
1665       if (naccepted == 0)
1666         continue;
1667
1668       if (sctx->limits.nelem)
1669         {
1670           Idx to_idx = str_idx + naccepted;
1671           if (check_dst_limits (mctx, &sctx->limits,
1672                                 dfa->nexts[prev_node], to_idx,
1673                                 prev_node, str_idx))
1674             continue;
1675         }
1676       ok = re_node_set_insert (cur_dest, prev_node);
1677       if (BE (! ok, 0))
1678         return REG_ESPACE;
1679     }
1680
1681   return REG_NOERROR;
1682 }
1683
1684 /* Helper functions.  */
1685
1686 static reg_errcode_t
1687 clean_state_log_if_needed (re_match_context_t *mctx, Idx next_state_log_idx)
1688 {
1689   Idx top = mctx->state_log_top;
1690
1691   if ((next_state_log_idx >= mctx->input.bufs_len
1692        && mctx->input.bufs_len < mctx->input.len)
1693       || (next_state_log_idx >= mctx->input.valid_len
1694           && mctx->input.valid_len < mctx->input.len))
1695     {
1696       reg_errcode_t err;
1697       err = extend_buffers (mctx, next_state_log_idx + 1);
1698       if (BE (err != REG_NOERROR, 0))
1699         return err;
1700     }
1701
1702   if (top < next_state_log_idx)
1703     {
1704       memset (mctx->state_log + top + 1, '\0',
1705               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1706       mctx->state_log_top = next_state_log_idx;
1707     }
1708   return REG_NOERROR;
1709 }
1710
1711 static reg_errcode_t
1712 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1713                    re_dfastate_t **src, Idx num)
1714 {
1715   Idx st_idx;
1716   reg_errcode_t err;
1717   for (st_idx = 0; st_idx < num; ++st_idx)
1718     {
1719       if (dst[st_idx] == NULL)
1720         dst[st_idx] = src[st_idx];
1721       else if (src[st_idx] != NULL)
1722         {
1723           re_node_set merged_set;
1724           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1725                                         &src[st_idx]->nodes);
1726           if (BE (err != REG_NOERROR, 0))
1727             return err;
1728           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1729           re_node_set_free (&merged_set);
1730           if (BE (err != REG_NOERROR, 0))
1731             return err;
1732         }
1733     }
1734   return REG_NOERROR;
1735 }
1736
1737 static reg_errcode_t
1738 update_cur_sifted_state (const re_match_context_t *mctx,
1739                          re_sift_context_t *sctx, Idx str_idx,
1740                          re_node_set *dest_nodes)
1741 {
1742   const re_dfa_t *const dfa = mctx->dfa;
1743   reg_errcode_t err = REG_NOERROR;
1744   const re_node_set *candidates;
1745   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1746                 : &mctx->state_log[str_idx]->nodes);
1747
1748   if (dest_nodes->nelem == 0)
1749     sctx->sifted_states[str_idx] = NULL;
1750   else
1751     {
1752       if (candidates)
1753         {
1754           /* At first, add the nodes which can epsilon transit to a node in
1755              DEST_NODE.  */
1756           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1757           if (BE (err != REG_NOERROR, 0))
1758             return err;
1759
1760           /* Then, check the limitations in the current sift_context.  */
1761           if (sctx->limits.nelem)
1762             {
1763               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1764                                          mctx->bkref_ents, str_idx);
1765               if (BE (err != REG_NOERROR, 0))
1766                 return err;
1767             }
1768         }
1769
1770       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1771       if (BE (err != REG_NOERROR, 0))
1772         return err;
1773     }
1774
1775   if (candidates && mctx->state_log[str_idx]->has_backref)
1776     {
1777       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1778       if (BE (err != REG_NOERROR, 0))
1779         return err;
1780     }
1781   return REG_NOERROR;
1782 }
1783
1784 static reg_errcode_t
1785 __attribute_warn_unused_result__
1786 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1787                        const re_node_set *candidates)
1788 {
1789   reg_errcode_t err = REG_NOERROR;
1790   Idx i;
1791
1792   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1793   if (BE (err != REG_NOERROR, 0))
1794     return err;
1795
1796   if (!state->inveclosure.alloc)
1797     {
1798       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1799       if (BE (err != REG_NOERROR, 0))
1800         return REG_ESPACE;
1801       for (i = 0; i < dest_nodes->nelem; i++)
1802         {
1803           err = re_node_set_merge (&state->inveclosure,
1804                                    dfa->inveclosures + dest_nodes->elems[i]);
1805           if (BE (err != REG_NOERROR, 0))
1806             return REG_ESPACE;
1807         }
1808     }
1809   return re_node_set_add_intersect (dest_nodes, candidates,
1810                                     &state->inveclosure);
1811 }
1812
1813 static reg_errcode_t
1814 sub_epsilon_src_nodes (const re_dfa_t *dfa, Idx node, re_node_set *dest_nodes,
1815                        const re_node_set *candidates)
1816 {
1817     Idx ecl_idx;
1818     reg_errcode_t err;
1819     re_node_set *inv_eclosure = dfa->inveclosures + node;
1820     re_node_set except_nodes;
1821     re_node_set_init_empty (&except_nodes);
1822     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1823       {
1824         Idx cur_node = inv_eclosure->elems[ecl_idx];
1825         if (cur_node == node)
1826           continue;
1827         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1828           {
1829             Idx edst1 = dfa->edests[cur_node].elems[0];
1830             Idx edst2 = ((dfa->edests[cur_node].nelem > 1)
1831                          ? dfa->edests[cur_node].elems[1] : -1);
1832             if ((!re_node_set_contains (inv_eclosure, edst1)
1833                  && re_node_set_contains (dest_nodes, edst1))
1834                 || (edst2 > 0
1835                     && !re_node_set_contains (inv_eclosure, edst2)
1836                     && re_node_set_contains (dest_nodes, edst2)))
1837               {
1838                 err = re_node_set_add_intersect (&except_nodes, candidates,
1839                                                  dfa->inveclosures + cur_node);
1840                 if (BE (err != REG_NOERROR, 0))
1841                   {
1842                     re_node_set_free (&except_nodes);
1843                     return err;
1844                   }
1845               }
1846           }
1847       }
1848     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1849       {
1850         Idx cur_node = inv_eclosure->elems[ecl_idx];
1851         if (!re_node_set_contains (&except_nodes, cur_node))
1852           {
1853             Idx idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1854             re_node_set_remove_at (dest_nodes, idx);
1855           }
1856       }
1857     re_node_set_free (&except_nodes);
1858     return REG_NOERROR;
1859 }
1860
1861 static bool
1862 check_dst_limits (const re_match_context_t *mctx, const re_node_set *limits,
1863                   Idx dst_node, Idx dst_idx, Idx src_node, Idx src_idx)
1864 {
1865   const re_dfa_t *const dfa = mctx->dfa;
1866   Idx lim_idx, src_pos, dst_pos;
1867
1868   Idx dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1869   Idx src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1870   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1871     {
1872       Idx subexp_idx;
1873       struct re_backref_cache_entry *ent;
1874       ent = mctx->bkref_ents + limits->elems[lim_idx];
1875       subexp_idx = dfa->nodes[ent->node].opr.idx;
1876
1877       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1878                                            subexp_idx, dst_node, dst_idx,
1879                                            dst_bkref_idx);
1880       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1881                                            subexp_idx, src_node, src_idx,
1882                                            src_bkref_idx);
1883
1884       /* In case of:
1885          <src> <dst> ( <subexp> )
1886          ( <subexp> ) <src> <dst>
1887          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1888       if (src_pos == dst_pos)
1889         continue; /* This is unrelated limitation.  */
1890       else
1891         return true;
1892     }
1893   return false;
1894 }
1895
1896 static int
1897 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1898                              Idx subexp_idx, Idx from_node, Idx bkref_idx)
1899 {
1900   const re_dfa_t *const dfa = mctx->dfa;
1901   const re_node_set *eclosures = dfa->eclosures + from_node;
1902   Idx node_idx;
1903
1904   /* Else, we are on the boundary: examine the nodes on the epsilon
1905      closure.  */
1906   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1907     {
1908       Idx node = eclosures->elems[node_idx];
1909       switch (dfa->nodes[node].type)
1910         {
1911         case OP_BACK_REF:
1912           if (bkref_idx != -1)
1913             {
1914               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1915               do
1916                 {
1917                   Idx dst;
1918                   int cpos;
1919
1920                   if (ent->node != node)
1921                     continue;
1922
1923                   if (subexp_idx < BITSET_WORD_BITS
1924                       && !(ent->eps_reachable_subexps_map
1925                            & ((bitset_word_t) 1 << subexp_idx)))
1926                     continue;
1927
1928                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1929                      OP_CLOSE_SUBEXP cases below.  But, if the
1930                      destination node is the same node as the source
1931                      node, don't recurse because it would cause an
1932                      infinite loop: a regex that exhibits this behavior
1933                      is ()\1*\1*  */
1934                   dst = dfa->edests[node].elems[0];
1935                   if (dst == from_node)
1936                     {
1937                       if (boundaries & 1)
1938                         return -1;
1939                       else /* if (boundaries & 2) */
1940                         return 0;
1941                     }
1942
1943                   cpos =
1944                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1945                                                  dst, bkref_idx);
1946                   if (cpos == -1 /* && (boundaries & 1) */)
1947                     return -1;
1948                   if (cpos == 0 && (boundaries & 2))
1949                     return 0;
1950
1951                   if (subexp_idx < BITSET_WORD_BITS)
1952                     ent->eps_reachable_subexps_map
1953                       &= ~((bitset_word_t) 1 << subexp_idx);
1954                 }
1955               while (ent++->more);
1956             }
1957           break;
1958
1959         case OP_OPEN_SUBEXP:
1960           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
1961             return -1;
1962           break;
1963
1964         case OP_CLOSE_SUBEXP:
1965           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
1966             return 0;
1967           break;
1968
1969         default:
1970             break;
1971         }
1972     }
1973
1974   return (boundaries & 2) ? 1 : 0;
1975 }
1976
1977 static int
1978 check_dst_limits_calc_pos (const re_match_context_t *mctx, Idx limit,
1979                            Idx subexp_idx, Idx from_node, Idx str_idx,
1980                            Idx bkref_idx)
1981 {
1982   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
1983   int boundaries;
1984
1985   /* If we are outside the range of the subexpression, return -1 or 1.  */
1986   if (str_idx < lim->subexp_from)
1987     return -1;
1988
1989   if (lim->subexp_to < str_idx)
1990     return 1;
1991
1992   /* If we are within the subexpression, return 0.  */
1993   boundaries = (str_idx == lim->subexp_from);
1994   boundaries |= (str_idx == lim->subexp_to) << 1;
1995   if (boundaries == 0)
1996     return 0;
1997
1998   /* Else, examine epsilon closure.  */
1999   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2000                                       from_node, bkref_idx);
2001 }
2002
2003 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2004    which are against limitations from DEST_NODES. */
2005
2006 static reg_errcode_t
2007 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2008                      const re_node_set *candidates, re_node_set *limits,
2009                      struct re_backref_cache_entry *bkref_ents, Idx str_idx)
2010 {
2011   reg_errcode_t err;
2012   Idx node_idx, lim_idx;
2013
2014   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2015     {
2016       Idx subexp_idx;
2017       struct re_backref_cache_entry *ent;
2018       ent = bkref_ents + limits->elems[lim_idx];
2019
2020       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2021         continue; /* This is unrelated limitation.  */
2022
2023       subexp_idx = dfa->nodes[ent->node].opr.idx;
2024       if (ent->subexp_to == str_idx)
2025         {
2026           Idx ops_node = -1;
2027           Idx cls_node = -1;
2028           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2029             {
2030               Idx node = dest_nodes->elems[node_idx];
2031               re_token_type_t type = dfa->nodes[node].type;
2032               if (type == OP_OPEN_SUBEXP
2033                   && subexp_idx == dfa->nodes[node].opr.idx)
2034                 ops_node = node;
2035               else if (type == OP_CLOSE_SUBEXP
2036                        && subexp_idx == dfa->nodes[node].opr.idx)
2037                 cls_node = node;
2038             }
2039
2040           /* Check the limitation of the open subexpression.  */
2041           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2042           if (ops_node >= 0)
2043             {
2044               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2045                                            candidates);
2046               if (BE (err != REG_NOERROR, 0))
2047                 return err;
2048             }
2049
2050           /* Check the limitation of the close subexpression.  */
2051           if (cls_node >= 0)
2052             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2053               {
2054                 Idx node = dest_nodes->elems[node_idx];
2055                 if (!re_node_set_contains (dfa->inveclosures + node,
2056                                            cls_node)
2057                     && !re_node_set_contains (dfa->eclosures + node,
2058                                               cls_node))
2059                   {
2060                     /* It is against this limitation.
2061                        Remove it form the current sifted state.  */
2062                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2063                                                  candidates);
2064                     if (BE (err != REG_NOERROR, 0))
2065                       return err;
2066                     --node_idx;
2067                   }
2068               }
2069         }
2070       else /* (ent->subexp_to != str_idx)  */
2071         {
2072           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2073             {
2074               Idx node = dest_nodes->elems[node_idx];
2075               re_token_type_t type = dfa->nodes[node].type;
2076               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2077                 {
2078                   if (subexp_idx != dfa->nodes[node].opr.idx)
2079                     continue;
2080                   /* It is against this limitation.
2081                      Remove it form the current sifted state.  */
2082                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2083                                                candidates);
2084                   if (BE (err != REG_NOERROR, 0))
2085                     return err;
2086                 }
2087             }
2088         }
2089     }
2090   return REG_NOERROR;
2091 }
2092
2093 static reg_errcode_t
2094 __attribute_warn_unused_result__
2095 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2096                    Idx str_idx, const re_node_set *candidates)
2097 {
2098   const re_dfa_t *const dfa = mctx->dfa;
2099   reg_errcode_t err;
2100   Idx node_idx, node;
2101   re_sift_context_t local_sctx;
2102   Idx first_idx = search_cur_bkref_entry (mctx, str_idx);
2103
2104   if (first_idx == -1)
2105     return REG_NOERROR;
2106
2107   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2108
2109   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2110     {
2111       Idx enabled_idx;
2112       re_token_type_t type;
2113       struct re_backref_cache_entry *entry;
2114       node = candidates->elems[node_idx];
2115       type = dfa->nodes[node].type;
2116       /* Avoid infinite loop for the REs like "()\1+".  */
2117       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2118         continue;
2119       if (type != OP_BACK_REF)
2120         continue;
2121
2122       entry = mctx->bkref_ents + first_idx;
2123       enabled_idx = first_idx;
2124       do
2125         {
2126           Idx subexp_len;
2127           Idx to_idx;
2128           Idx dst_node;
2129           bool ok;
2130           re_dfastate_t *cur_state;
2131
2132           if (entry->node != node)
2133             continue;
2134           subexp_len = entry->subexp_to - entry->subexp_from;
2135           to_idx = str_idx + subexp_len;
2136           dst_node = (subexp_len ? dfa->nexts[node]
2137                       : dfa->edests[node].elems[0]);
2138
2139           if (to_idx > sctx->last_str_idx
2140               || sctx->sifted_states[to_idx] == NULL
2141               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2142               || check_dst_limits (mctx, &sctx->limits, node,
2143                                    str_idx, dst_node, to_idx))
2144             continue;
2145
2146           if (local_sctx.sifted_states == NULL)
2147             {
2148               local_sctx = *sctx;
2149               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2150               if (BE (err != REG_NOERROR, 0))
2151                 goto free_return;
2152             }
2153           local_sctx.last_node = node;
2154           local_sctx.last_str_idx = str_idx;
2155           ok = re_node_set_insert (&local_sctx.limits, enabled_idx);
2156           if (BE (! ok, 0))
2157             {
2158               err = REG_ESPACE;
2159               goto free_return;
2160             }
2161           cur_state = local_sctx.sifted_states[str_idx];
2162           err = sift_states_backward (mctx, &local_sctx);
2163           if (BE (err != REG_NOERROR, 0))
2164             goto free_return;
2165           if (sctx->limited_states != NULL)
2166             {
2167               err = merge_state_array (dfa, sctx->limited_states,
2168                                        local_sctx.sifted_states,
2169                                        str_idx + 1);
2170               if (BE (err != REG_NOERROR, 0))
2171                 goto free_return;
2172             }
2173           local_sctx.sifted_states[str_idx] = cur_state;
2174           re_node_set_remove (&local_sctx.limits, enabled_idx);
2175
2176           /* mctx->bkref_ents may have changed, reload the pointer.  */
2177           entry = mctx->bkref_ents + enabled_idx;
2178         }
2179       while (enabled_idx++, entry++->more);
2180     }
2181   err = REG_NOERROR;
2182  free_return:
2183   if (local_sctx.sifted_states != NULL)
2184     {
2185       re_node_set_free (&local_sctx.limits);
2186     }
2187
2188   return err;
2189 }
2190
2191
2192 #ifdef RE_ENABLE_I18N
2193 static int
2194 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2195                      Idx node_idx, Idx str_idx, Idx max_str_idx)
2196 {
2197   const re_dfa_t *const dfa = mctx->dfa;
2198   int naccepted;
2199   /* Check the node can accept "multi byte".  */
2200   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2201   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2202       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2203                             dfa->nexts[node_idx]))
2204     /* The node can't accept the "multi byte", or the
2205        destination was already thrown away, then the node
2206        could't accept the current input "multi byte".   */
2207     naccepted = 0;
2208   /* Otherwise, it is sure that the node could accept
2209      'naccepted' bytes input.  */
2210   return naccepted;
2211 }
2212 #endif /* RE_ENABLE_I18N */
2213
2214 \f
2215 /* Functions for state transition.  */
2216
2217 /* Return the next state to which the current state STATE will transit by
2218    accepting the current input byte, and update STATE_LOG if necessary.
2219    If STATE can accept a multibyte char/collating element/back reference
2220    update the destination of STATE_LOG.  */
2221
2222 static re_dfastate_t *
2223 __attribute_warn_unused_result__
2224 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2225                re_dfastate_t *state)
2226 {
2227   re_dfastate_t **trtable;
2228   unsigned char ch;
2229
2230 #ifdef RE_ENABLE_I18N
2231   /* If the current state can accept multibyte.  */
2232   if (BE (state->accept_mb, 0))
2233     {
2234       *err = transit_state_mb (mctx, state);
2235       if (BE (*err != REG_NOERROR, 0))
2236         return NULL;
2237     }
2238 #endif /* RE_ENABLE_I18N */
2239
2240   /* Then decide the next state with the single byte.  */
2241 #if 0
2242   if (0)
2243     /* don't use transition table  */
2244     return transit_state_sb (err, mctx, state);
2245 #endif
2246
2247   /* Use transition table  */
2248   ch = re_string_fetch_byte (&mctx->input);
2249   for (;;)
2250     {
2251       trtable = state->trtable;
2252       if (BE (trtable != NULL, 1))
2253         return trtable[ch];
2254
2255       trtable = state->word_trtable;
2256       if (BE (trtable != NULL, 1))
2257         {
2258           unsigned int context;
2259           context
2260             = re_string_context_at (&mctx->input,
2261                                     re_string_cur_idx (&mctx->input) - 1,
2262                                     mctx->eflags);
2263           if (IS_WORD_CONTEXT (context))
2264             return trtable[ch + SBC_MAX];
2265           else
2266             return trtable[ch];
2267         }
2268
2269       if (!build_trtable (mctx->dfa, state))
2270         {
2271           *err = REG_ESPACE;
2272           return NULL;
2273         }
2274
2275       /* Retry, we now have a transition table.  */
2276     }
2277 }
2278
2279 /* Update the state_log if we need */
2280 static re_dfastate_t *
2281 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2282                       re_dfastate_t *next_state)
2283 {
2284   const re_dfa_t *const dfa = mctx->dfa;
2285   Idx cur_idx = re_string_cur_idx (&mctx->input);
2286
2287   if (cur_idx > mctx->state_log_top)
2288     {
2289       mctx->state_log[cur_idx] = next_state;
2290       mctx->state_log_top = cur_idx;
2291     }
2292   else if (mctx->state_log[cur_idx] == 0)
2293     {
2294       mctx->state_log[cur_idx] = next_state;
2295     }
2296   else
2297     {
2298       re_dfastate_t *pstate;
2299       unsigned int context;
2300       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2301       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2302          the destination of a multibyte char/collating element/
2303          back reference.  Then the next state is the union set of
2304          these destinations and the results of the transition table.  */
2305       pstate = mctx->state_log[cur_idx];
2306       log_nodes = pstate->entrance_nodes;
2307       if (next_state != NULL)
2308         {
2309           table_nodes = next_state->entrance_nodes;
2310           *err = re_node_set_init_union (&next_nodes, table_nodes,
2311                                              log_nodes);
2312           if (BE (*err != REG_NOERROR, 0))
2313             return NULL;
2314         }
2315       else
2316         next_nodes = *log_nodes;
2317       /* Note: We already add the nodes of the initial state,
2318          then we don't need to add them here.  */
2319
2320       context = re_string_context_at (&mctx->input,
2321                                       re_string_cur_idx (&mctx->input) - 1,
2322                                       mctx->eflags);
2323       next_state = mctx->state_log[cur_idx]
2324         = re_acquire_state_context (err, dfa, &next_nodes, context);
2325       /* We don't need to check errors here, since the return value of
2326          this function is next_state and ERR is already set.  */
2327
2328       if (table_nodes != NULL)
2329         re_node_set_free (&next_nodes);
2330     }
2331
2332   if (BE (dfa->nbackref, 0) && next_state != NULL)
2333     {
2334       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2335          later.  We must check them here, since the back references in the
2336          next state might use them.  */
2337       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2338                                         cur_idx);
2339       if (BE (*err != REG_NOERROR, 0))
2340         return NULL;
2341
2342       /* If the next state has back references.  */
2343       if (next_state->has_backref)
2344         {
2345           *err = transit_state_bkref (mctx, &next_state->nodes);
2346           if (BE (*err != REG_NOERROR, 0))
2347             return NULL;
2348           next_state = mctx->state_log[cur_idx];
2349         }
2350     }
2351
2352   return next_state;
2353 }
2354
2355 /* Skip bytes in the input that correspond to part of a
2356    multi-byte match, then look in the log for a state
2357    from which to restart matching.  */
2358 static re_dfastate_t *
2359 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2360 {
2361   re_dfastate_t *cur_state;
2362   do
2363     {
2364       Idx max = mctx->state_log_top;
2365       Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2366
2367       do
2368         {
2369           if (++cur_str_idx > max)
2370             return NULL;
2371           re_string_skip_bytes (&mctx->input, 1);
2372         }
2373       while (mctx->state_log[cur_str_idx] == NULL);
2374
2375       cur_state = merge_state_with_log (err, mctx, NULL);
2376     }
2377   while (*err == REG_NOERROR && cur_state == NULL);
2378   return cur_state;
2379 }
2380
2381 /* Helper functions for transit_state.  */
2382
2383 /* From the node set CUR_NODES, pick up the nodes whose types are
2384    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2385    expression. And register them to use them later for evaluating the
2386    corresponding back references.  */
2387
2388 static reg_errcode_t
2389 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2390                            Idx str_idx)
2391 {
2392   const re_dfa_t *const dfa = mctx->dfa;
2393   Idx node_idx;
2394   reg_errcode_t err;
2395
2396   /* TODO: This isn't efficient.
2397            Because there might be more than one nodes whose types are
2398            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2399            nodes.
2400            E.g. RE: (a){2}  */
2401   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2402     {
2403       Idx node = cur_nodes->elems[node_idx];
2404       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2405           && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2406           && (dfa->used_bkref_map
2407               & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2408         {
2409           err = match_ctx_add_subtop (mctx, node, str_idx);
2410           if (BE (err != REG_NOERROR, 0))
2411             return err;
2412         }
2413     }
2414   return REG_NOERROR;
2415 }
2416
2417 #if 0
2418 /* Return the next state to which the current state STATE will transit by
2419    accepting the current input byte.  */
2420
2421 static re_dfastate_t *
2422 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2423                   re_dfastate_t *state)
2424 {
2425   const re_dfa_t *const dfa = mctx->dfa;
2426   re_node_set next_nodes;
2427   re_dfastate_t *next_state;
2428   Idx node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2429   unsigned int context;
2430
2431   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2432   if (BE (*err != REG_NOERROR, 0))
2433     return NULL;
2434   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2435     {
2436       Idx cur_node = state->nodes.elems[node_cnt];
2437       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2438         {
2439           *err = re_node_set_merge (&next_nodes,
2440                                     dfa->eclosures + dfa->nexts[cur_node]);
2441           if (BE (*err != REG_NOERROR, 0))
2442             {
2443               re_node_set_free (&next_nodes);
2444               return NULL;
2445             }
2446         }
2447     }
2448   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2449   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2450   /* We don't need to check errors here, since the return value of
2451      this function is next_state and ERR is already set.  */
2452
2453   re_node_set_free (&next_nodes);
2454   re_string_skip_bytes (&mctx->input, 1);
2455   return next_state;
2456 }
2457 #endif
2458
2459 #ifdef RE_ENABLE_I18N
2460 static reg_errcode_t
2461 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2462 {
2463   const re_dfa_t *const dfa = mctx->dfa;
2464   reg_errcode_t err;
2465   Idx i;
2466
2467   for (i = 0; i < pstate->nodes.nelem; ++i)
2468     {
2469       re_node_set dest_nodes, *new_nodes;
2470       Idx cur_node_idx = pstate->nodes.elems[i];
2471       int naccepted;
2472       Idx dest_idx;
2473       unsigned int context;
2474       re_dfastate_t *dest_state;
2475
2476       if (!dfa->nodes[cur_node_idx].accept_mb)
2477         continue;
2478
2479       if (dfa->nodes[cur_node_idx].constraint)
2480         {
2481           context = re_string_context_at (&mctx->input,
2482                                           re_string_cur_idx (&mctx->input),
2483                                           mctx->eflags);
2484           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2485                                            context))
2486             continue;
2487         }
2488
2489       /* How many bytes the node can accept?  */
2490       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2491                                            re_string_cur_idx (&mctx->input));
2492       if (naccepted == 0)
2493         continue;
2494
2495       /* The node can accepts 'naccepted' bytes.  */
2496       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2497       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2498                                : mctx->max_mb_elem_len);
2499       err = clean_state_log_if_needed (mctx, dest_idx);
2500       if (BE (err != REG_NOERROR, 0))
2501         return err;
2502 #ifdef DEBUG
2503       assert (dfa->nexts[cur_node_idx] != -1);
2504 #endif
2505       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2506
2507       dest_state = mctx->state_log[dest_idx];
2508       if (dest_state == NULL)
2509         dest_nodes = *new_nodes;
2510       else
2511         {
2512           err = re_node_set_init_union (&dest_nodes,
2513                                         dest_state->entrance_nodes, new_nodes);
2514           if (BE (err != REG_NOERROR, 0))
2515             return err;
2516         }
2517       context = re_string_context_at (&mctx->input, dest_idx - 1,
2518                                       mctx->eflags);
2519       mctx->state_log[dest_idx]
2520         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2521       if (dest_state != NULL)
2522         re_node_set_free (&dest_nodes);
2523       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2524         return err;
2525     }
2526   return REG_NOERROR;
2527 }
2528 #endif /* RE_ENABLE_I18N */
2529
2530 static reg_errcode_t
2531 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2532 {
2533   const re_dfa_t *const dfa = mctx->dfa;
2534   reg_errcode_t err;
2535   Idx i;
2536   Idx cur_str_idx = re_string_cur_idx (&mctx->input);
2537
2538   for (i = 0; i < nodes->nelem; ++i)
2539     {
2540       Idx dest_str_idx, prev_nelem, bkc_idx;
2541       Idx node_idx = nodes->elems[i];
2542       unsigned int context;
2543       const re_token_t *node = dfa->nodes + node_idx;
2544       re_node_set *new_dest_nodes;
2545
2546       /* Check whether 'node' is a backreference or not.  */
2547       if (node->type != OP_BACK_REF)
2548         continue;
2549
2550       if (node->constraint)
2551         {
2552           context = re_string_context_at (&mctx->input, cur_str_idx,
2553                                           mctx->eflags);
2554           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2555             continue;
2556         }
2557
2558       /* 'node' is a backreference.
2559          Check the substring which the substring matched.  */
2560       bkc_idx = mctx->nbkref_ents;
2561       err = get_subexp (mctx, node_idx, cur_str_idx);
2562       if (BE (err != REG_NOERROR, 0))
2563         goto free_return;
2564
2565       /* And add the epsilon closures (which is 'new_dest_nodes') of
2566          the backreference to appropriate state_log.  */
2567 #ifdef DEBUG
2568       assert (dfa->nexts[node_idx] != -1);
2569 #endif
2570       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2571         {
2572           Idx subexp_len;
2573           re_dfastate_t *dest_state;
2574           struct re_backref_cache_entry *bkref_ent;
2575           bkref_ent = mctx->bkref_ents + bkc_idx;
2576           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2577             continue;
2578           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2579           new_dest_nodes = (subexp_len == 0
2580                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2581                             : dfa->eclosures + dfa->nexts[node_idx]);
2582           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2583                           - bkref_ent->subexp_from);
2584           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2585                                           mctx->eflags);
2586           dest_state = mctx->state_log[dest_str_idx];
2587           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2588                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2589           /* Add 'new_dest_node' to state_log.  */
2590           if (dest_state == NULL)
2591             {
2592               mctx->state_log[dest_str_idx]
2593                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2594                                             context);
2595               if (BE (mctx->state_log[dest_str_idx] == NULL
2596                       && err != REG_NOERROR, 0))
2597                 goto free_return;
2598             }
2599           else
2600             {
2601               re_node_set dest_nodes;
2602               err = re_node_set_init_union (&dest_nodes,
2603                                             dest_state->entrance_nodes,
2604                                             new_dest_nodes);
2605               if (BE (err != REG_NOERROR, 0))
2606                 {
2607                   re_node_set_free (&dest_nodes);
2608                   goto free_return;
2609                 }
2610               mctx->state_log[dest_str_idx]
2611                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2612               re_node_set_free (&dest_nodes);
2613               if (BE (mctx->state_log[dest_str_idx] == NULL
2614                       && err != REG_NOERROR, 0))
2615                 goto free_return;
2616             }
2617           /* We need to check recursively if the backreference can epsilon
2618              transit.  */
2619           if (subexp_len == 0
2620               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2621             {
2622               err = check_subexp_matching_top (mctx, new_dest_nodes,
2623                                                cur_str_idx);
2624               if (BE (err != REG_NOERROR, 0))
2625                 goto free_return;
2626               err = transit_state_bkref (mctx, new_dest_nodes);
2627               if (BE (err != REG_NOERROR, 0))
2628                 goto free_return;
2629             }
2630         }
2631     }
2632   err = REG_NOERROR;
2633  free_return:
2634   return err;
2635 }
2636
2637 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2638    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2639    Note that we might collect inappropriate candidates here.
2640    However, the cost of checking them strictly here is too high, then we
2641    delay these checking for prune_impossible_nodes().  */
2642
2643 static reg_errcode_t
2644 __attribute_warn_unused_result__
2645 get_subexp (re_match_context_t *mctx, Idx bkref_node, Idx bkref_str_idx)
2646 {
2647   const re_dfa_t *const dfa = mctx->dfa;
2648   Idx subexp_num, sub_top_idx;
2649   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2650   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2651   Idx cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2652   if (cache_idx != -1)
2653     {
2654       const struct re_backref_cache_entry *entry
2655         = mctx->bkref_ents + cache_idx;
2656       do
2657         if (entry->node == bkref_node)
2658           return REG_NOERROR; /* We already checked it.  */
2659       while (entry++->more);
2660     }
2661
2662   subexp_num = dfa->nodes[bkref_node].opr.idx;
2663
2664   /* For each sub expression  */
2665   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2666     {
2667       reg_errcode_t err;
2668       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2669       re_sub_match_last_t *sub_last;
2670       Idx sub_last_idx, sl_str, bkref_str_off;
2671
2672       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2673         continue; /* It isn't related.  */
2674
2675       sl_str = sub_top->str_idx;
2676       bkref_str_off = bkref_str_idx;
2677       /* At first, check the last node of sub expressions we already
2678          evaluated.  */
2679       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2680         {
2681           regoff_t sl_str_diff;
2682           sub_last = sub_top->lasts[sub_last_idx];
2683           sl_str_diff = sub_last->str_idx - sl_str;
2684           /* The matched string by the sub expression match with the substring
2685              at the back reference?  */
2686           if (sl_str_diff > 0)
2687             {
2688               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2689                 {
2690                   /* Not enough chars for a successful match.  */
2691                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2692                     break;
2693
2694                   err = clean_state_log_if_needed (mctx,
2695                                                    bkref_str_off
2696                                                    + sl_str_diff);
2697                   if (BE (err != REG_NOERROR, 0))
2698                     return err;
2699                   buf = (const char *) re_string_get_buffer (&mctx->input);
2700                 }
2701               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2702                 /* We don't need to search this sub expression any more.  */
2703                 break;
2704             }
2705           bkref_str_off += sl_str_diff;
2706           sl_str += sl_str_diff;
2707           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2708                                 bkref_str_idx);
2709
2710           /* Reload buf, since the preceding call might have reallocated
2711              the buffer.  */
2712           buf = (const char *) re_string_get_buffer (&mctx->input);
2713
2714           if (err == REG_NOMATCH)
2715             continue;
2716           if (BE (err != REG_NOERROR, 0))
2717             return err;
2718         }
2719
2720       if (sub_last_idx < sub_top->nlasts)
2721         continue;
2722       if (sub_last_idx > 0)
2723         ++sl_str;
2724       /* Then, search for the other last nodes of the sub expression.  */
2725       for (; sl_str <= bkref_str_idx; ++sl_str)
2726         {
2727           Idx cls_node;
2728           regoff_t sl_str_off;
2729           const re_node_set *nodes;
2730           sl_str_off = sl_str - sub_top->str_idx;
2731           /* The matched string by the sub expression match with the substring
2732              at the back reference?  */
2733           if (sl_str_off > 0)
2734             {
2735               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2736                 {
2737                   /* If we are at the end of the input, we cannot match.  */
2738                   if (bkref_str_off >= mctx->input.len)
2739                     break;
2740
2741                   err = extend_buffers (mctx, bkref_str_off + 1);
2742                   if (BE (err != REG_NOERROR, 0))
2743                     return err;
2744
2745                   buf = (const char *) re_string_get_buffer (&mctx->input);
2746                 }
2747               if (buf [bkref_str_off++] != buf[sl_str - 1])
2748                 break; /* We don't need to search this sub expression
2749                           any more.  */
2750             }
2751           if (mctx->state_log[sl_str] == NULL)
2752             continue;
2753           /* Does this state have a ')' of the sub expression?  */
2754           nodes = &mctx->state_log[sl_str]->nodes;
2755           cls_node = find_subexp_node (dfa, nodes, subexp_num,
2756                                        OP_CLOSE_SUBEXP);
2757           if (cls_node == -1)
2758             continue; /* No.  */
2759           if (sub_top->path == NULL)
2760             {
2761               sub_top->path = calloc (sizeof (state_array_t),
2762                                       sl_str - sub_top->str_idx + 1);
2763               if (sub_top->path == NULL)
2764                 return REG_ESPACE;
2765             }
2766           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2767              in the current context?  */
2768           err = check_arrival (mctx, sub_top->path, sub_top->node,
2769                                sub_top->str_idx, cls_node, sl_str,
2770                                OP_CLOSE_SUBEXP);
2771           if (err == REG_NOMATCH)
2772               continue;
2773           if (BE (err != REG_NOERROR, 0))
2774               return err;
2775           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2776           if (BE (sub_last == NULL, 0))
2777             return REG_ESPACE;
2778           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2779                                 bkref_str_idx);
2780           if (err == REG_NOMATCH)
2781             continue;
2782         }
2783     }
2784   return REG_NOERROR;
2785 }
2786
2787 /* Helper functions for get_subexp().  */
2788
2789 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2790    If it can arrive, register the sub expression expressed with SUB_TOP
2791    and SUB_LAST.  */
2792
2793 static reg_errcode_t
2794 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2795                 re_sub_match_last_t *sub_last, Idx bkref_node, Idx bkref_str)
2796 {
2797   reg_errcode_t err;
2798   Idx to_idx;
2799   /* Can the subexpression arrive the back reference?  */
2800   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2801                        sub_last->str_idx, bkref_node, bkref_str,
2802                        OP_OPEN_SUBEXP);
2803   if (err != REG_NOERROR)
2804     return err;
2805   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2806                              sub_last->str_idx);
2807   if (BE (err != REG_NOERROR, 0))
2808     return err;
2809   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2810   return clean_state_log_if_needed (mctx, to_idx);
2811 }
2812
2813 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2814    Search '(' if FL_OPEN, or search ')' otherwise.
2815    TODO: This function isn't efficient...
2816          Because there might be more than one nodes whose types are
2817          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2818          nodes.
2819          E.g. RE: (a){2}  */
2820
2821 static Idx
2822 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2823                   Idx subexp_idx, int type)
2824 {
2825   Idx cls_idx;
2826   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2827     {
2828       Idx cls_node = nodes->elems[cls_idx];
2829       const re_token_t *node = dfa->nodes + cls_node;
2830       if (node->type == type
2831           && node->opr.idx == subexp_idx)
2832         return cls_node;
2833     }
2834   return -1;
2835 }
2836
2837 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2838    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2839    heavily reused.
2840    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2841
2842 static reg_errcode_t
2843 __attribute_warn_unused_result__
2844 check_arrival (re_match_context_t *mctx, state_array_t *path, Idx top_node,
2845                Idx top_str, Idx last_node, Idx last_str, int type)
2846 {
2847   const re_dfa_t *const dfa = mctx->dfa;
2848   reg_errcode_t err = REG_NOERROR;
2849   Idx subexp_num, backup_cur_idx, str_idx, null_cnt;
2850   re_dfastate_t *cur_state = NULL;
2851   re_node_set *cur_nodes, next_nodes;
2852   re_dfastate_t **backup_state_log;
2853   unsigned int context;
2854
2855   subexp_num = dfa->nodes[top_node].opr.idx;
2856   /* Extend the buffer if we need.  */
2857   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2858     {
2859       re_dfastate_t **new_array;
2860       Idx old_alloc = path->alloc;
2861       Idx incr_alloc = last_str + mctx->max_mb_elem_len + 1;
2862       Idx new_alloc;
2863       if (BE (IDX_MAX - old_alloc < incr_alloc, 0))
2864         return REG_ESPACE;
2865       new_alloc = old_alloc + incr_alloc;
2866       if (BE (SIZE_MAX / sizeof (re_dfastate_t *) < new_alloc, 0))
2867         return REG_ESPACE;
2868       new_array = re_realloc (path->array, re_dfastate_t *, new_alloc);
2869       if (BE (new_array == NULL, 0))
2870         return REG_ESPACE;
2871       path->array = new_array;
2872       path->alloc = new_alloc;
2873       memset (new_array + old_alloc, '\0',
2874               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2875     }
2876
2877   str_idx = path->next_idx ? path->next_idx : top_str;
2878
2879   /* Temporary modify MCTX.  */
2880   backup_state_log = mctx->state_log;
2881   backup_cur_idx = mctx->input.cur_idx;
2882   mctx->state_log = path->array;
2883   mctx->input.cur_idx = str_idx;
2884
2885   /* Setup initial node set.  */
2886   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2887   if (str_idx == top_str)
2888     {
2889       err = re_node_set_init_1 (&next_nodes, top_node);
2890       if (BE (err != REG_NOERROR, 0))
2891         return err;
2892       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2893       if (BE (err != REG_NOERROR, 0))
2894         {
2895           re_node_set_free (&next_nodes);
2896           return err;
2897         }
2898     }
2899   else
2900     {
2901       cur_state = mctx->state_log[str_idx];
2902       if (cur_state && cur_state->has_backref)
2903         {
2904           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2905           if (BE (err != REG_NOERROR, 0))
2906             return err;
2907         }
2908       else
2909         re_node_set_init_empty (&next_nodes);
2910     }
2911   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2912     {
2913       if (next_nodes.nelem)
2914         {
2915           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2916                                     subexp_num, type);
2917           if (BE (err != REG_NOERROR, 0))
2918             {
2919               re_node_set_free (&next_nodes);
2920               return err;
2921             }
2922         }
2923       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2924       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2925         {
2926           re_node_set_free (&next_nodes);
2927           return err;
2928         }
2929       mctx->state_log[str_idx] = cur_state;
2930     }
2931
2932   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2933     {
2934       re_node_set_empty (&next_nodes);
2935       if (mctx->state_log[str_idx + 1])
2936         {
2937           err = re_node_set_merge (&next_nodes,
2938                                    &mctx->state_log[str_idx + 1]->nodes);
2939           if (BE (err != REG_NOERROR, 0))
2940             {
2941               re_node_set_free (&next_nodes);
2942               return err;
2943             }
2944         }
2945       if (cur_state)
2946         {
2947           err = check_arrival_add_next_nodes (mctx, str_idx,
2948                                               &cur_state->non_eps_nodes,
2949                                               &next_nodes);
2950           if (BE (err != REG_NOERROR, 0))
2951             {
2952               re_node_set_free (&next_nodes);
2953               return err;
2954             }
2955         }
2956       ++str_idx;
2957       if (next_nodes.nelem)
2958         {
2959           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2960           if (BE (err != REG_NOERROR, 0))
2961             {
2962               re_node_set_free (&next_nodes);
2963               return err;
2964             }
2965           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2966                                     subexp_num, type);
2967           if (BE (err != REG_NOERROR, 0))
2968             {
2969               re_node_set_free (&next_nodes);
2970               return err;
2971             }
2972         }
2973       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2974       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2975       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2976         {
2977           re_node_set_free (&next_nodes);
2978           return err;
2979         }
2980       mctx->state_log[str_idx] = cur_state;
2981       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
2982     }
2983   re_node_set_free (&next_nodes);
2984   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
2985                : &mctx->state_log[last_str]->nodes);
2986   path->next_idx = str_idx;
2987
2988   /* Fix MCTX.  */
2989   mctx->state_log = backup_state_log;
2990   mctx->input.cur_idx = backup_cur_idx;
2991
2992   /* Then check the current node set has the node LAST_NODE.  */
2993   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
2994     return REG_NOERROR;
2995
2996   return REG_NOMATCH;
2997 }
2998
2999 /* Helper functions for check_arrival.  */
3000
3001 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3002    to NEXT_NODES.
3003    TODO: This function is similar to the functions transit_state*(),
3004          however this function has many additional works.
3005          Can't we unify them?  */
3006
3007 static reg_errcode_t
3008 __attribute_warn_unused_result__
3009 check_arrival_add_next_nodes (re_match_context_t *mctx, Idx str_idx,
3010                               re_node_set *cur_nodes, re_node_set *next_nodes)
3011 {
3012   const re_dfa_t *const dfa = mctx->dfa;
3013   bool ok;
3014   Idx cur_idx;
3015 #ifdef RE_ENABLE_I18N
3016   reg_errcode_t err = REG_NOERROR;
3017 #endif
3018   re_node_set union_set;
3019   re_node_set_init_empty (&union_set);
3020   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3021     {
3022       int naccepted = 0;
3023       Idx cur_node = cur_nodes->elems[cur_idx];
3024 #ifdef DEBUG
3025       re_token_type_t type = dfa->nodes[cur_node].type;
3026       assert (!IS_EPSILON_NODE (type));
3027 #endif
3028 #ifdef RE_ENABLE_I18N
3029       /* If the node may accept "multi byte".  */
3030       if (dfa->nodes[cur_node].accept_mb)
3031         {
3032           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3033                                                str_idx);
3034           if (naccepted > 1)
3035             {
3036               re_dfastate_t *dest_state;
3037               Idx next_node = dfa->nexts[cur_node];
3038               Idx next_idx = str_idx + naccepted;
3039               dest_state = mctx->state_log[next_idx];
3040               re_node_set_empty (&union_set);
3041               if (dest_state)
3042                 {
3043                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3044                   if (BE (err != REG_NOERROR, 0))
3045                     {
3046                       re_node_set_free (&union_set);
3047                       return err;
3048                     }
3049                 }
3050               ok = re_node_set_insert (&union_set, next_node);
3051               if (BE (! ok, 0))
3052                 {
3053                   re_node_set_free (&union_set);
3054                   return REG_ESPACE;
3055                 }
3056               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3057                                                             &union_set);
3058               if (BE (mctx->state_log[next_idx] == NULL
3059                       && err != REG_NOERROR, 0))
3060                 {
3061                   re_node_set_free (&union_set);
3062                   return err;
3063                 }
3064             }
3065         }
3066 #endif /* RE_ENABLE_I18N */
3067       if (naccepted
3068           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3069         {
3070           ok = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3071           if (BE (! ok, 0))
3072             {
3073               re_node_set_free (&union_set);
3074               return REG_ESPACE;
3075             }
3076         }
3077     }
3078   re_node_set_free (&union_set);
3079   return REG_NOERROR;
3080 }
3081
3082 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3083    CUR_NODES, however exclude the nodes which are:
3084     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3085     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3086 */
3087
3088 static reg_errcode_t
3089 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3090                           Idx ex_subexp, int type)
3091 {
3092   reg_errcode_t err;
3093   Idx idx, outside_node;
3094   re_node_set new_nodes;
3095 #ifdef DEBUG
3096   assert (cur_nodes->nelem);
3097 #endif
3098   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3099   if (BE (err != REG_NOERROR, 0))
3100     return err;
3101   /* Create a new node set NEW_NODES with the nodes which are epsilon
3102      closures of the node in CUR_NODES.  */
3103
3104   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3105     {
3106       Idx cur_node = cur_nodes->elems[idx];
3107       const re_node_set *eclosure = dfa->eclosures + cur_node;
3108       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3109       if (outside_node == -1)
3110         {
3111           /* There are no problematic nodes, just merge them.  */
3112           err = re_node_set_merge (&new_nodes, eclosure);
3113           if (BE (err != REG_NOERROR, 0))
3114             {
3115               re_node_set_free (&new_nodes);
3116               return err;
3117             }
3118         }
3119       else
3120         {
3121           /* There are problematic nodes, re-calculate incrementally.  */
3122           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3123                                               ex_subexp, type);
3124           if (BE (err != REG_NOERROR, 0))
3125             {
3126               re_node_set_free (&new_nodes);
3127               return err;
3128             }
3129         }
3130     }
3131   re_node_set_free (cur_nodes);
3132   *cur_nodes = new_nodes;
3133   return REG_NOERROR;
3134 }
3135
3136 /* Helper function for check_arrival_expand_ecl.
3137    Check incrementally the epsilon closure of TARGET, and if it isn't
3138    problematic append it to DST_NODES.  */
3139
3140 static reg_errcode_t
3141 __attribute_warn_unused_result__
3142 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3143                               Idx target, Idx ex_subexp, int type)
3144 {
3145   Idx cur_node;
3146   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3147     {
3148       bool ok;
3149
3150       if (dfa->nodes[cur_node].type == type
3151           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3152         {
3153           if (type == OP_CLOSE_SUBEXP)
3154             {
3155               ok = re_node_set_insert (dst_nodes, cur_node);
3156               if (BE (! ok, 0))
3157                 return REG_ESPACE;
3158             }
3159           break;
3160         }
3161       ok = re_node_set_insert (dst_nodes, cur_node);
3162       if (BE (! ok, 0))
3163         return REG_ESPACE;
3164       if (dfa->edests[cur_node].nelem == 0)
3165         break;
3166       if (dfa->edests[cur_node].nelem == 2)
3167         {
3168           reg_errcode_t err;
3169           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3170                                               dfa->edests[cur_node].elems[1],
3171                                               ex_subexp, type);
3172           if (BE (err != REG_NOERROR, 0))
3173             return err;
3174         }
3175       cur_node = dfa->edests[cur_node].elems[0];
3176     }
3177   return REG_NOERROR;
3178 }
3179
3180
3181 /* For all the back references in the current state, calculate the
3182    destination of the back references by the appropriate entry
3183    in MCTX->BKREF_ENTS.  */
3184
3185 static reg_errcode_t
3186 __attribute_warn_unused_result__
3187 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3188                     Idx cur_str, Idx subexp_num, int type)
3189 {
3190   const re_dfa_t *const dfa = mctx->dfa;
3191   reg_errcode_t err;
3192   Idx cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3193   struct re_backref_cache_entry *ent;
3194
3195   if (cache_idx_start == -1)
3196     return REG_NOERROR;
3197
3198  restart:
3199   ent = mctx->bkref_ents + cache_idx_start;
3200   do
3201     {
3202       Idx to_idx, next_node;
3203
3204       /* Is this entry ENT is appropriate?  */
3205       if (!re_node_set_contains (cur_nodes, ent->node))
3206         continue; /* No.  */
3207
3208       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3209       /* Calculate the destination of the back reference, and append it
3210          to MCTX->STATE_LOG.  */
3211       if (to_idx == cur_str)
3212         {
3213           /* The backreference did epsilon transit, we must re-check all the
3214              node in the current state.  */
3215           re_node_set new_dests;
3216           reg_errcode_t err2, err3;
3217           next_node = dfa->edests[ent->node].elems[0];
3218           if (re_node_set_contains (cur_nodes, next_node))
3219             continue;
3220           err = re_node_set_init_1 (&new_dests, next_node);
3221           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3222           err3 = re_node_set_merge (cur_nodes, &new_dests);
3223           re_node_set_free (&new_dests);
3224           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3225                   || err3 != REG_NOERROR, 0))
3226             {
3227               err = (err != REG_NOERROR ? err
3228                      : (err2 != REG_NOERROR ? err2 : err3));
3229               return err;
3230             }
3231           /* TODO: It is still inefficient...  */
3232           goto restart;
3233         }
3234       else
3235         {
3236           re_node_set union_set;
3237           next_node = dfa->nexts[ent->node];
3238           if (mctx->state_log[to_idx])
3239             {
3240               bool ok;
3241               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3242                                         next_node))
3243                 continue;
3244               err = re_node_set_init_copy (&union_set,
3245                                            &mctx->state_log[to_idx]->nodes);
3246               ok = re_node_set_insert (&union_set, next_node);
3247               if (BE (err != REG_NOERROR || ! ok, 0))
3248                 {
3249                   re_node_set_free (&union_set);
3250                   err = err != REG_NOERROR ? err : REG_ESPACE;
3251                   return err;
3252                 }
3253             }
3254           else
3255             {
3256               err = re_node_set_init_1 (&union_set, next_node);
3257               if (BE (err != REG_NOERROR, 0))
3258                 return err;
3259             }
3260           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3261           re_node_set_free (&union_set);
3262           if (BE (mctx->state_log[to_idx] == NULL
3263                   && err != REG_NOERROR, 0))
3264             return err;
3265         }
3266     }
3267   while (ent++->more);
3268   return REG_NOERROR;
3269 }
3270
3271 /* Build transition table for the state.
3272    Return true if successful.  */
3273
3274 static bool
3275 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3276 {
3277   reg_errcode_t err;
3278   Idx i, j;
3279   int ch;
3280   bool need_word_trtable = false;
3281   bitset_word_t elem, mask;
3282   bool dests_node_malloced = false;
3283   bool dest_states_malloced = false;
3284   Idx ndests; /* Number of the destination states from 'state'.  */
3285   re_dfastate_t **trtable;
3286   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3287   re_node_set follows, *dests_node;
3288   bitset_t *dests_ch;
3289   bitset_t acceptable;
3290
3291   struct dests_alloc
3292   {
3293     re_node_set dests_node[SBC_MAX];
3294     bitset_t dests_ch[SBC_MAX];
3295   } *dests_alloc;
3296
3297   /* We build DFA states which corresponds to the destination nodes
3298      from 'state'.  'dests_node[i]' represents the nodes which i-th
3299      destination state contains, and 'dests_ch[i]' represents the
3300      characters which i-th destination state accepts.  */
3301   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3302     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3303   else
3304     {
3305       dests_alloc = re_malloc (struct dests_alloc, 1);
3306       if (BE (dests_alloc == NULL, 0))
3307         return false;
3308       dests_node_malloced = true;
3309     }
3310   dests_node = dests_alloc->dests_node;
3311   dests_ch = dests_alloc->dests_ch;
3312
3313   /* Initialize transition table.  */
3314   state->word_trtable = state->trtable = NULL;
3315
3316   /* At first, group all nodes belonging to 'state' into several
3317      destinations.  */
3318   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3319   if (BE (ndests <= 0, 0))
3320     {
3321       if (dests_node_malloced)
3322         re_free (dests_alloc);
3323       /* Return false in case of an error, true otherwise.  */
3324       if (ndests == 0)
3325         {
3326           state->trtable = (re_dfastate_t **)
3327             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3328           if (BE (state->trtable == NULL, 0))
3329             return false;
3330           return true;
3331         }
3332       return false;
3333     }
3334
3335   err = re_node_set_alloc (&follows, ndests + 1);
3336   if (BE (err != REG_NOERROR, 0))
3337     goto out_free;
3338
3339   /* Avoid arithmetic overflow in size calculation.  */
3340   if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3341             / (3 * sizeof (re_dfastate_t *)))
3342            < ndests),
3343           0))
3344     goto out_free;
3345
3346   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3347                          + ndests * 3 * sizeof (re_dfastate_t *)))
3348     dest_states = (re_dfastate_t **)
3349       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3350   else
3351     {
3352       dest_states = re_malloc (re_dfastate_t *, ndests * 3);
3353       if (BE (dest_states == NULL, 0))
3354         {
3355 out_free:
3356           if (dest_states_malloced)
3357             re_free (dest_states);
3358           re_node_set_free (&follows);
3359           for (i = 0; i < ndests; ++i)
3360             re_node_set_free (dests_node + i);
3361           if (dests_node_malloced)
3362             re_free (dests_alloc);
3363           return false;
3364         }
3365       dest_states_malloced = true;
3366     }
3367   dest_states_word = dest_states + ndests;
3368   dest_states_nl = dest_states_word + ndests;
3369   bitset_empty (acceptable);
3370
3371   /* Then build the states for all destinations.  */
3372   for (i = 0; i < ndests; ++i)
3373     {
3374       Idx next_node;
3375       re_node_set_empty (&follows);
3376       /* Merge the follows of this destination states.  */
3377       for (j = 0; j < dests_node[i].nelem; ++j)
3378         {
3379           next_node = dfa->nexts[dests_node[i].elems[j]];
3380           if (next_node != -1)
3381             {
3382               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3383               if (BE (err != REG_NOERROR, 0))
3384                 goto out_free;
3385             }
3386         }
3387       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3388       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3389         goto out_free;
3390       /* If the new state has context constraint,
3391          build appropriate states for these contexts.  */
3392       if (dest_states[i]->has_constraint)
3393         {
3394           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3395                                                           CONTEXT_WORD);
3396           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3397             goto out_free;
3398
3399           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3400             need_word_trtable = true;
3401
3402           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3403                                                         CONTEXT_NEWLINE);
3404           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3405             goto out_free;
3406         }
3407       else
3408         {
3409           dest_states_word[i] = dest_states[i];
3410           dest_states_nl[i] = dest_states[i];
3411         }
3412       bitset_merge (acceptable, dests_ch[i]);
3413     }
3414
3415   if (!BE (need_word_trtable, 0))
3416     {
3417       /* We don't care about whether the following character is a word
3418          character, or we are in a single-byte character set so we can
3419          discern by looking at the character code: allocate a
3420          256-entry transition table.  */
3421       trtable = state->trtable =
3422         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3423       if (BE (trtable == NULL, 0))
3424         goto out_free;
3425
3426       /* For all characters ch...:  */
3427       for (i = 0; i < BITSET_WORDS; ++i)
3428         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3429              elem;
3430              mask <<= 1, elem >>= 1, ++ch)
3431           if (BE (elem & 1, 0))
3432             {
3433               /* There must be exactly one destination which accepts
3434                  character ch.  See group_nodes_into_DFAstates.  */
3435               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3436                 ;
3437
3438               /* j-th destination accepts the word character ch.  */
3439               if (dfa->word_char[i] & mask)
3440                 trtable[ch] = dest_states_word[j];
3441               else
3442                 trtable[ch] = dest_states[j];
3443             }
3444     }
3445   else
3446     {
3447       /* We care about whether the following character is a word
3448          character, and we are in a multi-byte character set: discern
3449          by looking at the character code: build two 256-entry
3450          transition tables, one starting at trtable[0] and one
3451          starting at trtable[SBC_MAX].  */
3452       trtable = state->word_trtable =
3453         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3454       if (BE (trtable == NULL, 0))
3455         goto out_free;
3456
3457       /* For all characters ch...:  */
3458       for (i = 0; i < BITSET_WORDS; ++i)
3459         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3460              elem;
3461              mask <<= 1, elem >>= 1, ++ch)
3462           if (BE (elem & 1, 0))
3463             {
3464               /* There must be exactly one destination which accepts
3465                  character ch.  See group_nodes_into_DFAstates.  */
3466               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3467                 ;
3468
3469               /* j-th destination accepts the word character ch.  */
3470               trtable[ch] = dest_states[j];
3471               trtable[ch + SBC_MAX] = dest_states_word[j];
3472             }
3473     }
3474
3475   /* new line */
3476   if (bitset_contain (acceptable, NEWLINE_CHAR))
3477     {
3478       /* The current state accepts newline character.  */
3479       for (j = 0; j < ndests; ++j)
3480         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3481           {
3482             /* k-th destination accepts newline character.  */
3483             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3484             if (need_word_trtable)
3485               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3486             /* There must be only one destination which accepts
3487                newline.  See group_nodes_into_DFAstates.  */
3488             break;
3489           }
3490     }
3491
3492   if (dest_states_malloced)
3493     re_free (dest_states);
3494
3495   re_node_set_free (&follows);
3496   for (i = 0; i < ndests; ++i)
3497     re_node_set_free (dests_node + i);
3498
3499   if (dests_node_malloced)
3500     re_free (dests_alloc);
3501
3502   return true;
3503 }
3504
3505 /* Group all nodes belonging to STATE into several destinations.
3506    Then for all destinations, set the nodes belonging to the destination
3507    to DESTS_NODE[i] and set the characters accepted by the destination
3508    to DEST_CH[i].  This function return the number of destinations.  */
3509
3510 static Idx
3511 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3512                             re_node_set *dests_node, bitset_t *dests_ch)
3513 {
3514   reg_errcode_t err;
3515   bool ok;
3516   Idx i, j, k;
3517   Idx ndests; /* Number of the destinations from 'state'.  */
3518   bitset_t accepts; /* Characters a node can accept.  */
3519   const re_node_set *cur_nodes = &state->nodes;
3520   bitset_empty (accepts);
3521   ndests = 0;
3522
3523   /* For all the nodes belonging to 'state',  */
3524   for (i = 0; i < cur_nodes->nelem; ++i)
3525     {
3526       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3527       re_token_type_t type = node->type;
3528       unsigned int constraint = node->constraint;
3529
3530       /* Enumerate all single byte character this node can accept.  */
3531       if (type == CHARACTER)
3532         bitset_set (accepts, node->opr.c);
3533       else if (type == SIMPLE_BRACKET)
3534         {
3535           bitset_merge (accepts, node->opr.sbcset);
3536         }
3537       else if (type == OP_PERIOD)
3538         {
3539 #ifdef RE_ENABLE_I18N
3540           if (dfa->mb_cur_max > 1)
3541             bitset_merge (accepts, dfa->sb_char);
3542           else
3543 #endif
3544             bitset_set_all (accepts);
3545           if (!(dfa->syntax & RE_DOT_NEWLINE))
3546             bitset_clear (accepts, '\n');
3547           if (dfa->syntax & RE_DOT_NOT_NULL)
3548             bitset_clear (accepts, '\0');
3549         }
3550 #ifdef RE_ENABLE_I18N
3551       else if (type == OP_UTF8_PERIOD)
3552         {
3553           if (ASCII_CHARS % BITSET_WORD_BITS == 0)
3554             memset (accepts, -1, ASCII_CHARS / CHAR_BIT);
3555           else
3556             bitset_merge (accepts, utf8_sb_map);
3557           if (!(dfa->syntax & RE_DOT_NEWLINE))
3558             bitset_clear (accepts, '\n');
3559           if (dfa->syntax & RE_DOT_NOT_NULL)
3560             bitset_clear (accepts, '\0');
3561         }
3562 #endif
3563       else
3564         continue;
3565
3566       /* Check the 'accepts' and sift the characters which are not
3567          match it the context.  */
3568       if (constraint)
3569         {
3570           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3571             {
3572               bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3573               bitset_empty (accepts);
3574               if (accepts_newline)
3575                 bitset_set (accepts, NEWLINE_CHAR);
3576               else
3577                 continue;
3578             }
3579           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3580             {
3581               bitset_empty (accepts);
3582               continue;
3583             }
3584
3585           if (constraint & NEXT_WORD_CONSTRAINT)
3586             {
3587               bitset_word_t any_set = 0;
3588               if (type == CHARACTER && !node->word_char)
3589                 {
3590                   bitset_empty (accepts);
3591                   continue;
3592                 }
3593 #ifdef RE_ENABLE_I18N
3594               if (dfa->mb_cur_max > 1)
3595                 for (j = 0; j < BITSET_WORDS; ++j)
3596                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3597               else
3598 #endif
3599                 for (j = 0; j < BITSET_WORDS; ++j)
3600                   any_set |= (accepts[j] &= dfa->word_char[j]);
3601               if (!any_set)
3602                 continue;
3603             }
3604           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3605             {
3606               bitset_word_t any_set = 0;
3607               if (type == CHARACTER && node->word_char)
3608                 {
3609                   bitset_empty (accepts);
3610                   continue;
3611                 }
3612 #ifdef RE_ENABLE_I18N
3613               if (dfa->mb_cur_max > 1)
3614                 for (j = 0; j < BITSET_WORDS; ++j)
3615                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3616               else
3617 #endif
3618                 for (j = 0; j < BITSET_WORDS; ++j)
3619                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3620               if (!any_set)
3621                 continue;
3622             }
3623         }
3624
3625       /* Then divide 'accepts' into DFA states, or create a new
3626          state.  Above, we make sure that accepts is not empty.  */
3627       for (j = 0; j < ndests; ++j)
3628         {
3629           bitset_t intersec; /* Intersection sets, see below.  */
3630           bitset_t remains;
3631           /* Flags, see below.  */
3632           bitset_word_t has_intersec, not_subset, not_consumed;
3633
3634           /* Optimization, skip if this state doesn't accept the character.  */
3635           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3636             continue;
3637
3638           /* Enumerate the intersection set of this state and 'accepts'.  */
3639           has_intersec = 0;
3640           for (k = 0; k < BITSET_WORDS; ++k)
3641             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3642           /* And skip if the intersection set is empty.  */
3643           if (!has_intersec)
3644             continue;
3645
3646           /* Then check if this state is a subset of 'accepts'.  */
3647           not_subset = not_consumed = 0;
3648           for (k = 0; k < BITSET_WORDS; ++k)
3649             {
3650               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3651               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3652             }
3653
3654           /* If this state isn't a subset of 'accepts', create a
3655              new group state, which has the 'remains'. */
3656           if (not_subset)
3657             {
3658               bitset_copy (dests_ch[ndests], remains);
3659               bitset_copy (dests_ch[j], intersec);
3660               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3661               if (BE (err != REG_NOERROR, 0))
3662                 goto error_return;
3663               ++ndests;
3664             }
3665
3666           /* Put the position in the current group. */
3667           ok = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3668           if (BE (! ok, 0))
3669             goto error_return;
3670
3671           /* If all characters are consumed, go to next node. */
3672           if (!not_consumed)
3673             break;
3674         }
3675       /* Some characters remain, create a new group. */
3676       if (j == ndests)
3677         {
3678           bitset_copy (dests_ch[ndests], accepts);
3679           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3680           if (BE (err != REG_NOERROR, 0))
3681             goto error_return;
3682           ++ndests;
3683           bitset_empty (accepts);
3684         }
3685     }
3686   return ndests;
3687  error_return:
3688   for (j = 0; j < ndests; ++j)
3689     re_node_set_free (dests_node + j);
3690   return -1;
3691 }
3692
3693 #ifdef RE_ENABLE_I18N
3694 /* Check how many bytes the node 'dfa->nodes[node_idx]' accepts.
3695    Return the number of the bytes the node accepts.
3696    STR_IDX is the current index of the input string.
3697
3698    This function handles the nodes which can accept one character, or
3699    one collating element like '.', '[a-z]', opposite to the other nodes
3700    can only accept one byte.  */
3701
3702 # ifdef _LIBC
3703 #  include <locale/weight.h>
3704 # endif
3705
3706 static int
3707 check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
3708                          const re_string_t *input, Idx str_idx)
3709 {
3710   const re_token_t *node = dfa->nodes + node_idx;
3711   int char_len, elem_len;
3712   Idx i;
3713
3714   if (BE (node->type == OP_UTF8_PERIOD, 0))
3715     {
3716       unsigned char c = re_string_byte_at (input, str_idx), d;
3717       if (BE (c < 0xc2, 1))
3718         return 0;
3719
3720       if (str_idx + 2 > input->len)
3721         return 0;
3722
3723       d = re_string_byte_at (input, str_idx + 1);
3724       if (c < 0xe0)
3725         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3726       else if (c < 0xf0)
3727         {
3728           char_len = 3;
3729           if (c == 0xe0 && d < 0xa0)
3730             return 0;
3731         }
3732       else if (c < 0xf8)
3733         {
3734           char_len = 4;
3735           if (c == 0xf0 && d < 0x90)
3736             return 0;
3737         }
3738       else if (c < 0xfc)
3739         {
3740           char_len = 5;
3741           if (c == 0xf8 && d < 0x88)
3742             return 0;
3743         }
3744       else if (c < 0xfe)
3745         {
3746           char_len = 6;
3747           if (c == 0xfc && d < 0x84)
3748             return 0;
3749         }
3750       else
3751         return 0;
3752
3753       if (str_idx + char_len > input->len)
3754         return 0;
3755
3756       for (i = 1; i < char_len; ++i)
3757         {
3758           d = re_string_byte_at (input, str_idx + i);
3759           if (d < 0x80 || d > 0xbf)
3760             return 0;
3761         }
3762       return char_len;
3763     }
3764
3765   char_len = re_string_char_size_at (input, str_idx);
3766   if (node->type == OP_PERIOD)
3767     {
3768       if (char_len <= 1)
3769         return 0;
3770       /* FIXME: I don't think this if is needed, as both '\n'
3771          and '\0' are char_len == 1.  */
3772       /* '.' accepts any one character except the following two cases.  */
3773       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3774            re_string_byte_at (input, str_idx) == '\n') ||
3775           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3776            re_string_byte_at (input, str_idx) == '\0'))
3777         return 0;
3778       return char_len;
3779     }
3780
3781   elem_len = re_string_elem_size_at (input, str_idx);
3782   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3783     return 0;
3784
3785   if (node->type == COMPLEX_BRACKET)
3786     {
3787       const re_charset_t *cset = node->opr.mbcset;
3788 # ifdef _LIBC
3789       const unsigned char *pin
3790         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3791       Idx j;
3792       uint32_t nrules;
3793 # endif /* _LIBC */
3794       int match_len = 0;
3795       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3796                     ? re_string_wchar_at (input, str_idx) : 0);
3797
3798       /* match with multibyte character?  */
3799       for (i = 0; i < cset->nmbchars; ++i)
3800         if (wc == cset->mbchars[i])
3801           {
3802             match_len = char_len;
3803             goto check_node_accept_bytes_match;
3804           }
3805       /* match with character_class?  */
3806       for (i = 0; i < cset->nchar_classes; ++i)
3807         {
3808           wctype_t wt = cset->char_classes[i];
3809           if (__iswctype (wc, wt))
3810             {
3811               match_len = char_len;
3812               goto check_node_accept_bytes_match;
3813             }
3814         }
3815
3816 # ifdef _LIBC
3817       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3818       if (nrules != 0)
3819         {
3820           unsigned int in_collseq = 0;
3821           const int32_t *table, *indirect;
3822           const unsigned char *weights, *extra;
3823           const char *collseqwc;
3824
3825           /* match with collating_symbol?  */
3826           if (cset->ncoll_syms)
3827             extra = (const unsigned char *)
3828               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3829           for (i = 0; i < cset->ncoll_syms; ++i)
3830             {
3831               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3832               /* Compare the length of input collating element and
3833                  the length of current collating element.  */
3834               if (*coll_sym != elem_len)
3835                 continue;
3836               /* Compare each bytes.  */
3837               for (j = 0; j < *coll_sym; j++)
3838                 if (pin[j] != coll_sym[1 + j])
3839                   break;
3840               if (j == *coll_sym)
3841                 {
3842                   /* Match if every bytes is equal.  */
3843                   match_len = j;
3844                   goto check_node_accept_bytes_match;
3845                 }
3846             }
3847
3848           if (cset->nranges)
3849             {
3850               if (elem_len <= char_len)
3851                 {
3852                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3853                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3854                 }
3855               else
3856                 in_collseq = find_collation_sequence_value (pin, elem_len);
3857             }
3858           /* match with range expression?  */
3859           /* FIXME: Implement rational ranges here, too.  */
3860           for (i = 0; i < cset->nranges; ++i)
3861             if (cset->range_starts[i] <= in_collseq
3862                 && in_collseq <= cset->range_ends[i])
3863               {
3864                 match_len = elem_len;
3865                 goto check_node_accept_bytes_match;
3866               }
3867
3868           /* match with equivalence_class?  */
3869           if (cset->nequiv_classes)
3870             {
3871               const unsigned char *cp = pin;
3872               table = (const int32_t *)
3873                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3874               weights = (const unsigned char *)
3875                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3876               extra = (const unsigned char *)
3877                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3878               indirect = (const int32_t *)
3879                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3880               int32_t idx = findidx (table, indirect, extra, &cp, elem_len);
3881               int32_t rule = idx >> 24;
3882               idx &= 0xffffff;
3883               if (idx > 0)
3884                 {
3885                   size_t weight_len = weights[idx];
3886                   for (i = 0; i < cset->nequiv_classes; ++i)
3887                     {
3888                       int32_t equiv_class_idx = cset->equiv_classes[i];
3889                       int32_t equiv_class_rule = equiv_class_idx >> 24;
3890                       equiv_class_idx &= 0xffffff;
3891                       if (weights[equiv_class_idx] == weight_len
3892                           && equiv_class_rule == rule
3893                           && memcmp (weights + idx + 1,
3894                                      weights + equiv_class_idx + 1,
3895                                      weight_len) == 0)
3896                         {
3897                           match_len = elem_len;
3898                           goto check_node_accept_bytes_match;
3899                         }
3900                     }
3901                 }
3902             }
3903         }
3904       else
3905 # endif /* _LIBC */
3906         {
3907           /* match with range expression?  */
3908           for (i = 0; i < cset->nranges; ++i)
3909             {
3910               if (cset->range_starts[i] <= wc && wc <= cset->range_ends[i])
3911                 {
3912                   match_len = char_len;
3913                   goto check_node_accept_bytes_match;
3914                 }
3915             }
3916         }
3917     check_node_accept_bytes_match:
3918       if (!cset->non_match)
3919         return match_len;
3920       else
3921         {
3922           if (match_len > 0)
3923             return 0;
3924           else
3925             return (elem_len > char_len) ? elem_len : char_len;
3926         }
3927     }
3928   return 0;
3929 }
3930
3931 # ifdef _LIBC
3932 static unsigned int
3933 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3934 {
3935   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3936   if (nrules == 0)
3937     {
3938       if (mbs_len == 1)
3939         {
3940           /* No valid character.  Match it as a single byte character.  */
3941           const unsigned char *collseq = (const unsigned char *)
3942             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
3943           return collseq[mbs[0]];
3944         }
3945       return UINT_MAX;
3946     }
3947   else
3948     {
3949       int32_t idx;
3950       const unsigned char *extra = (const unsigned char *)
3951         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3952       int32_t extrasize = (const unsigned char *)
3953         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
3954
3955       for (idx = 0; idx < extrasize;)
3956         {
3957           int mbs_cnt;
3958           bool found = false;
3959           int32_t elem_mbs_len;
3960           /* Skip the name of collating element name.  */
3961           idx = idx + extra[idx] + 1;
3962           elem_mbs_len = extra[idx++];
3963           if (mbs_len == elem_mbs_len)
3964             {
3965               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
3966                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
3967                   break;
3968               if (mbs_cnt == elem_mbs_len)
3969                 /* Found the entry.  */
3970                 found = true;
3971             }
3972           /* Skip the byte sequence of the collating element.  */
3973           idx += elem_mbs_len;
3974           /* Adjust for the alignment.  */
3975           idx = (idx + 3) & ~3;
3976           /* Skip the collation sequence value.  */
3977           idx += sizeof (uint32_t);
3978           /* Skip the wide char sequence of the collating element.  */
3979           idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
3980           /* If we found the entry, return the sequence value.  */
3981           if (found)
3982             return *(uint32_t *) (extra + idx);
3983           /* Skip the collation sequence value.  */
3984           idx += sizeof (uint32_t);
3985         }
3986       return UINT_MAX;
3987     }
3988 }
3989 # endif /* _LIBC */
3990 #endif /* RE_ENABLE_I18N */
3991
3992 /* Check whether the node accepts the byte which is IDX-th
3993    byte of the INPUT.  */
3994
3995 static bool
3996 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
3997                    Idx idx)
3998 {
3999   unsigned char ch;
4000   ch = re_string_byte_at (&mctx->input, idx);
4001   switch (node->type)
4002     {
4003     case CHARACTER:
4004       if (node->opr.c != ch)
4005         return false;
4006       break;
4007
4008     case SIMPLE_BRACKET:
4009       if (!bitset_contain (node->opr.sbcset, ch))
4010         return false;
4011       break;
4012
4013 #ifdef RE_ENABLE_I18N
4014     case OP_UTF8_PERIOD:
4015       if (ch >= ASCII_CHARS)
4016         return false;
4017       FALLTHROUGH;
4018 #endif
4019     case OP_PERIOD:
4020       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4021           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4022         return false;
4023       break;
4024
4025     default:
4026       return false;
4027     }
4028
4029   if (node->constraint)
4030     {
4031       /* The node has constraints.  Check whether the current context
4032          satisfies the constraints.  */
4033       unsigned int context = re_string_context_at (&mctx->input, idx,
4034                                                    mctx->eflags);
4035       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4036         return false;
4037     }
4038
4039   return true;
4040 }
4041
4042 /* Extend the buffers, if the buffers have run out.  */
4043
4044 static reg_errcode_t
4045 __attribute_warn_unused_result__
4046 extend_buffers (re_match_context_t *mctx, int min_len)
4047 {
4048   reg_errcode_t ret;
4049   re_string_t *pstr = &mctx->input;
4050
4051   /* Avoid overflow.  */
4052   if (BE (MIN (IDX_MAX, SIZE_MAX / sizeof (re_dfastate_t *)) / 2
4053           <= pstr->bufs_len, 0))
4054     return REG_ESPACE;
4055
4056   /* Double the lengths of the buffers, but allocate at least MIN_LEN.  */
4057   ret = re_string_realloc_buffers (pstr,
4058                                    MAX (min_len,
4059                                         MIN (pstr->len, pstr->bufs_len * 2)));
4060   if (BE (ret != REG_NOERROR, 0))
4061     return ret;
4062
4063   if (mctx->state_log != NULL)
4064     {
4065       /* And double the length of state_log.  */
4066       /* XXX We have no indication of the size of this buffer.  If this
4067          allocation fail we have no indication that the state_log array
4068          does not have the right size.  */
4069       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4070                                               pstr->bufs_len + 1);
4071       if (BE (new_array == NULL, 0))
4072         return REG_ESPACE;
4073       mctx->state_log = new_array;
4074     }
4075
4076   /* Then reconstruct the buffers.  */
4077   if (pstr->icase)
4078     {
4079 #ifdef RE_ENABLE_I18N
4080       if (pstr->mb_cur_max > 1)
4081         {
4082           ret = build_wcs_upper_buffer (pstr);
4083           if (BE (ret != REG_NOERROR, 0))
4084             return ret;
4085         }
4086       else
4087 #endif /* RE_ENABLE_I18N  */
4088         build_upper_buffer (pstr);
4089     }
4090   else
4091     {
4092 #ifdef RE_ENABLE_I18N
4093       if (pstr->mb_cur_max > 1)
4094         build_wcs_buffer (pstr);
4095       else
4096 #endif /* RE_ENABLE_I18N  */
4097         {
4098           if (pstr->trans != NULL)
4099             re_string_translate_buffer (pstr);
4100         }
4101     }
4102   return REG_NOERROR;
4103 }
4104
4105 \f
4106 /* Functions for matching context.  */
4107
4108 /* Initialize MCTX.  */
4109
4110 static reg_errcode_t
4111 __attribute_warn_unused_result__
4112 match_ctx_init (re_match_context_t *mctx, int eflags, Idx n)
4113 {
4114   mctx->eflags = eflags;
4115   mctx->match_last = -1;
4116   if (n > 0)
4117     {
4118       /* Avoid overflow.  */
4119       size_t max_object_size =
4120         MAX (sizeof (struct re_backref_cache_entry),
4121              sizeof (re_sub_match_top_t *));
4122       if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < n, 0))
4123         return REG_ESPACE;
4124
4125       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4126       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4127       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4128         return REG_ESPACE;
4129     }
4130   /* Already zero-ed by the caller.
4131      else
4132        mctx->bkref_ents = NULL;
4133      mctx->nbkref_ents = 0;
4134      mctx->nsub_tops = 0;  */
4135   mctx->abkref_ents = n;
4136   mctx->max_mb_elem_len = 1;
4137   mctx->asub_tops = n;
4138   return REG_NOERROR;
4139 }
4140
4141 /* Clean the entries which depend on the current input in MCTX.
4142    This function must be invoked when the matcher changes the start index
4143    of the input, or changes the input string.  */
4144
4145 static void
4146 match_ctx_clean (re_match_context_t *mctx)
4147 {
4148   Idx st_idx;
4149   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4150     {
4151       Idx sl_idx;
4152       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4153       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4154         {
4155           re_sub_match_last_t *last = top->lasts[sl_idx];
4156           re_free (last->path.array);
4157           re_free (last);
4158         }
4159       re_free (top->lasts);
4160       if (top->path)
4161         {
4162           re_free (top->path->array);
4163           re_free (top->path);
4164         }
4165       re_free (top);
4166     }
4167
4168   mctx->nsub_tops = 0;
4169   mctx->nbkref_ents = 0;
4170 }
4171
4172 /* Free all the memory associated with MCTX.  */
4173
4174 static void
4175 match_ctx_free (re_match_context_t *mctx)
4176 {
4177   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4178   match_ctx_clean (mctx);
4179   re_free (mctx->sub_tops);
4180   re_free (mctx->bkref_ents);
4181 }
4182
4183 /* Add a new backreference entry to MCTX.
4184    Note that we assume that caller never call this function with duplicate
4185    entry, and call with STR_IDX which isn't smaller than any existing entry.
4186 */
4187
4188 static reg_errcode_t
4189 __attribute_warn_unused_result__
4190 match_ctx_add_entry (re_match_context_t *mctx, Idx node, Idx str_idx, Idx from,
4191                      Idx to)
4192 {
4193   if (mctx->nbkref_ents >= mctx->abkref_ents)
4194     {
4195       struct re_backref_cache_entry* new_entry;
4196       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4197                               mctx->abkref_ents * 2);
4198       if (BE (new_entry == NULL, 0))
4199         {
4200           re_free (mctx->bkref_ents);
4201           return REG_ESPACE;
4202         }
4203       mctx->bkref_ents = new_entry;
4204       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4205               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4206       mctx->abkref_ents *= 2;
4207     }
4208   if (mctx->nbkref_ents > 0
4209       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4210     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4211
4212   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4213   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4214   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4215   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4216
4217   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4218      If bit N is clear, means that this entry won't epsilon-transition to
4219      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4220      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4221      such node.
4222
4223      A backreference does not epsilon-transition unless it is empty, so set
4224      to all zeros if FROM != TO.  */
4225   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4226     = (from == to ? -1 : 0);
4227
4228   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4229   if (mctx->max_mb_elem_len < to - from)
4230     mctx->max_mb_elem_len = to - from;
4231   return REG_NOERROR;
4232 }
4233
4234 /* Return the first entry with the same str_idx, or -1 if none is
4235    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4236
4237 static Idx
4238 search_cur_bkref_entry (const re_match_context_t *mctx, Idx str_idx)
4239 {
4240   Idx left, right, mid, last;
4241   last = right = mctx->nbkref_ents;
4242   for (left = 0; left < right;)
4243     {
4244       mid = (left + right) / 2;
4245       if (mctx->bkref_ents[mid].str_idx < str_idx)
4246         left = mid + 1;
4247       else
4248         right = mid;
4249     }
4250   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4251     return left;
4252   else
4253     return -1;
4254 }
4255
4256 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4257    at STR_IDX.  */
4258
4259 static reg_errcode_t
4260 __attribute_warn_unused_result__
4261 match_ctx_add_subtop (re_match_context_t *mctx, Idx node, Idx str_idx)
4262 {
4263 #ifdef DEBUG
4264   assert (mctx->sub_tops != NULL);
4265   assert (mctx->asub_tops > 0);
4266 #endif
4267   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4268     {
4269       Idx new_asub_tops = mctx->asub_tops * 2;
4270       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4271                                                    re_sub_match_top_t *,
4272                                                    new_asub_tops);
4273       if (BE (new_array == NULL, 0))
4274         return REG_ESPACE;
4275       mctx->sub_tops = new_array;
4276       mctx->asub_tops = new_asub_tops;
4277     }
4278   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4279   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4280     return REG_ESPACE;
4281   mctx->sub_tops[mctx->nsub_tops]->node = node;
4282   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4283   return REG_NOERROR;
4284 }
4285
4286 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4287    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4288
4289 static re_sub_match_last_t *
4290 match_ctx_add_sublast (re_sub_match_top_t *subtop, Idx node, Idx str_idx)
4291 {
4292   re_sub_match_last_t *new_entry;
4293   if (BE (subtop->nlasts == subtop->alasts, 0))
4294     {
4295       Idx new_alasts = 2 * subtop->alasts + 1;
4296       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4297                                                     re_sub_match_last_t *,
4298                                                     new_alasts);
4299       if (BE (new_array == NULL, 0))
4300         return NULL;
4301       subtop->lasts = new_array;
4302       subtop->alasts = new_alasts;
4303     }
4304   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4305   if (BE (new_entry != NULL, 1))
4306     {
4307       subtop->lasts[subtop->nlasts] = new_entry;
4308       new_entry->node = node;
4309       new_entry->str_idx = str_idx;
4310       ++subtop->nlasts;
4311     }
4312   return new_entry;
4313 }
4314
4315 static void
4316 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4317                re_dfastate_t **limited_sts, Idx last_node, Idx last_str_idx)
4318 {
4319   sctx->sifted_states = sifted_sts;
4320   sctx->limited_states = limited_sts;
4321   sctx->last_node = last_node;
4322   sctx->last_str_idx = last_str_idx;
4323   re_node_set_init_empty (&sctx->limits);
4324 }