posix/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2013 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  21                                      int n) internal_function;
  22 static void match_ctx_clean (re_match_context_t *mctx) internal_function;
  23 static void match_ctx_free (re_match_context_t *cache) internal_function;
  24 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  25                                           int str_idx, int from, int to)
  26      internal_function;
  27 static int search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
  28      internal_function;
  29 static reg_errcode_t match_ctx_add_subtop (re_match_context_t *mctx, int node,
  30                                            int str_idx) internal_function;
  31 static re_sub_match_last_t * match_ctx_add_sublast (re_sub_match_top_t *subtop,
  32                                                    int node, int str_idx)
  33      internal_function;
  34 static void sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
  35                            re_dfastate_t **limited_sts, int last_node,
  36                            int last_str_idx)
  37      internal_function;
  38 static reg_errcode_t re_search_internal (const regex_t *preg,
  39                                          const char *string, int length,
  40                                          int start, int range, int stop,
  41                                          size_t nmatch, regmatch_t pmatch[],
  42                                          int eflags) internal_function;
  43 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  44                              const char *string1, int length1,
  45                              const char *string2, int length2,
  46                              int start, int range, struct re_registers *regs,
  47                              int stop, int ret_len) internal_function;
  48 static int re_search_stub (struct re_pattern_buffer *bufp,
  49                            const char *string, int length, int start,
  50                            int range, int stop, struct re_registers *regs,
  51                            int ret_len) internal_function;
  52 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  53                               int nregs, int regs_allocated) internal_function;
  54 static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
  55      internal_function;
  56 static int check_matching (re_match_context_t *mctx, int fl_longest_match,
  57                            int *p_match_first) internal_function;
  58 static int check_halt_state_context (const re_match_context_t *mctx,
  59                                      const re_dfastate_t *state, int idx)
  60      internal_function;
  61 static void update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
  62                          regmatch_t *prev_idx_match, int cur_node,
  63                          int cur_idx, int nmatch) internal_function;
  64 static reg_errcode_t push_fail_stack (struct re_fail_stack_t *fs,
  65                                       int str_idx, int dest_node, int nregs,
  66                                       regmatch_t *regs,
  67                                       re_node_set *eps_via_nodes)
  68      internal_function;
  69 static reg_errcode_t set_regs (const regex_t *preg,
  70                                const re_match_context_t *mctx,
  71                                size_t nmatch, regmatch_t *pmatch,
  72                                int fl_backtrack) internal_function;
  73 static reg_errcode_t free_fail_stack_return (struct re_fail_stack_t *fs)
  74      internal_function;
  75
  76 #ifdef RE_ENABLE_I18N
  77 static int sift_states_iter_mb (const re_match_context_t *mctx,
  78                                 re_sift_context_t *sctx,
  79                                 int node_idx, int str_idx, int max_str_idx)
  80      internal_function;
  81 #endif /* RE_ENABLE_I18N */
  82 static reg_errcode_t sift_states_backward (const re_match_context_t *mctx,
  83                                            re_sift_context_t *sctx)
  84      internal_function;
  85 static reg_errcode_t build_sifted_states (const re_match_context_t *mctx,
  86                                           re_sift_context_t *sctx, int str_idx,
  87                                           re_node_set *cur_dest)
  88      internal_function;
  89 static reg_errcode_t update_cur_sifted_state (const re_match_context_t *mctx,
  90                                               re_sift_context_t *sctx,
  91                                               int str_idx,
  92                                               re_node_set *dest_nodes)
  93      internal_function;
  94 static reg_errcode_t add_epsilon_src_nodes (const re_dfa_t *dfa,
  95                                             re_node_set *dest_nodes,
  96                                             const re_node_set *candidates)
  97      internal_function;
  98 static int check_dst_limits (const re_match_context_t *mctx,
  99                              re_node_set *limits,
 100                              int dst_node, int dst_idx, int src_node,
 101                              int src_idx) internal_function;
 102 static int check_dst_limits_calc_pos_1 (const re_match_context_t *mctx,
 103                                         int boundaries, int subexp_idx,
 104                                         int from_node, int bkref_idx)
 105      internal_function;
 106 static int check_dst_limits_calc_pos (const re_match_context_t *mctx,
 107                                       int limit, int subexp_idx,
 108                                       int node, int str_idx,
 109                                       int bkref_idx) internal_function;
 110 static reg_errcode_t check_subexp_limits (const re_dfa_t *dfa,
 111                                           re_node_set *dest_nodes,
 112                                           const re_node_set *candidates,
 113                                           re_node_set *limits,
 114                                           struct re_backref_cache_entry *bkref_ents,
 115                                           int str_idx) internal_function;
 116 static reg_errcode_t sift_states_bkref (const re_match_context_t *mctx,
 117                                         re_sift_context_t *sctx,
 118                                         int str_idx, const re_node_set *candidates)
 119      internal_function;
 120 static reg_errcode_t merge_state_array (const re_dfa_t *dfa,
 121                                         re_dfastate_t **dst,
 122                                         re_dfastate_t **src, int num)
 123      internal_function;
 124 static re_dfastate_t *find_recover_state (reg_errcode_t *err,
 125                                          re_match_context_t *mctx) internal_function;
 126 static re_dfastate_t *transit_state (reg_errcode_t *err,
 127                                      re_match_context_t *mctx,
 128                                      re_dfastate_t *state) internal_function;
 129 static re_dfastate_t *merge_state_with_log (reg_errcode_t *err,
 130                                             re_match_context_t *mctx,
 131                                             re_dfastate_t *next_state)
 132      internal_function;
 133 static reg_errcode_t check_subexp_matching_top (re_match_context_t *mctx,
 134                                                 re_node_set *cur_nodes,
 135                                                 int str_idx) internal_function;
 136 #if 0
 137 static re_dfastate_t *transit_state_sb (reg_errcode_t *err,
 138                                         re_match_context_t *mctx,
 139                                         re_dfastate_t *pstate)
 140      internal_function;
 141 #endif
 142 #ifdef RE_ENABLE_I18N
 143 static reg_errcode_t transit_state_mb (re_match_context_t *mctx,
 144                                        re_dfastate_t *pstate)
 145      internal_function;
 146 #endif /* RE_ENABLE_I18N */
 147 static reg_errcode_t transit_state_bkref (re_match_context_t *mctx,
 148                                           const re_node_set *nodes)
 149      internal_function;
 150 static reg_errcode_t get_subexp (re_match_context_t *mctx,
 151                                  int bkref_node, int bkref_str_idx)
 152      internal_function;
 153 static reg_errcode_t get_subexp_sub (re_match_context_t *mctx,
 154                                      const re_sub_match_top_t *sub_top,
 155                                      re_sub_match_last_t *sub_last,
 156                                      int bkref_node, int bkref_str)
 157      internal_function;
 158 static int find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
 159                              int subexp_idx, int type) internal_function;
 160 static reg_errcode_t check_arrival (re_match_context_t *mctx,
 161                                     state_array_t *path, int top_node,
 162                                     int top_str, int last_node, int last_str,
 163                                     int type) internal_function;
 164 static reg_errcode_t check_arrival_add_next_nodes (re_match_context_t *mctx,
 165                                                    int str_idx,
 166                                                    re_node_set *cur_nodes,
 167                                                    re_node_set *next_nodes)
 168      internal_function;
 169 static reg_errcode_t check_arrival_expand_ecl (const re_dfa_t *dfa,
 170                                                re_node_set *cur_nodes,
 171                                                int ex_subexp, int type)
 172      internal_function;
 173 static reg_errcode_t check_arrival_expand_ecl_sub (const re_dfa_t *dfa,
 174                                                    re_node_set *dst_nodes,
 175                                                    int target, int ex_subexp,
 176                                                    int type) internal_function;
 177 static reg_errcode_t expand_bkref_cache (re_match_context_t *mctx,
 178                                          re_node_set *cur_nodes, int cur_str,
 179                                          int subexp_num, int type)
 180      internal_function;
 181 static int build_trtable (const re_dfa_t *dfa,
 182                           re_dfastate_t *state) internal_function;
 183 #ifdef RE_ENABLE_I18N
 184 static int check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
 185                                     const re_string_t *input, int idx)
 186      internal_function;
 187 # ifdef _LIBC
 188 static unsigned int find_collation_sequence_value (const unsigned char *mbs,
 189                                                    size_t name_len)
 190      internal_function;
 191 # endif /* _LIBC */
 192 #endif /* RE_ENABLE_I18N */
 193 static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
 194                                        const re_dfastate_t *state,
 195                                        re_node_set *states_node,
 196                                        bitset_t *states_ch) internal_function;
 197 static int check_node_accept (const re_match_context_t *mctx,
 198                               const re_token_t *node, int idx)
 199      internal_function;
 200 static reg_errcode_t extend_buffers (re_match_context_t *mctx)
 201      internal_function;
 202 \f
 203 /* Entry point for POSIX code.  */
 204
 205 /* regexec searches for a given pattern, specified by PREG, in the
 206    string STRING.
 207
 208    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 209    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 210    least NMATCH elements, and we set them to the offsets of the
 211    corresponding matched substrings.
 212
 213    EFLAGS specifies `execution flags' which affect matching: if
 214    REG_NOTBOL is set, then ^ does not match at the beginning of the
 215    string; if REG_NOTEOL is set, then $ does not match at the end.
 216
 217    We return 0 if we find a match and REG_NOMATCH if not.  */
 218
 219 int
 220 regexec (preg, string, nmatch, pmatch, eflags)
 221     const regex_t *__restrict preg;
 222     const char *__restrict string;
 223     size_t nmatch;
 224     regmatch_t pmatch[];
 225     int eflags;
 226 {
 227   reg_errcode_t err;
 228   int start, length;
 229   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 230
 231   if (eflags & ~(REG_NOTBOL | REG_NOTEOL | REG_STARTEND))
 232     return REG_BADPAT;
 233
 234   if (eflags & REG_STARTEND)
 235     {
 236       start = pmatch[0].rm_so;
 237       length = pmatch[0].rm_eo;
 238     }
 239   else
 240     {
 241       start = 0;
 242       length = strlen (string);
 243     }
 244
 245   __libc_lock_lock (dfa->lock);
 246   if (preg->no_sub)
 247     err = re_search_internal (preg, string, length, start, length - start,
 248                               length, 0, NULL, eflags);
 249   else
 250     err = re_search_internal (preg, string, length, start, length - start,
 251                               length, nmatch, pmatch, eflags);
 252   __libc_lock_unlock (dfa->lock);
 253   return err != REG_NOERROR;
 254 }
 255
 256 #ifdef _LIBC
 257 # include <shlib-compat.h>
 258 versioned_symbol (libc, __regexec, regexec, GLIBC_2_3_4);
 259
 260 # if SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
 261 __typeof__ (__regexec) __compat_regexec;
 262
 263 int
 264 attribute_compat_text_section
 265 __compat_regexec (const regex_t *__restrict preg,
 266                   const char *__restrict string, size_t nmatch,
 267                   regmatch_t pmatch[], int eflags)
 268 {
 269   return regexec (preg, string, nmatch, pmatch,
 270                   eflags & (REG_NOTBOL | REG_NOTEOL));
 271 }
 272 compat_symbol (libc, __compat_regexec, regexec, GLIBC_2_0);
 273 # endif
 274 #endif
 275
 276 /* Entry points for GNU code.  */
 277
 278 /* re_match, re_search, re_match_2, re_search_2
 279
 280    The former two functions operate on STRING with length LENGTH,
 281    while the later two operate on concatenation of STRING1 and STRING2
 282    with lengths LENGTH1 and LENGTH2, respectively.
 283
 284    re_match() matches the compiled pattern in BUFP against the string,
 285    starting at index START.
 286
 287    re_search() first tries matching at index START, then it tries to match
 288    starting from index START + 1, and so on.  The last start position tried
 289    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 290    way as re_match().)
 291
 292    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 293    the first STOP characters of the concatenation of the strings should be
 294    concerned.
 295
 296    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 297    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 298    computed relative to the concatenation, not relative to the individual
 299    strings.)
 300
 301    On success, re_match* functions return the length of the match, re_search*
 302    return the position of the start of the match.  Return value -1 means no
 303    match was found and -2 indicates an internal error.  */
 304
 305 int
 306 re_match (bufp, string, length, start, regs)
 307     struct re_pattern_buffer *bufp;
 308     const char *string;
 309     int length, start;
 310     struct re_registers *regs;
 311 {
 312   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 313 }
 314 #ifdef _LIBC
 315 weak_alias (__re_match, re_match)
 316 #endif
 317
 318 int
 319 re_search (bufp, string, length, start, range, regs)
 320     struct re_pattern_buffer *bufp;
 321     const char *string;
 322     int length, start, range;
 323     struct re_registers *regs;
 324 {
 325   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 326 }
 327 #ifdef _LIBC
 328 weak_alias (__re_search, re_search)
 329 #endif
 330
 331 int
 332 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 333     struct re_pattern_buffer *bufp;
 334     const char *string1, *string2;
 335     int length1, length2, start, stop;
 336     struct re_registers *regs;
 337 {
 338   return re_search_2_stub (bufp, string1, length1, string2, length2,
 339                            start, 0, regs, stop, 1);
 340 }
 341 #ifdef _LIBC
 342 weak_alias (__re_match_2, re_match_2)
 343 #endif
 344
 345 int
 346 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 347     struct re_pattern_buffer *bufp;
 348     const char *string1, *string2;
 349     int length1, length2, start, range, stop;
 350     struct re_registers *regs;
 351 {
 352   return re_search_2_stub (bufp, string1, length1, string2, length2,
 353                            start, range, regs, stop, 0);
 354 }
 355 #ifdef _LIBC
 356 weak_alias (__re_search_2, re_search_2)
 357 #endif
 358
 359 static int
 360 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 361                   stop, ret_len)
 362     struct re_pattern_buffer *bufp;
 363     const char *string1, *string2;
 364     int length1, length2, start, range, stop, ret_len;
 365     struct re_registers *regs;
 366 {
 367   const char *str;
 368   int rval;
 369   int len = length1 + length2;
 370   char *s = NULL;
 371
 372   if (BE (length1 < 0 || length2 < 0 || stop < 0 || len < length1, 0))
 373     return -2;
 374
 375   /* Concatenate the strings.  */
 376   if (length2 > 0)
 377     if (length1 > 0)
 378       {
 379         s = re_malloc (char, len);
 380
 381         if (BE (s == NULL, 0))
 382           return -2;
 383 #ifdef _LIBC
 384         memcpy (__mempcpy (s, string1, length1), string2, length2);
 385 #else
 386         memcpy (s, string1, length1);
 387         memcpy (s + length1, string2, length2);
 388 #endif
 389         str = s;
 390       }
 391     else
 392       str = string2;
 393   else
 394     str = string1;
 395
 396   rval = re_search_stub (bufp, str, len, start, range, stop, regs, ret_len);
 397   re_free (s);
 398   return rval;
 399 }
 400
 401 /* The parameters have the same meaning as those of re_search.
 402    Additional parameters:
 403    If RET_LEN is nonzero the length of the match is returned (re_match style);
 404    otherwise the position of the match is returned.  */
 405
 406 static int
 407 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 408     struct re_pattern_buffer *bufp;
 409     const char *string;
 410     int length, start, range, stop, ret_len;
 411     struct re_registers *regs;
 412 {
 413   reg_errcode_t result;
 414   regmatch_t *pmatch;
 415   int nregs, rval;
 416   int eflags = 0;
 417   re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
 418
 419   /* Check for out-of-range.  */
 420   if (BE (start < 0 || start > length, 0))
 421     return -1;
 422   if (BE (start + range > length, 0))
 423     range = length - start;
 424   else if (BE (start + range < 0, 0))
 425     range = -start;
 426
 427   __libc_lock_lock (dfa->lock);
 428
 429   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 430   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 431
 432   /* Compile fastmap if we haven't yet.  */
 433   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 434     re_compile_fastmap (bufp);
 435
 436   if (BE (bufp->no_sub, 0))
 437     regs = NULL;
 438
 439   /* We need at least 1 register.  */
 440   if (regs == NULL)
 441     nregs = 1;
 442   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 443                regs->num_regs < bufp->re_nsub + 1, 0))
 444     {
 445       nregs = regs->num_regs;
 446       if (BE (nregs < 1, 0))
 447         {
 448           /* Nothing can be copied to regs.  */
 449           regs = NULL;
 450           nregs = 1;
 451         }
 452     }
 453   else
 454     nregs = bufp->re_nsub + 1;
 455   pmatch = re_malloc (regmatch_t, nregs);
 456   if (BE (pmatch == NULL, 0))
 457     {
 458       rval = -2;
 459       goto out;
 460     }
 461
 462   result = re_search_internal (bufp, string, length, start, range, stop,
 463                                nregs, pmatch, eflags);
 464
 465   rval = 0;
 466
 467   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 468   if (result != REG_NOERROR)
 469     rval = -1;
 470   else if (regs != NULL)
 471     {
 472       /* If caller wants register contents data back, copy them.  */
 473       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 474                                            bufp->regs_allocated);
 475       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 476         rval = -2;
 477     }
 478
 479   if (BE (rval == 0, 1))
 480     {
 481       if (ret_len)
 482         {
 483           assert (pmatch[0].rm_so == start);
 484           rval = pmatch[0].rm_eo - start;
 485         }
 486       else
 487         rval = pmatch[0].rm_so;
 488     }
 489   re_free (pmatch);
 490  out:
 491   __libc_lock_unlock (dfa->lock);
 492   return rval;
 493 }
 494
 495 static unsigned
 496 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 497     struct re_registers *regs;
 498     regmatch_t *pmatch;
 499     int nregs, regs_allocated;
 500 {
 501   int rval = REGS_REALLOCATE;
 502   int i;
 503   int need_regs = nregs + 1;
 504   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 505      uses.  */
 506
 507   /* Have the register data arrays been allocated?  */
 508   if (regs_allocated == REGS_UNALLOCATED)
 509     { /* No.  So allocate them with malloc.  */
 510       regs->start = re_malloc (regoff_t, need_regs);
 511       if (BE (regs->start == NULL, 0))
 512         return REGS_UNALLOCATED;
 513       regs->end = re_malloc (regoff_t, need_regs);
 514       if (BE (regs->end == NULL, 0))
 515         {
 516           re_free (regs->start);
 517           return REGS_UNALLOCATED;
 518         }
 519       regs->num_regs = need_regs;
 520     }
 521   else if (regs_allocated == REGS_REALLOCATE)
 522     { /* Yes.  If we need more elements than were already
 523          allocated, reallocate them.  If we need fewer, just
 524          leave it alone.  */
 525       if (BE (need_regs > regs->num_regs, 0))
 526         {
 527           regoff_t *new_start = re_realloc (regs->start, regoff_t, need_regs);
 528           regoff_t *new_end;
 529           if (BE (new_start == NULL, 0))
 530             return REGS_UNALLOCATED;
 531           new_end = re_realloc (regs->end, regoff_t, need_regs);
 532           if (BE (new_end == NULL, 0))
 533             {
 534               re_free (new_start);
 535               return REGS_UNALLOCATED;
 536             }
 537           regs->start = new_start;
 538           regs->end = new_end;
 539           regs->num_regs = need_regs;
 540         }
 541     }
 542   else
 543     {
 544       assert (regs_allocated == REGS_FIXED);
 545       /* This function may not be called with REGS_FIXED and nregs too big.  */
 546       assert (regs->num_regs >= nregs);
 547       rval = REGS_FIXED;
 548     }
 549
 550   /* Copy the regs.  */
 551   for (i = 0; i < nregs; ++i)
 552     {
 553       regs->start[i] = pmatch[i].rm_so;
 554       regs->end[i] = pmatch[i].rm_eo;
 555     }
 556   for ( ; i < regs->num_regs; ++i)
 557     regs->start[i] = regs->end[i] = -1;
 558
 559   return rval;
 560 }
 561
 562 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 563    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 564    this memory for recording register information.  STARTS and ENDS
 565    must be allocated using the malloc library routine, and must each
 566    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 567
 568    If NUM_REGS == 0, then subsequent matches should allocate their own
 569    register data.
 570
 571    Unless this function is called, the first search or match using
 572    PATTERN_BUFFER will allocate its own register data, without
 573    freeing the old data.  */
 574
 575 void
 576 re_set_registers (bufp, regs, num_regs, starts, ends)
 577     struct re_pattern_buffer *bufp;
 578     struct re_registers *regs;
 579     unsigned num_regs;
 580     regoff_t *starts, *ends;
 581 {
 582   if (num_regs)
 583     {
 584       bufp->regs_allocated = REGS_REALLOCATE;
 585       regs->num_regs = num_regs;
 586       regs->start = starts;
 587       regs->end = ends;
 588     }
 589   else
 590     {
 591       bufp->regs_allocated = REGS_UNALLOCATED;
 592       regs->num_regs = 0;
 593       regs->start = regs->end = (regoff_t *) 0;
 594     }
 595 }
 596 #ifdef _LIBC
 597 weak_alias (__re_set_registers, re_set_registers)
 598 #endif
 599 \f
 600 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 601    them unless specifically requested.  */
 602
 603 #if defined _REGEX_RE_COMP || defined _LIBC
 604 int
 605 # ifdef _LIBC
 606 weak_function
 607 # endif
 608 re_exec (s)
 609      const char *s;
 610 {
 611   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 612 }
 613 #endif /* _REGEX_RE_COMP */
 614 \f
 615 /* Internal entry point.  */
 616
 617 /* Searches for a compiled pattern PREG in the string STRING, whose
 618    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 619    mingings with regexec.  START, and RANGE have the same meanings
 620    with re_search.
 621    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 622    otherwise return the error code.
 623    Note: We assume front end functions already check ranges.
 624    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 625
 626 static reg_errcode_t
 627 __attribute_warn_unused_result__
 628 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 629                     eflags)
 630     const regex_t *preg;
 631     const char *string;
 632     int length, start, range, stop, eflags;
 633     size_t nmatch;
 634     regmatch_t pmatch[];
 635 {
 636   reg_errcode_t err;
 637   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
 638   int left_lim, right_lim, incr;
 639   int fl_longest_match, match_first, match_kind, match_last = -1;
 640   int extra_nmatch;
 641   int sb, ch;
 642 #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
 643   re_match_context_t mctx = { .dfa = dfa };
 644 #else
 645   re_match_context_t mctx;
 646 #endif
 647   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
 648                    && range && !preg->can_be_null) ? preg->fastmap : NULL;
 649   RE_TRANSLATE_TYPE t = preg->translate;
 650
 651 #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
 652   memset (&mctx, '\0', sizeof (re_match_context_t));
 653   mctx.dfa = dfa;
 654 #endif
 655
 656   extra_nmatch = (nmatch > preg->re_nsub) ? nmatch - (preg->re_nsub + 1) : 0;
 657   nmatch -= extra_nmatch;
 658
 659   /* Check if the DFA haven't been compiled.  */
 660   if (BE (preg->used == 0 || dfa->init_state == NULL
 661           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 662           || dfa->init_state_begbuf == NULL, 0))
 663     return REG_NOMATCH;
 664
 665 #ifdef DEBUG
 666   /* We assume front-end functions already check them.  */
 667   assert (start + range >= 0 && start + range <= length);
 668 #endif
 669
 670   /* If initial states with non-begbuf contexts have no elements,
 671      the regex must be anchored.  If preg->newline_anchor is set,
 672      we'll never use init_state_nl, so do not check it.  */
 673   if (dfa->init_state->nodes.nelem == 0
 674       && dfa->init_state_word->nodes.nelem == 0
 675       && (dfa->init_state_nl->nodes.nelem == 0
 676           || !preg->newline_anchor))
 677     {
 678       if (start != 0 && start + range != 0)
 679         return REG_NOMATCH;
 680       start = range = 0;
 681     }
 682
 683   /* We must check the longest matching, if nmatch > 0.  */
 684   fl_longest_match = (nmatch != 0 || dfa->nbackref);
 685
 686   err = re_string_allocate (&mctx.input, string, length, dfa->nodes_len + 1,
 687                             preg->translate, preg->syntax & RE_ICASE, dfa);
 688   if (BE (err != REG_NOERROR, 0))
 689     goto free_return;
 690   mctx.input.stop = stop;
 691   mctx.input.raw_stop = stop;
 692   mctx.input.newline_anchor = preg->newline_anchor;
 693
 694   err = match_ctx_init (&mctx, eflags, dfa->nbackref * 2);
 695   if (BE (err != REG_NOERROR, 0))
 696     goto free_return;
 697
 698   /* We will log all the DFA states through which the dfa pass,
 699      if nmatch > 1, or this dfa has "multibyte node", which is a
 700      back-reference or a node which can accept multibyte character or
 701      multi character collating element.  */
 702   if (nmatch > 1 || dfa->has_mb_node)
 703     {
 704       /* Avoid overflow.  */
 705       if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= mctx.input.bufs_len, 0))
 706         {
 707           err = REG_ESPACE;
 708           goto free_return;
 709         }
 710
 711       mctx.state_log = re_malloc (re_dfastate_t *, mctx.input.bufs_len + 1);
 712       if (BE (mctx.state_log == NULL, 0))
 713         {
 714           err = REG_ESPACE;
 715           goto free_return;
 716         }
 717     }
 718   else
 719     mctx.state_log = NULL;
 720
 721   match_first = start;
 722   mctx.input.tip_context = (eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 723                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF;
 724
 725   /* Check incrementally whether of not the input string match.  */
 726   incr = (range < 0) ? -1 : 1;
 727   left_lim = (range < 0) ? start + range : start;
 728   right_lim = (range < 0) ? start : start + range;
 729   sb = dfa->mb_cur_max == 1;
 730   match_kind =
 731     (fastmap
 732      ? ((sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
 733         | (range >= 0 ? 2 : 0)
 734         | (t != NULL ? 1 : 0))
 735      : 8);
 736
 737   for (;; match_first += incr)
 738     {
 739       err = REG_NOMATCH;
 740       if (match_first < left_lim || right_lim < match_first)
 741         goto free_return;
 742
 743       /* Advance as rapidly as possible through the string, until we
 744          find a plausible place to start matching.  This may be done
 745          with varying efficiency, so there are various possibilities:
 746          only the most common of them are specialized, in order to
 747          save on code size.  We use a switch statement for speed.  */
 748       switch (match_kind)
 749         {
 750         case 8:
 751           /* No fastmap.  */
 752           break;
 753
 754         case 7:
 755           /* Fastmap with single-byte translation, match forward.  */
 756           while (BE (match_first < right_lim, 1)
 757                  && !fastmap[t[(unsigned char) string[match_first]]])
 758             ++match_first;
 759           goto forward_match_found_start_or_reached_end;
 760
 761         case 6:
 762           /* Fastmap without translation, match forward.  */
 763           while (BE (match_first < right_lim, 1)
 764                  && !fastmap[(unsigned char) string[match_first]])
 765             ++match_first;
 766
 767         forward_match_found_start_or_reached_end:
 768           if (BE (match_first == right_lim, 0))
 769             {
 770               ch = match_first >= length
 771                        ? 0 : (unsigned char) string[match_first];
 772               if (!fastmap[t ? t[ch] : ch])
 773                 goto free_return;
 774             }
 775           break;
 776
 777         case 4:
 778         case 5:
 779           /* Fastmap without multi-byte translation, match backwards.  */
 780           while (match_first >= left_lim)
 781             {
 782               ch = match_first >= length
 783                        ? 0 : (unsigned char) string[match_first];
 784               if (fastmap[t ? t[ch] : ch])
 785                 break;
 786               --match_first;
 787             }
 788           if (match_first < left_lim)
 789             goto free_return;
 790           break;
 791
 792         default:
 793           /* In this case, we can't determine easily the current byte,
 794              since it might be a component byte of a multibyte
 795              character.  Then we use the constructed buffer instead.  */
 796           for (;;)
 797             {
 798               /* If MATCH_FIRST is out of the valid range, reconstruct the
 799                  buffers.  */
 800               unsigned int offset = match_first - mctx.input.raw_mbs_idx;
 801               if (BE (offset >= (unsigned int) mctx.input.valid_raw_len, 0))
 802                 {
 803                   err = re_string_reconstruct (&mctx.input, match_first,
 804                                                eflags);
 805                   if (BE (err != REG_NOERROR, 0))
 806                     goto free_return;
 807
 808                   offset = match_first - mctx.input.raw_mbs_idx;
 809                 }
 810               /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 811                  Note that MATCH_FIRST must not be smaller than 0.  */
 812               ch = (match_first >= length
 813                     ? 0 : re_string_byte_at (&mctx.input, offset));
 814               if (fastmap[ch])
 815                 break;
 816               match_first += incr;
 817               if (match_first < left_lim || match_first > right_lim)
 818                 {
 819                   err = REG_NOMATCH;
 820                   goto free_return;
 821                 }
 822             }
 823           break;
 824         }
 825
 826       /* Reconstruct the buffers so that the matcher can assume that
 827          the matching starts from the beginning of the buffer.  */
 828       err = re_string_reconstruct (&mctx.input, match_first, eflags);
 829       if (BE (err != REG_NOERROR, 0))
 830         goto free_return;
 831
 832 #ifdef RE_ENABLE_I18N
 833      /* Don't consider this char as a possible match start if it part,
 834         yet isn't the head, of a multibyte character.  */
 835       if (!sb && !re_string_first_byte (&mctx.input, 0))
 836         continue;
 837 #endif
 838
 839       /* It seems to be appropriate one, then use the matcher.  */
 840       /* We assume that the matching starts from 0.  */
 841       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
 842       match_last = check_matching (&mctx, fl_longest_match,
 843                                    range >= 0 ? &match_first : NULL);
 844       if (match_last != -1)
 845         {
 846           if (BE (match_last == -2, 0))
 847             {
 848               err = REG_ESPACE;
 849               goto free_return;
 850             }
 851           else
 852             {
 853               mctx.match_last = match_last;
 854               if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
 855                 {
 856                   re_dfastate_t *pstate = mctx.state_log[match_last];
 857                   mctx.last_node = check_halt_state_context (&mctx, pstate,
 858                                                              match_last);
 859                 }
 860               if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
 861                   || dfa->nbackref)
 862                 {
 863                   err = prune_impossible_nodes (&mctx);
 864                   if (err == REG_NOERROR)
 865                     break;
 866                   if (BE (err != REG_NOMATCH, 0))
 867                     goto free_return;
 868                   match_last = -1;
 869                 }
 870               else
 871                 break; /* We found a match.  */
 872             }
 873         }
 874
 875       match_ctx_clean (&mctx);
 876     }
 877
 878 #ifdef DEBUG
 879   assert (match_last != -1);
 880   assert (err == REG_NOERROR);
 881 #endif
 882
 883   /* Set pmatch[] if we need.  */
 884   if (nmatch > 0)
 885     {
 886       int reg_idx;
 887
 888       /* Initialize registers.  */
 889       for (reg_idx = 1; reg_idx < nmatch; ++reg_idx)
 890         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 891
 892       /* Set the points where matching start/end.  */
 893       pmatch[0].rm_so = 0;
 894       pmatch[0].rm_eo = mctx.match_last;
 895
 896       if (!preg->no_sub && nmatch > 1)
 897         {
 898           err = set_regs (preg, &mctx, nmatch, pmatch,
 899                           dfa->has_plural_match && dfa->nbackref > 0);
 900           if (BE (err != REG_NOERROR, 0))
 901             goto free_return;
 902         }
 903
 904       /* At last, add the offset to the each registers, since we slided
 905          the buffers so that we could assume that the matching starts
 906          from 0.  */
 907       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 908         if (pmatch[reg_idx].rm_so != -1)
 909           {
 910 #ifdef RE_ENABLE_I18N
 911             if (BE (mctx.input.offsets_needed != 0, 0))
 912               {
 913                 pmatch[reg_idx].rm_so =
 914                   (pmatch[reg_idx].rm_so == mctx.input.valid_len
 915                    ? mctx.input.valid_raw_len
 916                    : mctx.input.offsets[pmatch[reg_idx].rm_so]);
 917                 pmatch[reg_idx].rm_eo =
 918                   (pmatch[reg_idx].rm_eo == mctx.input.valid_len
 919                    ? mctx.input.valid_raw_len
 920                    : mctx.input.offsets[pmatch[reg_idx].rm_eo]);
 921               }
 922 #else
 923             assert (mctx.input.offsets_needed == 0);
 924 #endif
 925             pmatch[reg_idx].rm_so += match_first;
 926             pmatch[reg_idx].rm_eo += match_first;
 927           }
 928       for (reg_idx = 0; reg_idx < extra_nmatch; ++reg_idx)
 929         {
 930           pmatch[nmatch + reg_idx].rm_so = -1;
 931           pmatch[nmatch + reg_idx].rm_eo = -1;
 932         }
 933
 934       if (dfa->subexp_map)
 935         for (reg_idx = 0; reg_idx + 1 < nmatch; reg_idx++)
 936           if (dfa->subexp_map[reg_idx] != reg_idx)
 937             {
 938               pmatch[reg_idx + 1].rm_so
 939                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
 940               pmatch[reg_idx + 1].rm_eo
 941                 = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
 942             }
 943     }
 944
 945  free_return:
 946   re_free (mctx.state_log);
 947   if (dfa->nbackref)
 948     match_ctx_free (&mctx);
 949   re_string_destruct (&mctx.input);
 950   return err;
 951 }
 952
 953 static reg_errcode_t
 954 __attribute_warn_unused_result__
 955 prune_impossible_nodes (mctx)
 956      re_match_context_t *mctx;
 957 {
 958   const re_dfa_t *const dfa = mctx->dfa;
 959   int halt_node, match_last;
 960   reg_errcode_t ret;
 961   re_dfastate_t **sifted_states;
 962   re_dfastate_t **lim_states = NULL;
 963   re_sift_context_t sctx;
 964 #ifdef DEBUG
 965   assert (mctx->state_log != NULL);
 966 #endif
 967   match_last = mctx->match_last;
 968   halt_node = mctx->last_node;
 969
 970   /* Avoid overflow.  */
 971   if (BE (SIZE_MAX / sizeof (re_dfastate_t *) <= match_last, 0))
 972     return REG_ESPACE;
 973
 974   sifted_states = re_malloc (re_dfastate_t *, match_last + 1);
 975   if (BE (sifted_states == NULL, 0))
 976     {
 977       ret = REG_ESPACE;
 978       goto free_return;
 979     }
 980   if (dfa->nbackref)
 981     {
 982       lim_states = re_malloc (re_dfastate_t *, match_last + 1);
 983       if (BE (lim_states == NULL, 0))
 984         {
 985           ret = REG_ESPACE;
 986           goto free_return;
 987         }
 988       while (1)
 989         {
 990           memset (lim_states, '\0',
 991                   sizeof (re_dfastate_t *) * (match_last + 1));
 992           sift_ctx_init (&sctx, sifted_states, lim_states, halt_node,
 993                          match_last);
 994           ret = sift_states_backward (mctx, &sctx);
 995           re_node_set_free (&sctx.limits);
 996           if (BE (ret != REG_NOERROR, 0))
 997               goto free_return;
 998           if (sifted_states[0] != NULL || lim_states[0] != NULL)
 999             break;
1000           do
1001             {
1002               --match_last;
1003               if (match_last < 0)
1004                 {
1005                   ret = REG_NOMATCH;
1006                   goto free_return;
1007                 }
1008             } while (mctx->state_log[match_last] == NULL
1009                      || !mctx->state_log[match_last]->halt);
1010           halt_node = check_halt_state_context (mctx,
1011                                                 mctx->state_log[match_last],
1012                                                 match_last);
1013         }
1014       ret = merge_state_array (dfa, sifted_states, lim_states,
1015                                match_last + 1);
1016       re_free (lim_states);
1017       lim_states = NULL;
1018       if (BE (ret != REG_NOERROR, 0))
1019         goto free_return;
1020     }
1021   else
1022     {
1023       sift_ctx_init (&sctx, sifted_states, lim_states, halt_node, match_last);
1024       ret = sift_states_backward (mctx, &sctx);
1025       re_node_set_free (&sctx.limits);
1026       if (BE (ret != REG_NOERROR, 0))
1027         goto free_return;
1028       if (sifted_states[0] == NULL)
1029         {
1030           ret = REG_NOMATCH;
1031           goto free_return;
1032         }
1033     }
1034   re_free (mctx->state_log);
1035   mctx->state_log = sifted_states;
1036   sifted_states = NULL;
1037   mctx->last_node = halt_node;
1038   mctx->match_last = match_last;
1039   ret = REG_NOERROR;
1040  free_return:
1041   re_free (sifted_states);
1042   re_free (lim_states);
1043   return ret;
1044 }
1045
1046 /* Acquire an initial state and return it.
1047    We must select appropriate initial state depending on the context,
1048    since initial states may have constraints like "\<", "^", etc..  */
1049
1050 static inline re_dfastate_t *
1051 __attribute ((always_inline)) internal_function
1052 acquire_init_state_context (reg_errcode_t *err, const re_match_context_t *mctx,
1053                             int idx)
1054 {
1055   const re_dfa_t *const dfa = mctx->dfa;
1056   if (dfa->init_state->has_constraint)
1057     {
1058       unsigned int context;
1059       context = re_string_context_at (&mctx->input, idx - 1, mctx->eflags);
1060       if (IS_WORD_CONTEXT (context))
1061         return dfa->init_state_word;
1062       else if (IS_ORDINARY_CONTEXT (context))
1063         return dfa->init_state;
1064       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
1065         return dfa->init_state_begbuf;
1066       else if (IS_NEWLINE_CONTEXT (context))
1067         return dfa->init_state_nl;
1068       else if (IS_BEGBUF_CONTEXT (context))
1069         {
1070           /* It is relatively rare case, then calculate on demand.  */
1071           return re_acquire_state_context (err, dfa,
1072                                            dfa->init_state->entrance_nodes,
1073                                            context);
1074         }
1075       else
1076         /* Must not happen?  */
1077         return dfa->init_state;
1078     }
1079   else
1080     return dfa->init_state;
1081 }
1082
1083 /* Check whether the regular expression match input string INPUT or not,
1084    and return the index where the matching end, return -1 if not match,
1085    or return -2 in case of an error.
1086    FL_LONGEST_MATCH means we want the POSIX longest matching.
1087    If P_MATCH_FIRST is not NULL, and the match fails, it is set to the
1088    next place where we may want to try matching.
1089    Note that the matcher assume that the maching starts from the current
1090    index of the buffer.  */
1091
1092 static int
1093 internal_function __attribute_warn_unused_result__
1094 check_matching (re_match_context_t *mctx, int fl_longest_match,
1095                 int *p_match_first)
1096 {
1097   const re_dfa_t *const dfa = mctx->dfa;
1098   reg_errcode_t err;
1099   int match = 0;
1100   int match_last = -1;
1101   int cur_str_idx = re_string_cur_idx (&mctx->input);
1102   re_dfastate_t *cur_state;
1103   int at_init_state = p_match_first != NULL;
1104   int next_start_idx = cur_str_idx;
1105
1106   err = REG_NOERROR;
1107   cur_state = acquire_init_state_context (&err, mctx, cur_str_idx);
1108   /* An initial state must not be NULL (invalid).  */
1109   if (BE (cur_state == NULL, 0))
1110     {
1111       assert (err == REG_ESPACE);
1112       return -2;
1113     }
1114
1115   if (mctx->state_log != NULL)
1116     {
1117       mctx->state_log[cur_str_idx] = cur_state;
1118
1119       /* Check OP_OPEN_SUBEXP in the initial state in case that we use them
1120          later.  E.g. Processing back references.  */
1121       if (BE (dfa->nbackref, 0))
1122         {
1123           at_init_state = 0;
1124           err = check_subexp_matching_top (mctx, &cur_state->nodes, 0);
1125           if (BE (err != REG_NOERROR, 0))
1126             return err;
1127
1128           if (cur_state->has_backref)
1129             {
1130               err = transit_state_bkref (mctx, &cur_state->nodes);
1131               if (BE (err != REG_NOERROR, 0))
1132                 return err;
1133             }
1134         }
1135     }
1136
1137   /* If the RE accepts NULL string.  */
1138   if (BE (cur_state->halt, 0))
1139     {
1140       if (!cur_state->has_constraint
1141           || check_halt_state_context (mctx, cur_state, cur_str_idx))
1142         {
1143           if (!fl_longest_match)
1144             return cur_str_idx;
1145           else
1146             {
1147               match_last = cur_str_idx;
1148               match = 1;
1149             }
1150         }
1151     }
1152
1153   while (!re_string_eoi (&mctx->input))
1154     {
1155       re_dfastate_t *old_state = cur_state;
1156       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
1157
1158       if ((BE (next_char_idx >= mctx->input.bufs_len, 0)
1159            && mctx->input.bufs_len < mctx->input.len)
1160           || (BE (next_char_idx >= mctx->input.valid_len, 0)
1161               && mctx->input.valid_len < mctx->input.len))
1162         {
1163           err = extend_buffers (mctx);
1164           if (BE (err != REG_NOERROR, 0))
1165             {
1166               assert (err == REG_ESPACE);
1167               return -2;
1168             }
1169         }
1170
1171       cur_state = transit_state (&err, mctx, cur_state);
1172       if (mctx->state_log != NULL)
1173         cur_state = merge_state_with_log (&err, mctx, cur_state);
1174
1175       if (cur_state == NULL)
1176         {
1177           /* Reached the invalid state or an error.  Try to recover a valid
1178              state using the state log, if available and if we have not
1179              already found a valid (even if not the longest) match.  */
1180           if (BE (err != REG_NOERROR, 0))
1181             return -2;
1182
1183           if (mctx->state_log == NULL
1184               || (match && !fl_longest_match)
1185               || (cur_state = find_recover_state (&err, mctx)) == NULL)
1186             break;
1187         }
1188
1189       if (BE (at_init_state, 0))
1190         {
1191           if (old_state == cur_state)
1192             next_start_idx = next_char_idx;
1193           else
1194             at_init_state = 0;
1195         }
1196
1197       if (cur_state->halt)
1198         {
1199           /* Reached a halt state.
1200              Check the halt state can satisfy the current context.  */
1201           if (!cur_state->has_constraint
1202               || check_halt_state_context (mctx, cur_state,
1203                                            re_string_cur_idx (&mctx->input)))
1204             {
1205               /* We found an appropriate halt state.  */
1206               match_last = re_string_cur_idx (&mctx->input);
1207               match = 1;
1208
1209               /* We found a match, do not modify match_first below.  */
1210               p_match_first = NULL;
1211               if (!fl_longest_match)
1212                 break;
1213             }
1214         }
1215     }
1216
1217   if (p_match_first)
1218     *p_match_first += next_start_idx;
1219
1220   return match_last;
1221 }
1222
1223 /* Check NODE match the current context.  */
1224
1225 static int
1226 internal_function
1227 check_halt_node_context (const re_dfa_t *dfa, int node, unsigned int context)
1228 {
1229   re_token_type_t type = dfa->nodes[node].type;
1230   unsigned int constraint = dfa->nodes[node].constraint;
1231   if (type != END_OF_RE)
1232     return 0;
1233   if (!constraint)
1234     return 1;
1235   if (NOT_SATISFY_NEXT_CONSTRAINT (constraint, context))
1236     return 0;
1237   return 1;
1238 }
1239
1240 /* Check the halt state STATE match the current context.
1241    Return 0 if not match, if the node, STATE has, is a halt node and
1242    match the context, return the node.  */
1243
1244 static int
1245 internal_function
1246 check_halt_state_context (const re_match_context_t *mctx,
1247                           const re_dfastate_t *state, int idx)
1248 {
1249   int i;
1250   unsigned int context;
1251 #ifdef DEBUG
1252   assert (state->halt);
1253 #endif
1254   context = re_string_context_at (&mctx->input, idx, mctx->eflags);
1255   for (i = 0; i < state->nodes.nelem; ++i)
1256     if (check_halt_node_context (mctx->dfa, state->nodes.elems[i], context))
1257       return state->nodes.elems[i];
1258   return 0;
1259 }
1260
1261 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
1262    corresponding to the DFA).
1263    Return the destination node, and update EPS_VIA_NODES, return -1 in case
1264    of errors.  */
1265
1266 static int
1267 internal_function
1268 proceed_next_node (const re_match_context_t *mctx, int nregs, regmatch_t *regs,
1269                    int *pidx, int node, re_node_set *eps_via_nodes,
1270                    struct re_fail_stack_t *fs)
1271 {
1272   const re_dfa_t *const dfa = mctx->dfa;
1273   int i, err;
1274   if (IS_EPSILON_NODE (dfa->nodes[node].type))
1275     {
1276       re_node_set *cur_nodes = &mctx->state_log[*pidx]->nodes;
1277       re_node_set *edests = &dfa->edests[node];
1278       int dest_node;
1279       err = re_node_set_insert (eps_via_nodes, node);
1280       if (BE (err < 0, 0))
1281         return -2;
1282       /* Pick up a valid destination, or return -1 if none is found.  */
1283       for (dest_node = -1, i = 0; i < edests->nelem; ++i)
1284         {
1285           int candidate = edests->elems[i];
1286           if (!re_node_set_contains (cur_nodes, candidate))
1287             continue;
1288           if (dest_node == -1)
1289             dest_node = candidate;
1290
1291           else
1292             {
1293               /* In order to avoid infinite loop like "(a*)*", return the second
1294                  epsilon-transition if the first was already considered.  */
1295               if (re_node_set_contains (eps_via_nodes, dest_node))
1296                 return candidate;
1297
1298               /* Otherwise, push the second epsilon-transition on the fail stack.  */
1299               else if (fs != NULL
1300                        && push_fail_stack (fs, *pidx, candidate, nregs, regs,
1301                                            eps_via_nodes))
1302                 return -2;
1303
1304               /* We know we are going to exit.  */
1305               break;
1306             }
1307         }
1308       return dest_node;
1309     }
1310   else
1311     {
1312       int naccepted = 0;
1313       re_token_type_t type = dfa->nodes[node].type;
1314
1315 #ifdef RE_ENABLE_I18N
1316       if (dfa->nodes[node].accept_mb)
1317         naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
1318       else
1319 #endif /* RE_ENABLE_I18N */
1320       if (type == OP_BACK_REF)
1321         {
1322           int subexp_idx = dfa->nodes[node].opr.idx + 1;
1323           naccepted = regs[subexp_idx].rm_eo - regs[subexp_idx].rm_so;
1324           if (fs != NULL)
1325             {
1326               if (regs[subexp_idx].rm_so == -1 || regs[subexp_idx].rm_eo == -1)
1327                 return -1;
1328               else if (naccepted)
1329                 {
1330                   char *buf = (char *) re_string_get_buffer (&mctx->input);
1331                   if (memcmp (buf + regs[subexp_idx].rm_so, buf + *pidx,
1332                               naccepted) != 0)
1333                     return -1;
1334                 }
1335             }
1336
1337           if (naccepted == 0)
1338             {
1339               int dest_node;
1340               err = re_node_set_insert (eps_via_nodes, node);
1341               if (BE (err < 0, 0))
1342                 return -2;
1343               dest_node = dfa->edests[node].elems[0];
1344               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1345                                         dest_node))
1346                 return dest_node;
1347             }
1348         }
1349
1350       if (naccepted != 0
1351           || check_node_accept (mctx, dfa->nodes + node, *pidx))
1352         {
1353           int dest_node = dfa->nexts[node];
1354           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
1355           if (fs && (*pidx > mctx->match_last || mctx->state_log[*pidx] == NULL
1356                      || !re_node_set_contains (&mctx->state_log[*pidx]->nodes,
1357                                                dest_node)))
1358             return -1;
1359           re_node_set_empty (eps_via_nodes);
1360           return dest_node;
1361         }
1362     }
1363   return -1;
1364 }
1365
1366 static reg_errcode_t
1367 internal_function __attribute_warn_unused_result__
1368 push_fail_stack (struct re_fail_stack_t *fs, int str_idx, int dest_node,
1369                  int nregs, regmatch_t *regs, re_node_set *eps_via_nodes)
1370 {
1371   reg_errcode_t err;
1372   int num = fs->num++;
1373   if (fs->num == fs->alloc)
1374     {
1375       struct re_fail_stack_ent_t *new_array;
1376       new_array = realloc (fs->stack, (sizeof (struct re_fail_stack_ent_t)
1377                                        * fs->alloc * 2));
1378       if (new_array == NULL)
1379         return REG_ESPACE;
1380       fs->alloc *= 2;
1381       fs->stack = new_array;
1382     }
1383   fs->stack[num].idx = str_idx;
1384   fs->stack[num].node = dest_node;
1385   fs->stack[num].regs = re_malloc (regmatch_t, nregs);
1386   if (fs->stack[num].regs == NULL)
1387     return REG_ESPACE;
1388   memcpy (fs->stack[num].regs, regs, sizeof (regmatch_t) * nregs);
1389   err = re_node_set_init_copy (&fs->stack[num].eps_via_nodes, eps_via_nodes);
1390   return err;
1391 }
1392
1393 static int
1394 internal_function
1395 pop_fail_stack (struct re_fail_stack_t *fs, int *pidx, int nregs,
1396                 regmatch_t *regs, re_node_set *eps_via_nodes)
1397 {
1398   int num = --fs->num;
1399   assert (num >= 0);
1400   *pidx = fs->stack[num].idx;
1401   memcpy (regs, fs->stack[num].regs, sizeof (regmatch_t) * nregs);
1402   re_node_set_free (eps_via_nodes);
1403   re_free (fs->stack[num].regs);
1404   *eps_via_nodes = fs->stack[num].eps_via_nodes;
1405   return fs->stack[num].node;
1406 }
1407
1408 /* Set the positions where the subexpressions are starts/ends to registers
1409    PMATCH.
1410    Note: We assume that pmatch[0] is already set, and
1411    pmatch[i].rm_so == pmatch[i].rm_eo == -1 for 0 < i < nmatch.  */
1412
1413 static reg_errcode_t
1414 internal_function __attribute_warn_unused_result__
1415 set_regs (const regex_t *preg, const re_match_context_t *mctx, size_t nmatch,
1416           regmatch_t *pmatch, int fl_backtrack)
1417 {
1418   const re_dfa_t *dfa = (const re_dfa_t *) preg->buffer;
1419   int idx, cur_node;
1420   re_node_set eps_via_nodes;
1421   struct re_fail_stack_t *fs;
1422   struct re_fail_stack_t fs_body = { 0, 2, NULL };
1423   regmatch_t *prev_idx_match;
1424   int prev_idx_match_malloced = 0;
1425
1426 #ifdef DEBUG
1427   assert (nmatch > 1);
1428   assert (mctx->state_log != NULL);
1429 #endif
1430   if (fl_backtrack)
1431     {
1432       fs = &fs_body;
1433       fs->stack = re_malloc (struct re_fail_stack_ent_t, fs->alloc);
1434       if (fs->stack == NULL)
1435         return REG_ESPACE;
1436     }
1437   else
1438     fs = NULL;
1439
1440   cur_node = dfa->init_node;
1441   re_node_set_init_empty (&eps_via_nodes);
1442
1443   if (__libc_use_alloca (nmatch * sizeof (regmatch_t)))
1444     prev_idx_match = (regmatch_t *) alloca (nmatch * sizeof (regmatch_t));
1445   else
1446     {
1447       prev_idx_match = re_malloc (regmatch_t, nmatch);
1448       if (prev_idx_match == NULL)
1449         {
1450           free_fail_stack_return (fs);
1451           return REG_ESPACE;
1452         }
1453       prev_idx_match_malloced = 1;
1454     }
1455   memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1456
1457   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1458     {
1459       update_regs (dfa, pmatch, prev_idx_match, cur_node, idx, nmatch);
1460
1461       if (idx == pmatch[0].rm_eo && cur_node == mctx->last_node)
1462         {
1463           int reg_idx;
1464           if (fs)
1465             {
1466               for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
1467                 if (pmatch[reg_idx].rm_so > -1 && pmatch[reg_idx].rm_eo == -1)
1468                   break;
1469               if (reg_idx == nmatch)
1470                 {
1471                   re_node_set_free (&eps_via_nodes);
1472                   if (prev_idx_match_malloced)
1473                     re_free (prev_idx_match);
1474                   return free_fail_stack_return (fs);
1475                 }
1476               cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1477                                          &eps_via_nodes);
1478             }
1479           else
1480             {
1481               re_node_set_free (&eps_via_nodes);
1482               if (prev_idx_match_malloced)
1483                 re_free (prev_idx_match);
1484               return REG_NOERROR;
1485             }
1486         }
1487
1488       /* Proceed to next node.  */
1489       cur_node = proceed_next_node (mctx, nmatch, pmatch, &idx, cur_node,
1490                                     &eps_via_nodes, fs);
1491
1492       if (BE (cur_node < 0, 0))
1493         {
1494           if (BE (cur_node == -2, 0))
1495             {
1496               re_node_set_free (&eps_via_nodes);
1497               if (prev_idx_match_malloced)
1498                 re_free (prev_idx_match);
1499               free_fail_stack_return (fs);
1500               return REG_ESPACE;
1501             }
1502           if (fs)
1503             cur_node = pop_fail_stack (fs, &idx, nmatch, pmatch,
1504                                        &eps_via_nodes);
1505           else
1506             {
1507               re_node_set_free (&eps_via_nodes);
1508               if (prev_idx_match_malloced)
1509                 re_free (prev_idx_match);
1510               return REG_NOMATCH;
1511             }
1512         }
1513     }
1514   re_node_set_free (&eps_via_nodes);
1515   if (prev_idx_match_malloced)
1516     re_free (prev_idx_match);
1517   return free_fail_stack_return (fs);
1518 }
1519
1520 static reg_errcode_t
1521 internal_function
1522 free_fail_stack_return (struct re_fail_stack_t *fs)
1523 {
1524   if (fs)
1525     {
1526       int fs_idx;
1527       for (fs_idx = 0; fs_idx < fs->num; ++fs_idx)
1528         {
1529           re_node_set_free (&fs->stack[fs_idx].eps_via_nodes);
1530           re_free (fs->stack[fs_idx].regs);
1531         }
1532       re_free (fs->stack);
1533     }
1534   return REG_NOERROR;
1535 }
1536
1537 static void
1538 internal_function
1539 update_regs (const re_dfa_t *dfa, regmatch_t *pmatch,
1540              regmatch_t *prev_idx_match, int cur_node, int cur_idx, int nmatch)
1541 {
1542   int type = dfa->nodes[cur_node].type;
1543   if (type == OP_OPEN_SUBEXP)
1544     {
1545       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1546
1547       /* We are at the first node of this sub expression.  */
1548       if (reg_num < nmatch)
1549         {
1550           pmatch[reg_num].rm_so = cur_idx;
1551           pmatch[reg_num].rm_eo = -1;
1552         }
1553     }
1554   else if (type == OP_CLOSE_SUBEXP)
1555     {
1556       int reg_num = dfa->nodes[cur_node].opr.idx + 1;
1557       if (reg_num < nmatch)
1558         {
1559           /* We are at the last node of this sub expression.  */
1560           if (pmatch[reg_num].rm_so < cur_idx)
1561             {
1562               pmatch[reg_num].rm_eo = cur_idx;
1563               /* This is a non-empty match or we are not inside an optional
1564                  subexpression.  Accept this right away.  */
1565               memcpy (prev_idx_match, pmatch, sizeof (regmatch_t) * nmatch);
1566             }
1567           else
1568             {
1569               if (dfa->nodes[cur_node].opt_subexp
1570                   && prev_idx_match[reg_num].rm_so != -1)
1571                 /* We transited through an empty match for an optional
1572                    subexpression, like (a?)*, and this is not the subexp's
1573                    first match.  Copy back the old content of the registers
1574                    so that matches of an inner subexpression are undone as
1575                    well, like in ((a?))*.  */
1576                 memcpy (pmatch, prev_idx_match, sizeof (regmatch_t) * nmatch);
1577               else
1578                 /* We completed a subexpression, but it may be part of
1579                    an optional one, so do not update PREV_IDX_MATCH.  */
1580                 pmatch[reg_num].rm_eo = cur_idx;
1581             }
1582         }
1583     }
1584 }
1585
1586 /* This function checks the STATE_LOG from the SCTX->last_str_idx to 0
1587    and sift the nodes in each states according to the following rules.
1588    Updated state_log will be wrote to STATE_LOG.
1589
1590    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1591      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1592         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1593         the LAST_NODE, we throw away the node `a'.
1594      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1595         string `s' and transit to `b':
1596         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1597            away the node `a'.
1598         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1599             thrown away, we throw away the node `a'.
1600      3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
1601         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1602            node `a'.
1603         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is thrown away,
1604             we throw away the node `a'.  */
1605
1606 #define STATE_NODE_CONTAINS(state,node) \
1607   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1608
1609 static reg_errcode_t
1610 internal_function
1611 sift_states_backward (const re_match_context_t *mctx, re_sift_context_t *sctx)
1612 {
1613   reg_errcode_t err;
1614   int null_cnt = 0;
1615   int str_idx = sctx->last_str_idx;
1616   re_node_set cur_dest;
1617
1618 #ifdef DEBUG
1619   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1620 #endif
1621
1622   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1623      transit to the last_node and the last_node itself.  */
1624   err = re_node_set_init_1 (&cur_dest, sctx->last_node);
1625   if (BE (err != REG_NOERROR, 0))
1626     return err;
1627   err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1628   if (BE (err != REG_NOERROR, 0))
1629     goto free_return;
1630
1631   /* Then check each states in the state_log.  */
1632   while (str_idx > 0)
1633     {
1634       /* Update counters.  */
1635       null_cnt = (sctx->sifted_states[str_idx] == NULL) ? null_cnt + 1 : 0;
1636       if (null_cnt > mctx->max_mb_elem_len)
1637         {
1638           memset (sctx->sifted_states, '\0',
1639                   sizeof (re_dfastate_t *) * str_idx);
1640           re_node_set_free (&cur_dest);
1641           return REG_NOERROR;
1642         }
1643       re_node_set_empty (&cur_dest);
1644       --str_idx;
1645
1646       if (mctx->state_log[str_idx])
1647         {
1648           err = build_sifted_states (mctx, sctx, str_idx, &cur_dest);
1649           if (BE (err != REG_NOERROR, 0))
1650             goto free_return;
1651         }
1652
1653       /* Add all the nodes which satisfy the following conditions:
1654          - It can epsilon transit to a node in CUR_DEST.
1655          - It is in CUR_SRC.
1656          And update state_log.  */
1657       err = update_cur_sifted_state (mctx, sctx, str_idx, &cur_dest);
1658       if (BE (err != REG_NOERROR, 0))
1659         goto free_return;
1660     }
1661   err = REG_NOERROR;
1662  free_return:
1663   re_node_set_free (&cur_dest);
1664   return err;
1665 }
1666
1667 static reg_errcode_t
1668 internal_function __attribute_warn_unused_result__
1669 build_sifted_states (const re_match_context_t *mctx, re_sift_context_t *sctx,
1670                      int str_idx, re_node_set *cur_dest)
1671 {
1672   const re_dfa_t *const dfa = mctx->dfa;
1673   const re_node_set *cur_src = &mctx->state_log[str_idx]->non_eps_nodes;
1674   int i;
1675
1676   /* Then build the next sifted state.
1677      We build the next sifted state on `cur_dest', and update
1678      `sifted_states[str_idx]' with `cur_dest'.
1679      Note:
1680      `cur_dest' is the sifted state from `state_log[str_idx + 1]'.
1681      `cur_src' points the node_set of the old `state_log[str_idx]'
1682      (with the epsilon nodes pre-filtered out).  */
1683   for (i = 0; i < cur_src->nelem; i++)
1684     {
1685       int prev_node = cur_src->elems[i];
1686       int naccepted = 0;
1687       int ret;
1688
1689 #ifdef DEBUG
1690       re_token_type_t type = dfa->nodes[prev_node].type;
1691       assert (!IS_EPSILON_NODE (type));
1692 #endif
1693 #ifdef RE_ENABLE_I18N
1694       /* If the node may accept `multi byte'.  */
1695       if (dfa->nodes[prev_node].accept_mb)
1696         naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
1697                                          str_idx, sctx->last_str_idx);
1698 #endif /* RE_ENABLE_I18N */
1699
1700       /* We don't check backreferences here.
1701          See update_cur_sifted_state().  */
1702       if (!naccepted
1703           && check_node_accept (mctx, dfa->nodes + prev_node, str_idx)
1704           && STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + 1],
1705                                   dfa->nexts[prev_node]))
1706         naccepted = 1;
1707
1708       if (naccepted == 0)
1709         continue;
1710
1711       if (sctx->limits.nelem)
1712         {
1713           int to_idx = str_idx + naccepted;
1714           if (check_dst_limits (mctx, &sctx->limits,
1715                                 dfa->nexts[prev_node], to_idx,
1716                                 prev_node, str_idx))
1717             continue;
1718         }
1719       ret = re_node_set_insert (cur_dest, prev_node);
1720       if (BE (ret == -1, 0))
1721         return REG_ESPACE;
1722     }
1723
1724   return REG_NOERROR;
1725 }
1726
1727 /* Helper functions.  */
1728
1729 static reg_errcode_t
1730 internal_function
1731 clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
1732 {
1733   int top = mctx->state_log_top;
1734
1735   if ((next_state_log_idx >= mctx->input.bufs_len
1736        && mctx->input.bufs_len < mctx->input.len)
1737       || (next_state_log_idx >= mctx->input.valid_len
1738           && mctx->input.valid_len < mctx->input.len))
1739     {
1740       reg_errcode_t err;
1741       err = extend_buffers (mctx);
1742       if (BE (err != REG_NOERROR, 0))
1743         return err;
1744     }
1745
1746   if (top < next_state_log_idx)
1747     {
1748       memset (mctx->state_log + top + 1, '\0',
1749               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1750       mctx->state_log_top = next_state_log_idx;
1751     }
1752   return REG_NOERROR;
1753 }
1754
1755 static reg_errcode_t
1756 internal_function
1757 merge_state_array (const re_dfa_t *dfa, re_dfastate_t **dst,
1758                    re_dfastate_t **src, int num)
1759 {
1760   int st_idx;
1761   reg_errcode_t err;
1762   for (st_idx = 0; st_idx < num; ++st_idx)
1763     {
1764       if (dst[st_idx] == NULL)
1765         dst[st_idx] = src[st_idx];
1766       else if (src[st_idx] != NULL)
1767         {
1768           re_node_set merged_set;
1769           err = re_node_set_init_union (&merged_set, &dst[st_idx]->nodes,
1770                                         &src[st_idx]->nodes);
1771           if (BE (err != REG_NOERROR, 0))
1772             return err;
1773           dst[st_idx] = re_acquire_state (&err, dfa, &merged_set);
1774           re_node_set_free (&merged_set);
1775           if (BE (err != REG_NOERROR, 0))
1776             return err;
1777         }
1778     }
1779   return REG_NOERROR;
1780 }
1781
1782 static reg_errcode_t
1783 internal_function
1784 update_cur_sifted_state (const re_match_context_t *mctx,
1785                          re_sift_context_t *sctx, int str_idx,
1786                          re_node_set *dest_nodes)
1787 {
1788   const re_dfa_t *const dfa = mctx->dfa;
1789   reg_errcode_t err = REG_NOERROR;
1790   const re_node_set *candidates;
1791   candidates = ((mctx->state_log[str_idx] == NULL) ? NULL
1792                 : &mctx->state_log[str_idx]->nodes);
1793
1794   if (dest_nodes->nelem == 0)
1795     sctx->sifted_states[str_idx] = NULL;
1796   else
1797     {
1798       if (candidates)
1799         {
1800           /* At first, add the nodes which can epsilon transit to a node in
1801              DEST_NODE.  */
1802           err = add_epsilon_src_nodes (dfa, dest_nodes, candidates);
1803           if (BE (err != REG_NOERROR, 0))
1804             return err;
1805
1806           /* Then, check the limitations in the current sift_context.  */
1807           if (sctx->limits.nelem)
1808             {
1809               err = check_subexp_limits (dfa, dest_nodes, candidates, &sctx->limits,
1810                                          mctx->bkref_ents, str_idx);
1811               if (BE (err != REG_NOERROR, 0))
1812                 return err;
1813             }
1814         }
1815
1816       sctx->sifted_states[str_idx] = re_acquire_state (&err, dfa, dest_nodes);
1817       if (BE (err != REG_NOERROR, 0))
1818         return err;
1819     }
1820
1821   if (candidates && mctx->state_log[str_idx]->has_backref)
1822     {
1823       err = sift_states_bkref (mctx, sctx, str_idx, candidates);
1824       if (BE (err != REG_NOERROR, 0))
1825         return err;
1826     }
1827   return REG_NOERROR;
1828 }
1829
1830 static reg_errcode_t
1831 internal_function __attribute_warn_unused_result__
1832 add_epsilon_src_nodes (const re_dfa_t *dfa, re_node_set *dest_nodes,
1833                        const re_node_set *candidates)
1834 {
1835   reg_errcode_t err = REG_NOERROR;
1836   int i;
1837
1838   re_dfastate_t *state = re_acquire_state (&err, dfa, dest_nodes);
1839   if (BE (err != REG_NOERROR, 0))
1840     return err;
1841
1842   if (!state->inveclosure.alloc)
1843     {
1844       err = re_node_set_alloc (&state->inveclosure, dest_nodes->nelem);
1845       if (BE (err != REG_NOERROR, 0))
1846         return REG_ESPACE;
1847       for (i = 0; i < dest_nodes->nelem; i++)
1848         {
1849           err = re_node_set_merge (&state->inveclosure,
1850                                    dfa->inveclosures + dest_nodes->elems[i]);
1851           if (BE (err != REG_NOERROR, 0))
1852             return REG_ESPACE;
1853         }
1854     }
1855   return re_node_set_add_intersect (dest_nodes, candidates,
1856                                     &state->inveclosure);
1857 }
1858
1859 static reg_errcode_t
1860 internal_function
1861 sub_epsilon_src_nodes (const re_dfa_t *dfa, int node, re_node_set *dest_nodes,
1862                        const re_node_set *candidates)
1863 {
1864     int ecl_idx;
1865     reg_errcode_t err;
1866     re_node_set *inv_eclosure = dfa->inveclosures + node;
1867     re_node_set except_nodes;
1868     re_node_set_init_empty (&except_nodes);
1869     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1870       {
1871         int cur_node = inv_eclosure->elems[ecl_idx];
1872         if (cur_node == node)
1873           continue;
1874         if (IS_EPSILON_NODE (dfa->nodes[cur_node].type))
1875           {
1876             int edst1 = dfa->edests[cur_node].elems[0];
1877             int edst2 = ((dfa->edests[cur_node].nelem > 1)
1878                          ? dfa->edests[cur_node].elems[1] : -1);
1879             if ((!re_node_set_contains (inv_eclosure, edst1)
1880                  && re_node_set_contains (dest_nodes, edst1))
1881                 || (edst2 > 0
1882                     && !re_node_set_contains (inv_eclosure, edst2)
1883                     && re_node_set_contains (dest_nodes, edst2)))
1884               {
1885                 err = re_node_set_add_intersect (&except_nodes, candidates,
1886                                                  dfa->inveclosures + cur_node);
1887                 if (BE (err != REG_NOERROR, 0))
1888                   {
1889                     re_node_set_free (&except_nodes);
1890                     return err;
1891                   }
1892               }
1893           }
1894       }
1895     for (ecl_idx = 0; ecl_idx < inv_eclosure->nelem; ++ecl_idx)
1896       {
1897         int cur_node = inv_eclosure->elems[ecl_idx];
1898         if (!re_node_set_contains (&except_nodes, cur_node))
1899           {
1900             int idx = re_node_set_contains (dest_nodes, cur_node) - 1;
1901             re_node_set_remove_at (dest_nodes, idx);
1902           }
1903       }
1904     re_node_set_free (&except_nodes);
1905     return REG_NOERROR;
1906 }
1907
1908 static int
1909 internal_function
1910 check_dst_limits (const re_match_context_t *mctx, re_node_set *limits,
1911                   int dst_node, int dst_idx, int src_node, int src_idx)
1912 {
1913   const re_dfa_t *const dfa = mctx->dfa;
1914   int lim_idx, src_pos, dst_pos;
1915
1916   int dst_bkref_idx = search_cur_bkref_entry (mctx, dst_idx);
1917   int src_bkref_idx = search_cur_bkref_entry (mctx, src_idx);
1918   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
1919     {
1920       int subexp_idx;
1921       struct re_backref_cache_entry *ent;
1922       ent = mctx->bkref_ents + limits->elems[lim_idx];
1923       subexp_idx = dfa->nodes[ent->node].opr.idx;
1924
1925       dst_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1926                                            subexp_idx, dst_node, dst_idx,
1927                                            dst_bkref_idx);
1928       src_pos = check_dst_limits_calc_pos (mctx, limits->elems[lim_idx],
1929                                            subexp_idx, src_node, src_idx,
1930                                            src_bkref_idx);
1931
1932       /* In case of:
1933          <src> <dst> ( <subexp> )
1934          ( <subexp> ) <src> <dst>
1935          ( <subexp1> <src> <subexp2> <dst> <subexp3> )  */
1936       if (src_pos == dst_pos)
1937         continue; /* This is unrelated limitation.  */
1938       else
1939         return 1;
1940     }
1941   return 0;
1942 }
1943
1944 static int
1945 internal_function
1946 check_dst_limits_calc_pos_1 (const re_match_context_t *mctx, int boundaries,
1947                              int subexp_idx, int from_node, int bkref_idx)
1948 {
1949   const re_dfa_t *const dfa = mctx->dfa;
1950   const re_node_set *eclosures = dfa->eclosures + from_node;
1951   int node_idx;
1952
1953   /* Else, we are on the boundary: examine the nodes on the epsilon
1954      closure.  */
1955   for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
1956     {
1957       int node = eclosures->elems[node_idx];
1958       switch (dfa->nodes[node].type)
1959         {
1960         case OP_BACK_REF:
1961           if (bkref_idx != -1)
1962             {
1963               struct re_backref_cache_entry *ent = mctx->bkref_ents + bkref_idx;
1964               do
1965                 {
1966                   int dst, cpos;
1967
1968                   if (ent->node != node)
1969                     continue;
1970
1971                   if (subexp_idx < BITSET_WORD_BITS
1972                       && !(ent->eps_reachable_subexps_map
1973                            & ((bitset_word_t) 1 << subexp_idx)))
1974                     continue;
1975
1976                   /* Recurse trying to reach the OP_OPEN_SUBEXP and
1977                      OP_CLOSE_SUBEXP cases below.  But, if the
1978                      destination node is the same node as the source
1979                      node, don't recurse because it would cause an
1980                      infinite loop: a regex that exhibits this behavior
1981                      is ()\1*\1*  */
1982                   dst = dfa->edests[node].elems[0];
1983                   if (dst == from_node)
1984                     {
1985                       if (boundaries & 1)
1986                         return -1;
1987                       else /* if (boundaries & 2) */
1988                         return 0;
1989                     }
1990
1991                   cpos =
1992                     check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
1993                                                  dst, bkref_idx);
1994                   if (cpos == -1 /* && (boundaries & 1) */)
1995                     return -1;
1996                   if (cpos == 0 && (boundaries & 2))
1997                     return 0;
1998
1999                   if (subexp_idx < BITSET_WORD_BITS)
2000                     ent->eps_reachable_subexps_map
2001                       &= ~((bitset_word_t) 1 << subexp_idx);
2002                 }
2003               while (ent++->more);
2004             }
2005           break;
2006
2007         case OP_OPEN_SUBEXP:
2008           if ((boundaries & 1) && subexp_idx == dfa->nodes[node].opr.idx)
2009             return -1;
2010           break;
2011
2012         case OP_CLOSE_SUBEXP:
2013           if ((boundaries & 2) && subexp_idx == dfa->nodes[node].opr.idx)
2014             return 0;
2015           break;
2016
2017         default:
2018             break;
2019         }
2020     }
2021
2022   return (boundaries & 2) ? 1 : 0;
2023 }
2024
2025 static int
2026 internal_function
2027 check_dst_limits_calc_pos (const re_match_context_t *mctx, int limit,
2028                            int subexp_idx, int from_node, int str_idx,
2029                            int bkref_idx)
2030 {
2031   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
2032   int boundaries;
2033
2034   /* If we are outside the range of the subexpression, return -1 or 1.  */
2035   if (str_idx < lim->subexp_from)
2036     return -1;
2037
2038   if (lim->subexp_to < str_idx)
2039     return 1;
2040
2041   /* If we are within the subexpression, return 0.  */
2042   boundaries = (str_idx == lim->subexp_from);
2043   boundaries |= (str_idx == lim->subexp_to) << 1;
2044   if (boundaries == 0)
2045     return 0;
2046
2047   /* Else, examine epsilon closure.  */
2048   return check_dst_limits_calc_pos_1 (mctx, boundaries, subexp_idx,
2049                                       from_node, bkref_idx);
2050 }
2051
2052 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2053    which are against limitations from DEST_NODES. */
2054
2055 static reg_errcode_t
2056 internal_function
2057 check_subexp_limits (const re_dfa_t *dfa, re_node_set *dest_nodes,
2058                      const re_node_set *candidates, re_node_set *limits,
2059                      struct re_backref_cache_entry *bkref_ents, int str_idx)
2060 {
2061   reg_errcode_t err;
2062   int node_idx, lim_idx;
2063
2064   for (lim_idx = 0; lim_idx < limits->nelem; ++lim_idx)
2065     {
2066       int subexp_idx;
2067       struct re_backref_cache_entry *ent;
2068       ent = bkref_ents + limits->elems[lim_idx];
2069
2070       if (str_idx <= ent->subexp_from || ent->str_idx < str_idx)
2071         continue; /* This is unrelated limitation.  */
2072
2073       subexp_idx = dfa->nodes[ent->node].opr.idx;
2074       if (ent->subexp_to == str_idx)
2075         {
2076           int ops_node = -1;
2077           int cls_node = -1;
2078           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2079             {
2080               int node = dest_nodes->elems[node_idx];
2081               re_token_type_t type = dfa->nodes[node].type;
2082               if (type == OP_OPEN_SUBEXP
2083                   && subexp_idx == dfa->nodes[node].opr.idx)
2084                 ops_node = node;
2085               else if (type == OP_CLOSE_SUBEXP
2086                        && subexp_idx == dfa->nodes[node].opr.idx)
2087                 cls_node = node;
2088             }
2089
2090           /* Check the limitation of the open subexpression.  */
2091           /* Note that (ent->subexp_to = str_idx != ent->subexp_from).  */
2092           if (ops_node >= 0)
2093             {
2094               err = sub_epsilon_src_nodes (dfa, ops_node, dest_nodes,
2095                                            candidates);
2096               if (BE (err != REG_NOERROR, 0))
2097                 return err;
2098             }
2099
2100           /* Check the limitation of the close subexpression.  */
2101           if (cls_node >= 0)
2102             for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2103               {
2104                 int node = dest_nodes->elems[node_idx];
2105                 if (!re_node_set_contains (dfa->inveclosures + node,
2106                                            cls_node)
2107                     && !re_node_set_contains (dfa->eclosures + node,
2108                                               cls_node))
2109                   {
2110                     /* It is against this limitation.
2111                        Remove it form the current sifted state.  */
2112                     err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2113                                                  candidates);
2114                     if (BE (err != REG_NOERROR, 0))
2115                       return err;
2116                     --node_idx;
2117                   }
2118               }
2119         }
2120       else /* (ent->subexp_to != str_idx)  */
2121         {
2122           for (node_idx = 0; node_idx < dest_nodes->nelem; ++node_idx)
2123             {
2124               int node = dest_nodes->elems[node_idx];
2125               re_token_type_t type = dfa->nodes[node].type;
2126               if (type == OP_CLOSE_SUBEXP || type == OP_OPEN_SUBEXP)
2127                 {
2128                   if (subexp_idx != dfa->nodes[node].opr.idx)
2129                     continue;
2130                   /* It is against this limitation.
2131                      Remove it form the current sifted state.  */
2132                   err = sub_epsilon_src_nodes (dfa, node, dest_nodes,
2133                                                candidates);
2134                   if (BE (err != REG_NOERROR, 0))
2135                     return err;
2136                 }
2137             }
2138         }
2139     }
2140   return REG_NOERROR;
2141 }
2142
2143 static reg_errcode_t
2144 internal_function __attribute_warn_unused_result__
2145 sift_states_bkref (const re_match_context_t *mctx, re_sift_context_t *sctx,
2146                    int str_idx, const re_node_set *candidates)
2147 {
2148   const re_dfa_t *const dfa = mctx->dfa;
2149   reg_errcode_t err;
2150   int node_idx, node;
2151   re_sift_context_t local_sctx;
2152   int first_idx = search_cur_bkref_entry (mctx, str_idx);
2153
2154   if (first_idx == -1)
2155     return REG_NOERROR;
2156
2157   local_sctx.sifted_states = NULL; /* Mark that it hasn't been initialized.  */
2158
2159   for (node_idx = 0; node_idx < candidates->nelem; ++node_idx)
2160     {
2161       int enabled_idx;
2162       re_token_type_t type;
2163       struct re_backref_cache_entry *entry;
2164       node = candidates->elems[node_idx];
2165       type = dfa->nodes[node].type;
2166       /* Avoid infinite loop for the REs like "()\1+".  */
2167       if (node == sctx->last_node && str_idx == sctx->last_str_idx)
2168         continue;
2169       if (type != OP_BACK_REF)
2170         continue;
2171
2172       entry = mctx->bkref_ents + first_idx;
2173       enabled_idx = first_idx;
2174       do
2175         {
2176           int subexp_len;
2177           int to_idx;
2178           int dst_node;
2179           int ret;
2180           re_dfastate_t *cur_state;
2181
2182           if (entry->node != node)
2183             continue;
2184           subexp_len = entry->subexp_to - entry->subexp_from;
2185           to_idx = str_idx + subexp_len;
2186           dst_node = (subexp_len ? dfa->nexts[node]
2187                       : dfa->edests[node].elems[0]);
2188
2189           if (to_idx > sctx->last_str_idx
2190               || sctx->sifted_states[to_idx] == NULL
2191               || !STATE_NODE_CONTAINS (sctx->sifted_states[to_idx], dst_node)
2192               || check_dst_limits (mctx, &sctx->limits, node,
2193                                    str_idx, dst_node, to_idx))
2194             continue;
2195
2196           if (local_sctx.sifted_states == NULL)
2197             {
2198               local_sctx = *sctx;
2199               err = re_node_set_init_copy (&local_sctx.limits, &sctx->limits);
2200               if (BE (err != REG_NOERROR, 0))
2201                 goto free_return;
2202             }
2203           local_sctx.last_node = node;
2204           local_sctx.last_str_idx = str_idx;
2205           ret = re_node_set_insert (&local_sctx.limits, enabled_idx);
2206           if (BE (ret < 0, 0))
2207             {
2208               err = REG_ESPACE;
2209               goto free_return;
2210             }
2211           cur_state = local_sctx.sifted_states[str_idx];
2212           err = sift_states_backward (mctx, &local_sctx);
2213           if (BE (err != REG_NOERROR, 0))
2214             goto free_return;
2215           if (sctx->limited_states != NULL)
2216             {
2217               err = merge_state_array (dfa, sctx->limited_states,
2218                                        local_sctx.sifted_states,
2219                                        str_idx + 1);
2220               if (BE (err != REG_NOERROR, 0))
2221                 goto free_return;
2222             }
2223           local_sctx.sifted_states[str_idx] = cur_state;
2224           re_node_set_remove (&local_sctx.limits, enabled_idx);
2225
2226           /* mctx->bkref_ents may have changed, reload the pointer.  */
2227           entry = mctx->bkref_ents + enabled_idx;
2228         }
2229       while (enabled_idx++, entry++->more);
2230     }
2231   err = REG_NOERROR;
2232  free_return:
2233   if (local_sctx.sifted_states != NULL)
2234     {
2235       re_node_set_free (&local_sctx.limits);
2236     }
2237
2238   return err;
2239 }
2240
2241
2242 #ifdef RE_ENABLE_I18N
2243 static int
2244 internal_function
2245 sift_states_iter_mb (const re_match_context_t *mctx, re_sift_context_t *sctx,
2246                      int node_idx, int str_idx, int max_str_idx)
2247 {
2248   const re_dfa_t *const dfa = mctx->dfa;
2249   int naccepted;
2250   /* Check the node can accept `multi byte'.  */
2251   naccepted = check_node_accept_bytes (dfa, node_idx, &mctx->input, str_idx);
2252   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
2253       !STATE_NODE_CONTAINS (sctx->sifted_states[str_idx + naccepted],
2254                             dfa->nexts[node_idx]))
2255     /* The node can't accept the `multi byte', or the
2256        destination was already thrown away, then the node
2257        could't accept the current input `multi byte'.   */
2258     naccepted = 0;
2259   /* Otherwise, it is sure that the node could accept
2260      `naccepted' bytes input.  */
2261   return naccepted;
2262 }
2263 #endif /* RE_ENABLE_I18N */
2264
2265 \f
2266 /* Functions for state transition.  */
2267
2268 /* Return the next state to which the current state STATE will transit by
2269    accepting the current input byte, and update STATE_LOG if necessary.
2270    If STATE can accept a multibyte char/collating element/back reference
2271    update the destination of STATE_LOG.  */
2272
2273 static re_dfastate_t *
2274 internal_function __attribute_warn_unused_result__
2275 transit_state (reg_errcode_t *err, re_match_context_t *mctx,
2276                re_dfastate_t *state)
2277 {
2278   re_dfastate_t **trtable;
2279   unsigned char ch;
2280
2281 #ifdef RE_ENABLE_I18N
2282   /* If the current state can accept multibyte.  */
2283   if (BE (state->accept_mb, 0))
2284     {
2285       *err = transit_state_mb (mctx, state);
2286       if (BE (*err != REG_NOERROR, 0))
2287         return NULL;
2288     }
2289 #endif /* RE_ENABLE_I18N */
2290
2291   /* Then decide the next state with the single byte.  */
2292 #if 0
2293   if (0)
2294     /* don't use transition table  */
2295     return transit_state_sb (err, mctx, state);
2296 #endif
2297
2298   /* Use transition table  */
2299   ch = re_string_fetch_byte (&mctx->input);
2300   for (;;)
2301     {
2302       trtable = state->trtable;
2303       if (BE (trtable != NULL, 1))
2304         return trtable[ch];
2305
2306       trtable = state->word_trtable;
2307       if (BE (trtable != NULL, 1))
2308         {
2309           unsigned int context;
2310           context
2311             = re_string_context_at (&mctx->input,
2312                                     re_string_cur_idx (&mctx->input) - 1,
2313                                     mctx->eflags);
2314           if (IS_WORD_CONTEXT (context))
2315             return trtable[ch + SBC_MAX];
2316           else
2317             return trtable[ch];
2318         }
2319
2320       if (!build_trtable (mctx->dfa, state))
2321         {
2322           *err = REG_ESPACE;
2323           return NULL;
2324         }
2325
2326       /* Retry, we now have a transition table.  */
2327     }
2328 }
2329
2330 /* Update the state_log if we need */
2331 re_dfastate_t *
2332 internal_function
2333 merge_state_with_log (reg_errcode_t *err, re_match_context_t *mctx,
2334                       re_dfastate_t *next_state)
2335 {
2336   const re_dfa_t *const dfa = mctx->dfa;
2337   int cur_idx = re_string_cur_idx (&mctx->input);
2338
2339   if (cur_idx > mctx->state_log_top)
2340     {
2341       mctx->state_log[cur_idx] = next_state;
2342       mctx->state_log_top = cur_idx;
2343     }
2344   else if (mctx->state_log[cur_idx] == 0)
2345     {
2346       mctx->state_log[cur_idx] = next_state;
2347     }
2348   else
2349     {
2350       re_dfastate_t *pstate;
2351       unsigned int context;
2352       re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
2353       /* If (state_log[cur_idx] != 0), it implies that cur_idx is
2354          the destination of a multibyte char/collating element/
2355          back reference.  Then the next state is the union set of
2356          these destinations and the results of the transition table.  */
2357       pstate = mctx->state_log[cur_idx];
2358       log_nodes = pstate->entrance_nodes;
2359       if (next_state != NULL)
2360         {
2361           table_nodes = next_state->entrance_nodes;
2362           *err = re_node_set_init_union (&next_nodes, table_nodes,
2363                                              log_nodes);
2364           if (BE (*err != REG_NOERROR, 0))
2365             return NULL;
2366         }
2367       else
2368         next_nodes = *log_nodes;
2369       /* Note: We already add the nodes of the initial state,
2370          then we don't need to add them here.  */
2371
2372       context = re_string_context_at (&mctx->input,
2373                                       re_string_cur_idx (&mctx->input) - 1,
2374                                       mctx->eflags);
2375       next_state = mctx->state_log[cur_idx]
2376         = re_acquire_state_context (err, dfa, &next_nodes, context);
2377       /* We don't need to check errors here, since the return value of
2378          this function is next_state and ERR is already set.  */
2379
2380       if (table_nodes != NULL)
2381         re_node_set_free (&next_nodes);
2382     }
2383
2384   if (BE (dfa->nbackref, 0) && next_state != NULL)
2385     {
2386       /* Check OP_OPEN_SUBEXP in the current state in case that we use them
2387          later.  We must check them here, since the back references in the
2388          next state might use them.  */
2389       *err = check_subexp_matching_top (mctx, &next_state->nodes,
2390                                         cur_idx);
2391       if (BE (*err != REG_NOERROR, 0))
2392         return NULL;
2393
2394       /* If the next state has back references.  */
2395       if (next_state->has_backref)
2396         {
2397           *err = transit_state_bkref (mctx, &next_state->nodes);
2398           if (BE (*err != REG_NOERROR, 0))
2399             return NULL;
2400           next_state = mctx->state_log[cur_idx];
2401         }
2402     }
2403
2404   return next_state;
2405 }
2406
2407 /* Skip bytes in the input that correspond to part of a
2408    multi-byte match, then look in the log for a state
2409    from which to restart matching.  */
2410 re_dfastate_t *
2411 internal_function
2412 find_recover_state (reg_errcode_t *err, re_match_context_t *mctx)
2413 {
2414   re_dfastate_t *cur_state;
2415   do
2416     {
2417       int max = mctx->state_log_top;
2418       int cur_str_idx = re_string_cur_idx (&mctx->input);
2419
2420       do
2421         {
2422           if (++cur_str_idx > max)
2423             return NULL;
2424           re_string_skip_bytes (&mctx->input, 1);
2425         }
2426       while (mctx->state_log[cur_str_idx] == NULL);
2427
2428       cur_state = merge_state_with_log (err, mctx, NULL);
2429     }
2430   while (*err == REG_NOERROR && cur_state == NULL);
2431   return cur_state;
2432 }
2433
2434 /* Helper functions for transit_state.  */
2435
2436 /* From the node set CUR_NODES, pick up the nodes whose types are
2437    OP_OPEN_SUBEXP and which have corresponding back references in the regular
2438    expression. And register them to use them later for evaluating the
2439    correspoding back references.  */
2440
2441 static reg_errcode_t
2442 internal_function
2443 check_subexp_matching_top (re_match_context_t *mctx, re_node_set *cur_nodes,
2444                            int str_idx)
2445 {
2446   const re_dfa_t *const dfa = mctx->dfa;
2447   int node_idx;
2448   reg_errcode_t err;
2449
2450   /* TODO: This isn't efficient.
2451            Because there might be more than one nodes whose types are
2452            OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2453            nodes.
2454            E.g. RE: (a){2}  */
2455   for (node_idx = 0; node_idx < cur_nodes->nelem; ++node_idx)
2456     {
2457       int node = cur_nodes->elems[node_idx];
2458       if (dfa->nodes[node].type == OP_OPEN_SUBEXP
2459           && dfa->nodes[node].opr.idx < BITSET_WORD_BITS
2460           && (dfa->used_bkref_map
2461               & ((bitset_word_t) 1 << dfa->nodes[node].opr.idx)))
2462         {
2463           err = match_ctx_add_subtop (mctx, node, str_idx);
2464           if (BE (err != REG_NOERROR, 0))
2465             return err;
2466         }
2467     }
2468   return REG_NOERROR;
2469 }
2470
2471 #if 0
2472 /* Return the next state to which the current state STATE will transit by
2473    accepting the current input byte.  */
2474
2475 static re_dfastate_t *
2476 transit_state_sb (reg_errcode_t *err, re_match_context_t *mctx,
2477                   re_dfastate_t *state)
2478 {
2479   const re_dfa_t *const dfa = mctx->dfa;
2480   re_node_set next_nodes;
2481   re_dfastate_t *next_state;
2482   int node_cnt, cur_str_idx = re_string_cur_idx (&mctx->input);
2483   unsigned int context;
2484
2485   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
2486   if (BE (*err != REG_NOERROR, 0))
2487     return NULL;
2488   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
2489     {
2490       int cur_node = state->nodes.elems[node_cnt];
2491       if (check_node_accept (mctx, dfa->nodes + cur_node, cur_str_idx))
2492         {
2493           *err = re_node_set_merge (&next_nodes,
2494                                     dfa->eclosures + dfa->nexts[cur_node]);
2495           if (BE (*err != REG_NOERROR, 0))
2496             {
2497               re_node_set_free (&next_nodes);
2498               return NULL;
2499             }
2500         }
2501     }
2502   context = re_string_context_at (&mctx->input, cur_str_idx, mctx->eflags);
2503   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
2504   /* We don't need to check errors here, since the return value of
2505      this function is next_state and ERR is already set.  */
2506
2507   re_node_set_free (&next_nodes);
2508   re_string_skip_bytes (&mctx->input, 1);
2509   return next_state;
2510 }
2511 #endif
2512
2513 #ifdef RE_ENABLE_I18N
2514 static reg_errcode_t
2515 internal_function
2516 transit_state_mb (re_match_context_t *mctx, re_dfastate_t *pstate)
2517 {
2518   const re_dfa_t *const dfa = mctx->dfa;
2519   reg_errcode_t err;
2520   int i;
2521
2522   for (i = 0; i < pstate->nodes.nelem; ++i)
2523     {
2524       re_node_set dest_nodes, *new_nodes;
2525       int cur_node_idx = pstate->nodes.elems[i];
2526       int naccepted, dest_idx;
2527       unsigned int context;
2528       re_dfastate_t *dest_state;
2529
2530       if (!dfa->nodes[cur_node_idx].accept_mb)
2531         continue;
2532
2533       if (dfa->nodes[cur_node_idx].constraint)
2534         {
2535           context = re_string_context_at (&mctx->input,
2536                                           re_string_cur_idx (&mctx->input),
2537                                           mctx->eflags);
2538           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
2539                                            context))
2540             continue;
2541         }
2542
2543       /* How many bytes the node can accept?  */
2544       naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
2545                                            re_string_cur_idx (&mctx->input));
2546       if (naccepted == 0)
2547         continue;
2548
2549       /* The node can accepts `naccepted' bytes.  */
2550       dest_idx = re_string_cur_idx (&mctx->input) + naccepted;
2551       mctx->max_mb_elem_len = ((mctx->max_mb_elem_len < naccepted) ? naccepted
2552                                : mctx->max_mb_elem_len);
2553       err = clean_state_log_if_needed (mctx, dest_idx);
2554       if (BE (err != REG_NOERROR, 0))
2555         return err;
2556 #ifdef DEBUG
2557       assert (dfa->nexts[cur_node_idx] != -1);
2558 #endif
2559       new_nodes = dfa->eclosures + dfa->nexts[cur_node_idx];
2560
2561       dest_state = mctx->state_log[dest_idx];
2562       if (dest_state == NULL)
2563         dest_nodes = *new_nodes;
2564       else
2565         {
2566           err = re_node_set_init_union (&dest_nodes,
2567                                         dest_state->entrance_nodes, new_nodes);
2568           if (BE (err != REG_NOERROR, 0))
2569             return err;
2570         }
2571       context = re_string_context_at (&mctx->input, dest_idx - 1,
2572                                       mctx->eflags);
2573       mctx->state_log[dest_idx]
2574         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2575       if (dest_state != NULL)
2576         re_node_set_free (&dest_nodes);
2577       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
2578         return err;
2579     }
2580   return REG_NOERROR;
2581 }
2582 #endif /* RE_ENABLE_I18N */
2583
2584 static reg_errcode_t
2585 internal_function
2586 transit_state_bkref (re_match_context_t *mctx, const re_node_set *nodes)
2587 {
2588   const re_dfa_t *const dfa = mctx->dfa;
2589   reg_errcode_t err;
2590   int i;
2591   int cur_str_idx = re_string_cur_idx (&mctx->input);
2592
2593   for (i = 0; i < nodes->nelem; ++i)
2594     {
2595       int dest_str_idx, prev_nelem, bkc_idx;
2596       int node_idx = nodes->elems[i];
2597       unsigned int context;
2598       const re_token_t *node = dfa->nodes + node_idx;
2599       re_node_set *new_dest_nodes;
2600
2601       /* Check whether `node' is a backreference or not.  */
2602       if (node->type != OP_BACK_REF)
2603         continue;
2604
2605       if (node->constraint)
2606         {
2607           context = re_string_context_at (&mctx->input, cur_str_idx,
2608                                           mctx->eflags);
2609           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2610             continue;
2611         }
2612
2613       /* `node' is a backreference.
2614          Check the substring which the substring matched.  */
2615       bkc_idx = mctx->nbkref_ents;
2616       err = get_subexp (mctx, node_idx, cur_str_idx);
2617       if (BE (err != REG_NOERROR, 0))
2618         goto free_return;
2619
2620       /* And add the epsilon closures (which is `new_dest_nodes') of
2621          the backreference to appropriate state_log.  */
2622 #ifdef DEBUG
2623       assert (dfa->nexts[node_idx] != -1);
2624 #endif
2625       for (; bkc_idx < mctx->nbkref_ents; ++bkc_idx)
2626         {
2627           int subexp_len;
2628           re_dfastate_t *dest_state;
2629           struct re_backref_cache_entry *bkref_ent;
2630           bkref_ent = mctx->bkref_ents + bkc_idx;
2631           if (bkref_ent->node != node_idx || bkref_ent->str_idx != cur_str_idx)
2632             continue;
2633           subexp_len = bkref_ent->subexp_to - bkref_ent->subexp_from;
2634           new_dest_nodes = (subexp_len == 0
2635                             ? dfa->eclosures + dfa->edests[node_idx].elems[0]
2636                             : dfa->eclosures + dfa->nexts[node_idx]);
2637           dest_str_idx = (cur_str_idx + bkref_ent->subexp_to
2638                           - bkref_ent->subexp_from);
2639           context = re_string_context_at (&mctx->input, dest_str_idx - 1,
2640                                           mctx->eflags);
2641           dest_state = mctx->state_log[dest_str_idx];
2642           prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
2643                         : mctx->state_log[cur_str_idx]->nodes.nelem);
2644           /* Add `new_dest_node' to state_log.  */
2645           if (dest_state == NULL)
2646             {
2647               mctx->state_log[dest_str_idx]
2648                 = re_acquire_state_context (&err, dfa, new_dest_nodes,
2649                                             context);
2650               if (BE (mctx->state_log[dest_str_idx] == NULL
2651                       && err != REG_NOERROR, 0))
2652                 goto free_return;
2653             }
2654           else
2655             {
2656               re_node_set dest_nodes;
2657               err = re_node_set_init_union (&dest_nodes,
2658                                             dest_state->entrance_nodes,
2659                                             new_dest_nodes);
2660               if (BE (err != REG_NOERROR, 0))
2661                 {
2662                   re_node_set_free (&dest_nodes);
2663                   goto free_return;
2664                 }
2665               mctx->state_log[dest_str_idx]
2666                 = re_acquire_state_context (&err, dfa, &dest_nodes, context);
2667               re_node_set_free (&dest_nodes);
2668               if (BE (mctx->state_log[dest_str_idx] == NULL
2669                       && err != REG_NOERROR, 0))
2670                 goto free_return;
2671             }
2672           /* We need to check recursively if the backreference can epsilon
2673              transit.  */
2674           if (subexp_len == 0
2675               && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
2676             {
2677               err = check_subexp_matching_top (mctx, new_dest_nodes,
2678                                                cur_str_idx);
2679               if (BE (err != REG_NOERROR, 0))
2680                 goto free_return;
2681               err = transit_state_bkref (mctx, new_dest_nodes);
2682               if (BE (err != REG_NOERROR, 0))
2683                 goto free_return;
2684             }
2685         }
2686     }
2687   err = REG_NOERROR;
2688  free_return:
2689   return err;
2690 }
2691
2692 /* Enumerate all the candidates which the backreference BKREF_NODE can match
2693    at BKREF_STR_IDX, and register them by match_ctx_add_entry().
2694    Note that we might collect inappropriate candidates here.
2695    However, the cost of checking them strictly here is too high, then we
2696    delay these checking for prune_impossible_nodes().  */
2697
2698 static reg_errcode_t
2699 internal_function __attribute_warn_unused_result__
2700 get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
2701 {
2702   const re_dfa_t *const dfa = mctx->dfa;
2703   int subexp_num, sub_top_idx;
2704   const char *buf = (const char *) re_string_get_buffer (&mctx->input);
2705   /* Return if we have already checked BKREF_NODE at BKREF_STR_IDX.  */
2706   int cache_idx = search_cur_bkref_entry (mctx, bkref_str_idx);
2707   if (cache_idx != -1)
2708     {
2709       const struct re_backref_cache_entry *entry
2710         = mctx->bkref_ents + cache_idx;
2711       do
2712         if (entry->node == bkref_node)
2713           return REG_NOERROR; /* We already checked it.  */
2714       while (entry++->more);
2715     }
2716
2717   subexp_num = dfa->nodes[bkref_node].opr.idx;
2718
2719   /* For each sub expression  */
2720   for (sub_top_idx = 0; sub_top_idx < mctx->nsub_tops; ++sub_top_idx)
2721     {
2722       reg_errcode_t err;
2723       re_sub_match_top_t *sub_top = mctx->sub_tops[sub_top_idx];
2724       re_sub_match_last_t *sub_last;
2725       int sub_last_idx, sl_str, bkref_str_off;
2726
2727       if (dfa->nodes[sub_top->node].opr.idx != subexp_num)
2728         continue; /* It isn't related.  */
2729
2730       sl_str = sub_top->str_idx;
2731       bkref_str_off = bkref_str_idx;
2732       /* At first, check the last node of sub expressions we already
2733          evaluated.  */
2734       for (sub_last_idx = 0; sub_last_idx < sub_top->nlasts; ++sub_last_idx)
2735         {
2736           int sl_str_diff;
2737           sub_last = sub_top->lasts[sub_last_idx];
2738           sl_str_diff = sub_last->str_idx - sl_str;
2739           /* The matched string by the sub expression match with the substring
2740              at the back reference?  */
2741           if (sl_str_diff > 0)
2742             {
2743               if (BE (bkref_str_off + sl_str_diff > mctx->input.valid_len, 0))
2744                 {
2745                   /* Not enough chars for a successful match.  */
2746                   if (bkref_str_off + sl_str_diff > mctx->input.len)
2747                     break;
2748
2749                   err = clean_state_log_if_needed (mctx,
2750                                                    bkref_str_off
2751                                                    + sl_str_diff);
2752                   if (BE (err != REG_NOERROR, 0))
2753                     return err;
2754                   buf = (const char *) re_string_get_buffer (&mctx->input);
2755                 }
2756               if (memcmp (buf + bkref_str_off, buf + sl_str, sl_str_diff) != 0)
2757                 /* We don't need to search this sub expression any more.  */
2758                 break;
2759             }
2760           bkref_str_off += sl_str_diff;
2761           sl_str += sl_str_diff;
2762           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2763                                 bkref_str_idx);
2764
2765           /* Reload buf, since the preceding call might have reallocated
2766              the buffer.  */
2767           buf = (const char *) re_string_get_buffer (&mctx->input);
2768
2769           if (err == REG_NOMATCH)
2770             continue;
2771           if (BE (err != REG_NOERROR, 0))
2772             return err;
2773         }
2774
2775       if (sub_last_idx < sub_top->nlasts)
2776         continue;
2777       if (sub_last_idx > 0)
2778         ++sl_str;
2779       /* Then, search for the other last nodes of the sub expression.  */
2780       for (; sl_str <= bkref_str_idx; ++sl_str)
2781         {
2782           int cls_node, sl_str_off;
2783           const re_node_set *nodes;
2784           sl_str_off = sl_str - sub_top->str_idx;
2785           /* The matched string by the sub expression match with the substring
2786              at the back reference?  */
2787           if (sl_str_off > 0)
2788             {
2789               if (BE (bkref_str_off >= mctx->input.valid_len, 0))
2790                 {
2791                   /* If we are at the end of the input, we cannot match.  */
2792                   if (bkref_str_off >= mctx->input.len)
2793                     break;
2794
2795                   err = extend_buffers (mctx);
2796                   if (BE (err != REG_NOERROR, 0))
2797                     return err;
2798
2799                   buf = (const char *) re_string_get_buffer (&mctx->input);
2800                 }
2801               if (buf [bkref_str_off++] != buf[sl_str - 1])
2802                 break; /* We don't need to search this sub expression
2803                           any more.  */
2804             }
2805           if (mctx->state_log[sl_str] == NULL)
2806             continue;
2807           /* Does this state have a ')' of the sub expression?  */
2808           nodes = &mctx->state_log[sl_str]->nodes;
2809           cls_node = find_subexp_node (dfa, nodes, subexp_num,
2810                                        OP_CLOSE_SUBEXP);
2811           if (cls_node == -1)
2812             continue; /* No.  */
2813           if (sub_top->path == NULL)
2814             {
2815               sub_top->path = calloc (sizeof (state_array_t),
2816                                       sl_str - sub_top->str_idx + 1);
2817               if (sub_top->path == NULL)
2818                 return REG_ESPACE;
2819             }
2820           /* Can the OP_OPEN_SUBEXP node arrive the OP_CLOSE_SUBEXP node
2821              in the current context?  */
2822           err = check_arrival (mctx, sub_top->path, sub_top->node,
2823                                sub_top->str_idx, cls_node, sl_str,
2824                                OP_CLOSE_SUBEXP);
2825           if (err == REG_NOMATCH)
2826               continue;
2827           if (BE (err != REG_NOERROR, 0))
2828               return err;
2829           sub_last = match_ctx_add_sublast (sub_top, cls_node, sl_str);
2830           if (BE (sub_last == NULL, 0))
2831             return REG_ESPACE;
2832           err = get_subexp_sub (mctx, sub_top, sub_last, bkref_node,
2833                                 bkref_str_idx);
2834           if (err == REG_NOMATCH)
2835             continue;
2836         }
2837     }
2838   return REG_NOERROR;
2839 }
2840
2841 /* Helper functions for get_subexp().  */
2842
2843 /* Check SUB_LAST can arrive to the back reference BKREF_NODE at BKREF_STR.
2844    If it can arrive, register the sub expression expressed with SUB_TOP
2845    and SUB_LAST.  */
2846
2847 static reg_errcode_t
2848 internal_function
2849 get_subexp_sub (re_match_context_t *mctx, const re_sub_match_top_t *sub_top,
2850                 re_sub_match_last_t *sub_last, int bkref_node, int bkref_str)
2851 {
2852   reg_errcode_t err;
2853   int to_idx;
2854   /* Can the subexpression arrive the back reference?  */
2855   err = check_arrival (mctx, &sub_last->path, sub_last->node,
2856                        sub_last->str_idx, bkref_node, bkref_str,
2857                        OP_OPEN_SUBEXP);
2858   if (err != REG_NOERROR)
2859     return err;
2860   err = match_ctx_add_entry (mctx, bkref_node, bkref_str, sub_top->str_idx,
2861                              sub_last->str_idx);
2862   if (BE (err != REG_NOERROR, 0))
2863     return err;
2864   to_idx = bkref_str + sub_last->str_idx - sub_top->str_idx;
2865   return clean_state_log_if_needed (mctx, to_idx);
2866 }
2867
2868 /* Find the first node which is '(' or ')' and whose index is SUBEXP_IDX.
2869    Search '(' if FL_OPEN, or search ')' otherwise.
2870    TODO: This function isn't efficient...
2871          Because there might be more than one nodes whose types are
2872          OP_OPEN_SUBEXP and whose index is SUBEXP_IDX, we must check all
2873          nodes.
2874          E.g. RE: (a){2}  */
2875
2876 static int
2877 internal_function
2878 find_subexp_node (const re_dfa_t *dfa, const re_node_set *nodes,
2879                   int subexp_idx, int type)
2880 {
2881   int cls_idx;
2882   for (cls_idx = 0; cls_idx < nodes->nelem; ++cls_idx)
2883     {
2884       int cls_node = nodes->elems[cls_idx];
2885       const re_token_t *node = dfa->nodes + cls_node;
2886       if (node->type == type
2887           && node->opr.idx == subexp_idx)
2888         return cls_node;
2889     }
2890   return -1;
2891 }
2892
2893 /* Check whether the node TOP_NODE at TOP_STR can arrive to the node
2894    LAST_NODE at LAST_STR.  We record the path onto PATH since it will be
2895    heavily reused.
2896    Return REG_NOERROR if it can arrive, or REG_NOMATCH otherwise.  */
2897
2898 static reg_errcode_t
2899 internal_function __attribute_warn_unused_result__
2900 check_arrival (re_match_context_t *mctx, state_array_t *path, int top_node,
2901                int top_str, int last_node, int last_str, int type)
2902 {
2903   const re_dfa_t *const dfa = mctx->dfa;
2904   reg_errcode_t err = REG_NOERROR;
2905   int subexp_num, backup_cur_idx, str_idx, null_cnt;
2906   re_dfastate_t *cur_state = NULL;
2907   re_node_set *cur_nodes, next_nodes;
2908   re_dfastate_t **backup_state_log;
2909   unsigned int context;
2910
2911   subexp_num = dfa->nodes[top_node].opr.idx;
2912   /* Extend the buffer if we need.  */
2913   if (BE (path->alloc < last_str + mctx->max_mb_elem_len + 1, 0))
2914     {
2915       re_dfastate_t **new_array;
2916       int old_alloc = path->alloc;
2917       path->alloc += last_str + mctx->max_mb_elem_len + 1;
2918       new_array = re_realloc (path->array, re_dfastate_t *, path->alloc);
2919       if (BE (new_array == NULL, 0))
2920         {
2921           path->alloc = old_alloc;
2922           return REG_ESPACE;
2923         }
2924       path->array = new_array;
2925       memset (new_array + old_alloc, '\0',
2926               sizeof (re_dfastate_t *) * (path->alloc - old_alloc));
2927     }
2928
2929   str_idx = path->next_idx ?: top_str;
2930
2931   /* Temporary modify MCTX.  */
2932   backup_state_log = mctx->state_log;
2933   backup_cur_idx = mctx->input.cur_idx;
2934   mctx->state_log = path->array;
2935   mctx->input.cur_idx = str_idx;
2936
2937   /* Setup initial node set.  */
2938   context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
2939   if (str_idx == top_str)
2940     {
2941       err = re_node_set_init_1 (&next_nodes, top_node);
2942       if (BE (err != REG_NOERROR, 0))
2943         return err;
2944       err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
2945       if (BE (err != REG_NOERROR, 0))
2946         {
2947           re_node_set_free (&next_nodes);
2948           return err;
2949         }
2950     }
2951   else
2952     {
2953       cur_state = mctx->state_log[str_idx];
2954       if (cur_state && cur_state->has_backref)
2955         {
2956           err = re_node_set_init_copy (&next_nodes, &cur_state->nodes);
2957           if (BE (err != REG_NOERROR, 0))
2958             return err;
2959         }
2960       else
2961         re_node_set_init_empty (&next_nodes);
2962     }
2963   if (str_idx == top_str || (cur_state && cur_state->has_backref))
2964     {
2965       if (next_nodes.nelem)
2966         {
2967           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
2968                                     subexp_num, type);
2969           if (BE (err != REG_NOERROR, 0))
2970             {
2971               re_node_set_free (&next_nodes);
2972               return err;
2973             }
2974         }
2975       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
2976       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
2977         {
2978           re_node_set_free (&next_nodes);
2979           return err;
2980         }
2981       mctx->state_log[str_idx] = cur_state;
2982     }
2983
2984   for (null_cnt = 0; str_idx < last_str && null_cnt <= mctx->max_mb_elem_len;)
2985     {
2986       re_node_set_empty (&next_nodes);
2987       if (mctx->state_log[str_idx + 1])
2988         {
2989           err = re_node_set_merge (&next_nodes,
2990                                    &mctx->state_log[str_idx + 1]->nodes);
2991           if (BE (err != REG_NOERROR, 0))
2992             {
2993               re_node_set_free (&next_nodes);
2994               return err;
2995             }
2996         }
2997       if (cur_state)
2998         {
2999           err = check_arrival_add_next_nodes (mctx, str_idx,
3000                                               &cur_state->non_eps_nodes,
3001                                               &next_nodes);
3002           if (BE (err != REG_NOERROR, 0))
3003             {
3004               re_node_set_free (&next_nodes);
3005               return err;
3006             }
3007         }
3008       ++str_idx;
3009       if (next_nodes.nelem)
3010         {
3011           err = check_arrival_expand_ecl (dfa, &next_nodes, subexp_num, type);
3012           if (BE (err != REG_NOERROR, 0))
3013             {
3014               re_node_set_free (&next_nodes);
3015               return err;
3016             }
3017           err = expand_bkref_cache (mctx, &next_nodes, str_idx,
3018                                     subexp_num, type);
3019           if (BE (err != REG_NOERROR, 0))
3020             {
3021               re_node_set_free (&next_nodes);
3022               return err;
3023             }
3024         }
3025       context = re_string_context_at (&mctx->input, str_idx - 1, mctx->eflags);
3026       cur_state = re_acquire_state_context (&err, dfa, &next_nodes, context);
3027       if (BE (cur_state == NULL && err != REG_NOERROR, 0))
3028         {
3029           re_node_set_free (&next_nodes);
3030           return err;
3031         }
3032       mctx->state_log[str_idx] = cur_state;
3033       null_cnt = cur_state == NULL ? null_cnt + 1 : 0;
3034     }
3035   re_node_set_free (&next_nodes);
3036   cur_nodes = (mctx->state_log[last_str] == NULL ? NULL
3037                : &mctx->state_log[last_str]->nodes);
3038   path->next_idx = str_idx;
3039
3040   /* Fix MCTX.  */
3041   mctx->state_log = backup_state_log;
3042   mctx->input.cur_idx = backup_cur_idx;
3043
3044   /* Then check the current node set has the node LAST_NODE.  */
3045   if (cur_nodes != NULL && re_node_set_contains (cur_nodes, last_node))
3046     return REG_NOERROR;
3047
3048   return REG_NOMATCH;
3049 }
3050
3051 /* Helper functions for check_arrival.  */
3052
3053 /* Calculate the destination nodes of CUR_NODES at STR_IDX, and append them
3054    to NEXT_NODES.
3055    TODO: This function is similar to the functions transit_state*(),
3056          however this function has many additional works.
3057          Can't we unify them?  */
3058
3059 static reg_errcode_t
3060 internal_function __attribute_warn_unused_result__
3061 check_arrival_add_next_nodes (re_match_context_t *mctx, int str_idx,
3062                               re_node_set *cur_nodes, re_node_set *next_nodes)
3063 {
3064   const re_dfa_t *const dfa = mctx->dfa;
3065   int result;
3066   int cur_idx;
3067   reg_errcode_t err = REG_NOERROR;
3068   re_node_set union_set;
3069   re_node_set_init_empty (&union_set);
3070   for (cur_idx = 0; cur_idx < cur_nodes->nelem; ++cur_idx)
3071     {
3072       int naccepted = 0;
3073       int cur_node = cur_nodes->elems[cur_idx];
3074 #ifdef DEBUG
3075       re_token_type_t type = dfa->nodes[cur_node].type;
3076       assert (!IS_EPSILON_NODE (type));
3077 #endif
3078 #ifdef RE_ENABLE_I18N
3079       /* If the node may accept `multi byte'.  */
3080       if (dfa->nodes[cur_node].accept_mb)
3081         {
3082           naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
3083                                                str_idx);
3084           if (naccepted > 1)
3085             {
3086               re_dfastate_t *dest_state;
3087               int next_node = dfa->nexts[cur_node];
3088               int next_idx = str_idx + naccepted;
3089               dest_state = mctx->state_log[next_idx];
3090               re_node_set_empty (&union_set);
3091               if (dest_state)
3092                 {
3093                   err = re_node_set_merge (&union_set, &dest_state->nodes);
3094                   if (BE (err != REG_NOERROR, 0))
3095                     {
3096                       re_node_set_free (&union_set);
3097                       return err;
3098                     }
3099                 }
3100               result = re_node_set_insert (&union_set, next_node);
3101               if (BE (result < 0, 0))
3102                 {
3103                   re_node_set_free (&union_set);
3104                   return REG_ESPACE;
3105                 }
3106               mctx->state_log[next_idx] = re_acquire_state (&err, dfa,
3107                                                             &union_set);
3108               if (BE (mctx->state_log[next_idx] == NULL
3109                       && err != REG_NOERROR, 0))
3110                 {
3111                   re_node_set_free (&union_set);
3112                   return err;
3113                 }
3114             }
3115         }
3116 #endif /* RE_ENABLE_I18N */
3117       if (naccepted
3118           || check_node_accept (mctx, dfa->nodes + cur_node, str_idx))
3119         {
3120           result = re_node_set_insert (next_nodes, dfa->nexts[cur_node]);
3121           if (BE (result < 0, 0))
3122             {
3123               re_node_set_free (&union_set);
3124               return REG_ESPACE;
3125             }
3126         }
3127     }
3128   re_node_set_free (&union_set);
3129   return REG_NOERROR;
3130 }
3131
3132 /* For all the nodes in CUR_NODES, add the epsilon closures of them to
3133    CUR_NODES, however exclude the nodes which are:
3134     - inside the sub expression whose number is EX_SUBEXP, if FL_OPEN.
3135     - out of the sub expression whose number is EX_SUBEXP, if !FL_OPEN.
3136 */
3137
3138 static reg_errcode_t
3139 internal_function
3140 check_arrival_expand_ecl (const re_dfa_t *dfa, re_node_set *cur_nodes,
3141                           int ex_subexp, int type)
3142 {
3143   reg_errcode_t err;
3144   int idx, outside_node;
3145   re_node_set new_nodes;
3146 #ifdef DEBUG
3147   assert (cur_nodes->nelem);
3148 #endif
3149   err = re_node_set_alloc (&new_nodes, cur_nodes->nelem);
3150   if (BE (err != REG_NOERROR, 0))
3151     return err;
3152   /* Create a new node set NEW_NODES with the nodes which are epsilon
3153      closures of the node in CUR_NODES.  */
3154
3155   for (idx = 0; idx < cur_nodes->nelem; ++idx)
3156     {
3157       int cur_node = cur_nodes->elems[idx];
3158       const re_node_set *eclosure = dfa->eclosures + cur_node;
3159       outside_node = find_subexp_node (dfa, eclosure, ex_subexp, type);
3160       if (outside_node == -1)
3161         {
3162           /* There are no problematic nodes, just merge them.  */
3163           err = re_node_set_merge (&new_nodes, eclosure);
3164           if (BE (err != REG_NOERROR, 0))
3165             {
3166               re_node_set_free (&new_nodes);
3167               return err;
3168             }
3169         }
3170       else
3171         {
3172           /* There are problematic nodes, re-calculate incrementally.  */
3173           err = check_arrival_expand_ecl_sub (dfa, &new_nodes, cur_node,
3174                                               ex_subexp, type);
3175           if (BE (err != REG_NOERROR, 0))
3176             {
3177               re_node_set_free (&new_nodes);
3178               return err;
3179             }
3180         }
3181     }
3182   re_node_set_free (cur_nodes);
3183   *cur_nodes = new_nodes;
3184   return REG_NOERROR;
3185 }
3186
3187 /* Helper function for check_arrival_expand_ecl.
3188    Check incrementally the epsilon closure of TARGET, and if it isn't
3189    problematic append it to DST_NODES.  */
3190
3191 static reg_errcode_t
3192 internal_function __attribute_warn_unused_result__
3193 check_arrival_expand_ecl_sub (const re_dfa_t *dfa, re_node_set *dst_nodes,
3194                               int target, int ex_subexp, int type)
3195 {
3196   int cur_node;
3197   for (cur_node = target; !re_node_set_contains (dst_nodes, cur_node);)
3198     {
3199       int err;
3200
3201       if (dfa->nodes[cur_node].type == type
3202           && dfa->nodes[cur_node].opr.idx == ex_subexp)
3203         {
3204           if (type == OP_CLOSE_SUBEXP)
3205             {
3206               err = re_node_set_insert (dst_nodes, cur_node);
3207               if (BE (err == -1, 0))
3208                 return REG_ESPACE;
3209             }
3210           break;
3211         }
3212       err = re_node_set_insert (dst_nodes, cur_node);
3213       if (BE (err == -1, 0))
3214         return REG_ESPACE;
3215       if (dfa->edests[cur_node].nelem == 0)
3216         break;
3217       if (dfa->edests[cur_node].nelem == 2)
3218         {
3219           err = check_arrival_expand_ecl_sub (dfa, dst_nodes,
3220                                               dfa->edests[cur_node].elems[1],
3221                                               ex_subexp, type);
3222           if (BE (err != REG_NOERROR, 0))
3223             return err;
3224         }
3225       cur_node = dfa->edests[cur_node].elems[0];
3226     }
3227   return REG_NOERROR;
3228 }
3229
3230
3231 /* For all the back references in the current state, calculate the
3232    destination of the back references by the appropriate entry
3233    in MCTX->BKREF_ENTS.  */
3234
3235 static reg_errcode_t
3236 internal_function __attribute_warn_unused_result__
3237 expand_bkref_cache (re_match_context_t *mctx, re_node_set *cur_nodes,
3238                     int cur_str, int subexp_num, int type)
3239 {
3240   const re_dfa_t *const dfa = mctx->dfa;
3241   reg_errcode_t err;
3242   int cache_idx_start = search_cur_bkref_entry (mctx, cur_str);
3243   struct re_backref_cache_entry *ent;
3244
3245   if (cache_idx_start == -1)
3246     return REG_NOERROR;
3247
3248  restart:
3249   ent = mctx->bkref_ents + cache_idx_start;
3250   do
3251     {
3252       int to_idx, next_node;
3253
3254       /* Is this entry ENT is appropriate?  */
3255       if (!re_node_set_contains (cur_nodes, ent->node))
3256         continue; /* No.  */
3257
3258       to_idx = cur_str + ent->subexp_to - ent->subexp_from;
3259       /* Calculate the destination of the back reference, and append it
3260          to MCTX->STATE_LOG.  */
3261       if (to_idx == cur_str)
3262         {
3263           /* The backreference did epsilon transit, we must re-check all the
3264              node in the current state.  */
3265           re_node_set new_dests;
3266           reg_errcode_t err2, err3;
3267           next_node = dfa->edests[ent->node].elems[0];
3268           if (re_node_set_contains (cur_nodes, next_node))
3269             continue;
3270           err = re_node_set_init_1 (&new_dests, next_node);
3271           err2 = check_arrival_expand_ecl (dfa, &new_dests, subexp_num, type);
3272           err3 = re_node_set_merge (cur_nodes, &new_dests);
3273           re_node_set_free (&new_dests);
3274           if (BE (err != REG_NOERROR || err2 != REG_NOERROR
3275                   || err3 != REG_NOERROR, 0))
3276             {
3277               err = (err != REG_NOERROR ? err
3278                      : (err2 != REG_NOERROR ? err2 : err3));
3279               return err;
3280             }
3281           /* TODO: It is still inefficient...  */
3282           goto restart;
3283         }
3284       else
3285         {
3286           re_node_set union_set;
3287           next_node = dfa->nexts[ent->node];
3288           if (mctx->state_log[to_idx])
3289             {
3290               int ret;
3291               if (re_node_set_contains (&mctx->state_log[to_idx]->nodes,
3292                                         next_node))
3293                 continue;
3294               err = re_node_set_init_copy (&union_set,
3295                                            &mctx->state_log[to_idx]->nodes);
3296               ret = re_node_set_insert (&union_set, next_node);
3297               if (BE (err != REG_NOERROR || ret < 0, 0))
3298                 {
3299                   re_node_set_free (&union_set);
3300                   err = err != REG_NOERROR ? err : REG_ESPACE;
3301                   return err;
3302                 }
3303             }
3304           else
3305             {
3306               err = re_node_set_init_1 (&union_set, next_node);
3307               if (BE (err != REG_NOERROR, 0))
3308                 return err;
3309             }
3310           mctx->state_log[to_idx] = re_acquire_state (&err, dfa, &union_set);
3311           re_node_set_free (&union_set);
3312           if (BE (mctx->state_log[to_idx] == NULL
3313                   && err != REG_NOERROR, 0))
3314             return err;
3315         }
3316     }
3317   while (ent++->more);
3318   return REG_NOERROR;
3319 }
3320
3321 /* Build transition table for the state.
3322    Return 1 if succeeded, otherwise return NULL.  */
3323
3324 static int
3325 internal_function
3326 build_trtable (const re_dfa_t *dfa, re_dfastate_t *state)
3327 {
3328   reg_errcode_t err;
3329   int i, j, ch, need_word_trtable = 0;
3330   bitset_word_t elem, mask;
3331   bool dests_node_malloced = false;
3332   bool dest_states_malloced = false;
3333   int ndests; /* Number of the destination states from `state'.  */
3334   re_dfastate_t **trtable;
3335   re_dfastate_t **dest_states = NULL, **dest_states_word, **dest_states_nl;
3336   re_node_set follows, *dests_node;
3337   bitset_t *dests_ch;
3338   bitset_t acceptable;
3339
3340   struct dests_alloc
3341   {
3342     re_node_set dests_node[SBC_MAX];
3343     bitset_t dests_ch[SBC_MAX];
3344   } *dests_alloc;
3345
3346   /* We build DFA states which corresponds to the destination nodes
3347      from `state'.  `dests_node[i]' represents the nodes which i-th
3348      destination state contains, and `dests_ch[i]' represents the
3349      characters which i-th destination state accepts.  */
3350   if (__libc_use_alloca (sizeof (struct dests_alloc)))
3351     dests_alloc = (struct dests_alloc *) alloca (sizeof (struct dests_alloc));
3352   else
3353     {
3354       dests_alloc = re_malloc (struct dests_alloc, 1);
3355       if (BE (dests_alloc == NULL, 0))
3356         return 0;
3357       dests_node_malloced = true;
3358     }
3359   dests_node = dests_alloc->dests_node;
3360   dests_ch = dests_alloc->dests_ch;
3361
3362   /* Initialize transiton table.  */
3363   state->word_trtable = state->trtable = NULL;
3364
3365   /* At first, group all nodes belonging to `state' into several
3366      destinations.  */
3367   ndests = group_nodes_into_DFAstates (dfa, state, dests_node, dests_ch);
3368   if (BE (ndests <= 0, 0))
3369     {
3370       if (dests_node_malloced)
3371         free (dests_alloc);
3372       /* Return 0 in case of an error, 1 otherwise.  */
3373       if (ndests == 0)
3374         {
3375           state->trtable = (re_dfastate_t **)
3376             calloc (sizeof (re_dfastate_t *), SBC_MAX);
3377           if (BE (state->trtable == NULL, 0))
3378             return 0;
3379           return 1;
3380         }
3381       return 0;
3382     }
3383
3384   err = re_node_set_alloc (&follows, ndests + 1);
3385   if (BE (err != REG_NOERROR, 0))
3386     goto out_free;
3387
3388   /* Avoid arithmetic overflow in size calculation.  */
3389   if (BE ((((SIZE_MAX - (sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX)
3390             / (3 * sizeof (re_dfastate_t *)))
3391            < ndests),
3392           0))
3393     goto out_free;
3394
3395   if (__libc_use_alloca ((sizeof (re_node_set) + sizeof (bitset_t)) * SBC_MAX
3396                          + ndests * 3 * sizeof (re_dfastate_t *)))
3397     dest_states = (re_dfastate_t **)
3398       alloca (ndests * 3 * sizeof (re_dfastate_t *));
3399   else
3400     {
3401       dest_states = (re_dfastate_t **)
3402         malloc (ndests * 3 * sizeof (re_dfastate_t *));
3403       if (BE (dest_states == NULL, 0))
3404         {
3405 out_free:
3406           if (dest_states_malloced)
3407             free (dest_states);
3408           re_node_set_free (&follows);
3409           for (i = 0; i < ndests; ++i)
3410             re_node_set_free (dests_node + i);
3411           if (dests_node_malloced)
3412             free (dests_alloc);
3413           return 0;
3414         }
3415       dest_states_malloced = true;
3416     }
3417   dest_states_word = dest_states + ndests;
3418   dest_states_nl = dest_states_word + ndests;
3419   bitset_empty (acceptable);
3420
3421   /* Then build the states for all destinations.  */
3422   for (i = 0; i < ndests; ++i)
3423     {
3424       int next_node;
3425       re_node_set_empty (&follows);
3426       /* Merge the follows of this destination states.  */
3427       for (j = 0; j < dests_node[i].nelem; ++j)
3428         {
3429           next_node = dfa->nexts[dests_node[i].elems[j]];
3430           if (next_node != -1)
3431             {
3432               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
3433               if (BE (err != REG_NOERROR, 0))
3434                 goto out_free;
3435             }
3436         }
3437       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
3438       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
3439         goto out_free;
3440       /* If the new state has context constraint,
3441          build appropriate states for these contexts.  */
3442       if (dest_states[i]->has_constraint)
3443         {
3444           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
3445                                                           CONTEXT_WORD);
3446           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
3447             goto out_free;
3448
3449           if (dest_states[i] != dest_states_word[i] && dfa->mb_cur_max > 1)
3450             need_word_trtable = 1;
3451
3452           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
3453                                                         CONTEXT_NEWLINE);
3454           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
3455             goto out_free;
3456         }
3457       else
3458         {
3459           dest_states_word[i] = dest_states[i];
3460           dest_states_nl[i] = dest_states[i];
3461         }
3462       bitset_merge (acceptable, dests_ch[i]);
3463     }
3464
3465   if (!BE (need_word_trtable, 0))
3466     {
3467       /* We don't care about whether the following character is a word
3468          character, or we are in a single-byte character set so we can
3469          discern by looking at the character code: allocate a
3470          256-entry transition table.  */
3471       trtable = state->trtable =
3472         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
3473       if (BE (trtable == NULL, 0))
3474         goto out_free;
3475
3476       /* For all characters ch...:  */
3477       for (i = 0; i < BITSET_WORDS; ++i)
3478         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3479              elem;
3480              mask <<= 1, elem >>= 1, ++ch)
3481           if (BE (elem & 1, 0))
3482             {
3483               /* There must be exactly one destination which accepts
3484                  character ch.  See group_nodes_into_DFAstates.  */
3485               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3486                 ;
3487
3488               /* j-th destination accepts the word character ch.  */
3489               if (dfa->word_char[i] & mask)
3490                 trtable[ch] = dest_states_word[j];
3491               else
3492                 trtable[ch] = dest_states[j];
3493             }
3494     }
3495   else
3496     {
3497       /* We care about whether the following character is a word
3498          character, and we are in a multi-byte character set: discern
3499          by looking at the character code: build two 256-entry
3500          transition tables, one starting at trtable[0] and one
3501          starting at trtable[SBC_MAX].  */
3502       trtable = state->word_trtable =
3503         (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), 2 * SBC_MAX);
3504       if (BE (trtable == NULL, 0))
3505         goto out_free;
3506
3507       /* For all characters ch...:  */
3508       for (i = 0; i < BITSET_WORDS; ++i)
3509         for (ch = i * BITSET_WORD_BITS, elem = acceptable[i], mask = 1;
3510              elem;
3511              mask <<= 1, elem >>= 1, ++ch)
3512           if (BE (elem & 1, 0))
3513             {
3514               /* There must be exactly one destination which accepts
3515                  character ch.  See group_nodes_into_DFAstates.  */
3516               for (j = 0; (dests_ch[j][i] & mask) == 0; ++j)
3517                 ;
3518
3519               /* j-th destination accepts the word character ch.  */
3520               trtable[ch] = dest_states[j];
3521               trtable[ch + SBC_MAX] = dest_states_word[j];
3522             }
3523     }
3524
3525   /* new line */
3526   if (bitset_contain (acceptable, NEWLINE_CHAR))
3527     {
3528       /* The current state accepts newline character.  */
3529       for (j = 0; j < ndests; ++j)
3530         if (bitset_contain (dests_ch[j], NEWLINE_CHAR))
3531           {
3532             /* k-th destination accepts newline character.  */
3533             trtable[NEWLINE_CHAR] = dest_states_nl[j];
3534             if (need_word_trtable)
3535               trtable[NEWLINE_CHAR + SBC_MAX] = dest_states_nl[j];
3536             /* There must be only one destination which accepts
3537                newline.  See group_nodes_into_DFAstates.  */
3538             break;
3539           }
3540     }
3541
3542   if (dest_states_malloced)
3543     free (dest_states);
3544
3545   re_node_set_free (&follows);
3546   for (i = 0; i < ndests; ++i)
3547     re_node_set_free (dests_node + i);
3548
3549   if (dests_node_malloced)
3550     free (dests_alloc);
3551
3552   return 1;
3553 }
3554
3555 /* Group all nodes belonging to STATE into several destinations.
3556    Then for all destinations, set the nodes belonging to the destination
3557    to DESTS_NODE[i] and set the characters accepted by the destination
3558    to DEST_CH[i].  This function return the number of destinations.  */
3559
3560 static int
3561 internal_function
3562 group_nodes_into_DFAstates (const re_dfa_t *dfa, const re_dfastate_t *state,
3563                             re_node_set *dests_node, bitset_t *dests_ch)
3564 {
3565   reg_errcode_t err;
3566   int result;
3567   int i, j, k;
3568   int ndests; /* Number of the destinations from `state'.  */
3569   bitset_t accepts; /* Characters a node can accept.  */
3570   const re_node_set *cur_nodes = &state->nodes;
3571   bitset_empty (accepts);
3572   ndests = 0;
3573
3574   /* For all the nodes belonging to `state',  */
3575   for (i = 0; i < cur_nodes->nelem; ++i)
3576     {
3577       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
3578       re_token_type_t type = node->type;
3579       unsigned int constraint = node->constraint;
3580
3581       /* Enumerate all single byte character this node can accept.  */
3582       if (type == CHARACTER)
3583         bitset_set (accepts, node->opr.c);
3584       else if (type == SIMPLE_BRACKET)
3585         {
3586           bitset_merge (accepts, node->opr.sbcset);
3587         }
3588       else if (type == OP_PERIOD)
3589         {
3590 #ifdef RE_ENABLE_I18N
3591           if (dfa->mb_cur_max > 1)
3592             bitset_merge (accepts, dfa->sb_char);
3593           else
3594 #endif
3595             bitset_set_all (accepts);
3596           if (!(dfa->syntax & RE_DOT_NEWLINE))
3597             bitset_clear (accepts, '\n');
3598           if (dfa->syntax & RE_DOT_NOT_NULL)
3599             bitset_clear (accepts, '\0');
3600         }
3601 #ifdef RE_ENABLE_I18N
3602       else if (type == OP_UTF8_PERIOD)
3603         {
3604           memset (accepts, '\xff', sizeof (bitset_t) / 2);
3605           if (!(dfa->syntax & RE_DOT_NEWLINE))
3606             bitset_clear (accepts, '\n');
3607           if (dfa->syntax & RE_DOT_NOT_NULL)
3608             bitset_clear (accepts, '\0');
3609         }
3610 #endif
3611       else
3612         continue;
3613
3614       /* Check the `accepts' and sift the characters which are not
3615          match it the context.  */
3616       if (constraint)
3617         {
3618           if (constraint & NEXT_NEWLINE_CONSTRAINT)
3619             {
3620               bool accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
3621               bitset_empty (accepts);
3622               if (accepts_newline)
3623                 bitset_set (accepts, NEWLINE_CHAR);
3624               else
3625                 continue;
3626             }
3627           if (constraint & NEXT_ENDBUF_CONSTRAINT)
3628             {
3629               bitset_empty (accepts);
3630               continue;
3631             }
3632
3633           if (constraint & NEXT_WORD_CONSTRAINT)
3634             {
3635               bitset_word_t any_set = 0;
3636               if (type == CHARACTER && !node->word_char)
3637                 {
3638                   bitset_empty (accepts);
3639                   continue;
3640                 }
3641 #ifdef RE_ENABLE_I18N
3642               if (dfa->mb_cur_max > 1)
3643                 for (j = 0; j < BITSET_WORDS; ++j)
3644                   any_set |= (accepts[j] &= (dfa->word_char[j] | ~dfa->sb_char[j]));
3645               else
3646 #endif
3647                 for (j = 0; j < BITSET_WORDS; ++j)
3648                   any_set |= (accepts[j] &= dfa->word_char[j]);
3649               if (!any_set)
3650                 continue;
3651             }
3652           if (constraint & NEXT_NOTWORD_CONSTRAINT)
3653             {
3654               bitset_word_t any_set = 0;
3655               if (type == CHARACTER && node->word_char)
3656                 {
3657                   bitset_empty (accepts);
3658                   continue;
3659                 }
3660 #ifdef RE_ENABLE_I18N
3661               if (dfa->mb_cur_max > 1)
3662                 for (j = 0; j < BITSET_WORDS; ++j)
3663                   any_set |= (accepts[j] &= ~(dfa->word_char[j] & dfa->sb_char[j]));
3664               else
3665 #endif
3666                 for (j = 0; j < BITSET_WORDS; ++j)
3667                   any_set |= (accepts[j] &= ~dfa->word_char[j]);
3668               if (!any_set)
3669                 continue;
3670             }
3671         }
3672
3673       /* Then divide `accepts' into DFA states, or create a new
3674          state.  Above, we make sure that accepts is not empty.  */
3675       for (j = 0; j < ndests; ++j)
3676         {
3677           bitset_t intersec; /* Intersection sets, see below.  */
3678           bitset_t remains;
3679           /* Flags, see below.  */
3680           bitset_word_t has_intersec, not_subset, not_consumed;
3681
3682           /* Optimization, skip if this state doesn't accept the character.  */
3683           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
3684             continue;
3685
3686           /* Enumerate the intersection set of this state and `accepts'.  */
3687           has_intersec = 0;
3688           for (k = 0; k < BITSET_WORDS; ++k)
3689             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
3690           /* And skip if the intersection set is empty.  */
3691           if (!has_intersec)
3692             continue;
3693
3694           /* Then check if this state is a subset of `accepts'.  */
3695           not_subset = not_consumed = 0;
3696           for (k = 0; k < BITSET_WORDS; ++k)
3697             {
3698               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
3699               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
3700             }
3701
3702           /* If this state isn't a subset of `accepts', create a
3703              new group state, which has the `remains'. */
3704           if (not_subset)
3705             {
3706               bitset_copy (dests_ch[ndests], remains);
3707               bitset_copy (dests_ch[j], intersec);
3708               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
3709               if (BE (err != REG_NOERROR, 0))
3710                 goto error_return;
3711               ++ndests;
3712             }
3713
3714           /* Put the position in the current group. */
3715           result = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
3716           if (BE (result < 0, 0))
3717             goto error_return;
3718
3719           /* If all characters are consumed, go to next node. */
3720           if (!not_consumed)
3721             break;
3722         }
3723       /* Some characters remain, create a new group. */
3724       if (j == ndests)
3725         {
3726           bitset_copy (dests_ch[ndests], accepts);
3727           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
3728           if (BE (err != REG_NOERROR, 0))
3729             goto error_return;
3730           ++ndests;
3731           bitset_empty (accepts);
3732         }
3733     }
3734   return ndests;
3735  error_return:
3736   for (j = 0; j < ndests; ++j)
3737     re_node_set_free (dests_node + j);
3738   return -1;
3739 }
3740
3741 #ifdef RE_ENABLE_I18N
3742 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
3743    Return the number of the bytes the node accepts.
3744    STR_IDX is the current index of the input string.
3745
3746    This function handles the nodes which can accept one character, or
3747    one collating element like '.', '[a-z]', opposite to the other nodes
3748    can only accept one byte.  */
3749
3750 static int
3751 internal_function
3752 check_node_accept_bytes (const re_dfa_t *dfa, int node_idx,
3753                          const re_string_t *input, int str_idx)
3754 {
3755   const re_token_t *node = dfa->nodes + node_idx;
3756   int char_len, elem_len;
3757   int i;
3758
3759   if (BE (node->type == OP_UTF8_PERIOD, 0))
3760     {
3761       unsigned char c = re_string_byte_at (input, str_idx), d;
3762       if (BE (c < 0xc2, 1))
3763         return 0;
3764
3765       if (str_idx + 2 > input->len)
3766         return 0;
3767
3768       d = re_string_byte_at (input, str_idx + 1);
3769       if (c < 0xe0)
3770         return (d < 0x80 || d > 0xbf) ? 0 : 2;
3771       else if (c < 0xf0)
3772         {
3773           char_len = 3;
3774           if (c == 0xe0 && d < 0xa0)
3775             return 0;
3776         }
3777       else if (c < 0xf8)
3778         {
3779           char_len = 4;
3780           if (c == 0xf0 && d < 0x90)
3781             return 0;
3782         }
3783       else if (c < 0xfc)
3784         {
3785           char_len = 5;
3786           if (c == 0xf8 && d < 0x88)
3787             return 0;
3788         }
3789       else if (c < 0xfe)
3790         {
3791           char_len = 6;
3792           if (c == 0xfc && d < 0x84)
3793             return 0;
3794         }
3795       else
3796         return 0;
3797
3798       if (str_idx + char_len > input->len)
3799         return 0;
3800
3801       for (i = 1; i < char_len; ++i)
3802         {
3803           d = re_string_byte_at (input, str_idx + i);
3804           if (d < 0x80 || d > 0xbf)
3805             return 0;
3806         }
3807       return char_len;
3808     }
3809
3810   char_len = re_string_char_size_at (input, str_idx);
3811   if (node->type == OP_PERIOD)
3812     {
3813       if (char_len <= 1)
3814         return 0;
3815       /* FIXME: I don't think this if is needed, as both '\n'
3816          and '\0' are char_len == 1.  */
3817       /* '.' accepts any one character except the following two cases.  */
3818       if ((!(dfa->syntax & RE_DOT_NEWLINE) &&
3819            re_string_byte_at (input, str_idx) == '\n') ||
3820           ((dfa->syntax & RE_DOT_NOT_NULL) &&
3821            re_string_byte_at (input, str_idx) == '\0'))
3822         return 0;
3823       return char_len;
3824     }
3825
3826   elem_len = re_string_elem_size_at (input, str_idx);
3827   if ((elem_len <= 1 && char_len <= 1) || char_len == 0)
3828     return 0;
3829
3830   if (node->type == COMPLEX_BRACKET)
3831     {
3832       const re_charset_t *cset = node->opr.mbcset;
3833 # ifdef _LIBC
3834       const unsigned char *pin
3835         = ((const unsigned char *) re_string_get_buffer (input) + str_idx);
3836       int j;
3837       uint32_t nrules;
3838 # endif /* _LIBC */
3839       int match_len = 0;
3840       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
3841                     ? re_string_wchar_at (input, str_idx) : 0);
3842
3843       /* match with multibyte character?  */
3844       for (i = 0; i < cset->nmbchars; ++i)
3845         if (wc == cset->mbchars[i])
3846           {
3847             match_len = char_len;
3848             goto check_node_accept_bytes_match;
3849           }
3850       /* match with character_class?  */
3851       for (i = 0; i < cset->nchar_classes; ++i)
3852         {
3853           wctype_t wt = cset->char_classes[i];
3854           if (__iswctype (wc, wt))
3855             {
3856               match_len = char_len;
3857               goto check_node_accept_bytes_match;
3858             }
3859         }
3860
3861 # ifdef _LIBC
3862       nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3863       if (nrules != 0)
3864         {
3865           unsigned int in_collseq = 0;
3866           const int32_t *table, *indirect;
3867           const unsigned char *weights, *extra;
3868           const char *collseqwc;
3869           /* This #include defines a local function!  */
3870 #  include <locale/weight.h>
3871
3872           /* match with collating_symbol?  */
3873           if (cset->ncoll_syms)
3874             extra = (const unsigned char *)
3875               _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
3876           for (i = 0; i < cset->ncoll_syms; ++i)
3877             {
3878               const unsigned char *coll_sym = extra + cset->coll_syms[i];
3879               /* Compare the length of input collating element and
3880                  the length of current collating element.  */
3881               if (*coll_sym != elem_len)
3882                 continue;
3883               /* Compare each bytes.  */
3884               for (j = 0; j < *coll_sym; j++)
3885                 if (pin[j] != coll_sym[1 + j])
3886                   break;
3887               if (j == *coll_sym)
3888                 {
3889                   /* Match if every bytes is equal.  */
3890                   match_len = j;
3891                   goto check_node_accept_bytes_match;
3892                 }
3893             }
3894
3895           if (cset->nranges)
3896             {
3897               if (elem_len <= char_len)
3898                 {
3899                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
3900                   in_collseq = __collseq_table_lookup (collseqwc, wc);
3901                 }
3902               else
3903                 in_collseq = find_collation_sequence_value (pin, elem_len);
3904             }
3905           /* match with range expression?  */
3906           for (i = 0; i < cset->nranges; ++i)
3907             if (cset->range_starts[i] <= in_collseq
3908                 && in_collseq <= cset->range_ends[i])
3909               {
3910                 match_len = elem_len;
3911                 goto check_node_accept_bytes_match;
3912               }
3913
3914           /* match with equivalence_class?  */
3915           if (cset->nequiv_classes)
3916             {
3917               const unsigned char *cp = pin;
3918               table = (const int32_t *)
3919                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3920               weights = (const unsigned char *)
3921                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
3922               extra = (const unsigned char *)
3923                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
3924               indirect = (const int32_t *)
3925                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
3926               int32_t idx = findidx (&cp, elem_len);
3927               if (idx > 0)
3928                 for (i = 0; i < cset->nequiv_classes; ++i)
3929                   {
3930                     int32_t equiv_class_idx = cset->equiv_classes[i];
3931                     size_t weight_len = weights[idx & 0xffffff];
3932                     if (weight_len == weights[equiv_class_idx & 0xffffff]
3933                         && (idx >> 24) == (equiv_class_idx >> 24))
3934                       {
3935                         int cnt = 0;
3936
3937                         idx &= 0xffffff;
3938                         equiv_class_idx &= 0xffffff;
3939
3940                         while (cnt <= weight_len
3941                                && (weights[equiv_class_idx + 1 + cnt]
3942                                    == weights[idx + 1 + cnt]))
3943                           ++cnt;
3944                         if (cnt > weight_len)
3945                           {
3946                             match_len = elem_len;
3947                             goto check_node_accept_bytes_match;
3948                           }
3949                       }
3950                   }
3951             }
3952         }
3953       else
3954 # endif /* _LIBC */
3955         {
3956           /* match with range expression?  */
3957 #if __GNUC__ >= 2
3958           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
3959 #else
3960           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
3961           cmp_buf[2] = wc;
3962 #endif
3963           for (i = 0; i < cset->nranges; ++i)
3964             {
3965               cmp_buf[0] = cset->range_starts[i];
3966               cmp_buf[4] = cset->range_ends[i];
3967               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
3968                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
3969                 {
3970                   match_len = char_len;
3971                   goto check_node_accept_bytes_match;
3972                 }
3973             }
3974         }
3975     check_node_accept_bytes_match:
3976       if (!cset->non_match)
3977         return match_len;
3978       else
3979         {
3980           if (match_len > 0)
3981             return 0;
3982           else
3983             return (elem_len > char_len) ? elem_len : char_len;
3984         }
3985     }
3986   return 0;
3987 }
3988
3989 # ifdef _LIBC
3990 static unsigned int
3991 internal_function
3992 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
3993 {
3994   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3995   if (nrules == 0)
3996     {
3997       if (mbs_len == 1)
3998         {
3999           /* No valid character.  Match it as a single byte character.  */
4000           const unsigned char *collseq = (const unsigned char *)
4001             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
4002           return collseq[mbs[0]];
4003         }
4004       return UINT_MAX;
4005     }
4006   else
4007     {
4008       int32_t idx;
4009       const unsigned char *extra = (const unsigned char *)
4010         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
4011       int32_t extrasize = (const unsigned char *)
4012         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
4013
4014       for (idx = 0; idx < extrasize;)
4015         {
4016           int mbs_cnt, found = 0;
4017           int32_t elem_mbs_len;
4018           /* Skip the name of collating element name.  */
4019           idx = idx + extra[idx] + 1;
4020           elem_mbs_len = extra[idx++];
4021           if (mbs_len == elem_mbs_len)
4022             {
4023               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
4024                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
4025                   break;
4026               if (mbs_cnt == elem_mbs_len)
4027                 /* Found the entry.  */
4028                 found = 1;
4029             }
4030           /* Skip the byte sequence of the collating element.  */
4031           idx += elem_mbs_len;
4032           /* Adjust for the alignment.  */
4033           idx = (idx + 3) & ~3;
4034           /* Skip the collation sequence value.  */
4035           idx += sizeof (uint32_t);
4036           /* Skip the wide char sequence of the collating element.  */
4037           idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
4038           /* If we found the entry, return the sequence value.  */
4039           if (found)
4040             return *(uint32_t *) (extra + idx);
4041           /* Skip the collation sequence value.  */
4042           idx += sizeof (uint32_t);
4043         }
4044       return UINT_MAX;
4045     }
4046 }
4047 # endif /* _LIBC */
4048 #endif /* RE_ENABLE_I18N */
4049
4050 /* Check whether the node accepts the byte which is IDX-th
4051    byte of the INPUT.  */
4052
4053 static int
4054 internal_function
4055 check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
4056                    int idx)
4057 {
4058   unsigned char ch;
4059   ch = re_string_byte_at (&mctx->input, idx);
4060   switch (node->type)
4061     {
4062     case CHARACTER:
4063       if (node->opr.c != ch)
4064         return 0;
4065       break;
4066
4067     case SIMPLE_BRACKET:
4068       if (!bitset_contain (node->opr.sbcset, ch))
4069         return 0;
4070       break;
4071
4072 #ifdef RE_ENABLE_I18N
4073     case OP_UTF8_PERIOD:
4074       if (ch >= 0x80)
4075         return 0;
4076       /* FALLTHROUGH */
4077 #endif
4078     case OP_PERIOD:
4079       if ((ch == '\n' && !(mctx->dfa->syntax & RE_DOT_NEWLINE))
4080           || (ch == '\0' && (mctx->dfa->syntax & RE_DOT_NOT_NULL)))
4081         return 0;
4082       break;
4083
4084     default:
4085       return 0;
4086     }
4087
4088   if (node->constraint)
4089     {
4090       /* The node has constraints.  Check whether the current context
4091          satisfies the constraints.  */
4092       unsigned int context = re_string_context_at (&mctx->input, idx,
4093                                                    mctx->eflags);
4094       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
4095         return 0;
4096     }
4097
4098   return 1;
4099 }
4100
4101 /* Extend the buffers, if the buffers have run out.  */
4102
4103 static reg_errcode_t
4104 internal_function __attribute_warn_unused_result__
4105 extend_buffers (re_match_context_t *mctx)
4106 {
4107   reg_errcode_t ret;
4108   re_string_t *pstr = &mctx->input;
4109
4110   /* Avoid overflow.  */
4111   if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
4112     return REG_ESPACE;
4113
4114   /* Double the lengthes of the buffers.  */
4115   ret = re_string_realloc_buffers (pstr, MIN (pstr->len, pstr->bufs_len * 2));
4116   if (BE (ret != REG_NOERROR, 0))
4117     return ret;
4118
4119   if (mctx->state_log != NULL)
4120     {
4121       /* And double the length of state_log.  */
4122       /* XXX We have no indication of the size of this buffer.  If this
4123          allocation fail we have no indication that the state_log array
4124          does not have the right size.  */
4125       re_dfastate_t **new_array = re_realloc (mctx->state_log, re_dfastate_t *,
4126                                               pstr->bufs_len + 1);
4127       if (BE (new_array == NULL, 0))
4128         return REG_ESPACE;
4129       mctx->state_log = new_array;
4130     }
4131
4132   /* Then reconstruct the buffers.  */
4133   if (pstr->icase)
4134     {
4135 #ifdef RE_ENABLE_I18N
4136       if (pstr->mb_cur_max > 1)
4137         {
4138           ret = build_wcs_upper_buffer (pstr);
4139           if (BE (ret != REG_NOERROR, 0))
4140             return ret;
4141         }
4142       else
4143 #endif /* RE_ENABLE_I18N  */
4144         build_upper_buffer (pstr);
4145     }
4146   else
4147     {
4148 #ifdef RE_ENABLE_I18N
4149       if (pstr->mb_cur_max > 1)
4150         build_wcs_buffer (pstr);
4151       else
4152 #endif /* RE_ENABLE_I18N  */
4153         {
4154           if (pstr->trans != NULL)
4155             re_string_translate_buffer (pstr);
4156         }
4157     }
4158   return REG_NOERROR;
4159 }
4160
4161 \f
4162 /* Functions for matching context.  */
4163
4164 /* Initialize MCTX.  */
4165
4166 static reg_errcode_t
4167 internal_function __attribute_warn_unused_result__
4168 match_ctx_init (re_match_context_t *mctx, int eflags, int n)
4169 {
4170   mctx->eflags = eflags;
4171   mctx->match_last = -1;
4172   if (n > 0)
4173     {
4174       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
4175       mctx->sub_tops = re_malloc (re_sub_match_top_t *, n);
4176       if (BE (mctx->bkref_ents == NULL || mctx->sub_tops == NULL, 0))
4177         return REG_ESPACE;
4178     }
4179   /* Already zero-ed by the caller.
4180      else
4181        mctx->bkref_ents = NULL;
4182      mctx->nbkref_ents = 0;
4183      mctx->nsub_tops = 0;  */
4184   mctx->abkref_ents = n;
4185   mctx->max_mb_elem_len = 1;
4186   mctx->asub_tops = n;
4187   return REG_NOERROR;
4188 }
4189
4190 /* Clean the entries which depend on the current input in MCTX.
4191    This function must be invoked when the matcher changes the start index
4192    of the input, or changes the input string.  */
4193
4194 static void
4195 internal_function
4196 match_ctx_clean (re_match_context_t *mctx)
4197 {
4198   int st_idx;
4199   for (st_idx = 0; st_idx < mctx->nsub_tops; ++st_idx)
4200     {
4201       int sl_idx;
4202       re_sub_match_top_t *top = mctx->sub_tops[st_idx];
4203       for (sl_idx = 0; sl_idx < top->nlasts; ++sl_idx)
4204         {
4205           re_sub_match_last_t *last = top->lasts[sl_idx];
4206           re_free (last->path.array);
4207           re_free (last);
4208         }
4209       re_free (top->lasts);
4210       if (top->path)
4211         {
4212           re_free (top->path->array);
4213           re_free (top->path);
4214         }
4215       free (top);
4216     }
4217
4218   mctx->nsub_tops = 0;
4219   mctx->nbkref_ents = 0;
4220 }
4221
4222 /* Free all the memory associated with MCTX.  */
4223
4224 static void
4225 internal_function
4226 match_ctx_free (re_match_context_t *mctx)
4227 {
4228   /* First, free all the memory associated with MCTX->SUB_TOPS.  */
4229   match_ctx_clean (mctx);
4230   re_free (mctx->sub_tops);
4231   re_free (mctx->bkref_ents);
4232 }
4233
4234 /* Add a new backreference entry to MCTX.
4235    Note that we assume that caller never call this function with duplicate
4236    entry, and call with STR_IDX which isn't smaller than any existing entry.
4237 */
4238
4239 static reg_errcode_t
4240 internal_function __attribute_warn_unused_result__
4241 match_ctx_add_entry (re_match_context_t *mctx, int node, int str_idx, int from,
4242                      int to)
4243 {
4244   if (mctx->nbkref_ents >= mctx->abkref_ents)
4245     {
4246       struct re_backref_cache_entry* new_entry;
4247       new_entry = re_realloc (mctx->bkref_ents, struct re_backref_cache_entry,
4248                               mctx->abkref_ents * 2);
4249       if (BE (new_entry == NULL, 0))
4250         {
4251           re_free (mctx->bkref_ents);
4252           return REG_ESPACE;
4253         }
4254       mctx->bkref_ents = new_entry;
4255       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
4256               sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
4257       mctx->abkref_ents *= 2;
4258     }
4259   if (mctx->nbkref_ents > 0
4260       && mctx->bkref_ents[mctx->nbkref_ents - 1].str_idx == str_idx)
4261     mctx->bkref_ents[mctx->nbkref_ents - 1].more = 1;
4262
4263   mctx->bkref_ents[mctx->nbkref_ents].node = node;
4264   mctx->bkref_ents[mctx->nbkref_ents].str_idx = str_idx;
4265   mctx->bkref_ents[mctx->nbkref_ents].subexp_from = from;
4266   mctx->bkref_ents[mctx->nbkref_ents].subexp_to = to;
4267
4268   /* This is a cache that saves negative results of check_dst_limits_calc_pos.
4269      If bit N is clear, means that this entry won't epsilon-transition to
4270      an OP_OPEN_SUBEXP or OP_CLOSE_SUBEXP for the N+1-th subexpression.  If
4271      it is set, check_dst_limits_calc_pos_1 will recurse and try to find one
4272      such node.
4273
4274      A backreference does not epsilon-transition unless it is empty, so set
4275      to all zeros if FROM != TO.  */
4276   mctx->bkref_ents[mctx->nbkref_ents].eps_reachable_subexps_map
4277     = (from == to ? ~0 : 0);
4278
4279   mctx->bkref_ents[mctx->nbkref_ents++].more = 0;
4280   if (mctx->max_mb_elem_len < to - from)
4281     mctx->max_mb_elem_len = to - from;
4282   return REG_NOERROR;
4283 }
4284
4285 /* Search for the first entry which has the same str_idx, or -1 if none is
4286    found.  Note that MCTX->BKREF_ENTS is already sorted by MCTX->STR_IDX.  */
4287
4288 static int
4289 internal_function
4290 search_cur_bkref_entry (const re_match_context_t *mctx, int str_idx)
4291 {
4292   int left, right, mid, last;
4293   last = right = mctx->nbkref_ents;
4294   for (left = 0; left < right;)
4295     {
4296       mid = (left + right) / 2;
4297       if (mctx->bkref_ents[mid].str_idx < str_idx)
4298         left = mid + 1;
4299       else
4300         right = mid;
4301     }
4302   if (left < last && mctx->bkref_ents[left].str_idx == str_idx)
4303     return left;
4304   else
4305     return -1;
4306 }
4307
4308 /* Register the node NODE, whose type is OP_OPEN_SUBEXP, and which matches
4309    at STR_IDX.  */
4310
4311 static reg_errcode_t
4312 internal_function __attribute_warn_unused_result__
4313 match_ctx_add_subtop (re_match_context_t *mctx, int node, int str_idx)
4314 {
4315 #ifdef DEBUG
4316   assert (mctx->sub_tops != NULL);
4317   assert (mctx->asub_tops > 0);
4318 #endif
4319   if (BE (mctx->nsub_tops == mctx->asub_tops, 0))
4320     {
4321       int new_asub_tops = mctx->asub_tops * 2;
4322       re_sub_match_top_t **new_array = re_realloc (mctx->sub_tops,
4323                                                    re_sub_match_top_t *,
4324                                                    new_asub_tops);
4325       if (BE (new_array == NULL, 0))
4326         return REG_ESPACE;
4327       mctx->sub_tops = new_array;
4328       mctx->asub_tops = new_asub_tops;
4329     }
4330   mctx->sub_tops[mctx->nsub_tops] = calloc (1, sizeof (re_sub_match_top_t));
4331   if (BE (mctx->sub_tops[mctx->nsub_tops] == NULL, 0))
4332     return REG_ESPACE;
4333   mctx->sub_tops[mctx->nsub_tops]->node = node;
4334   mctx->sub_tops[mctx->nsub_tops++]->str_idx = str_idx;
4335   return REG_NOERROR;
4336 }
4337
4338 /* Register the node NODE, whose type is OP_CLOSE_SUBEXP, and which matches
4339    at STR_IDX, whose corresponding OP_OPEN_SUBEXP is SUB_TOP.  */
4340
4341 static re_sub_match_last_t *
4342 internal_function
4343 match_ctx_add_sublast (re_sub_match_top_t *subtop, int node, int str_idx)
4344 {
4345   re_sub_match_last_t *new_entry;
4346   if (BE (subtop->nlasts == subtop->alasts, 0))
4347     {
4348       int new_alasts = 2 * subtop->alasts + 1;
4349       re_sub_match_last_t **new_array = re_realloc (subtop->lasts,
4350                                                     re_sub_match_last_t *,
4351                                                     new_alasts);
4352       if (BE (new_array == NULL, 0))
4353         return NULL;
4354       subtop->lasts = new_array;
4355       subtop->alasts = new_alasts;
4356     }
4357   new_entry = calloc (1, sizeof (re_sub_match_last_t));
4358   if (BE (new_entry != NULL, 1))
4359     {
4360       subtop->lasts[subtop->nlasts] = new_entry;
4361       new_entry->node = node;
4362       new_entry->str_idx = str_idx;
4363       ++subtop->nlasts;
4364     }
4365   return new_entry;
4366 }
4367
4368 static void
4369 internal_function
4370 sift_ctx_init (re_sift_context_t *sctx, re_dfastate_t **sifted_sts,
4371                re_dfastate_t **limited_sts, int last_node, int last_str_idx)
4372 {
4373   sctx->sifted_states = sifted_sts;
4374   sctx->limited_states = limited_sts;
4375   sctx->last_node = last_node;
4376   sctx->last_str_idx = last_str_idx;
4377   re_node_set_init_empty (&sctx->limits);
4378 }