posix/regexec.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 #include <assert.h>
  22 #include <ctype.h>
  23 #include <stdio.h>
  24 #include <stdlib.h>
  25 #include <string.h>
  26 #include <wchar.h>
  27 #include <wctype.h>
  28
  29 #ifdef _LIBC
  30 # ifndef _RE_DEFINE_LOCALE_FUNCTIONS
  31 #  define _RE_DEFINE_LOCALE_FUNCTIONS 1
  32 #  include <locale/localeinfo.h>
  33 #  include <locale/elem-hash.h>
  34 #  include <locale/coll-lookup.h>
  35 # endif
  36 #endif
  37
  38 #include "regex.h"
  39 #include "regex_internal.h"
  40
  41 static reg_errcode_t match_ctx_init (re_match_context_t *cache, int eflags,
  42                                      re_string_t *input, int n);
  43 static void match_ctx_free (re_match_context_t *cache);
  44 static reg_errcode_t match_ctx_add_entry (re_match_context_t *cache, int node,
  45                                           int from, int to);
  46 static reg_errcode_t re_search_internal (const regex_t *preg,
  47                                          const char *string, int length,
  48                                          int start, int range, int stop,
  49                                          size_t nmatch, regmatch_t pmatch[],
  50                                          int eflags);
  51 static int re_search_2_stub (struct re_pattern_buffer *bufp,
  52                              const char *string1, int length1,
  53                              const char *string2, int length2,
  54                              int start, int range, struct re_registers *regs,
  55                              int stop, int ret_len);
  56 static int re_search_stub (struct re_pattern_buffer *bufp,
  57                            const char *string, int length, int start,
  58                            int range, int stop, struct re_registers *regs,
  59                            int ret_len);
  60 static unsigned re_copy_regs (struct re_registers *regs, regmatch_t *pmatch,
  61                               int nregs, int regs_allocated);
  62 static inline re_dfastate_t *acquire_init_state_context (reg_errcode_t *err,
  63                                                          const regex_t *preg,
  64                                                          const re_match_context_t *mctx,
  65                                                          int idx);
  66 static int check_matching (const regex_t *preg, re_match_context_t *mctx,
  67                            int fl_search, int fl_longest_match);
  68 static int check_halt_node_context (const re_dfa_t *dfa, int node,
  69                                     unsigned int context);
  70 static int check_halt_state_context (const regex_t *preg,
  71                                      const re_dfastate_t *state,
  72                                      const re_match_context_t *mctx, int idx);
  73 static void update_regs (re_dfa_t *dfa, regmatch_t *pmatch, int cur_node,
  74                          int cur_idx, int nmatch);
  75 static int proceed_next_node (const regex_t *preg,
  76                               const re_match_context_t *mctx,
  77                               int *pidx, int node, re_node_set *eps_via_nodes);
  78 static reg_errcode_t set_regs (const regex_t *preg,
  79                                const re_match_context_t *mctx,
  80                                size_t nmatch, regmatch_t *pmatch, int last);
  81 #ifdef RE_ENABLE_I18N
  82 static int sift_states_iter_mb (const regex_t *preg,
  83                                 const re_match_context_t *mctx,
  84                                 int node_idx, int str_idx, int max_str_idx);
  85 #endif /* RE_ENABLE_I18N */
  86 static int sift_states_iter_bkref (const re_dfa_t *dfa,
  87                                    re_dfastate_t **state_log,
  88                                    struct re_backref_cache_entry *mctx_entry,
  89                                    int node_idx, int idx, int match_last);
  90 static reg_errcode_t sift_states_backward (const regex_t *preg,
  91                                            const re_match_context_t *mctx,
  92                                            int last_node);
  93 static reg_errcode_t clean_state_log_if_need (re_match_context_t *mctx,
  94                                               int next_state_log_idx);
  95 static reg_errcode_t add_epsilon_backreference (const re_dfa_t *dfa,
  96                                                 const re_match_context_t *mctx,
  97                                                 const re_node_set *plog,
  98                                                 int idx,
  99                                                 re_node_set *state_buf);
 100 static re_dfastate_t *transit_state (reg_errcode_t *err, const regex_t *preg,
 101                                      re_match_context_t *mctx,
 102                                      re_dfastate_t *state, int fl_search);
 103 static re_dfastate_t *transit_state_sb (reg_errcode_t *err, const regex_t *preg,
 104                                         re_dfastate_t *pstate,
 105                                         int fl_search,
 106                                         re_match_context_t *mctx);
 107 #ifdef RE_ENABLE_I18N
 108 static reg_errcode_t transit_state_mb (const regex_t *preg,
 109                                        re_dfastate_t *pstate,
 110                                        re_match_context_t *mctx);
 111 #endif /* RE_ENABLE_I18N */
 112 static reg_errcode_t transit_state_bkref (const regex_t *preg,
 113                                           re_dfastate_t *pstate,
 114                                           re_match_context_t *mctx);
 115 static reg_errcode_t transit_state_bkref_loop (const regex_t *preg,
 116                                                re_node_set *nodes,
 117                                                re_dfastate_t **work_state_log,
 118                                                re_match_context_t *mctx);
 119 static re_dfastate_t **build_trtable (const regex_t *dfa,
 120                                       const re_dfastate_t *state,
 121                                       int fl_search);
 122 #ifdef RE_ENABLE_I18N
 123 static int check_node_accept_bytes (const regex_t *preg, int node_idx,
 124                                     const re_string_t *input, int idx);
 125 # ifdef _LIBC
 126 static unsigned int find_collation_sequence_value (const char *mbs,
 127                                                    size_t name_len);
 128 # endif /* _LIBC */
 129 #endif /* RE_ENABLE_I18N */
 130 static int group_nodes_into_DFAstates (const regex_t *dfa,
 131                                        const re_dfastate_t *state,
 132                                        re_node_set *states_node,
 133                                        bitset *states_ch);
 134 static int check_node_accept (const regex_t *preg, const re_token_t *node,
 135                               const re_match_context_t *mctx, int idx);
 136 static reg_errcode_t extend_buffers (re_match_context_t *mctx);
 137 \f
 138 /* Entry point for POSIX code.  */
 139
 140 /* regexec searches for a given pattern, specified by PREG, in the
 141    string STRING.
 142
 143    If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 144    `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 145    least NMATCH elements, and we set them to the offsets of the
 146    corresponding matched substrings.
 147
 148    EFLAGS specifies `execution flags' which affect matching: if
 149    REG_NOTBOL is set, then ^ does not match at the beginning of the
 150    string; if REG_NOTEOL is set, then $ does not match at the end.
 151
 152    We return 0 if we find a match and REG_NOMATCH if not.  */
 153
 154 int
 155 regexec (preg, string, nmatch, pmatch, eflags)
 156     const regex_t *__restrict preg;
 157     const char *__restrict string;
 158     size_t nmatch;
 159     regmatch_t pmatch[];
 160     int eflags;
 161 {
 162   reg_errcode_t err;
 163   int length = strlen (string);
 164   if (preg->no_sub)
 165     err = re_search_internal (preg, string, length, 0, length, length, 0,
 166                               NULL, eflags);
 167   else
 168     err = re_search_internal (preg, string, length, 0, length, length, nmatch,
 169                               pmatch, eflags);
 170   return err != REG_NOERROR;
 171 }
 172 #ifdef _LIBC
 173 weak_alias (__regexec, regexec)
 174 #endif
 175
 176 /* Entry points for GNU code.  */
 177
 178 /* re_match, re_search, re_match_2, re_search_2
 179
 180    The former two functions operate on STRING with length LENGTH,
 181    while the later two operate on concatenation of STRING1 and STRING2
 182    with lengths LENGTH1 and LENGTH2, respectively.
 183
 184    re_match() matches the compiled pattern in BUFP against the string,
 185    starting at index START.
 186
 187    re_search() first tries matching at index START, then it tries to match
 188    starting from index START + 1, and so on.  The last start position tried
 189    is START + RANGE.  (Thus RANGE = 0 forces re_search to operate the same
 190    way as re_match().)
 191
 192    The parameter STOP of re_{match,search}_2 specifies that no match exceeding
 193    the first STOP characters of the concatenation of the strings should be
 194    concerned.
 195
 196    If REGS is not NULL, and BUFP->no_sub is not set, the offsets of the match
 197    and all groups is stroed in REGS.  (For the "_2" variants, the offsets are
 198    computed relative to the concatenation, not relative to the individual
 199    strings.)
 200
 201    On success, re_match* functions return the length of the match, re_search*
 202    return the position of the start of the match.  Return value -1 means no
 203    match was found and -2 indicates an internal error.  */
 204
 205 int
 206 re_match (bufp, string, length, start, regs)
 207     struct re_pattern_buffer *bufp;
 208     const char *string;
 209     int length, start;
 210     struct re_registers *regs;
 211 {
 212   return re_search_stub (bufp, string, length, start, 0, length, regs, 1);
 213 }
 214 #ifdef _LIBC
 215 weak_alias (__re_match, re_match)
 216 #endif
 217
 218 int
 219 re_search (bufp, string, length, start, range, regs)
 220     struct re_pattern_buffer *bufp;
 221     const char *string;
 222     int length, start, range;
 223     struct re_registers *regs;
 224 {
 225   return re_search_stub (bufp, string, length, start, range, length, regs, 0);
 226 }
 227 #ifdef _LIBC
 228 weak_alias (__re_search, re_search)
 229 #endif
 230
 231 int
 232 re_match_2 (bufp, string1, length1, string2, length2, start, regs, stop)
 233     struct re_pattern_buffer *bufp;
 234     const char *string1, *string2;
 235     int length1, length2, start, stop;
 236     struct re_registers *regs;
 237 {
 238   return re_search_2_stub (bufp, string1, length1, string2, length2,
 239                            start, 0, regs, stop, 1);
 240 }
 241 #ifdef _LIBC
 242 weak_alias (__re_match_2, re_match_2)
 243 #endif
 244
 245 int
 246 re_search_2 (bufp, string1, length1, string2, length2, start, range, regs, stop)
 247     struct re_pattern_buffer *bufp;
 248     const char *string1, *string2;
 249     int length1, length2, start, range, stop;
 250     struct re_registers *regs;
 251 {
 252   return re_search_2_stub (bufp, string1, length1, string2, length2,
 253                            start, range, regs, stop, 0);
 254 }
 255 #ifdef _LIBC
 256 weak_alias (__re_search_2, re_search_2)
 257 #endif
 258
 259 static int
 260 re_search_2_stub (bufp, string1, length1, string2, length2, start, range, regs,
 261                   stop, ret_len)
 262     struct re_pattern_buffer *bufp;
 263     const char *string1, *string2;
 264     int length1, length2, start, range, stop, ret_len;
 265     struct re_registers *regs;
 266 {
 267   const char *str;
 268   int rval;
 269   int len = length1 + length2;
 270   int free_str = 0;
 271
 272   if (BE (length1 < 0 || length2 < 0 || stop < 0, 0))
 273     return -2;
 274
 275   /* Concatenate the strings.  */
 276   if (length2 > 0)
 277     if (length1 > 0)
 278       {
 279         char *s = re_malloc (char, len);
 280
 281         if (BE (s == NULL, 0))
 282           return -2;
 283         memcpy (s, string1, length1);
 284         memcpy (s + length1, string2, length2);
 285         str = s;
 286         free_str = 1;
 287       }
 288     else
 289       str = string2;
 290   else
 291     str = string1;
 292
 293   rval = re_search_stub (bufp, str, len, start, range, stop, regs,
 294                          ret_len);
 295   if (free_str)
 296       re_free ((char *) str);
 297   return rval;
 298 }
 299
 300 /* The parameters have the same meaning as those of re_search.
 301    Additional parameters:
 302    If RET_LEN is nonzero the length of the match is returned (re_match style);
 303    otherwise the position of the match is returned.  */
 304
 305 static int
 306 re_search_stub (bufp, string, length, start, range, stop, regs, ret_len)
 307     struct re_pattern_buffer *bufp;
 308     const char *string;
 309     int length, start, range, stop, ret_len;
 310     struct re_registers *regs;
 311 {
 312   reg_errcode_t result;
 313   regmatch_t *pmatch;
 314   int nregs, rval;
 315   int eflags = 0;
 316
 317   /* Check for out-of-range.  */
 318   if (BE (start < 0 || start > length || range < 0, 0))
 319     return -1;
 320   if (BE (start + range > length, 0))
 321     range = length - start;
 322
 323   eflags |= (bufp->not_bol) ? REG_NOTBOL : 0;
 324   eflags |= (bufp->not_eol) ? REG_NOTEOL : 0;
 325
 326   /* Compile fastmap if we haven't yet.  */
 327   if (range > 0 && bufp->fastmap != NULL && !bufp->fastmap_accurate)
 328     re_compile_fastmap (bufp);
 329
 330   if (BE (bufp->no_sub, 0))
 331     regs = NULL;
 332
 333   /* We need at least 1 register.  */
 334   if (regs == NULL)
 335     nregs = 1;
 336   else if (BE (bufp->regs_allocated == REGS_FIXED &&
 337                regs->num_regs < bufp->re_nsub + 1, 0))
 338     {
 339       nregs = regs->num_regs;
 340       if (BE (nregs < 1, 0))
 341         {
 342           /* Nothing can be copied to regs.  */
 343           regs = NULL;
 344           nregs = 1;
 345         }
 346     }
 347   else
 348     nregs = bufp->re_nsub + 1;
 349   pmatch = re_malloc (regmatch_t, nregs);
 350   if (BE (pmatch == NULL, 0))
 351     return -2;
 352
 353   result = re_search_internal (bufp, string, length, start, range, stop,
 354                                nregs, pmatch, eflags);
 355
 356   rval = 0;
 357
 358   /* I hope we needn't fill ther regs with -1's when no match was found.  */
 359   if (result != REG_NOERROR)
 360     rval = -1;
 361   else if (regs != NULL)
 362     {
 363       /* If caller wants register contents data back, copy them.  */
 364       bufp->regs_allocated = re_copy_regs (regs, pmatch, nregs,
 365                                            bufp->regs_allocated);
 366       if (BE (bufp->regs_allocated == REGS_UNALLOCATED, 0))
 367         rval = -2;
 368     }
 369
 370   if (BE (rval == 0, 1))
 371     {
 372       if (ret_len)
 373         {
 374           assert (pmatch[0].rm_so == start);
 375           rval = pmatch[0].rm_eo - start;
 376         }
 377       else
 378         rval = pmatch[0].rm_so;
 379     }
 380   re_free (pmatch);
 381   return rval;
 382 }
 383
 384 static unsigned
 385 re_copy_regs (regs, pmatch, nregs, regs_allocated)
 386     struct re_registers *regs;
 387     regmatch_t *pmatch;
 388     int nregs, regs_allocated;
 389 {
 390   int rval = REGS_REALLOCATE;
 391   int i;
 392   int need_regs = nregs + 1;
 393   /* We need one extra element beyond `num_regs' for the `-1' marker GNU code
 394      uses.  */
 395
 396   /* Have the register data arrays been allocated?  */
 397   if (regs_allocated == REGS_UNALLOCATED)
 398     { /* No.  So allocate them with malloc.  */
 399       regs->start = re_malloc (regoff_t, need_regs);
 400       if (BE (regs->start == NULL, 0))
 401         return REGS_UNALLOCATED;
 402       regs->end = re_malloc (regoff_t, need_regs);
 403       if (BE (regs->end == NULL, 0))
 404         {
 405           re_free (regs->start);
 406           return REGS_UNALLOCATED;
 407         }
 408       regs->num_regs = need_regs;
 409     }
 410   else if (regs_allocated == REGS_REALLOCATE)
 411     { /* Yes.  If we need more elements than were already
 412          allocated, reallocate them.  If we need fewer, just
 413          leave it alone.  */
 414       if (need_regs > regs->num_regs)
 415         {
 416           regs->start = re_realloc (regs->start, regoff_t, need_regs);
 417           if (BE (regs->start == NULL, 0))
 418             {
 419               if (regs->end != NULL)
 420                 re_free (regs->end);
 421               return REGS_UNALLOCATED;
 422             }
 423           regs->end = re_realloc (regs->end, regoff_t, need_regs);
 424           if (BE (regs->end == NULL, 0))
 425             {
 426               re_free (regs->start);
 427               return REGS_UNALLOCATED;
 428             }
 429           regs->num_regs = need_regs;
 430         }
 431     }
 432   else
 433     {
 434       assert (regs_allocated == REGS_FIXED);
 435       /* This function may not be called with REGS_FIXED and nregs too big.  */
 436       assert (regs->num_regs >= nregs);
 437       rval = REGS_FIXED;
 438     }
 439
 440   /* Copy the regs.  */
 441   for (i = 0; i < nregs; ++i)
 442     {
 443       regs->start[i] = pmatch[i].rm_so;
 444       regs->end[i] = pmatch[i].rm_eo;
 445     }
 446   for ( ; i < regs->num_regs; ++i)
 447     regs->start[i] = regs->end[i] = -1;
 448
 449   return rval;
 450 }
 451
 452 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
 453    ENDS.  Subsequent matches using PATTERN_BUFFER and REGS will use
 454    this memory for recording register information.  STARTS and ENDS
 455    must be allocated using the malloc library routine, and must each
 456    be at least NUM_REGS * sizeof (regoff_t) bytes long.
 457
 458    If NUM_REGS == 0, then subsequent matches should allocate their own
 459    register data.
 460
 461    Unless this function is called, the first search or match using
 462    PATTERN_BUFFER will allocate its own register data, without
 463    freeing the old data.  */
 464
 465 void
 466 re_set_registers (bufp, regs, num_regs, starts, ends)
 467     struct re_pattern_buffer *bufp;
 468     struct re_registers *regs;
 469     unsigned num_regs;
 470     regoff_t *starts, *ends;
 471 {
 472   if (num_regs)
 473     {
 474       bufp->regs_allocated = REGS_REALLOCATE;
 475       regs->num_regs = num_regs;
 476       regs->start = starts;
 477       regs->end = ends;
 478     }
 479   else
 480     {
 481       bufp->regs_allocated = REGS_UNALLOCATED;
 482       regs->num_regs = 0;
 483       regs->start = regs->end = (regoff_t *) 0;
 484     }
 485 }
 486 #ifdef _LIBC
 487 weak_alias (__re_set_registers, re_set_registers)
 488 #endif
 489 \f
 490 /* Entry points compatible with 4.2 BSD regex library.  We don't define
 491    them unless specifically requested.  */
 492
 493 #if defined _REGEX_RE_COMP || defined _LIBC
 494 int
 495 # ifdef _LIBC
 496 weak_function
 497 # endif
 498 re_exec (s)
 499      const char *s;
 500 {
 501   return 0 == regexec (&re_comp_buf, s, 0, NULL, 0);
 502 }
 503 #endif /* _REGEX_RE_COMP */
 504 \f
 505 static re_node_set empty_set;
 506
 507 /* Internal entry point.  */
 508
 509 /* Searches for a compiled pattern PREG in the string STRING, whose
 510    length is LENGTH.  NMATCH, PMATCH, and EFLAGS have the same
 511    mingings with regexec.  START, and RANGE have the same meanings
 512    with re_search.
 513    Return REG_NOERROR if we find a match, and REG_NOMATCH if not,
 514    otherwise return the error code.
 515    Note: We assume front end functions already check ranges.
 516    (START + RANGE >= 0 && START + RANGE <= LENGTH)  */
 517
 518 static reg_errcode_t
 519 re_search_internal (preg, string, length, start, range, stop, nmatch, pmatch,
 520                     eflags)
 521     const regex_t *preg;
 522     const char *string;
 523     int length, start, range, stop, eflags;
 524     size_t nmatch;
 525     regmatch_t pmatch[];
 526 {
 527   reg_errcode_t err;
 528   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
 529   re_string_t input;
 530   int left_lim, right_lim, incr;
 531   int fl_longest_match, match_first, match_last = -1;
 532   re_match_context_t mctx;
 533   char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate)
 534                    ? preg->fastmap : NULL);
 535
 536   /* Check if the DFA haven't been compiled.  */
 537   if (BE (preg->used == 0 || dfa->init_state == NULL
 538           || dfa->init_state_word == NULL || dfa->init_state_nl == NULL
 539           || dfa->init_state_begbuf == NULL, 0))
 540     return REG_NOMATCH;
 541
 542   re_node_set_init_empty (&empty_set);
 543
 544   /* We must check the longest matching, if nmatch > 0.  */
 545   fl_longest_match = (nmatch != 0);
 546
 547   err = re_string_allocate (&input, string, length, dfa->nodes_len + 1,
 548                             preg->translate, preg->syntax & RE_ICASE);
 549   if (BE (err != REG_NOERROR, 0))
 550     return err;
 551   input.stop = stop;
 552
 553   err = match_ctx_init (&mctx, eflags, &input, dfa->nbackref * 2);
 554   if (BE (err != REG_NOERROR, 0))
 555     return err;
 556
 557   /* We will log all the DFA states through which the dfa pass,
 558      if nmatch > 1, or this dfa has "multibyte node", which is a
 559      back-reference or a node which can accept multibyte character or
 560      multi character collating element.  */
 561   if (nmatch > 1 || dfa->has_mb_node)
 562     {
 563       mctx.state_log = re_malloc (re_dfastate_t *, dfa->nodes_len + 1);
 564       if (BE (mctx.state_log == NULL, 0))
 565         return REG_ESPACE;
 566     }
 567   else
 568     mctx.state_log = NULL;
 569
 570 #ifdef DEBUG
 571   /* We assume front-end functions already check them.  */
 572   assert (start + range >= 0 && start + range <= length);
 573 #endif
 574
 575   match_first = start;
 576   input.tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 577                        : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 578
 579   /* Check incrementally whether of not the input string match.  */
 580   incr = (range < 0) ? -1 : 1;
 581   left_lim = (range < 0) ? start + range : start;
 582   right_lim = (range < 0) ? start : start + range;
 583
 584   for (;;)
 585     {
 586       /* At first get the current byte from input string.  */
 587       int ch;
 588       if (MB_CUR_MAX > 1 && (preg->syntax & RE_ICASE || preg->translate))
 589         {
 590           /* In this case, we can't determin easily the current byte,
 591              since it might be a component byte of a multibyte character.
 592              Then we use the constructed buffer instead.  */
 593           /* If MATCH_FIRST is out of the valid range, reconstruct the
 594              buffers.  */
 595           if (input.raw_mbs_idx + input.valid_len <= match_first)
 596             re_string_reconstruct (&input, match_first, eflags,
 597                                    preg->newline_anchor);
 598           /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 599              Note that MATCH_FIRST must not be smaller than 0.  */
 600           ch = ((match_first >= length) ? 0
 601                 : re_string_byte_at (&input, match_first - input.raw_mbs_idx));
 602         }
 603       else
 604         {
 605           /* We apply translate/conversion manually, since it is trivial
 606              in this case.  */
 607           /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
 608              Note that MATCH_FIRST must not be smaller than 0.  */
 609           ch = (match_first < length) ? (unsigned char)string[match_first] : 0;
 610           /* Apply translation if we need.  */
 611           ch = preg->translate ? preg->translate[ch] : ch;
 612           /* In case of case insensitive mode, convert to upper case.  */
 613           ch = ((preg->syntax & RE_ICASE) && islower (ch)) ? toupper (ch) : ch;
 614         }
 615
 616       /* Eliminate inappropriate one by fastmap.  */
 617       if (preg->can_be_null || fastmap == NULL || fastmap[ch])
 618         {
 619           /* Reconstruct the buffers so that the matcher can assume that
 620              the matching starts from the begining of the buffer.  */
 621           re_string_reconstruct (&input, match_first, eflags,
 622                                  preg->newline_anchor);
 623 #ifdef RE_ENABLE_I18N
 624           /* Eliminate it when it is a component of a multibyte character
 625              and isn't the head of a multibyte character.  */
 626           if (MB_CUR_MAX == 1 || re_string_first_byte (&input, 0))
 627 #endif
 628             {
 629               /* It seems to be appropriate one, then use the matcher.  */
 630               /* We assume that the matching starts from 0.  */
 631               mctx.state_log_top = mctx.nbkref_ents = mctx.max_bkref_len = 0;
 632               match_last = check_matching (preg, &mctx, 0, fl_longest_match);
 633               if (match_last != -1)
 634                 {
 635                   if (BE (match_last == -2, 0))
 636                     return REG_ESPACE;
 637                   else
 638                     break; /* We found a matching.  */
 639                 }
 640             }
 641         }
 642       /* Update counter.  */
 643       match_first += incr;
 644       if (match_first < left_lim || right_lim < match_first)
 645         break;
 646     }
 647
 648   /* Set pmatch[] if we need.  */
 649   if (match_last != -1 && nmatch > 0)
 650     {
 651       int reg_idx;
 652
 653       /* Initialize registers.  */
 654       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 655         pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1;
 656
 657       /* Set the points where matching start/end.  */
 658       pmatch[0].rm_so = 0;
 659       mctx.match_last = pmatch[0].rm_eo = match_last;
 660
 661       if (!preg->no_sub && nmatch > 1)
 662         {
 663           /* We need the ranges of all the subexpressions.  */
 664           int halt_node;
 665           re_dfastate_t *pstate = mctx.state_log[match_last];
 666 #ifdef DEBUG
 667           assert (mctx.state_log != NULL);
 668 #endif
 669           halt_node = check_halt_state_context (preg, pstate, &mctx,
 670                                                 match_last);
 671           err = sift_states_backward (preg, &mctx, halt_node);
 672           if (BE (err != REG_NOERROR, 0))
 673             return err;
 674           err = set_regs (preg, &mctx, nmatch, pmatch, halt_node);
 675           if (BE (err != REG_NOERROR, 0))
 676             return err;
 677         }
 678
 679       /* At last, add the offset to the each registers, since we slided
 680          the buffers so that We can assume that the matching starts from 0.  */
 681       for (reg_idx = 0; reg_idx < nmatch; ++reg_idx)
 682         if (pmatch[reg_idx].rm_so != -1)
 683           {
 684             pmatch[reg_idx].rm_so += match_first;
 685             pmatch[reg_idx].rm_eo += match_first;
 686           }
 687     }
 688
 689   re_free (mctx.state_log);
 690   if (dfa->nbackref)
 691     match_ctx_free (&mctx);
 692   re_string_destruct (&input);
 693   return (match_last == -1) ? REG_NOMATCH : REG_NOERROR;
 694 }
 695
 696 /* Acquire an initial state and return it.
 697    We must select appropriate initial state depending on the context,
 698    since initial states may have constraints like "\<", "^", etc..  */
 699
 700 static inline re_dfastate_t *
 701 acquire_init_state_context (err, preg, mctx, idx)
 702      reg_errcode_t *err;
 703      const regex_t *preg;
 704      const re_match_context_t *mctx;
 705      int idx;
 706 {
 707   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 708
 709   *err = REG_NOERROR;
 710   if (dfa->init_state->has_constraint)
 711     {
 712       unsigned int context;
 713       context =  re_string_context_at (mctx->input, idx - 1, mctx->eflags,
 714                                        preg->newline_anchor);
 715       if (IS_WORD_CONTEXT (context))
 716         return dfa->init_state_word;
 717       else if (IS_ORDINARY_CONTEXT (context))
 718         return dfa->init_state;
 719       else if (IS_BEGBUF_CONTEXT (context) && IS_NEWLINE_CONTEXT (context))
 720         return dfa->init_state_begbuf;
 721       else if (IS_NEWLINE_CONTEXT (context))
 722         return dfa->init_state_nl;
 723       else if (IS_BEGBUF_CONTEXT (context))
 724         {
 725           /* It is relatively rare case, then calculate on demand.  */
 726           return  re_acquire_state_context (err, dfa,
 727                                             dfa->init_state->entrance_nodes,
 728                                             context);
 729         }
 730       else
 731         /* Must not happen?  */
 732         return dfa->init_state;
 733     }
 734   else
 735     return dfa->init_state;
 736 }
 737
 738 /* Check whether the regular expression match input string INPUT or not,
 739    and return the index where the matching end, return -1 if not match,
 740    or return -2 in case of an error.
 741    FL_SEARCH means we must search where the matching starts,
 742    FL_LONGEST_MATCH means we want the POSIX longest matching.
 743    Note that the matcher assume that the maching starts from the current
 744    index of the buffer.  */
 745
 746 static int
 747 check_matching (preg, mctx, fl_search, fl_longest_match)
 748     const regex_t *preg;
 749     re_match_context_t *mctx;
 750     int fl_search, fl_longest_match;
 751 {
 752   reg_errcode_t err;
 753   int match = 0;
 754   int match_last = -1;
 755   int cur_str_idx = re_string_cur_idx (mctx->input);
 756   re_dfastate_t *cur_state;
 757
 758   cur_state = acquire_init_state_context (&err, preg, mctx, cur_str_idx);
 759   /* An initial state must not be NULL(invalid state).  */
 760   if (BE (cur_state == NULL, 0))
 761     return -2;
 762   if (mctx->state_log != NULL)
 763     mctx->state_log[cur_str_idx] = cur_state;
 764   /* If the RE accepts NULL string.  */
 765   if (cur_state->halt)
 766     {
 767       if (!cur_state->has_constraint
 768           || check_halt_state_context (preg, cur_state, mctx, cur_str_idx))
 769         {
 770           if (!fl_longest_match)
 771             return cur_str_idx;
 772           else
 773             {
 774               match_last = cur_str_idx;
 775               match = 1;
 776             }
 777         }
 778     }
 779
 780   while (!re_string_eoi (mctx->input))
 781     {
 782       cur_state = transit_state (&err, preg, mctx, cur_state,
 783                                  fl_search && !match);
 784       if (cur_state == NULL) /* Reached at the invalid state or an error.  */
 785         {
 786           cur_str_idx = re_string_cur_idx (mctx->input);
 787           if (BE (err != REG_NOERROR, 0))
 788             return -2;
 789           if (fl_search && !match)
 790             {
 791               /* Restart from initial state, since we are searching
 792                  the point from where matching start.  */
 793 #ifdef RE_ENABLE_I18N
 794               if (MB_CUR_MAX == 1
 795                   || re_string_first_byte (mctx->input, cur_str_idx))
 796 #endif /* RE_ENABLE_I18N */
 797                 cur_state = acquire_init_state_context (&err, preg, mctx,
 798                                                         cur_str_idx);
 799               if (BE (cur_state == NULL && err != REG_NOERROR, 0))
 800                 return -2;
 801               if (mctx->state_log != NULL)
 802                 mctx->state_log[cur_str_idx] = cur_state;
 803             }
 804           else if (!fl_longest_match && match)
 805             break;
 806           else /* (fl_longest_match && match) || (!fl_search && !match)  */
 807             {
 808               if (mctx->state_log == NULL)
 809                 break;
 810               else
 811                 {
 812                   int max = mctx->state_log_top;
 813                   for (; cur_str_idx <= max; ++cur_str_idx)
 814                     if (mctx->state_log[cur_str_idx] != NULL)
 815                       break;
 816                   if (cur_str_idx > max)
 817                     break;
 818                 }
 819             }
 820         }
 821
 822       if (cur_state != NULL && cur_state->halt)
 823         {
 824           /* Reached at a halt state.
 825              Check the halt state can satisfy the current context.  */
 826           if (!cur_state->has_constraint
 827               || check_halt_state_context (preg, cur_state, mctx,
 828                                            re_string_cur_idx (mctx->input)))
 829             {
 830               /* We found an appropriate halt state.  */
 831               match_last = re_string_cur_idx (mctx->input);
 832               match = 1;
 833               if (!fl_longest_match)
 834                 break;
 835             }
 836         }
 837    }
 838   return match_last;
 839 }
 840
 841 /* Check NODE match the current context.  */
 842
 843 static int check_halt_node_context (dfa, node, context)
 844     const re_dfa_t *dfa;
 845     int node;
 846     unsigned int context;
 847 {
 848   int entity;
 849   re_token_type_t type = dfa->nodes[node].type;
 850   if (type == END_OF_RE)
 851     return 1;
 852   if (type != OP_CONTEXT_NODE)
 853     return 0;
 854   entity = dfa->nodes[node].opr.ctx_info->entity;
 855   if (dfa->nodes[entity].type != END_OF_RE
 856       || NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[node].constraint, context))
 857     return 0;
 858   return 1;
 859 }
 860
 861 /* Check the halt state STATE match the current context.
 862    Return 0 if not match, if the node, STATE has, is a halt node and
 863    match the context, return the node.  */
 864
 865 static int
 866 check_halt_state_context (preg, state, mctx, idx)
 867     const regex_t *preg;
 868     const re_dfastate_t *state;
 869     const re_match_context_t *mctx;
 870     int idx;
 871 {
 872   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
 873   int i;
 874   unsigned int context;
 875 #ifdef DEBUG
 876   assert (state->halt);
 877 #endif
 878   context = re_string_context_at (mctx->input, idx, mctx->eflags,
 879                                   preg->newline_anchor);
 880   for (i = 0; i < state->nodes.nelem; ++i)
 881     if (check_halt_node_context (dfa, state->nodes.elems[i], context))
 882       return state->nodes.elems[i];
 883   return 0;
 884 }
 885
 886 /* Compute the next node to which "NFA" transit from NODE("NFA" is a NFA
 887    corresponding to the DFA).
 888    Return the destination node, and update EPS_VIA_NODES, return -1 in case
 889    of errors.  */
 890
 891 static int
 892 proceed_next_node (preg, mctx, pidx, node, eps_via_nodes)
 893     const regex_t *preg;
 894     const re_match_context_t *mctx;
 895     int *pidx, node;
 896     re_node_set *eps_via_nodes;
 897 {
 898   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
 899   int i, err, dest_node, cur_entity;
 900   dest_node = -1;
 901   cur_entity = ((dfa->nodes[node].type == OP_CONTEXT_NODE)
 902                 ? dfa->nodes[node].opr.ctx_info->entity : node);
 903   if (IS_EPSILON_NODE (dfa->nodes[node].type))
 904     {
 905       int dest_entity = INT_MAX;
 906       err = re_node_set_insert (eps_via_nodes, node);
 907       if (BE (err < 0, 0))
 908         return -1;
 909       for (i = 0; i < mctx->state_log[*pidx]->nodes.nelem; ++i)
 910         {
 911           int candidate, candidate_entity;
 912           candidate = mctx->state_log[*pidx]->nodes.elems[i];
 913           candidate_entity = ((dfa->nodes[candidate].type == OP_CONTEXT_NODE)
 914                               ? dfa->nodes[candidate].opr.ctx_info->entity
 915                               : candidate);
 916           if (!re_node_set_contains (dfa->edests + node, candidate))
 917             if (candidate == candidate_entity
 918                 || !re_node_set_contains (dfa->edests + node, candidate_entity))
 919               continue;
 920
 921           /* In order to avoid infinite loop like "(a*)*".  */
 922           if (cur_entity > candidate_entity
 923               && re_node_set_contains (eps_via_nodes, candidate))
 924             continue;
 925
 926           if (dest_entity > candidate_entity)
 927             {
 928               dest_node = candidate;
 929               dest_entity = candidate_entity;
 930             }
 931         }
 932 #ifdef DEBUG
 933       assert (dest_node != -1);
 934 #endif
 935       return dest_node;
 936     }
 937   else
 938     {
 939       int naccepted = 0, entity = node;
 940       re_token_type_t type = dfa->nodes[node].type;
 941       if (type == OP_CONTEXT_NODE)
 942         {
 943           entity = dfa->nodes[node].opr.ctx_info->entity;
 944           type = dfa->nodes[entity].type;
 945         }
 946
 947 #ifdef RE_ENABLE_I18N
 948       if (ACCEPT_MB_NODE (type))
 949         naccepted = check_node_accept_bytes (preg, entity, mctx->input, *pidx);
 950       else
 951 #endif /* RE_ENABLE_I18N */
 952       if (type == OP_BACK_REF)
 953         {
 954           for (i = 0; i < mctx->nbkref_ents; ++i)
 955             {
 956               if (mctx->bkref_ents[i].node == node
 957                   && mctx->bkref_ents[i].from == *pidx)
 958                 naccepted = mctx->bkref_ents[i].to - *pidx;
 959             }
 960           if (naccepted == 0)
 961             {
 962               err = re_node_set_insert (eps_via_nodes, node);
 963               if (BE (err < 0, 0))
 964                 return -1;
 965               dest_node = dfa->nexts[node];
 966               if (re_node_set_contains (&mctx->state_log[*pidx]->nodes,
 967                                         dest_node))
 968                 return dest_node;
 969               for (i = 0; i < mctx->state_log[*pidx]->nodes.nelem; ++i)
 970                 {
 971                   dest_node = mctx->state_log[*pidx]->nodes.elems[i];
 972                   if ((dfa->nodes[dest_node].type == OP_CONTEXT_NODE
 973                        && (dfa->nexts[node]
 974                            == dfa->nodes[dest_node].opr.ctx_info->entity)))
 975                     return dest_node;
 976                 }
 977             }
 978         }
 979
 980       if (naccepted != 0
 981           || check_node_accept (preg, dfa->nodes + node, mctx, *pidx))
 982         {
 983           dest_node = dfa->nexts[node];
 984           *pidx = (naccepted == 0) ? *pidx + 1 : *pidx + naccepted;
 985 #ifdef DEBUG
 986           assert (mctx->state_log[*pidx] != NULL);
 987 #endif
 988           re_node_set_empty (eps_via_nodes);
 989           return dest_node;
 990         }
 991     }
 992   /* Must not reach here.  */
 993 #ifdef DEBUG
 994   assert (0);
 995 #endif
 996   return 0;
 997 }
 998
 999 /* Set the positions where the subexpressions are starts/ends to registers
1000    PMATCH.
1001    Note: We assume that pmatch[0] is already set, and
1002    pmatch[i].rm_so == pmatch[i].rm_eo == -1 (i > 1).  */
1003
1004 static reg_errcode_t
1005 set_regs (preg, mctx, nmatch, pmatch, last_node)
1006     const regex_t *preg;
1007     const re_match_context_t *mctx;
1008     size_t nmatch;
1009     regmatch_t *pmatch;
1010     int last_node;
1011 {
1012   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1013   int idx, cur_node, real_nmatch;
1014   re_node_set eps_via_nodes;
1015 #ifdef DEBUG
1016   assert (nmatch > 1);
1017   assert (mctx->state_log != NULL);
1018 #endif
1019   cur_node = dfa->init_node;
1020   real_nmatch = (nmatch <= preg->re_nsub) ? nmatch : preg->re_nsub + 1;
1021   re_node_set_init_empty (&eps_via_nodes);
1022   for (idx = pmatch[0].rm_so; idx <= pmatch[0].rm_eo ;)
1023     {
1024       update_regs (dfa, pmatch, cur_node, idx, real_nmatch);
1025       if (idx == pmatch[0].rm_eo && cur_node == last_node)
1026         break;
1027
1028       /* Proceed to next node.  */
1029       cur_node = proceed_next_node (preg, mctx, &idx, cur_node, &eps_via_nodes);
1030       if (BE (cur_node < 0, 0))
1031         return REG_ESPACE;
1032     }
1033   re_node_set_free (&eps_via_nodes);
1034   return REG_NOERROR;
1035 }
1036
1037 static void
1038 update_regs (dfa, pmatch, cur_node, cur_idx, nmatch)
1039      re_dfa_t *dfa;
1040      regmatch_t *pmatch;
1041      int cur_node, cur_idx, nmatch;
1042 {
1043   int type = dfa->nodes[cur_node].type;
1044   int reg_num;
1045   if (type != OP_OPEN_SUBEXP && type != OP_CLOSE_SUBEXP)
1046     return;
1047   reg_num = dfa->nodes[cur_node].opr.idx + 1;
1048   if (reg_num >= nmatch)
1049     return;
1050   if (type == OP_OPEN_SUBEXP)
1051     {
1052       /* We are at the first node of this sub expression.  */
1053       pmatch[reg_num].rm_so = cur_idx;
1054       pmatch[reg_num].rm_eo = -1;
1055     }
1056   else if (type == OP_CLOSE_SUBEXP)
1057     /* We are at the first node of this sub expression.  */
1058     pmatch[reg_num].rm_eo = cur_idx;
1059  }
1060
1061 #define NUMBER_OF_STATE 1
1062
1063 /* This function checks the STATE_LOG from the MCTX->match_last to 0
1064    and sift the nodes in each states according to the following rules.
1065    Updated state_log will be wrote to STATE_LOG.
1066
1067    Rules: We throw away the Node `a' in the STATE_LOG[STR_IDX] if...
1068      1. When STR_IDX == MATCH_LAST(the last index in the state_log):
1069         If `a' isn't the LAST_NODE and `a' can't epsilon transit to
1070         the LAST_NODE, we throw away the node `a'.
1071      2. When 0 <= STR_IDX < MATCH_LAST and `a' accepts
1072         string `s' and transit to `b':
1073         i. If 'b' isn't in the STATE_LOG[STR_IDX+strlen('s')], we throw
1074            away the node `a'.
1075         ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
1076             throwed away, we throw away the node `a'.
1077      3. When 0 <= STR_IDX < n and 'a' epsilon transit to 'b':
1078         i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
1079            node `a'.
1080         ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is throwed away,
1081             we throw away the node `a'.  */
1082
1083 #define STATE_NODE_CONTAINS(state,node) \
1084   ((state) != NULL && re_node_set_contains (&(state)->nodes, node))
1085
1086 static reg_errcode_t
1087 sift_states_backward (preg, mctx, last_node)
1088     const regex_t *preg;
1089     const re_match_context_t *mctx;
1090     int last_node;
1091 {
1092   reg_errcode_t err;
1093   re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
1094   re_node_set state_buf;
1095   int str_idx = mctx->match_last;
1096   re_node_set *plog;    /* Points the state_log[str_idx]->nodes  */
1097
1098 #ifdef DEBUG
1099   assert (mctx->state_log != NULL && mctx->state_log[str_idx] != NULL);
1100 #endif
1101   err = re_node_set_alloc (&state_buf, NUMBER_OF_STATE);
1102   if (BE (err != REG_NOERROR, 0))
1103     return err;
1104   plog = &mctx->state_log[str_idx]->nodes;
1105
1106   /* Build sifted state_log[str_idx].  It has the nodes which can epsilon
1107      transit to the last_node and the last_node itself.  */
1108   err = re_node_set_intersect (&state_buf, plog, dfa->inveclosures + last_node);
1109   if (BE (err != REG_NOERROR, 0))
1110     return err;
1111
1112   if (mctx->state_log[str_idx] != NULL
1113       && mctx->state_log[str_idx]->has_backref)
1114     {
1115       err = add_epsilon_backreference (dfa, mctx, plog, str_idx, &state_buf);
1116       if (BE (err != REG_NOERROR, 0))
1117         return err;
1118     }
1119
1120   /* Update state log.  */
1121   mctx->state_log[str_idx] = re_acquire_state (&err, dfa, &state_buf);
1122   if (BE (mctx->state_log[str_idx] == NULL && err != REG_NOERROR, 0))
1123     return err;
1124
1125   /* Then check each states in the state_log.  */
1126   while (str_idx > 0)
1127     {
1128       int i, j;
1129       /* Update counters.  */
1130       re_node_set_empty (&state_buf);
1131       --str_idx;
1132       plog = ((mctx->state_log[str_idx] == NULL) ? &empty_set
1133               : &mctx->state_log[str_idx]->nodes);
1134
1135       /* Then build the next sifted state.
1136          We build the next sifted state on `state_buf', and update
1137          `state_log[str_idx]' with `state_buf'.
1138          Note:
1139          `state_buf' is the sifted state from `state_log[str_idx + 1]'.
1140          `plog' points the node_set of the old `state_log[str_idx]'.  */
1141       for (i = 0; i < plog->nelem; i++)
1142         {
1143           int prev_node = plog->elems[i];
1144           int entity = prev_node;
1145           int naccepted = 0;
1146           re_token_type_t type = dfa->nodes[prev_node].type;
1147           if (type == OP_CONTEXT_NODE)
1148             {
1149               entity = dfa->nodes[prev_node].opr.ctx_info->entity;
1150               type = dfa->nodes[entity].type;
1151             }
1152
1153 #ifdef RE_ENABLE_I18N
1154           /* If the node may accept `multi byte'.  */
1155           if (ACCEPT_MB_NODE (type))
1156             naccepted = sift_states_iter_mb (preg, mctx, entity, str_idx,
1157                                              mctx->match_last);
1158
1159           /* If the node is a back reference.  */
1160           else
1161 #endif /* RE_ENABLE_I18N */
1162           if (type == OP_BACK_REF)
1163             for (j = 0; j < mctx->nbkref_ents; ++j)
1164               {
1165                 naccepted = sift_states_iter_bkref (dfa, mctx->state_log,
1166                                                     mctx->bkref_ents + j,
1167                                                     prev_node, str_idx,
1168                                                     mctx->match_last);
1169                 if (naccepted)
1170                   break;
1171               }
1172
1173           if (!naccepted
1174               && check_node_accept (preg, dfa->nodes + prev_node, mctx,
1175                                     str_idx)
1176               && STATE_NODE_CONTAINS (mctx->state_log[str_idx + 1],
1177                                       dfa->nexts[prev_node]))
1178             naccepted = 1;
1179
1180           if (naccepted == 0)
1181             continue;
1182
1183           /* `prev_node' may point the entity of the OP_CONTEXT_NODE,
1184              then we use plog->elems[i] instead.  */
1185           err = re_node_set_add_intersect (&state_buf, plog,
1186                                            dfa->inveclosures + prev_node);
1187           if (BE (err != REG_NOERROR, 0))
1188             return err;
1189         }
1190       if (mctx->state_log[str_idx] != NULL
1191           && mctx->state_log[str_idx]->has_backref)
1192         {
1193           err = add_epsilon_backreference (dfa, mctx, plog, str_idx, &state_buf);
1194           if (BE (err != REG_NOERROR, 0))
1195             return err;
1196         }
1197
1198       /* Update state_log.  */
1199       mctx->state_log[str_idx] = re_acquire_state (&err, dfa, &state_buf);
1200       if (BE (mctx->state_log[str_idx] == NULL && err != REG_NOERROR, 0))
1201         return err;
1202     }
1203
1204   re_node_set_free (&state_buf);
1205   return REG_NOERROR;
1206 }
1207
1208 /* Helper functions.  */
1209
1210 static inline reg_errcode_t
1211 clean_state_log_if_need (mctx, next_state_log_idx)
1212     re_match_context_t *mctx;
1213     int next_state_log_idx;
1214 {
1215   int top = mctx->state_log_top;
1216
1217   if (next_state_log_idx >= mctx->input->bufs_len
1218       || (next_state_log_idx >= mctx->input->valid_len
1219           && mctx->input->valid_len < mctx->input->len))
1220     {
1221       reg_errcode_t err;
1222       err = extend_buffers (mctx);
1223       if (BE (err != REG_NOERROR, 0))
1224         return err;
1225     }
1226
1227   if (top < next_state_log_idx)
1228     {
1229       memset (mctx->state_log + top + 1, '\0',
1230               sizeof (re_dfastate_t *) * (next_state_log_idx - top));
1231       mctx->state_log_top = next_state_log_idx;
1232     }
1233   return REG_NOERROR;
1234 }
1235
1236 #ifdef RE_ENABLE_I18N
1237 static int
1238 sift_states_iter_mb (preg, mctx, node_idx, str_idx, max_str_idx)
1239     const regex_t *preg;
1240     const re_match_context_t *mctx;
1241     int node_idx, str_idx, max_str_idx;
1242 {
1243   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1244   int naccepted;
1245   /* Check the node can accept `multi byte'.  */
1246   naccepted = check_node_accept_bytes (preg, node_idx, mctx->input, str_idx);
1247   if (naccepted > 0 && str_idx + naccepted <= max_str_idx &&
1248       !STATE_NODE_CONTAINS (mctx->state_log[str_idx + naccepted],
1249                             dfa->nexts[node_idx]))
1250     /* The node can't accept the `multi byte', or the
1251        destination was already throwed away, then the node
1252        could't accept the current input `multi byte'.   */
1253     naccepted = 0;
1254   /* Otherwise, it is sure that the node could accept
1255      `naccepted' bytes input.  */
1256   return naccepted;
1257 }
1258 #endif /* RE_ENABLE_I18N */
1259
1260 static int
1261 sift_states_iter_bkref (dfa, state_log, mctx_entry, node_idx, idx, match_last)
1262     const re_dfa_t *dfa;
1263     re_dfastate_t **state_log;
1264     struct re_backref_cache_entry *mctx_entry;
1265     int node_idx, idx, match_last;
1266 {
1267   int naccepted = 0;
1268   int from_idx, to_idx;
1269   from_idx = mctx_entry->from;
1270   to_idx = mctx_entry->to;
1271   if (mctx_entry->node == node_idx
1272       && from_idx == idx && to_idx <= match_last
1273       && STATE_NODE_CONTAINS (state_log[to_idx], dfa->nexts[node_idx]))
1274     naccepted = to_idx - from_idx;
1275   return naccepted;
1276 }
1277
1278 static reg_errcode_t
1279 add_epsilon_backreference (dfa, mctx, plog, idx, state_buf)
1280     const re_dfa_t *dfa;
1281     const re_match_context_t *mctx;
1282     const re_node_set *plog;
1283     int idx;
1284     re_node_set *state_buf;
1285 {
1286   int i, j;
1287   for (i = 0; i < plog->nelem; ++i)
1288     {
1289       int node_idx = plog->elems[i];
1290       re_token_type_t type = dfa->nodes[node_idx].type;
1291       if (type == OP_CONTEXT_NODE)
1292         type = dfa->nodes[dfa->nodes[node_idx].opr.ctx_info->entity].type;
1293
1294       if (type == OP_BACK_REF &&
1295           !re_node_set_contains (state_buf, node_idx))
1296         {
1297           for (j = 0; j < mctx->nbkref_ents; ++j)
1298             {
1299               struct re_backref_cache_entry *entry;
1300               entry = mctx->bkref_ents + j;
1301               if (entry->from == entry->to && entry->from == idx)
1302                 break;
1303             }
1304           if (j < mctx->nbkref_ents || idx == 0)
1305             {
1306               reg_errcode_t err;
1307               err = re_node_set_add_intersect (state_buf, plog,
1308                                                dfa->inveclosures + node_idx);
1309               if (BE (err != REG_NOERROR, 0))
1310                 return err;
1311               i = 0;
1312             }
1313         }
1314     }
1315   return REG_NOERROR;
1316 }
1317 \f
1318 /* Functions for state transition.  */
1319
1320 /* Return the next state to which the current state STATE will transit by
1321    accepting the current input byte, and update STATE_LOG if necessary.
1322    If STATE can accept a multibyte char/collating element/back reference
1323    update the destination of STATE_LOG.  */
1324
1325 static re_dfastate_t *
1326 transit_state (err, preg, mctx, state, fl_search)
1327      reg_errcode_t *err;
1328      const regex_t *preg;
1329      re_match_context_t *mctx;
1330      re_dfastate_t *state;
1331      int fl_search;
1332 {
1333   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1334   re_dfastate_t **trtable, *next_state;
1335   unsigned char ch;
1336
1337   if (re_string_cur_idx (mctx->input) + 1 >= mctx->input->bufs_len
1338       || (re_string_cur_idx (mctx->input) + 1 >= mctx->input->valid_len
1339           && mctx->input->valid_len < mctx->input->len))
1340     {
1341       *err = extend_buffers (mctx);
1342       if (BE (*err != REG_NOERROR, 0))
1343         return NULL;
1344     }
1345
1346   *err = REG_NOERROR;
1347   if (state == NULL)
1348     {
1349       next_state = state;
1350       re_string_skip_bytes (mctx->input, 1);
1351     }
1352   else
1353     {
1354 #ifdef RE_ENABLE_I18N
1355       /* If the current state can accept multibyte.  */
1356       if (state->accept_mb)
1357         {
1358           *err = transit_state_mb (preg, state, mctx);
1359           if (BE (*err != REG_NOERROR, 0))
1360             return NULL;
1361         }
1362 #endif /* RE_ENABLE_I18N */
1363
1364       /* Then decide the next state with the single byte.  */
1365       if (1)
1366         {
1367           /* Use transition table  */
1368           ch = re_string_fetch_byte (mctx->input);
1369           trtable = fl_search ? state->trtable_search : state->trtable;
1370           if (trtable == NULL)
1371             {
1372               trtable = build_trtable (preg, state, fl_search);
1373               if (fl_search)
1374                 state->trtable_search = trtable;
1375               else
1376                 state->trtable = trtable;
1377             }
1378           next_state = trtable[ch];
1379         }
1380       else
1381         {
1382           /* don't use transition table  */
1383           next_state = transit_state_sb (err, preg, state, fl_search, mctx);
1384           if (BE (next_state == NULL && err != REG_NOERROR, 0))
1385             return NULL;
1386         }
1387     }
1388
1389   /* Update the state_log if we need.  */
1390   if (mctx->state_log != NULL)
1391     {
1392       int cur_idx = re_string_cur_idx (mctx->input);
1393       if (cur_idx > mctx->state_log_top)
1394         {
1395           mctx->state_log[cur_idx] = next_state;
1396           mctx->state_log_top = cur_idx;
1397         }
1398       else if (mctx->state_log[cur_idx] == 0)
1399         {
1400           mctx->state_log[cur_idx] = next_state;
1401         }
1402       else
1403         {
1404           re_dfastate_t *pstate;
1405           unsigned int context;
1406           re_node_set next_nodes, *log_nodes, *table_nodes = NULL;
1407           /* If (state_log[cur_idx] != 0), it implies that cur_idx is
1408              the destination of a multibyte char/collating element/
1409              back reference.  Then the next state is the union set of
1410              these destinations and the results of the transition table.  */
1411           pstate = mctx->state_log[cur_idx];
1412           log_nodes = pstate->entrance_nodes;
1413           if (next_state != NULL)
1414             {
1415               table_nodes = next_state->entrance_nodes;
1416               *err = re_node_set_init_union (&next_nodes, table_nodes,
1417                                              log_nodes);
1418               if (BE (*err != REG_NOERROR, 0))
1419                 return NULL;
1420             }
1421           else
1422             next_nodes = *log_nodes;
1423           /* Note: We already add the nodes of the initial state,
1424                    then we don't need to add them here.  */
1425
1426           context = re_string_context_at (mctx->input,
1427                                           re_string_cur_idx (mctx->input) - 1,
1428                                           mctx->eflags, preg->newline_anchor);
1429           next_state = mctx->state_log[cur_idx]
1430             = re_acquire_state_context (err, dfa, &next_nodes, context);
1431           /* We don't need to check errors here, since the return value of
1432              this function is next_state and ERR is already set.  */
1433
1434           if (table_nodes != NULL)
1435             re_node_set_free (&next_nodes);
1436         }
1437       /* If the next state has back references.  */
1438       if (next_state != NULL && next_state->has_backref)
1439         {
1440           *err = transit_state_bkref (preg, next_state, mctx);
1441           if (BE (*err != REG_NOERROR, 0))
1442             return NULL;
1443           next_state = mctx->state_log[cur_idx];
1444         }
1445     }
1446   return next_state;
1447 }
1448
1449 /* Helper functions for transit_state.  */
1450
1451 /* Return the next state to which the current state STATE will transit by
1452    accepting the current input byte.  */
1453
1454 static re_dfastate_t *
1455 transit_state_sb (err, preg, state, fl_search, mctx)
1456      reg_errcode_t *err;
1457      const regex_t *preg;
1458      re_dfastate_t *state;
1459      int fl_search;
1460      re_match_context_t *mctx;
1461 {
1462   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1463   re_node_set next_nodes;
1464   re_dfastate_t *next_state;
1465   int node_cnt, cur_str_idx = re_string_cur_idx (mctx->input);
1466   unsigned int context;
1467
1468   *err = re_node_set_alloc (&next_nodes, state->nodes.nelem + 1);
1469   if (BE (*err != REG_NOERROR, 0))
1470     return NULL;
1471   for (node_cnt = 0; node_cnt < state->nodes.nelem; ++node_cnt)
1472     {
1473       int cur_node = state->nodes.elems[node_cnt];
1474       if (check_node_accept (preg, dfa->nodes + cur_node, mctx, cur_str_idx))
1475         {
1476           *err = re_node_set_merge (&next_nodes,
1477                                     dfa->eclosures + dfa->nexts[cur_node]);
1478           if (BE (*err != REG_NOERROR, 0))
1479             return NULL;
1480         }
1481     }
1482   if (fl_search)
1483     {
1484 #ifdef RE_ENABLE_I18N
1485       int not_initial = 0;
1486       if (MB_CUR_MAX > 1)
1487         for (node_cnt = 0; node_cnt < next_nodes.nelem; ++node_cnt)
1488           if (dfa->nodes[next_nodes.elems[node_cnt]].type == CHARACTER)
1489             {
1490               not_initial = dfa->nodes[next_nodes.elems[node_cnt]].mb_partial;
1491               break;
1492             }
1493       if (!not_initial)
1494 #endif
1495         {
1496           *err = re_node_set_merge (&next_nodes,
1497                                     dfa->init_state->entrance_nodes);
1498           if (BE (*err != REG_NOERROR, 0))
1499             return NULL;
1500         }
1501     }
1502   context = re_string_context_at (mctx->input, cur_str_idx, mctx->eflags,
1503                                   preg->newline_anchor);
1504   next_state = re_acquire_state_context (err, dfa, &next_nodes, context);
1505   /* We don't need to check errors here, since the return value of
1506      this function is next_state and ERR is already set.  */
1507
1508   re_node_set_free (&next_nodes);
1509   re_string_skip_bytes (mctx->input, 1);
1510   return next_state;
1511 }
1512
1513 #ifdef RE_ENABLE_I18N
1514 static reg_errcode_t
1515 transit_state_mb (preg, pstate, mctx)
1516     const regex_t *preg;
1517     re_dfastate_t *pstate;
1518     re_match_context_t *mctx;
1519 {
1520   reg_errcode_t err;
1521   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1522   int i;
1523
1524   for (i = 0; i < pstate->nodes.nelem; ++i)
1525     {
1526       re_node_set dest_nodes, *new_nodes;
1527       int cur_node_idx = pstate->nodes.elems[i];
1528       int naccepted = 0, dest_idx;
1529       unsigned int context;
1530       re_dfastate_t *dest_state;
1531
1532       if (dfa->nodes[cur_node_idx].type == OP_CONTEXT_NODE)
1533         {
1534           context = re_string_context_at (mctx->input,
1535                                           re_string_cur_idx (mctx->input),
1536                                           mctx->eflags, preg->newline_anchor);
1537           if (NOT_SATISFY_NEXT_CONSTRAINT (dfa->nodes[cur_node_idx].constraint,
1538                                         context))
1539             continue;
1540           cur_node_idx = dfa->nodes[cur_node_idx].opr.ctx_info->entity;
1541         }
1542
1543       /* How many bytes the node can accepts?  */
1544       if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
1545         naccepted = check_node_accept_bytes (preg, cur_node_idx, mctx->input,
1546                                              re_string_cur_idx (mctx->input));
1547       if (naccepted == 0)
1548         continue;
1549
1550       /* The node can accepts `naccepted' bytes.  */
1551       dest_idx = re_string_cur_idx (mctx->input) + naccepted;
1552       err = clean_state_log_if_need (mctx, dest_idx);
1553       if (BE (err != REG_NOERROR, 0))
1554         return err;
1555 #ifdef DEBUG
1556       assert (dfa->nexts[cur_node_idx] != -1);
1557 #endif
1558       /* `cur_node_idx' may point the entity of the OP_CONTEXT_NODE,
1559          then we use pstate->nodes.elems[i] instead.  */
1560       new_nodes = dfa->eclosures + dfa->nexts[pstate->nodes.elems[i]];
1561
1562       dest_state = mctx->state_log[dest_idx];
1563       if (dest_state == NULL)
1564         dest_nodes = *new_nodes;
1565       else
1566         {
1567           err = re_node_set_init_union (&dest_nodes,
1568                                         dest_state->entrance_nodes, new_nodes);
1569           if (BE (err != REG_NOERROR, 0))
1570             return err;
1571         }
1572       context = re_string_context_at (mctx->input, dest_idx - 1, mctx->eflags,
1573                                       preg->newline_anchor);
1574       mctx->state_log[dest_idx]
1575         = re_acquire_state_context (&err, dfa, &dest_nodes, context);
1576       if (BE (mctx->state_log[dest_idx] == NULL && err != REG_NOERROR, 0))
1577         return err;
1578       if (dest_state != NULL)
1579         re_node_set_free (&dest_nodes);
1580     }
1581   return REG_NOERROR;
1582 }
1583 #endif /* RE_ENABLE_I18N */
1584
1585 static reg_errcode_t
1586 transit_state_bkref (preg, pstate, mctx)
1587     const regex_t *preg;
1588     re_dfastate_t *pstate;
1589     re_match_context_t *mctx;
1590 {
1591   reg_errcode_t err;
1592   re_dfastate_t **work_state_log;
1593
1594   work_state_log = re_malloc (re_dfastate_t *,
1595                               re_string_cur_idx (mctx->input) + 1);
1596   if (BE (work_state_log == NULL, 0))
1597     return REG_ESPACE;
1598
1599   err = transit_state_bkref_loop (preg, &pstate->nodes, work_state_log, mctx);
1600   re_free (work_state_log);
1601   return err;
1602 }
1603
1604 /* Caller must allocate `work_state_log'.  */
1605
1606 static reg_errcode_t
1607 transit_state_bkref_loop (preg, nodes, work_state_log, mctx)
1608     const regex_t *preg;
1609     re_node_set *nodes;
1610     re_dfastate_t **work_state_log;
1611     re_match_context_t *mctx;
1612 {
1613   reg_errcode_t err;
1614   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1615   int i, j;
1616   re_dfastate_t **state_log_bak;
1617   regmatch_t *cur_regs = re_malloc (regmatch_t, preg->re_nsub + 1);
1618   int cur_str_idx = re_string_cur_idx (mctx->input);
1619   if (BE (cur_regs == NULL, 0))
1620     return REG_ESPACE;
1621
1622   for (i = 0; i < nodes->nelem; ++i)
1623     {
1624       char *buf;
1625       int dest_str_idx, subexp_idx, prev_nelem, subexp_len;
1626       int node_idx = nodes->elems[i];
1627       unsigned int context;
1628       re_token_t *node = dfa->nodes + node_idx;
1629       re_dfastate_t *dest_state;
1630       re_node_set *new_dest_nodes;
1631
1632       /* Check whether `node' is a backreference or not.  */
1633       if (node->type == OP_BACK_REF)
1634         subexp_idx = node->opr.idx;
1635       else if (node->type == OP_CONTEXT_NODE &&
1636                dfa->nodes[node->opr.ctx_info->entity].type == OP_BACK_REF)
1637         {
1638           context = re_string_context_at (mctx->input, cur_str_idx,
1639                                           mctx->eflags, preg->newline_anchor);
1640           if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
1641             continue;
1642           subexp_idx = dfa->nodes[node->opr.ctx_info->entity].opr.idx;
1643         }
1644       else
1645         continue;
1646
1647       /* `node' is a backreference.
1648          At first, set registers to check the backreference. */
1649       cur_regs[0].rm_so = 0;
1650       cur_regs[0].rm_eo = cur_str_idx;
1651       memcpy (work_state_log, mctx->state_log,
1652               sizeof (re_dfastate_t *) * (cur_str_idx  + 1));
1653       mctx->match_last = cur_str_idx;
1654       state_log_bak = mctx->state_log;
1655       mctx->state_log = work_state_log;
1656       sift_states_backward (preg, mctx, node_idx);
1657       if (!STATE_NODE_CONTAINS (work_state_log[0], dfa->init_node))
1658         continue;
1659       for (j = 1; j <= preg->re_nsub; ++j)
1660         cur_regs[j].rm_so = cur_regs[j].rm_eo = -1;
1661       set_regs (preg, mctx, subexp_idx + 1, cur_regs, node_idx);
1662       mctx->state_log = state_log_bak;
1663
1664       /* Then check that the backreference can match the input string.  */
1665       subexp_len = cur_regs[subexp_idx].rm_eo - cur_regs[subexp_idx].rm_so;
1666       if (subexp_len < 0 || cur_str_idx + subexp_len > mctx->input->len)
1667         continue;
1668
1669       if (cur_str_idx + subexp_len > mctx->input->valid_len
1670           && mctx->input->valid_len < mctx->input->len)
1671         {
1672           reg_errcode_t err;
1673           err = extend_buffers (mctx);
1674           if (BE (err != REG_NOERROR, 0))
1675             return err;
1676         }
1677       buf = re_string_get_buffer (mctx->input);
1678       if (strncmp (buf + cur_regs[subexp_idx].rm_so, buf + cur_str_idx,
1679                    subexp_len) != 0)
1680         continue;
1681
1682       /* Successfully matched, add a new cache entry.  */
1683       dest_str_idx = cur_str_idx + subexp_len;
1684       err = match_ctx_add_entry (mctx, node_idx, cur_str_idx, dest_str_idx);
1685       if (BE (err != REG_NOERROR, 0))
1686         return err;
1687       err = clean_state_log_if_need (mctx, dest_str_idx);
1688       if (BE (err != REG_NOERROR, 0))
1689         return err;
1690
1691       /* And add the epsilon closures (which is `new_dest_nodes') of
1692          the backreference to appropriate state_log.  */
1693 #ifdef DEBUG
1694       assert (dfa->nexts[node_idx] != -1);
1695 #endif
1696       if (node->type == OP_CONTEXT_NODE && subexp_len == 0)
1697         new_dest_nodes = dfa->nodes[node_idx].opr.ctx_info->bkref_eclosure;
1698       else
1699         new_dest_nodes = dfa->eclosures + dfa->nexts[node_idx];
1700       context = (IS_WORD_CHAR (re_string_byte_at (mctx->input,
1701                                                   dest_str_idx - 1))
1702                  ? CONTEXT_WORD : 0);
1703       dest_state = mctx->state_log[dest_str_idx];
1704
1705       prev_nelem = ((mctx->state_log[cur_str_idx] == NULL) ? 0
1706                     : mctx->state_log[cur_str_idx]->nodes.nelem);
1707       /* Add `new_dest_node' to state_log.  */
1708       if (dest_state == NULL)
1709         {
1710           mctx->state_log[dest_str_idx]
1711             = re_acquire_state_context (&err, dfa, new_dest_nodes, context);
1712           if (BE (mctx->state_log[dest_str_idx] == NULL
1713                   && err != REG_NOERROR, 0))
1714             return err;
1715         }
1716       else
1717         {
1718           re_node_set dest_nodes;
1719           err = re_node_set_init_union (&dest_nodes, dest_state->entrance_nodes,
1720                                         new_dest_nodes);
1721           if (BE (err != REG_NOERROR, 0))
1722             return err;
1723           mctx->state_log[dest_str_idx]
1724             = re_acquire_state_context (&err, dfa, &dest_nodes, context);
1725           if (BE (mctx->state_log[dest_str_idx] == NULL
1726                   && err != REG_NOERROR, 0))
1727             return err;
1728           re_node_set_free (&dest_nodes);
1729         }
1730
1731       /* We need to check recursively if the backreference can epsilon
1732          transit.  */
1733       if (subexp_len == 0
1734           && mctx->state_log[cur_str_idx]->nodes.nelem > prev_nelem)
1735         {
1736           err = transit_state_bkref_loop (preg, new_dest_nodes, work_state_log,
1737                                           mctx);
1738           if (BE (err != REG_NOERROR, 0))
1739             return err;
1740         }
1741     }
1742   re_free (cur_regs);
1743   return REG_NOERROR;
1744 }
1745
1746 /* Build transition table for the state.
1747    Return the new table if succeeded, otherwise return NULL.  */
1748
1749 static re_dfastate_t **
1750 build_trtable (preg, state, fl_search)
1751     const regex_t *preg;
1752     const re_dfastate_t *state;
1753     int fl_search;
1754 {
1755   reg_errcode_t err;
1756   re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1757   int i, j, k, ch;
1758   int ndests; /* Number of the destination states from `state'.  */
1759   re_dfastate_t **trtable, **dest_states, **dest_states_word, **dest_states_nl;
1760   re_node_set follows, *dests_node;
1761   bitset *dests_ch;
1762   bitset acceptable;
1763
1764   /* We build DFA states which corresponds to the destination nodes
1765      from `state'.  `dests_node[i]' represents the nodes which i-th
1766      destination state contains, and `dests_ch[i]' represents the
1767      characters which i-th destination state accepts.  */
1768   dests_node = re_malloc (re_node_set, SBC_MAX);
1769   dests_ch = re_malloc (bitset, SBC_MAX);
1770
1771   /* Initialize transiton table.  */
1772   trtable = (re_dfastate_t **) calloc (sizeof (re_dfastate_t *), SBC_MAX);
1773   if (BE (dests_node == NULL || dests_ch == NULL || trtable == NULL, 0))
1774     return NULL;
1775
1776   /* At first, group all nodes belonging to `state' into several
1777      destinations.  */
1778   ndests = group_nodes_into_DFAstates (preg, state, dests_node, dests_ch);
1779   if (BE (ndests <= 0, 0))
1780     {
1781       re_free (dests_node);
1782       re_free (dests_ch);
1783       /* Return NULL in case of an error, trtable otherwise.  */
1784       return (ndests < 0) ? NULL : trtable;
1785     }
1786
1787   dest_states = re_malloc (re_dfastate_t *, ndests);
1788   dest_states_word = re_malloc (re_dfastate_t *, ndests);
1789   dest_states_nl = re_malloc (re_dfastate_t *, ndests);
1790   bitset_empty (acceptable);
1791
1792   err = re_node_set_alloc (&follows, ndests + 1);
1793   if (BE (dest_states == NULL || dest_states_word == NULL
1794           || dest_states_nl == NULL || err != REG_NOERROR, 0))
1795     return NULL;
1796
1797   /* Then build the states for all destinations.  */
1798   for (i = 0; i < ndests; ++i)
1799     {
1800       int next_node;
1801       re_node_set_empty (&follows);
1802       /* Merge the follows of this destination states.  */
1803       for (j = 0; j < dests_node[i].nelem; ++j)
1804         {
1805           next_node = dfa->nexts[dests_node[i].elems[j]];
1806           if (next_node != -1)
1807             {
1808               err = re_node_set_merge (&follows, dfa->eclosures + next_node);
1809               if (BE (err != REG_NOERROR, 0))
1810                 return NULL;
1811             }
1812         }
1813       /* If search flag is set, merge the initial state.  */
1814       if (fl_search)
1815         {
1816 #ifdef RE_ENABLE_I18N
1817           int not_initial = 0;
1818           for (j = 0; j < follows.nelem; ++j)
1819             if (dfa->nodes[follows.elems[j]].type == CHARACTER)
1820               {
1821                 not_initial = dfa->nodes[follows.elems[j]].mb_partial;
1822                 break;
1823               }
1824           if (!not_initial)
1825 #endif
1826             {
1827               err = re_node_set_merge (&follows,
1828                                        dfa->init_state->entrance_nodes);
1829               if (BE (err != REG_NOERROR, 0))
1830                 return NULL;
1831             }
1832         }
1833       dest_states[i] = re_acquire_state_context (&err, dfa, &follows, 0);
1834       if (BE (dest_states[i] == NULL && err != REG_NOERROR, 0))
1835         return NULL;
1836       /* If the new state has context constraint,
1837          build appropriate states for these contexts.  */
1838       if (dest_states[i]->has_constraint)
1839         {
1840           dest_states_word[i] = re_acquire_state_context (&err, dfa, &follows,
1841                                                           CONTEXT_WORD);
1842           if (BE (dest_states_word[i] == NULL && err != REG_NOERROR, 0))
1843             return NULL;
1844           dest_states_nl[i] = re_acquire_state_context (&err, dfa, &follows,
1845                                                         CONTEXT_NEWLINE);
1846           if (BE (dest_states_nl[i] == NULL && err != REG_NOERROR, 0))
1847             return NULL;
1848         }
1849       else
1850         {
1851           dest_states_word[i] = dest_states[i];
1852           dest_states_nl[i] = dest_states[i];
1853         }
1854       bitset_merge (acceptable, dests_ch[i]);
1855     }
1856
1857   /* Update the transition table.  */
1858   for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
1859     for (j = 0; j < UINT_BITS; ++j, ++ch)
1860       if ((acceptable[i] >> j) & 1)
1861         {
1862           if (IS_WORD_CHAR (ch))
1863             {
1864               for (k = 0; k < ndests; ++k)
1865                 if ((dests_ch[k][i] >> j) & 1)
1866                   trtable[ch] = dest_states_word[k];
1867             }
1868           else /* not WORD_CHAR */
1869             {
1870               for (k = 0; k < ndests; ++k)
1871                 if ((dests_ch[k][i] >> j) & 1)
1872                   trtable[ch] = dest_states[k];
1873             }
1874         }
1875   /* new line */
1876   for (k = 0; k < ndests; ++k)
1877     if (bitset_contain (acceptable, NEWLINE_CHAR))
1878       trtable[NEWLINE_CHAR] = dest_states_nl[k];
1879
1880   re_free (dest_states_nl);
1881   re_free (dest_states_word);
1882   re_free (dest_states);
1883
1884   re_node_set_free (&follows);
1885   for (i = 0; i < ndests; ++i)
1886     re_node_set_free (dests_node + i);
1887
1888   re_free (dests_ch);
1889   re_free (dests_node);
1890
1891   return trtable;
1892 }
1893
1894 /* Group all nodes belonging to STATE into several destinations.
1895    Then for all destinations, set the nodes belonging to the destination
1896    to DESTS_NODE[i] and set the characters accepted by the destination
1897    to DEST_CH[i].  This function return the number of destinations.  */
1898
1899 static int
1900 group_nodes_into_DFAstates (preg, state, dests_node, dests_ch)
1901     const regex_t *preg;
1902     const re_dfastate_t *state;
1903     re_node_set *dests_node;
1904     bitset *dests_ch;
1905 {
1906   reg_errcode_t err;
1907   const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1908   int i, j, k;
1909   int ndests; /* Number of the destinations from `state'.  */
1910   bitset accepts; /* Characters a node can accept.  */
1911   const re_node_set *cur_nodes = &state->nodes;
1912   bitset_empty (accepts);
1913   ndests = 0;
1914
1915   /* For all the nodes belonging to `state',  */
1916   for (i = 0; i < cur_nodes->nelem; ++i)
1917     {
1918       unsigned int constraint = 0;
1919       re_token_t *node = &dfa->nodes[cur_nodes->elems[i]];
1920       re_token_type_t type = node->type;
1921
1922       if (type == OP_CONTEXT_NODE)
1923         {
1924           constraint = node->constraint;
1925           node = dfa->nodes + node->opr.ctx_info->entity;
1926           type = node->type;
1927         }
1928
1929       /* Enumerate all single byte character this node can accept.  */
1930       if (type == CHARACTER)
1931         bitset_set (accepts, node->opr.c);
1932       else if (type == SIMPLE_BRACKET)
1933         {
1934           bitset_merge (accepts, node->opr.sbcset);
1935         }
1936       else if (type == OP_PERIOD)
1937         {
1938           bitset_set_all (accepts);
1939           if (!(preg->syntax & RE_DOT_NEWLINE))
1940             bitset_clear (accepts, '\n');
1941           if (preg->syntax & RE_DOT_NOT_NULL)
1942             bitset_clear (accepts, '\0');
1943         }
1944       else
1945         continue;
1946
1947       /* Check the `accepts' and sift the characters which are not
1948          match it the context.  */
1949       if (constraint)
1950         {
1951           if (constraint & NEXT_WORD_CONSTRAINT)
1952             for (j = 0; j < BITSET_UINTS; ++j)
1953               accepts[j] &= dfa->word_char[j];
1954           else if (constraint & NEXT_NOTWORD_CONSTRAINT)
1955             for (j = 0; j < BITSET_UINTS; ++j)
1956               accepts[j] &= ~dfa->word_char[j];
1957           else if (constraint & NEXT_NEWLINE_CONSTRAINT)
1958             {
1959               int accepts_newline = bitset_contain (accepts, NEWLINE_CHAR);
1960               bitset_empty (accepts);
1961               if (accepts_newline)
1962                 bitset_set (accepts, NEWLINE_CHAR);
1963               else
1964                 continue;
1965             }
1966         }
1967
1968       /* Then divide `accepts' into DFA states, or create a new
1969          state.  */
1970       for (j = 0; j < ndests; ++j)
1971         {
1972           bitset intersec; /* Intersection sets, see below.  */
1973           bitset remains;
1974           /* Flags, see below.  */
1975           int has_intersec, not_subset, not_consumed;
1976
1977           /* Optimization, skip if this state doesn't accept the character.  */
1978           if (type == CHARACTER && !bitset_contain (dests_ch[j], node->opr.c))
1979             continue;
1980
1981           /* Enumerate the intersection set of this state and `accepts'.  */
1982           has_intersec = 0;
1983           for (k = 0; k < BITSET_UINTS; ++k)
1984             has_intersec |= intersec[k] = accepts[k] & dests_ch[j][k];
1985           /* And skip if the intersection set is empty.  */
1986           if (!has_intersec)
1987             continue;
1988
1989           /* Then check if this state is a subset of `accepts'.  */
1990           not_subset = not_consumed = 0;
1991           for (k = 0; k < BITSET_UINTS; ++k)
1992             {
1993               not_subset |= remains[k] = ~accepts[k] & dests_ch[j][k];
1994               not_consumed |= accepts[k] = accepts[k] & ~dests_ch[j][k];
1995             }
1996
1997           /* If this state isn't a subset of `accepts', create a
1998              new group state, which has the `remains'. */
1999           if (not_subset)
2000             {
2001               bitset_copy (dests_ch[ndests], remains);
2002               bitset_copy (dests_ch[j], intersec);
2003               err = re_node_set_init_copy (dests_node + ndests, &dests_node[j]);
2004               if (BE (err != REG_NOERROR, 0))
2005                 return -1;
2006               ++ndests;
2007             }
2008
2009           /* Put the position in the current group. */
2010           err = re_node_set_insert (&dests_node[j], cur_nodes->elems[i]);
2011           if (BE (err < 0, 0))
2012             return -1;
2013
2014           /* If all characters are consumed, go to next node. */
2015           if (!not_consumed)
2016             break;
2017         }
2018       /* Some characters remain, create a new group. */
2019       if (j == ndests)
2020         {
2021           bitset_copy (dests_ch[ndests], accepts);
2022           err = re_node_set_init_1 (dests_node + ndests, cur_nodes->elems[i]);
2023           if (BE (err != REG_NOERROR, 0))
2024             return -1;
2025           ++ndests;
2026           bitset_empty (accepts);
2027         }
2028     }
2029   return ndests;
2030 }
2031
2032 #ifdef RE_ENABLE_I18N
2033 /* Check how many bytes the node `dfa->nodes[node_idx]' accepts.
2034    Return the number of the bytes the node accepts.
2035    STR_IDX is the current index of the input string.
2036
2037    This function handles the nodes which can accept one character, or
2038    one collating element like '.', '[a-z]', opposite to the other nodes
2039    can only accept one byte.  */
2040
2041 static int
2042 check_node_accept_bytes (preg, node_idx, input, str_idx)
2043     const regex_t *preg;
2044     int node_idx, str_idx;
2045     const re_string_t *input;
2046 {
2047   const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2048   const re_token_t *node = dfa->nodes + node_idx;
2049   int elem_len = re_string_elem_size_at (input, str_idx);
2050   int char_len = re_string_char_size_at (input, str_idx);
2051   int i;
2052 # ifdef _LIBC
2053   int j;
2054   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2055 # endif /* _LIBC */
2056   if (elem_len <= 1 && char_len <= 1)
2057     return 0;
2058   if (node->type == OP_PERIOD)
2059     {
2060       /* '.' accepts any one character except the following two cases.  */
2061       if ((!(preg->syntax & RE_DOT_NEWLINE) &&
2062            re_string_byte_at (input, str_idx) == '\n') ||
2063           ((preg->syntax & RE_DOT_NOT_NULL) &&
2064            re_string_byte_at (input, str_idx) == '\0'))
2065         return 0;
2066       return char_len;
2067     }
2068   else if (node->type == COMPLEX_BRACKET)
2069     {
2070       const re_charset_t *cset = node->opr.mbcset;
2071 # ifdef _LIBC
2072       const char *pin = re_string_get_buffer (input) + str_idx;
2073 # endif /* _LIBC */
2074       int match_len = 0;
2075       wchar_t wc = ((cset->nranges || cset->nchar_classes || cset->nmbchars)
2076                     ? re_string_wchar_at (input, str_idx) : 0);
2077
2078       /* match with multibyte character?  */
2079       for (i = 0; i < cset->nmbchars; ++i)
2080         if (wc == cset->mbchars[i])
2081           {
2082             match_len = char_len;
2083             goto check_node_accept_bytes_match;
2084           }
2085       /* match with character_class?  */
2086       for (i = 0; i < cset->nchar_classes; ++i)
2087         {
2088           wctype_t wt = cset->char_classes[i];
2089           if (__iswctype (wc, wt))
2090             {
2091               match_len = char_len;
2092               goto check_node_accept_bytes_match;
2093             }
2094         }
2095
2096 # ifdef _LIBC
2097       if (nrules != 0)
2098         {
2099           unsigned int in_collseq = 0;
2100           const int32_t *table, *indirect;
2101           const char *weights, *extra, *collseqwc;
2102           int32_t idx;
2103           /* This #include defines a local function!  */
2104 #  include <locale/weight.h>
2105
2106           /* match with collating_symbol?  */
2107           if (cset->ncoll_syms)
2108             extra = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
2109           for (i = 0; i < cset->ncoll_syms; ++i)
2110             {
2111               const char *coll_sym = extra + cset->coll_syms[i];
2112               /* Compare the length of input collating element and
2113                  the length of current collating element.  */
2114               if (*coll_sym != elem_len)
2115                 continue;
2116               /* Compare each bytes.  */
2117               for (j = 0; j < *coll_sym; j++)
2118                 if (pin[j] != coll_sym[1 + j])
2119                   break;
2120               if (j == *coll_sym)
2121                 {
2122                   /* Match if every bytes is equal.  */
2123                   match_len = j;
2124                   goto check_node_accept_bytes_match;
2125                 }
2126             }
2127
2128           if (cset->nranges)
2129             {
2130               if (elem_len <= char_len)
2131                 {
2132                   collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2133                   in_collseq = collseq_table_lookup (collseqwc, wc);
2134                 }
2135               else
2136                 in_collseq = find_collation_sequence_value (pin, elem_len);
2137             }
2138           /* match with range expression?  */
2139           for (i = 0; i < cset->nranges; ++i)
2140             if (cset->range_starts[i] <= in_collseq
2141                 && in_collseq <= cset->range_ends[i])
2142               {
2143                 match_len = elem_len;
2144                 goto check_node_accept_bytes_match;
2145               }
2146
2147           /* match with equivalence_class?  */
2148           if (cset->nequiv_classes)
2149             {
2150               const unsigned char *cp = (const unsigned char *) pin;
2151               table = (const int32_t *)
2152                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
2153               weights = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_WEIGHTMB);
2154               extra = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_EXTRAMB);
2155               indirect = (const int32_t *)
2156                 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_INDIRECTMB);
2157               idx = findidx (&cp);
2158               if (idx > 0)
2159                 for (i = 0; i < cset->nequiv_classes; ++i)
2160                   {
2161                     int32_t equiv_class_idx = cset->equiv_classes[i];
2162                     size_t weight_len = weights[idx];
2163                     if (weight_len == weights[equiv_class_idx])
2164                       {
2165                         int cnt = 0;
2166                         while (cnt <= weight_len
2167                                && (weights[equiv_class_idx + 1 + cnt]
2168                                    == weights[idx + 1 + cnt]))
2169                           ++cnt;
2170                         if (cnt > weight_len)
2171                           {
2172                             match_len = elem_len;
2173                             goto check_node_accept_bytes_match;
2174                           }
2175                       }
2176                   }
2177             }
2178         }
2179       else
2180 # endif /* _LIBC */
2181         {
2182           /* match with range expression?  */
2183 #if __GNUC__ >= 2
2184           wchar_t cmp_buf[] = {L'\0', L'\0', wc, L'\0', L'\0', L'\0'};
2185 #else
2186           wchar_t cmp_buf[] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2187           cmp_buf[2] = wc;
2188 #endif
2189           for (i = 0; i < cset->nranges; ++i)
2190             {
2191               cmp_buf[0] = cset->range_starts[i];
2192               cmp_buf[4] = cset->range_ends[i];
2193               if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2194                   && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2195                 {
2196                   match_len = char_len;
2197                   goto check_node_accept_bytes_match;
2198                 }
2199             }
2200         }
2201     check_node_accept_bytes_match:
2202       if (!cset->non_match)
2203         return match_len;
2204       else
2205         {
2206           if (match_len > 0)
2207             return 0;
2208           else
2209             return (elem_len > char_len) ? elem_len : char_len;
2210         }
2211     }
2212   return 0;
2213 }
2214
2215 # ifdef _LIBC
2216 static unsigned int
2217 find_collation_sequence_value (mbs, mbs_len)
2218     const char *mbs;
2219     size_t mbs_len;
2220 {
2221   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2222   if (nrules == 0)
2223     {
2224       if (mbs_len == 1)
2225         {
2226           /* No valid character.  Match it as a single byte character.  */
2227           const unsigned char *collseq = (const unsigned char *)
2228             _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2229           return collseq[*(unsigned char *) mbs];
2230         }
2231       return UINT_MAX;
2232     }
2233   else
2234     {
2235       int32_t idx;
2236       const unsigned char *extra = (const unsigned char *)
2237         _NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
2238
2239       for (idx = 0; ;)
2240         {
2241           int mbs_cnt, found = 0;
2242           int32_t elem_mbs_len;
2243           /* Skip the name of collating element name.  */
2244           idx = idx + extra[idx] + 1;
2245           elem_mbs_len = extra[idx++];
2246           if (mbs_len == elem_mbs_len)
2247             {
2248               for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
2249                 if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
2250                   break;
2251               if (mbs_cnt == elem_mbs_len)
2252                 /* Found the entry.  */
2253                 found = 1;
2254             }
2255           /* Skip the byte sequence of the collating element.  */
2256           idx += elem_mbs_len;
2257           /* Adjust for the alignment.  */
2258           idx = (idx + 3) & ~3;
2259           /* Skip the collation sequence value.  */
2260           idx += sizeof (uint32_t);
2261           /* Skip the wide char sequence of the collating element.  */
2262           idx = idx + sizeof (uint32_t) * (extra[idx] + 1);
2263           /* If we found the entry, return the sequence value.  */
2264           if (found)
2265             return *(uint32_t *) (extra + idx);
2266           /* Skip the collation sequence value.  */
2267           idx += sizeof (uint32_t);
2268         }
2269     }
2270 }
2271 # endif /* _LIBC */
2272 #endif /* RE_ENABLE_I18N */
2273
2274 /* Check whether the node accepts the byte which is IDX-th
2275    byte of the INPUT.  */
2276
2277 static int
2278 check_node_accept (preg, node, mctx, idx)
2279     const regex_t *preg;
2280     const re_token_t *node;
2281     const re_match_context_t *mctx;
2282     int idx;
2283 {
2284   const re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2285   const re_token_t *cur_node;
2286   unsigned char ch;
2287   if (node->type == OP_CONTEXT_NODE)
2288     {
2289       /* The node has constraints.  Check whether the current context
2290          satisfies the constraints.  */
2291       unsigned int context = re_string_context_at (mctx->input, idx,
2292                                                    mctx->eflags,
2293                                                    preg->newline_anchor);
2294       if (NOT_SATISFY_NEXT_CONSTRAINT (node->constraint, context))
2295         return 0;
2296       cur_node = dfa->nodes + node->opr.ctx_info->entity;
2297     }
2298   else
2299     cur_node = node;
2300
2301   ch = re_string_byte_at (mctx->input, idx);
2302   if (cur_node->type == CHARACTER)
2303     return cur_node->opr.c == ch;
2304   else if (cur_node->type == SIMPLE_BRACKET)
2305     return bitset_contain (cur_node->opr.sbcset, ch);
2306   else if (cur_node->type == OP_PERIOD)
2307     return !((ch == '\n' && !(preg->syntax & RE_DOT_NEWLINE))
2308              || (ch == '\0' && (preg->syntax & RE_DOT_NOT_NULL)));
2309   else
2310     return 0;
2311 }
2312
2313 /* Extend the buffers, if the buffers have run out.  */
2314
2315 static reg_errcode_t
2316 extend_buffers (mctx)
2317      re_match_context_t *mctx;
2318 {
2319   reg_errcode_t ret;
2320   re_string_t *pstr = mctx->input;
2321
2322   /* Double the lengthes of the buffers.  */
2323   ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
2324   if (BE (ret != REG_NOERROR, 0))
2325     return ret;
2326
2327   if (mctx->state_log != NULL)
2328     {
2329       /* And double the length of state_log.  */
2330       mctx->state_log = re_realloc (mctx->state_log, re_dfastate_t *,
2331                                     pstr->bufs_len * 2);
2332       if (BE (mctx->state_log == NULL, 0))
2333         return REG_ESPACE;
2334     }
2335
2336   /* Then reconstruct the buffers.  */
2337   if (pstr->icase)
2338     {
2339 #ifdef RE_ENABLE_I18N
2340       if (MB_CUR_MAX > 1)
2341         build_wcs_upper_buffer (pstr);
2342       else
2343 #endif /* RE_ENABLE_I18N  */
2344         build_upper_buffer (pstr);
2345     }
2346   else
2347     {
2348 #ifdef RE_ENABLE_I18N
2349       if (MB_CUR_MAX > 1)
2350         build_wcs_buffer (pstr);
2351       else
2352 #endif /* RE_ENABLE_I18N  */
2353         {
2354           if (pstr->trans != NULL)
2355             re_string_translate_buffer (pstr);
2356           else
2357             pstr->valid_len = pstr->bufs_len;
2358         }
2359     }
2360   return REG_NOERROR;
2361 }
2362
2363 \f
2364 /* Functions for matching context.  */
2365
2366 static reg_errcode_t
2367 match_ctx_init (mctx, eflags, input, n)
2368     re_match_context_t *mctx;
2369     int eflags, n;
2370     re_string_t *input;
2371 {
2372   mctx->eflags = eflags;
2373   mctx->input = input;
2374   mctx->match_last = -1;
2375   if (n > 0)
2376     {
2377       mctx->bkref_ents = re_malloc (struct re_backref_cache_entry, n);
2378       if (BE (mctx->bkref_ents == NULL, 0))
2379         return REG_ESPACE;
2380     }
2381   else
2382     mctx->bkref_ents = NULL;
2383   mctx->nbkref_ents = 0;
2384   mctx->abkref_ents = n;
2385   mctx->max_bkref_len = 0;
2386   return REG_NOERROR;
2387 }
2388
2389 static void
2390 match_ctx_free (mctx)
2391     re_match_context_t *mctx;
2392 {
2393   re_free (mctx->bkref_ents);
2394 }
2395
2396 /* Add a new backreference entry to the cache.  */
2397
2398 static reg_errcode_t
2399 match_ctx_add_entry (mctx, node, from, to)
2400     re_match_context_t *mctx;
2401     int node, from, to;
2402 {
2403   if (mctx->nbkref_ents >= mctx->abkref_ents)
2404     {
2405       mctx->bkref_ents = re_realloc (mctx->bkref_ents,
2406                                      struct re_backref_cache_entry,
2407                                      mctx->abkref_ents * 2);
2408       if (BE (mctx->bkref_ents == NULL, 0))
2409         return REG_ESPACE;
2410       memset (mctx->bkref_ents + mctx->nbkref_ents, '\0',
2411              sizeof (struct re_backref_cache_entry) * mctx->abkref_ents);
2412       mctx->abkref_ents *= 2;
2413     }
2414   mctx->bkref_ents[mctx->nbkref_ents].node = node;
2415   mctx->bkref_ents[mctx->nbkref_ents].from = from;
2416   mctx->bkref_ents[mctx->nbkref_ents++].to = to;
2417   if (mctx->max_bkref_len < to - from)
2418     mctx->max_bkref_len = to - from;
2419   return REG_NOERROR;
2420 }