libc/misc/regex/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 static void re_string_construct_common (const char *str, int len,
  21                                         re_string_t *pstr,
  22                                         __RE_TRANSLATE_TYPE trans, int icase,
  23                                         const re_dfa_t *dfa) internal_function;
  24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  25                                           const re_node_set *nodes,
  26                                           unsigned int hash) internal_function;
  27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  28                                           const re_node_set *nodes,
  29                                           unsigned int context,
  30                                           unsigned int hash) internal_function;
  31
  32 /* Functions for string operation.  */
  33
  34 /* This function allocate the buffers.  It is necessary to call
  35    re_string_reconstruct before using the object.  */
  36
  37 static reg_errcode_t
  38 internal_function
  39 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
  40                     __RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  41 {
  42   reg_errcode_t ret;
  43   int init_buf_len;
  44
  45   /* Ensure at least one character fits into the buffers.  */
  46   if (init_len < dfa->mb_cur_max)
  47     init_len = dfa->mb_cur_max;
  48   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  49   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  50
  51   ret = re_string_realloc_buffers (pstr, init_buf_len);
  52   if (BE (ret != REG_NOERROR, 0))
  53     return ret;
  54
  55   pstr->word_char = dfa->word_char;
  56   pstr->word_ops_used = dfa->word_ops_used;
  57   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  58   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  59   pstr->valid_raw_len = pstr->valid_len;
  60   return REG_NOERROR;
  61 }
  62
  63 /* This function allocate the buffers, and initialize them.  */
  64
  65 static reg_errcode_t
  66 internal_function
  67 re_string_construct (re_string_t *pstr, const char *str, int len,
  68                      __RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  69 {
  70   reg_errcode_t ret;
  71   memset (pstr, '\0', sizeof (re_string_t));
  72   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  73
  74   if (len > 0)
  75     {
  76       ret = re_string_realloc_buffers (pstr, len + 1);
  77       if (BE (ret != REG_NOERROR, 0))
  78         return ret;
  79     }
  80   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  81
  82   if (icase)
  83     {
  84 #ifdef RE_ENABLE_I18N
  85       if (dfa->mb_cur_max > 1)
  86         {
  87           while (1)
  88             {
  89               ret = build_wcs_upper_buffer (pstr);
  90               if (BE (ret != REG_NOERROR, 0))
  91                 return ret;
  92               if (pstr->valid_raw_len >= len)
  93                 break;
  94               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
  95                 break;
  96               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
  97               if (BE (ret != REG_NOERROR, 0))
  98                 return ret;
  99             }
 100         }
 101       else
 102 #endif /* RE_ENABLE_I18N  */
 103         build_upper_buffer (pstr);
 104     }
 105   else
 106     {
 107 #ifdef RE_ENABLE_I18N
 108       if (dfa->mb_cur_max > 1)
 109         build_wcs_buffer (pstr);
 110       else
 111 #endif
 112         {
 113           if (trans != NULL)
 114             re_string_translate_buffer (pstr);
 115           else
 116             {
 117               pstr->valid_len = pstr->bufs_len;
 118               pstr->valid_raw_len = pstr->bufs_len;
 119             }
 120         }
 121     }
 122
 123   return REG_NOERROR;
 124 }
 125
 126 /* Helper functions for re_string_allocate, and re_string_construct.  */
 127
 128 static reg_errcode_t
 129 internal_function
 130 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
 131 {
 132 #ifdef RE_ENABLE_I18N
 133   if (pstr->mb_cur_max > 1)
 134     {
 135       wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 136       if (BE (new_wcs == NULL, 0))
 137         return REG_ESPACE;
 138       pstr->wcs = new_wcs;
 139       if (pstr->offsets != NULL)
 140         {
 141           int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
 142           if (BE (new_offsets == NULL, 0))
 143             return REG_ESPACE;
 144           pstr->offsets = new_offsets;
 145         }
 146     }
 147 #endif /* RE_ENABLE_I18N  */
 148   if (pstr->mbs_allocated)
 149     {
 150       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 151                                            new_buf_len);
 152       if (BE (new_mbs == NULL, 0))
 153         return REG_ESPACE;
 154       pstr->mbs = new_mbs;
 155     }
 156   pstr->bufs_len = new_buf_len;
 157   return REG_NOERROR;
 158 }
 159
 160
 161 static void
 162 internal_function
 163 re_string_construct_common (const char *str, int len, re_string_t *pstr,
 164                             __RE_TRANSLATE_TYPE trans, int icase,
 165                             const re_dfa_t *dfa)
 166 {
 167   pstr->raw_mbs = (const unsigned char *) str;
 168   pstr->len = len;
 169   pstr->raw_len = len;
 170   pstr->trans = trans;
 171   pstr->icase = icase ? 1 : 0;
 172   pstr->mbs_allocated = (trans != NULL || icase);
 173   pstr->mb_cur_max = dfa->mb_cur_max;
 174   pstr->is_utf8 = dfa->is_utf8;
 175   pstr->map_notascii = dfa->map_notascii;
 176   pstr->stop = pstr->len;
 177   pstr->raw_stop = pstr->stop;
 178 }
 179
 180 #ifdef RE_ENABLE_I18N
 181
 182 /* Build wide character buffer PSTR->WCS.
 183    If the byte sequence of the string are:
 184      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 185    Then wide character buffer will be:
 186      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 187    We use WEOF for padding, they indicate that the position isn't
 188    a first byte of a multibyte character.
 189
 190    Note that this function assumes PSTR->VALID_LEN elements are already
 191    built and starts from PSTR->VALID_LEN.  */
 192
 193 static void
 194 internal_function
 195 build_wcs_buffer (re_string_t *pstr)
 196 {
 197 #if defined __UCLIBC__
 198   unsigned char buf[MB_LEN_MAX];
 199   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 200 #else
 201   unsigned char buf[64];
 202 #endif
 203   mbstate_t prev_st;
 204   int byte_idx, end_idx, remain_len;
 205   size_t mbclen;
 206
 207   /* Build the buffers from pstr->valid_len to either pstr->len or
 208      pstr->bufs_len.  */
 209   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 210   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 211     {
 212       wchar_t wc;
 213       const char *p;
 214
 215       remain_len = end_idx - byte_idx;
 216       prev_st = pstr->cur_state;
 217       /* Apply the translation if we need.  */
 218       if (BE (pstr->trans != NULL, 0))
 219         {
 220           int i, ch;
 221
 222           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 223             {
 224               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 225               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 226             }
 227           p = (const char *) buf;
 228         }
 229       else
 230         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 231       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 232       if (BE (mbclen == (size_t) -2, 0))
 233         {
 234           /* The buffer doesn't have enough space, finish to build.  */
 235           pstr->cur_state = prev_st;
 236           break;
 237         }
 238       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 239         {
 240           /* We treat these cases as a singlebyte character.  */
 241           mbclen = 1;
 242           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 243           if (BE (pstr->trans != NULL, 0))
 244             wc = pstr->trans[wc];
 245           pstr->cur_state = prev_st;
 246         }
 247
 248       /* Write wide character and padding.  */
 249       pstr->wcs[byte_idx++] = wc;
 250       /* Write paddings.  */
 251       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 252         pstr->wcs[byte_idx++] = WEOF;
 253     }
 254   pstr->valid_len = byte_idx;
 255   pstr->valid_raw_len = byte_idx;
 256 }
 257
 258 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 259    but for REG_ICASE.  */
 260
 261 static reg_errcode_t
 262 internal_function
 263 build_wcs_upper_buffer (re_string_t *pstr)
 264 {
 265   mbstate_t prev_st;
 266   int src_idx, byte_idx, end_idx, remain_len;
 267   size_t mbclen;
 268 #if defined __UCLIBC__
 269   char buf[MB_LEN_MAX];
 270   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 271 #else
 272   char buf[64];
 273 #endif
 274
 275   byte_idx = pstr->valid_len;
 276   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 277
 278   /* The following optimization assumes that ASCII characters can be
 279      mapped to wide characters with a simple cast.  */
 280   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 281     {
 282       while (byte_idx < end_idx)
 283         {
 284           wchar_t wc;
 285
 286           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 287               && mbsinit (&pstr->cur_state))
 288             {
 289               /* In case of a singlebyte character.  */
 290               pstr->mbs[byte_idx]
 291                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 292               /* The next step uses the assumption that wchar_t is encoded
 293                  ASCII-safe: all ASCII values can be converted like this.  */
 294               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 295               ++byte_idx;
 296               continue;
 297             }
 298
 299           remain_len = end_idx - byte_idx;
 300           prev_st = pstr->cur_state;
 301           mbclen = mbrtowc (&wc,
 302                             ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 303                              + byte_idx), remain_len, &pstr->cur_state);
 304           if (BE (mbclen + 2 > 2, 1))
 305             {
 306               wchar_t wcu = wc;
 307               if (iswlower (wc))
 308                 {
 309                   size_t mbcdlen;
 310
 311                   wcu = towupper (wc);
 312                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 313                   if (BE (mbclen == mbcdlen, 1))
 314                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 315                   else
 316                     {
 317                       src_idx = byte_idx;
 318                       goto offsets_needed;
 319                     }
 320                 }
 321               else
 322                 memcpy (pstr->mbs + byte_idx,
 323                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 324               pstr->wcs[byte_idx++] = wcu;
 325               /* Write paddings.  */
 326               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 327                 pstr->wcs[byte_idx++] = WEOF;
 328             }
 329           else if (mbclen == (size_t) -1 || mbclen == 0)
 330             {
 331               /* It is an invalid character or '\0'.  Just use the byte.  */
 332               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 333               pstr->mbs[byte_idx] = ch;
 334               /* And also cast it to wide char.  */
 335               pstr->wcs[byte_idx++] = (wchar_t) ch;
 336               if (BE (mbclen == (size_t) -1, 0))
 337                 pstr->cur_state = prev_st;
 338             }
 339           else
 340             {
 341               /* The buffer doesn't have enough space, finish to build.  */
 342               pstr->cur_state = prev_st;
 343               break;
 344             }
 345         }
 346       pstr->valid_len = byte_idx;
 347       pstr->valid_raw_len = byte_idx;
 348       return REG_NOERROR;
 349     }
 350   else
 351     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 352       {
 353         wchar_t wc;
 354         const char *p;
 355       offsets_needed:
 356         remain_len = end_idx - byte_idx;
 357         prev_st = pstr->cur_state;
 358         if (BE (pstr->trans != NULL, 0))
 359           {
 360             int i, ch;
 361
 362             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 363               {
 364                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 365                 buf[i] = pstr->trans[ch];
 366               }
 367             p = (const char *) buf;
 368           }
 369         else
 370           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 371         mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 372         if (BE (mbclen + 2 > 2, 1))
 373           {
 374             wchar_t wcu = wc;
 375             if (iswlower (wc))
 376               {
 377                 size_t mbcdlen;
 378
 379                 wcu = towupper (wc);
 380                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 381                 if (BE (mbclen == mbcdlen, 1))
 382                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 383                 else if (mbcdlen != (size_t) -1)
 384                   {
 385                     size_t i;
 386
 387                     if (byte_idx + mbcdlen > pstr->bufs_len)
 388                       {
 389                         pstr->cur_state = prev_st;
 390                         break;
 391                       }
 392
 393                     if (pstr->offsets == NULL)
 394                       {
 395                         pstr->offsets = re_malloc (int, pstr->bufs_len);
 396
 397                         if (pstr->offsets == NULL)
 398                           return REG_ESPACE;
 399                       }
 400                     if (!pstr->offsets_needed)
 401                       {
 402                         for (i = 0; i < (size_t) byte_idx; ++i)
 403                           pstr->offsets[i] = i;
 404                         pstr->offsets_needed = 1;
 405                       }
 406
 407                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 408                     pstr->wcs[byte_idx] = wcu;
 409                     pstr->offsets[byte_idx] = src_idx;
 410                     for (i = 1; i < mbcdlen; ++i)
 411                       {
 412                         pstr->offsets[byte_idx + i]
 413                           = src_idx + (i < mbclen ? i : mbclen - 1);
 414                         pstr->wcs[byte_idx + i] = WEOF;
 415                       }
 416                     pstr->len += mbcdlen - mbclen;
 417                     if (pstr->raw_stop > src_idx)
 418                       pstr->stop += mbcdlen - mbclen;
 419                     end_idx = (pstr->bufs_len > pstr->len)
 420                               ? pstr->len : pstr->bufs_len;
 421                     byte_idx += mbcdlen;
 422                     src_idx += mbclen;
 423                     continue;
 424                   }
 425                 else
 426                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 427               }
 428             else
 429               memcpy (pstr->mbs + byte_idx, p, mbclen);
 430
 431             if (BE (pstr->offsets_needed != 0, 0))
 432               {
 433                 size_t i;
 434                 for (i = 0; i < mbclen; ++i)
 435                   pstr->offsets[byte_idx + i] = src_idx + i;
 436               }
 437             src_idx += mbclen;
 438
 439             pstr->wcs[byte_idx++] = wcu;
 440             /* Write paddings.  */
 441             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 442               pstr->wcs[byte_idx++] = WEOF;
 443           }
 444         else if (mbclen == (size_t) -1 || mbclen == 0)
 445           {
 446             /* It is an invalid character or '\0'.  Just use the byte.  */
 447             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 448
 449             if (BE (pstr->trans != NULL, 0))
 450               ch = pstr->trans [ch];
 451             pstr->mbs[byte_idx] = ch;
 452
 453             if (BE (pstr->offsets_needed != 0, 0))
 454               pstr->offsets[byte_idx] = src_idx;
 455             ++src_idx;
 456
 457             /* And also cast it to wide char.  */
 458             pstr->wcs[byte_idx++] = (wchar_t) ch;
 459             if (BE (mbclen == (size_t) -1, 0))
 460               pstr->cur_state = prev_st;
 461           }
 462         else
 463           {
 464             /* The buffer doesn't have enough space, finish to build.  */
 465             pstr->cur_state = prev_st;
 466             break;
 467           }
 468       }
 469   pstr->valid_len = byte_idx;
 470   pstr->valid_raw_len = src_idx;
 471   return REG_NOERROR;
 472 }
 473
 474 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 475    Return the index.  */
 476
 477 static int
 478 internal_function
 479 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
 480 {
 481   mbstate_t prev_st;
 482   int rawbuf_idx;
 483   size_t mbclen;
 484   wchar_t wc = 0;
 485
 486   /* Skip the characters which are not necessary to check.  */
 487   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 488        rawbuf_idx < new_raw_idx;)
 489     {
 490       int remain_len;
 491       remain_len = pstr->len - rawbuf_idx;
 492       prev_st = pstr->cur_state;
 493       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
 494                         remain_len, &pstr->cur_state);
 495       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 496         {
 497           /* We treat these cases as a singlebyte character.  */
 498           mbclen = 1;
 499           pstr->cur_state = prev_st;
 500         }
 501       /* Then proceed the next character.  */
 502       rawbuf_idx += mbclen;
 503     }
 504   *last_wc = (wint_t) wc;
 505   return rawbuf_idx;
 506 }
 507 #endif /* RE_ENABLE_I18N  */
 508
 509 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 510    This function is used in case of REG_ICASE.  */
 511
 512 static void
 513 internal_function
 514 build_upper_buffer (re_string_t *pstr)
 515 {
 516   int char_idx, end_idx;
 517   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 518
 519   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 520     {
 521       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 522       if (BE (pstr->trans != NULL, 0))
 523         ch = pstr->trans[ch];
 524       if (islower (ch))
 525         pstr->mbs[char_idx] = toupper (ch);
 526       else
 527         pstr->mbs[char_idx] = ch;
 528     }
 529   pstr->valid_len = char_idx;
 530   pstr->valid_raw_len = char_idx;
 531 }
 532
 533 /* Apply TRANS to the buffer in PSTR.  */
 534
 535 static void
 536 internal_function
 537 re_string_translate_buffer (re_string_t *pstr)
 538 {
 539   int buf_idx, end_idx;
 540   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 541
 542   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 543     {
 544       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 545       pstr->mbs[buf_idx] = pstr->trans[ch];
 546     }
 547
 548   pstr->valid_len = buf_idx;
 549   pstr->valid_raw_len = buf_idx;
 550 }
 551
 552 /* This function re-construct the buffers.
 553    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 554    convert to upper case in case of REG_ICASE, apply translation.  */
 555
 556 static reg_errcode_t
 557 internal_function
 558 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
 559 {
 560   int offset = idx - pstr->raw_mbs_idx;
 561   if (BE (offset < 0, 0))
 562     {
 563       /* Reset buffer.  */
 564 #ifdef RE_ENABLE_I18N
 565       if (pstr->mb_cur_max > 1)
 566         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 567 #endif
 568       pstr->len = pstr->raw_len;
 569       pstr->stop = pstr->raw_stop;
 570       pstr->valid_len = 0;
 571       pstr->raw_mbs_idx = 0;
 572       pstr->valid_raw_len = 0;
 573       pstr->offsets_needed = 0;
 574       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 575                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 576       if (!pstr->mbs_allocated)
 577         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 578       offset = idx;
 579     }
 580
 581   if (BE (offset != 0, 1))
 582     {
 583       /* Are the characters which are already checked remain?  */
 584       if (BE (offset < pstr->valid_raw_len, 1)
 585 #ifdef RE_ENABLE_I18N
 586           /* Handling this would enlarge the code too much.
 587              Accept a slowdown in that case.  */
 588           && pstr->offsets_needed == 0
 589 #endif
 590          )
 591         {
 592           /* Yes, move them to the front of the buffer.  */
 593           pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
 594 #ifdef RE_ENABLE_I18N
 595           if (pstr->mb_cur_max > 1)
 596             memmove (pstr->wcs, pstr->wcs + offset,
 597                      (pstr->valid_len - offset) * sizeof (wint_t));
 598 #endif
 599           if (BE (pstr->mbs_allocated, 0))
 600             memmove (pstr->mbs, pstr->mbs + offset,
 601                      pstr->valid_len - offset);
 602           pstr->valid_len -= offset;
 603           pstr->valid_raw_len -= offset;
 604 #ifdef DEBUG
 605           assert (pstr->valid_len > 0);
 606 #endif
 607         }
 608       else
 609         {
 610           /* No, skip all characters until IDX.  */
 611 #ifdef RE_ENABLE_I18N
 612           if (BE (pstr->offsets_needed, 0))
 613             {
 614               pstr->len = pstr->raw_len - idx + offset;
 615               pstr->stop = pstr->raw_stop - idx + offset;
 616               pstr->offsets_needed = 0;
 617             }
 618 #endif
 619           pstr->valid_len = 0;
 620           pstr->valid_raw_len = 0;
 621 #ifdef RE_ENABLE_I18N
 622           if (pstr->mb_cur_max > 1)
 623             {
 624               int wcs_idx;
 625               wint_t wc = WEOF;
 626
 627               if (pstr->is_utf8)
 628                 {
 629                   const unsigned char *raw, *p, *end;
 630
 631                   /* Special case UTF-8.  Multi-byte chars start with any
 632                      byte other than 0x80 - 0xbf.  */
 633                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 634                   end = raw + (offset - pstr->mb_cur_max);
 635                   p = raw + offset - 1;
 636 #if 0
 637                   /* We know the wchar_t encoding is UCS4, so for the simple
 638                      case, ASCII characters, skip the conversion step.  */
 639                   if (isascii (*p) && BE (pstr->trans == NULL, 1))
 640                     {
 641                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 642                       pstr->valid_len = 0;
 643                       wc = (wchar_t) *p;
 644                     }
 645                   else
 646 #endif
 647                     for (; p >= end; --p)
 648                       if ((*p & 0xc0) != 0x80)
 649                         {
 650                           mbstate_t cur_state;
 651                           wchar_t wc2;
 652                           int mlen = raw + pstr->len - p;
 653                           size_t mbclen;
 654
 655                           /* XXX Don't use mbrtowc, we know which conversion
 656                              to use (UTF-8 -> UCS4).  */
 657                           memset (&cur_state, 0, sizeof (cur_state));
 658                           mbclen = mbrtowc (&wc2, (const char *) p, mlen,
 659                                             &cur_state);
 660                           if (raw + offset - p <= mbclen
 661                               && mbclen < (size_t) -2)
 662                             {
 663                               memset (&pstr->cur_state, '\0',
 664                                       sizeof (mbstate_t));
 665                               pstr->valid_len = mbclen - (raw + offset - p);
 666                               wc = wc2;
 667                             }
 668                           break;
 669                         }
 670                 }
 671
 672               if (wc == WEOF)
 673                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 674               if (BE (pstr->valid_len, 0))
 675                 {
 676                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 677                     pstr->wcs[wcs_idx] = WEOF;
 678                   if (pstr->mbs_allocated)
 679                     memset (pstr->mbs, 255, pstr->valid_len);
 680                 }
 681               pstr->valid_raw_len = pstr->valid_len;
 682               pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 683                                     && IS_WIDE_WORD_CHAR (wc))
 684                                    ? CONTEXT_WORD
 685                                    : ((IS_WIDE_NEWLINE (wc)
 686                                        && pstr->newline_anchor)
 687                                       ? CONTEXT_NEWLINE : 0));
 688             }
 689           else
 690 #endif /* RE_ENABLE_I18N */
 691             {
 692               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 693               if (pstr->trans)
 694                 c = pstr->trans[c];
 695               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 696                                    ? CONTEXT_WORD
 697                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 698                                       ? CONTEXT_NEWLINE : 0));
 699             }
 700         }
 701       if (!BE (pstr->mbs_allocated, 0))
 702         pstr->mbs += offset;
 703     }
 704   pstr->raw_mbs_idx = idx;
 705   pstr->len -= offset;
 706   pstr->stop -= offset;
 707
 708   /* Then build the buffers.  */
 709 #ifdef RE_ENABLE_I18N
 710   if (pstr->mb_cur_max > 1)
 711     {
 712       if (pstr->icase)
 713         {
 714           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 715           if (BE (ret != REG_NOERROR, 0))
 716             return ret;
 717         }
 718       else
 719         build_wcs_buffer (pstr);
 720     }
 721   else
 722 #endif
 723     if (BE (pstr->mbs_allocated, 0))
 724       {
 725         if (pstr->icase)
 726           build_upper_buffer (pstr);
 727         else if (pstr->trans != NULL)
 728           re_string_translate_buffer (pstr);
 729       }
 730     else
 731       pstr->valid_len = pstr->len;
 732
 733   pstr->cur_idx = 0;
 734   return REG_NOERROR;
 735 }
 736
 737 static unsigned char
 738 internal_function __attribute ((pure))
 739 re_string_peek_byte_case (const re_string_t *pstr, int idx)
 740 {
 741   int ch, off;
 742
 743   /* Handle the common (easiest) cases first.  */
 744   if (BE (!pstr->mbs_allocated, 1))
 745     return re_string_peek_byte (pstr, idx);
 746
 747 #ifdef RE_ENABLE_I18N
 748   if (pstr->mb_cur_max > 1
 749       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 750     return re_string_peek_byte (pstr, idx);
 751 #endif
 752
 753   off = pstr->cur_idx + idx;
 754 #ifdef RE_ENABLE_I18N
 755   if (pstr->offsets_needed)
 756     off = pstr->offsets[off];
 757 #endif
 758
 759   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 760
 761 #ifdef RE_ENABLE_I18N
 762   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 763      this function returns CAPITAL LETTER I instead of first byte of
 764      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 765      since peek_byte_case doesn't advance cur_idx in any way.  */
 766   if (pstr->offsets_needed && !isascii (ch))
 767     return re_string_peek_byte (pstr, idx);
 768 #endif
 769
 770   return ch;
 771 }
 772
 773 static unsigned char
 774 internal_function __attribute ((pure))
 775 re_string_fetch_byte_case (re_string_t *pstr)
 776 {
 777   if (BE (!pstr->mbs_allocated, 1))
 778     return re_string_fetch_byte (pstr);
 779
 780 #ifdef RE_ENABLE_I18N
 781   if (pstr->offsets_needed)
 782     {
 783       int off, ch;
 784
 785       /* For tr_TR.UTF-8 [[:islower:]] there is
 786          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 787          in that case the whole multi-byte character and return
 788          the original letter.  On the other side, with
 789          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 790          anything else would complicate things too much.  */
 791
 792       if (!re_string_first_byte (pstr, pstr->cur_idx))
 793         return re_string_fetch_byte (pstr);
 794
 795       off = pstr->offsets[pstr->cur_idx];
 796       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 797
 798       if (! isascii (ch))
 799         return re_string_fetch_byte (pstr);
 800
 801       re_string_skip_bytes (pstr,
 802                             re_string_char_size_at (pstr, pstr->cur_idx));
 803       return ch;
 804     }
 805 #endif
 806
 807   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 808 }
 809
 810 static void
 811 internal_function
 812 re_string_destruct (re_string_t *pstr)
 813 {
 814 #ifdef RE_ENABLE_I18N
 815   re_free (pstr->wcs);
 816   re_free (pstr->offsets);
 817 #endif /* RE_ENABLE_I18N  */
 818   if (pstr->mbs_allocated)
 819     re_free (pstr->mbs);
 820 }
 821
 822 /* Return the context at IDX in INPUT.  */
 823
 824 static unsigned int
 825 internal_function
 826 re_string_context_at (const re_string_t *input, int idx, int eflags)
 827 {
 828   int c;
 829   if (BE (idx < 0, 0))
 830     /* In this case, we use the value stored in input->tip_context,
 831        since we can't know the character in input->mbs[-1] here.  */
 832     return input->tip_context;
 833   if (BE (idx == input->len, 0))
 834     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 835             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 836 #ifdef RE_ENABLE_I18N
 837   if (input->mb_cur_max > 1)
 838     {
 839       wint_t wc;
 840       int wc_idx = idx;
 841       while(input->wcs[wc_idx] == WEOF)
 842         {
 843 #ifdef DEBUG
 844           /* It must not happen.  */
 845           assert (wc_idx >= 0);
 846 #endif
 847           --wc_idx;
 848           if (wc_idx < 0)
 849             return input->tip_context;
 850         }
 851       wc = input->wcs[wc_idx];
 852       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 853         return CONTEXT_WORD;
 854       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 855               ? CONTEXT_NEWLINE : 0);
 856     }
 857 #endif
 858   c = re_string_byte_at (input, idx);
 859   if (bitset_contain (input->word_char, c))
 860     return CONTEXT_WORD;
 861   return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 862 }
 863
 864 /* Functions for set operation.  */
 865
 866 static reg_errcode_t
 867 internal_function
 868 re_node_set_alloc (re_node_set *set, int size)
 869 {
 870   set->alloc = size;
 871   set->nelem = 0;
 872   set->elems = re_malloc (int, size);   /* can be NULL if size == 0
 873                                            (see re_node_set_init_empty(set)) */
 874   if (BE (set->elems == NULL && size != 0, 0))
 875     return REG_ESPACE;
 876   return REG_NOERROR;
 877 }
 878
 879 static reg_errcode_t
 880 internal_function
 881 re_node_set_init_1 (re_node_set *set, int elem)
 882 {
 883   set->alloc = 1;
 884   set->nelem = 1;
 885   set->elems = re_malloc (int, 1);
 886   if (BE (set->elems == NULL, 0))
 887     {
 888       set->alloc = set->nelem = 0;
 889       return REG_ESPACE;
 890     }
 891   set->elems[0] = elem;
 892   return REG_NOERROR;
 893 }
 894
 895 static reg_errcode_t
 896 internal_function
 897 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
 898 {
 899   set->alloc = 2;
 900   set->elems = re_malloc (int, 2);
 901   if (BE (set->elems == NULL, 0))
 902     return REG_ESPACE;
 903   if (elem1 == elem2)
 904     {
 905       set->nelem = 1;
 906       set->elems[0] = elem1;
 907     }
 908   else
 909     {
 910       set->nelem = 2;
 911       if (elem1 < elem2)
 912         {
 913           set->elems[0] = elem1;
 914           set->elems[1] = elem2;
 915         }
 916       else
 917         {
 918           set->elems[0] = elem2;
 919           set->elems[1] = elem1;
 920         }
 921     }
 922   return REG_NOERROR;
 923 }
 924
 925 static reg_errcode_t
 926 internal_function
 927 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
 928 {
 929   dest->nelem = src->nelem;
 930   if (src->nelem > 0)
 931     {
 932       dest->alloc = dest->nelem;
 933       dest->elems = re_malloc (int, dest->alloc);
 934       if (BE (dest->elems == NULL, 0))
 935         {
 936           dest->alloc = dest->nelem = 0;
 937           return REG_ESPACE;
 938         }
 939       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
 940     }
 941   else
 942     re_node_set_init_empty (dest);
 943   return REG_NOERROR;
 944 }
 945
 946 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
 947    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
 948    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
 949
 950 static reg_errcode_t
 951 internal_function
 952 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
 953                            const re_node_set *src2)
 954 {
 955   int i1, i2, is, id, delta, sbase;
 956   if (src1->nelem == 0 || src2->nelem == 0)
 957     return REG_NOERROR;
 958
 959   /* We need dest->nelem + 2 * elems_in_intersection; this is a
 960      conservative estimate.  */
 961   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
 962     {
 963       int new_alloc = src1->nelem + src2->nelem + dest->alloc;
 964       int *new_elems = re_realloc (dest->elems, int, new_alloc);
 965       if (BE (new_elems == NULL, 0))
 966         return REG_ESPACE;
 967       dest->elems = new_elems;
 968       dest->alloc = new_alloc;
 969     }
 970
 971   /* Find the items in the intersection of SRC1 and SRC2, and copy
 972      into the top of DEST those that are not already in DEST itself.  */
 973   sbase = dest->nelem + src1->nelem + src2->nelem;
 974   i1 = src1->nelem - 1;
 975   i2 = src2->nelem - 1;
 976   id = dest->nelem - 1;
 977   for (;;)
 978     {
 979       if (src1->elems[i1] == src2->elems[i2])
 980         {
 981           /* Try to find the item in DEST.  Maybe we could binary search?  */
 982           while (id >= 0 && dest->elems[id] > src1->elems[i1])
 983             --id;
 984
 985           if (id < 0 || dest->elems[id] != src1->elems[i1])
 986             dest->elems[--sbase] = src1->elems[i1];
 987
 988           if (--i1 < 0 || --i2 < 0)
 989             break;
 990         }
 991
 992       /* Lower the highest of the two items.  */
 993       else if (src1->elems[i1] < src2->elems[i2])
 994         {
 995           if (--i2 < 0)
 996             break;
 997         }
 998       else
 999         {
1000           if (--i1 < 0)
1001             break;
1002         }
1003     }
1004
1005   id = dest->nelem - 1;
1006   is = dest->nelem + src1->nelem + src2->nelem - 1;
1007   delta = is - sbase + 1;
1008
1009   /* Now copy.  When DELTA becomes zero, the remaining
1010      DEST elements are already in place; this is more or
1011      less the same loop that is in re_node_set_merge.  */
1012   dest->nelem += delta;
1013   if (delta > 0 && id >= 0)
1014     for (;;)
1015       {
1016         if (dest->elems[is] > dest->elems[id])
1017           {
1018             /* Copy from the top.  */
1019             dest->elems[id + delta--] = dest->elems[is--];
1020             if (delta == 0)
1021               break;
1022           }
1023         else
1024           {
1025             /* Slide from the bottom.  */
1026             dest->elems[id + delta] = dest->elems[id];
1027             if (--id < 0)
1028               break;
1029           }
1030       }
1031
1032   /* Copy remaining SRC elements.  */
1033   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1034
1035   return REG_NOERROR;
1036 }
1037
1038 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1039    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1040
1041 static reg_errcode_t
1042 internal_function
1043 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1044                         const re_node_set *src2)
1045 {
1046   int i1, i2, id;
1047   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1048     {
1049       dest->alloc = src1->nelem + src2->nelem;
1050       dest->elems = re_malloc (int, dest->alloc);
1051       if (BE (dest->elems == NULL, 0))
1052         return REG_ESPACE;
1053     }
1054   else
1055     {
1056       if (src1 != NULL && src1->nelem > 0)
1057         return re_node_set_init_copy (dest, src1);
1058       if (src2 != NULL && src2->nelem > 0)
1059         return re_node_set_init_copy (dest, src2);
1060       re_node_set_init_empty (dest);
1061       return REG_NOERROR;
1062     }
1063   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1064     {
1065       if (src1->elems[i1] > src2->elems[i2])
1066         {
1067           dest->elems[id++] = src2->elems[i2++];
1068           continue;
1069         }
1070       if (src1->elems[i1] == src2->elems[i2])
1071         ++i2;
1072       dest->elems[id++] = src1->elems[i1++];
1073     }
1074   if (i1 < src1->nelem)
1075     {
1076       memcpy (dest->elems + id, src1->elems + i1,
1077              (src1->nelem - i1) * sizeof (int));
1078       id += src1->nelem - i1;
1079     }
1080   else if (i2 < src2->nelem)
1081     {
1082       memcpy (dest->elems + id, src2->elems + i2,
1083              (src2->nelem - i2) * sizeof (int));
1084       id += src2->nelem - i2;
1085     }
1086   dest->nelem = id;
1087   return REG_NOERROR;
1088 }
1089
1090 /* Calculate the union set of the sets DEST and SRC. And store it to
1091    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1092
1093 static reg_errcode_t
1094 internal_function
1095 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1096 {
1097   int is, id, sbase, delta;
1098   if (src == NULL || src->nelem == 0)
1099     return REG_NOERROR;
1100   if (dest->alloc < 2 * src->nelem + dest->nelem)
1101     {
1102       int new_alloc = 2 * (src->nelem + dest->alloc);
1103       int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1104       if (BE (new_buffer == NULL, 0))
1105         return REG_ESPACE;
1106       dest->elems = new_buffer;
1107       dest->alloc = new_alloc;
1108     }
1109
1110   if (BE (dest->nelem == 0, 0))
1111     {
1112       dest->nelem = src->nelem;
1113       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1114       return REG_NOERROR;
1115     }
1116
1117   /* Copy into the top of DEST the items of SRC that are not
1118      found in DEST.  Maybe we could binary search in DEST?  */
1119   for (sbase = dest->nelem + 2 * src->nelem,
1120        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1121     {
1122       if (dest->elems[id] == src->elems[is])
1123         is--, id--;
1124       else if (dest->elems[id] < src->elems[is])
1125         dest->elems[--sbase] = src->elems[is--];
1126       else /* if (dest->elems[id] > src->elems[is]) */
1127         --id;
1128     }
1129
1130   if (is >= 0)
1131     {
1132       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1133       sbase -= is + 1;
1134       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1135     }
1136
1137   id = dest->nelem - 1;
1138   is = dest->nelem + 2 * src->nelem - 1;
1139   delta = is - sbase + 1;
1140   if (delta == 0)
1141     return REG_NOERROR;
1142
1143   /* Now copy.  When DELTA becomes zero, the remaining
1144      DEST elements are already in place.  */
1145   dest->nelem += delta;
1146   for (;;)
1147     {
1148       if (dest->elems[is] > dest->elems[id])
1149         {
1150           /* Copy from the top.  */
1151           dest->elems[id + delta--] = dest->elems[is--];
1152           if (delta == 0)
1153             break;
1154         }
1155       else
1156         {
1157           /* Slide from the bottom.  */
1158           dest->elems[id + delta] = dest->elems[id];
1159           if (--id < 0)
1160             {
1161               /* Copy remaining SRC elements.  */
1162               memcpy (dest->elems, dest->elems + sbase,
1163                       delta * sizeof (int));
1164               break;
1165             }
1166         }
1167     }
1168
1169   return REG_NOERROR;
1170 }
1171
1172 /* Insert the new element ELEM to the re_node_set* SET.
1173    SET should not already have ELEM.
1174    return -1 if an error is occured, return 1 otherwise.  */
1175
1176 static int
1177 internal_function
1178 re_node_set_insert (re_node_set *set, int elem)
1179 {
1180   int idx;
1181   /* In case the set is empty.  */
1182   if (set->alloc == 0)
1183     {
1184       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1185         return 1;
1186       return -1;
1187     }
1188
1189   if (BE (set->nelem, 0) == 0)
1190     {
1191       /* We already guaranteed above that set->alloc != 0.  */
1192       set->elems[0] = elem;
1193       ++set->nelem;
1194       return 1;
1195     }
1196
1197   /* Realloc if we need.  */
1198   if (set->alloc == set->nelem)
1199     {
1200       int *new_elems;
1201       set->alloc = set->alloc * 2;
1202       new_elems = re_realloc (set->elems, int, set->alloc);
1203       if (BE (new_elems == NULL, 0))
1204         return -1;
1205       set->elems = new_elems;
1206     }
1207
1208   /* Move the elements which follows the new element.  Test the
1209      first element separately to skip a check in the inner loop.  */
1210   if (elem < set->elems[0])
1211     {
1212       idx = 0;
1213       for (idx = set->nelem; idx > 0; idx--)
1214         set->elems[idx] = set->elems[idx - 1];
1215     }
1216   else
1217     {
1218       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1219         set->elems[idx] = set->elems[idx - 1];
1220     }
1221
1222   /* Insert the new element.  */
1223   set->elems[idx] = elem;
1224   ++set->nelem;
1225   return 1;
1226 }
1227
1228 /* Insert the new element ELEM to the re_node_set* SET.
1229    SET should not already have any element greater than or equal to ELEM.
1230    Return -1 if an error is occured, return 1 otherwise.  */
1231
1232 static int
1233 internal_function
1234 re_node_set_insert_last (re_node_set *set, int elem)
1235 {
1236   /* Realloc if we need.  */
1237   if (set->alloc == set->nelem)
1238     {
1239       int *new_elems;
1240       set->alloc = (set->alloc + 1) * 2;
1241       new_elems = re_realloc (set->elems, int, set->alloc);
1242       if (BE (new_elems == NULL, 0))
1243         return -1;
1244       set->elems = new_elems;
1245     }
1246
1247   /* Insert the new element.  */
1248   set->elems[set->nelem++] = elem;
1249   return 1;
1250 }
1251
1252 /* Compare two node sets SET1 and SET2.
1253    return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1254
1255 static int
1256 internal_function __attribute ((pure))
1257 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1258 {
1259   int i;
1260   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1261     return 0;
1262   for (i = set1->nelem ; --i >= 0 ; )
1263     if (set1->elems[i] != set2->elems[i])
1264       return 0;
1265   return 1;
1266 }
1267
1268 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1269
1270 static int
1271 internal_function __attribute ((pure))
1272 re_node_set_contains (const re_node_set *set, int elem)
1273 {
1274   unsigned int idx, right, mid;
1275   if (set->nelem <= 0)
1276     return 0;
1277
1278   /* Binary search the element.  */
1279   idx = 0;
1280   right = set->nelem - 1;
1281   while (idx < right)
1282     {
1283       mid = (idx + right) / 2;
1284       if (set->elems[mid] < elem)
1285         idx = mid + 1;
1286       else
1287         right = mid;
1288     }
1289   return set->elems[idx] == elem ? idx + 1 : 0;
1290 }
1291
1292 static void
1293 internal_function
1294 re_node_set_remove_at (re_node_set *set, int idx)
1295 {
1296   if (idx < 0 || idx >= set->nelem)
1297     return;
1298   --set->nelem;
1299   for (; idx < set->nelem; idx++)
1300     set->elems[idx] = set->elems[idx + 1];
1301 }
1302
1303
1304 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1305    Or return -1, if an error will be occured.  */
1306
1307 static int
1308 internal_function
1309 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1310 {
1311 #ifdef RE_ENABLE_I18N
1312   int type = token.type;
1313 #endif
1314   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1315     {
1316       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1317       int *new_nexts, *new_indices;
1318       re_node_set *new_edests, *new_eclosures;
1319       re_token_t *new_nodes;
1320
1321       /* Avoid overflows.  */
1322       if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
1323         return -1;
1324
1325       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1326       if (BE (new_nodes == NULL, 0))
1327         return -1;
1328       dfa->nodes = new_nodes;
1329       new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1330       new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1331       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1332       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1333       if (BE (new_nexts == NULL || new_indices == NULL
1334               || new_edests == NULL || new_eclosures == NULL, 0))
1335         return -1;
1336       dfa->nexts = new_nexts;
1337       dfa->org_indices = new_indices;
1338       dfa->edests = new_edests;
1339       dfa->eclosures = new_eclosures;
1340       dfa->nodes_alloc = new_nodes_alloc;
1341     }
1342   dfa->nodes[dfa->nodes_len] = token;
1343   dfa->nodes[dfa->nodes_len].constraint = 0;
1344 #ifdef RE_ENABLE_I18N
1345   dfa->nodes[dfa->nodes_len].accept_mb =
1346     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1347 #endif
1348   dfa->nexts[dfa->nodes_len] = -1;
1349   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1350   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1351   return dfa->nodes_len++;
1352 }
1353
1354 static __inline__ unsigned int
1355 internal_function
1356 calc_state_hash (const re_node_set *nodes, unsigned int context)
1357 {
1358   unsigned int hash = nodes->nelem + context;
1359   int i;
1360   for (i = 0 ; i < nodes->nelem ; i++)
1361     hash += nodes->elems[i];
1362   return hash;
1363 }
1364
1365 /* Search for the state whose node_set is equivalent to NODES.
1366    Return the pointer to the state, if we found it in the DFA.
1367    Otherwise create the new one and return it.  In case of an error
1368    return NULL and set the error code in ERR.
1369    Note: - We assume NULL as the invalid state, then it is possible that
1370            return value is NULL and ERR is REG_NOERROR.
1371          - We never return non-NULL value in case of any errors, it is for
1372            optimization.  */
1373
1374 static re_dfastate_t *
1375 internal_function
1376 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1377                   const re_node_set *nodes)
1378 {
1379   unsigned int hash;
1380   re_dfastate_t *new_state;
1381   struct re_state_table_entry *spot;
1382   int i;
1383   if (BE (nodes->nelem == 0, 0))
1384     {
1385       *err = REG_NOERROR;
1386       return NULL;
1387     }
1388   hash = calc_state_hash (nodes, 0);
1389   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1390
1391   for (i = 0 ; i < spot->num ; i++)
1392     {
1393       re_dfastate_t *state = spot->array[i];
1394       if (hash != state->hash)
1395         continue;
1396       if (re_node_set_compare (&state->nodes, nodes))
1397         return state;
1398     }
1399
1400   /* There are no appropriate state in the dfa, create the new one.  */
1401   new_state = create_ci_newstate (dfa, nodes, hash);
1402   if (BE (new_state == NULL, 0))
1403     *err = REG_ESPACE;
1404
1405   return new_state;
1406 }
1407
1408 /* Search for the state whose node_set is equivalent to NODES and
1409    whose context is equivalent to CONTEXT.
1410    Return the pointer to the state, if we found it in the DFA.
1411    Otherwise create the new one and return it.  In case of an error
1412    return NULL and set the error code in ERR.
1413    Note: - We assume NULL as the invalid state, then it is possible that
1414            return value is NULL and ERR is REG_NOERROR.
1415          - We never return non-NULL value in case of any errors, it is for
1416            optimization.  */
1417
1418 static re_dfastate_t *
1419 internal_function
1420 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1421                           const re_node_set *nodes, unsigned int context)
1422 {
1423   unsigned int hash;
1424   re_dfastate_t *new_state;
1425   struct re_state_table_entry *spot;
1426   int i;
1427   if (nodes->nelem == 0)
1428     {
1429       *err = REG_NOERROR;
1430       return NULL;
1431     }
1432   hash = calc_state_hash (nodes, context);
1433   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1434
1435   for (i = 0 ; i < spot->num ; i++)
1436     {
1437       re_dfastate_t *state = spot->array[i];
1438       if (state->hash == hash
1439           && state->context == context
1440           && re_node_set_compare (state->entrance_nodes, nodes))
1441         return state;
1442     }
1443   /* There are no appropriate state in `dfa', create the new one.  */
1444   new_state = create_cd_newstate (dfa, nodes, context, hash);
1445   if (BE (new_state == NULL, 0))
1446     *err = REG_ESPACE;
1447
1448   return new_state;
1449 }
1450
1451 /* Finish initialization of the new state NEWSTATE, and using its hash value
1452    HASH put in the appropriate bucket of DFA's state table.  Return value
1453    indicates the error code if failed.  */
1454
1455 static reg_errcode_t
1456 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1457                 unsigned int hash)
1458 {
1459   struct re_state_table_entry *spot;
1460   reg_errcode_t err;
1461   int i;
1462
1463   newstate->hash = hash;
1464   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1465   if (BE (err != REG_NOERROR, 0))
1466     return REG_ESPACE;
1467   for (i = 0; i < newstate->nodes.nelem; i++)
1468     {
1469       int elem = newstate->nodes.elems[i];
1470       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1471         re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1472     }
1473
1474   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1475   if (BE (spot->alloc <= spot->num, 0))
1476     {
1477       int new_alloc = 2 * spot->num + 2;
1478       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1479                                               new_alloc);
1480       if (BE (new_array == NULL, 0))
1481         return REG_ESPACE;
1482       spot->array = new_array;
1483       spot->alloc = new_alloc;
1484     }
1485   spot->array[spot->num++] = newstate;
1486   return REG_NOERROR;
1487 }
1488
1489 static void
1490 free_state (re_dfastate_t *state)
1491 {
1492   re_node_set_free (&state->non_eps_nodes);
1493   re_node_set_free (&state->inveclosure);
1494   if (state->entrance_nodes != &state->nodes)
1495     {
1496       re_node_set_free (state->entrance_nodes);
1497       re_free (state->entrance_nodes);
1498     }
1499   re_node_set_free (&state->nodes);
1500   re_free (state->word_trtable);
1501   re_free (state->trtable);
1502   re_free (state);
1503 }
1504
1505 /* Create the new state which is independ of contexts.
1506    Return the new state if succeeded, otherwise return NULL.  */
1507
1508 static re_dfastate_t *
1509 internal_function
1510 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1511                     unsigned int hash)
1512 {
1513   int i;
1514   reg_errcode_t err;
1515   re_dfastate_t *newstate;
1516
1517   newstate = calloc (sizeof (re_dfastate_t), 1);
1518   if (BE (newstate == NULL, 0))
1519     return NULL;
1520   err = re_node_set_init_copy (&newstate->nodes, nodes);
1521   if (BE (err != REG_NOERROR, 0))
1522     {
1523       re_free (newstate);
1524       return NULL;
1525     }
1526
1527   newstate->entrance_nodes = &newstate->nodes;
1528   for (i = 0 ; i < nodes->nelem ; i++)
1529     {
1530       re_token_t *node = dfa->nodes + nodes->elems[i];
1531       re_token_type_t type = node->type;
1532
1533       if (type == CHARACTER && !node->constraint)
1534         continue;
1535 #ifdef RE_ENABLE_I18N
1536       newstate->accept_mb |= node->accept_mb;
1537 #endif
1538
1539       /* If the state has the halt node, the state is a halt state.  */
1540       if (type == END_OF_RE)
1541         newstate->halt = 1;
1542       else if (type == OP_BACK_REF)
1543         newstate->has_backref = 1;
1544       else if (type == ANCHOR || node->constraint)
1545         newstate->has_constraint = 1;
1546     }
1547   err = register_state (dfa, newstate, hash);
1548   if (BE (err != REG_NOERROR, 0))
1549     {
1550       free_state (newstate);
1551       newstate = NULL;
1552     }
1553   return newstate;
1554 }
1555
1556 /* Create the new state which is depend on the context CONTEXT.
1557    Return the new state if succeeded, otherwise return NULL.  */
1558
1559 static re_dfastate_t *
1560 internal_function
1561 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1562                     unsigned int context, unsigned int hash)
1563 {
1564   int i, nctx_nodes = 0;
1565   reg_errcode_t err;
1566   re_dfastate_t *newstate;
1567
1568   newstate = calloc (sizeof (re_dfastate_t), 1);
1569   if (BE (newstate == NULL, 0))
1570     return NULL;
1571   err = re_node_set_init_copy (&newstate->nodes, nodes);
1572   if (BE (err != REG_NOERROR, 0))
1573     {
1574       re_free (newstate);
1575       return NULL;
1576     }
1577
1578   newstate->context = context;
1579   newstate->entrance_nodes = &newstate->nodes;
1580
1581   for (i = 0 ; i < nodes->nelem ; i++)
1582     {
1583       unsigned int constraint = 0;
1584       re_token_t *node = dfa->nodes + nodes->elems[i];
1585       re_token_type_t type = node->type;
1586       if (node->constraint)
1587         constraint = node->constraint;
1588
1589       if (type == CHARACTER && !constraint)
1590         continue;
1591 #ifdef RE_ENABLE_I18N
1592       newstate->accept_mb |= node->accept_mb;
1593 #endif /* RE_ENABLE_I18N */
1594
1595       /* If the state has the halt node, the state is a halt state.  */
1596       if (type == END_OF_RE)
1597         newstate->halt = 1;
1598       else if (type == OP_BACK_REF)
1599         newstate->has_backref = 1;
1600       else if (type == ANCHOR)
1601         constraint = node->opr.ctx_type;
1602
1603       if (constraint)
1604         {
1605           if (newstate->entrance_nodes == &newstate->nodes)
1606             {
1607               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1608               if (BE (newstate->entrance_nodes == NULL, 0))
1609                 {
1610                   free_state (newstate);
1611                   return NULL;
1612                 }
1613               re_node_set_init_copy (newstate->entrance_nodes, nodes);
1614               nctx_nodes = 0;
1615               newstate->has_constraint = 1;
1616             }
1617
1618           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1619             {
1620               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1621               ++nctx_nodes;
1622             }
1623         }
1624     }
1625   err = register_state (dfa, newstate, hash);
1626   if (BE (err != REG_NOERROR, 0))
1627     {
1628       free_state (newstate);
1629       newstate = NULL;
1630     }
1631   return  newstate;
1632 }