posix/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2006, 2010, 2011 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static void re_string_construct_common (const char *str, int len,
  22                                         re_string_t *pstr,
  23                                         RE_TRANSLATE_TYPE trans, int icase,
  24                                         const re_dfa_t *dfa) internal_function;
  25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  26                                           const re_node_set *nodes,
  27                                           unsigned int hash) internal_function;
  28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  29                                           const re_node_set *nodes,
  30                                           unsigned int context,
  31                                           unsigned int hash) internal_function;
  32 \f
  33 /* Functions for string operation.  */
  34
  35 /* This function allocate the buffers.  It is necessary to call
  36    re_string_reconstruct before using the object.  */
  37
  38 static reg_errcode_t
  39 internal_function __attribute_warn_unused_result__
  40 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
  41                     RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  42 {
  43   reg_errcode_t ret;
  44   int init_buf_len;
  45
  46   /* Ensure at least one character fits into the buffers.  */
  47   if (init_len < dfa->mb_cur_max)
  48     init_len = dfa->mb_cur_max;
  49   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  50   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  51
  52   ret = re_string_realloc_buffers (pstr, init_buf_len);
  53   if (BE (ret != REG_NOERROR, 0))
  54     return ret;
  55
  56   pstr->word_char = dfa->word_char;
  57   pstr->word_ops_used = dfa->word_ops_used;
  58   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  59   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  60   pstr->valid_raw_len = pstr->valid_len;
  61   return REG_NOERROR;
  62 }
  63
  64 /* This function allocate the buffers, and initialize them.  */
  65
  66 static reg_errcode_t
  67 internal_function __attribute_warn_unused_result__
  68 re_string_construct (re_string_t *pstr, const char *str, int len,
  69                      RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  70 {
  71   reg_errcode_t ret;
  72   memset (pstr, '\0', sizeof (re_string_t));
  73   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  74
  75   if (len > 0)
  76     {
  77       ret = re_string_realloc_buffers (pstr, len + 1);
  78       if (BE (ret != REG_NOERROR, 0))
  79         return ret;
  80     }
  81   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  82
  83   if (icase)
  84     {
  85 #ifdef RE_ENABLE_I18N
  86       if (dfa->mb_cur_max > 1)
  87         {
  88           while (1)
  89             {
  90               ret = build_wcs_upper_buffer (pstr);
  91               if (BE (ret != REG_NOERROR, 0))
  92                 return ret;
  93               if (pstr->valid_raw_len >= len)
  94                 break;
  95               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
  96                 break;
  97               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
  98               if (BE (ret != REG_NOERROR, 0))
  99                 return ret;
 100             }
 101         }
 102       else
 103 #endif /* RE_ENABLE_I18N  */
 104         build_upper_buffer (pstr);
 105     }
 106   else
 107     {
 108 #ifdef RE_ENABLE_I18N
 109       if (dfa->mb_cur_max > 1)
 110         build_wcs_buffer (pstr);
 111       else
 112 #endif /* RE_ENABLE_I18N  */
 113         {
 114           if (trans != NULL)
 115             re_string_translate_buffer (pstr);
 116           else
 117             {
 118               pstr->valid_len = pstr->bufs_len;
 119               pstr->valid_raw_len = pstr->bufs_len;
 120             }
 121         }
 122     }
 123
 124   return REG_NOERROR;
 125 }
 126
 127 /* Helper functions for re_string_allocate, and re_string_construct.  */
 128
 129 static reg_errcode_t
 130 internal_function __attribute_warn_unused_result__
 131 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
 132 {
 133 #ifdef RE_ENABLE_I18N
 134   if (pstr->mb_cur_max > 1)
 135     {
 136       wint_t *new_wcs;
 137
 138       /* Avoid overflow in realloc.  */
 139       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int));
 140       if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
 141         return REG_ESPACE;
 142
 143       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 144       if (BE (new_wcs == NULL, 0))
 145         return REG_ESPACE;
 146       pstr->wcs = new_wcs;
 147       if (pstr->offsets != NULL)
 148         {
 149           int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
 150           if (BE (new_offsets == NULL, 0))
 151             return REG_ESPACE;
 152           pstr->offsets = new_offsets;
 153         }
 154     }
 155 #endif /* RE_ENABLE_I18N  */
 156   if (pstr->mbs_allocated)
 157     {
 158       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 159                                            new_buf_len);
 160       if (BE (new_mbs == NULL, 0))
 161         return REG_ESPACE;
 162       pstr->mbs = new_mbs;
 163     }
 164   pstr->bufs_len = new_buf_len;
 165   return REG_NOERROR;
 166 }
 167
 168
 169 static void
 170 internal_function
 171 re_string_construct_common (const char *str, int len, re_string_t *pstr,
 172                             RE_TRANSLATE_TYPE trans, int icase,
 173                             const re_dfa_t *dfa)
 174 {
 175   pstr->raw_mbs = (const unsigned char *) str;
 176   pstr->len = len;
 177   pstr->raw_len = len;
 178   pstr->trans = trans;
 179   pstr->icase = icase ? 1 : 0;
 180   pstr->mbs_allocated = (trans != NULL || icase);
 181   pstr->mb_cur_max = dfa->mb_cur_max;
 182   pstr->is_utf8 = dfa->is_utf8;
 183   pstr->map_notascii = dfa->map_notascii;
 184   pstr->stop = pstr->len;
 185   pstr->raw_stop = pstr->stop;
 186 }
 187
 188 #ifdef RE_ENABLE_I18N
 189
 190 /* Build wide character buffer PSTR->WCS.
 191    If the byte sequence of the string are:
 192      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 193    Then wide character buffer will be:
 194      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 195    We use WEOF for padding, they indicate that the position isn't
 196    a first byte of a multibyte character.
 197
 198    Note that this function assumes PSTR->VALID_LEN elements are already
 199    built and starts from PSTR->VALID_LEN.  */
 200
 201 static void
 202 internal_function
 203 build_wcs_buffer (re_string_t *pstr)
 204 {
 205 #ifdef _LIBC
 206   unsigned char buf[MB_LEN_MAX];
 207   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 208 #else
 209   unsigned char buf[64];
 210 #endif
 211   mbstate_t prev_st;
 212   int byte_idx, end_idx, remain_len;
 213   size_t mbclen;
 214
 215   /* Build the buffers from pstr->valid_len to either pstr->len or
 216      pstr->bufs_len.  */
 217   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 218   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 219     {
 220       wchar_t wc;
 221       const char *p;
 222
 223       remain_len = end_idx - byte_idx;
 224       prev_st = pstr->cur_state;
 225       /* Apply the translation if we need.  */
 226       if (BE (pstr->trans != NULL, 0))
 227         {
 228           int i, ch;
 229
 230           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 231             {
 232               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 233               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 234             }
 235           p = (const char *) buf;
 236         }
 237       else
 238         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 239       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 240       if (BE (mbclen == (size_t) -1 || mbclen == 0
 241               || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
 242         {
 243           /* We treat these cases as a singlebyte character.  */
 244           mbclen = 1;
 245           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 246           if (BE (pstr->trans != NULL, 0))
 247             wc = pstr->trans[wc];
 248           pstr->cur_state = prev_st;
 249         }
 250       else if (BE (mbclen == (size_t) -2, 0))
 251         {
 252           /* The buffer doesn't have enough space, finish to build.  */
 253           pstr->cur_state = prev_st;
 254           break;
 255         }
 256
 257       /* Write wide character and padding.  */
 258       pstr->wcs[byte_idx++] = wc;
 259       /* Write paddings.  */
 260       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 261         pstr->wcs[byte_idx++] = WEOF;
 262     }
 263   pstr->valid_len = byte_idx;
 264   pstr->valid_raw_len = byte_idx;
 265 }
 266
 267 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 268    but for REG_ICASE.  */
 269
 270 static reg_errcode_t
 271 internal_function __attribute_warn_unused_result__
 272 build_wcs_upper_buffer (re_string_t *pstr)
 273 {
 274   mbstate_t prev_st;
 275   int src_idx, byte_idx, end_idx, remain_len;
 276   size_t mbclen;
 277 #ifdef _LIBC
 278   char buf[MB_LEN_MAX];
 279   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 280 #else
 281   char buf[64];
 282 #endif
 283
 284   byte_idx = pstr->valid_len;
 285   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 286
 287   /* The following optimization assumes that ASCII characters can be
 288      mapped to wide characters with a simple cast.  */
 289   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 290     {
 291       while (byte_idx < end_idx)
 292         {
 293           wchar_t wc;
 294
 295           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 296               && mbsinit (&pstr->cur_state))
 297             {
 298               /* In case of a singlebyte character.  */
 299               pstr->mbs[byte_idx]
 300                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 301               /* The next step uses the assumption that wchar_t is encoded
 302                  ASCII-safe: all ASCII values can be converted like this.  */
 303               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 304               ++byte_idx;
 305               continue;
 306             }
 307
 308           remain_len = end_idx - byte_idx;
 309           prev_st = pstr->cur_state;
 310           mbclen = __mbrtowc (&wc,
 311                               ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 312                                + byte_idx), remain_len, &pstr->cur_state);
 313           if (BE (mbclen + 2 > 2, 1))
 314             {
 315               wchar_t wcu = wc;
 316               if (iswlower (wc))
 317                 {
 318                   size_t mbcdlen;
 319
 320                   wcu = towupper (wc);
 321                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 322                   if (BE (mbclen == mbcdlen, 1))
 323                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 324                   else
 325                     {
 326                       src_idx = byte_idx;
 327                       goto offsets_needed;
 328                     }
 329                 }
 330               else
 331                 memcpy (pstr->mbs + byte_idx,
 332                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 333               pstr->wcs[byte_idx++] = wcu;
 334               /* Write paddings.  */
 335               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 336                 pstr->wcs[byte_idx++] = WEOF;
 337             }
 338           else if (mbclen == (size_t) -1 || mbclen == 0
 339                    || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 340             {
 341               /* It is an invalid character, an incomplete character
 342                  at the end of the string, or '\0'.  Just use the byte.  */
 343               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 344               pstr->mbs[byte_idx] = ch;
 345               /* And also cast it to wide char.  */
 346               pstr->wcs[byte_idx++] = (wchar_t) ch;
 347               if (BE (mbclen == (size_t) -1, 0))
 348                 pstr->cur_state = prev_st;
 349             }
 350           else
 351             {
 352               /* The buffer doesn't have enough space, finish to build.  */
 353               pstr->cur_state = prev_st;
 354               break;
 355             }
 356         }
 357       pstr->valid_len = byte_idx;
 358       pstr->valid_raw_len = byte_idx;
 359       return REG_NOERROR;
 360     }
 361   else
 362     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 363       {
 364         wchar_t wc;
 365         const char *p;
 366       offsets_needed:
 367         remain_len = end_idx - byte_idx;
 368         prev_st = pstr->cur_state;
 369         if (BE (pstr->trans != NULL, 0))
 370           {
 371             int i, ch;
 372
 373             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 374               {
 375                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 376                 buf[i] = pstr->trans[ch];
 377               }
 378             p = (const char *) buf;
 379           }
 380         else
 381           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 382         mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 383         if (BE (mbclen + 2 > 2, 1))
 384           {
 385             wchar_t wcu = wc;
 386             if (iswlower (wc))
 387               {
 388                 size_t mbcdlen;
 389
 390                 wcu = towupper (wc);
 391                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 392                 if (BE (mbclen == mbcdlen, 1))
 393                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 394                 else if (mbcdlen != (size_t) -1)
 395                   {
 396                     size_t i;
 397
 398                     if (byte_idx + mbcdlen > pstr->bufs_len)
 399                       {
 400                         pstr->cur_state = prev_st;
 401                         break;
 402                       }
 403
 404                     if (pstr->offsets == NULL)
 405                       {
 406                         pstr->offsets = re_malloc (int, pstr->bufs_len);
 407
 408                         if (pstr->offsets == NULL)
 409                           return REG_ESPACE;
 410                       }
 411                     if (!pstr->offsets_needed)
 412                       {
 413                         for (i = 0; i < (size_t) byte_idx; ++i)
 414                           pstr->offsets[i] = i;
 415                         pstr->offsets_needed = 1;
 416                       }
 417
 418                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 419                     pstr->wcs[byte_idx] = wcu;
 420                     pstr->offsets[byte_idx] = src_idx;
 421                     for (i = 1; i < mbcdlen; ++i)
 422                       {
 423                         pstr->offsets[byte_idx + i]
 424                           = src_idx + (i < mbclen ? i : mbclen - 1);
 425                         pstr->wcs[byte_idx + i] = WEOF;
 426                       }
 427                     pstr->len += mbcdlen - mbclen;
 428                     if (pstr->raw_stop > src_idx)
 429                       pstr->stop += mbcdlen - mbclen;
 430                     end_idx = (pstr->bufs_len > pstr->len)
 431                               ? pstr->len : pstr->bufs_len;
 432                     byte_idx += mbcdlen;
 433                     src_idx += mbclen;
 434                     continue;
 435                   }
 436                 else
 437                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 438               }
 439             else
 440               memcpy (pstr->mbs + byte_idx, p, mbclen);
 441
 442             if (BE (pstr->offsets_needed != 0, 0))
 443               {
 444                 size_t i;
 445                 for (i = 0; i < mbclen; ++i)
 446                   pstr->offsets[byte_idx + i] = src_idx + i;
 447               }
 448             src_idx += mbclen;
 449
 450             pstr->wcs[byte_idx++] = wcu;
 451             /* Write paddings.  */
 452             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 453               pstr->wcs[byte_idx++] = WEOF;
 454           }
 455         else if (mbclen == (size_t) -1 || mbclen == 0
 456                  || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 457           {
 458             /* It is an invalid character or '\0'.  Just use the byte.  */
 459             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 460
 461             if (BE (pstr->trans != NULL, 0))
 462               ch = pstr->trans [ch];
 463             pstr->mbs[byte_idx] = ch;
 464
 465             if (BE (pstr->offsets_needed != 0, 0))
 466               pstr->offsets[byte_idx] = src_idx;
 467             ++src_idx;
 468
 469             /* And also cast it to wide char.  */
 470             pstr->wcs[byte_idx++] = (wchar_t) ch;
 471             if (BE (mbclen == (size_t) -1, 0))
 472               pstr->cur_state = prev_st;
 473           }
 474         else
 475           {
 476             /* The buffer doesn't have enough space, finish to build.  */
 477             pstr->cur_state = prev_st;
 478             break;
 479           }
 480       }
 481   pstr->valid_len = byte_idx;
 482   pstr->valid_raw_len = src_idx;
 483   return REG_NOERROR;
 484 }
 485
 486 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 487    Return the index.  */
 488
 489 static int
 490 internal_function
 491 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
 492 {
 493   mbstate_t prev_st;
 494   int rawbuf_idx;
 495   size_t mbclen;
 496   wint_t wc = WEOF;
 497
 498   /* Skip the characters which are not necessary to check.  */
 499   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 500        rawbuf_idx < new_raw_idx;)
 501     {
 502       wchar_t wc2;
 503       int remain_len = pstr->len - rawbuf_idx;
 504       prev_st = pstr->cur_state;
 505       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
 506                           remain_len, &pstr->cur_state);
 507       if (BE ((ssize_t) mbclen <= 0, 0))
 508         {
 509           /* We treat these cases as a single byte character.  */
 510           if (mbclen == 0 || remain_len == 0)
 511             wc = L'\0';
 512           else
 513             wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 514           mbclen = 1;
 515           pstr->cur_state = prev_st;
 516         }
 517       else
 518         wc = (wint_t) wc2;
 519       /* Then proceed the next character.  */
 520       rawbuf_idx += mbclen;
 521     }
 522   *last_wc = wc;
 523   return rawbuf_idx;
 524 }
 525 #endif /* RE_ENABLE_I18N  */
 526
 527 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 528    This function is used in case of REG_ICASE.  */
 529
 530 static void
 531 internal_function
 532 build_upper_buffer (re_string_t *pstr)
 533 {
 534   int char_idx, end_idx;
 535   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 536
 537   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 538     {
 539       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 540       if (BE (pstr->trans != NULL, 0))
 541         ch = pstr->trans[ch];
 542       if (islower (ch))
 543         pstr->mbs[char_idx] = toupper (ch);
 544       else
 545         pstr->mbs[char_idx] = ch;
 546     }
 547   pstr->valid_len = char_idx;
 548   pstr->valid_raw_len = char_idx;
 549 }
 550
 551 /* Apply TRANS to the buffer in PSTR.  */
 552
 553 static void
 554 internal_function
 555 re_string_translate_buffer (re_string_t *pstr)
 556 {
 557   int buf_idx, end_idx;
 558   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 559
 560   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 561     {
 562       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 563       pstr->mbs[buf_idx] = pstr->trans[ch];
 564     }
 565
 566   pstr->valid_len = buf_idx;
 567   pstr->valid_raw_len = buf_idx;
 568 }
 569
 570 /* This function re-construct the buffers.
 571    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 572    convert to upper case in case of REG_ICASE, apply translation.  */
 573
 574 static reg_errcode_t
 575 internal_function __attribute_warn_unused_result__
 576 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
 577 {
 578   int offset = idx - pstr->raw_mbs_idx;
 579   if (BE (offset < 0, 0))
 580     {
 581       /* Reset buffer.  */
 582 #ifdef RE_ENABLE_I18N
 583       if (pstr->mb_cur_max > 1)
 584         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 585 #endif /* RE_ENABLE_I18N */
 586       pstr->len = pstr->raw_len;
 587       pstr->stop = pstr->raw_stop;
 588       pstr->valid_len = 0;
 589       pstr->raw_mbs_idx = 0;
 590       pstr->valid_raw_len = 0;
 591       pstr->offsets_needed = 0;
 592       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 593                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 594       if (!pstr->mbs_allocated)
 595         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 596       offset = idx;
 597     }
 598
 599   if (BE (offset != 0, 1))
 600     {
 601       /* Should the already checked characters be kept?  */
 602       if (BE (offset < pstr->valid_raw_len, 1))
 603         {
 604           /* Yes, move them to the front of the buffer.  */
 605 #ifdef RE_ENABLE_I18N
 606           if (BE (pstr->offsets_needed, 0))
 607             {
 608               int low = 0, high = pstr->valid_len, mid;
 609               do
 610                 {
 611                   mid = (high + low) / 2;
 612                   if (pstr->offsets[mid] > offset)
 613                     high = mid;
 614                   else if (pstr->offsets[mid] < offset)
 615                     low = mid + 1;
 616                   else
 617                     break;
 618                 }
 619               while (low < high);
 620               if (pstr->offsets[mid] < offset)
 621                 ++mid;
 622               pstr->tip_context = re_string_context_at (pstr, mid - 1,
 623                                                         eflags);
 624               /* This can be quite complicated, so handle specially
 625                  only the common and easy case where the character with
 626                  different length representation of lower and upper
 627                  case is present at or after offset.  */
 628               if (pstr->valid_len > offset
 629                   && mid == offset && pstr->offsets[mid] == offset)
 630                 {
 631                   memmove (pstr->wcs, pstr->wcs + offset,
 632                            (pstr->valid_len - offset) * sizeof (wint_t));
 633                   memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
 634                   pstr->valid_len -= offset;
 635                   pstr->valid_raw_len -= offset;
 636                   for (low = 0; low < pstr->valid_len; low++)
 637                     pstr->offsets[low] = pstr->offsets[low + offset] - offset;
 638                 }
 639               else
 640                 {
 641                   /* Otherwise, just find out how long the partial multibyte
 642                      character at offset is and fill it with WEOF/255.  */
 643                   pstr->len = pstr->raw_len - idx + offset;
 644                   pstr->stop = pstr->raw_stop - idx + offset;
 645                   pstr->offsets_needed = 0;
 646                   while (mid > 0 && pstr->offsets[mid - 1] == offset)
 647                     --mid;
 648                   while (mid < pstr->valid_len)
 649                     if (pstr->wcs[mid] != WEOF)
 650                       break;
 651                     else
 652                       ++mid;
 653                   if (mid == pstr->valid_len)
 654                     pstr->valid_len = 0;
 655                   else
 656                     {
 657                       pstr->valid_len = pstr->offsets[mid] - offset;
 658                       if (pstr->valid_len)
 659                         {
 660                           for (low = 0; low < pstr->valid_len; ++low)
 661                             pstr->wcs[low] = WEOF;
 662                           memset (pstr->mbs, 255, pstr->valid_len);
 663                         }
 664                     }
 665                   pstr->valid_raw_len = pstr->valid_len;
 666                 }
 667             }
 668           else
 669 #endif
 670             {
 671               pstr->tip_context = re_string_context_at (pstr, offset - 1,
 672                                                         eflags);
 673 #ifdef RE_ENABLE_I18N
 674               if (pstr->mb_cur_max > 1)
 675                 memmove (pstr->wcs, pstr->wcs + offset,
 676                          (pstr->valid_len - offset) * sizeof (wint_t));
 677 #endif /* RE_ENABLE_I18N */
 678               if (BE (pstr->mbs_allocated, 0))
 679                 memmove (pstr->mbs, pstr->mbs + offset,
 680                          pstr->valid_len - offset);
 681               pstr->valid_len -= offset;
 682               pstr->valid_raw_len -= offset;
 683 #if DEBUG
 684               assert (pstr->valid_len > 0);
 685 #endif
 686             }
 687         }
 688       else
 689         {
 690           /* No, skip all characters until IDX.  */
 691           int prev_valid_len = pstr->valid_len;
 692
 693 #ifdef RE_ENABLE_I18N
 694           if (BE (pstr->offsets_needed, 0))
 695             {
 696               pstr->len = pstr->raw_len - idx + offset;
 697               pstr->stop = pstr->raw_stop - idx + offset;
 698               pstr->offsets_needed = 0;
 699             }
 700 #endif
 701           pstr->valid_len = 0;
 702 #ifdef RE_ENABLE_I18N
 703           if (pstr->mb_cur_max > 1)
 704             {
 705               int wcs_idx;
 706               wint_t wc = WEOF;
 707
 708               if (pstr->is_utf8)
 709                 {
 710                   const unsigned char *raw, *p, *end;
 711
 712                   /* Special case UTF-8.  Multi-byte chars start with any
 713                      byte other than 0x80 - 0xbf.  */
 714                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 715                   end = raw + (offset - pstr->mb_cur_max);
 716                   if (end < pstr->raw_mbs)
 717                     end = pstr->raw_mbs;
 718                   p = raw + offset - 1;
 719 #ifdef _LIBC
 720                   /* We know the wchar_t encoding is UCS4, so for the simple
 721                      case, ASCII characters, skip the conversion step.  */
 722                   if (isascii (*p) && BE (pstr->trans == NULL, 1))
 723                     {
 724                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 725                       /* pstr->valid_len = 0; */
 726                       wc = (wchar_t) *p;
 727                     }
 728                   else
 729 #endif
 730                     for (; p >= end; --p)
 731                       if ((*p & 0xc0) != 0x80)
 732                         {
 733                           mbstate_t cur_state;
 734                           wchar_t wc2;
 735                           int mlen = raw + pstr->len - p;
 736                           unsigned char buf[6];
 737                           size_t mbclen;
 738
 739                           const unsigned char *pp = p;
 740                           if (BE (pstr->trans != NULL, 0))
 741                             {
 742                               int i = mlen < 6 ? mlen : 6;
 743                               while (--i >= 0)
 744                                 buf[i] = pstr->trans[p[i]];
 745                               pp = buf;
 746                             }
 747                           /* XXX Don't use mbrtowc, we know which conversion
 748                              to use (UTF-8 -> UCS4).  */
 749                           memset (&cur_state, 0, sizeof (cur_state));
 750                           mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
 751                                               &cur_state);
 752                           if (raw + offset - p <= mbclen
 753                               && mbclen < (size_t) -2)
 754                             {
 755                               memset (&pstr->cur_state, '\0',
 756                                       sizeof (mbstate_t));
 757                               pstr->valid_len = mbclen - (raw + offset - p);
 758                               wc = wc2;
 759                             }
 760                           break;
 761                         }
 762                 }
 763
 764               if (wc == WEOF)
 765                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 766               if (wc == WEOF)
 767                 pstr->tip_context
 768                   = re_string_context_at (pstr, prev_valid_len - 1, eflags);
 769               else
 770                 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 771                                       && IS_WIDE_WORD_CHAR (wc))
 772                                      ? CONTEXT_WORD
 773                                      : ((IS_WIDE_NEWLINE (wc)
 774                                          && pstr->newline_anchor)
 775                                         ? CONTEXT_NEWLINE : 0));
 776               if (BE (pstr->valid_len, 0))
 777                 {
 778                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 779                     pstr->wcs[wcs_idx] = WEOF;
 780                   if (pstr->mbs_allocated)
 781                     memset (pstr->mbs, 255, pstr->valid_len);
 782                 }
 783               pstr->valid_raw_len = pstr->valid_len;
 784             }
 785           else
 786 #endif /* RE_ENABLE_I18N */
 787             {
 788               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 789               pstr->valid_raw_len = 0;
 790               if (pstr->trans)
 791                 c = pstr->trans[c];
 792               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 793                                    ? CONTEXT_WORD
 794                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 795                                       ? CONTEXT_NEWLINE : 0));
 796             }
 797         }
 798       if (!BE (pstr->mbs_allocated, 0))
 799         pstr->mbs += offset;
 800     }
 801   pstr->raw_mbs_idx = idx;
 802   pstr->len -= offset;
 803   pstr->stop -= offset;
 804
 805   /* Then build the buffers.  */
 806 #ifdef RE_ENABLE_I18N
 807   if (pstr->mb_cur_max > 1)
 808     {
 809       if (pstr->icase)
 810         {
 811           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 812           if (BE (ret != REG_NOERROR, 0))
 813             return ret;
 814         }
 815       else
 816         build_wcs_buffer (pstr);
 817     }
 818   else
 819 #endif /* RE_ENABLE_I18N */
 820     if (BE (pstr->mbs_allocated, 0))
 821       {
 822         if (pstr->icase)
 823           build_upper_buffer (pstr);
 824         else if (pstr->trans != NULL)
 825           re_string_translate_buffer (pstr);
 826       }
 827     else
 828       pstr->valid_len = pstr->len;
 829
 830   pstr->cur_idx = 0;
 831   return REG_NOERROR;
 832 }
 833
 834 static unsigned char
 835 internal_function __attribute ((pure))
 836 re_string_peek_byte_case (const re_string_t *pstr, int idx)
 837 {
 838   int ch, off;
 839
 840   /* Handle the common (easiest) cases first.  */
 841   if (BE (!pstr->mbs_allocated, 1))
 842     return re_string_peek_byte (pstr, idx);
 843
 844 #ifdef RE_ENABLE_I18N
 845   if (pstr->mb_cur_max > 1
 846       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 847     return re_string_peek_byte (pstr, idx);
 848 #endif
 849
 850   off = pstr->cur_idx + idx;
 851 #ifdef RE_ENABLE_I18N
 852   if (pstr->offsets_needed)
 853     off = pstr->offsets[off];
 854 #endif
 855
 856   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 857
 858 #ifdef RE_ENABLE_I18N
 859   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 860      this function returns CAPITAL LETTER I instead of first byte of
 861      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 862      since peek_byte_case doesn't advance cur_idx in any way.  */
 863   if (pstr->offsets_needed && !isascii (ch))
 864     return re_string_peek_byte (pstr, idx);
 865 #endif
 866
 867   return ch;
 868 }
 869
 870 static unsigned char
 871 internal_function
 872 re_string_fetch_byte_case (re_string_t *pstr)
 873 {
 874   if (BE (!pstr->mbs_allocated, 1))
 875     return re_string_fetch_byte (pstr);
 876
 877 #ifdef RE_ENABLE_I18N
 878   if (pstr->offsets_needed)
 879     {
 880       int off, ch;
 881
 882       /* For tr_TR.UTF-8 [[:islower:]] there is
 883          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 884          in that case the whole multi-byte character and return
 885          the original letter.  On the other side, with
 886          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 887          anything else would complicate things too much.  */
 888
 889       if (!re_string_first_byte (pstr, pstr->cur_idx))
 890         return re_string_fetch_byte (pstr);
 891
 892       off = pstr->offsets[pstr->cur_idx];
 893       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 894
 895       if (! isascii (ch))
 896         return re_string_fetch_byte (pstr);
 897
 898       re_string_skip_bytes (pstr,
 899                             re_string_char_size_at (pstr, pstr->cur_idx));
 900       return ch;
 901     }
 902 #endif
 903
 904   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 905 }
 906
 907 static void
 908 internal_function
 909 re_string_destruct (re_string_t *pstr)
 910 {
 911 #ifdef RE_ENABLE_I18N
 912   re_free (pstr->wcs);
 913   re_free (pstr->offsets);
 914 #endif /* RE_ENABLE_I18N  */
 915   if (pstr->mbs_allocated)
 916     re_free (pstr->mbs);
 917 }
 918
 919 /* Return the context at IDX in INPUT.  */
 920
 921 static unsigned int
 922 internal_function
 923 re_string_context_at (const re_string_t *input, int idx, int eflags)
 924 {
 925   int c;
 926   if (BE (idx < 0, 0))
 927     /* In this case, we use the value stored in input->tip_context,
 928        since we can't know the character in input->mbs[-1] here.  */
 929     return input->tip_context;
 930   if (BE (idx == input->len, 0))
 931     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 932             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 933 #ifdef RE_ENABLE_I18N
 934   if (input->mb_cur_max > 1)
 935     {
 936       wint_t wc;
 937       int wc_idx = idx;
 938       while(input->wcs[wc_idx] == WEOF)
 939         {
 940 #ifdef DEBUG
 941           /* It must not happen.  */
 942           assert (wc_idx >= 0);
 943 #endif
 944           --wc_idx;
 945           if (wc_idx < 0)
 946             return input->tip_context;
 947         }
 948       wc = input->wcs[wc_idx];
 949       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 950         return CONTEXT_WORD;
 951       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 952               ? CONTEXT_NEWLINE : 0);
 953     }
 954   else
 955 #endif
 956     {
 957       c = re_string_byte_at (input, idx);
 958       if (bitset_contain (input->word_char, c))
 959         return CONTEXT_WORD;
 960       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 961     }
 962 }
 963 \f
 964 /* Functions for set operation.  */
 965
 966 static reg_errcode_t
 967 internal_function __attribute_warn_unused_result__
 968 re_node_set_alloc (re_node_set *set, int size)
 969 {
 970   set->alloc = size;
 971   set->nelem = 0;
 972   set->elems = re_malloc (int, size);
 973   if (BE (set->elems == NULL, 0))
 974     return REG_ESPACE;
 975   return REG_NOERROR;
 976 }
 977
 978 static reg_errcode_t
 979 internal_function __attribute_warn_unused_result__
 980 re_node_set_init_1 (re_node_set *set, int elem)
 981 {
 982   set->alloc = 1;
 983   set->nelem = 1;
 984   set->elems = re_malloc (int, 1);
 985   if (BE (set->elems == NULL, 0))
 986     {
 987       set->alloc = set->nelem = 0;
 988       return REG_ESPACE;
 989     }
 990   set->elems[0] = elem;
 991   return REG_NOERROR;
 992 }
 993
 994 static reg_errcode_t
 995 internal_function __attribute_warn_unused_result__
 996 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
 997 {
 998   set->alloc = 2;
 999   set->elems = re_malloc (int, 2);
1000   if (BE (set->elems == NULL, 0))
1001     return REG_ESPACE;
1002   if (elem1 == elem2)
1003     {
1004       set->nelem = 1;
1005       set->elems[0] = elem1;
1006     }
1007   else
1008     {
1009       set->nelem = 2;
1010       if (elem1 < elem2)
1011         {
1012           set->elems[0] = elem1;
1013           set->elems[1] = elem2;
1014         }
1015       else
1016         {
1017           set->elems[0] = elem2;
1018           set->elems[1] = elem1;
1019         }
1020     }
1021   return REG_NOERROR;
1022 }
1023
1024 static reg_errcode_t
1025 internal_function __attribute_warn_unused_result__
1026 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1027 {
1028   dest->nelem = src->nelem;
1029   if (src->nelem > 0)
1030     {
1031       dest->alloc = dest->nelem;
1032       dest->elems = re_malloc (int, dest->alloc);
1033       if (BE (dest->elems == NULL, 0))
1034         {
1035           dest->alloc = dest->nelem = 0;
1036           return REG_ESPACE;
1037         }
1038       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1039     }
1040   else
1041     re_node_set_init_empty (dest);
1042   return REG_NOERROR;
1043 }
1044
1045 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1046    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1047    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1048
1049 static reg_errcode_t
1050 internal_function __attribute_warn_unused_result__
1051 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1052                            const re_node_set *src2)
1053 {
1054   int i1, i2, is, id, delta, sbase;
1055   if (src1->nelem == 0 || src2->nelem == 0)
1056     return REG_NOERROR;
1057
1058   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1059      conservative estimate.  */
1060   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1061     {
1062       int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1063       int *new_elems = re_realloc (dest->elems, int, new_alloc);
1064       if (BE (new_elems == NULL, 0))
1065         return REG_ESPACE;
1066       dest->elems = new_elems;
1067       dest->alloc = new_alloc;
1068     }
1069
1070   /* Find the items in the intersection of SRC1 and SRC2, and copy
1071      into the top of DEST those that are not already in DEST itself.  */
1072   sbase = dest->nelem + src1->nelem + src2->nelem;
1073   i1 = src1->nelem - 1;
1074   i2 = src2->nelem - 1;
1075   id = dest->nelem - 1;
1076   for (;;)
1077     {
1078       if (src1->elems[i1] == src2->elems[i2])
1079         {
1080           /* Try to find the item in DEST.  Maybe we could binary search?  */
1081           while (id >= 0 && dest->elems[id] > src1->elems[i1])
1082             --id;
1083
1084           if (id < 0 || dest->elems[id] != src1->elems[i1])
1085             dest->elems[--sbase] = src1->elems[i1];
1086
1087           if (--i1 < 0 || --i2 < 0)
1088             break;
1089         }
1090
1091       /* Lower the highest of the two items.  */
1092       else if (src1->elems[i1] < src2->elems[i2])
1093         {
1094           if (--i2 < 0)
1095             break;
1096         }
1097       else
1098         {
1099           if (--i1 < 0)
1100             break;
1101         }
1102     }
1103
1104   id = dest->nelem - 1;
1105   is = dest->nelem + src1->nelem + src2->nelem - 1;
1106   delta = is - sbase + 1;
1107
1108   /* Now copy.  When DELTA becomes zero, the remaining
1109      DEST elements are already in place; this is more or
1110      less the same loop that is in re_node_set_merge.  */
1111   dest->nelem += delta;
1112   if (delta > 0 && id >= 0)
1113     for (;;)
1114       {
1115         if (dest->elems[is] > dest->elems[id])
1116           {
1117             /* Copy from the top.  */
1118             dest->elems[id + delta--] = dest->elems[is--];
1119             if (delta == 0)
1120               break;
1121           }
1122         else
1123           {
1124             /* Slide from the bottom.  */
1125             dest->elems[id + delta] = dest->elems[id];
1126             if (--id < 0)
1127               break;
1128           }
1129       }
1130
1131   /* Copy remaining SRC elements.  */
1132   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1133
1134   return REG_NOERROR;
1135 }
1136
1137 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1138    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1139
1140 static reg_errcode_t
1141 internal_function __attribute_warn_unused_result__
1142 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1143                         const re_node_set *src2)
1144 {
1145   int i1, i2, id;
1146   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1147     {
1148       dest->alloc = src1->nelem + src2->nelem;
1149       dest->elems = re_malloc (int, dest->alloc);
1150       if (BE (dest->elems == NULL, 0))
1151         return REG_ESPACE;
1152     }
1153   else
1154     {
1155       if (src1 != NULL && src1->nelem > 0)
1156         return re_node_set_init_copy (dest, src1);
1157       else if (src2 != NULL && src2->nelem > 0)
1158         return re_node_set_init_copy (dest, src2);
1159       else
1160         re_node_set_init_empty (dest);
1161       return REG_NOERROR;
1162     }
1163   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1164     {
1165       if (src1->elems[i1] > src2->elems[i2])
1166         {
1167           dest->elems[id++] = src2->elems[i2++];
1168           continue;
1169         }
1170       if (src1->elems[i1] == src2->elems[i2])
1171         ++i2;
1172       dest->elems[id++] = src1->elems[i1++];
1173     }
1174   if (i1 < src1->nelem)
1175     {
1176       memcpy (dest->elems + id, src1->elems + i1,
1177              (src1->nelem - i1) * sizeof (int));
1178       id += src1->nelem - i1;
1179     }
1180   else if (i2 < src2->nelem)
1181     {
1182       memcpy (dest->elems + id, src2->elems + i2,
1183              (src2->nelem - i2) * sizeof (int));
1184       id += src2->nelem - i2;
1185     }
1186   dest->nelem = id;
1187   return REG_NOERROR;
1188 }
1189
1190 /* Calculate the union set of the sets DEST and SRC. And store it to
1191    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1192
1193 static reg_errcode_t
1194 internal_function __attribute_warn_unused_result__
1195 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1196 {
1197   int is, id, sbase, delta;
1198   if (src == NULL || src->nelem == 0)
1199     return REG_NOERROR;
1200   if (dest->alloc < 2 * src->nelem + dest->nelem)
1201     {
1202       int new_alloc = 2 * (src->nelem + dest->alloc);
1203       int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1204       if (BE (new_buffer == NULL, 0))
1205         return REG_ESPACE;
1206       dest->elems = new_buffer;
1207       dest->alloc = new_alloc;
1208     }
1209
1210   if (BE (dest->nelem == 0, 0))
1211     {
1212       dest->nelem = src->nelem;
1213       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1214       return REG_NOERROR;
1215     }
1216
1217   /* Copy into the top of DEST the items of SRC that are not
1218      found in DEST.  Maybe we could binary search in DEST?  */
1219   for (sbase = dest->nelem + 2 * src->nelem,
1220        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1221     {
1222       if (dest->elems[id] == src->elems[is])
1223         is--, id--;
1224       else if (dest->elems[id] < src->elems[is])
1225         dest->elems[--sbase] = src->elems[is--];
1226       else /* if (dest->elems[id] > src->elems[is]) */
1227         --id;
1228     }
1229
1230   if (is >= 0)
1231     {
1232       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1233       sbase -= is + 1;
1234       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1235     }
1236
1237   id = dest->nelem - 1;
1238   is = dest->nelem + 2 * src->nelem - 1;
1239   delta = is - sbase + 1;
1240   if (delta == 0)
1241     return REG_NOERROR;
1242
1243   /* Now copy.  When DELTA becomes zero, the remaining
1244      DEST elements are already in place.  */
1245   dest->nelem += delta;
1246   for (;;)
1247     {
1248       if (dest->elems[is] > dest->elems[id])
1249         {
1250           /* Copy from the top.  */
1251           dest->elems[id + delta--] = dest->elems[is--];
1252           if (delta == 0)
1253             break;
1254         }
1255       else
1256         {
1257           /* Slide from the bottom.  */
1258           dest->elems[id + delta] = dest->elems[id];
1259           if (--id < 0)
1260             {
1261               /* Copy remaining SRC elements.  */
1262               memcpy (dest->elems, dest->elems + sbase,
1263                       delta * sizeof (int));
1264               break;
1265             }
1266         }
1267     }
1268
1269   return REG_NOERROR;
1270 }
1271
1272 /* Insert the new element ELEM to the re_node_set* SET.
1273    SET should not already have ELEM.
1274    return -1 if an error is occured, return 1 otherwise.  */
1275
1276 static int
1277 internal_function __attribute_warn_unused_result__
1278 re_node_set_insert (re_node_set *set, int elem)
1279 {
1280   int idx;
1281   /* In case the set is empty.  */
1282   if (set->alloc == 0)
1283     {
1284       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1285         return 1;
1286       else
1287         return -1;
1288     }
1289
1290   if (BE (set->nelem, 0) == 0)
1291     {
1292       /* We already guaranteed above that set->alloc != 0.  */
1293       set->elems[0] = elem;
1294       ++set->nelem;
1295       return 1;
1296     }
1297
1298   /* Realloc if we need.  */
1299   if (set->alloc == set->nelem)
1300     {
1301       int *new_elems;
1302       set->alloc = set->alloc * 2;
1303       new_elems = re_realloc (set->elems, int, set->alloc);
1304       if (BE (new_elems == NULL, 0))
1305         return -1;
1306       set->elems = new_elems;
1307     }
1308
1309   /* Move the elements which follows the new element.  Test the
1310      first element separately to skip a check in the inner loop.  */
1311   if (elem < set->elems[0])
1312     {
1313       idx = 0;
1314       for (idx = set->nelem; idx > 0; idx--)
1315         set->elems[idx] = set->elems[idx - 1];
1316     }
1317   else
1318     {
1319       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1320         set->elems[idx] = set->elems[idx - 1];
1321     }
1322
1323   /* Insert the new element.  */
1324   set->elems[idx] = elem;
1325   ++set->nelem;
1326   return 1;
1327 }
1328
1329 /* Insert the new element ELEM to the re_node_set* SET.
1330    SET should not already have any element greater than or equal to ELEM.
1331    Return -1 if an error is occured, return 1 otherwise.  */
1332
1333 static int
1334 internal_function __attribute_warn_unused_result__
1335 re_node_set_insert_last (re_node_set *set, int elem)
1336 {
1337   /* Realloc if we need.  */
1338   if (set->alloc == set->nelem)
1339     {
1340       int *new_elems;
1341       set->alloc = (set->alloc + 1) * 2;
1342       new_elems = re_realloc (set->elems, int, set->alloc);
1343       if (BE (new_elems == NULL, 0))
1344         return -1;
1345       set->elems = new_elems;
1346     }
1347
1348   /* Insert the new element.  */
1349   set->elems[set->nelem++] = elem;
1350   return 1;
1351 }
1352
1353 /* Compare two node sets SET1 and SET2.
1354    return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1355
1356 static int
1357 internal_function __attribute ((pure))
1358 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1359 {
1360   int i;
1361   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1362     return 0;
1363   for (i = set1->nelem ; --i >= 0 ; )
1364     if (set1->elems[i] != set2->elems[i])
1365       return 0;
1366   return 1;
1367 }
1368
1369 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1370
1371 static int
1372 internal_function __attribute ((pure))
1373 re_node_set_contains (const re_node_set *set, int elem)
1374 {
1375   unsigned int idx, right, mid;
1376   if (set->nelem <= 0)
1377     return 0;
1378
1379   /* Binary search the element.  */
1380   idx = 0;
1381   right = set->nelem - 1;
1382   while (idx < right)
1383     {
1384       mid = (idx + right) / 2;
1385       if (set->elems[mid] < elem)
1386         idx = mid + 1;
1387       else
1388         right = mid;
1389     }
1390   return set->elems[idx] == elem ? idx + 1 : 0;
1391 }
1392
1393 static void
1394 internal_function
1395 re_node_set_remove_at (re_node_set *set, int idx)
1396 {
1397   if (idx < 0 || idx >= set->nelem)
1398     return;
1399   --set->nelem;
1400   for (; idx < set->nelem; idx++)
1401     set->elems[idx] = set->elems[idx + 1];
1402 }
1403 \f
1404
1405 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1406    Or return -1, if an error will be occured.  */
1407
1408 static int
1409 internal_function
1410 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1411 {
1412   int type = token.type;
1413   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1414     {
1415       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1416       int *new_nexts, *new_indices;
1417       re_node_set *new_edests, *new_eclosures;
1418       re_token_t *new_nodes;
1419
1420       /* Avoid overflows in realloc.  */
1421       const size_t max_object_size = MAX (sizeof (re_token_t),
1422                                           MAX (sizeof (re_node_set),
1423                                                sizeof (int)));
1424       if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0))
1425         return -1;
1426
1427       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1428       if (BE (new_nodes == NULL, 0))
1429         return -1;
1430       dfa->nodes = new_nodes;
1431       new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1432       new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1433       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1434       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1435       if (BE (new_nexts == NULL || new_indices == NULL
1436               || new_edests == NULL || new_eclosures == NULL, 0))
1437         return -1;
1438       dfa->nexts = new_nexts;
1439       dfa->org_indices = new_indices;
1440       dfa->edests = new_edests;
1441       dfa->eclosures = new_eclosures;
1442       dfa->nodes_alloc = new_nodes_alloc;
1443     }
1444   dfa->nodes[dfa->nodes_len] = token;
1445   dfa->nodes[dfa->nodes_len].constraint = 0;
1446 #ifdef RE_ENABLE_I18N
1447   dfa->nodes[dfa->nodes_len].accept_mb =
1448     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1449 #endif
1450   dfa->nexts[dfa->nodes_len] = -1;
1451   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1452   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1453   return dfa->nodes_len++;
1454 }
1455
1456 static inline unsigned int
1457 internal_function
1458 calc_state_hash (const re_node_set *nodes, unsigned int context)
1459 {
1460   unsigned int hash = nodes->nelem + context;
1461   int i;
1462   for (i = 0 ; i < nodes->nelem ; i++)
1463     hash += nodes->elems[i];
1464   return hash;
1465 }
1466
1467 /* Search for the state whose node_set is equivalent to NODES.
1468    Return the pointer to the state, if we found it in the DFA.
1469    Otherwise create the new one and return it.  In case of an error
1470    return NULL and set the error code in ERR.
1471    Note: - We assume NULL as the invalid state, then it is possible that
1472            return value is NULL and ERR is REG_NOERROR.
1473          - We never return non-NULL value in case of any errors, it is for
1474            optimization.  */
1475
1476 static re_dfastate_t *
1477 internal_function __attribute_warn_unused_result__
1478 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1479                   const re_node_set *nodes)
1480 {
1481   unsigned int hash;
1482   re_dfastate_t *new_state;
1483   struct re_state_table_entry *spot;
1484   int i;
1485   if (BE (nodes->nelem == 0, 0))
1486     {
1487       *err = REG_NOERROR;
1488       return NULL;
1489     }
1490   hash = calc_state_hash (nodes, 0);
1491   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1492
1493   for (i = 0 ; i < spot->num ; i++)
1494     {
1495       re_dfastate_t *state = spot->array[i];
1496       if (hash != state->hash)
1497         continue;
1498       if (re_node_set_compare (&state->nodes, nodes))
1499         return state;
1500     }
1501
1502   /* There are no appropriate state in the dfa, create the new one.  */
1503   new_state = create_ci_newstate (dfa, nodes, hash);
1504   if (BE (new_state == NULL, 0))
1505     *err = REG_ESPACE;
1506
1507   return new_state;
1508 }
1509
1510 /* Search for the state whose node_set is equivalent to NODES and
1511    whose context is equivalent to CONTEXT.
1512    Return the pointer to the state, if we found it in the DFA.
1513    Otherwise create the new one and return it.  In case of an error
1514    return NULL and set the error code in ERR.
1515    Note: - We assume NULL as the invalid state, then it is possible that
1516            return value is NULL and ERR is REG_NOERROR.
1517          - We never return non-NULL value in case of any errors, it is for
1518            optimization.  */
1519
1520 static re_dfastate_t *
1521 internal_function __attribute_warn_unused_result__
1522 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1523                           const re_node_set *nodes, unsigned int context)
1524 {
1525   unsigned int hash;
1526   re_dfastate_t *new_state;
1527   struct re_state_table_entry *spot;
1528   int i;
1529   if (nodes->nelem == 0)
1530     {
1531       *err = REG_NOERROR;
1532       return NULL;
1533     }
1534   hash = calc_state_hash (nodes, context);
1535   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1536
1537   for (i = 0 ; i < spot->num ; i++)
1538     {
1539       re_dfastate_t *state = spot->array[i];
1540       if (state->hash == hash
1541           && state->context == context
1542           && re_node_set_compare (state->entrance_nodes, nodes))
1543         return state;
1544     }
1545   /* There are no appropriate state in `dfa', create the new one.  */
1546   new_state = create_cd_newstate (dfa, nodes, context, hash);
1547   if (BE (new_state == NULL, 0))
1548     *err = REG_ESPACE;
1549
1550   return new_state;
1551 }
1552
1553 /* Finish initialization of the new state NEWSTATE, and using its hash value
1554    HASH put in the appropriate bucket of DFA's state table.  Return value
1555    indicates the error code if failed.  */
1556
1557 static reg_errcode_t
1558 __attribute_warn_unused_result__
1559 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1560                 unsigned int hash)
1561 {
1562   struct re_state_table_entry *spot;
1563   reg_errcode_t err;
1564   int i;
1565
1566   newstate->hash = hash;
1567   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1568   if (BE (err != REG_NOERROR, 0))
1569     return REG_ESPACE;
1570   for (i = 0; i < newstate->nodes.nelem; i++)
1571     {
1572       int elem = newstate->nodes.elems[i];
1573       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1574         if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0)
1575           return REG_ESPACE;
1576     }
1577
1578   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1579   if (BE (spot->alloc <= spot->num, 0))
1580     {
1581       int new_alloc = 2 * spot->num + 2;
1582       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1583                                               new_alloc);
1584       if (BE (new_array == NULL, 0))
1585         return REG_ESPACE;
1586       spot->array = new_array;
1587       spot->alloc = new_alloc;
1588     }
1589   spot->array[spot->num++] = newstate;
1590   return REG_NOERROR;
1591 }
1592
1593 static void
1594 free_state (re_dfastate_t *state)
1595 {
1596   re_node_set_free (&state->non_eps_nodes);
1597   re_node_set_free (&state->inveclosure);
1598   if (state->entrance_nodes != &state->nodes)
1599     {
1600       re_node_set_free (state->entrance_nodes);
1601       re_free (state->entrance_nodes);
1602     }
1603   re_node_set_free (&state->nodes);
1604   re_free (state->word_trtable);
1605   re_free (state->trtable);
1606   re_free (state);
1607 }
1608
1609 /* Create the new state which is independ of contexts.
1610    Return the new state if succeeded, otherwise return NULL.  */
1611
1612 static re_dfastate_t *
1613 internal_function __attribute_warn_unused_result__
1614 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1615                     unsigned int hash)
1616 {
1617   int i;
1618   reg_errcode_t err;
1619   re_dfastate_t *newstate;
1620
1621   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1622   if (BE (newstate == NULL, 0))
1623     return NULL;
1624   err = re_node_set_init_copy (&newstate->nodes, nodes);
1625   if (BE (err != REG_NOERROR, 0))
1626     {
1627       re_free (newstate);
1628       return NULL;
1629     }
1630
1631   newstate->entrance_nodes = &newstate->nodes;
1632   for (i = 0 ; i < nodes->nelem ; i++)
1633     {
1634       re_token_t *node = dfa->nodes + nodes->elems[i];
1635       re_token_type_t type = node->type;
1636       if (type == CHARACTER && !node->constraint)
1637         continue;
1638 #ifdef RE_ENABLE_I18N
1639       newstate->accept_mb |= node->accept_mb;
1640 #endif /* RE_ENABLE_I18N */
1641
1642       /* If the state has the halt node, the state is a halt state.  */
1643       if (type == END_OF_RE)
1644         newstate->halt = 1;
1645       else if (type == OP_BACK_REF)
1646         newstate->has_backref = 1;
1647       else if (type == ANCHOR || node->constraint)
1648         newstate->has_constraint = 1;
1649     }
1650   err = register_state (dfa, newstate, hash);
1651   if (BE (err != REG_NOERROR, 0))
1652     {
1653       free_state (newstate);
1654       newstate = NULL;
1655     }
1656   return newstate;
1657 }
1658
1659 /* Create the new state which is depend on the context CONTEXT.
1660    Return the new state if succeeded, otherwise return NULL.  */
1661
1662 static re_dfastate_t *
1663 internal_function __attribute_warn_unused_result__
1664 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1665                     unsigned int context, unsigned int hash)
1666 {
1667   int i, nctx_nodes = 0;
1668   reg_errcode_t err;
1669   re_dfastate_t *newstate;
1670
1671   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1672   if (BE (newstate == NULL, 0))
1673     return NULL;
1674   err = re_node_set_init_copy (&newstate->nodes, nodes);
1675   if (BE (err != REG_NOERROR, 0))
1676     {
1677       re_free (newstate);
1678       return NULL;
1679     }
1680
1681   newstate->context = context;
1682   newstate->entrance_nodes = &newstate->nodes;
1683
1684   for (i = 0 ; i < nodes->nelem ; i++)
1685     {
1686       re_token_t *node = dfa->nodes + nodes->elems[i];
1687       re_token_type_t type = node->type;
1688       unsigned int constraint = node->constraint;
1689
1690       if (type == CHARACTER && !constraint)
1691         continue;
1692 #ifdef RE_ENABLE_I18N
1693       newstate->accept_mb |= node->accept_mb;
1694 #endif /* RE_ENABLE_I18N */
1695
1696       /* If the state has the halt node, the state is a halt state.  */
1697       if (type == END_OF_RE)
1698         newstate->halt = 1;
1699       else if (type == OP_BACK_REF)
1700         newstate->has_backref = 1;
1701
1702       if (constraint)
1703         {
1704           if (newstate->entrance_nodes == &newstate->nodes)
1705             {
1706               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1707               if (BE (newstate->entrance_nodes == NULL, 0))
1708                 {
1709                   free_state (newstate);
1710                   return NULL;
1711                 }
1712               if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1713                   != REG_NOERROR)
1714                 return NULL;
1715               nctx_nodes = 0;
1716               newstate->has_constraint = 1;
1717             }
1718
1719           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1720             {
1721               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1722               ++nctx_nodes;
1723             }
1724         }
1725     }
1726   err = register_state (dfa, newstate, hash);
1727   if (BE (err != REG_NOERROR, 0))
1728     {
1729       free_state (newstate);
1730       newstate = NULL;
1731     }
1732   return  newstate;
1733 }