posix/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2018 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19 /* Forward declarations of static helpers.  re_string_construct_common is
        called by re_string_allocate and re_string_construct before its
        definition further down; the two create_*_newstate helpers are
        presumably defined later in the file (not visible from here).  */
  20 static void re_string_construct_common (const char *str, int len,
  21                                         re_string_t *pstr,
  22                                         RE_TRANSLATE_TYPE trans, int icase,
  23                                         const re_dfa_t *dfa);
  24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  25                                           const re_node_set *nodes,
  26                                           unsigned int hash);
  27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  28                                           const re_node_set *nodes,
  29                                           unsigned int context,
  30                                           unsigned int hash);
  31 \f
  32 /* Functions for string operations.  */
  33
  34 /* Prepare PSTR for the string STR of LEN bytes and allocate its working
  35    buffers without filling them.  It is necessary to call
        re_string_reconstruct before using the object.

        INIT_LEN is the requested initial buffer length; it is raised to
        DFA->mb_cur_max if smaller, and the buffers never start larger than
        LEN + 1.  TRANS, ICASE and the locale properties of DFA are recorded
        in PSTR by re_string_construct_common.

        Return REG_NOERROR on success, otherwise the error reported by
        re_string_realloc_buffers.

        NOTE(review): unlike re_string_construct, this function does not
        zero PSTR, while re_string_realloc_buffers hands the current
        PSTR->wcs / PSTR->offsets / PSTR->mbs values to re_realloc;
        presumably callers pass a zero-initialized PSTR -- confirm.  */
  36
  37 static reg_errcode_t
  38 __attribute_warn_unused_result__
  39 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
  40                     RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  41 {
  42   reg_errcode_t ret;
  43   int init_buf_len;
  44
  45   /* Ensure at least one character fits into the buffers.  */
  46   if (init_len < dfa->mb_cur_max)
  47     init_len = dfa->mb_cur_max;
       /* Start with the smaller of LEN + 1 and INIT_LEN elements.  */
  48   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
       /* This sets PSTR->mbs_allocated and PSTR->mb_cur_max, which
          re_string_realloc_buffers consults to decide what to allocate.  */
  49   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  50
  51   ret = re_string_realloc_buffers (pstr, init_buf_len);
  52   if (BE (ret != REG_NOERROR, 0))
  53     return ret;
  54
  55   pstr->word_char = dfa->word_char;
  56   pstr->word_ops_used = dfa->word_ops_used;
       /* Without a private buffer, MBS points straight at the caller's
          string.  */
  57   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
       /* The whole string counts as already valid only when it is used in
          place and the locale is single-byte; otherwise nothing has been
          built yet.  */
  58   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  59   pstr->valid_raw_len = pstr->valid_len;
  60   return REG_NOERROR;
  61 }
  62
  63 /* Initialize PSTR from scratch for the string STR of LEN bytes, allocate
        its buffers and fill them right away.

        PSTR is zeroed first.  When LEN > 0 the buffers get LEN + 1 elements.
        How the buffers are filled depends on ICASE, TRANS and
        DFA->mb_cur_max: upper-casing (wide or single-byte), wide character
        conversion, plain translation, or no conversion at all.

        Return REG_NOERROR on success, otherwise the first error reported by
        re_string_realloc_buffers or build_wcs_upper_buffer.  */
  64
  65 static reg_errcode_t
  66 __attribute_warn_unused_result__
  67 re_string_construct (re_string_t *pstr, const char *str, int len,
  68                      RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  69 {
  70   reg_errcode_t ret;
  71   memset (pstr, '\0', sizeof (re_string_t));
  72   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  73
  74   if (len > 0)
  75     {
  76       ret = re_string_realloc_buffers (pstr, len + 1);
  77       if (BE (ret != REG_NOERROR, 0))
  78         return ret;
  79     }
       /* Without a private buffer, MBS points straight at the caller's
          string.  */
  80   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  81
  82   if (icase)
  83     {
  84 #ifdef RE_ENABLE_I18N
  85       if (dfa->mb_cur_max > 1)
  86         {
               /* build_wcs_upper_buffer may stop early when the buffer is
                  too small (an upper-case form can need more bytes than the
                  original), so grow the buffers and retry until done.  */
  87           while (1)
  88             {
  89               ret = build_wcs_upper_buffer (pstr);
  90               if (BE (ret != REG_NOERROR, 0))
  91                 return ret;
                   /* Done: the whole raw string has been consumed.  */
  92               if (pstr->valid_raw_len >= len)
  93                 break;
                   /* More than one maximal-length character would still
                      fit, so the build did not stop for lack of room and
                      enlarging the buffers would not help.  */
  94               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
  95                 break;
  96               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
  97               if (BE (ret != REG_NOERROR, 0))
  98                 return ret;
  99             }
 100         }
 101       else
 102 #endif /* RE_ENABLE_I18N  */
 103         build_upper_buffer (pstr);
 104     }
 105   else
 106     {
 107 #ifdef RE_ENABLE_I18N
 108       if (dfa->mb_cur_max > 1)
 109         build_wcs_buffer (pstr);
 110       else
 111 #endif /* RE_ENABLE_I18N  */
 112         {
 113           if (trans != NULL)
 114             re_string_translate_buffer (pstr);
 115           else
 116             {
                   /* No conversion at all: the raw string is used in place.
                      bufs_len is LEN + 1 here when LEN > 0 (and 0 otherwise,
                      from the memset above).
                      NOTE(review): this makes valid_len one larger than LEN
                      for a non-empty string -- confirm that is intended.  */
 117               pstr->valid_len = pstr->bufs_len;
 118               pstr->valid_raw_len = pstr->bufs_len;
 119             }
 120         }
 121     }
 122
 123   return REG_NOERROR;
 124 }
 125
 126 /* Helper function for re_string_allocate and re_string_construct.

        Resize the buffers of PSTR to NEW_BUF_LEN elements.  With
        RE_ENABLE_I18N and PSTR->mb_cur_max > 1, PSTR->wcs is resized, and
        PSTR->offsets too if it already exists.  PSTR->mbs is resized only
        when PSTR->mbs_allocated is set.

        Return REG_NOERROR and update PSTR->bufs_len on success.  Return
        REG_ESPACE if the size would overflow or an allocation fails; in that
        case PSTR->bufs_len is left unchanged, but any buffer that was
        already resized successfully stays stored in PSTR.  */
 127
 128 static reg_errcode_t
 129 __attribute_warn_unused_result__
 130 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
 131 {
 132 #ifdef RE_ENABLE_I18N
 133   if (pstr->mb_cur_max > 1)
 134     {
 135       wint_t *new_wcs;
 136
 137       /* Avoid overflow in realloc.  */
           /* Both arrays resized below hold NEW_BUF_LEN elements (wint_t for
              WCS, int for OFFSETS); test against the larger element size.  */
 138       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int));
 139       if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
 140         return REG_ESPACE;
 141
 142       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 143       if (BE (new_wcs == NULL, 0))
 144         return REG_ESPACE;
 145       pstr->wcs = new_wcs;
           /* OFFSETS is created lazily elsewhere; only keep it in step with
              WCS when it already exists.  */
 146       if (pstr->offsets != NULL)
 147         {
 148           int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
 149           if (BE (new_offsets == NULL, 0))
 150             return REG_ESPACE;
 151           pstr->offsets = new_offsets;
 152         }
 153     }
 154 #endif /* RE_ENABLE_I18N  */
       /* When MBS is not privately allocated it is not resized here.  */
 155   if (pstr->mbs_allocated)
 156     {
 157       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 158                                            new_buf_len);
 159       if (BE (new_mbs == NULL, 0))
 160         return REG_ESPACE;
 161       pstr->mbs = new_mbs;
 162     }
 163   pstr->bufs_len = new_buf_len;
 164   return REG_NOERROR;
 165 }
 166
 167 /* Store in PSTR the settings shared by re_string_allocate and
        re_string_construct: the raw string STR and its length LEN, the
        translation table TRANS, the case-folding flag ICASE, and the locale
        properties copied from DFA.  Nothing is allocated here.  */
 168 static void
 169 re_string_construct_common (const char *str, int len, re_string_t *pstr,
 170                             RE_TRANSLATE_TYPE trans, int icase,
 171                             const re_dfa_t *dfa)
 172 {
 173   pstr->raw_mbs = (const unsigned char *) str;
 174   pstr->len = len;
 175   pstr->raw_len = len;
 176   pstr->trans = trans;
       /* Normalize ICASE to 0 or 1.  */
 177   pstr->icase = icase ? 1 : 0;
       /* A private MBS buffer is requested whenever a translation table or
          case folding is in use; re_string_realloc_buffers allocates MBS
          only when this flag is set.  */
 178   pstr->mbs_allocated = (trans != NULL || icase);
 179   pstr->mb_cur_max = dfa->mb_cur_max;
 180   pstr->is_utf8 = dfa->is_utf8;
 181   pstr->map_notascii = dfa->map_notascii;
       /* Both stop positions start out at the full string length.  */
 182   pstr->stop = pstr->len;
 183   pstr->raw_stop = pstr->stop;
 184 }
 185
 186 #ifdef RE_ENABLE_I18N
 187
 188 /* Build wide character buffer PSTR->WCS.
 189    If the byte sequence of the string is:
 190      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 191    Then wide character buffer will be:
 192      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 193    We use WEOF for padding; it indicates that the position isn't
 194    the first byte of a multibyte character.
 195
 196    Note that this function assumes PSTR->VALID_LEN elements are already
 197    built and starts from PSTR->VALID_LEN.

        When PSTR->trans is set, the translated bytes are also stored in
        PSTR->mbs; otherwise PSTR->mbs is not written by this function.
        On return PSTR->valid_len and PSTR->valid_raw_len are both set to
        the position reached.  */
 198
 199 static void
 200 build_wcs_buffer (re_string_t *pstr)
 201 {
       /* BUF holds the translated bytes of one character for mbrtowc when a
          translation table is in use.  */
 202 #ifdef _LIBC
 203   unsigned char buf[MB_LEN_MAX];
 204   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 205 #else
 206   unsigned char buf[64];
 207 #endif
 208   mbstate_t prev_st;
 209   int byte_idx, end_idx, remain_len;
 210   size_t mbclen;
 211
 212   /* Build the buffers from pstr->valid_len to either pstr->len or
 213      pstr->bufs_len.  */
 214   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 215   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 216     {
 217       wchar_t wc;
 218       const char *p;
 219
 220       remain_len = end_idx - byte_idx;
           /* Remember the conversion state so it can be restored if this
              character turns out to be unusable.  */
 221       prev_st = pstr->cur_state;
 222       /* Apply the translation if we need.  */
 223       if (BE (pstr->trans != NULL, 0))
 224         {
 225           int i, ch;
 226
               /* Translate at most one maximal-length character's worth of
                  bytes, storing them both in MBS and in BUF.  */
 227           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 228             {
 229               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 230               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 231             }
 232           p = (const char *) buf;
 233         }
 234       else
 235         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 236       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
           /* (size_t) -1 is an invalid sequence, 0 is a NUL character, and
              (size_t) -2 with the buffer already covering the whole string
              is an incomplete character at the end of the string.  */
 237       if (BE (mbclen == (size_t) -1 || mbclen == 0
 238               || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
 239         {
 240           /* We treat these cases as a singlebyte character.  */
 241           mbclen = 1;
 242           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 243           if (BE (pstr->trans != NULL, 0))
 244             wc = pstr->trans[wc];
 245           pstr->cur_state = prev_st;
 246         }
 247       else if (BE (mbclen == (size_t) -2, 0))
 248         {
 249           /* The buffer doesn't have enough space, finish to build.  */
 250           pstr->cur_state = prev_st;
 251           break;
 252         }
 253
 254       /* Write wide character and padding.  */
 255       pstr->wcs[byte_idx++] = wc;
 256       /* Write paddings.  */
           /* REMAIN_LEN is reused here as the index just past the last byte
              of this character.  */
 257       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 258         pstr->wcs[byte_idx++] = WEOF;
 259     }
 260   pstr->valid_len = byte_idx;
 261   pstr->valid_raw_len = byte_idx;
 262 }
 263
 264 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 265    but for REG_ICASE.

        Lower-case characters are replaced by their upper-case form in both
        PSTR->mbs and PSTR->wcs.  When an upper-case form needs a different
        number of bytes than the original character, PSTR->offsets is created
        (if necessary) to map every buffer position to a position in the raw
        string, PSTR->offsets_needed is set, and PSTR->len / PSTR->stop are
        adjusted by the difference in length.

        On return PSTR->valid_len is the buffer position reached and
        PSTR->valid_raw_len the raw string position reached.

        Return REG_NOERROR, or REG_ESPACE if PSTR->offsets cannot be
        allocated.  */
 266
 267 static reg_errcode_t
 268 __attribute_warn_unused_result__
 269 build_wcs_upper_buffer (re_string_t *pstr)
 270 {
 271   mbstate_t prev_st;
 272   int src_idx, byte_idx, end_idx, remain_len;
 273   size_t mbclen;
       /* BUF receives the bytes of an upper-cased character from wcrtomb
          and, in the general loop, the translated input bytes.  */
 274 #ifdef _LIBC
 275   char buf[MB_LEN_MAX];
 276   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 277 #else
 278   char buf[64];
 279 #endif
 280
 281   byte_idx = pstr->valid_len;
 282   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 283
 284   /* The following optimization assumes that ASCII characters can be
 285      mapped to wide characters with a simple cast.  */
       /* Fast path: no translation table and, so far, buffer positions equal
          raw positions (no offsets table in use).  */
 286   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 287     {
 288       while (byte_idx < end_idx)
 289         {
 290           wchar_t wc;
 291
 292           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 293               && mbsinit (&pstr->cur_state))
 294             {
 295               /* In case of a singlebyte character.  */
 296               pstr->mbs[byte_idx]
 297                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 298               /* The next step uses the assumption that wchar_t is encoded
 299                  ASCII-safe: all ASCII values can be converted like this.  */
 300               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 301               ++byte_idx;
 302               continue;
 303             }
 304
 305           remain_len = end_idx - byte_idx;
 306           prev_st = pstr->cur_state;
 307           mbclen = __mbrtowc (&wc,
 308                               ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 309                                + byte_idx), remain_len, &pstr->cur_state);
               /* True unless MBCLEN is 0, (size_t) -1 or (size_t) -2, i.e.
                  for a successfully converted character of at least one
                  byte.  */
 310           if (BE (mbclen + 2 > 2, 1))
 311             {
 312               wchar_t wcu = wc;
 313               if (__iswlower (wc))
 314                 {
 315                   size_t mbcdlen;
 316
 317                   wcu = __towupper (wc);
 318                   mbcdlen = __wcrtomb (buf, wcu, &prev_st);
 319                   if (BE (mbclen == mbcdlen, 1))
 320                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 321                   else
 322                     {
                           /* The upper-case form does not have the same
                              byte length (or wcrtomb failed): continue in
                              the general loop below, which re-reads this
                              character and maintains the offsets table.
                              NOTE(review): pstr->cur_state was already
                              advanced by the mbrtowc call above and is not
                              restored before the jump -- presumably harmless
                              when the state after a complete character is
                              the initial state; confirm for stateful
                              encodings.  */
 323                       src_idx = byte_idx;
 324                       goto offsets_needed;
 325                     }
 326                 }
 327               else
 328                 memcpy (pstr->mbs + byte_idx,
 329                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 330               pstr->wcs[byte_idx++] = wcu;
 331               /* Write paddings.  */
 332               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 333                 pstr->wcs[byte_idx++] = WEOF;
 334             }
 335           else if (mbclen == (size_t) -1 || mbclen == 0
 336                    || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 337             {
 338               /* It is an invalid character, an incomplete character
 339                  at the end of the string, or '\0'.  Just use the byte.  */
 340               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 341               pstr->mbs[byte_idx] = ch;
 342               /* And also cast it to wide char.  */
 343               pstr->wcs[byte_idx++] = (wchar_t) ch;
 344               if (BE (mbclen == (size_t) -1, 0))
 345                 pstr->cur_state = prev_st;
 346             }
 347           else
 348             {
 349               /* The buffer doesn't have enough space, finish to build.  */
 350               pstr->cur_state = prev_st;
 351               break;
 352             }
 353         }
           /* On this path buffer and raw positions are identical.  */
 354       pstr->valid_len = byte_idx;
 355       pstr->valid_raw_len = byte_idx;
 356       return REG_NOERROR;
 357     }
 358   else
         /* General loop: BYTE_IDX indexes the converted buffers and SRC_IDX
            the raw string; the two can differ once an upper-case form with
            a different byte length has been stored.  */
 359     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 360       {
 361         wchar_t wc;
 362         const char *p;
           /* The fast path above jumps here, with SRC_IDX set to BYTE_IDX,
              when it meets a character whose upper-case form has a different
              byte length.  */
 363       offsets_needed:
 364         remain_len = end_idx - byte_idx;
 365         prev_st = pstr->cur_state;
 366         if (BE (pstr->trans != NULL, 0))
 367           {
 368             int i, ch;
 369
                 /* Translate at most one maximal-length character's worth
                    of raw bytes into BUF and decode from there.  */
 370             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 371               {
 372                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 373                 buf[i] = pstr->trans[ch];
 374               }
 375             p = (const char *) buf;
 376           }
 377         else
 378           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 379         mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
             /* As above: a successfully converted character of at least one
                byte.  */
 380         if (BE (mbclen + 2 > 2, 1))
 381           {
 382             wchar_t wcu = wc;
 383             if (__iswlower (wc))
 384               {
 385                 size_t mbcdlen;
 386
 387                 wcu = __towupper (wc);
 388                 mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
 389                 if (BE (mbclen == mbcdlen, 1))
 390                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 391                 else if (mbcdlen != (size_t) -1)
 392                   {
                         /* The upper-case form occupies MBCDLEN bytes while
                            the original occupies MBCLEN bytes.  */
 393                     size_t i;
 394
                         /* The upper-case form would not fit in the
                            buffers: stop here.  */
 395                     if (byte_idx + mbcdlen > pstr->bufs_len)
 396                       {
 397                         pstr->cur_state = prev_st;
 398                         break;
 399                       }
 400
                         /* Create the offsets table on first use.  */
 401                     if (pstr->offsets == NULL)
 402                       {
 403                         pstr->offsets = re_malloc (int, pstr->bufs_len);
 404
 405                         if (pstr->offsets == NULL)
 406                           return REG_ESPACE;
 407                       }
                         /* Everything built so far maps one-to-one onto the
                            raw string.  */
 408                     if (!pstr->offsets_needed)
 409                       {
 410                         for (i = 0; i < (size_t) byte_idx; ++i)
 411                           pstr->offsets[i] = i;
 412                         pstr->offsets_needed = 1;
 413                       }
 414
 415                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 416                     pstr->wcs[byte_idx] = wcu;
 417                     pstr->offsets[byte_idx] = src_idx;
                         /* Map the remaining bytes of the upper-case form
                            onto the bytes of the original character; when
                            the new form is longer, the extra bytes all map
                            to the original's last byte.  */
 418                     for (i = 1; i < mbcdlen; ++i)
 419                       {
 420                         pstr->offsets[byte_idx + i]
 421                           = src_idx + (i < mbclen ? i : mbclen - 1);
 422                         pstr->wcs[byte_idx + i] = WEOF;
 423                       }
                         /* The converted string changes length by the
                            difference; the stop position moves with it only
                            if it lies beyond this character in the raw
                            string.  */
 424                     pstr->len += mbcdlen - mbclen;
 425                     if (pstr->raw_stop > src_idx)
 426                       pstr->stop += mbcdlen - mbclen;
 427                     end_idx = (pstr->bufs_len > pstr->len)
 428                               ? pstr->len : pstr->bufs_len;
 429                     byte_idx += mbcdlen;
 430                     src_idx += mbclen;
 431                     continue;
 432                   }
 433                 else
                       /* wcrtomb failed: keep the original bytes.  */
 434                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 435               }
 436             else
 437               memcpy (pstr->mbs + byte_idx, p, mbclen);
 438
                 /* Same byte length in buffer and raw string: the offsets
                    simply advance in step.  */
 439             if (BE (pstr->offsets_needed != 0, 0))
 440               {
 441                 size_t i;
 442                 for (i = 0; i < mbclen; ++i)
 443                   pstr->offsets[byte_idx + i] = src_idx + i;
 444               }
 445             src_idx += mbclen;
 446
 447             pstr->wcs[byte_idx++] = wcu;
 448             /* Write paddings.  */
 449             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 450               pstr->wcs[byte_idx++] = WEOF;
 451           }
 452         else if (mbclen == (size_t) -1 || mbclen == 0
 453                  || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 454           {
 455             /* It is an invalid character or '\0'.  Just use the byte.  */
 456             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 457
 458             if (BE (pstr->trans != NULL, 0))
 459               ch = pstr->trans [ch];
 460             pstr->mbs[byte_idx] = ch;
 461
 462             if (BE (pstr->offsets_needed != 0, 0))
 463               pstr->offsets[byte_idx] = src_idx;
 464             ++src_idx;
 465
 466             /* And also cast it to wide char.  */
 467             pstr->wcs[byte_idx++] = (wchar_t) ch;
 468             if (BE (mbclen == (size_t) -1, 0))
 469               pstr->cur_state = prev_st;
 470           }
 471         else
 472           {
 473             /* The buffer doesn't have enough space, finish to build.  */
 474             pstr->cur_state = prev_st;
 475             break;
 476           }
 477       }
       /* Buffer position and raw position are recorded separately because
          they may have diverged.  */
 478   pstr->valid_len = byte_idx;
 479   pstr->valid_raw_len = src_idx;
 480   return REG_NOERROR;
 481 }
 482
 483 /* Skip characters of the raw string, starting at raw index
 484    PSTR->raw_mbs_idx + PSTR->valid_raw_len, until the index is no longer
        less than NEW_RAW_IDX.  Return the index reached.

        The last character decoded on the way is stored in *LAST_WC; it is
        WEOF if no character was decoded.  PSTR->cur_state is advanced across
        the characters that convert successfully.  */
 485
 486 static int
 487 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
 488 {
 489   mbstate_t prev_st;
 490   int rawbuf_idx;
 491   size_t mbclen;
 492   wint_t wc = WEOF;
 493
 494   /* Skip the characters which are not necessary to check.  */
 495   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 496        rawbuf_idx < new_raw_idx;)
 497     {
 498       wchar_t wc2;
 499       int remain_len = pstr->raw_len - rawbuf_idx;
 500       prev_st = pstr->cur_state;
 501       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
 502                           remain_len, &pstr->cur_state);
           /* Viewed as a signed value this covers 0, (size_t) -1 and
              (size_t) -2.  */
 503       if (BE ((ssize_t) mbclen <= 0, 0))
 504         {
 505           /* We treat these cases as a single byte character.  */
 506           if (mbclen == 0 || remain_len == 0)
 507             wc = L'\0';
 508           else
 509             wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 510           mbclen = 1;
 511           pstr->cur_state = prev_st;
 512         }
 513       else
 514         wc = (wint_t) wc2;
 515       /* Then proceed the next character.  */
 516       rawbuf_idx += mbclen;
 517     }
 518   *last_wc = wc;
 519   return rawbuf_idx;
 520 }
 521 #endif /* RE_ENABLE_I18N  */
 522
 523 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 524    This function is used in case of REG_ICASE.

        Bytes from PSTR->valid_len up to the smaller of PSTR->bufs_len and
        PSTR->len are read from the raw string, translated through
        PSTR->trans when it is set, converted to upper case and stored in
        PSTR->mbs.  PSTR->valid_len and PSTR->valid_raw_len are set to the
        position reached.  */
 525
 526 static void
 527 build_upper_buffer (re_string_t *pstr)
 528 {
 529   int char_idx, end_idx;
 530   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 531
 532   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 533     {
 534       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
           /* Translation is applied before case conversion.  */
 535       if (BE (pstr->trans != NULL, 0))
 536         ch = pstr->trans[ch];
 537       if (islower (ch))
 538         pstr->mbs[char_idx] = toupper (ch);
 539       else
 540         pstr->mbs[char_idx] = ch;
 541     }
 542   pstr->valid_len = char_idx;
 543   pstr->valid_raw_len = char_idx;
 544 }
 545
 546 /* Apply PSTR->trans to the raw string and store the result in PSTR->mbs,
        from PSTR->valid_len up to the smaller of PSTR->bufs_len and
        PSTR->len.  PSTR->trans must not be NULL.  PSTR->valid_len and
        PSTR->valid_raw_len are set to the position reached.  */
 547
 548 static void
 549 re_string_translate_buffer (re_string_t *pstr)
 550 {
 551   int buf_idx, end_idx;
 552   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 553
 554   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 555     {
 556       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 557       pstr->mbs[buf_idx] = pstr->trans[ch];
 558     }
 559
 560   pstr->valid_len = buf_idx;
 561   pstr->valid_raw_len = buf_idx;
 562 }
 563
 564 /* This function reconstructs the buffers so that they start at raw
 565    string index IDX.
 566    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
        convert to upper case in case of REG_ICASE, apply translation.

        Already built data that is still useful is moved to the front of the
        buffers; PSTR->tip_context is set from the character just before the
        new start (EFLAGS is used for this); PSTR->raw_mbs_idx, PSTR->len and
        PSTR->stop are rebased, the rest of the buffers is built, and
        PSTR->cur_idx is reset to 0.

        Return REG_NOERROR, or the error from build_wcs_upper_buffer.  */
 567
 568 static reg_errcode_t
 569 __attribute_warn_unused_result__
 570 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
 571 {
       /* Distance, in raw bytes, from the current buffer start to the new
          one.  */
 572   int offset = idx - pstr->raw_mbs_idx;
 573   if (BE (offset < 0, 0))
 574     {
           /* IDX lies before the current buffer start: discard everything
              and treat the request as a move forward from index 0.  */
 575       /* Reset buffer.  */
 576 #ifdef RE_ENABLE_I18N
 577       if (pstr->mb_cur_max > 1)
 578         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 579 #endif /* RE_ENABLE_I18N */
 580       pstr->len = pstr->raw_len;
 581       pstr->stop = pstr->raw_stop;
 582       pstr->valid_len = 0;
 583       pstr->raw_mbs_idx = 0;
 584       pstr->valid_raw_len = 0;
 585       pstr->offsets_needed = 0;
 586       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 587                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 588       if (!pstr->mbs_allocated)
 589         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 590       offset = idx;
 591     }
 592
 593   if (BE (offset != 0, 1))
 594     {
 595       /* Should the already checked characters be kept?  */
 596       if (BE (offset < pstr->valid_raw_len, 1))
 597         {
 598           /* Yes, move them to the front of the buffer.  */
 599 #ifdef RE_ENABLE_I18N
 600           if (BE (pstr->offsets_needed, 0))
 601             {
                   /* Buffer positions and raw positions differ: binary
                      search the offsets table for the buffer position
                      corresponding to raw position OFFSET.  */
 602               int low = 0, high = pstr->valid_len, mid;
 603               do
 604                 {
 605                   mid = (high + low) / 2;
 606                   if (pstr->offsets[mid] > offset)
 607                     high = mid;
 608                   else if (pstr->offsets[mid] < offset)
 609                     low = mid + 1;
 610                   else
 611                     break;
 612                 }
 613               while (low < high);
 614               if (pstr->offsets[mid] < offset)
 615                 ++mid;
 616               pstr->tip_context = re_string_context_at (pstr, mid - 1,
 617                                                         eflags);
 618               /* This can be quite complicated, so handle specially
 619                  only the common and easy case where the character with
 620                  different length representation of lower and upper
 621                  case is present at or after offset.  */
 622               if (pstr->valid_len > offset
 623                   && mid == offset && pstr->offsets[mid] == offset)
 624                 {
                       /* Buffer and raw positions still agree up to OFFSET:
                          shift WCS, MBS and the offsets table down by
                          OFFSET.  */
 625                   memmove (pstr->wcs, pstr->wcs + offset,
 626                            (pstr->valid_len - offset) * sizeof (wint_t));
 627                   memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
 628                   pstr->valid_len -= offset;
 629                   pstr->valid_raw_len -= offset;
 630                   for (low = 0; low < pstr->valid_len; low++)
 631                     pstr->offsets[low] = pstr->offsets[low + offset] - offset;
 632                 }
 633               else
 634                 {
 635                   /* Otherwise, just find out how long the partial multibyte
 636                      character at offset is and fill it with WEOF/255.  */
                       /* The offsets table is abandoned: LEN and STOP go
                          back to raw-string based values (OFFSET is
                          subtracted again further down).  */
 637                   pstr->len = pstr->raw_len - idx + offset;
 638                   pstr->stop = pstr->raw_stop - idx + offset;
 639                   pstr->offsets_needed = 0;
                       /* Go back to the first buffer position that maps to
                          OFFSET, then forward to the next position that
                          starts a character.  */
 640                   while (mid > 0 && pstr->offsets[mid - 1] == offset)
 641                     --mid;
 642                   while (mid < pstr->valid_len)
 643                     if (pstr->wcs[mid] != WEOF)
 644                       break;
 645                     else
 646                       ++mid;
 647                   if (mid == pstr->valid_len)
 648                     pstr->valid_len = 0;
 649                   else
 650                     {
                           /* Number of raw bytes between OFFSET and the
                              start of that character; they are marked as
                              padding.  */
 651                       pstr->valid_len = pstr->offsets[mid] - offset;
 652                       if (pstr->valid_len)
 653                         {
 654                           for (low = 0; low < pstr->valid_len; ++low)
 655                             pstr->wcs[low] = WEOF;
 656                           memset (pstr->mbs, 255, pstr->valid_len);
 657                         }
 658                     }
 659                   pstr->valid_raw_len = pstr->valid_len;
 660                 }
 661             }
 662           else
 663 #endif
 664             {
                   /* Buffer positions equal raw positions: the context comes
                      from the character just before the new start, and the
                      kept data is shifted down by OFFSET.  */
 665               pstr->tip_context = re_string_context_at (pstr, offset - 1,
 666                                                         eflags);
 667 #ifdef RE_ENABLE_I18N
 668               if (pstr->mb_cur_max > 1)
 669                 memmove (pstr->wcs, pstr->wcs + offset,
 670                          (pstr->valid_len - offset) * sizeof (wint_t));
 671 #endif /* RE_ENABLE_I18N */
 672               if (BE (pstr->mbs_allocated, 0))
 673                 memmove (pstr->mbs, pstr->mbs + offset,
 674                          pstr->valid_len - offset);
 675               pstr->valid_len -= offset;
 676               pstr->valid_raw_len -= offset;
 677 #if defined DEBUG && DEBUG
 678               assert (pstr->valid_len > 0);
 679 #endif
 680             }
 681         }
 682       else
 683         {
 684 #ifdef RE_ENABLE_I18N
 685           /* No, skip all characters until IDX.  */
 686           int prev_valid_len = pstr->valid_len;
 687
 688           if (BE (pstr->offsets_needed, 0))
 689             {
                   /* Abandon the offsets table; LEN and STOP go back to
                      raw-string based values (OFFSET is subtracted again
                      further down).  */
 690               pstr->len = pstr->raw_len - idx + offset;
 691               pstr->stop = pstr->raw_stop - idx + offset;
 692               pstr->offsets_needed = 0;
 693             }
 694 #endif
 695           pstr->valid_len = 0;
 696 #ifdef RE_ENABLE_I18N
 697           if (pstr->mb_cur_max > 1)
 698             {
 699               int wcs_idx;
                   /* WC will hold the character preceding IDX, which
                      determines the context; WEOF means not yet known.  */
 700               wint_t wc = WEOF;
 701
 702               if (pstr->is_utf8)
 703                 {
 704                   const unsigned char *raw, *p, *end;
 705
 706                   /* Special case UTF-8.  Multi-byte chars start with any
 707                      byte other than 0x80 - 0xbf.  */
                       /* Search backwards from the byte just before IDX, at
                          most mb_cur_max bytes and never before the start of
                          the raw string, for the start of a character.  */
 708                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 709                   end = raw + (offset - pstr->mb_cur_max);
 710                   if (end < pstr->raw_mbs)
 711                     end = pstr->raw_mbs;
 712                   p = raw + offset - 1;
 713 #ifdef _LIBC
 714                   /* We know the wchar_t encoding is UCS4, so for the simple
 715                      case, ASCII characters, skip the conversion step.  */
 716                   if (isascii (*p) && BE (pstr->trans == NULL, 1))
 717                     {
 718                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 719                       /* pstr->valid_len = 0; */
 720                       wc = (wchar_t) *p;
 721                     }
 722                   else
 723 #endif
 724                     for (; p >= end; --p)
 725                       if ((*p & 0xc0) != 0x80)
 726                         {
                               /* P points at a byte that can start a
                                  character: decode it with a fresh
                                  conversion state.  */
 727                           mbstate_t cur_state;
 728                           wchar_t wc2;
 729                           int mlen = raw + pstr->len - p;
 730                           unsigned char buf[6];
 731                           size_t mbclen;
 732
 733                           const unsigned char *pp = p;
 734                           if (BE (pstr->trans != NULL, 0))
 735                             {
 736                               int i = mlen < 6 ? mlen : 6;
 737                               while (--i >= 0)
 738                                 buf[i] = pstr->trans[p[i]];
 739                               pp = buf;
 740                             }
 741                           /* XXX Don't use mbrtowc, we know which conversion
 742                              to use (UTF-8 -> UCS4).  */
 743                           memset (&cur_state, 0, sizeof (cur_state));
 744                           mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
 745                                               &cur_state);
                               /* Accept the character only if it converted
                                  successfully and reaches at least up to
                                  IDX.  Its bytes at or beyond IDX (if any)
                                  become the leading padding of the new
                                  buffer.  */
 746                           if (raw + offset - p <= mbclen
 747                               && mbclen < (size_t) -2)
 748                             {
 749                               memset (&pstr->cur_state, '\0',
 750                                       sizeof (mbstate_t));
 751                               pstr->valid_len = mbclen - (raw + offset - p);
 752                               wc = wc2;
 753                             }
 754                           break;
 755                         }
 756                 }
 757
                   /* Otherwise decode forward from the end of the previously
                      built part; the result is how far the last skipped
                      character extends beyond IDX.  */
 758               if (wc == WEOF)
 759                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
                   /* Still unknown: fall back to the context at the end of
                      the previously built part.  */
 760               if (wc == WEOF)
 761                 pstr->tip_context
 762                   = re_string_context_at (pstr, prev_valid_len - 1, eflags);
 763               else
 764                 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 765                                       && IS_WIDE_WORD_CHAR (wc))
 766                                      ? CONTEXT_WORD
 767                                      : ((IS_WIDE_NEWLINE (wc)
 768                                          && pstr->newline_anchor)
 769                                         ? CONTEXT_NEWLINE : 0));
                   /* Mark the leading bytes that belong to a character
                      started before IDX as padding.  */
 770               if (BE (pstr->valid_len, 0))
 771                 {
 772                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 773                     pstr->wcs[wcs_idx] = WEOF;
 774                   if (pstr->mbs_allocated)
 775                     memset (pstr->mbs, 255, pstr->valid_len);
 776                 }
 777               pstr->valid_raw_len = pstr->valid_len;
 778             }
 779           else
 780 #endif /* RE_ENABLE_I18N */
 781             {
                   /* Single-byte case: the context depends only on the
                      (translated) byte just before IDX.  */
 782               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 783               pstr->valid_raw_len = 0;
 784               if (pstr->trans)
 785                 c = pstr->trans[c];
 786               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 787                                    ? CONTEXT_WORD
 788                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 789                                       ? CONTEXT_NEWLINE : 0));
 790             }
 791         }
           /* When MBS is not a private buffer it is just advanced by
              OFFSET.  */
 792       if (!BE (pstr->mbs_allocated, 0))
 793         pstr->mbs += offset;
 794     }
       /* Rebase the window onto IDX.  */
 795   pstr->raw_mbs_idx = idx;
 796   pstr->len -= offset;
 797   pstr->stop -= offset;
 798
 799   /* Then build the buffers.  */
 800 #ifdef RE_ENABLE_I18N
 801   if (pstr->mb_cur_max > 1)
 802     {
 803       if (pstr->icase)
 804         {
 805           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 806           if (BE (ret != REG_NOERROR, 0))
 807             return ret;
 808         }
 809       else
 810         build_wcs_buffer (pstr);
 811     }
 812   else
 813 #endif /* RE_ENABLE_I18N */
 814     if (BE (pstr->mbs_allocated, 0))
 815       {
 816         if (pstr->icase)
 817           build_upper_buffer (pstr);
 818         else if (pstr->trans != NULL)
 819           re_string_translate_buffer (pstr);
 820       }
 821     else
           /* The raw string is used in place, so all of it is valid.  */
 822       pstr->valid_len = pstr->len;
 823
 824   pstr->cur_idx = 0;
 825   return REG_NOERROR;
 826 }
 827
 828 static unsigned char
 829 __attribute ((pure))
 830 re_string_peek_byte_case (const re_string_t *pstr, int idx)
 831 {
 832   int ch, off;
 833
 834   /* Handle the common (easiest) cases first.  */
 835   if (BE (!pstr->mbs_allocated, 1))
 836     return re_string_peek_byte (pstr, idx);
 837
 838 #ifdef RE_ENABLE_I18N
 839   if (pstr->mb_cur_max > 1
 840       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 841     return re_string_peek_byte (pstr, idx);
 842 #endif
 843
 844   off = pstr->cur_idx + idx;
 845 #ifdef RE_ENABLE_I18N
 846   if (pstr->offsets_needed)
 847     off = pstr->offsets[off];
 848 #endif
 849
 850   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 851
 852 #ifdef RE_ENABLE_I18N
 853   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 854      this function returns CAPITAL LETTER I instead of first byte of
 855      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 856      since peek_byte_case doesn't advance cur_idx in any way.  */
 857   if (pstr->offsets_needed && !isascii (ch))
 858     return re_string_peek_byte (pstr, idx);
 859 #endif
 860
 861   return ch;
 862 }
 863
 864 static unsigned char
 865 re_string_fetch_byte_case (re_string_t *pstr)
 866 {
 867   if (BE (!pstr->mbs_allocated, 1))
 868     return re_string_fetch_byte (pstr);
 869
 870 #ifdef RE_ENABLE_I18N
 871   if (pstr->offsets_needed)
 872     {
 873       int off, ch;
 874
 875       /* For tr_TR.UTF-8 [[:islower:]] there is
 876          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 877          in that case the whole multi-byte character and return
 878          the original letter.  On the other side, with
 879          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 880          anything else would complicate things too much.  */
 881
 882       if (!re_string_first_byte (pstr, pstr->cur_idx))
 883         return re_string_fetch_byte (pstr);
 884
 885       off = pstr->offsets[pstr->cur_idx];
 886       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 887
 888       if (! isascii (ch))
 889         return re_string_fetch_byte (pstr);
 890
 891       re_string_skip_bytes (pstr,
 892                             re_string_char_size_at (pstr, pstr->cur_idx));
 893       return ch;
 894     }
 895 #endif
 896
 897   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 898 }
 899
 900 static void
 901 re_string_destruct (re_string_t *pstr)
 902 {
 903 #ifdef RE_ENABLE_I18N
 904   re_free (pstr->wcs);
 905   re_free (pstr->offsets);
 906 #endif /* RE_ENABLE_I18N  */
 907   if (pstr->mbs_allocated)
 908     re_free (pstr->mbs);
 909 }
 910
 911 /* Return the context at IDX in INPUT.  */
 912
 913 static unsigned int
 914 re_string_context_at (const re_string_t *input, int idx, int eflags)
 915 {
 916   int c;
 917   if (BE (idx < 0, 0))
 918     /* In this case, we use the value stored in input->tip_context,
 919        since we can't know the character in input->mbs[-1] here.  */
 920     return input->tip_context;
 921   if (BE (idx == input->len, 0))
 922     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 923             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 924 #ifdef RE_ENABLE_I18N
 925   if (input->mb_cur_max > 1)
 926     {
 927       wint_t wc;
 928       int wc_idx = idx;
 929       while(input->wcs[wc_idx] == WEOF)
 930         {
 931 #if defined DEBUG && DEBUG
 932           /* It must not happen.  */
 933           assert (wc_idx >= 0);
 934 #endif
 935           --wc_idx;
 936           if (wc_idx < 0)
 937             return input->tip_context;
 938         }
 939       wc = input->wcs[wc_idx];
 940       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 941         return CONTEXT_WORD;
 942       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 943               ? CONTEXT_NEWLINE : 0);
 944     }
 945   else
 946 #endif
 947     {
 948       c = re_string_byte_at (input, idx);
 949       if (bitset_contain (input->word_char, c))
 950         return CONTEXT_WORD;
 951       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 952     }
 953 }
 954 \f
 955 /* Functions for set operation.  */
 956
 957 static reg_errcode_t
 958 __attribute_warn_unused_result__
 959 re_node_set_alloc (re_node_set *set, int size)
 960 {
 961   set->alloc = size;
 962   set->nelem = 0;
 963   set->elems = re_malloc (int, size);
 964   if (BE (set->elems == NULL, 0))
 965     return REG_ESPACE;
 966   return REG_NOERROR;
 967 }
 968
 969 static reg_errcode_t
 970 __attribute_warn_unused_result__
 971 re_node_set_init_1 (re_node_set *set, int elem)
 972 {
 973   set->alloc = 1;
 974   set->nelem = 1;
 975   set->elems = re_malloc (int, 1);
 976   if (BE (set->elems == NULL, 0))
 977     {
 978       set->alloc = set->nelem = 0;
 979       return REG_ESPACE;
 980     }
 981   set->elems[0] = elem;
 982   return REG_NOERROR;
 983 }
 984
 985 static reg_errcode_t
 986 __attribute_warn_unused_result__
 987 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
 988 {
 989   set->alloc = 2;
 990   set->elems = re_malloc (int, 2);
 991   if (BE (set->elems == NULL, 0))
 992     return REG_ESPACE;
 993   if (elem1 == elem2)
 994     {
 995       set->nelem = 1;
 996       set->elems[0] = elem1;
 997     }
 998   else
 999     {
1000       set->nelem = 2;
1001       if (elem1 < elem2)
1002         {
1003           set->elems[0] = elem1;
1004           set->elems[1] = elem2;
1005         }
1006       else
1007         {
1008           set->elems[0] = elem2;
1009           set->elems[1] = elem1;
1010         }
1011     }
1012   return REG_NOERROR;
1013 }
1014
1015 static reg_errcode_t
1016 __attribute_warn_unused_result__
1017 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1018 {
1019   dest->nelem = src->nelem;
1020   if (src->nelem > 0)
1021     {
1022       dest->alloc = dest->nelem;
1023       dest->elems = re_malloc (int, dest->alloc);
1024       if (BE (dest->elems == NULL, 0))
1025         {
1026           dest->alloc = dest->nelem = 0;
1027           return REG_ESPACE;
1028         }
1029       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1030     }
1031   else
1032     re_node_set_init_empty (dest);
1033   return REG_NOERROR;
1034 }
1035
1036 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1037    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1038    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1039
1040 static reg_errcode_t
1041 __attribute_warn_unused_result__
1042 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1043                            const re_node_set *src2)
1044 {
1045   int i1, i2, is, id, delta, sbase;
1046   if (src1->nelem == 0 || src2->nelem == 0)
1047     return REG_NOERROR;
1048
1049   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1050      conservative estimate.  */
1051   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1052     {
1053       int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1054       int *new_elems = re_realloc (dest->elems, int, new_alloc);
1055       if (BE (new_elems == NULL, 0))
1056         return REG_ESPACE;
1057       dest->elems = new_elems;
1058       dest->alloc = new_alloc;
1059     }
1060
1061   /* Find the items in the intersection of SRC1 and SRC2, and copy
1062      into the top of DEST those that are not already in DEST itself.  */
1063   sbase = dest->nelem + src1->nelem + src2->nelem;
1064   i1 = src1->nelem - 1;
1065   i2 = src2->nelem - 1;
1066   id = dest->nelem - 1;
1067   for (;;)
1068     {
1069       if (src1->elems[i1] == src2->elems[i2])
1070         {
1071           /* Try to find the item in DEST.  Maybe we could binary search?  */
1072           while (id >= 0 && dest->elems[id] > src1->elems[i1])
1073             --id;
1074
1075           if (id < 0 || dest->elems[id] != src1->elems[i1])
1076             dest->elems[--sbase] = src1->elems[i1];
1077
1078           if (--i1 < 0 || --i2 < 0)
1079             break;
1080         }
1081
1082       /* Lower the highest of the two items.  */
1083       else if (src1->elems[i1] < src2->elems[i2])
1084         {
1085           if (--i2 < 0)
1086             break;
1087         }
1088       else
1089         {
1090           if (--i1 < 0)
1091             break;
1092         }
1093     }
1094
1095   id = dest->nelem - 1;
1096   is = dest->nelem + src1->nelem + src2->nelem - 1;
1097   delta = is - sbase + 1;
1098
1099   /* Now copy.  When DELTA becomes zero, the remaining
1100      DEST elements are already in place; this is more or
1101      less the same loop that is in re_node_set_merge.  */
1102   dest->nelem += delta;
1103   if (delta > 0 && id >= 0)
1104     for (;;)
1105       {
1106         if (dest->elems[is] > dest->elems[id])
1107           {
1108             /* Copy from the top.  */
1109             dest->elems[id + delta--] = dest->elems[is--];
1110             if (delta == 0)
1111               break;
1112           }
1113         else
1114           {
1115             /* Slide from the bottom.  */
1116             dest->elems[id + delta] = dest->elems[id];
1117             if (--id < 0)
1118               break;
1119           }
1120       }
1121
1122   /* Copy remaining SRC elements.  */
1123   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1124
1125   return REG_NOERROR;
1126 }
1127
1128 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1129    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1130
1131 static reg_errcode_t
1132 __attribute_warn_unused_result__
1133 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1134                         const re_node_set *src2)
1135 {
1136   int i1, i2, id;
1137   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1138     {
1139       dest->alloc = src1->nelem + src2->nelem;
1140       dest->elems = re_malloc (int, dest->alloc);
1141       if (BE (dest->elems == NULL, 0))
1142         return REG_ESPACE;
1143     }
1144   else
1145     {
1146       if (src1 != NULL && src1->nelem > 0)
1147         return re_node_set_init_copy (dest, src1);
1148       else if (src2 != NULL && src2->nelem > 0)
1149         return re_node_set_init_copy (dest, src2);
1150       else
1151         re_node_set_init_empty (dest);
1152       return REG_NOERROR;
1153     }
1154   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1155     {
1156       if (src1->elems[i1] > src2->elems[i2])
1157         {
1158           dest->elems[id++] = src2->elems[i2++];
1159           continue;
1160         }
1161       if (src1->elems[i1] == src2->elems[i2])
1162         ++i2;
1163       dest->elems[id++] = src1->elems[i1++];
1164     }
1165   if (i1 < src1->nelem)
1166     {
1167       memcpy (dest->elems + id, src1->elems + i1,
1168              (src1->nelem - i1) * sizeof (int));
1169       id += src1->nelem - i1;
1170     }
1171   else if (i2 < src2->nelem)
1172     {
1173       memcpy (dest->elems + id, src2->elems + i2,
1174              (src2->nelem - i2) * sizeof (int));
1175       id += src2->nelem - i2;
1176     }
1177   dest->nelem = id;
1178   return REG_NOERROR;
1179 }
1180
1181 /* Calculate the union set of the sets DEST and SRC. And store it to
1182    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1183
1184 static reg_errcode_t
1185 __attribute_warn_unused_result__
1186 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1187 {
1188   int is, id, sbase, delta;
1189   if (src == NULL || src->nelem == 0)
1190     return REG_NOERROR;
1191   if (dest->alloc < 2 * src->nelem + dest->nelem)
1192     {
1193       int new_alloc = 2 * (src->nelem + dest->alloc);
1194       int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1195       if (BE (new_buffer == NULL, 0))
1196         return REG_ESPACE;
1197       dest->elems = new_buffer;
1198       dest->alloc = new_alloc;
1199     }
1200
1201   if (BE (dest->nelem == 0, 0))
1202     {
1203       dest->nelem = src->nelem;
1204       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1205       return REG_NOERROR;
1206     }
1207
1208   /* Copy into the top of DEST the items of SRC that are not
1209      found in DEST.  Maybe we could binary search in DEST?  */
1210   for (sbase = dest->nelem + 2 * src->nelem,
1211        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1212     {
1213       if (dest->elems[id] == src->elems[is])
1214         is--, id--;
1215       else if (dest->elems[id] < src->elems[is])
1216         dest->elems[--sbase] = src->elems[is--];
1217       else /* if (dest->elems[id] > src->elems[is]) */
1218         --id;
1219     }
1220
1221   if (is >= 0)
1222     {
1223       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1224       sbase -= is + 1;
1225       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1226     }
1227
1228   id = dest->nelem - 1;
1229   is = dest->nelem + 2 * src->nelem - 1;
1230   delta = is - sbase + 1;
1231   if (delta == 0)
1232     return REG_NOERROR;
1233
1234   /* Now copy.  When DELTA becomes zero, the remaining
1235      DEST elements are already in place.  */
1236   dest->nelem += delta;
1237   for (;;)
1238     {
1239       if (dest->elems[is] > dest->elems[id])
1240         {
1241           /* Copy from the top.  */
1242           dest->elems[id + delta--] = dest->elems[is--];
1243           if (delta == 0)
1244             break;
1245         }
1246       else
1247         {
1248           /* Slide from the bottom.  */
1249           dest->elems[id + delta] = dest->elems[id];
1250           if (--id < 0)
1251             {
1252               /* Copy remaining SRC elements.  */
1253               memcpy (dest->elems, dest->elems + sbase,
1254                       delta * sizeof (int));
1255               break;
1256             }
1257         }
1258     }
1259
1260   return REG_NOERROR;
1261 }
1262
1263 /* Insert the new element ELEM to the re_node_set* SET.
1264    SET should not already have ELEM.
1265    return -1 if an error is occured, return 1 otherwise.  */
1266
1267 static int
1268 __attribute_warn_unused_result__
1269 re_node_set_insert (re_node_set *set, int elem)
1270 {
1271   int idx;
1272   /* In case the set is empty.  */
1273   if (set->alloc == 0)
1274     {
1275       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1276         return 1;
1277       else
1278         return -1;
1279     }
1280
1281   if (BE (set->nelem, 0) == 0)
1282     {
1283       /* We already guaranteed above that set->alloc != 0.  */
1284       set->elems[0] = elem;
1285       ++set->nelem;
1286       return 1;
1287     }
1288
1289   /* Realloc if we need.  */
1290   if (set->alloc == set->nelem)
1291     {
1292       int *new_elems;
1293       set->alloc = set->alloc * 2;
1294       new_elems = re_realloc (set->elems, int, set->alloc);
1295       if (BE (new_elems == NULL, 0))
1296         return -1;
1297       set->elems = new_elems;
1298     }
1299
1300   /* Move the elements which follows the new element.  Test the
1301      first element separately to skip a check in the inner loop.  */
1302   if (elem < set->elems[0])
1303     {
1304       idx = 0;
1305       for (idx = set->nelem; idx > 0; idx--)
1306         set->elems[idx] = set->elems[idx - 1];
1307     }
1308   else
1309     {
1310       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1311         set->elems[idx] = set->elems[idx - 1];
1312     }
1313
1314   /* Insert the new element.  */
1315   set->elems[idx] = elem;
1316   ++set->nelem;
1317   return 1;
1318 }
1319
1320 /* Insert the new element ELEM to the re_node_set* SET.
1321    SET should not already have any element greater than or equal to ELEM.
1322    Return -1 if an error is occured, return 1 otherwise.  */
1323
1324 static int
1325 __attribute_warn_unused_result__
1326 re_node_set_insert_last (re_node_set *set, int elem)
1327 {
1328   /* Realloc if we need.  */
1329   if (set->alloc == set->nelem)
1330     {
1331       int *new_elems;
1332       set->alloc = (set->alloc + 1) * 2;
1333       new_elems = re_realloc (set->elems, int, set->alloc);
1334       if (BE (new_elems == NULL, 0))
1335         return -1;
1336       set->elems = new_elems;
1337     }
1338
1339   /* Insert the new element.  */
1340   set->elems[set->nelem++] = elem;
1341   return 1;
1342 }
1343
1344 /* Compare two node sets SET1 and SET2.
1345    return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1346
1347 static int
1348 __attribute ((pure))
1349 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1350 {
1351   int i;
1352   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1353     return 0;
1354   for (i = set1->nelem ; --i >= 0 ; )
1355     if (set1->elems[i] != set2->elems[i])
1356       return 0;
1357   return 1;
1358 }
1359
1360 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1361
1362 static int
1363 __attribute ((pure))
1364 re_node_set_contains (const re_node_set *set, int elem)
1365 {
1366   unsigned int idx, right, mid;
1367   if (set->nelem <= 0)
1368     return 0;
1369
1370   /* Binary search the element.  */
1371   idx = 0;
1372   right = set->nelem - 1;
1373   while (idx < right)
1374     {
1375       mid = (idx + right) / 2;
1376       if (set->elems[mid] < elem)
1377         idx = mid + 1;
1378       else
1379         right = mid;
1380     }
1381   return set->elems[idx] == elem ? idx + 1 : 0;
1382 }
1383
1384 static void
1385 re_node_set_remove_at (re_node_set *set, int idx)
1386 {
1387   if (idx < 0 || idx >= set->nelem)
1388     return;
1389   --set->nelem;
1390   for (; idx < set->nelem; idx++)
1391     set->elems[idx] = set->elems[idx + 1];
1392 }
1393 \f
1394
1395 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1396    Or return -1, if an error will be occured.  */
1397
1398 static int
1399 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1400 {
1401   int type = token.type;
1402   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1403     {
1404       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1405       int *new_nexts, *new_indices;
1406       re_node_set *new_edests, *new_eclosures;
1407       re_token_t *new_nodes;
1408
1409       /* Avoid overflows in realloc.  */
1410       const size_t max_object_size = MAX (sizeof (re_token_t),
1411                                           MAX (sizeof (re_node_set),
1412                                                sizeof (int)));
1413       if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0))
1414         return -1;
1415
1416       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1417       if (BE (new_nodes == NULL, 0))
1418         return -1;
1419       dfa->nodes = new_nodes;
1420       new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1421       new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1422       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1423       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1424       if (BE (new_nexts == NULL || new_indices == NULL
1425               || new_edests == NULL || new_eclosures == NULL, 0))
1426         return -1;
1427       dfa->nexts = new_nexts;
1428       dfa->org_indices = new_indices;
1429       dfa->edests = new_edests;
1430       dfa->eclosures = new_eclosures;
1431       dfa->nodes_alloc = new_nodes_alloc;
1432     }
1433   dfa->nodes[dfa->nodes_len] = token;
1434   dfa->nodes[dfa->nodes_len].constraint = 0;
1435 #ifdef RE_ENABLE_I18N
1436   dfa->nodes[dfa->nodes_len].accept_mb =
1437     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1438 #endif
1439   dfa->nexts[dfa->nodes_len] = -1;
1440   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1441   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1442   return dfa->nodes_len++;
1443 }
1444
1445 static inline unsigned int
1446 calc_state_hash (const re_node_set *nodes, unsigned int context)
1447 {
1448   unsigned int hash = nodes->nelem + context;
1449   int i;
1450   for (i = 0 ; i < nodes->nelem ; i++)
1451     hash += nodes->elems[i];
1452   return hash;
1453 }
1454
1455 /* Search for the state whose node_set is equivalent to NODES.
1456    Return the pointer to the state, if we found it in the DFA.
1457    Otherwise create the new one and return it.  In case of an error
1458    return NULL and set the error code in ERR.
1459    Note: - We assume NULL as the invalid state, then it is possible that
1460            return value is NULL and ERR is REG_NOERROR.
1461          - We never return non-NULL value in case of any errors, it is for
1462            optimization.  */
1463
1464 static re_dfastate_t *
1465 __attribute_warn_unused_result__
1466 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1467                   const re_node_set *nodes)
1468 {
1469   unsigned int hash;
1470   re_dfastate_t *new_state;
1471   struct re_state_table_entry *spot;
1472   int i;
1473   if (BE (nodes->nelem == 0, 0))
1474     {
1475       *err = REG_NOERROR;
1476       return NULL;
1477     }
1478   hash = calc_state_hash (nodes, 0);
1479   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1480
1481   for (i = 0 ; i < spot->num ; i++)
1482     {
1483       re_dfastate_t *state = spot->array[i];
1484       if (hash != state->hash)
1485         continue;
1486       if (re_node_set_compare (&state->nodes, nodes))
1487         return state;
1488     }
1489
1490   /* There are no appropriate state in the dfa, create the new one.  */
1491   new_state = create_ci_newstate (dfa, nodes, hash);
1492   if (BE (new_state == NULL, 0))
1493     *err = REG_ESPACE;
1494
1495   return new_state;
1496 }
1497
1498 /* Search for the state whose node_set is equivalent to NODES and
1499    whose context is equivalent to CONTEXT.
1500    Return the pointer to the state, if we found it in the DFA.
1501    Otherwise create the new one and return it.  In case of an error
1502    return NULL and set the error code in ERR.
1503    Note: - We assume NULL as the invalid state, then it is possible that
1504            return value is NULL and ERR is REG_NOERROR.
1505          - We never return non-NULL value in case of any errors, it is for
1506            optimization.  */
1507
1508 static re_dfastate_t *
1509 __attribute_warn_unused_result__
1510 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1511                           const re_node_set *nodes, unsigned int context)
1512 {
1513   unsigned int hash;
1514   re_dfastate_t *new_state;
1515   struct re_state_table_entry *spot;
1516   int i;
1517   if (nodes->nelem == 0)
1518     {
1519       *err = REG_NOERROR;
1520       return NULL;
1521     }
1522   hash = calc_state_hash (nodes, context);
1523   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1524
1525   for (i = 0 ; i < spot->num ; i++)
1526     {
1527       re_dfastate_t *state = spot->array[i];
1528       if (state->hash == hash
1529           && state->context == context
1530           && re_node_set_compare (state->entrance_nodes, nodes))
1531         return state;
1532     }
1533   /* There are no appropriate state in `dfa', create the new one.  */
1534   new_state = create_cd_newstate (dfa, nodes, context, hash);
1535   if (BE (new_state == NULL, 0))
1536     *err = REG_ESPACE;
1537
1538   return new_state;
1539 }
1540
1541 /* Finish initialization of the new state NEWSTATE, and using its hash value
1542    HASH put in the appropriate bucket of DFA's state table.  Return value
1543    indicates the error code if failed.  */
1544
1545 static reg_errcode_t
1546 __attribute_warn_unused_result__
1547 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1548                 unsigned int hash)
1549 {
1550   struct re_state_table_entry *spot;
1551   reg_errcode_t err;
1552   int i;
1553
1554   newstate->hash = hash;
1555   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1556   if (BE (err != REG_NOERROR, 0))
1557     return REG_ESPACE;
1558   for (i = 0; i < newstate->nodes.nelem; i++)
1559     {
1560       int elem = newstate->nodes.elems[i];
1561       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1562         if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0)
1563           return REG_ESPACE;
1564     }
1565
1566   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1567   if (BE (spot->alloc <= spot->num, 0))
1568     {
1569       int new_alloc = 2 * spot->num + 2;
1570       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1571                                               new_alloc);
1572       if (BE (new_array == NULL, 0))
1573         return REG_ESPACE;
1574       spot->array = new_array;
1575       spot->alloc = new_alloc;
1576     }
1577   spot->array[spot->num++] = newstate;
1578   return REG_NOERROR;
1579 }
1580
1581 static void
1582 free_state (re_dfastate_t *state)
1583 {
1584   re_node_set_free (&state->non_eps_nodes);
1585   re_node_set_free (&state->inveclosure);
1586   if (state->entrance_nodes != &state->nodes)
1587     {
1588       re_node_set_free (state->entrance_nodes);
1589       re_free (state->entrance_nodes);
1590     }
1591   re_node_set_free (&state->nodes);
1592   re_free (state->word_trtable);
1593   re_free (state->trtable);
1594   re_free (state);
1595 }
1596
1597 /* Create the new state which is independent of contexts.
1598    Return the new state if succeeded, otherwise return NULL.  */
1599
1600 static re_dfastate_t *
1601 __attribute_warn_unused_result__
1602 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1603                     unsigned int hash)
1604 {
1605   int i;
1606   reg_errcode_t err;
1607   re_dfastate_t *newstate;
1608
1609   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1610   if (BE (newstate == NULL, 0))
1611     return NULL;
1612   err = re_node_set_init_copy (&newstate->nodes, nodes);
1613   if (BE (err != REG_NOERROR, 0))
1614     {
1615       re_free (newstate);
1616       return NULL;
1617     }
1618
1619   newstate->entrance_nodes = &newstate->nodes;
1620   for (i = 0 ; i < nodes->nelem ; i++)
1621     {
1622       re_token_t *node = dfa->nodes + nodes->elems[i];
1623       re_token_type_t type = node->type;
1624       if (type == CHARACTER && !node->constraint)
1625         continue;
1626 #ifdef RE_ENABLE_I18N
1627       newstate->accept_mb |= node->accept_mb;
1628 #endif /* RE_ENABLE_I18N */
1629
1630       /* If the state has the halt node, the state is a halt state.  */
1631       if (type == END_OF_RE)
1632         newstate->halt = 1;
1633       else if (type == OP_BACK_REF)
1634         newstate->has_backref = 1;
1635       else if (type == ANCHOR || node->constraint)
1636         newstate->has_constraint = 1;
1637     }
1638   err = register_state (dfa, newstate, hash);
1639   if (BE (err != REG_NOERROR, 0))
1640     {
1641       free_state (newstate);
1642       newstate = NULL;
1643     }
1644   return newstate;
1645 }
1646
1647 /* Create the new state which is depend on the context CONTEXT.
1648    Return the new state if succeeded, otherwise return NULL.  */
1649
1650 static re_dfastate_t *
1651 __attribute_warn_unused_result__
1652 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1653                     unsigned int context, unsigned int hash)
1654 {
1655   int i, nctx_nodes = 0;
1656   reg_errcode_t err;
1657   re_dfastate_t *newstate;
1658
1659   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1660   if (BE (newstate == NULL, 0))
1661     return NULL;
1662   err = re_node_set_init_copy (&newstate->nodes, nodes);
1663   if (BE (err != REG_NOERROR, 0))
1664     {
1665       re_free (newstate);
1666       return NULL;
1667     }
1668
1669   newstate->context = context;
1670   newstate->entrance_nodes = &newstate->nodes;
1671
1672   for (i = 0 ; i < nodes->nelem ; i++)
1673     {
1674       re_token_t *node = dfa->nodes + nodes->elems[i];
1675       re_token_type_t type = node->type;
1676       unsigned int constraint = node->constraint;
1677
1678       if (type == CHARACTER && !constraint)
1679         continue;
1680 #ifdef RE_ENABLE_I18N
1681       newstate->accept_mb |= node->accept_mb;
1682 #endif /* RE_ENABLE_I18N */
1683
1684       /* If the state has the halt node, the state is a halt state.  */
1685       if (type == END_OF_RE)
1686         newstate->halt = 1;
1687       else if (type == OP_BACK_REF)
1688         newstate->has_backref = 1;
1689
1690       if (constraint)
1691         {
1692           if (newstate->entrance_nodes == &newstate->nodes)
1693             {
1694               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1695               if (BE (newstate->entrance_nodes == NULL, 0))
1696                 {
1697                   free_state (newstate);
1698                   return NULL;
1699                 }
1700               if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1701                   != REG_NOERROR)
1702                 return NULL;
1703               nctx_nodes = 0;
1704               newstate->has_constraint = 1;
1705             }
1706
1707           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1708             {
1709               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1710               ++nctx_nodes;
1711             }
1712         }
1713     }
1714   err = register_state (dfa, newstate, hash);
1715   if (BE (err != REG_NOERROR, 0))
1716     {
1717       free_state (newstate);
1718       newstate = NULL;
1719     }
1720   return  newstate;
1721 }