lib/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2017 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <http://www.gnu.org/licenses/>.  */
  19
  20 static void re_string_construct_common (const char *str, Idx len,
  21                                         re_string_t *pstr,
  22                                         RE_TRANSLATE_TYPE trans, bool icase,
  23                                         const re_dfa_t *dfa) internal_function;
  24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  25                                           const re_node_set *nodes,
  26                                           re_hashval_t hash) internal_function;
  27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  28                                           const re_node_set *nodes,
  29                                           unsigned int context,
  30                                           re_hashval_t hash) internal_function;
  31 \f
  32 /* Functions for string operation.  */
  33
  34 /* This function allocate the buffers.  It is necessary to call
  35    re_string_reconstruct before using the object.  */
  36
  37 static reg_errcode_t
  38 internal_function __attribute_warn_unused_result__
  39 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
  40                     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  41 {
  42   reg_errcode_t ret;
  43   Idx init_buf_len;
  44
  45   /* Ensure at least one character fits into the buffers.  */
  46   if (init_len < dfa->mb_cur_max)
  47     init_len = dfa->mb_cur_max;
  48   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  49   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  50
  51   ret = re_string_realloc_buffers (pstr, init_buf_len);
  52   if (BE (ret != REG_NOERROR, 0))
  53     return ret;
  54
  55   pstr->word_char = dfa->word_char;
  56   pstr->word_ops_used = dfa->word_ops_used;
  57   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  58   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  59   pstr->valid_raw_len = pstr->valid_len;
  60   return REG_NOERROR;
  61 }
  62
  63 /* This function allocate the buffers, and initialize them.  */
  64
  65 static reg_errcode_t
  66 internal_function __attribute_warn_unused_result__
  67 re_string_construct (re_string_t *pstr, const char *str, Idx len,
  68                      RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  69 {
  70   reg_errcode_t ret;
  71   memset (pstr, '\0', sizeof (re_string_t));
  72   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  73
  74   if (len > 0)
  75     {
  76       ret = re_string_realloc_buffers (pstr, len + 1);
  77       if (BE (ret != REG_NOERROR, 0))
  78         return ret;
  79     }
  80   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  81
  82   if (icase)
  83     {
  84 #ifdef RE_ENABLE_I18N
  85       if (dfa->mb_cur_max > 1)
  86         {
  87           while (1)
  88             {
  89               ret = build_wcs_upper_buffer (pstr);
  90               if (BE (ret != REG_NOERROR, 0))
  91                 return ret;
  92               if (pstr->valid_raw_len >= len)
  93                 break;
  94               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
  95                 break;
  96               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
  97               if (BE (ret != REG_NOERROR, 0))
  98                 return ret;
  99             }
 100         }
 101       else
 102 #endif /* RE_ENABLE_I18N  */
 103         build_upper_buffer (pstr);
 104     }
 105   else
 106     {
 107 #ifdef RE_ENABLE_I18N
 108       if (dfa->mb_cur_max > 1)
 109         build_wcs_buffer (pstr);
 110       else
 111 #endif /* RE_ENABLE_I18N  */
 112         {
 113           if (trans != NULL)
 114             re_string_translate_buffer (pstr);
 115           else
 116             {
 117               pstr->valid_len = pstr->bufs_len;
 118               pstr->valid_raw_len = pstr->bufs_len;
 119             }
 120         }
 121     }
 122
 123   return REG_NOERROR;
 124 }
 125
 126 /* Helper functions for re_string_allocate, and re_string_construct.  */
 127
 128 static reg_errcode_t
 129 internal_function __attribute_warn_unused_result__
 130 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
 131 {
 132 #ifdef RE_ENABLE_I18N
 133   if (pstr->mb_cur_max > 1)
 134     {
 135       wint_t *new_wcs;
 136
 137       /* Avoid overflow in realloc.  */
 138       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
 139       if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0))
 140         return REG_ESPACE;
 141
 142       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 143       if (BE (new_wcs == NULL, 0))
 144         return REG_ESPACE;
 145       pstr->wcs = new_wcs;
 146       if (pstr->offsets != NULL)
 147         {
 148           Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
 149           if (BE (new_offsets == NULL, 0))
 150             return REG_ESPACE;
 151           pstr->offsets = new_offsets;
 152         }
 153     }
 154 #endif /* RE_ENABLE_I18N  */
 155   if (pstr->mbs_allocated)
 156     {
 157       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 158                                            new_buf_len);
 159       if (BE (new_mbs == NULL, 0))
 160         return REG_ESPACE;
 161       pstr->mbs = new_mbs;
 162     }
 163   pstr->bufs_len = new_buf_len;
 164   return REG_NOERROR;
 165 }
 166
 167
 168 static void
 169 internal_function
 170 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
 171                             RE_TRANSLATE_TYPE trans, bool icase,
 172                             const re_dfa_t *dfa)
 173 {
 174   pstr->raw_mbs = (const unsigned char *) str;
 175   pstr->len = len;
 176   pstr->raw_len = len;
 177   pstr->trans = trans;
 178   pstr->icase = icase;
 179   pstr->mbs_allocated = (trans != NULL || icase);
 180   pstr->mb_cur_max = dfa->mb_cur_max;
 181   pstr->is_utf8 = dfa->is_utf8;
 182   pstr->map_notascii = dfa->map_notascii;
 183   pstr->stop = pstr->len;
 184   pstr->raw_stop = pstr->stop;
 185 }
 186
 187 #ifdef RE_ENABLE_I18N
 188
 189 /* Build wide character buffer PSTR->WCS.
 190    If the byte sequence of the string are:
 191      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 192    Then wide character buffer will be:
 193      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 194    We use WEOF for padding, they indicate that the position isn't
 195    a first byte of a multibyte character.
 196
 197    Note that this function assumes PSTR->VALID_LEN elements are already
 198    built and starts from PSTR->VALID_LEN.  */
 199
 200 static void
 201 internal_function
 202 build_wcs_buffer (re_string_t *pstr)
 203 {
 204 #ifdef _LIBC
 205   unsigned char buf[MB_LEN_MAX];
 206   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 207 #else
 208   unsigned char buf[64];
 209 #endif
 210   mbstate_t prev_st;
 211   Idx byte_idx, end_idx, remain_len;
 212   size_t mbclen;
 213
 214   /* Build the buffers from pstr->valid_len to either pstr->len or
 215      pstr->bufs_len.  */
 216   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 217   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 218     {
 219       wchar_t wc;
 220       const char *p;
 221
 222       remain_len = end_idx - byte_idx;
 223       prev_st = pstr->cur_state;
 224       /* Apply the translation if we need.  */
 225       if (BE (pstr->trans != NULL, 0))
 226         {
 227           int i, ch;
 228
 229           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 230             {
 231               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 232               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 233             }
 234           p = (const char *) buf;
 235         }
 236       else
 237         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 238       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 239       if (BE (mbclen == (size_t) -1 || mbclen == 0
 240               || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
 241         {
 242           /* We treat these cases as a singlebyte character.  */
 243           mbclen = 1;
 244           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 245           if (BE (pstr->trans != NULL, 0))
 246             wc = pstr->trans[wc];
 247           pstr->cur_state = prev_st;
 248         }
 249       else if (BE (mbclen == (size_t) -2, 0))
 250         {
 251           /* The buffer doesn't have enough space, finish to build.  */
 252           pstr->cur_state = prev_st;
 253           break;
 254         }
 255
 256       /* Write wide character and padding.  */
 257       pstr->wcs[byte_idx++] = wc;
 258       /* Write paddings.  */
 259       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 260         pstr->wcs[byte_idx++] = WEOF;
 261     }
 262   pstr->valid_len = byte_idx;
 263   pstr->valid_raw_len = byte_idx;
 264 }
 265
 266 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 267    but for REG_ICASE.  */
 268
 269 static reg_errcode_t
 270 internal_function __attribute_warn_unused_result__
 271 build_wcs_upper_buffer (re_string_t *pstr)
 272 {
 273   mbstate_t prev_st;
 274   Idx src_idx, byte_idx, end_idx, remain_len;
 275   size_t mbclen;
 276 #ifdef _LIBC
 277   char buf[MB_LEN_MAX];
 278   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 279 #else
 280   char buf[64];
 281 #endif
 282
 283   byte_idx = pstr->valid_len;
 284   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 285
 286   /* The following optimization assumes that ASCII characters can be
 287      mapped to wide characters with a simple cast.  */
 288   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 289     {
 290       while (byte_idx < end_idx)
 291         {
 292           wchar_t wc;
 293
 294           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 295               && mbsinit (&pstr->cur_state))
 296             {
 297               /* In case of a singlebyte character.  */
 298               pstr->mbs[byte_idx]
 299                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 300               /* The next step uses the assumption that wchar_t is encoded
 301                  ASCII-safe: all ASCII values can be converted like this.  */
 302               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 303               ++byte_idx;
 304               continue;
 305             }
 306
 307           remain_len = end_idx - byte_idx;
 308           prev_st = pstr->cur_state;
 309           mbclen = __mbrtowc (&wc,
 310                               ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 311                                + byte_idx), remain_len, &pstr->cur_state);
 312           if (BE (mbclen < (size_t) -2, 1))
 313             {
 314               wchar_t wcu = __towupper (wc);
 315               if (wcu != wc)
 316                 {
 317                   size_t mbcdlen;
 318
 319                   mbcdlen = __wcrtomb (buf, wcu, &prev_st);
 320                   if (BE (mbclen == mbcdlen, 1))
 321                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 322                   else
 323                     {
 324                       src_idx = byte_idx;
 325                       goto offsets_needed;
 326                     }
 327                 }
 328               else
 329                 memcpy (pstr->mbs + byte_idx,
 330                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 331               pstr->wcs[byte_idx++] = wcu;
 332               /* Write paddings.  */
 333               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 334                 pstr->wcs[byte_idx++] = WEOF;
 335             }
 336           else if (mbclen == (size_t) -1 || mbclen == 0
 337                    || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 338             {
 339               /* It is an invalid character, an incomplete character
 340                  at the end of the string, or '\0'.  Just use the byte.  */
 341               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 342               pstr->mbs[byte_idx] = ch;
 343               /* And also cast it to wide char.  */
 344               pstr->wcs[byte_idx++] = (wchar_t) ch;
 345               if (BE (mbclen == (size_t) -1, 0))
 346                 pstr->cur_state = prev_st;
 347             }
 348           else
 349             {
 350               /* The buffer doesn't have enough space, finish to build.  */
 351               pstr->cur_state = prev_st;
 352               break;
 353             }
 354         }
 355       pstr->valid_len = byte_idx;
 356       pstr->valid_raw_len = byte_idx;
 357       return REG_NOERROR;
 358     }
 359   else
 360     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 361       {
 362         wchar_t wc;
 363         const char *p;
 364       offsets_needed:
 365         remain_len = end_idx - byte_idx;
 366         prev_st = pstr->cur_state;
 367         if (BE (pstr->trans != NULL, 0))
 368           {
 369             int i, ch;
 370
 371             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 372               {
 373                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 374                 buf[i] = pstr->trans[ch];
 375               }
 376             p = (const char *) buf;
 377           }
 378         else
 379           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 380         mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 381         if (BE (mbclen < (size_t) -2, 1))
 382           {
 383             wchar_t wcu = __towupper (wc);
 384             if (wcu != wc)
 385               {
 386                 size_t mbcdlen;
 387
 388                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 389                 if (BE (mbclen == mbcdlen, 1))
 390                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 391                 else if (mbcdlen != (size_t) -1)
 392                   {
 393                     size_t i;
 394
 395                     if (byte_idx + mbcdlen > pstr->bufs_len)
 396                       {
 397                         pstr->cur_state = prev_st;
 398                         break;
 399                       }
 400
 401                     if (pstr->offsets == NULL)
 402                       {
 403                         pstr->offsets = re_malloc (Idx, pstr->bufs_len);
 404
 405                         if (pstr->offsets == NULL)
 406                           return REG_ESPACE;
 407                       }
 408                     if (!pstr->offsets_needed)
 409                       {
 410                         for (i = 0; i < (size_t) byte_idx; ++i)
 411                           pstr->offsets[i] = i;
 412                         pstr->offsets_needed = 1;
 413                       }
 414
 415                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 416                     pstr->wcs[byte_idx] = wcu;
 417                     pstr->offsets[byte_idx] = src_idx;
 418                     for (i = 1; i < mbcdlen; ++i)
 419                       {
 420                         pstr->offsets[byte_idx + i]
 421                           = src_idx + (i < mbclen ? i : mbclen - 1);
 422                         pstr->wcs[byte_idx + i] = WEOF;
 423                       }
 424                     pstr->len += mbcdlen - mbclen;
 425                     if (pstr->raw_stop > src_idx)
 426                       pstr->stop += mbcdlen - mbclen;
 427                     end_idx = (pstr->bufs_len > pstr->len)
 428                               ? pstr->len : pstr->bufs_len;
 429                     byte_idx += mbcdlen;
 430                     src_idx += mbclen;
 431                     continue;
 432                   }
 433                 else
 434                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 435               }
 436             else
 437               memcpy (pstr->mbs + byte_idx, p, mbclen);
 438
 439             if (BE (pstr->offsets_needed != 0, 0))
 440               {
 441                 size_t i;
 442                 for (i = 0; i < mbclen; ++i)
 443                   pstr->offsets[byte_idx + i] = src_idx + i;
 444               }
 445             src_idx += mbclen;
 446
 447             pstr->wcs[byte_idx++] = wcu;
 448             /* Write paddings.  */
 449             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 450               pstr->wcs[byte_idx++] = WEOF;
 451           }
 452         else if (mbclen == (size_t) -1 || mbclen == 0
 453                  || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 454           {
 455             /* It is an invalid character or '\0'.  Just use the byte.  */
 456             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 457
 458             if (BE (pstr->trans != NULL, 0))
 459               ch = pstr->trans [ch];
 460             pstr->mbs[byte_idx] = ch;
 461
 462             if (BE (pstr->offsets_needed != 0, 0))
 463               pstr->offsets[byte_idx] = src_idx;
 464             ++src_idx;
 465
 466             /* And also cast it to wide char.  */
 467             pstr->wcs[byte_idx++] = (wchar_t) ch;
 468             if (BE (mbclen == (size_t) -1, 0))
 469               pstr->cur_state = prev_st;
 470           }
 471         else
 472           {
 473             /* The buffer doesn't have enough space, finish to build.  */
 474             pstr->cur_state = prev_st;
 475             break;
 476           }
 477       }
 478   pstr->valid_len = byte_idx;
 479   pstr->valid_raw_len = src_idx;
 480   return REG_NOERROR;
 481 }
 482
 483 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 484    Return the index.  */
 485
 486 static Idx
 487 internal_function
 488 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
 489 {
 490   mbstate_t prev_st;
 491   Idx rawbuf_idx;
 492   size_t mbclen;
 493   wint_t wc = WEOF;
 494
 495   /* Skip the characters which are not necessary to check.  */
 496   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 497        rawbuf_idx < new_raw_idx;)
 498     {
 499       wchar_t wc2;
 500       Idx remain_len = pstr->raw_len - rawbuf_idx;
 501       prev_st = pstr->cur_state;
 502       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
 503                           remain_len, &pstr->cur_state);
 504       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 505         {
 506           /* We treat these cases as a single byte character.  */
 507           if (mbclen == 0 || remain_len == 0)
 508             wc = L'\0';
 509           else
 510             wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 511           mbclen = 1;
 512           pstr->cur_state = prev_st;
 513         }
 514       else
 515         wc = wc2;
 516       /* Then proceed the next character.  */
 517       rawbuf_idx += mbclen;
 518     }
 519   *last_wc = wc;
 520   return rawbuf_idx;
 521 }
 522 #endif /* RE_ENABLE_I18N  */
 523
 524 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 525    This function is used in case of REG_ICASE.  */
 526
 527 static void
 528 internal_function
 529 build_upper_buffer (re_string_t *pstr)
 530 {
 531   Idx char_idx, end_idx;
 532   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 533
 534   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 535     {
 536       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 537       if (BE (pstr->trans != NULL, 0))
 538         ch = pstr->trans[ch];
 539       pstr->mbs[char_idx] = toupper (ch);
 540     }
 541   pstr->valid_len = char_idx;
 542   pstr->valid_raw_len = char_idx;
 543 }
 544
 545 /* Apply TRANS to the buffer in PSTR.  */
 546
 547 static void
 548 internal_function
 549 re_string_translate_buffer (re_string_t *pstr)
 550 {
 551   Idx buf_idx, end_idx;
 552   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 553
 554   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 555     {
 556       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 557       pstr->mbs[buf_idx] = pstr->trans[ch];
 558     }
 559
 560   pstr->valid_len = buf_idx;
 561   pstr->valid_raw_len = buf_idx;
 562 }
 563
 564 /* This function re-construct the buffers.
 565    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 566    convert to upper case in case of REG_ICASE, apply translation.  */
 567
 568 static reg_errcode_t
 569 internal_function __attribute_warn_unused_result__
 570 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 571 {
 572   Idx offset;
 573
 574   if (BE (pstr->raw_mbs_idx <= idx, 0))
 575     offset = idx - pstr->raw_mbs_idx;
 576   else
 577     {
 578       /* Reset buffer.  */
 579 #ifdef RE_ENABLE_I18N
 580       if (pstr->mb_cur_max > 1)
 581         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 582 #endif /* RE_ENABLE_I18N */
 583       pstr->len = pstr->raw_len;
 584       pstr->stop = pstr->raw_stop;
 585       pstr->valid_len = 0;
 586       pstr->raw_mbs_idx = 0;
 587       pstr->valid_raw_len = 0;
 588       pstr->offsets_needed = 0;
 589       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 590                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 591       if (!pstr->mbs_allocated)
 592         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 593       offset = idx;
 594     }
 595
 596   if (BE (offset != 0, 1))
 597     {
 598       /* Should the already checked characters be kept?  */
 599       if (BE (offset < pstr->valid_raw_len, 1))
 600         {
 601           /* Yes, move them to the front of the buffer.  */
 602 #ifdef RE_ENABLE_I18N
 603           if (BE (pstr->offsets_needed, 0))
 604             {
 605               Idx low = 0, high = pstr->valid_len, mid;
 606               do
 607                 {
 608                   mid = (high + low) / 2;
 609                   if (pstr->offsets[mid] > offset)
 610                     high = mid;
 611                   else if (pstr->offsets[mid] < offset)
 612                     low = mid + 1;
 613                   else
 614                     break;
 615                 }
 616               while (low < high);
 617               if (pstr->offsets[mid] < offset)
 618                 ++mid;
 619               pstr->tip_context = re_string_context_at (pstr, mid - 1,
 620                                                         eflags);
 621               /* This can be quite complicated, so handle specially
 622                  only the common and easy case where the character with
 623                  different length representation of lower and upper
 624                  case is present at or after offset.  */
 625               if (pstr->valid_len > offset
 626                   && mid == offset && pstr->offsets[mid] == offset)
 627                 {
 628                   memmove (pstr->wcs, pstr->wcs + offset,
 629                            (pstr->valid_len - offset) * sizeof (wint_t));
 630                   memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
 631                   pstr->valid_len -= offset;
 632                   pstr->valid_raw_len -= offset;
 633                   for (low = 0; low < pstr->valid_len; low++)
 634                     pstr->offsets[low] = pstr->offsets[low + offset] - offset;
 635                 }
 636               else
 637                 {
 638                   /* Otherwise, just find out how long the partial multibyte
 639                      character at offset is and fill it with WEOF/255.  */
 640                   pstr->len = pstr->raw_len - idx + offset;
 641                   pstr->stop = pstr->raw_stop - idx + offset;
 642                   pstr->offsets_needed = 0;
 643                   while (mid > 0 && pstr->offsets[mid - 1] == offset)
 644                     --mid;
 645                   while (mid < pstr->valid_len)
 646                     if (pstr->wcs[mid] != WEOF)
 647                       break;
 648                     else
 649                       ++mid;
 650                   if (mid == pstr->valid_len)
 651                     pstr->valid_len = 0;
 652                   else
 653                     {
 654                       pstr->valid_len = pstr->offsets[mid] - offset;
 655                       if (pstr->valid_len)
 656                         {
 657                           for (low = 0; low < pstr->valid_len; ++low)
 658                             pstr->wcs[low] = WEOF;
 659                           memset (pstr->mbs, 255, pstr->valid_len);
 660                         }
 661                     }
 662                   pstr->valid_raw_len = pstr->valid_len;
 663                 }
 664             }
 665           else
 666 #endif
 667             {
 668               pstr->tip_context = re_string_context_at (pstr, offset - 1,
 669                                                         eflags);
 670 #ifdef RE_ENABLE_I18N
 671               if (pstr->mb_cur_max > 1)
 672                 memmove (pstr->wcs, pstr->wcs + offset,
 673                          (pstr->valid_len - offset) * sizeof (wint_t));
 674 #endif /* RE_ENABLE_I18N */
 675               if (BE (pstr->mbs_allocated, 0))
 676                 memmove (pstr->mbs, pstr->mbs + offset,
 677                          pstr->valid_len - offset);
 678               pstr->valid_len -= offset;
 679               pstr->valid_raw_len -= offset;
 680 #if defined DEBUG && DEBUG
 681               assert (pstr->valid_len > 0);
 682 #endif
 683             }
 684         }
 685       else
 686         {
 687 #ifdef RE_ENABLE_I18N
 688           /* No, skip all characters until IDX.  */
 689           Idx prev_valid_len = pstr->valid_len;
 690
 691           if (BE (pstr->offsets_needed, 0))
 692             {
 693               pstr->len = pstr->raw_len - idx + offset;
 694               pstr->stop = pstr->raw_stop - idx + offset;
 695               pstr->offsets_needed = 0;
 696             }
 697 #endif
 698           pstr->valid_len = 0;
 699 #ifdef RE_ENABLE_I18N
 700           if (pstr->mb_cur_max > 1)
 701             {
 702               Idx wcs_idx;
 703               wint_t wc = WEOF;
 704
 705               if (pstr->is_utf8)
 706                 {
 707                   const unsigned char *raw, *p, *end;
 708
 709                   /* Special case UTF-8.  Multi-byte chars start with any
 710                      byte other than 0x80 - 0xbf.  */
 711                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 712                   end = raw + (offset - pstr->mb_cur_max);
 713                   if (end < pstr->raw_mbs)
 714                     end = pstr->raw_mbs;
 715                   p = raw + offset - 1;
 716 #ifdef _LIBC
 717                   /* We know the wchar_t encoding is UCS4, so for the simple
 718                      case, ASCII characters, skip the conversion step.  */
 719                   if (isascii (*p) && BE (pstr->trans == NULL, 1))
 720                     {
 721                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 722                       /* pstr->valid_len = 0; */
 723                       wc = (wchar_t) *p;
 724                     }
 725                   else
 726 #endif
 727                     for (; p >= end; --p)
 728                       if ((*p & 0xc0) != 0x80)
 729                         {
 730                           mbstate_t cur_state;
 731                           wchar_t wc2;
 732                           Idx mlen = raw + pstr->len - p;
 733                           unsigned char buf[6];
 734                           size_t mbclen;
 735
 736                           const unsigned char *pp = p;
 737                           if (BE (pstr->trans != NULL, 0))
 738                             {
 739                               int i = mlen < 6 ? mlen : 6;
 740                               while (--i >= 0)
 741                                 buf[i] = pstr->trans[p[i]];
 742                               pp = buf;
 743                             }
 744                           /* XXX Don't use mbrtowc, we know which conversion
 745                              to use (UTF-8 -> UCS4).  */
 746                           memset (&cur_state, 0, sizeof (cur_state));
 747                           mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
 748                                               &cur_state);
 749                           if (raw + offset - p <= mbclen
 750                               && mbclen < (size_t) -2)
 751                             {
 752                               memset (&pstr->cur_state, '\0',
 753                                       sizeof (mbstate_t));
 754                               pstr->valid_len = mbclen - (raw + offset - p);
 755                               wc = wc2;
 756                             }
 757                           break;
 758                         }
 759                 }
 760
 761               if (wc == WEOF)
 762                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 763               if (wc == WEOF)
 764                 pstr->tip_context
 765                   = re_string_context_at (pstr, prev_valid_len - 1, eflags);
 766               else
 767                 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 768                                       && IS_WIDE_WORD_CHAR (wc))
 769                                      ? CONTEXT_WORD
 770                                      : ((IS_WIDE_NEWLINE (wc)
 771                                          && pstr->newline_anchor)
 772                                         ? CONTEXT_NEWLINE : 0));
 773               if (BE (pstr->valid_len, 0))
 774                 {
 775                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 776                     pstr->wcs[wcs_idx] = WEOF;
 777                   if (pstr->mbs_allocated)
 778                     memset (pstr->mbs, 255, pstr->valid_len);
 779                 }
 780               pstr->valid_raw_len = pstr->valid_len;
 781             }
 782           else
 783 #endif /* RE_ENABLE_I18N */
 784             {
 785               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 786               pstr->valid_raw_len = 0;
 787               if (pstr->trans)
 788                 c = pstr->trans[c];
 789               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 790                                    ? CONTEXT_WORD
 791                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 792                                       ? CONTEXT_NEWLINE : 0));
 793             }
 794         }
 795       if (!BE (pstr->mbs_allocated, 0))
 796         pstr->mbs += offset;
 797     }
 798   pstr->raw_mbs_idx = idx;
 799   pstr->len -= offset;
 800   pstr->stop -= offset;
 801
 802   /* Then build the buffers.  */
 803 #ifdef RE_ENABLE_I18N
 804   if (pstr->mb_cur_max > 1)
 805     {
 806       if (pstr->icase)
 807         {
 808           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 809           if (BE (ret != REG_NOERROR, 0))
 810             return ret;
 811         }
 812       else
 813         build_wcs_buffer (pstr);
 814     }
 815   else
 816 #endif /* RE_ENABLE_I18N */
 817     if (BE (pstr->mbs_allocated, 0))
 818       {
 819         if (pstr->icase)
 820           build_upper_buffer (pstr);
 821         else if (pstr->trans != NULL)
 822           re_string_translate_buffer (pstr);
 823       }
 824     else
 825       pstr->valid_len = pstr->len;
 826
 827   pstr->cur_idx = 0;
 828   return REG_NOERROR;
 829 }
 830
 831 static unsigned char
 832 internal_function __attribute__ ((pure))
 833 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
 834 {
 835   int ch;
 836   Idx off;
 837
 838   /* Handle the common (easiest) cases first.  */
 839   if (BE (!pstr->mbs_allocated, 1))
 840     return re_string_peek_byte (pstr, idx);
 841
 842 #ifdef RE_ENABLE_I18N
 843   if (pstr->mb_cur_max > 1
 844       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 845     return re_string_peek_byte (pstr, idx);
 846 #endif
 847
 848   off = pstr->cur_idx + idx;
 849 #ifdef RE_ENABLE_I18N
 850   if (pstr->offsets_needed)
 851     off = pstr->offsets[off];
 852 #endif
 853
 854   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 855
 856 #ifdef RE_ENABLE_I18N
 857   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 858      this function returns CAPITAL LETTER I instead of first byte of
 859      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 860      since peek_byte_case doesn't advance cur_idx in any way.  */
 861   if (pstr->offsets_needed && !isascii (ch))
 862     return re_string_peek_byte (pstr, idx);
 863 #endif
 864
 865   return ch;
 866 }
 867
 868 static unsigned char
 869 internal_function
 870 re_string_fetch_byte_case (re_string_t *pstr)
 871 {
 872   if (BE (!pstr->mbs_allocated, 1))
 873     return re_string_fetch_byte (pstr);
 874
 875 #ifdef RE_ENABLE_I18N
 876   if (pstr->offsets_needed)
 877     {
 878       Idx off;
 879       int ch;
 880
 881       /* For tr_TR.UTF-8 [[:islower:]] there is
 882          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 883          in that case the whole multi-byte character and return
 884          the original letter.  On the other side, with
 885          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 886          anything else would complicate things too much.  */
 887
 888       if (!re_string_first_byte (pstr, pstr->cur_idx))
 889         return re_string_fetch_byte (pstr);
 890
 891       off = pstr->offsets[pstr->cur_idx];
 892       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 893
 894       if (! isascii (ch))
 895         return re_string_fetch_byte (pstr);
 896
 897       re_string_skip_bytes (pstr,
 898                             re_string_char_size_at (pstr, pstr->cur_idx));
 899       return ch;
 900     }
 901 #endif
 902
 903   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 904 }
 905
 906 static void
 907 internal_function
 908 re_string_destruct (re_string_t *pstr)
 909 {
 910 #ifdef RE_ENABLE_I18N
 911   re_free (pstr->wcs);
 912   re_free (pstr->offsets);
 913 #endif /* RE_ENABLE_I18N  */
 914   if (pstr->mbs_allocated)
 915     re_free (pstr->mbs);
 916 }
 917
 918 /* Return the context at IDX in INPUT.  */
 919
 920 static unsigned int
 921 internal_function
 922 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
 923 {
 924   int c;
 925   if (BE (idx < 0, 0))
 926     /* In this case, we use the value stored in input->tip_context,
 927        since we can't know the character in input->mbs[-1] here.  */
 928     return input->tip_context;
 929   if (BE (idx == input->len, 0))
 930     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 931             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 932 #ifdef RE_ENABLE_I18N
 933   if (input->mb_cur_max > 1)
 934     {
 935       wint_t wc;
 936       Idx wc_idx = idx;
 937       while(input->wcs[wc_idx] == WEOF)
 938         {
 939 #if defined DEBUG && DEBUG
 940           /* It must not happen.  */
 941           assert (wc_idx >= 0);
 942 #endif
 943           --wc_idx;
 944           if (wc_idx < 0)
 945             return input->tip_context;
 946         }
 947       wc = input->wcs[wc_idx];
 948       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 949         return CONTEXT_WORD;
 950       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 951               ? CONTEXT_NEWLINE : 0);
 952     }
 953   else
 954 #endif
 955     {
 956       c = re_string_byte_at (input, idx);
 957       if (bitset_contain (input->word_char, c))
 958         return CONTEXT_WORD;
 959       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 960     }
 961 }
 962 \f
 963 /* Functions for set operation.  */
 964
 965 static reg_errcode_t
 966 internal_function __attribute_warn_unused_result__
 967 re_node_set_alloc (re_node_set *set, Idx size)
 968 {
 969   set->alloc = size;
 970   set->nelem = 0;
 971   set->elems = re_malloc (Idx, size);
 972   if (BE (set->elems == NULL, 0) && (MALLOC_0_IS_NONNULL || size != 0))
 973     return REG_ESPACE;
 974   return REG_NOERROR;
 975 }
 976
 977 static reg_errcode_t
 978 internal_function __attribute_warn_unused_result__
 979 re_node_set_init_1 (re_node_set *set, Idx elem)
 980 {
 981   set->alloc = 1;
 982   set->nelem = 1;
 983   set->elems = re_malloc (Idx, 1);
 984   if (BE (set->elems == NULL, 0))
 985     {
 986       set->alloc = set->nelem = 0;
 987       return REG_ESPACE;
 988     }
 989   set->elems[0] = elem;
 990   return REG_NOERROR;
 991 }
 992
 993 static reg_errcode_t
 994 internal_function __attribute_warn_unused_result__
 995 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
 996 {
 997   set->alloc = 2;
 998   set->elems = re_malloc (Idx, 2);
 999   if (BE (set->elems == NULL, 0))
1000     return REG_ESPACE;
1001   if (elem1 == elem2)
1002     {
1003       set->nelem = 1;
1004       set->elems[0] = elem1;
1005     }
1006   else
1007     {
1008       set->nelem = 2;
1009       if (elem1 < elem2)
1010         {
1011           set->elems[0] = elem1;
1012           set->elems[1] = elem2;
1013         }
1014       else
1015         {
1016           set->elems[0] = elem2;
1017           set->elems[1] = elem1;
1018         }
1019     }
1020   return REG_NOERROR;
1021 }
1022
1023 static reg_errcode_t
1024 internal_function __attribute_warn_unused_result__
1025 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1026 {
1027   dest->nelem = src->nelem;
1028   if (src->nelem > 0)
1029     {
1030       dest->alloc = dest->nelem;
1031       dest->elems = re_malloc (Idx, dest->alloc);
1032       if (BE (dest->elems == NULL, 0))
1033         {
1034           dest->alloc = dest->nelem = 0;
1035           return REG_ESPACE;
1036         }
1037       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1038     }
1039   else
1040     re_node_set_init_empty (dest);
1041   return REG_NOERROR;
1042 }
1043
1044 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1045    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1046    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1047
1048 static reg_errcode_t
1049 internal_function __attribute_warn_unused_result__
1050 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1051                            const re_node_set *src2)
1052 {
1053   Idx i1, i2, is, id, delta, sbase;
1054   if (src1->nelem == 0 || src2->nelem == 0)
1055     return REG_NOERROR;
1056
1057   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1058      conservative estimate.  */
1059   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1060     {
1061       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1062       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1063       if (BE (new_elems == NULL, 0))
1064         return REG_ESPACE;
1065       dest->elems = new_elems;
1066       dest->alloc = new_alloc;
1067     }
1068
1069   /* Find the items in the intersection of SRC1 and SRC2, and copy
1070      into the top of DEST those that are not already in DEST itself.  */
1071   sbase = dest->nelem + src1->nelem + src2->nelem;
1072   i1 = src1->nelem - 1;
1073   i2 = src2->nelem - 1;
1074   id = dest->nelem - 1;
1075   for (;;)
1076     {
1077       if (src1->elems[i1] == src2->elems[i2])
1078         {
1079           /* Try to find the item in DEST.  Maybe we could binary search?  */
1080           while (id >= 0 && dest->elems[id] > src1->elems[i1])
1081             --id;
1082
1083           if (id < 0 || dest->elems[id] != src1->elems[i1])
1084             dest->elems[--sbase] = src1->elems[i1];
1085
1086           if (--i1 < 0 || --i2 < 0)
1087             break;
1088         }
1089
1090       /* Lower the highest of the two items.  */
1091       else if (src1->elems[i1] < src2->elems[i2])
1092         {
1093           if (--i2 < 0)
1094             break;
1095         }
1096       else
1097         {
1098           if (--i1 < 0)
1099             break;
1100         }
1101     }
1102
1103   id = dest->nelem - 1;
1104   is = dest->nelem + src1->nelem + src2->nelem - 1;
1105   delta = is - sbase + 1;
1106
1107   /* Now copy.  When DELTA becomes zero, the remaining
1108      DEST elements are already in place; this is more or
1109      less the same loop that is in re_node_set_merge.  */
1110   dest->nelem += delta;
1111   if (delta > 0 && id >= 0)
1112     for (;;)
1113       {
1114         if (dest->elems[is] > dest->elems[id])
1115           {
1116             /* Copy from the top.  */
1117             dest->elems[id + delta--] = dest->elems[is--];
1118             if (delta == 0)
1119               break;
1120           }
1121         else
1122           {
1123             /* Slide from the bottom.  */
1124             dest->elems[id + delta] = dest->elems[id];
1125             if (--id < 0)
1126               break;
1127           }
1128       }
1129
1130   /* Copy remaining SRC elements.  */
1131   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1132
1133   return REG_NOERROR;
1134 }
1135
1136 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1137    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1138
1139 static reg_errcode_t
1140 internal_function __attribute_warn_unused_result__
1141 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1142                         const re_node_set *src2)
1143 {
1144   Idx i1, i2, id;
1145   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1146     {
1147       dest->alloc = src1->nelem + src2->nelem;
1148       dest->elems = re_malloc (Idx, dest->alloc);
1149       if (BE (dest->elems == NULL, 0))
1150         return REG_ESPACE;
1151     }
1152   else
1153     {
1154       if (src1 != NULL && src1->nelem > 0)
1155         return re_node_set_init_copy (dest, src1);
1156       else if (src2 != NULL && src2->nelem > 0)
1157         return re_node_set_init_copy (dest, src2);
1158       else
1159         re_node_set_init_empty (dest);
1160       return REG_NOERROR;
1161     }
1162   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1163     {
1164       if (src1->elems[i1] > src2->elems[i2])
1165         {
1166           dest->elems[id++] = src2->elems[i2++];
1167           continue;
1168         }
1169       if (src1->elems[i1] == src2->elems[i2])
1170         ++i2;
1171       dest->elems[id++] = src1->elems[i1++];
1172     }
1173   if (i1 < src1->nelem)
1174     {
1175       memcpy (dest->elems + id, src1->elems + i1,
1176              (src1->nelem - i1) * sizeof (Idx));
1177       id += src1->nelem - i1;
1178     }
1179   else if (i2 < src2->nelem)
1180     {
1181       memcpy (dest->elems + id, src2->elems + i2,
1182              (src2->nelem - i2) * sizeof (Idx));
1183       id += src2->nelem - i2;
1184     }
1185   dest->nelem = id;
1186   return REG_NOERROR;
1187 }
1188
1189 /* Calculate the union set of the sets DEST and SRC. And store it to
1190    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1191
1192 static reg_errcode_t
1193 internal_function __attribute_warn_unused_result__
1194 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1195 {
1196   Idx is, id, sbase, delta;
1197   if (src == NULL || src->nelem == 0)
1198     return REG_NOERROR;
1199   if (dest->alloc < 2 * src->nelem + dest->nelem)
1200     {
1201       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1202       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1203       if (BE (new_buffer == NULL, 0))
1204         return REG_ESPACE;
1205       dest->elems = new_buffer;
1206       dest->alloc = new_alloc;
1207     }
1208
1209   if (BE (dest->nelem == 0, 0))
1210     {
1211       dest->nelem = src->nelem;
1212       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1213       return REG_NOERROR;
1214     }
1215
1216   /* Copy into the top of DEST the items of SRC that are not
1217      found in DEST.  Maybe we could binary search in DEST?  */
1218   for (sbase = dest->nelem + 2 * src->nelem,
1219        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1220     {
1221       if (dest->elems[id] == src->elems[is])
1222         is--, id--;
1223       else if (dest->elems[id] < src->elems[is])
1224         dest->elems[--sbase] = src->elems[is--];
1225       else /* if (dest->elems[id] > src->elems[is]) */
1226         --id;
1227     }
1228
1229   if (is >= 0)
1230     {
1231       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1232       sbase -= is + 1;
1233       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1234     }
1235
1236   id = dest->nelem - 1;
1237   is = dest->nelem + 2 * src->nelem - 1;
1238   delta = is - sbase + 1;
1239   if (delta == 0)
1240     return REG_NOERROR;
1241
1242   /* Now copy.  When DELTA becomes zero, the remaining
1243      DEST elements are already in place.  */
1244   dest->nelem += delta;
1245   for (;;)
1246     {
1247       if (dest->elems[is] > dest->elems[id])
1248         {
1249           /* Copy from the top.  */
1250           dest->elems[id + delta--] = dest->elems[is--];
1251           if (delta == 0)
1252             break;
1253         }
1254       else
1255         {
1256           /* Slide from the bottom.  */
1257           dest->elems[id + delta] = dest->elems[id];
1258           if (--id < 0)
1259             {
1260               /* Copy remaining SRC elements.  */
1261               memcpy (dest->elems, dest->elems + sbase,
1262                       delta * sizeof (Idx));
1263               break;
1264             }
1265         }
1266     }
1267
1268   return REG_NOERROR;
1269 }
1270
1271 /* Insert the new element ELEM to the re_node_set* SET.
1272    SET should not already have ELEM.
1273    Return true if successful.  */
1274
1275 static bool
1276 internal_function __attribute_warn_unused_result__
1277 re_node_set_insert (re_node_set *set, Idx elem)
1278 {
1279   Idx idx;
1280   /* In case the set is empty.  */
1281   if (set->alloc == 0)
1282     return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1283
1284   if (BE (set->nelem, 0) == 0)
1285     {
1286       /* We already guaranteed above that set->alloc != 0.  */
1287       set->elems[0] = elem;
1288       ++set->nelem;
1289       return true;
1290     }
1291
1292   /* Realloc if we need.  */
1293   if (set->alloc == set->nelem)
1294     {
1295       Idx *new_elems;
1296       set->alloc = set->alloc * 2;
1297       new_elems = re_realloc (set->elems, Idx, set->alloc);
1298       if (BE (new_elems == NULL, 0))
1299         return false;
1300       set->elems = new_elems;
1301     }
1302
1303   /* Move the elements which follows the new element.  Test the
1304      first element separately to skip a check in the inner loop.  */
1305   if (elem < set->elems[0])
1306     {
1307       idx = 0;
1308       for (idx = set->nelem; idx > 0; idx--)
1309         set->elems[idx] = set->elems[idx - 1];
1310     }
1311   else
1312     {
1313       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1314         set->elems[idx] = set->elems[idx - 1];
1315     }
1316
1317   /* Insert the new element.  */
1318   set->elems[idx] = elem;
1319   ++set->nelem;
1320   return true;
1321 }
1322
1323 /* Insert the new element ELEM to the re_node_set* SET.
1324    SET should not already have any element greater than or equal to ELEM.
1325    Return true if successful.  */
1326
1327 static bool
1328 internal_function __attribute_warn_unused_result__
1329 re_node_set_insert_last (re_node_set *set, Idx elem)
1330 {
1331   /* Realloc if we need.  */
1332   if (set->alloc == set->nelem)
1333     {
1334       Idx *new_elems;
1335       set->alloc = (set->alloc + 1) * 2;
1336       new_elems = re_realloc (set->elems, Idx, set->alloc);
1337       if (BE (new_elems == NULL, 0))
1338         return false;
1339       set->elems = new_elems;
1340     }
1341
1342   /* Insert the new element.  */
1343   set->elems[set->nelem++] = elem;
1344   return true;
1345 }
1346
1347 /* Compare two node sets SET1 and SET2.
1348    Return true if SET1 and SET2 are equivalent.  */
1349
1350 static bool
1351 internal_function __attribute__ ((pure))
1352 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1353 {
1354   Idx i;
1355   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1356     return false;
1357   for (i = set1->nelem ; --i >= 0 ; )
1358     if (set1->elems[i] != set2->elems[i])
1359       return false;
1360   return true;
1361 }
1362
1363 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1364
1365 static Idx
1366 internal_function __attribute__ ((pure))
1367 re_node_set_contains (const re_node_set *set, Idx elem)
1368 {
1369   __re_size_t idx, right, mid;
1370   if (set->nelem <= 0)
1371     return 0;
1372
1373   /* Binary search the element.  */
1374   idx = 0;
1375   right = set->nelem - 1;
1376   while (idx < right)
1377     {
1378       mid = (idx + right) / 2;
1379       if (set->elems[mid] < elem)
1380         idx = mid + 1;
1381       else
1382         right = mid;
1383     }
1384   return set->elems[idx] == elem ? idx + 1 : 0;
1385 }
1386
1387 static void
1388 internal_function
1389 re_node_set_remove_at (re_node_set *set, Idx idx)
1390 {
1391   if (idx < 0 || idx >= set->nelem)
1392     return;
1393   --set->nelem;
1394   for (; idx < set->nelem; idx++)
1395     set->elems[idx] = set->elems[idx + 1];
1396 }
1397 \f
1398
1399 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1400    Or return -1 if an error occurred.  */
1401
1402 static Idx
1403 internal_function
1404 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1405 {
1406   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1407     {
1408       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1409       Idx *new_nexts, *new_indices;
1410       re_node_set *new_edests, *new_eclosures;
1411       re_token_t *new_nodes;
1412
1413       /* Avoid overflows in realloc.  */
1414       const size_t max_object_size = MAX (sizeof (re_token_t),
1415                                           MAX (sizeof (re_node_set),
1416                                                sizeof (Idx)));
1417       if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0))
1418         return -1;
1419
1420       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1421       if (BE (new_nodes == NULL, 0))
1422         return -1;
1423       dfa->nodes = new_nodes;
1424       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1425       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1426       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1427       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1428       if (BE (new_nexts == NULL || new_indices == NULL
1429               || new_edests == NULL || new_eclosures == NULL, 0))
1430         {
1431            re_free (new_nexts);
1432            re_free (new_indices);
1433            re_free (new_edests);
1434            re_free (new_eclosures);
1435            return -1;
1436         }
1437       dfa->nexts = new_nexts;
1438       dfa->org_indices = new_indices;
1439       dfa->edests = new_edests;
1440       dfa->eclosures = new_eclosures;
1441       dfa->nodes_alloc = new_nodes_alloc;
1442     }
1443   dfa->nodes[dfa->nodes_len] = token;
1444   dfa->nodes[dfa->nodes_len].constraint = 0;
1445 #ifdef RE_ENABLE_I18N
1446   dfa->nodes[dfa->nodes_len].accept_mb =
1447     ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1448      || token.type == COMPLEX_BRACKET);
1449 #endif
1450   dfa->nexts[dfa->nodes_len] = -1;
1451   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1452   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1453   return dfa->nodes_len++;
1454 }
1455
1456 static re_hashval_t
1457 internal_function
1458 calc_state_hash (const re_node_set *nodes, unsigned int context)
1459 {
1460   re_hashval_t hash = nodes->nelem + context;
1461   Idx i;
1462   for (i = 0 ; i < nodes->nelem ; i++)
1463     hash += nodes->elems[i];
1464   return hash;
1465 }
1466
1467 /* Search for the state whose node_set is equivalent to NODES.
1468    Return the pointer to the state, if we found it in the DFA.
1469    Otherwise create the new one and return it.  In case of an error
1470    return NULL and set the error code in ERR.
1471    Note: - We assume NULL as the invalid state, then it is possible that
1472            return value is NULL and ERR is REG_NOERROR.
1473          - We never return non-NULL value in case of any errors, it is for
1474            optimization.  */
1475
1476 static re_dfastate_t *
1477 internal_function __attribute_warn_unused_result__
1478 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1479                   const re_node_set *nodes)
1480 {
1481   re_hashval_t hash;
1482   re_dfastate_t *new_state;
1483   struct re_state_table_entry *spot;
1484   Idx i;
1485 #if defined GCC_LINT || defined lint
1486   /* Suppress bogus uninitialized-variable warnings.  */
1487   *err = REG_NOERROR;
1488 #endif
1489   if (BE (nodes->nelem == 0, 0))
1490     {
1491       *err = REG_NOERROR;
1492       return NULL;
1493     }
1494   hash = calc_state_hash (nodes, 0);
1495   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1496
1497   for (i = 0 ; i < spot->num ; i++)
1498     {
1499       re_dfastate_t *state = spot->array[i];
1500       if (hash != state->hash)
1501         continue;
1502       if (re_node_set_compare (&state->nodes, nodes))
1503         return state;
1504     }
1505
1506   /* There are no appropriate state in the dfa, create the new one.  */
1507   new_state = create_ci_newstate (dfa, nodes, hash);
1508   if (BE (new_state == NULL, 0))
1509     *err = REG_ESPACE;
1510
1511   return new_state;
1512 }
1513
1514 /* Search for the state whose node_set is equivalent to NODES and
1515    whose context is equivalent to CONTEXT.
1516    Return the pointer to the state, if we found it in the DFA.
1517    Otherwise create the new one and return it.  In case of an error
1518    return NULL and set the error code in ERR.
1519    Note: - We assume NULL as the invalid state, then it is possible that
1520            return value is NULL and ERR is REG_NOERROR.
1521          - We never return non-NULL value in case of any errors, it is for
1522            optimization.  */
1523
1524 static re_dfastate_t *
1525 internal_function __attribute_warn_unused_result__
1526 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1527                           const re_node_set *nodes, unsigned int context)
1528 {
1529   re_hashval_t hash;
1530   re_dfastate_t *new_state;
1531   struct re_state_table_entry *spot;
1532   Idx i;
1533 #if defined GCC_LINT || defined lint
1534   /* Suppress bogus uninitialized-variable warnings.  */
1535   *err = REG_NOERROR;
1536 #endif
1537   if (nodes->nelem == 0)
1538     {
1539       *err = REG_NOERROR;
1540       return NULL;
1541     }
1542   hash = calc_state_hash (nodes, context);
1543   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1544
1545   for (i = 0 ; i < spot->num ; i++)
1546     {
1547       re_dfastate_t *state = spot->array[i];
1548       if (state->hash == hash
1549           && state->context == context
1550           && re_node_set_compare (state->entrance_nodes, nodes))
1551         return state;
1552     }
1553   /* There are no appropriate state in 'dfa', create the new one.  */
1554   new_state = create_cd_newstate (dfa, nodes, context, hash);
1555   if (BE (new_state == NULL, 0))
1556     *err = REG_ESPACE;
1557
1558   return new_state;
1559 }
1560
1561 /* Finish initialization of the new state NEWSTATE, and using its hash value
1562    HASH put in the appropriate bucket of DFA's state table.  Return value
1563    indicates the error code if failed.  */
1564
1565 static reg_errcode_t
1566 __attribute_warn_unused_result__
1567 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1568                 re_hashval_t hash)
1569 {
1570   struct re_state_table_entry *spot;
1571   reg_errcode_t err;
1572   Idx i;
1573
1574   newstate->hash = hash;
1575   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1576   if (BE (err != REG_NOERROR, 0))
1577     return REG_ESPACE;
1578   for (i = 0; i < newstate->nodes.nelem; i++)
1579     {
1580       Idx elem = newstate->nodes.elems[i];
1581       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1582         if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1583           return REG_ESPACE;
1584     }
1585
1586   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1587   if (BE (spot->alloc <= spot->num, 0))
1588     {
1589       Idx new_alloc = 2 * spot->num + 2;
1590       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1591                                               new_alloc);
1592       if (BE (new_array == NULL, 0))
1593         return REG_ESPACE;
1594       spot->array = new_array;
1595       spot->alloc = new_alloc;
1596     }
1597   spot->array[spot->num++] = newstate;
1598   return REG_NOERROR;
1599 }
1600
1601 static void
1602 free_state (re_dfastate_t *state)
1603 {
1604   re_node_set_free (&state->non_eps_nodes);
1605   re_node_set_free (&state->inveclosure);
1606   if (state->entrance_nodes != &state->nodes)
1607     {
1608       re_node_set_free (state->entrance_nodes);
1609       re_free (state->entrance_nodes);
1610     }
1611   re_node_set_free (&state->nodes);
1612   re_free (state->word_trtable);
1613   re_free (state->trtable);
1614   re_free (state);
1615 }
1616
1617 /* Create the new state which is independent of contexts.
1618    Return the new state if succeeded, otherwise return NULL.  */
1619
1620 static re_dfastate_t *
1621 internal_function __attribute_warn_unused_result__
1622 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1623                     re_hashval_t hash)
1624 {
1625   Idx i;
1626   reg_errcode_t err;
1627   re_dfastate_t *newstate;
1628
1629   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1630   if (BE (newstate == NULL, 0))
1631     return NULL;
1632   err = re_node_set_init_copy (&newstate->nodes, nodes);
1633   if (BE (err != REG_NOERROR, 0))
1634     {
1635       re_free (newstate);
1636       return NULL;
1637     }
1638
1639   newstate->entrance_nodes = &newstate->nodes;
1640   for (i = 0 ; i < nodes->nelem ; i++)
1641     {
1642       re_token_t *node = dfa->nodes + nodes->elems[i];
1643       re_token_type_t type = node->type;
1644       if (type == CHARACTER && !node->constraint)
1645         continue;
1646 #ifdef RE_ENABLE_I18N
1647       newstate->accept_mb |= node->accept_mb;
1648 #endif /* RE_ENABLE_I18N */
1649
1650       /* If the state has the halt node, the state is a halt state.  */
1651       if (type == END_OF_RE)
1652         newstate->halt = 1;
1653       else if (type == OP_BACK_REF)
1654         newstate->has_backref = 1;
1655       else if (type == ANCHOR || node->constraint)
1656         newstate->has_constraint = 1;
1657     }
1658   err = register_state (dfa, newstate, hash);
1659   if (BE (err != REG_NOERROR, 0))
1660     {
1661       free_state (newstate);
1662       newstate = NULL;
1663     }
1664   return newstate;
1665 }
1666
1667 /* Create the new state which is depend on the context CONTEXT.
1668    Return the new state if succeeded, otherwise return NULL.  */
1669
1670 static re_dfastate_t *
1671 internal_function __attribute_warn_unused_result__
1672 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1673                     unsigned int context, re_hashval_t hash)
1674 {
1675   Idx i, nctx_nodes = 0;
1676   reg_errcode_t err;
1677   re_dfastate_t *newstate;
1678
1679   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1680   if (BE (newstate == NULL, 0))
1681     return NULL;
1682   err = re_node_set_init_copy (&newstate->nodes, nodes);
1683   if (BE (err != REG_NOERROR, 0))
1684     {
1685       re_free (newstate);
1686       return NULL;
1687     }
1688
1689   newstate->context = context;
1690   newstate->entrance_nodes = &newstate->nodes;
1691
1692   for (i = 0 ; i < nodes->nelem ; i++)
1693     {
1694       re_token_t *node = dfa->nodes + nodes->elems[i];
1695       re_token_type_t type = node->type;
1696       unsigned int constraint = node->constraint;
1697
1698       if (type == CHARACTER && !constraint)
1699         continue;
1700 #ifdef RE_ENABLE_I18N
1701       newstate->accept_mb |= node->accept_mb;
1702 #endif /* RE_ENABLE_I18N */
1703
1704       /* If the state has the halt node, the state is a halt state.  */
1705       if (type == END_OF_RE)
1706         newstate->halt = 1;
1707       else if (type == OP_BACK_REF)
1708         newstate->has_backref = 1;
1709
1710       if (constraint)
1711         {
1712           if (newstate->entrance_nodes == &newstate->nodes)
1713             {
1714               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1715               if (BE (newstate->entrance_nodes == NULL, 0))
1716                 {
1717                   free_state (newstate);
1718                   return NULL;
1719                 }
1720               if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1721                   != REG_NOERROR)
1722                 return NULL;
1723               nctx_nodes = 0;
1724               newstate->has_constraint = 1;
1725             }
1726
1727           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1728             {
1729               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1730               ++nctx_nodes;
1731             }
1732         }
1733     }
1734   err = register_state (dfa, newstate, hash);
1735   if (BE (err != REG_NOERROR, 0))
1736     {
1737       free_state (newstate);
1738       newstate = NULL;
1739     }
1740   return  newstate;
1741 }