lib/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2024 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
  20 static void re_string_construct_common (const char *str, Idx len,
  21                                         re_string_t *pstr,
  22                                         RE_TRANSLATE_TYPE trans, bool icase,
  23                                         const re_dfa_t *dfa);
  24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  25                                           const re_node_set *nodes,
  26                                           re_hashval_t hash);
  27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  28                                           const re_node_set *nodes,
  29                                           unsigned int context,
  30                                           re_hashval_t hash);
  31 static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
  32                                                 Idx new_buf_len);
  33 static void build_wcs_buffer (re_string_t *pstr);
  34 static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
  35 static void build_upper_buffer (re_string_t *pstr);
  36 static void re_string_translate_buffer (re_string_t *pstr);
  37 static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
  38                                           int eflags) __attribute__ ((pure));
  39 \f
  40 /* Functions for string operation.  */
  41
  42 /* This function allocate the buffers.  It is necessary to call
  43    re_string_reconstruct before using the object.  */
  44
  45 static reg_errcode_t
  46 __attribute_warn_unused_result__
  47 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
  48                     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  49 {
  50   reg_errcode_t ret;
  51   Idx init_buf_len;
  52
  53   /* Ensure at least one character fits into the buffers.  */
  54   if (init_len < dfa->mb_cur_max)
  55     init_len = dfa->mb_cur_max;
  56   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  57   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  58
  59   ret = re_string_realloc_buffers (pstr, init_buf_len);
  60   if (__glibc_unlikely (ret != REG_NOERROR))
  61     return ret;
  62
  63   pstr->word_char = dfa->word_char;
  64   pstr->word_ops_used = dfa->word_ops_used;
  65   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  66   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  67   pstr->valid_raw_len = pstr->valid_len;
  68   return REG_NOERROR;
  69 }
  70
  71 /* This function allocate the buffers, and initialize them.  */
  72
  73 static reg_errcode_t
  74 __attribute_warn_unused_result__
  75 re_string_construct (re_string_t *pstr, const char *str, Idx len,
  76                      RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  77 {
  78   reg_errcode_t ret;
  79   memset (pstr, '\0', sizeof (re_string_t));
  80   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  81
  82   if (len > 0)
  83     {
  84       ret = re_string_realloc_buffers (pstr, len + 1);
  85       if (__glibc_unlikely (ret != REG_NOERROR))
  86         return ret;
  87     }
  88   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  89
  90   if (icase)
  91     {
  92       if (dfa->mb_cur_max > 1)
  93         {
  94           while (1)
  95             {
  96               ret = build_wcs_upper_buffer (pstr);
  97               if (__glibc_unlikely (ret != REG_NOERROR))
  98                 return ret;
  99               if (pstr->valid_raw_len >= len)
 100                 break;
 101               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
 102                 break;
 103               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
 104               if (__glibc_unlikely (ret != REG_NOERROR))
 105                 return ret;
 106             }
 107         }
 108       else
 109         build_upper_buffer (pstr);
 110     }
 111   else
 112     {
 113       if (dfa->mb_cur_max > 1)
 114         build_wcs_buffer (pstr);
 115       else
 116         {
 117           if (trans != NULL)
 118             re_string_translate_buffer (pstr);
 119           else
 120             {
 121               pstr->valid_len = pstr->bufs_len;
 122               pstr->valid_raw_len = pstr->bufs_len;
 123             }
 124         }
 125     }
 126
 127   return REG_NOERROR;
 128 }
 129
 130 /* Helper functions for re_string_allocate, and re_string_construct.  */
 131
 132 static reg_errcode_t
 133 __attribute_warn_unused_result__
 134 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
 135 {
 136   if (pstr->mb_cur_max > 1)
 137     {
 138       wint_t *new_wcs;
 139
 140       /* Avoid overflow in realloc.  */
 141       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
 142       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
 143                             < new_buf_len))
 144         return REG_ESPACE;
 145
 146       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 147       if (__glibc_unlikely (new_wcs == NULL))
 148         return REG_ESPACE;
 149       pstr->wcs = new_wcs;
 150       if (pstr->offsets != NULL)
 151         {
 152           Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
 153           if (__glibc_unlikely (new_offsets == NULL))
 154             return REG_ESPACE;
 155           pstr->offsets = new_offsets;
 156         }
 157     }
 158   if (pstr->mbs_allocated)
 159     {
 160       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 161                                            new_buf_len);
 162       if (__glibc_unlikely (new_mbs == NULL))
 163         return REG_ESPACE;
 164       pstr->mbs = new_mbs;
 165     }
 166   pstr->bufs_len = new_buf_len;
 167   return REG_NOERROR;
 168 }
 169
 170
 171 static void
 172 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
 173                             RE_TRANSLATE_TYPE trans, bool icase,
 174                             const re_dfa_t *dfa)
 175 {
 176   pstr->raw_mbs = (const unsigned char *) str;
 177   pstr->len = len;
 178   pstr->raw_len = len;
 179   pstr->trans = trans;
 180   pstr->icase = icase;
 181   pstr->mbs_allocated = (trans != NULL || icase);
 182   pstr->mb_cur_max = dfa->mb_cur_max;
 183   pstr->is_utf8 = dfa->is_utf8;
 184   pstr->map_notascii = dfa->map_notascii;
 185   pstr->stop = pstr->len;
 186   pstr->raw_stop = pstr->stop;
 187 }
 188
 189
 190 /* Build wide character buffer PSTR->WCS.
 191    If the byte sequence of the string are:
 192      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 193    Then wide character buffer will be:
 194      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 195    We use WEOF for padding, they indicate that the position isn't
 196    a first byte of a multibyte character.
 197
 198    Note that this function assumes PSTR->VALID_LEN elements are already
 199    built and starts from PSTR->VALID_LEN.  */
 200
 201 static void
 202 build_wcs_buffer (re_string_t *pstr)
 203 {
 204 #ifdef _LIBC
 205   unsigned char buf[MB_LEN_MAX];
 206   DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
 207 #else
 208   unsigned char buf[64];
 209 #endif
 210   mbstate_t prev_st;
 211   Idx byte_idx, end_idx, remain_len;
 212   size_t mbclen;
 213
 214   /* Build the buffers from pstr->valid_len to either pstr->len or
 215      pstr->bufs_len.  */
 216   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 217   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 218     {
 219       wchar_t wc;
 220       const char *p;
 221
 222       remain_len = end_idx - byte_idx;
 223       prev_st = pstr->cur_state;
 224       /* Apply the translation if we need.  */
 225       if (__glibc_unlikely (pstr->trans != NULL))
 226         {
 227           int i, ch;
 228
 229           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 230             {
 231               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 232               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 233             }
 234           p = (const char *) buf;
 235         }
 236       else
 237         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 238       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 239       if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
 240                             || (mbclen == (size_t) -2
 241                                 && pstr->bufs_len >= pstr->len)))
 242         {
 243           /* We treat these cases as a singlebyte character.  */
 244           mbclen = 1;
 245           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 246           if (__glibc_unlikely (pstr->trans != NULL))
 247             wc = pstr->trans[wc];
 248           pstr->cur_state = prev_st;
 249         }
 250       else if (__glibc_unlikely (mbclen == (size_t) -2))
 251         {
 252           /* The buffer doesn't have enough space, finish to build.  */
 253           pstr->cur_state = prev_st;
 254           break;
 255         }
 256
 257       /* Write wide character and padding.  */
 258       pstr->wcs[byte_idx++] = wc;
 259       /* Write paddings.  */
 260       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 261         pstr->wcs[byte_idx++] = WEOF;
 262     }
 263   pstr->valid_len = byte_idx;
 264   pstr->valid_raw_len = byte_idx;
 265 }
 266
 267 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 268    but for REG_ICASE.  */
 269
 270 static reg_errcode_t
 271 __attribute_warn_unused_result__
 272 build_wcs_upper_buffer (re_string_t *pstr)
 273 {
 274   mbstate_t prev_st;
 275   Idx src_idx, byte_idx, end_idx, remain_len;
 276   size_t mbclen;
 277 #ifdef _LIBC
 278   char buf[MB_LEN_MAX];
 279   DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
 280 #else
 281   char buf[64];
 282 #endif
 283
 284   byte_idx = pstr->valid_len;
 285   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 286
 287   /* The following optimization assumes that ASCII characters can be
 288      mapped to wide characters with a simple cast.  */
 289   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 290     {
 291       while (byte_idx < end_idx)
 292         {
 293           wchar_t wc;
 294           unsigned char ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 295
 296           if (isascii (ch) && mbsinit (&pstr->cur_state))
 297             {
 298               /* The next step uses the assumption that wchar_t is encoded
 299                  ASCII-safe: all ASCII values can be converted like this.  */
 300               wchar_t wcu = __towupper (ch);
 301               if (isascii (wcu))
 302                 {
 303                   pstr->mbs[byte_idx] = wcu;
 304                   pstr->wcs[byte_idx] = wcu;
 305                   byte_idx++;
 306                   continue;
 307                 }
 308             }
 309
 310           remain_len = end_idx - byte_idx;
 311           prev_st = pstr->cur_state;
 312           mbclen = __mbrtowc (&wc,
 313                               ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 314                                + byte_idx), remain_len, &pstr->cur_state);
 315           if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
 316             {
 317               wchar_t wcu = __towupper (wc);
 318               if (wcu != wc)
 319                 {
 320                   size_t mbcdlen;
 321
 322                   mbcdlen = __wcrtomb (buf, wcu, &prev_st);
 323                   if (__glibc_likely (mbclen == mbcdlen))
 324                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 325                   else
 326                     {
 327                       src_idx = byte_idx;
 328                       goto offsets_needed;
 329                     }
 330                 }
 331               else
 332                 memcpy (pstr->mbs + byte_idx,
 333                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 334               pstr->wcs[byte_idx++] = wcu;
 335               /* Write paddings.  */
 336               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 337                 pstr->wcs[byte_idx++] = WEOF;
 338             }
 339           else if (mbclen == (size_t) -1 || mbclen == 0
 340                    || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 341             {
 342               /* It is an invalid character, an incomplete character
 343                  at the end of the string, or '\0'.  Just use the byte.  */
 344               pstr->mbs[byte_idx] = ch;
 345               /* And also cast it to wide char.  */
 346               pstr->wcs[byte_idx++] = (wchar_t) ch;
 347               if (__glibc_unlikely (mbclen == (size_t) -1))
 348                 pstr->cur_state = prev_st;
 349             }
 350           else
 351             {
 352               /* The buffer doesn't have enough space, finish to build.  */
 353               pstr->cur_state = prev_st;
 354               break;
 355             }
 356         }
 357       pstr->valid_len = byte_idx;
 358       pstr->valid_raw_len = byte_idx;
 359       return REG_NOERROR;
 360     }
 361   else
 362     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 363       {
 364         wchar_t wc;
 365         const char *p;
 366       offsets_needed:
 367         remain_len = end_idx - byte_idx;
 368         prev_st = pstr->cur_state;
 369         if (__glibc_unlikely (pstr->trans != NULL))
 370           {
 371             int i, ch;
 372
 373             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 374               {
 375                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 376                 buf[i] = pstr->trans[ch];
 377               }
 378             p = (const char *) buf;
 379           }
 380         else
 381           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 382         mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 383         if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
 384           {
 385             wchar_t wcu = __towupper (wc);
 386             if (wcu != wc)
 387               {
 388                 size_t mbcdlen;
 389
 390                 mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
 391                 if (__glibc_likely (mbclen == mbcdlen))
 392                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 393                 else if (mbcdlen != (size_t) -1)
 394                   {
 395                     size_t i;
 396
 397                     if (byte_idx + mbcdlen > pstr->bufs_len)
 398                       {
 399                         pstr->cur_state = prev_st;
 400                         break;
 401                       }
 402
 403                     if (pstr->offsets == NULL)
 404                       {
 405                         pstr->offsets = re_malloc (Idx, pstr->bufs_len);
 406
 407                         if (pstr->offsets == NULL)
 408                           return REG_ESPACE;
 409                       }
 410                     if (!pstr->offsets_needed)
 411                       {
 412                         for (i = 0; i < (size_t) byte_idx; ++i)
 413                           pstr->offsets[i] = i;
 414                         pstr->offsets_needed = 1;
 415                       }
 416
 417                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 418                     pstr->wcs[byte_idx] = wcu;
 419                     pstr->offsets[byte_idx] = src_idx;
 420                     for (i = 1; i < mbcdlen; ++i)
 421                       {
 422                         pstr->offsets[byte_idx + i]
 423                           = src_idx + (i < mbclen ? i : mbclen - 1);
 424                         pstr->wcs[byte_idx + i] = WEOF;
 425                       }
 426                     pstr->len += mbcdlen - mbclen;
 427                     if (pstr->raw_stop > src_idx)
 428                       pstr->stop += mbcdlen - mbclen;
 429                     end_idx = (pstr->bufs_len > pstr->len)
 430                               ? pstr->len : pstr->bufs_len;
 431                     byte_idx += mbcdlen;
 432                     src_idx += mbclen;
 433                     continue;
 434                   }
 435                 else
 436                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 437               }
 438             else
 439               memcpy (pstr->mbs + byte_idx, p, mbclen);
 440
 441             if (__glibc_unlikely (pstr->offsets_needed != 0))
 442               {
 443                 size_t i;
 444                 for (i = 0; i < mbclen; ++i)
 445                   pstr->offsets[byte_idx + i] = src_idx + i;
 446               }
 447             src_idx += mbclen;
 448
 449             pstr->wcs[byte_idx++] = wcu;
 450             /* Write paddings.  */
 451             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 452               pstr->wcs[byte_idx++] = WEOF;
 453           }
 454         else if (mbclen == (size_t) -1 || mbclen == 0
 455                  || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 456           {
 457             /* It is an invalid character or '\0'.  Just use the byte.  */
 458             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 459
 460             if (__glibc_unlikely (pstr->trans != NULL))
 461               ch = pstr->trans [ch];
 462             pstr->mbs[byte_idx] = ch;
 463
 464             if (__glibc_unlikely (pstr->offsets_needed != 0))
 465               pstr->offsets[byte_idx] = src_idx;
 466             ++src_idx;
 467
 468             /* And also cast it to wide char.  */
 469             pstr->wcs[byte_idx++] = (wchar_t) ch;
 470             if (__glibc_unlikely (mbclen == (size_t) -1))
 471               pstr->cur_state = prev_st;
 472           }
 473         else
 474           {
 475             /* The buffer doesn't have enough space, finish to build.  */
 476             pstr->cur_state = prev_st;
 477             break;
 478           }
 479       }
 480   pstr->valid_len = byte_idx;
 481   pstr->valid_raw_len = src_idx;
 482   return REG_NOERROR;
 483 }
 484
 485 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 486    Return the index.  */
 487
 488 static Idx
 489 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
 490 {
 491   mbstate_t prev_st;
 492   Idx rawbuf_idx;
 493   size_t mbclen;
 494   wint_t wc = WEOF;
 495
 496   /* Skip the characters which are not necessary to check.  */
 497   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 498        rawbuf_idx < new_raw_idx;)
 499     {
 500       wchar_t wc2;
 501       Idx remain_len = pstr->raw_len - rawbuf_idx;
 502       prev_st = pstr->cur_state;
 503       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
 504                           remain_len, &pstr->cur_state);
 505       if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
 506                             || mbclen == 0))
 507         {
 508           /* We treat these cases as a single byte character.  */
 509           if (mbclen == 0 || remain_len == 0)
 510             wc = L'\0';
 511           else
 512             wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 513           mbclen = 1;
 514           pstr->cur_state = prev_st;
 515         }
 516       else
 517         wc = wc2;
 518       /* Then proceed the next character.  */
 519       rawbuf_idx += mbclen;
 520     }
 521   *last_wc = wc;
 522   return rawbuf_idx;
 523 }
 524
 525 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 526    This function is used in case of REG_ICASE.  */
 527
 528 static void
 529 build_upper_buffer (re_string_t *pstr)
 530 {
 531   Idx char_idx, end_idx;
 532   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 533
 534   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 535     {
 536       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 537       if (__glibc_unlikely (pstr->trans != NULL))
 538         ch = pstr->trans[ch];
 539       pstr->mbs[char_idx] = toupper (ch);
 540     }
 541   pstr->valid_len = char_idx;
 542   pstr->valid_raw_len = char_idx;
 543 }
 544
 545 /* Apply TRANS to the buffer in PSTR.  */
 546
 547 static void
 548 re_string_translate_buffer (re_string_t *pstr)
 549 {
 550   Idx buf_idx, end_idx;
 551   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 552
 553   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 554     {
 555       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 556       pstr->mbs[buf_idx] = pstr->trans[ch];
 557     }
 558
 559   pstr->valid_len = buf_idx;
 560   pstr->valid_raw_len = buf_idx;
 561 }
 562
 563 /* This function re-construct the buffers.
 564    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 565    convert to upper case in case of REG_ICASE, apply translation.  */
 566
 567 static reg_errcode_t
 568 __attribute_warn_unused_result__
 569 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 570 {
 571   Idx offset;
 572
 573   if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
 574     offset = idx - pstr->raw_mbs_idx;
 575   else
 576     {
 577       /* Reset buffer.  */
 578       if (pstr->mb_cur_max > 1)
 579         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 580       pstr->len = pstr->raw_len;
 581       pstr->stop = pstr->raw_stop;
 582       pstr->valid_len = 0;
 583       pstr->raw_mbs_idx = 0;
 584       pstr->valid_raw_len = 0;
 585       pstr->offsets_needed = 0;
 586       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 587                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 588       if (!pstr->mbs_allocated)
 589         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 590       offset = idx;
 591     }
 592
 593   if (__glibc_likely (offset != 0))
 594     {
 595       /* Should the already checked characters be kept?  */
 596       if (__glibc_likely (offset < pstr->valid_raw_len))
 597         {
 598           /* Yes, move them to the front of the buffer.  */
 599           if (__glibc_unlikely (pstr->offsets_needed))
 600             {
 601               Idx low = 0, high = pstr->valid_len, mid;
 602               do
 603                 {
 604                   mid = (high + low) / 2;
 605                   if (pstr->offsets[mid] > offset)
 606                     high = mid;
 607                   else if (pstr->offsets[mid] < offset)
 608                     low = mid + 1;
 609                   else
 610                     break;
 611                 }
 612               while (low < high);
 613               if (pstr->offsets[mid] < offset)
 614                 ++mid;
 615               pstr->tip_context = re_string_context_at (pstr, mid - 1,
 616                                                         eflags);
 617               /* This can be quite complicated, so handle specially
 618                  only the common and easy case where the character with
 619                  different length representation of lower and upper
 620                  case is present at or after offset.  */
 621               if (pstr->valid_len > offset
 622                   && mid == offset && pstr->offsets[mid] == offset)
 623                 {
 624                   memmove (pstr->wcs, pstr->wcs + offset,
 625                            (pstr->valid_len - offset) * sizeof (wint_t));
 626                   memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
 627                   pstr->valid_len -= offset;
 628                   pstr->valid_raw_len -= offset;
 629                   for (low = 0; low < pstr->valid_len; low++)
 630                     pstr->offsets[low] = pstr->offsets[low + offset] - offset;
 631                 }
 632               else
 633                 {
 634                   /* Otherwise, just find out how long the partial multibyte
 635                      character at offset is and fill it with WEOF/255.  */
 636                   pstr->len = pstr->raw_len - idx + offset;
 637                   pstr->stop = pstr->raw_stop - idx + offset;
 638                   pstr->offsets_needed = 0;
 639                   while (mid > 0 && pstr->offsets[mid - 1] == offset)
 640                     --mid;
 641                   while (mid < pstr->valid_len)
 642                     if (pstr->wcs[mid] != WEOF)
 643                       break;
 644                     else
 645                       ++mid;
 646                   if (mid == pstr->valid_len)
 647                     pstr->valid_len = 0;
 648                   else
 649                     {
 650                       pstr->valid_len = pstr->offsets[mid] - offset;
 651                       if (pstr->valid_len)
 652                         {
 653                           for (low = 0; low < pstr->valid_len; ++low)
 654                             pstr->wcs[low] = WEOF;
 655                           memset (pstr->mbs, 255, pstr->valid_len);
 656                         }
 657                     }
 658                   pstr->valid_raw_len = pstr->valid_len;
 659                 }
 660             }
 661           else
 662             {
 663               pstr->tip_context = re_string_context_at (pstr, offset - 1,
 664                                                         eflags);
 665               if (pstr->mb_cur_max > 1)
 666                 memmove (pstr->wcs, pstr->wcs + offset,
 667                          (pstr->valid_len - offset) * sizeof (wint_t));
 668               if (__glibc_unlikely (pstr->mbs_allocated))
 669                 memmove (pstr->mbs, pstr->mbs + offset,
 670                          pstr->valid_len - offset);
 671               pstr->valid_len -= offset;
 672               pstr->valid_raw_len -= offset;
 673               DEBUG_ASSERT (pstr->valid_len > 0);
 674             }
 675         }
 676       else
 677         {
 678           /* No, skip all characters until IDX.  */
 679           Idx prev_valid_len = pstr->valid_len;
 680
 681           if (__glibc_unlikely (pstr->offsets_needed))
 682             {
 683               pstr->len = pstr->raw_len - idx + offset;
 684               pstr->stop = pstr->raw_stop - idx + offset;
 685               pstr->offsets_needed = 0;
 686             }
 687           pstr->valid_len = 0;
 688           if (pstr->mb_cur_max > 1)
 689             {
 690               Idx wcs_idx;
 691               wint_t wc = WEOF;
 692
 693               if (pstr->is_utf8)
 694                 {
 695                   const unsigned char *raw, *p, *end;
 696
 697                   /* Special case UTF-8.  Multi-byte chars start with any
 698                      byte other than 0x80 - 0xbf.  */
 699                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 700                   end = raw + (offset - pstr->mb_cur_max);
 701                   if (end < pstr->raw_mbs)
 702                     end = pstr->raw_mbs;
 703                   p = raw + offset - 1;
 704 #ifdef _LIBC
 705                   /* We know the wchar_t encoding is UCS4, so for the simple
 706                      case, ASCII characters, skip the conversion step.  */
 707                   if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
 708                     {
 709                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 710                       /* pstr->valid_len = 0; */
 711                       wc = (wchar_t) *p;
 712                     }
 713                   else
 714 #endif
 715                     for (; p >= end; --p)
 716                       if ((*p & 0xc0) != 0x80)
 717                         {
 718                           mbstate_t cur_state;
 719                           wchar_t wc2;
 720                           Idx mlen = raw + pstr->len - p;
 721                           unsigned char buf[6];
 722                           size_t mbclen;
 723
 724                           const unsigned char *pp = p;
 725                           if (__glibc_unlikely (pstr->trans != NULL))
 726                             {
 727                               int i = mlen < 6 ? mlen : 6;
 728                               while (--i >= 0)
 729                                 buf[i] = pstr->trans[p[i]];
 730                               pp = buf;
 731                             }
 732                           /* XXX Don't use mbrtowc, we know which conversion
 733                              to use (UTF-8 -> UCS4).  */
 734                           memset (&cur_state, 0, sizeof (cur_state));
 735                           mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
 736                                               &cur_state);
 737                           if (raw + offset - p <= mbclen
 738                               && mbclen < (size_t) -2)
 739                             {
 740                               memset (&pstr->cur_state, '\0',
 741                                       sizeof (mbstate_t));
 742                               pstr->valid_len = mbclen - (raw + offset - p);
 743                               wc = wc2;
 744                             }
 745                           break;
 746                         }
 747                 }
 748
 749               if (wc == WEOF)
 750                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 751               if (wc == WEOF)
 752                 pstr->tip_context
 753                   = re_string_context_at (pstr, prev_valid_len - 1, eflags);
 754               else
 755                 pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
 756                                       && IS_WIDE_WORD_CHAR (wc))
 757                                      ? CONTEXT_WORD
 758                                      : ((IS_WIDE_NEWLINE (wc)
 759                                          && pstr->newline_anchor)
 760                                         ? CONTEXT_NEWLINE : 0));
 761               if (__glibc_unlikely (pstr->valid_len))
 762                 {
 763                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 764                     pstr->wcs[wcs_idx] = WEOF;
 765                   if (pstr->mbs_allocated)
 766                     memset (pstr->mbs, 255, pstr->valid_len);
 767                 }
 768               pstr->valid_raw_len = pstr->valid_len;
 769             }
 770           else
 771             {
 772               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 773               pstr->valid_raw_len = 0;
 774               if (pstr->trans)
 775                 c = pstr->trans[c];
 776               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 777                                    ? CONTEXT_WORD
 778                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 779                                       ? CONTEXT_NEWLINE : 0));
 780             }
 781         }
 782       if (!__glibc_unlikely (pstr->mbs_allocated))
 783         pstr->mbs += offset;
 784     }
 785   pstr->raw_mbs_idx = idx;
 786   pstr->len -= offset;
 787   pstr->stop -= offset;
 788
 789   /* Then build the buffers.  */
 790   if (pstr->mb_cur_max > 1)
 791     {
 792       if (pstr->icase)
 793         {
 794           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 795           if (__glibc_unlikely (ret != REG_NOERROR))
 796             return ret;
 797         }
 798       else
 799         build_wcs_buffer (pstr);
 800     }
 801   else
 802     if (__glibc_unlikely (pstr->mbs_allocated))
 803       {
 804         if (pstr->icase)
 805           build_upper_buffer (pstr);
 806         else if (pstr->trans != NULL)
 807           re_string_translate_buffer (pstr);
 808       }
 809     else
 810       pstr->valid_len = pstr->len;
 811
 812   pstr->cur_idx = 0;
 813   return REG_NOERROR;
 814 }
 815
 816 static unsigned char
 817 __attribute__ ((pure))
 818 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
 819 {
 820   int ch;
 821   Idx off;
 822
 823   /* Handle the common (easiest) cases first.  */
 824   if (__glibc_likely (!pstr->mbs_allocated))
 825     return re_string_peek_byte (pstr, idx);
 826
 827   if (pstr->mb_cur_max > 1
 828       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 829     return re_string_peek_byte (pstr, idx);
 830
 831   off = pstr->cur_idx + idx;
 832   if (pstr->offsets_needed)
 833     off = pstr->offsets[off];
 834
 835   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 836
 837   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 838      this function returns CAPITAL LETTER I instead of first byte of
 839      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 840      since peek_byte_case doesn't advance cur_idx in any way.  */
 841   if (pstr->offsets_needed && !isascii (ch))
 842     return re_string_peek_byte (pstr, idx);
 843
 844   return ch;
 845 }
 846
 847 static unsigned char
 848 re_string_fetch_byte_case (re_string_t *pstr)
 849 {
 850   if (__glibc_likely (!pstr->mbs_allocated))
 851     return re_string_fetch_byte (pstr);
 852
 853   if (pstr->offsets_needed)
 854     {
 855       Idx off;
 856       int ch;
 857
 858       /* For tr_TR.UTF-8 [[:islower:]] there is
 859          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 860          in that case the whole multi-byte character and return
 861          the original letter.  On the other side, with
 862          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 863          anything else would complicate things too much.  */
 864
 865       if (!re_string_first_byte (pstr, pstr->cur_idx))
 866         return re_string_fetch_byte (pstr);
 867
 868       off = pstr->offsets[pstr->cur_idx];
 869       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 870
 871       if (! isascii (ch))
 872         return re_string_fetch_byte (pstr);
 873
 874       re_string_skip_bytes (pstr,
 875                             re_string_char_size_at (pstr, pstr->cur_idx));
 876       return ch;
 877     }
 878
 879   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 880 }
 881
 882 static void
 883 re_string_destruct (re_string_t *pstr)
 884 {
 885   re_free (pstr->wcs);
 886   re_free (pstr->offsets);
 887   if (pstr->mbs_allocated)
 888     re_free (pstr->mbs);
 889 }
 890
 891 /* Return the context at IDX in INPUT.  */
 892
 893 static unsigned int
 894 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
 895 {
 896   int c;
 897   if (__glibc_unlikely (idx < 0))
 898     /* In this case, we use the value stored in input->tip_context,
 899        since we can't know the character in input->mbs[-1] here.  */
 900     return input->tip_context;
 901   if (__glibc_unlikely (idx == input->len))
 902     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 903             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 904   if (input->mb_cur_max > 1)
 905     {
 906       wint_t wc;
 907       Idx wc_idx = idx;
 908       while(input->wcs[wc_idx] == WEOF)
 909         {
 910           DEBUG_ASSERT (wc_idx >= 0);
 911           --wc_idx;
 912           if (wc_idx < 0)
 913             return input->tip_context;
 914         }
 915       wc = input->wcs[wc_idx];
 916       if (__glibc_unlikely (input->word_ops_used != 0)
 917           && IS_WIDE_WORD_CHAR (wc))
 918         return CONTEXT_WORD;
 919       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 920               ? CONTEXT_NEWLINE : 0);
 921     }
 922   else
 923     {
 924       c = re_string_byte_at (input, idx);
 925       if (bitset_contain (input->word_char, c))
 926         return CONTEXT_WORD;
 927       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 928     }
 929 }
 930 \f
 931 /* Functions for set operation.  */
 932
 933 static reg_errcode_t
 934 __attribute_warn_unused_result__
 935 re_node_set_alloc (re_node_set *set, Idx size)
 936 {
 937   set->alloc = size;
 938   set->nelem = 0;
 939   set->elems = re_malloc (Idx, size);
 940   if (__glibc_unlikely (set->elems == NULL))
 941     return REG_ESPACE;
 942   return REG_NOERROR;
 943 }
 944
 945 static reg_errcode_t
 946 __attribute_warn_unused_result__
 947 re_node_set_init_1 (re_node_set *set, Idx elem)
 948 {
 949   set->alloc = 1;
 950   set->nelem = 1;
 951   set->elems = re_malloc (Idx, 1);
 952   if (__glibc_unlikely (set->elems == NULL))
 953     {
 954       set->alloc = set->nelem = 0;
 955       return REG_ESPACE;
 956     }
 957   set->elems[0] = elem;
 958   return REG_NOERROR;
 959 }
 960
 961 static reg_errcode_t
 962 __attribute_warn_unused_result__
 963 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
 964 {
 965   set->alloc = 2;
 966   set->elems = re_malloc (Idx, 2);
 967   if (__glibc_unlikely (set->elems == NULL))
 968     return REG_ESPACE;
 969   if (elem1 == elem2)
 970     {
 971       set->nelem = 1;
 972       set->elems[0] = elem1;
 973     }
 974   else
 975     {
 976       set->nelem = 2;
 977       if (elem1 < elem2)
 978         {
 979           set->elems[0] = elem1;
 980           set->elems[1] = elem2;
 981         }
 982       else
 983         {
 984           set->elems[0] = elem2;
 985           set->elems[1] = elem1;
 986         }
 987     }
 988   return REG_NOERROR;
 989 }
 990
 991 static reg_errcode_t
 992 __attribute_warn_unused_result__
 993 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
 994 {
 995   dest->nelem = src->nelem;
 996   if (src->nelem > 0)
 997     {
 998       dest->alloc = dest->nelem;
 999       dest->elems = re_malloc (Idx, dest->alloc);
1000       if (__glibc_unlikely (dest->elems == NULL))
1001         {
1002           dest->alloc = dest->nelem = 0;
1003           return REG_ESPACE;
1004         }
1005       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1006     }
1007   else
1008     re_node_set_init_empty (dest);
1009   return REG_NOERROR;
1010 }
1011
1012 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1013    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1014    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1015
1016 static reg_errcode_t
1017 __attribute_warn_unused_result__
1018 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1019                            const re_node_set *src2)
1020 {
1021   Idx i1, i2, is, id, delta, sbase;
1022   if (src1->nelem == 0 || src2->nelem == 0)
1023     return REG_NOERROR;
1024
1025   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1026      conservative estimate.  */
1027   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1028     {
1029       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1030       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1031       if (__glibc_unlikely (new_elems == NULL))
1032         return REG_ESPACE;
1033       dest->elems = new_elems;
1034       dest->alloc = new_alloc;
1035     }
1036
1037   /* Find the items in the intersection of SRC1 and SRC2, and copy
1038      into the top of DEST those that are not already in DEST itself.  */
1039   sbase = dest->nelem + src1->nelem + src2->nelem;
1040   i1 = src1->nelem - 1;
1041   i2 = src2->nelem - 1;
1042   id = dest->nelem - 1;
1043   for (;;)
1044     {
1045       if (src1->elems[i1] == src2->elems[i2])
1046         {
1047           /* Try to find the item in DEST.  Maybe we could binary search?  */
1048           while (id >= 0 && dest->elems[id] > src1->elems[i1])
1049             --id;
1050
1051           if (id < 0 || dest->elems[id] != src1->elems[i1])
1052             dest->elems[--sbase] = src1->elems[i1];
1053
1054           if (--i1 < 0 || --i2 < 0)
1055             break;
1056         }
1057
1058       /* Lower the highest of the two items.  */
1059       else if (src1->elems[i1] < src2->elems[i2])
1060         {
1061           if (--i2 < 0)
1062             break;
1063         }
1064       else
1065         {
1066           if (--i1 < 0)
1067             break;
1068         }
1069     }
1070
1071   id = dest->nelem - 1;
1072   is = dest->nelem + src1->nelem + src2->nelem - 1;
1073   delta = is - sbase + 1;
1074
1075   /* Now copy.  When DELTA becomes zero, the remaining
1076      DEST elements are already in place; this is more or
1077      less the same loop that is in re_node_set_merge.  */
1078   dest->nelem += delta;
1079   if (delta > 0 && id >= 0)
1080     for (;;)
1081       {
1082         if (dest->elems[is] > dest->elems[id])
1083           {
1084             /* Copy from the top.  */
1085             dest->elems[id + delta--] = dest->elems[is--];
1086             if (delta == 0)
1087               break;
1088           }
1089         else
1090           {
1091             /* Slide from the bottom.  */
1092             dest->elems[id + delta] = dest->elems[id];
1093             if (--id < 0)
1094               break;
1095           }
1096       }
1097
1098   /* Copy remaining SRC elements.  */
1099   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1100
1101   return REG_NOERROR;
1102 }
1103
1104 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1105    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1106
1107 static reg_errcode_t
1108 __attribute_warn_unused_result__
1109 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1110                         const re_node_set *src2)
1111 {
1112   Idx i1, i2, id;
1113   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1114     {
1115       dest->alloc = src1->nelem + src2->nelem;
1116       dest->elems = re_malloc (Idx, dest->alloc);
1117       if (__glibc_unlikely (dest->elems == NULL))
1118         return REG_ESPACE;
1119     }
1120   else
1121     {
1122       if (src1 != NULL && src1->nelem > 0)
1123         return re_node_set_init_copy (dest, src1);
1124       else if (src2 != NULL && src2->nelem > 0)
1125         return re_node_set_init_copy (dest, src2);
1126       else
1127         re_node_set_init_empty (dest);
1128       return REG_NOERROR;
1129     }
1130   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1131     {
1132       if (src1->elems[i1] > src2->elems[i2])
1133         {
1134           dest->elems[id++] = src2->elems[i2++];
1135           continue;
1136         }
1137       if (src1->elems[i1] == src2->elems[i2])
1138         ++i2;
1139       dest->elems[id++] = src1->elems[i1++];
1140     }
1141   if (i1 < src1->nelem)
1142     {
1143       memcpy (dest->elems + id, src1->elems + i1,
1144              (src1->nelem - i1) * sizeof (Idx));
1145       id += src1->nelem - i1;
1146     }
1147   else if (i2 < src2->nelem)
1148     {
1149       memcpy (dest->elems + id, src2->elems + i2,
1150              (src2->nelem - i2) * sizeof (Idx));
1151       id += src2->nelem - i2;
1152     }
1153   dest->nelem = id;
1154   return REG_NOERROR;
1155 }
1156
1157 /* Calculate the union set of the sets DEST and SRC. And store it to
1158    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1159
1160 static reg_errcode_t
1161 __attribute_warn_unused_result__
1162 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1163 {
1164   Idx is, id, sbase, delta;
1165   if (src == NULL || src->nelem == 0)
1166     return REG_NOERROR;
1167   if (dest->alloc < 2 * src->nelem + dest->nelem)
1168     {
1169       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1170       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1171       if (__glibc_unlikely (new_buffer == NULL))
1172         return REG_ESPACE;
1173       dest->elems = new_buffer;
1174       dest->alloc = new_alloc;
1175     }
1176
1177   if (__glibc_unlikely (dest->nelem == 0))
1178     {
1179       /* Although we already guaranteed above that dest->alloc != 0 and
1180          therefore dest->elems != NULL, add a debug assertion to pacify
1181          GCC 11.2.1's -fanalyzer.  */
1182       DEBUG_ASSERT (dest->elems);
1183       dest->nelem = src->nelem;
1184       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1185       return REG_NOERROR;
1186     }
1187
1188   /* Copy into the top of DEST the items of SRC that are not
1189      found in DEST.  Maybe we could binary search in DEST?  */
1190   for (sbase = dest->nelem + 2 * src->nelem,
1191        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1192     {
1193       if (dest->elems[id] == src->elems[is])
1194         is--, id--;
1195       else if (dest->elems[id] < src->elems[is])
1196         dest->elems[--sbase] = src->elems[is--];
1197       else /* if (dest->elems[id] > src->elems[is]) */
1198         --id;
1199     }
1200
1201   if (is >= 0)
1202     {
1203       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1204       sbase -= is + 1;
1205       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1206     }
1207
1208   id = dest->nelem - 1;
1209   is = dest->nelem + 2 * src->nelem - 1;
1210   delta = is - sbase + 1;
1211   if (delta == 0)
1212     return REG_NOERROR;
1213
1214   /* Now copy.  When DELTA becomes zero, the remaining
1215      DEST elements are already in place.  */
1216   dest->nelem += delta;
1217   for (;;)
1218     {
1219       if (dest->elems[is] > dest->elems[id])
1220         {
1221           /* Copy from the top.  */
1222           dest->elems[id + delta--] = dest->elems[is--];
1223           if (delta == 0)
1224             break;
1225         }
1226       else
1227         {
1228           /* Slide from the bottom.  */
1229           dest->elems[id + delta] = dest->elems[id];
1230           if (--id < 0)
1231             {
1232               /* Copy remaining SRC elements.  */
1233               memcpy (dest->elems, dest->elems + sbase,
1234                       delta * sizeof (Idx));
1235               break;
1236             }
1237         }
1238     }
1239
1240   return REG_NOERROR;
1241 }
1242
1243 /* Insert the new element ELEM to the re_node_set* SET.
1244    SET should not already have ELEM.
1245    Return true if successful.  */
1246
1247 static bool
1248 __attribute_warn_unused_result__
1249 re_node_set_insert (re_node_set *set, Idx elem)
1250 {
1251   Idx idx;
1252   /* In case the set is empty.  */
1253   if (set->alloc == 0)
1254     return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
1255
1256   if (__glibc_unlikely (set->nelem) == 0)
1257     {
1258       /* Although we already guaranteed above that set->alloc != 0 and
1259          therefore set->elems != NULL, add a debug assertion to pacify
1260          GCC 11.2 -fanalyzer.  */
1261       DEBUG_ASSERT (set->elems);
1262       set->elems[0] = elem;
1263       ++set->nelem;
1264       return true;
1265     }
1266
1267   /* Realloc if we need.  */
1268   if (set->alloc == set->nelem)
1269     {
1270       Idx *new_elems;
1271       set->alloc = set->alloc * 2;
1272       new_elems = re_realloc (set->elems, Idx, set->alloc);
1273       if (__glibc_unlikely (new_elems == NULL))
1274         return false;
1275       set->elems = new_elems;
1276     }
1277
1278   /* Move the elements which follows the new element.  Test the
1279      first element separately to skip a check in the inner loop.  */
1280   if (elem < set->elems[0])
1281     {
1282       for (idx = set->nelem; idx > 0; idx--)
1283         set->elems[idx] = set->elems[idx - 1];
1284     }
1285   else
1286     {
1287       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1288         set->elems[idx] = set->elems[idx - 1];
1289       DEBUG_ASSERT (set->elems[idx - 1] < elem);
1290     }
1291
1292   /* Insert the new element.  */
1293   set->elems[idx] = elem;
1294   ++set->nelem;
1295   return true;
1296 }
1297
1298 /* Insert the new element ELEM to the re_node_set* SET.
1299    SET should not already have any element greater than or equal to ELEM.
1300    Return true if successful.  */
1301
1302 static bool
1303 __attribute_warn_unused_result__
1304 re_node_set_insert_last (re_node_set *set, Idx elem)
1305 {
1306   /* Realloc if we need.  */
1307   if (set->alloc == set->nelem)
1308     {
1309       Idx *new_elems;
1310       set->alloc = (set->alloc + 1) * 2;
1311       new_elems = re_realloc (set->elems, Idx, set->alloc);
1312       if (__glibc_unlikely (new_elems == NULL))
1313         return false;
1314       set->elems = new_elems;
1315     }
1316
1317   /* Insert the new element.  */
1318   set->elems[set->nelem++] = elem;
1319   return true;
1320 }
1321
1322 /* Compare two node sets SET1 and SET2.
1323    Return true if SET1 and SET2 are equivalent.  */
1324
1325 static bool
1326 __attribute__ ((pure))
1327 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1328 {
1329   Idx i;
1330   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1331     return false;
1332   for (i = set1->nelem ; --i >= 0 ; )
1333     if (set1->elems[i] != set2->elems[i])
1334       return false;
1335   return true;
1336 }
1337
1338 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1339
1340 static Idx
1341 __attribute__ ((pure))
1342 re_node_set_contains (const re_node_set *set, Idx elem)
1343 {
1344   __re_size_t idx, right, mid;
1345   if (set->nelem <= 0)
1346     return 0;
1347
1348   /* Binary search the element.  */
1349   idx = 0;
1350   right = set->nelem - 1;
1351   while (idx < right)
1352     {
1353       mid = (idx + right) / 2;
1354       if (set->elems[mid] < elem)
1355         idx = mid + 1;
1356       else
1357         right = mid;
1358     }
1359   return set->elems[idx] == elem ? idx + 1 : 0;
1360 }
1361
1362 static void
1363 re_node_set_remove_at (re_node_set *set, Idx idx)
1364 {
1365   if (idx < 0 || idx >= set->nelem)
1366     return;
1367   --set->nelem;
1368   for (; idx < set->nelem; idx++)
1369     set->elems[idx] = set->elems[idx + 1];
1370 }
1371 \f
1372
1373 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1374    Or return -1 if an error occurred.  */
1375
1376 static Idx
1377 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1378 {
1379   if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
1380     {
1381       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1382       Idx *new_nexts, *new_indices;
1383       re_node_set *new_edests, *new_eclosures;
1384       re_token_t *new_nodes;
1385
1386       /* Avoid overflows in realloc.  */
1387       const size_t max_object_size = MAX (sizeof (re_token_t),
1388                                           MAX (sizeof (re_node_set),
1389                                                sizeof (Idx)));
1390       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1391                             < new_nodes_alloc))
1392         return -1;
1393
1394       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1395       if (__glibc_unlikely (new_nodes == NULL))
1396         return -1;
1397       dfa->nodes = new_nodes;
1398       dfa->nodes_alloc = new_nodes_alloc;
1399       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1400       if (new_nexts != NULL)
1401         dfa->nexts = new_nexts;
1402       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1403       if (new_indices != NULL)
1404         dfa->org_indices = new_indices;
1405       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1406       if (new_edests != NULL)
1407         dfa->edests = new_edests;
1408       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1409       if (new_eclosures != NULL)
1410         dfa->eclosures = new_eclosures;
1411       if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
1412                             || new_edests == NULL || new_eclosures == NULL))
1413         return -1;
1414     }
1415   dfa->nodes[dfa->nodes_len] = token;
1416   dfa->nodes[dfa->nodes_len].constraint = 0;
1417   dfa->nodes[dfa->nodes_len].accept_mb =
1418     ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1419      || token.type == COMPLEX_BRACKET);
1420   dfa->nexts[dfa->nodes_len] = -1;
1421   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1422   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1423   return dfa->nodes_len++;
1424 }
1425
1426 static re_hashval_t
1427 calc_state_hash (const re_node_set *nodes, unsigned int context)
1428 {
1429   re_hashval_t hash = nodes->nelem + context;
1430   Idx i;
1431   for (i = 0 ; i < nodes->nelem ; i++)
1432     hash += nodes->elems[i];
1433   return hash;
1434 }
1435
1436 /* Search for the state whose node_set is equivalent to NODES.
1437    Return the pointer to the state, if we found it in the DFA.
1438    Otherwise create the new one and return it.  In case of an error
1439    return NULL and set the error code in ERR.
1440    Note: - We assume NULL as the invalid state, then it is possible that
1441            return value is NULL and ERR is REG_NOERROR.
1442          - We never return non-NULL value in case of any errors, it is for
1443            optimization.  */
1444
1445 static re_dfastate_t *
1446 __attribute_warn_unused_result__
1447 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1448                   const re_node_set *nodes)
1449 {
1450   re_hashval_t hash;
1451   re_dfastate_t *new_state;
1452   struct re_state_table_entry *spot;
1453   Idx i;
1454 #if defined GCC_LINT || defined lint
1455   /* Suppress bogus uninitialized-variable warnings.  */
1456   *err = REG_NOERROR;
1457 #endif
1458   if (__glibc_unlikely (nodes->nelem == 0))
1459     {
1460       *err = REG_NOERROR;
1461       return NULL;
1462     }
1463   hash = calc_state_hash (nodes, 0);
1464   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1465
1466   for (i = 0 ; i < spot->num ; i++)
1467     {
1468       re_dfastate_t *state = spot->array[i];
1469       if (hash != state->hash)
1470         continue;
1471       if (re_node_set_compare (&state->nodes, nodes))
1472         return state;
1473     }
1474
1475   /* There are no appropriate state in the dfa, create the new one.  */
1476   new_state = create_ci_newstate (dfa, nodes, hash);
1477   if (__glibc_unlikely (new_state == NULL))
1478     *err = REG_ESPACE;
1479
1480   return new_state;
1481 }
1482
1483 /* Search for the state whose node_set is equivalent to NODES and
1484    whose context is equivalent to CONTEXT.
1485    Return the pointer to the state, if we found it in the DFA.
1486    Otherwise create the new one and return it.  In case of an error
1487    return NULL and set the error code in ERR.
1488    Note: - We assume NULL as the invalid state, then it is possible that
1489            return value is NULL and ERR is REG_NOERROR.
1490          - We never return non-NULL value in case of any errors, it is for
1491            optimization.  */
1492
1493 static re_dfastate_t *
1494 __attribute_warn_unused_result__
1495 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1496                           const re_node_set *nodes, unsigned int context)
1497 {
1498   re_hashval_t hash;
1499   re_dfastate_t *new_state;
1500   struct re_state_table_entry *spot;
1501   Idx i;
1502 #if defined GCC_LINT || defined lint
1503   /* Suppress bogus uninitialized-variable warnings.  */
1504   *err = REG_NOERROR;
1505 #endif
1506   if (nodes->nelem == 0)
1507     {
1508       *err = REG_NOERROR;
1509       return NULL;
1510     }
1511   hash = calc_state_hash (nodes, context);
1512   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1513
1514   for (i = 0 ; i < spot->num ; i++)
1515     {
1516       re_dfastate_t *state = spot->array[i];
1517       if (state->hash == hash
1518           && state->context == context
1519           && re_node_set_compare (state->entrance_nodes, nodes))
1520         return state;
1521     }
1522   /* There are no appropriate state in 'dfa', create the new one.  */
1523   new_state = create_cd_newstate (dfa, nodes, context, hash);
1524   if (__glibc_unlikely (new_state == NULL))
1525     *err = REG_ESPACE;
1526
1527   return new_state;
1528 }
1529
1530 /* Finish initialization of the new state NEWSTATE, and using its hash value
1531    HASH put in the appropriate bucket of DFA's state table.  Return value
1532    indicates the error code if failed.  */
1533
1534 static reg_errcode_t
1535 __attribute_warn_unused_result__
1536 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1537                 re_hashval_t hash)
1538 {
1539   struct re_state_table_entry *spot;
1540   reg_errcode_t err;
1541   Idx i;
1542
1543   newstate->hash = hash;
1544   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1545   if (__glibc_unlikely (err != REG_NOERROR))
1546     return REG_ESPACE;
1547   for (i = 0; i < newstate->nodes.nelem; i++)
1548     {
1549       Idx elem = newstate->nodes.elems[i];
1550       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1551         if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1552           return REG_ESPACE;
1553     }
1554
1555   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1556   if (__glibc_unlikely (spot->alloc <= spot->num))
1557     {
1558       Idx new_alloc = 2 * spot->num + 2;
1559       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1560                                               new_alloc);
1561       if (__glibc_unlikely (new_array == NULL))
1562         return REG_ESPACE;
1563       spot->array = new_array;
1564       spot->alloc = new_alloc;
1565     }
1566   spot->array[spot->num++] = newstate;
1567   return REG_NOERROR;
1568 }
1569
1570 static void
1571 free_state (re_dfastate_t *state)
1572 {
1573   re_node_set_free (&state->non_eps_nodes);
1574   re_node_set_free (&state->inveclosure);
1575   if (state->entrance_nodes != &state->nodes)
1576     {
1577       re_node_set_free (state->entrance_nodes);
1578       re_free (state->entrance_nodes);
1579     }
1580   re_node_set_free (&state->nodes);
1581   re_free (state->word_trtable);
1582   re_free (state->trtable);
1583   re_free (state);
1584 }
1585
1586 /* Create the new state which is independent of contexts.
1587    Return the new state if succeeded, otherwise return NULL.  */
1588
1589 static re_dfastate_t *
1590 __attribute_warn_unused_result__
1591 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1592                     re_hashval_t hash)
1593 {
1594   Idx i;
1595   reg_errcode_t err;
1596   re_dfastate_t *newstate;
1597
1598   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1599   if (__glibc_unlikely (newstate == NULL))
1600     return NULL;
1601   err = re_node_set_init_copy (&newstate->nodes, nodes);
1602   if (__glibc_unlikely (err != REG_NOERROR))
1603     {
1604       re_free (newstate);
1605       return NULL;
1606     }
1607
1608   newstate->entrance_nodes = &newstate->nodes;
1609   for (i = 0 ; i < nodes->nelem ; i++)
1610     {
1611       re_token_t *node = dfa->nodes + nodes->elems[i];
1612       re_token_type_t type = node->type;
1613       if (type == CHARACTER && !node->constraint)
1614         continue;
1615       newstate->accept_mb |= node->accept_mb;
1616
1617       /* If the state has the halt node, the state is a halt state.  */
1618       if (type == END_OF_RE)
1619         newstate->halt = 1;
1620       else if (type == OP_BACK_REF)
1621         newstate->has_backref = 1;
1622       else if (type == ANCHOR || node->constraint)
1623         newstate->has_constraint = 1;
1624     }
1625   err = register_state (dfa, newstate, hash);
1626   if (__glibc_unlikely (err != REG_NOERROR))
1627     {
1628       free_state (newstate);
1629       newstate = NULL;
1630     }
1631   return newstate;
1632 }
1633
1634 /* Create the new state which is depend on the context CONTEXT.
1635    Return the new state if succeeded, otherwise return NULL.  */
1636
1637 static re_dfastate_t *
1638 __attribute_warn_unused_result__
1639 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1640                     unsigned int context, re_hashval_t hash)
1641 {
1642   Idx i, nctx_nodes = 0;
1643   reg_errcode_t err;
1644   re_dfastate_t *newstate;
1645
1646   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1647   if (__glibc_unlikely (newstate == NULL))
1648     return NULL;
1649   err = re_node_set_init_copy (&newstate->nodes, nodes);
1650   if (__glibc_unlikely (err != REG_NOERROR))
1651     {
1652       re_free (newstate);
1653       return NULL;
1654     }
1655
1656   newstate->context = context;
1657   newstate->entrance_nodes = &newstate->nodes;
1658
1659   for (i = 0 ; i < nodes->nelem ; i++)
1660     {
1661       re_token_t *node = dfa->nodes + nodes->elems[i];
1662       re_token_type_t type = node->type;
1663       unsigned int constraint = node->constraint;
1664
1665       if (type == CHARACTER && !constraint)
1666         continue;
1667       newstate->accept_mb |= node->accept_mb;
1668
1669       /* If the state has the halt node, the state is a halt state.  */
1670       if (type == END_OF_RE)
1671         newstate->halt = 1;
1672       else if (type == OP_BACK_REF)
1673         newstate->has_backref = 1;
1674
1675       if (constraint)
1676         {
1677           if (newstate->entrance_nodes == &newstate->nodes)
1678             {
1679               re_node_set *entrance_nodes = re_malloc (re_node_set, 1);
1680               if (__glibc_unlikely (entrance_nodes == NULL))
1681                 {
1682                   free_state (newstate);
1683                   return NULL;
1684                 }
1685               newstate->entrance_nodes = entrance_nodes;
1686               if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1687                   != REG_NOERROR)
1688                 {
1689                   free_state (newstate);
1690                   return NULL;
1691                 }
1692               nctx_nodes = 0;
1693               newstate->has_constraint = 1;
1694             }
1695
1696           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1697             {
1698               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1699               ++nctx_nodes;
1700             }
1701         }
1702     }
1703   err = register_state (dfa, newstate, hash);
1704   if (__glibc_unlikely (err != REG_NOERROR))
1705     {
1706       free_state (newstate);
1707       newstate = NULL;
1708     }
1709   return  newstate;
1710 }