lib/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2020 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, see
  18    <https://www.gnu.org/licenses/>.  */
  19
     /* Forward declarations of file-local helpers.  */
  20 static void re_string_construct_common (const char *str, Idx len,
  21                                         re_string_t *pstr,
  22                                         RE_TRANSLATE_TYPE trans, bool icase,
  23                                         const re_dfa_t *dfa);
     /* DFA state creation helpers.  */
  24 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  25                                           const re_node_set *nodes,
  26                                           re_hashval_t hash);
  27 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  28                                           const re_node_set *nodes,
  29                                           unsigned int context,
  30                                           re_hashval_t hash);
     /* Buffer (re)allocation and the routines that fill the buffers of a
        re_string_t.  */
  31 static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr,
  32                                                 Idx new_buf_len);
  33 #ifdef RE_ENABLE_I18N
  34 static void build_wcs_buffer (re_string_t *pstr);
  35 static reg_errcode_t build_wcs_upper_buffer (re_string_t *pstr);
  36 #endif /* RE_ENABLE_I18N */
  37 static void build_upper_buffer (re_string_t *pstr);
  38 static void re_string_translate_buffer (re_string_t *pstr);
     /* Context lookup; re_string_reconstruct stores its result in
        tip_context.  */
  39 static unsigned int re_string_context_at (const re_string_t *input, Idx idx,
  40                                           int eflags) __attribute__ ((pure));
  41 \f
  42 /* Functions for string operations.  */
  43
  44 /* Set up PSTR for the string STR of length LEN and allocate its
        buffers without filling them.  INIT_LEN is raised to at least
        DFA->mb_cur_max, and the initial buffer length is the smaller of
        LEN + 1 and INIT_LEN.  Return REG_NOERROR on success, or the error
        reported by re_string_realloc_buffers.  It is necessary to call
  45    re_string_reconstruct before using the object.  */
  46
  47 static reg_errcode_t
  48 __attribute_warn_unused_result__
  49 re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
  50                     RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  51 {
  52   reg_errcode_t ret;
  53   Idx init_buf_len;
  54
  55   /* Ensure at least one character fits into the buffers.  */
  56   if (init_len < dfa->mb_cur_max)
  57     init_len = dfa->mb_cur_max;
  58   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  59   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  60
  61   ret = re_string_realloc_buffers (pstr, init_buf_len);
  62   if (__glibc_unlikely (ret != REG_NOERROR))
  63     return ret;
  64
  65   pstr->word_char = dfa->word_char;
  66   pstr->word_ops_used = dfa->word_ops_used;
       /* Without a private copy, MBS aliases the caller's string.  */
  67   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
       /* Only a single-byte string that needs no private copy is usable
          as it stands; in every other case nothing is valid yet.  */
  68   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  69   pstr->valid_raw_len = pstr->valid_len;
  70   return REG_NOERROR;
  71 }
  72
  73 /* This function allocates the buffers and initializes them.
        PSTR is zeroed and then set up for STR, LEN, TRANS, ICASE and DFA.
        When LEN is positive, buffers of LEN + 1 elements are allocated.
        The buffers are then filled by the builder selected by ICASE,
        DFA->mb_cur_max and TRANS.  Return REG_NOERROR on success, or the
        first error reported by re_string_realloc_buffers or
        build_wcs_upper_buffer.  */
  74
  75 static reg_errcode_t
  76 __attribute_warn_unused_result__
  77 re_string_construct (re_string_t *pstr, const char *str, Idx len,
  78                      RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  79 {
  80   reg_errcode_t ret;
  81   memset (pstr, '\0', sizeof (re_string_t));
  82   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  83
  84   if (len > 0)
  85     {
  86       ret = re_string_realloc_buffers (pstr, len + 1);
  87       if (__glibc_unlikely (ret != REG_NOERROR))
  88         return ret;
  89     }
       /* Without a private copy, MBS aliases the caller's string.  */
  90   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  91
  92   if (icase)
  93     {
  94 #ifdef RE_ENABLE_I18N
  95       if (dfa->mb_cur_max > 1)
  96         {
               /* Keep converting, doubling the buffers each round, until
                  the whole raw string has been consumed or more than
                  mb_cur_max elements of room remain in the buffers.  */
  97           while (1)
  98             {
  99               ret = build_wcs_upper_buffer (pstr);
 100               if (__glibc_unlikely (ret != REG_NOERROR))
 101                 return ret;
 102               if (pstr->valid_raw_len >= len)
 103                 break;
 104               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
 105                 break;
 106               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
 107               if (__glibc_unlikely (ret != REG_NOERROR))
 108                 return ret;
 109             }
 110         }
 111       else
 112 #endif /* RE_ENABLE_I18N  */
 113         build_upper_buffer (pstr);
 114     }
 115   else
 116     {
 117 #ifdef RE_ENABLE_I18N
 118       if (dfa->mb_cur_max > 1)
 119         build_wcs_buffer (pstr);
 120       else
 121 #endif /* RE_ENABLE_I18N  */
 122         {
 123           if (trans != NULL)
 124             re_string_translate_buffer (pstr);
 125           else
 126             {
                   /* Nothing to convert: the whole buffer length counts
                      as valid.  */
 127               pstr->valid_len = pstr->bufs_len;
 128               pstr->valid_raw_len = pstr->bufs_len;
 129             }
 130         }
 131     }
 132
 133   return REG_NOERROR;
 134 }
 135
 136 /* Helper functions for re_string_allocate and re_string_construct.  */
 137
     /* Resize the buffers of PSTR to NEW_BUF_LEN elements and record the
        new length in PSTR->bufs_len.  With RE_ENABLE_I18N and
        PSTR->mb_cur_max > 1 this covers PSTR->wcs and, if it is already
        allocated, PSTR->offsets; PSTR->mbs is resized only when
        PSTR->mbs_allocated is set.  Return REG_ESPACE if NEW_BUF_LEN is
        too large or a reallocation fails; bufs_len is then left
        unchanged, although arrays reallocated earlier in the call keep
        their new pointer.  Return REG_NOERROR otherwise.  */
 138 static reg_errcode_t
 139 __attribute_warn_unused_result__
 140 re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
 141 {
 142 #ifdef RE_ENABLE_I18N
 143   if (pstr->mb_cur_max > 1)
 144     {
 145       wint_t *new_wcs;
 146
 147       /* Avoid overflow in realloc.  */
 148       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
 149       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
 150                             < new_buf_len))
 151         return REG_ESPACE;
 152
 153       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 154       if (__glibc_unlikely (new_wcs == NULL))
 155         return REG_ESPACE;
 156       pstr->wcs = new_wcs;
           /* The offsets array is only resized here, never created.  */
 157       if (pstr->offsets != NULL)
 158         {
 159           Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
 160           if (__glibc_unlikely (new_offsets == NULL))
 161             return REG_ESPACE;
 162           pstr->offsets = new_offsets;
 163         }
 164     }
 165 #endif /* RE_ENABLE_I18N  */
 166   if (pstr->mbs_allocated)
 167     {
 168       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 169                                            new_buf_len);
 170       if (__glibc_unlikely (new_mbs == NULL))
 171         return REG_ESPACE;
 172       pstr->mbs = new_mbs;
 173     }
 174   pstr->bufs_len = new_buf_len;
 175   return REG_NOERROR;
 176 }
 177
 178
     /* Initialize the fields of PSTR that describe the input: the raw
        string STR and its length LEN, the translation table TRANS, the
        ICASE flag, and the mb_cur_max, is_utf8 and map_notascii settings
        copied from DFA.  PSTR->mbs_allocated is set when TRANS is non-null
        or ICASE is true.  PSTR->stop and PSTR->raw_stop start out equal
        to LEN.  Nothing is allocated here.  */
 179 static void
 180 re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
 181                             RE_TRANSLATE_TYPE trans, bool icase,
 182                             const re_dfa_t *dfa)
 183 {
 184   pstr->raw_mbs = (const unsigned char *) str;
 185   pstr->len = len;
 186   pstr->raw_len = len;
 187   pstr->trans = trans;
 188   pstr->icase = icase;
 189   pstr->mbs_allocated = (trans != NULL || icase);
 190   pstr->mb_cur_max = dfa->mb_cur_max;
 191   pstr->is_utf8 = dfa->is_utf8;
 192   pstr->map_notascii = dfa->map_notascii;
 193   pstr->stop = pstr->len;
 194   pstr->raw_stop = pstr->stop;
 195 }
 196
 197 #ifdef RE_ENABLE_I18N
 198
 199 /* Build wide character buffer PSTR->WCS.
 200    If the byte sequence of the string is:
 201      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 202    Then wide character buffer will be:
 203      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 204    We use WEOF for padding; it indicates that the position isn't
 205    the first byte of a multibyte character.
 206
 207    Note that this function assumes PSTR->VALID_LEN elements are already
        built and starts from PSTR->VALID_LEN.  On return PSTR->VALID_LEN
        and PSTR->VALID_RAW_LEN are both set to the index reached.  When
        PSTR->TRANS is set, the translated bytes are also stored in
 208    PSTR->MBS.  */
 209
 210 static void
 211 build_wcs_buffer (re_string_t *pstr)
 212 {
 213 #ifdef _LIBC
 214   unsigned char buf[MB_LEN_MAX];
 215   DEBUG_ASSERT (MB_LEN_MAX >= pstr->mb_cur_max);
 216 #else
 217   unsigned char buf[64];
 218 #endif
 219   mbstate_t prev_st;
 220   Idx byte_idx, end_idx, remain_len;
 221   size_t mbclen;
 222
 223   /* Build the buffers from pstr->valid_len to either pstr->len or
 224      pstr->bufs_len.  */
 225   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 226   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 227     {
 228       wchar_t wc;
 229       const char *p;
 230
 231       remain_len = end_idx - byte_idx;
           /* Remember the state so a failed conversion can be undone.  */
 232       prev_st = pstr->cur_state;
 233       /* Apply the translation if we need.  */
 234       if (__glibc_unlikely (pstr->trans != NULL))
 235         {
 236           int i, ch;
 237
 238           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 239             {
 240               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 241               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 242             }
 243           p = (const char *) buf;
 244         }
 245       else
 246         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 247       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
           /* An incomplete sequence ((size_t) -2) is only final when the
              buffers already cover the whole string.  */
 248       if (__glibc_unlikely (mbclen == (size_t) -1 || mbclen == 0
 249                             || (mbclen == (size_t) -2
 250                                 && pstr->bufs_len >= pstr->len)))
 251         {
 252           /* We treat these cases as a singlebyte character.  */
 253           mbclen = 1;
 254           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 255           if (__glibc_unlikely (pstr->trans != NULL))
 256             wc = pstr->trans[wc];
 257           pstr->cur_state = prev_st;
 258         }
 259       else if (__glibc_unlikely (mbclen == (size_t) -2))
 260         {
 261           /* The buffer doesn't have enough space, finish to build.  */
 262           pstr->cur_state = prev_st;
 263           break;
 264         }
 265
 266       /* Write wide character and padding.  */
 267       pstr->wcs[byte_idx++] = wc;
 268       /* Write paddings.  */
 269       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 270         pstr->wcs[byte_idx++] = WEOF;
 271     }
 272   pstr->valid_len = byte_idx;
 273   pstr->valid_raw_len = byte_idx;
 274 }
 275
 276 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
        but for REG_ICASE: each character is converted with __towupper and
        its upper-case multibyte form is stored in PSTR->MBS.

        There are two loops.  The first is used when PSTR->map_notascii
        and PSTR->offsets_needed are clear and PSTR->trans is null; in it
        the buffer index and the raw index stay equal.  The second loop
        also applies PSTR->trans and copes with upper-case forms whose
        byte length differs from the source: it then records, in
        PSTR->offsets, the raw index each buffer byte came from, and
        adjusts PSTR->len and PSTR->stop.  The first loop jumps into the
        second as soon as it meets such a character.

        Return REG_ESPACE if PSTR->offsets cannot be allocated,
 277    REG_NOERROR otherwise.  */
 278
 279 static reg_errcode_t
 280 __attribute_warn_unused_result__
 281 build_wcs_upper_buffer (re_string_t *pstr)
 282 {
 283   mbstate_t prev_st;
 284   Idx src_idx, byte_idx, end_idx, remain_len;
 285   size_t mbclen;
 286 #ifdef _LIBC
 287   char buf[MB_LEN_MAX];
 288   DEBUG_ASSERT (pstr->mb_cur_max <= MB_LEN_MAX);
 289 #else
 290   char buf[64];
 291 #endif
 292
 293   byte_idx = pstr->valid_len;
 294   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 295
 296   /* The following optimization assumes that ASCII characters can be
 297      mapped to wide characters with a simple cast.  */
 298   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 299     {
 300       while (byte_idx < end_idx)
 301         {
 302           wchar_t wc;
 303
 304           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 305               && mbsinit (&pstr->cur_state))
 306             {
 307               /* In case of a singlebyte character.  */
 308               pstr->mbs[byte_idx]
 309                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 310               /* The next step uses the assumption that wchar_t is encoded
 311                  ASCII-safe: all ASCII values can be converted like this.  */
 312               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 313               ++byte_idx;
 314               continue;
 315             }
 316
 317           remain_len = end_idx - byte_idx;
 318           prev_st = pstr->cur_state;
 319           mbclen = __mbrtowc (&wc,
 320                               ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 321                                + byte_idx), remain_len, &pstr->cur_state);
 322           if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
 323             {
 324               wchar_t wcu = __towupper (wc);
 325               if (wcu != wc)
 326                 {
 327                   size_t mbcdlen;
 328
                       /* Re-encode the upper-case character, starting from
                          the state saved before the conversion.  */
 329                   mbcdlen = __wcrtomb (buf, wcu, &prev_st);
 330                   if (__glibc_likely (mbclen == mbcdlen))
 331                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 332                   else
 333                     {
                           /* The lengths differ (or __wcrtomb failed):
                              redo this character in the general loop
                              below.  */
 334                       src_idx = byte_idx;
 335                       goto offsets_needed;
 336                     }
 337                 }
 338               else
 339                 memcpy (pstr->mbs + byte_idx,
 340                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 341               pstr->wcs[byte_idx++] = wcu;
 342               /* Write paddings.  */
 343               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 344                 pstr->wcs[byte_idx++] = WEOF;
 345             }
 346           else if (mbclen == (size_t) -1 || mbclen == 0
 347                    || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 348             {
 349               /* It is an invalid character, an incomplete character
 350                  at the end of the string, or '\0'.  Just use the byte.  */
 351               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 352               pstr->mbs[byte_idx] = ch;
 353               /* And also cast it to wide char.  */
 354               pstr->wcs[byte_idx++] = (wchar_t) ch;
 355               if (__glibc_unlikely (mbclen == (size_t) -1))
 356                 pstr->cur_state = prev_st;
 357             }
 358           else
 359             {
 360               /* The buffer doesn't have enough space, finish to build.  */
 361               pstr->cur_state = prev_st;
 362               break;
 363             }
 364         }
 365       pstr->valid_len = byte_idx;
 366       pstr->valid_raw_len = byte_idx;
 367       return REG_NOERROR;
 368     }
 369   else
         /* General loop: SRC_IDX indexes the raw string and BYTE_IDX the
            buffers; the two may differ.  */
 370     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 371       {
 372         wchar_t wc;
 373         const char *p;
             /* Also entered by goto from the loop above, with SRC_IDX
                equal to BYTE_IDX.  */
 374       offsets_needed:
 375         remain_len = end_idx - byte_idx;
 376         prev_st = pstr->cur_state;
 377         if (__glibc_unlikely (pstr->trans != NULL))
 378           {
 379             int i, ch;
 380
 381             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 382               {
 383                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 384                 buf[i] = pstr->trans[ch];
 385               }
 386             p = (const char *) buf;
 387           }
 388         else
 389           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 390         mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 391         if (__glibc_likely (0 < mbclen && mbclen < (size_t) -2))
 392           {
 393             wchar_t wcu = __towupper (wc);
 394             if (wcu != wc)
 395               {
 396                 size_t mbcdlen;
 397
 398                 mbcdlen = __wcrtomb ((char *) buf, wcu, &prev_st);
 399                 if (__glibc_likely (mbclen == mbcdlen))
 400                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 401                 else if (mbcdlen != (size_t) -1)
 402                   {
                         /* The upper-case form has a different byte
                            length than the source character.  */
 403                     size_t i;
 404
 405                     if (byte_idx + mbcdlen > pstr->bufs_len)
 406                       {
                             /* The converted character does not fit in
                                the buffers; stop here.  */
 407                         pstr->cur_state = prev_st;
 408                         break;
 409                       }
 410
 411                     if (pstr->offsets == NULL)
 412                       {
 413                         pstr->offsets = re_malloc (Idx, pstr->bufs_len);
 414
 415                         if (pstr->offsets == NULL)
 416                           return REG_ESPACE;
 417                       }
 418                     if (!pstr->offsets_needed)
 419                       {
                             /* Everything built so far maps one to one.  */
 420                         for (i = 0; i < (size_t) byte_idx; ++i)
 421                           pstr->offsets[i] = i;
 422                         pstr->offsets_needed = 1;
 423                       }
 424
 425                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 426                     pstr->wcs[byte_idx] = wcu;
 427                     pstr->offsets[byte_idx] = src_idx;
                         /* Map each further output byte to a byte of the
                            source character, clamped to its last byte.  */
 428                     for (i = 1; i < mbcdlen; ++i)
 429                       {
 430                         pstr->offsets[byte_idx + i]
 431                           = src_idx + (i < mbclen ? i : mbclen - 1);
 432                         pstr->wcs[byte_idx + i] = WEOF;
 433                       }
                         /* The buffer length, and the stop position if it
                            lies beyond this character, shift by the
                            difference in byte length.  */
 434                     pstr->len += mbcdlen - mbclen;
 435                     if (pstr->raw_stop > src_idx)
 436                       pstr->stop += mbcdlen - mbclen;
 437                     end_idx = (pstr->bufs_len > pstr->len)
 438                               ? pstr->len : pstr->bufs_len;
 439                     byte_idx += mbcdlen;
 440                     src_idx += mbclen;
 441                     continue;
 442                   }
 443                 else
                       /* __wcrtomb reported an error: copy MBCLEN bytes
                          from P instead.  */
 444                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 445               }
 446             else
 447               memcpy (pstr->mbs + byte_idx, p, mbclen);
 448
 449             if (__glibc_unlikely (pstr->offsets_needed != 0))
 450               {
 451                 size_t i;
 452                 for (i = 0; i < mbclen; ++i)
 453                   pstr->offsets[byte_idx + i] = src_idx + i;
 454               }
 455             src_idx += mbclen;
 456
 457             pstr->wcs[byte_idx++] = wcu;
 458             /* Write paddings.  */
 459             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 460               pstr->wcs[byte_idx++] = WEOF;
 461           }
 462         else if (mbclen == (size_t) -1 || mbclen == 0
 463                  || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
 464           {
 465             /* It is an invalid character or '\0'.  Just use the byte.  */
 466             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 467
 468             if (__glibc_unlikely (pstr->trans != NULL))
 469               ch = pstr->trans [ch];
 470             pstr->mbs[byte_idx] = ch;
 471
 472             if (__glibc_unlikely (pstr->offsets_needed != 0))
 473               pstr->offsets[byte_idx] = src_idx;
 474             ++src_idx;
 475
 476             /* And also cast it to wide char.  */
 477             pstr->wcs[byte_idx++] = (wchar_t) ch;
 478             if (__glibc_unlikely (mbclen == (size_t) -1))
 479               pstr->cur_state = prev_st;
 480           }
 481         else
 482           {
 483             /* The buffer doesn't have enough space, finish to build.  */
 484             pstr->cur_state = prev_st;
 485             break;
 486           }
 487       }
 488   pstr->valid_len = byte_idx;
 489   pstr->valid_raw_len = src_idx;
 490   return REG_NOERROR;
 491 }
 492
 493 /* Skip characters until the index becomes greater than or equal to
        NEW_RAW_IDX, starting at PSTR->raw_mbs_idx + PSTR->valid_raw_len.
        Store the last character skipped in *LAST_WC (WEOF if none was
        skipped).  Invalid or incomplete sequences and NUL count as one
        byte each.  PSTR->cur_state is advanced along the way.
 494    Return the index.  */
 495
 496 static Idx
 497 re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
 498 {
 499   mbstate_t prev_st;
 500   Idx rawbuf_idx;
 501   size_t mbclen;
 502   wint_t wc = WEOF;
 503
 504   /* Skip the characters which are not necessary to check.  */
 505   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 506        rawbuf_idx < new_raw_idx;)
 507     {
 508       wchar_t wc2;
 509       Idx remain_len = pstr->raw_len - rawbuf_idx;
 510       prev_st = pstr->cur_state;
 511       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
 512                           remain_len, &pstr->cur_state);
 513       if (__glibc_unlikely (mbclen == (size_t) -2 || mbclen == (size_t) -1
 514                             || mbclen == 0))
 515         {
 516           /* We treat these cases as a single byte character.  */
 517           if (mbclen == 0 || remain_len == 0)
 518             wc = L'\0';
 519           else
 520             wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 521           mbclen = 1;
               /* Undo whatever the failed conversion did to the state.  */
 522           pstr->cur_state = prev_st;
 523         }
 524       else
 525         wc = wc2;
 526       /* Then proceed the next character.  */
 527       rawbuf_idx += mbclen;
 528     }
 529   *last_wc = wc;
 530   return rawbuf_idx;
 531 }
 532 #endif /* RE_ENABLE_I18N  */
 533
 534 /* Build the buffer PSTR->MBS from PSTR->valid_len up to the smaller of
        PSTR->bufs_len and PSTR->len: each raw byte is translated through
        PSTR->trans, if that is set, and then passed through toupper.
        PSTR->valid_len and PSTR->valid_raw_len are set to the index
        reached.
 535    This function is used in case of REG_ICASE.  */
 536
 537 static void
 538 build_upper_buffer (re_string_t *pstr)
 539 {
 540   Idx char_idx, end_idx;
 541   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 542
 543   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 544     {
 545       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 546       if (__glibc_unlikely (pstr->trans != NULL))
 547         ch = pstr->trans[ch];
 548       pstr->mbs[char_idx] = toupper (ch);
 549     }
 550   pstr->valid_len = char_idx;
 551   pstr->valid_raw_len = char_idx;
 552 }
 553
 554 /* Apply TRANS to the buffer in PSTR: store the translation of each raw
        byte, from PSTR->valid_len up to the smaller of PSTR->bufs_len and
        PSTR->len, into PSTR->MBS, then set PSTR->valid_len and
        PSTR->valid_raw_len to the index reached.  PSTR->trans is indexed
        without a null check, so it must be set.  */
 555
 556 static void
 557 re_string_translate_buffer (re_string_t *pstr)
 558 {
 559   Idx buf_idx, end_idx;
 560   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 561
 562   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 563     {
 564       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 565       pstr->mbs[buf_idx] = pstr->trans[ch];
 566     }
 567
 568   pstr->valid_len = buf_idx;
 569   pstr->valid_raw_len = buf_idx;
 570 }
 571
 572 /* This function re-constructs the buffers so that they start at raw
        index IDX.
 573    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
        convert to upper case in case of REG_ICASE, apply translation.
        Characters already built at or after IDX are moved to the front of
        the buffers where that is possible; otherwise the buffers are
        rebuilt.  When the start position moves, PSTR->tip_context is set
        to the context just before the new start, using EFLAGS.
        PSTR->cur_idx is reset to 0.  Return REG_NOERROR on success, or
 574    the error reported by build_wcs_upper_buffer.  */
 575
 576 static reg_errcode_t
 577 __attribute_warn_unused_result__
 578 re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
 579 {
 580   Idx offset;
 581
       /* IDX at or after the current start: advance by the difference.  */
 582   if (__glibc_unlikely (pstr->raw_mbs_idx <= idx))
 583     offset = idx - pstr->raw_mbs_idx;
 584   else
 585     {
 586       /* Reset buffer.  */
 587 #ifdef RE_ENABLE_I18N
 588       if (pstr->mb_cur_max > 1)
 589         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 590 #endif /* RE_ENABLE_I18N */
 591       pstr->len = pstr->raw_len;
 592       pstr->stop = pstr->raw_stop;
 593       pstr->valid_len = 0;
 594       pstr->raw_mbs_idx = 0;
 595       pstr->valid_raw_len = 0;
 596       pstr->offsets_needed = 0;
 597       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 598                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 599       if (!pstr->mbs_allocated)
 600         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 601       offset = idx;
 602     }
 603
 604   if (__glibc_likely (offset != 0))
 605     {
 606       /* Should the already checked characters be kept?  */
 607       if (__glibc_likely (offset < pstr->valid_raw_len))
 608         {
 609           /* Yes, move them to the front of the buffer.  */
 610 #ifdef RE_ENABLE_I18N
 611           if (__glibc_unlikely (pstr->offsets_needed))
 612             {
                   /* Binary search PSTR->offsets for a position whose raw
                      offset is OFFSET; if the search ends on a smaller
                      offset, step to the next position.  */
 613               Idx low = 0, high = pstr->valid_len, mid;
 614               do
 615                 {
 616                   mid = (high + low) / 2;
 617                   if (pstr->offsets[mid] > offset)
 618                     high = mid;
 619                   else if (pstr->offsets[mid] < offset)
 620                     low = mid + 1;
 621                   else
 622                     break;
 623                 }
 624               while (low < high);
 625               if (pstr->offsets[mid] < offset)
 626                 ++mid;
 627               pstr->tip_context = re_string_context_at (pstr, mid - 1,
 628                                                         eflags);
 629               /* This can be quite complicated, so handle specially
 630                  only the common and easy case where the character with
 631                  different length representation of lower and upper
 632                  case is present at or after offset.  */
 633               if (pstr->valid_len > offset
 634                   && mid == offset && pstr->offsets[mid] == offset)
 635                 {
 636                   memmove (pstr->wcs, pstr->wcs + offset,
 637                            (pstr->valid_len - offset) * sizeof (wint_t));
 638                   memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
 639                   pstr->valid_len -= offset;
 640                   pstr->valid_raw_len -= offset;
                       /* Rebase the kept offsets on the new start.  */
 641                   for (low = 0; low < pstr->valid_len; low++)
 642                     pstr->offsets[low] = pstr->offsets[low + offset] - offset;
 643                 }
 644               else
 645                 {
 646                   /* Otherwise, just find out how long the partial multibyte
 647                      character at offset is and fill it with WEOF/255.  */
 648                   pstr->len = pstr->raw_len - idx + offset;
 649                   pstr->stop = pstr->raw_stop - idx + offset;
 650                   pstr->offsets_needed = 0;
 651                   while (mid > 0 && pstr->offsets[mid - 1] == offset)
 652                     --mid;
 653                   while (mid < pstr->valid_len)
 654                     if (pstr->wcs[mid] != WEOF)
 655                       break;
 656                     else
 657                       ++mid;
 658                   if (mid == pstr->valid_len)
 659                     pstr->valid_len = 0;
 660                   else
 661                     {
 662                       pstr->valid_len = pstr->offsets[mid] - offset;
 663                       if (pstr->valid_len)
 664                         {
 665                           for (low = 0; low < pstr->valid_len; ++low)
 666                             pstr->wcs[low] = WEOF;
 667                           memset (pstr->mbs, 255, pstr->valid_len);
 668                         }
 669                     }
 670                   pstr->valid_raw_len = pstr->valid_len;
 671                 }
 672             }
 673           else
 674 #endif
 675             {
                   /* Buffer and raw indices coincide: shift the valid
                      part of each buffer down by OFFSET.  */
 676               pstr->tip_context = re_string_context_at (pstr, offset - 1,
 677                                                         eflags);
 678 #ifdef RE_ENABLE_I18N
 679               if (pstr->mb_cur_max > 1)
 680                 memmove (pstr->wcs, pstr->wcs + offset,
 681                          (pstr->valid_len - offset) * sizeof (wint_t));
 682 #endif /* RE_ENABLE_I18N */
 683               if (__glibc_unlikely (pstr->mbs_allocated))
 684                 memmove (pstr->mbs, pstr->mbs + offset,
 685                          pstr->valid_len - offset);
 686               pstr->valid_len -= offset;
 687               pstr->valid_raw_len -= offset;
 688               DEBUG_ASSERT (pstr->valid_len > 0);
 689             }
 690         }
 691       else
 692         {
 693 #ifdef RE_ENABLE_I18N
 694           /* No, skip all characters until IDX.  */
 695           Idx prev_valid_len = pstr->valid_len;
 696
 697           if (__glibc_unlikely (pstr->offsets_needed))
 698             {
 699               pstr->len = pstr->raw_len - idx + offset;
 700               pstr->stop = pstr->raw_stop - idx + offset;
 701               pstr->offsets_needed = 0;
 702             }
 703 #endif
 704           pstr->valid_len = 0;
 705 #ifdef RE_ENABLE_I18N
 706           if (pstr->mb_cur_max > 1)
 707             {
 708               Idx wcs_idx;
 709               wint_t wc = WEOF;
 710
 711               if (pstr->is_utf8)
 712                 {
 713                   const unsigned char *raw, *p, *end;
 714
 715                   /* Special case UTF-8.  Multi-byte chars start with any
 716                      byte other than 0x80 - 0xbf.  */
 717                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 718                   end = raw + (offset - pstr->mb_cur_max);
 719                   if (end < pstr->raw_mbs)
 720                     end = pstr->raw_mbs;
 721                   p = raw + offset - 1;
 722 #ifdef _LIBC
 723                   /* We know the wchar_t encoding is UCS4, so for the simple
 724                      case, ASCII characters, skip the conversion step.  */
 725                   if (isascii (*p) && __glibc_likely (pstr->trans == NULL))
 726                     {
 727                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 728                       /* pstr->valid_len = 0; */
 729                       wc = (wchar_t) *p;
 730                     }
 731                   else
 732 #endif
                         /* Walk backwards from the byte before the new
                            start to the nearest lead byte and decode the
                            character that begins there.  */
 733                     for (; p >= end; --p)
 734                       if ((*p & 0xc0) != 0x80)
 735                         {
 736                           mbstate_t cur_state;
 737                           wchar_t wc2;
 738                           Idx mlen = raw + pstr->len - p;
 739                           unsigned char buf[6];
 740                           size_t mbclen;
 741
 742                           const unsigned char *pp = p;
 743                           if (__glibc_unlikely (pstr->trans != NULL))
 744                             {
 745                               int i = mlen < 6 ? mlen : 6;
 746                               while (--i >= 0)
 747                                 buf[i] = pstr->trans[p[i]];
 748                               pp = buf;
 749                             }
 750                           /* XXX Don't use mbrtowc, we know which conversion
 751                              to use (UTF-8 -> UCS4).  */
 752                           memset (&cur_state, 0, sizeof (cur_state));
 753                           mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
 754                                               &cur_state);
                               /* Use the character only if it reaches the
                                  new start; the bytes of it that lie at or
                                  after the new start are counted in
                                  valid_len and padded below.  */
 755                           if (raw + offset - p <= mbclen
 756                               && mbclen < (size_t) -2)
 757                             {
 758                               memset (&pstr->cur_state, '\0',
 759                                       sizeof (mbstate_t));
 760                               pstr->valid_len = mbclen - (raw + offset - p);
 761                               wc = wc2;
 762                             }
 763                           break;
 764                         }
 765                 }
 766
                   /* No preceding character found yet: convert forward
                      from the old end of the valid data up to IDX.  */
 767               if (wc == WEOF)
 768                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
                   /* Still none: use the context at the end of the
                      previously valid buffer.  */
 769               if (wc == WEOF)
 770                 pstr->tip_context
 771                   = re_string_context_at (pstr, prev_valid_len - 1, eflags);
 772               else
 773                 pstr->tip_context = ((__glibc_unlikely (pstr->word_ops_used != 0)
 774                                       && IS_WIDE_WORD_CHAR (wc))
 775                                      ? CONTEXT_WORD
 776                                      : ((IS_WIDE_NEWLINE (wc)
 777                                          && pstr->newline_anchor)
 778                                         ? CONTEXT_NEWLINE : 0));
                   /* Pad the tail of a character that straddles the new
                      start with WEOF/255.  */
 779               if (__glibc_unlikely (pstr->valid_len))
 780                 {
 781                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 782                     pstr->wcs[wcs_idx] = WEOF;
 783                   if (pstr->mbs_allocated)
 784                     memset (pstr->mbs, 255, pstr->valid_len);
 785                 }
 786               pstr->valid_raw_len = pstr->valid_len;
 787             }
 788           else
 789 #endif /* RE_ENABLE_I18N */
 790             {
                   /* Single-byte case: the context comes from the
                      (translated) byte just before the new start.  */
 791               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 792               pstr->valid_raw_len = 0;
 793               if (pstr->trans)
 794                 c = pstr->trans[c];
 795               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 796                                    ? CONTEXT_WORD
 797                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 798                                       ? CONTEXT_NEWLINE : 0));
 799             }
 800         }
           /* When MBS is not a private copy, advance the pointer by
              OFFSET instead of moving data.  */
 801       if (!__glibc_unlikely (pstr->mbs_allocated))
 802         pstr->mbs += offset;
 803     }
 804   pstr->raw_mbs_idx = idx;
 805   pstr->len -= offset;
 806   pstr->stop -= offset;
 807
 808   /* Then build the buffers.  */
 809 #ifdef RE_ENABLE_I18N
 810   if (pstr->mb_cur_max > 1)
 811     {
 812       if (pstr->icase)
 813         {
 814           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 815           if (__glibc_unlikely (ret != REG_NOERROR))
 816             return ret;
 817         }
 818       else
 819         build_wcs_buffer (pstr);
 820     }
 821   else
 822 #endif /* RE_ENABLE_I18N */
 823     if (__glibc_unlikely (pstr->mbs_allocated))
 824       {
 825         if (pstr->icase)
 826           build_upper_buffer (pstr);
 827         else if (pstr->trans != NULL)
 828           re_string_translate_buffer (pstr);
 829       }
 830     else
           /* No private copy and single-byte: all remaining bytes are
              valid.  */
 831       pstr->valid_len = pstr->len;
 832
 833   pstr->cur_idx = 0;
 834   return REG_NOERROR;
 835 }
 836
 837 static unsigned char
 838 __attribute__ ((pure))
 839 re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
 840 {
 841   int ch;
 842   Idx off;
 843
 844   /* Handle the common (easiest) cases first.  */
 845   if (__glibc_likely (!pstr->mbs_allocated))
 846     return re_string_peek_byte (pstr, idx);
 847
 848 #ifdef RE_ENABLE_I18N
 849   if (pstr->mb_cur_max > 1
 850       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 851     return re_string_peek_byte (pstr, idx);
 852 #endif
 853
 854   off = pstr->cur_idx + idx;
 855 #ifdef RE_ENABLE_I18N
 856   if (pstr->offsets_needed)
 857     off = pstr->offsets[off];
 858 #endif
 859
 860   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 861
 862 #ifdef RE_ENABLE_I18N
 863   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 864      this function returns CAPITAL LETTER I instead of first byte of
 865      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 866      since peek_byte_case doesn't advance cur_idx in any way.  */
 867   if (pstr->offsets_needed && !isascii (ch))
 868     return re_string_peek_byte (pstr, idx);
 869 #endif
 870
 871   return ch;
 872 }
 873
 874 static unsigned char
 875 re_string_fetch_byte_case (re_string_t *pstr)
 876 {
 877   if (__glibc_likely (!pstr->mbs_allocated))
 878     return re_string_fetch_byte (pstr);
 879
 880 #ifdef RE_ENABLE_I18N
 881   if (pstr->offsets_needed)
 882     {
 883       Idx off;
 884       int ch;
 885
 886       /* For tr_TR.UTF-8 [[:islower:]] there is
 887          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 888          in that case the whole multi-byte character and return
 889          the original letter.  On the other side, with
 890          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 891          anything else would complicate things too much.  */
 892
 893       if (!re_string_first_byte (pstr, pstr->cur_idx))
 894         return re_string_fetch_byte (pstr);
 895
 896       off = pstr->offsets[pstr->cur_idx];
 897       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 898
 899       if (! isascii (ch))
 900         return re_string_fetch_byte (pstr);
 901
 902       re_string_skip_bytes (pstr,
 903                             re_string_char_size_at (pstr, pstr->cur_idx));
 904       return ch;
 905     }
 906 #endif
 907
 908   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 909 }
 910
 911 static void
 912 re_string_destruct (re_string_t *pstr)
 913 {
 914 #ifdef RE_ENABLE_I18N
 915   re_free (pstr->wcs);
 916   re_free (pstr->offsets);
 917 #endif /* RE_ENABLE_I18N  */
 918   if (pstr->mbs_allocated)
 919     re_free (pstr->mbs);
 920 }
 921
 922 /* Return the context at IDX in INPUT.  */
 923
 924 static unsigned int
 925 re_string_context_at (const re_string_t *input, Idx idx, int eflags)
 926 {
 927   int c;
 928   if (__glibc_unlikely (idx < 0))
 929     /* In this case, we use the value stored in input->tip_context,
 930        since we can't know the character in input->mbs[-1] here.  */
 931     return input->tip_context;
 932   if (__glibc_unlikely (idx == input->len))
 933     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 934             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 935 #ifdef RE_ENABLE_I18N
 936   if (input->mb_cur_max > 1)
 937     {
 938       wint_t wc;
 939       Idx wc_idx = idx;
 940       while(input->wcs[wc_idx] == WEOF)
 941         {
 942           DEBUG_ASSERT (wc_idx >= 0);
 943           --wc_idx;
 944           if (wc_idx < 0)
 945             return input->tip_context;
 946         }
 947       wc = input->wcs[wc_idx];
 948       if (__glibc_unlikely (input->word_ops_used != 0)
 949           && IS_WIDE_WORD_CHAR (wc))
 950         return CONTEXT_WORD;
 951       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 952               ? CONTEXT_NEWLINE : 0);
 953     }
 954   else
 955 #endif
 956     {
 957       c = re_string_byte_at (input, idx);
 958       if (bitset_contain (input->word_char, c))
 959         return CONTEXT_WORD;
 960       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 961     }
 962 }
 963 \f
 964 /* Functions for set operation.  */
 965
 966 static reg_errcode_t
 967 __attribute_warn_unused_result__
 968 re_node_set_alloc (re_node_set *set, Idx size)
 969 {
 970   set->alloc = size;
 971   set->nelem = 0;
 972   set->elems = re_malloc (Idx, size);
 973   if (__glibc_unlikely (set->elems == NULL)
 974       && (MALLOC_0_IS_NONNULL || size != 0))
 975     return REG_ESPACE;
 976   return REG_NOERROR;
 977 }
 978
 979 static reg_errcode_t
 980 __attribute_warn_unused_result__
 981 re_node_set_init_1 (re_node_set *set, Idx elem)
 982 {
 983   set->alloc = 1;
 984   set->nelem = 1;
 985   set->elems = re_malloc (Idx, 1);
 986   if (__glibc_unlikely (set->elems == NULL))
 987     {
 988       set->alloc = set->nelem = 0;
 989       return REG_ESPACE;
 990     }
 991   set->elems[0] = elem;
 992   return REG_NOERROR;
 993 }
 994
 995 static reg_errcode_t
 996 __attribute_warn_unused_result__
 997 re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
 998 {
 999   set->alloc = 2;
1000   set->elems = re_malloc (Idx, 2);
1001   if (__glibc_unlikely (set->elems == NULL))
1002     return REG_ESPACE;
1003   if (elem1 == elem2)
1004     {
1005       set->nelem = 1;
1006       set->elems[0] = elem1;
1007     }
1008   else
1009     {
1010       set->nelem = 2;
1011       if (elem1 < elem2)
1012         {
1013           set->elems[0] = elem1;
1014           set->elems[1] = elem2;
1015         }
1016       else
1017         {
1018           set->elems[0] = elem2;
1019           set->elems[1] = elem1;
1020         }
1021     }
1022   return REG_NOERROR;
1023 }
1024
1025 static reg_errcode_t
1026 __attribute_warn_unused_result__
1027 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1028 {
1029   dest->nelem = src->nelem;
1030   if (src->nelem > 0)
1031     {
1032       dest->alloc = dest->nelem;
1033       dest->elems = re_malloc (Idx, dest->alloc);
1034       if (__glibc_unlikely (dest->elems == NULL))
1035         {
1036           dest->alloc = dest->nelem = 0;
1037           return REG_ESPACE;
1038         }
1039       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1040     }
1041   else
1042     re_node_set_init_empty (dest);
1043   return REG_NOERROR;
1044 }
1045
1046 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1047    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1048    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1049
1050 static reg_errcode_t
1051 __attribute_warn_unused_result__
1052 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1053                            const re_node_set *src2)
1054 {
1055   Idx i1, i2, is, id, delta, sbase;
1056   if (src1->nelem == 0 || src2->nelem == 0)
1057     return REG_NOERROR;
1058
1059   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1060      conservative estimate.  */
1061   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1062     {
1063       Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1064       Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1065       if (__glibc_unlikely (new_elems == NULL))
1066         return REG_ESPACE;
1067       dest->elems = new_elems;
1068       dest->alloc = new_alloc;
1069     }
1070
1071   /* Find the items in the intersection of SRC1 and SRC2, and copy
1072      into the top of DEST those that are not already in DEST itself.  */
1073   sbase = dest->nelem + src1->nelem + src2->nelem;
1074   i1 = src1->nelem - 1;
1075   i2 = src2->nelem - 1;
1076   id = dest->nelem - 1;
1077   for (;;)
1078     {
1079       if (src1->elems[i1] == src2->elems[i2])
1080         {
1081           /* Try to find the item in DEST.  Maybe we could binary search?  */
1082           while (id >= 0 && dest->elems[id] > src1->elems[i1])
1083             --id;
1084
1085           if (id < 0 || dest->elems[id] != src1->elems[i1])
1086             dest->elems[--sbase] = src1->elems[i1];
1087
1088           if (--i1 < 0 || --i2 < 0)
1089             break;
1090         }
1091
1092       /* Lower the highest of the two items.  */
1093       else if (src1->elems[i1] < src2->elems[i2])
1094         {
1095           if (--i2 < 0)
1096             break;
1097         }
1098       else
1099         {
1100           if (--i1 < 0)
1101             break;
1102         }
1103     }
1104
1105   id = dest->nelem - 1;
1106   is = dest->nelem + src1->nelem + src2->nelem - 1;
1107   delta = is - sbase + 1;
1108
1109   /* Now copy.  When DELTA becomes zero, the remaining
1110      DEST elements are already in place; this is more or
1111      less the same loop that is in re_node_set_merge.  */
1112   dest->nelem += delta;
1113   if (delta > 0 && id >= 0)
1114     for (;;)
1115       {
1116         if (dest->elems[is] > dest->elems[id])
1117           {
1118             /* Copy from the top.  */
1119             dest->elems[id + delta--] = dest->elems[is--];
1120             if (delta == 0)
1121               break;
1122           }
1123         else
1124           {
1125             /* Slide from the bottom.  */
1126             dest->elems[id + delta] = dest->elems[id];
1127             if (--id < 0)
1128               break;
1129           }
1130       }
1131
1132   /* Copy remaining SRC elements.  */
1133   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1134
1135   return REG_NOERROR;
1136 }
1137
1138 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1139    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1140
1141 static reg_errcode_t
1142 __attribute_warn_unused_result__
1143 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1144                         const re_node_set *src2)
1145 {
1146   Idx i1, i2, id;
1147   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1148     {
1149       dest->alloc = src1->nelem + src2->nelem;
1150       dest->elems = re_malloc (Idx, dest->alloc);
1151       if (__glibc_unlikely (dest->elems == NULL))
1152         return REG_ESPACE;
1153     }
1154   else
1155     {
1156       if (src1 != NULL && src1->nelem > 0)
1157         return re_node_set_init_copy (dest, src1);
1158       else if (src2 != NULL && src2->nelem > 0)
1159         return re_node_set_init_copy (dest, src2);
1160       else
1161         re_node_set_init_empty (dest);
1162       return REG_NOERROR;
1163     }
1164   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1165     {
1166       if (src1->elems[i1] > src2->elems[i2])
1167         {
1168           dest->elems[id++] = src2->elems[i2++];
1169           continue;
1170         }
1171       if (src1->elems[i1] == src2->elems[i2])
1172         ++i2;
1173       dest->elems[id++] = src1->elems[i1++];
1174     }
1175   if (i1 < src1->nelem)
1176     {
1177       memcpy (dest->elems + id, src1->elems + i1,
1178              (src1->nelem - i1) * sizeof (Idx));
1179       id += src1->nelem - i1;
1180     }
1181   else if (i2 < src2->nelem)
1182     {
1183       memcpy (dest->elems + id, src2->elems + i2,
1184              (src2->nelem - i2) * sizeof (Idx));
1185       id += src2->nelem - i2;
1186     }
1187   dest->nelem = id;
1188   return REG_NOERROR;
1189 }
1190
1191 /* Calculate the union set of the sets DEST and SRC. And store it to
1192    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1193
1194 static reg_errcode_t
1195 __attribute_warn_unused_result__
1196 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1197 {
1198   Idx is, id, sbase, delta;
1199   if (src == NULL || src->nelem == 0)
1200     return REG_NOERROR;
1201   if (dest->alloc < 2 * src->nelem + dest->nelem)
1202     {
1203       Idx new_alloc = 2 * (src->nelem + dest->alloc);
1204       Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1205       if (__glibc_unlikely (new_buffer == NULL))
1206         return REG_ESPACE;
1207       dest->elems = new_buffer;
1208       dest->alloc = new_alloc;
1209     }
1210
1211   if (__glibc_unlikely (dest->nelem == 0))
1212     {
1213       dest->nelem = src->nelem;
1214       memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1215       return REG_NOERROR;
1216     }
1217
1218   /* Copy into the top of DEST the items of SRC that are not
1219      found in DEST.  Maybe we could binary search in DEST?  */
1220   for (sbase = dest->nelem + 2 * src->nelem,
1221        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1222     {
1223       if (dest->elems[id] == src->elems[is])
1224         is--, id--;
1225       else if (dest->elems[id] < src->elems[is])
1226         dest->elems[--sbase] = src->elems[is--];
1227       else /* if (dest->elems[id] > src->elems[is]) */
1228         --id;
1229     }
1230
1231   if (is >= 0)
1232     {
1233       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1234       sbase -= is + 1;
1235       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1236     }
1237
1238   id = dest->nelem - 1;
1239   is = dest->nelem + 2 * src->nelem - 1;
1240   delta = is - sbase + 1;
1241   if (delta == 0)
1242     return REG_NOERROR;
1243
1244   /* Now copy.  When DELTA becomes zero, the remaining
1245      DEST elements are already in place.  */
1246   dest->nelem += delta;
1247   for (;;)
1248     {
1249       if (dest->elems[is] > dest->elems[id])
1250         {
1251           /* Copy from the top.  */
1252           dest->elems[id + delta--] = dest->elems[is--];
1253           if (delta == 0)
1254             break;
1255         }
1256       else
1257         {
1258           /* Slide from the bottom.  */
1259           dest->elems[id + delta] = dest->elems[id];
1260           if (--id < 0)
1261             {
1262               /* Copy remaining SRC elements.  */
1263               memcpy (dest->elems, dest->elems + sbase,
1264                       delta * sizeof (Idx));
1265               break;
1266             }
1267         }
1268     }
1269
1270   return REG_NOERROR;
1271 }
1272
1273 /* Insert the new element ELEM to the re_node_set* SET.
1274    SET should not already have ELEM.
1275    Return true if successful.  */
1276
1277 static bool
1278 __attribute_warn_unused_result__
1279 re_node_set_insert (re_node_set *set, Idx elem)
1280 {
1281   Idx idx;
1282   /* In case the set is empty.  */
1283   if (set->alloc == 0)
1284     return __glibc_likely (re_node_set_init_1 (set, elem) == REG_NOERROR);
1285
1286   if (__glibc_unlikely (set->nelem) == 0)
1287     {
1288       /* We already guaranteed above that set->alloc != 0.  */
1289       set->elems[0] = elem;
1290       ++set->nelem;
1291       return true;
1292     }
1293
1294   /* Realloc if we need.  */
1295   if (set->alloc == set->nelem)
1296     {
1297       Idx *new_elems;
1298       set->alloc = set->alloc * 2;
1299       new_elems = re_realloc (set->elems, Idx, set->alloc);
1300       if (__glibc_unlikely (new_elems == NULL))
1301         return false;
1302       set->elems = new_elems;
1303     }
1304
1305   /* Move the elements which follows the new element.  Test the
1306      first element separately to skip a check in the inner loop.  */
1307   if (elem < set->elems[0])
1308     {
1309       for (idx = set->nelem; idx > 0; idx--)
1310         set->elems[idx] = set->elems[idx - 1];
1311     }
1312   else
1313     {
1314       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1315         set->elems[idx] = set->elems[idx - 1];
1316     }
1317
1318   /* Insert the new element.  */
1319   set->elems[idx] = elem;
1320   ++set->nelem;
1321   return true;
1322 }
1323
1324 /* Insert the new element ELEM to the re_node_set* SET.
1325    SET should not already have any element greater than or equal to ELEM.
1326    Return true if successful.  */
1327
1328 static bool
1329 __attribute_warn_unused_result__
1330 re_node_set_insert_last (re_node_set *set, Idx elem)
1331 {
1332   /* Realloc if we need.  */
1333   if (set->alloc == set->nelem)
1334     {
1335       Idx *new_elems;
1336       set->alloc = (set->alloc + 1) * 2;
1337       new_elems = re_realloc (set->elems, Idx, set->alloc);
1338       if (__glibc_unlikely (new_elems == NULL))
1339         return false;
1340       set->elems = new_elems;
1341     }
1342
1343   /* Insert the new element.  */
1344   set->elems[set->nelem++] = elem;
1345   return true;
1346 }
1347
1348 /* Compare two node sets SET1 and SET2.
1349    Return true if SET1 and SET2 are equivalent.  */
1350
1351 static bool
1352 __attribute__ ((pure))
1353 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1354 {
1355   Idx i;
1356   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1357     return false;
1358   for (i = set1->nelem ; --i >= 0 ; )
1359     if (set1->elems[i] != set2->elems[i])
1360       return false;
1361   return true;
1362 }
1363
1364 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1365
1366 static Idx
1367 __attribute__ ((pure))
1368 re_node_set_contains (const re_node_set *set, Idx elem)
1369 {
1370   __re_size_t idx, right, mid;
1371   if (set->nelem <= 0)
1372     return 0;
1373
1374   /* Binary search the element.  */
1375   idx = 0;
1376   right = set->nelem - 1;
1377   while (idx < right)
1378     {
1379       mid = (idx + right) / 2;
1380       if (set->elems[mid] < elem)
1381         idx = mid + 1;
1382       else
1383         right = mid;
1384     }
1385   return set->elems[idx] == elem ? idx + 1 : 0;
1386 }
1387
1388 static void
1389 re_node_set_remove_at (re_node_set *set, Idx idx)
1390 {
1391   if (idx < 0 || idx >= set->nelem)
1392     return;
1393   --set->nelem;
1394   for (; idx < set->nelem; idx++)
1395     set->elems[idx] = set->elems[idx + 1];
1396 }
1397 \f
1398
1399 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1400    Or return -1 if an error occurred.  */
1401
1402 static Idx
1403 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1404 {
1405   if (__glibc_unlikely (dfa->nodes_len >= dfa->nodes_alloc))
1406     {
1407       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1408       Idx *new_nexts, *new_indices;
1409       re_node_set *new_edests, *new_eclosures;
1410       re_token_t *new_nodes;
1411
1412       /* Avoid overflows in realloc.  */
1413       const size_t max_object_size = MAX (sizeof (re_token_t),
1414                                           MAX (sizeof (re_node_set),
1415                                                sizeof (Idx)));
1416       if (__glibc_unlikely (MIN (IDX_MAX, SIZE_MAX / max_object_size)
1417                             < new_nodes_alloc))
1418         return -1;
1419
1420       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1421       if (__glibc_unlikely (new_nodes == NULL))
1422         return -1;
1423       dfa->nodes = new_nodes;
1424       new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1425       new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1426       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1427       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1428       if (__glibc_unlikely (new_nexts == NULL || new_indices == NULL
1429                             || new_edests == NULL || new_eclosures == NULL))
1430         {
1431            re_free (new_nexts);
1432            re_free (new_indices);
1433            re_free (new_edests);
1434            re_free (new_eclosures);
1435            return -1;
1436         }
1437       dfa->nexts = new_nexts;
1438       dfa->org_indices = new_indices;
1439       dfa->edests = new_edests;
1440       dfa->eclosures = new_eclosures;
1441       dfa->nodes_alloc = new_nodes_alloc;
1442     }
1443   dfa->nodes[dfa->nodes_len] = token;
1444   dfa->nodes[dfa->nodes_len].constraint = 0;
1445 #ifdef RE_ENABLE_I18N
1446   dfa->nodes[dfa->nodes_len].accept_mb =
1447     ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1448      || token.type == COMPLEX_BRACKET);
1449 #endif
1450   dfa->nexts[dfa->nodes_len] = -1;
1451   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1452   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1453   return dfa->nodes_len++;
1454 }
1455
1456 static re_hashval_t
1457 calc_state_hash (const re_node_set *nodes, unsigned int context)
1458 {
1459   re_hashval_t hash = nodes->nelem + context;
1460   Idx i;
1461   for (i = 0 ; i < nodes->nelem ; i++)
1462     hash += nodes->elems[i];
1463   return hash;
1464 }
1465
1466 /* Search for the state whose node_set is equivalent to NODES.
1467    Return the pointer to the state, if we found it in the DFA.
1468    Otherwise create the new one and return it.  In case of an error
1469    return NULL and set the error code in ERR.
1470    Note: - We assume NULL as the invalid state, then it is possible that
1471            return value is NULL and ERR is REG_NOERROR.
1472          - We never return non-NULL value in case of any errors, it is for
1473            optimization.  */
1474
1475 static re_dfastate_t *
1476 __attribute_warn_unused_result__
1477 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1478                   const re_node_set *nodes)
1479 {
1480   re_hashval_t hash;
1481   re_dfastate_t *new_state;
1482   struct re_state_table_entry *spot;
1483   Idx i;
1484 #if defined GCC_LINT || defined lint
1485   /* Suppress bogus uninitialized-variable warnings.  */
1486   *err = REG_NOERROR;
1487 #endif
1488   if (__glibc_unlikely (nodes->nelem == 0))
1489     {
1490       *err = REG_NOERROR;
1491       return NULL;
1492     }
1493   hash = calc_state_hash (nodes, 0);
1494   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1495
1496   for (i = 0 ; i < spot->num ; i++)
1497     {
1498       re_dfastate_t *state = spot->array[i];
1499       if (hash != state->hash)
1500         continue;
1501       if (re_node_set_compare (&state->nodes, nodes))
1502         return state;
1503     }
1504
1505   /* There are no appropriate state in the dfa, create the new one.  */
1506   new_state = create_ci_newstate (dfa, nodes, hash);
1507   if (__glibc_unlikely (new_state == NULL))
1508     *err = REG_ESPACE;
1509
1510   return new_state;
1511 }
1512
1513 /* Search for the state whose node_set is equivalent to NODES and
1514    whose context is equivalent to CONTEXT.
1515    Return the pointer to the state, if we found it in the DFA.
1516    Otherwise create the new one and return it.  In case of an error
1517    return NULL and set the error code in ERR.
1518    Note: - We assume NULL as the invalid state, then it is possible that
1519            return value is NULL and ERR is REG_NOERROR.
1520          - We never return non-NULL value in case of any errors, it is for
1521            optimization.  */
1522
1523 static re_dfastate_t *
1524 __attribute_warn_unused_result__
1525 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1526                           const re_node_set *nodes, unsigned int context)
1527 {
1528   re_hashval_t hash;
1529   re_dfastate_t *new_state;
1530   struct re_state_table_entry *spot;
1531   Idx i;
1532 #if defined GCC_LINT || defined lint
1533   /* Suppress bogus uninitialized-variable warnings.  */
1534   *err = REG_NOERROR;
1535 #endif
1536   if (nodes->nelem == 0)
1537     {
1538       *err = REG_NOERROR;
1539       return NULL;
1540     }
1541   hash = calc_state_hash (nodes, context);
1542   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1543
1544   for (i = 0 ; i < spot->num ; i++)
1545     {
1546       re_dfastate_t *state = spot->array[i];
1547       if (state->hash == hash
1548           && state->context == context
1549           && re_node_set_compare (state->entrance_nodes, nodes))
1550         return state;
1551     }
1552   /* There are no appropriate state in 'dfa', create the new one.  */
1553   new_state = create_cd_newstate (dfa, nodes, context, hash);
1554   if (__glibc_unlikely (new_state == NULL))
1555     *err = REG_ESPACE;
1556
1557   return new_state;
1558 }
1559
1560 /* Finish initialization of the new state NEWSTATE, and using its hash value
1561    HASH put in the appropriate bucket of DFA's state table.  Return value
1562    indicates the error code if failed.  */
1563
1564 static reg_errcode_t
1565 __attribute_warn_unused_result__
1566 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1567                 re_hashval_t hash)
1568 {
1569   struct re_state_table_entry *spot;
1570   reg_errcode_t err;
1571   Idx i;
1572
1573   newstate->hash = hash;
1574   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1575   if (__glibc_unlikely (err != REG_NOERROR))
1576     return REG_ESPACE;
1577   for (i = 0; i < newstate->nodes.nelem; i++)
1578     {
1579       Idx elem = newstate->nodes.elems[i];
1580       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1581         if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1582           return REG_ESPACE;
1583     }
1584
1585   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1586   if (__glibc_unlikely (spot->alloc <= spot->num))
1587     {
1588       Idx new_alloc = 2 * spot->num + 2;
1589       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1590                                               new_alloc);
1591       if (__glibc_unlikely (new_array == NULL))
1592         return REG_ESPACE;
1593       spot->array = new_array;
1594       spot->alloc = new_alloc;
1595     }
1596   spot->array[spot->num++] = newstate;
1597   return REG_NOERROR;
1598 }
1599
1600 static void
1601 free_state (re_dfastate_t *state)
1602 {
1603   re_node_set_free (&state->non_eps_nodes);
1604   re_node_set_free (&state->inveclosure);
1605   if (state->entrance_nodes != &state->nodes)
1606     {
1607       re_node_set_free (state->entrance_nodes);
1608       re_free (state->entrance_nodes);
1609     }
1610   re_node_set_free (&state->nodes);
1611   re_free (state->word_trtable);
1612   re_free (state->trtable);
1613   re_free (state);
1614 }
1615
1616 /* Create the new state which is independent of contexts.
1617    Return the new state if succeeded, otherwise return NULL.  */
1618
1619 static re_dfastate_t *
1620 __attribute_warn_unused_result__
1621 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1622                     re_hashval_t hash)
1623 {
1624   Idx i;
1625   reg_errcode_t err;
1626   re_dfastate_t *newstate;
1627
1628   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1629   if (__glibc_unlikely (newstate == NULL))
1630     return NULL;
1631   err = re_node_set_init_copy (&newstate->nodes, nodes);
1632   if (__glibc_unlikely (err != REG_NOERROR))
1633     {
1634       re_free (newstate);
1635       return NULL;
1636     }
1637
1638   newstate->entrance_nodes = &newstate->nodes;
1639   for (i = 0 ; i < nodes->nelem ; i++)
1640     {
1641       re_token_t *node = dfa->nodes + nodes->elems[i];
1642       re_token_type_t type = node->type;
1643       if (type == CHARACTER && !node->constraint)
1644         continue;
1645 #ifdef RE_ENABLE_I18N
1646       newstate->accept_mb |= node->accept_mb;
1647 #endif /* RE_ENABLE_I18N */
1648
1649       /* If the state has the halt node, the state is a halt state.  */
1650       if (type == END_OF_RE)
1651         newstate->halt = 1;
1652       else if (type == OP_BACK_REF)
1653         newstate->has_backref = 1;
1654       else if (type == ANCHOR || node->constraint)
1655         newstate->has_constraint = 1;
1656     }
1657   err = register_state (dfa, newstate, hash);
1658   if (__glibc_unlikely (err != REG_NOERROR))
1659     {
1660       free_state (newstate);
1661       newstate = NULL;
1662     }
1663   return newstate;
1664 }
1665
1666 /* Create the new state which is depend on the context CONTEXT.
1667    Return the new state if succeeded, otherwise return NULL.  */
1668
1669 static re_dfastate_t *
1670 __attribute_warn_unused_result__
1671 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1672                     unsigned int context, re_hashval_t hash)
1673 {
1674   Idx i, nctx_nodes = 0;
1675   reg_errcode_t err;
1676   re_dfastate_t *newstate;
1677
1678   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1679   if (__glibc_unlikely (newstate == NULL))
1680     return NULL;
1681   err = re_node_set_init_copy (&newstate->nodes, nodes);
1682   if (__glibc_unlikely (err != REG_NOERROR))
1683     {
1684       re_free (newstate);
1685       return NULL;
1686     }
1687
1688   newstate->context = context;
1689   newstate->entrance_nodes = &newstate->nodes;
1690
1691   for (i = 0 ; i < nodes->nelem ; i++)
1692     {
1693       re_token_t *node = dfa->nodes + nodes->elems[i];
1694       re_token_type_t type = node->type;
1695       unsigned int constraint = node->constraint;
1696
1697       if (type == CHARACTER && !constraint)
1698         continue;
1699 #ifdef RE_ENABLE_I18N
1700       newstate->accept_mb |= node->accept_mb;
1701 #endif /* RE_ENABLE_I18N */
1702
1703       /* If the state has the halt node, the state is a halt state.  */
1704       if (type == END_OF_RE)
1705         newstate->halt = 1;
1706       else if (type == OP_BACK_REF)
1707         newstate->has_backref = 1;
1708
1709       if (constraint)
1710         {
1711           if (newstate->entrance_nodes == &newstate->nodes)
1712             {
1713               re_node_set *entrance_nodes = re_malloc (re_node_set, 1);
1714               if (__glibc_unlikely (entrance_nodes == NULL))
1715                 {
1716                   free_state (newstate);
1717                   return NULL;
1718                 }
1719               newstate->entrance_nodes = entrance_nodes;
1720               if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1721                   != REG_NOERROR)
1722                 {
1723                   free_state (newstate);
1724                   return NULL;
1725                 }
1726               nctx_nodes = 0;
1727               newstate->has_constraint = 1;
1728             }
1729
1730           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1731             {
1732               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1733               ++nctx_nodes;
1734             }
1735         }
1736     }
1737   err = register_state (dfa, newstate, hash);
1738   if (__glibc_unlikely (err != REG_NOERROR))
1739     {
1740       free_state (newstate);
1741       newstate = NULL;
1742     }
1743   return  newstate;
1744 }