posix/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002-2006, 2010 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static void re_string_construct_common (const char *str, int len,
  22                                         re_string_t *pstr,
  23                                         RE_TRANSLATE_TYPE trans, int icase,
  24                                         const re_dfa_t *dfa) internal_function;
  25 static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  26                                           const re_node_set *nodes,
  27                                           unsigned int hash) internal_function;
  28 static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  29                                           const re_node_set *nodes,
  30                                           unsigned int context,
  31                                           unsigned int hash) internal_function;
  32 \f
  33 /* Functions for string operation.  */
  34
  35 /* This function allocate the buffers.  It is necessary to call
  36    re_string_reconstruct before using the object.  */
  37
  38 static reg_errcode_t
  39 internal_function __attribute_warn_unused_result__
  40 re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len,
  41                     RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  42 {
  43   reg_errcode_t ret;
  44   int init_buf_len;
  45
  46   /* Ensure at least one character fits into the buffers.  */
  47   if (init_len < dfa->mb_cur_max)
  48     init_len = dfa->mb_cur_max;
  49   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  50   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  51
  52   ret = re_string_realloc_buffers (pstr, init_buf_len);
  53   if (BE (ret != REG_NOERROR, 0))
  54     return ret;
  55
  56   pstr->word_char = dfa->word_char;
  57   pstr->word_ops_used = dfa->word_ops_used;
  58   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  59   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  60   pstr->valid_raw_len = pstr->valid_len;
  61   return REG_NOERROR;
  62 }
  63
  64 /* This function allocate the buffers, and initialize them.  */
  65
  66 static reg_errcode_t
  67 internal_function __attribute_warn_unused_result__
  68 re_string_construct (re_string_t *pstr, const char *str, int len,
  69                      RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
  70 {
  71   reg_errcode_t ret;
  72   memset (pstr, '\0', sizeof (re_string_t));
  73   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  74
  75   if (len > 0)
  76     {
  77       ret = re_string_realloc_buffers (pstr, len + 1);
  78       if (BE (ret != REG_NOERROR, 0))
  79         return ret;
  80     }
  81   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  82
  83   if (icase)
  84     {
  85 #ifdef RE_ENABLE_I18N
  86       if (dfa->mb_cur_max > 1)
  87         {
  88           while (1)
  89             {
  90               ret = build_wcs_upper_buffer (pstr);
  91               if (BE (ret != REG_NOERROR, 0))
  92                 return ret;
  93               if (pstr->valid_raw_len >= len)
  94                 break;
  95               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
  96                 break;
  97               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
  98               if (BE (ret != REG_NOERROR, 0))
  99                 return ret;
 100             }
 101         }
 102       else
 103 #endif /* RE_ENABLE_I18N  */
 104         build_upper_buffer (pstr);
 105     }
 106   else
 107     {
 108 #ifdef RE_ENABLE_I18N
 109       if (dfa->mb_cur_max > 1)
 110         build_wcs_buffer (pstr);
 111       else
 112 #endif /* RE_ENABLE_I18N  */
 113         {
 114           if (trans != NULL)
 115             re_string_translate_buffer (pstr);
 116           else
 117             {
 118               pstr->valid_len = pstr->bufs_len;
 119               pstr->valid_raw_len = pstr->bufs_len;
 120             }
 121         }
 122     }
 123
 124   return REG_NOERROR;
 125 }
 126
 127 /* Helper functions for re_string_allocate, and re_string_construct.  */
 128
 129 static reg_errcode_t
 130 internal_function __attribute_warn_unused_result__
 131 re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
 132 {
 133 #ifdef RE_ENABLE_I18N
 134   if (pstr->mb_cur_max > 1)
 135     {
 136       wint_t *new_wcs;
 137
 138       /* Avoid overflow in realloc.  */
 139       const size_t max_object_size = MAX (sizeof (wint_t), sizeof (int));
 140       if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
 141         return REG_ESPACE;
 142
 143       new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
 144       if (BE (new_wcs == NULL, 0))
 145         return REG_ESPACE;
 146       pstr->wcs = new_wcs;
 147       if (pstr->offsets != NULL)
 148         {
 149           int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
 150           if (BE (new_offsets == NULL, 0))
 151             return REG_ESPACE;
 152           pstr->offsets = new_offsets;
 153         }
 154     }
 155 #endif /* RE_ENABLE_I18N  */
 156   if (pstr->mbs_allocated)
 157     {
 158       unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
 159                                            new_buf_len);
 160       if (BE (new_mbs == NULL, 0))
 161         return REG_ESPACE;
 162       pstr->mbs = new_mbs;
 163     }
 164   pstr->bufs_len = new_buf_len;
 165   return REG_NOERROR;
 166 }
 167
 168
 169 static void
 170 internal_function
 171 re_string_construct_common (const char *str, int len, re_string_t *pstr,
 172                             RE_TRANSLATE_TYPE trans, int icase,
 173                             const re_dfa_t *dfa)
 174 {
 175   pstr->raw_mbs = (const unsigned char *) str;
 176   pstr->len = len;
 177   pstr->raw_len = len;
 178   pstr->trans = trans;
 179   pstr->icase = icase ? 1 : 0;
 180   pstr->mbs_allocated = (trans != NULL || icase);
 181   pstr->mb_cur_max = dfa->mb_cur_max;
 182   pstr->is_utf8 = dfa->is_utf8;
 183   pstr->map_notascii = dfa->map_notascii;
 184   pstr->stop = pstr->len;
 185   pstr->raw_stop = pstr->stop;
 186 }
 187
 188 #ifdef RE_ENABLE_I18N
 189
 190 /* Build wide character buffer PSTR->WCS.
 191    If the byte sequence of the string are:
 192      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 193    Then wide character buffer will be:
 194      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 195    We use WEOF for padding, they indicate that the position isn't
 196    a first byte of a multibyte character.
 197
 198    Note that this function assumes PSTR->VALID_LEN elements are already
 199    built and starts from PSTR->VALID_LEN.  */
 200
 201 static void
 202 internal_function
 203 build_wcs_buffer (re_string_t *pstr)
 204 {
 205 #ifdef _LIBC
 206   unsigned char buf[MB_LEN_MAX];
 207   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 208 #else
 209   unsigned char buf[64];
 210 #endif
 211   mbstate_t prev_st;
 212   int byte_idx, end_idx, remain_len;
 213   size_t mbclen;
 214
 215   /* Build the buffers from pstr->valid_len to either pstr->len or
 216      pstr->bufs_len.  */
 217   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 218   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 219     {
 220       wchar_t wc;
 221       const char *p;
 222
 223       remain_len = end_idx - byte_idx;
 224       prev_st = pstr->cur_state;
 225       /* Apply the translation if we need.  */
 226       if (BE (pstr->trans != NULL, 0))
 227         {
 228           int i, ch;
 229
 230           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 231             {
 232               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 233               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 234             }
 235           p = (const char *) buf;
 236         }
 237       else
 238         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 239       mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 240       if (BE (mbclen == (size_t) -2, 0))
 241         {
 242           /* The buffer doesn't have enough space, finish to build.  */
 243           pstr->cur_state = prev_st;
 244           break;
 245         }
 246       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 247         {
 248           /* We treat these cases as a singlebyte character.  */
 249           mbclen = 1;
 250           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 251           if (BE (pstr->trans != NULL, 0))
 252             wc = pstr->trans[wc];
 253           pstr->cur_state = prev_st;
 254         }
 255
 256       /* Write wide character and padding.  */
 257       pstr->wcs[byte_idx++] = wc;
 258       /* Write paddings.  */
 259       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 260         pstr->wcs[byte_idx++] = WEOF;
 261     }
 262   pstr->valid_len = byte_idx;
 263   pstr->valid_raw_len = byte_idx;
 264 }
 265
 266 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 267    but for REG_ICASE.  */
 268
 269 static reg_errcode_t
 270 internal_function __attribute_warn_unused_result__
 271 build_wcs_upper_buffer (re_string_t *pstr)
 272 {
 273   mbstate_t prev_st;
 274   int src_idx, byte_idx, end_idx, remain_len;
 275   size_t mbclen;
 276 #ifdef _LIBC
 277   char buf[MB_LEN_MAX];
 278   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 279 #else
 280   char buf[64];
 281 #endif
 282
 283   byte_idx = pstr->valid_len;
 284   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 285
 286   /* The following optimization assumes that ASCII characters can be
 287      mapped to wide characters with a simple cast.  */
 288   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 289     {
 290       while (byte_idx < end_idx)
 291         {
 292           wchar_t wc;
 293
 294           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 295               && mbsinit (&pstr->cur_state))
 296             {
 297               /* In case of a singlebyte character.  */
 298               pstr->mbs[byte_idx]
 299                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 300               /* The next step uses the assumption that wchar_t is encoded
 301                  ASCII-safe: all ASCII values can be converted like this.  */
 302               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 303               ++byte_idx;
 304               continue;
 305             }
 306
 307           remain_len = end_idx - byte_idx;
 308           prev_st = pstr->cur_state;
 309           mbclen = __mbrtowc (&wc,
 310                               ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 311                                + byte_idx), remain_len, &pstr->cur_state);
 312           if (BE (mbclen + 2 > 2, 1))
 313             {
 314               wchar_t wcu = wc;
 315               if (iswlower (wc))
 316                 {
 317                   size_t mbcdlen;
 318
 319                   wcu = towupper (wc);
 320                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 321                   if (BE (mbclen == mbcdlen, 1))
 322                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 323                   else
 324                     {
 325                       src_idx = byte_idx;
 326                       goto offsets_needed;
 327                     }
 328                 }
 329               else
 330                 memcpy (pstr->mbs + byte_idx,
 331                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 332               pstr->wcs[byte_idx++] = wcu;
 333               /* Write paddings.  */
 334               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 335                 pstr->wcs[byte_idx++] = WEOF;
 336             }
 337           else if (mbclen == (size_t) -1 || mbclen == 0)
 338             {
 339               /* It is an invalid character or '\0'.  Just use the byte.  */
 340               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 341               pstr->mbs[byte_idx] = ch;
 342               /* And also cast it to wide char.  */
 343               pstr->wcs[byte_idx++] = (wchar_t) ch;
 344               if (BE (mbclen == (size_t) -1, 0))
 345                 pstr->cur_state = prev_st;
 346             }
 347           else
 348             {
 349               /* The buffer doesn't have enough space, finish to build.  */
 350               pstr->cur_state = prev_st;
 351               break;
 352             }
 353         }
 354       pstr->valid_len = byte_idx;
 355       pstr->valid_raw_len = byte_idx;
 356       return REG_NOERROR;
 357     }
 358   else
 359     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 360       {
 361         wchar_t wc;
 362         const char *p;
 363       offsets_needed:
 364         remain_len = end_idx - byte_idx;
 365         prev_st = pstr->cur_state;
 366         if (BE (pstr->trans != NULL, 0))
 367           {
 368             int i, ch;
 369
 370             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 371               {
 372                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 373                 buf[i] = pstr->trans[ch];
 374               }
 375             p = (const char *) buf;
 376           }
 377         else
 378           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 379         mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 380         if (BE (mbclen + 2 > 2, 1))
 381           {
 382             wchar_t wcu = wc;
 383             if (iswlower (wc))
 384               {
 385                 size_t mbcdlen;
 386
 387                 wcu = towupper (wc);
 388                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 389                 if (BE (mbclen == mbcdlen, 1))
 390                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 391                 else if (mbcdlen != (size_t) -1)
 392                   {
 393                     size_t i;
 394
 395                     if (byte_idx + mbcdlen > pstr->bufs_len)
 396                       {
 397                         pstr->cur_state = prev_st;
 398                         break;
 399                       }
 400
 401                     if (pstr->offsets == NULL)
 402                       {
 403                         pstr->offsets = re_malloc (int, pstr->bufs_len);
 404
 405                         if (pstr->offsets == NULL)
 406                           return REG_ESPACE;
 407                       }
 408                     if (!pstr->offsets_needed)
 409                       {
 410                         for (i = 0; i < (size_t) byte_idx; ++i)
 411                           pstr->offsets[i] = i;
 412                         pstr->offsets_needed = 1;
 413                       }
 414
 415                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 416                     pstr->wcs[byte_idx] = wcu;
 417                     pstr->offsets[byte_idx] = src_idx;
 418                     for (i = 1; i < mbcdlen; ++i)
 419                       {
 420                         pstr->offsets[byte_idx + i]
 421                           = src_idx + (i < mbclen ? i : mbclen - 1);
 422                         pstr->wcs[byte_idx + i] = WEOF;
 423                       }
 424                     pstr->len += mbcdlen - mbclen;
 425                     if (pstr->raw_stop > src_idx)
 426                       pstr->stop += mbcdlen - mbclen;
 427                     end_idx = (pstr->bufs_len > pstr->len)
 428                               ? pstr->len : pstr->bufs_len;
 429                     byte_idx += mbcdlen;
 430                     src_idx += mbclen;
 431                     continue;
 432                   }
 433                 else
 434                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 435               }
 436             else
 437               memcpy (pstr->mbs + byte_idx, p, mbclen);
 438
 439             if (BE (pstr->offsets_needed != 0, 0))
 440               {
 441                 size_t i;
 442                 for (i = 0; i < mbclen; ++i)
 443                   pstr->offsets[byte_idx + i] = src_idx + i;
 444               }
 445             src_idx += mbclen;
 446
 447             pstr->wcs[byte_idx++] = wcu;
 448             /* Write paddings.  */
 449             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 450               pstr->wcs[byte_idx++] = WEOF;
 451           }
 452         else if (mbclen == (size_t) -1 || mbclen == 0)
 453           {
 454             /* It is an invalid character or '\0'.  Just use the byte.  */
 455             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 456
 457             if (BE (pstr->trans != NULL, 0))
 458               ch = pstr->trans [ch];
 459             pstr->mbs[byte_idx] = ch;
 460
 461             if (BE (pstr->offsets_needed != 0, 0))
 462               pstr->offsets[byte_idx] = src_idx;
 463             ++src_idx;
 464
 465             /* And also cast it to wide char.  */
 466             pstr->wcs[byte_idx++] = (wchar_t) ch;
 467             if (BE (mbclen == (size_t) -1, 0))
 468               pstr->cur_state = prev_st;
 469           }
 470         else
 471           {
 472             /* The buffer doesn't have enough space, finish to build.  */
 473             pstr->cur_state = prev_st;
 474             break;
 475           }
 476       }
 477   pstr->valid_len = byte_idx;
 478   pstr->valid_raw_len = src_idx;
 479   return REG_NOERROR;
 480 }
 481
 482 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 483    Return the index.  */
 484
 485 static int
 486 internal_function
 487 re_string_skip_chars (re_string_t *pstr, int new_raw_idx, wint_t *last_wc)
 488 {
 489   mbstate_t prev_st;
 490   int rawbuf_idx;
 491   size_t mbclen;
 492   wint_t wc = WEOF;
 493
 494   /* Skip the characters which are not necessary to check.  */
 495   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 496        rawbuf_idx < new_raw_idx;)
 497     {
 498       wchar_t wc2;
 499       int remain_len = pstr->len - rawbuf_idx;
 500       prev_st = pstr->cur_state;
 501       mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
 502                           remain_len, &pstr->cur_state);
 503       if (BE ((ssize_t) mbclen <= 0, 0))
 504         {
 505           /* We treat these cases as a single byte character.  */
 506           if (mbclen == 0 || remain_len == 0)
 507             wc = L'\0';
 508           else
 509             wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
 510           mbclen = 1;
 511           pstr->cur_state = prev_st;
 512         }
 513       else
 514         wc = (wint_t) wc2;
 515       /* Then proceed the next character.  */
 516       rawbuf_idx += mbclen;
 517     }
 518   *last_wc = wc;
 519   return rawbuf_idx;
 520 }
 521 #endif /* RE_ENABLE_I18N  */
 522
 523 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 524    This function is used in case of REG_ICASE.  */
 525
 526 static void
 527 internal_function
 528 build_upper_buffer (re_string_t *pstr)
 529 {
 530   int char_idx, end_idx;
 531   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 532
 533   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 534     {
 535       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 536       if (BE (pstr->trans != NULL, 0))
 537         ch = pstr->trans[ch];
 538       if (islower (ch))
 539         pstr->mbs[char_idx] = toupper (ch);
 540       else
 541         pstr->mbs[char_idx] = ch;
 542     }
 543   pstr->valid_len = char_idx;
 544   pstr->valid_raw_len = char_idx;
 545 }
 546
 547 /* Apply TRANS to the buffer in PSTR.  */
 548
 549 static void
 550 internal_function
 551 re_string_translate_buffer (re_string_t *pstr)
 552 {
 553   int buf_idx, end_idx;
 554   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 555
 556   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 557     {
 558       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 559       pstr->mbs[buf_idx] = pstr->trans[ch];
 560     }
 561
 562   pstr->valid_len = buf_idx;
 563   pstr->valid_raw_len = buf_idx;
 564 }
 565
 566 /* This function re-construct the buffers.
 567    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 568    convert to upper case in case of REG_ICASE, apply translation.  */
 569
 570 static reg_errcode_t
 571 internal_function __attribute_warn_unused_result__
 572 re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
 573 {
 574   int offset = idx - pstr->raw_mbs_idx;
 575   if (BE (offset < 0, 0))
 576     {
 577       /* Reset buffer.  */
 578 #ifdef RE_ENABLE_I18N
 579       if (pstr->mb_cur_max > 1)
 580         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 581 #endif /* RE_ENABLE_I18N */
 582       pstr->len = pstr->raw_len;
 583       pstr->stop = pstr->raw_stop;
 584       pstr->valid_len = 0;
 585       pstr->raw_mbs_idx = 0;
 586       pstr->valid_raw_len = 0;
 587       pstr->offsets_needed = 0;
 588       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 589                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 590       if (!pstr->mbs_allocated)
 591         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 592       offset = idx;
 593     }
 594
 595   if (BE (offset != 0, 1))
 596     {
 597       /* Should the already checked characters be kept?  */
 598       if (BE (offset < pstr->valid_raw_len, 1))
 599         {
 600           /* Yes, move them to the front of the buffer.  */
 601 #ifdef RE_ENABLE_I18N
 602           if (BE (pstr->offsets_needed, 0))
 603             {
 604               int low = 0, high = pstr->valid_len, mid;
 605               do
 606                 {
 607                   mid = (high + low) / 2;
 608                   if (pstr->offsets[mid] > offset)
 609                     high = mid;
 610                   else if (pstr->offsets[mid] < offset)
 611                     low = mid + 1;
 612                   else
 613                     break;
 614                 }
 615               while (low < high);
 616               if (pstr->offsets[mid] < offset)
 617                 ++mid;
 618               pstr->tip_context = re_string_context_at (pstr, mid - 1,
 619                                                         eflags);
 620               /* This can be quite complicated, so handle specially
 621                  only the common and easy case where the character with
 622                  different length representation of lower and upper
 623                  case is present at or after offset.  */
 624               if (pstr->valid_len > offset
 625                   && mid == offset && pstr->offsets[mid] == offset)
 626                 {
 627                   memmove (pstr->wcs, pstr->wcs + offset,
 628                            (pstr->valid_len - offset) * sizeof (wint_t));
 629                   memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
 630                   pstr->valid_len -= offset;
 631                   pstr->valid_raw_len -= offset;
 632                   for (low = 0; low < pstr->valid_len; low++)
 633                     pstr->offsets[low] = pstr->offsets[low + offset] - offset;
 634                 }
 635               else
 636                 {
 637                   /* Otherwise, just find out how long the partial multibyte
 638                      character at offset is and fill it with WEOF/255.  */
 639                   pstr->len = pstr->raw_len - idx + offset;
 640                   pstr->stop = pstr->raw_stop - idx + offset;
 641                   pstr->offsets_needed = 0;
 642                   while (mid > 0 && pstr->offsets[mid - 1] == offset)
 643                     --mid;
 644                   while (mid < pstr->valid_len)
 645                     if (pstr->wcs[mid] != WEOF)
 646                       break;
 647                     else
 648                       ++mid;
 649                   if (mid == pstr->valid_len)
 650                     pstr->valid_len = 0;
 651                   else
 652                     {
 653                       pstr->valid_len = pstr->offsets[mid] - offset;
 654                       if (pstr->valid_len)
 655                         {
 656                           for (low = 0; low < pstr->valid_len; ++low)
 657                             pstr->wcs[low] = WEOF;
 658                           memset (pstr->mbs, 255, pstr->valid_len);
 659                         }
 660                     }
 661                   pstr->valid_raw_len = pstr->valid_len;
 662                 }
 663             }
 664           else
 665 #endif
 666             {
 667               pstr->tip_context = re_string_context_at (pstr, offset - 1,
 668                                                         eflags);
 669 #ifdef RE_ENABLE_I18N
 670               if (pstr->mb_cur_max > 1)
 671                 memmove (pstr->wcs, pstr->wcs + offset,
 672                          (pstr->valid_len - offset) * sizeof (wint_t));
 673 #endif /* RE_ENABLE_I18N */
 674               if (BE (pstr->mbs_allocated, 0))
 675                 memmove (pstr->mbs, pstr->mbs + offset,
 676                          pstr->valid_len - offset);
 677               pstr->valid_len -= offset;
 678               pstr->valid_raw_len -= offset;
 679 #if DEBUG
 680               assert (pstr->valid_len > 0);
 681 #endif
 682             }
 683         }
 684       else
 685         {
 686           /* No, skip all characters until IDX.  */
 687           int prev_valid_len = pstr->valid_len;
 688
 689 #ifdef RE_ENABLE_I18N
 690           if (BE (pstr->offsets_needed, 0))
 691             {
 692               pstr->len = pstr->raw_len - idx + offset;
 693               pstr->stop = pstr->raw_stop - idx + offset;
 694               pstr->offsets_needed = 0;
 695             }
 696 #endif
 697           pstr->valid_len = 0;
 698 #ifdef RE_ENABLE_I18N
 699           if (pstr->mb_cur_max > 1)
 700             {
 701               int wcs_idx;
 702               wint_t wc = WEOF;
 703
 704               if (pstr->is_utf8)
 705                 {
 706                   const unsigned char *raw, *p, *end;
 707
 708                   /* Special case UTF-8.  Multi-byte chars start with any
 709                      byte other than 0x80 - 0xbf.  */
 710                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 711                   end = raw + (offset - pstr->mb_cur_max);
 712                   if (end < pstr->raw_mbs)
 713                     end = pstr->raw_mbs;
 714                   p = raw + offset - 1;
 715 #ifdef _LIBC
 716                   /* We know the wchar_t encoding is UCS4, so for the simple
 717                      case, ASCII characters, skip the conversion step.  */
 718                   if (isascii (*p) && BE (pstr->trans == NULL, 1))
 719                     {
 720                       memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 721                       /* pstr->valid_len = 0; */
 722                       wc = (wchar_t) *p;
 723                     }
 724                   else
 725 #endif
 726                     for (; p >= end; --p)
 727                       if ((*p & 0xc0) != 0x80)
 728                         {
 729                           mbstate_t cur_state;
 730                           wchar_t wc2;
 731                           int mlen = raw + pstr->len - p;
 732                           unsigned char buf[6];
 733                           size_t mbclen;
 734
 735                           if (BE (pstr->trans != NULL, 0))
 736                             {
 737                               int i = mlen < 6 ? mlen : 6;
 738                               while (--i >= 0)
 739                                 buf[i] = pstr->trans[p[i]];
 740                             }
 741                           /* XXX Don't use mbrtowc, we know which conversion
 742                              to use (UTF-8 -> UCS4).  */
 743                           memset (&cur_state, 0, sizeof (cur_state));
 744                           mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
 745                                               &cur_state);
 746                           if (raw + offset - p <= mbclen
 747                               && mbclen < (size_t) -2)
 748                             {
 749                               memset (&pstr->cur_state, '\0',
 750                                       sizeof (mbstate_t));
 751                               pstr->valid_len = mbclen - (raw + offset - p);
 752                               wc = wc2;
 753                             }
 754                           break;
 755                         }
 756                 }
 757
 758               if (wc == WEOF)
 759                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 760               if (wc == WEOF)
 761                 pstr->tip_context
 762                   = re_string_context_at (pstr, prev_valid_len - 1, eflags);
 763               else
 764                 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 765                                       && IS_WIDE_WORD_CHAR (wc))
 766                                      ? CONTEXT_WORD
 767                                      : ((IS_WIDE_NEWLINE (wc)
 768                                          && pstr->newline_anchor)
 769                                         ? CONTEXT_NEWLINE : 0));
 770               if (BE (pstr->valid_len, 0))
 771                 {
 772                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 773                     pstr->wcs[wcs_idx] = WEOF;
 774                   if (pstr->mbs_allocated)
 775                     memset (pstr->mbs, 255, pstr->valid_len);
 776                 }
 777               pstr->valid_raw_len = pstr->valid_len;
 778             }
 779           else
 780 #endif /* RE_ENABLE_I18N */
 781             {
 782               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 783               pstr->valid_raw_len = 0;
 784               if (pstr->trans)
 785                 c = pstr->trans[c];
 786               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 787                                    ? CONTEXT_WORD
 788                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 789                                       ? CONTEXT_NEWLINE : 0));
 790             }
 791         }
 792       if (!BE (pstr->mbs_allocated, 0))
 793         pstr->mbs += offset;
 794     }
 795   pstr->raw_mbs_idx = idx;
 796   pstr->len -= offset;
 797   pstr->stop -= offset;
 798
 799   /* Then build the buffers.  */
 800 #ifdef RE_ENABLE_I18N
 801   if (pstr->mb_cur_max > 1)
 802     {
 803       if (pstr->icase)
 804         {
 805           reg_errcode_t ret = build_wcs_upper_buffer (pstr);
 806           if (BE (ret != REG_NOERROR, 0))
 807             return ret;
 808         }
 809       else
 810         build_wcs_buffer (pstr);
 811     }
 812   else
 813 #endif /* RE_ENABLE_I18N */
 814     if (BE (pstr->mbs_allocated, 0))
 815       {
 816         if (pstr->icase)
 817           build_upper_buffer (pstr);
 818         else if (pstr->trans != NULL)
 819           re_string_translate_buffer (pstr);
 820       }
 821     else
 822       pstr->valid_len = pstr->len;
 823
 824   pstr->cur_idx = 0;
 825   return REG_NOERROR;
 826 }
 827
 828 static unsigned char
 829 internal_function __attribute ((pure))
 830 re_string_peek_byte_case (const re_string_t *pstr, int idx)
 831 {
 832   int ch, off;
 833
 834   /* Handle the common (easiest) cases first.  */
 835   if (BE (!pstr->mbs_allocated, 1))
 836     return re_string_peek_byte (pstr, idx);
 837
 838 #ifdef RE_ENABLE_I18N
 839   if (pstr->mb_cur_max > 1
 840       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 841     return re_string_peek_byte (pstr, idx);
 842 #endif
 843
 844   off = pstr->cur_idx + idx;
 845 #ifdef RE_ENABLE_I18N
 846   if (pstr->offsets_needed)
 847     off = pstr->offsets[off];
 848 #endif
 849
 850   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 851
 852 #ifdef RE_ENABLE_I18N
 853   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 854      this function returns CAPITAL LETTER I instead of first byte of
 855      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 856      since peek_byte_case doesn't advance cur_idx in any way.  */
 857   if (pstr->offsets_needed && !isascii (ch))
 858     return re_string_peek_byte (pstr, idx);
 859 #endif
 860
 861   return ch;
 862 }
 863
 864 static unsigned char
 865 internal_function __attribute ((pure))
 866 re_string_fetch_byte_case (re_string_t *pstr)
 867 {
 868   if (BE (!pstr->mbs_allocated, 1))
 869     return re_string_fetch_byte (pstr);
 870
 871 #ifdef RE_ENABLE_I18N
 872   if (pstr->offsets_needed)
 873     {
 874       int off, ch;
 875
 876       /* For tr_TR.UTF-8 [[:islower:]] there is
 877          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 878          in that case the whole multi-byte character and return
 879          the original letter.  On the other side, with
 880          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 881          anything else would complicate things too much.  */
 882
 883       if (!re_string_first_byte (pstr, pstr->cur_idx))
 884         return re_string_fetch_byte (pstr);
 885
 886       off = pstr->offsets[pstr->cur_idx];
 887       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 888
 889       if (! isascii (ch))
 890         return re_string_fetch_byte (pstr);
 891
 892       re_string_skip_bytes (pstr,
 893                             re_string_char_size_at (pstr, pstr->cur_idx));
 894       return ch;
 895     }
 896 #endif
 897
 898   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 899 }
 900
 901 static void
 902 internal_function
 903 re_string_destruct (re_string_t *pstr)
 904 {
 905 #ifdef RE_ENABLE_I18N
 906   re_free (pstr->wcs);
 907   re_free (pstr->offsets);
 908 #endif /* RE_ENABLE_I18N  */
 909   if (pstr->mbs_allocated)
 910     re_free (pstr->mbs);
 911 }
 912
 913 /* Return the context at IDX in INPUT.  */
 914
 915 static unsigned int
 916 internal_function
 917 re_string_context_at (const re_string_t *input, int idx, int eflags)
 918 {
 919   int c;
 920   if (BE (idx < 0, 0))
 921     /* In this case, we use the value stored in input->tip_context,
 922        since we can't know the character in input->mbs[-1] here.  */
 923     return input->tip_context;
 924   if (BE (idx == input->len, 0))
 925     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 926             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 927 #ifdef RE_ENABLE_I18N
 928   if (input->mb_cur_max > 1)
 929     {
 930       wint_t wc;
 931       int wc_idx = idx;
 932       while(input->wcs[wc_idx] == WEOF)
 933         {
 934 #ifdef DEBUG
 935           /* It must not happen.  */
 936           assert (wc_idx >= 0);
 937 #endif
 938           --wc_idx;
 939           if (wc_idx < 0)
 940             return input->tip_context;
 941         }
 942       wc = input->wcs[wc_idx];
 943       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 944         return CONTEXT_WORD;
 945       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 946               ? CONTEXT_NEWLINE : 0);
 947     }
 948   else
 949 #endif
 950     {
 951       c = re_string_byte_at (input, idx);
 952       if (bitset_contain (input->word_char, c))
 953         return CONTEXT_WORD;
 954       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 955     }
 956 }
 957 \f
 958 /* Functions for set operation.  */
 959
 960 static reg_errcode_t
 961 internal_function __attribute_warn_unused_result__
 962 re_node_set_alloc (re_node_set *set, int size)
 963 {
 964   set->alloc = size;
 965   set->nelem = 0;
 966   set->elems = re_malloc (int, size);
 967   if (BE (set->elems == NULL, 0))
 968     return REG_ESPACE;
 969   return REG_NOERROR;
 970 }
 971
 972 static reg_errcode_t
 973 internal_function __attribute_warn_unused_result__
 974 re_node_set_init_1 (re_node_set *set, int elem)
 975 {
 976   set->alloc = 1;
 977   set->nelem = 1;
 978   set->elems = re_malloc (int, 1);
 979   if (BE (set->elems == NULL, 0))
 980     {
 981       set->alloc = set->nelem = 0;
 982       return REG_ESPACE;
 983     }
 984   set->elems[0] = elem;
 985   return REG_NOERROR;
 986 }
 987
 988 static reg_errcode_t
 989 internal_function __attribute_warn_unused_result__
 990 re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
 991 {
 992   set->alloc = 2;
 993   set->elems = re_malloc (int, 2);
 994   if (BE (set->elems == NULL, 0))
 995     return REG_ESPACE;
 996   if (elem1 == elem2)
 997     {
 998       set->nelem = 1;
 999       set->elems[0] = elem1;
1000     }
1001   else
1002     {
1003       set->nelem = 2;
1004       if (elem1 < elem2)
1005         {
1006           set->elems[0] = elem1;
1007           set->elems[1] = elem2;
1008         }
1009       else
1010         {
1011           set->elems[0] = elem2;
1012           set->elems[1] = elem1;
1013         }
1014     }
1015   return REG_NOERROR;
1016 }
1017
1018 static reg_errcode_t
1019 internal_function __attribute_warn_unused_result__
1020 re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1021 {
1022   dest->nelem = src->nelem;
1023   if (src->nelem > 0)
1024     {
1025       dest->alloc = dest->nelem;
1026       dest->elems = re_malloc (int, dest->alloc);
1027       if (BE (dest->elems == NULL, 0))
1028         {
1029           dest->alloc = dest->nelem = 0;
1030           return REG_ESPACE;
1031         }
1032       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1033     }
1034   else
1035     re_node_set_init_empty (dest);
1036   return REG_NOERROR;
1037 }
1038
1039 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1040    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1041    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
1042
1043 static reg_errcode_t
1044 internal_function __attribute_warn_unused_result__
1045 re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1046                            const re_node_set *src2)
1047 {
1048   int i1, i2, is, id, delta, sbase;
1049   if (src1->nelem == 0 || src2->nelem == 0)
1050     return REG_NOERROR;
1051
1052   /* We need dest->nelem + 2 * elems_in_intersection; this is a
1053      conservative estimate.  */
1054   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1055     {
1056       int new_alloc = src1->nelem + src2->nelem + dest->alloc;
1057       int *new_elems = re_realloc (dest->elems, int, new_alloc);
1058       if (BE (new_elems == NULL, 0))
1059         return REG_ESPACE;
1060       dest->elems = new_elems;
1061       dest->alloc = new_alloc;
1062     }
1063
1064   /* Find the items in the intersection of SRC1 and SRC2, and copy
1065      into the top of DEST those that are not already in DEST itself.  */
1066   sbase = dest->nelem + src1->nelem + src2->nelem;
1067   i1 = src1->nelem - 1;
1068   i2 = src2->nelem - 1;
1069   id = dest->nelem - 1;
1070   for (;;)
1071     {
1072       if (src1->elems[i1] == src2->elems[i2])
1073         {
1074           /* Try to find the item in DEST.  Maybe we could binary search?  */
1075           while (id >= 0 && dest->elems[id] > src1->elems[i1])
1076             --id;
1077
1078           if (id < 0 || dest->elems[id] != src1->elems[i1])
1079             dest->elems[--sbase] = src1->elems[i1];
1080
1081           if (--i1 < 0 || --i2 < 0)
1082             break;
1083         }
1084
1085       /* Lower the highest of the two items.  */
1086       else if (src1->elems[i1] < src2->elems[i2])
1087         {
1088           if (--i2 < 0)
1089             break;
1090         }
1091       else
1092         {
1093           if (--i1 < 0)
1094             break;
1095         }
1096     }
1097
1098   id = dest->nelem - 1;
1099   is = dest->nelem + src1->nelem + src2->nelem - 1;
1100   delta = is - sbase + 1;
1101
1102   /* Now copy.  When DELTA becomes zero, the remaining
1103      DEST elements are already in place; this is more or
1104      less the same loop that is in re_node_set_merge.  */
1105   dest->nelem += delta;
1106   if (delta > 0 && id >= 0)
1107     for (;;)
1108       {
1109         if (dest->elems[is] > dest->elems[id])
1110           {
1111             /* Copy from the top.  */
1112             dest->elems[id + delta--] = dest->elems[is--];
1113             if (delta == 0)
1114               break;
1115           }
1116         else
1117           {
1118             /* Slide from the bottom.  */
1119             dest->elems[id + delta] = dest->elems[id];
1120             if (--id < 0)
1121               break;
1122           }
1123       }
1124
1125   /* Copy remaining SRC elements.  */
1126   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1127
1128   return REG_NOERROR;
1129 }
1130
1131 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1132    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1133
1134 static reg_errcode_t
1135 internal_function __attribute_warn_unused_result__
1136 re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1137                         const re_node_set *src2)
1138 {
1139   int i1, i2, id;
1140   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1141     {
1142       dest->alloc = src1->nelem + src2->nelem;
1143       dest->elems = re_malloc (int, dest->alloc);
1144       if (BE (dest->elems == NULL, 0))
1145         return REG_ESPACE;
1146     }
1147   else
1148     {
1149       if (src1 != NULL && src1->nelem > 0)
1150         return re_node_set_init_copy (dest, src1);
1151       else if (src2 != NULL && src2->nelem > 0)
1152         return re_node_set_init_copy (dest, src2);
1153       else
1154         re_node_set_init_empty (dest);
1155       return REG_NOERROR;
1156     }
1157   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1158     {
1159       if (src1->elems[i1] > src2->elems[i2])
1160         {
1161           dest->elems[id++] = src2->elems[i2++];
1162           continue;
1163         }
1164       if (src1->elems[i1] == src2->elems[i2])
1165         ++i2;
1166       dest->elems[id++] = src1->elems[i1++];
1167     }
1168   if (i1 < src1->nelem)
1169     {
1170       memcpy (dest->elems + id, src1->elems + i1,
1171              (src1->nelem - i1) * sizeof (int));
1172       id += src1->nelem - i1;
1173     }
1174   else if (i2 < src2->nelem)
1175     {
1176       memcpy (dest->elems + id, src2->elems + i2,
1177              (src2->nelem - i2) * sizeof (int));
1178       id += src2->nelem - i2;
1179     }
1180   dest->nelem = id;
1181   return REG_NOERROR;
1182 }
1183
1184 /* Calculate the union set of the sets DEST and SRC. And store it to
1185    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1186
1187 static reg_errcode_t
1188 internal_function __attribute_warn_unused_result__
1189 re_node_set_merge (re_node_set *dest, const re_node_set *src)
1190 {
1191   int is, id, sbase, delta;
1192   if (src == NULL || src->nelem == 0)
1193     return REG_NOERROR;
1194   if (dest->alloc < 2 * src->nelem + dest->nelem)
1195     {
1196       int new_alloc = 2 * (src->nelem + dest->alloc);
1197       int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1198       if (BE (new_buffer == NULL, 0))
1199         return REG_ESPACE;
1200       dest->elems = new_buffer;
1201       dest->alloc = new_alloc;
1202     }
1203
1204   if (BE (dest->nelem == 0, 0))
1205     {
1206       dest->nelem = src->nelem;
1207       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1208       return REG_NOERROR;
1209     }
1210
1211   /* Copy into the top of DEST the items of SRC that are not
1212      found in DEST.  Maybe we could binary search in DEST?  */
1213   for (sbase = dest->nelem + 2 * src->nelem,
1214        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1215     {
1216       if (dest->elems[id] == src->elems[is])
1217         is--, id--;
1218       else if (dest->elems[id] < src->elems[is])
1219         dest->elems[--sbase] = src->elems[is--];
1220       else /* if (dest->elems[id] > src->elems[is]) */
1221         --id;
1222     }
1223
1224   if (is >= 0)
1225     {
1226       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1227       sbase -= is + 1;
1228       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1229     }
1230
1231   id = dest->nelem - 1;
1232   is = dest->nelem + 2 * src->nelem - 1;
1233   delta = is - sbase + 1;
1234   if (delta == 0)
1235     return REG_NOERROR;
1236
1237   /* Now copy.  When DELTA becomes zero, the remaining
1238      DEST elements are already in place.  */
1239   dest->nelem += delta;
1240   for (;;)
1241     {
1242       if (dest->elems[is] > dest->elems[id])
1243         {
1244           /* Copy from the top.  */
1245           dest->elems[id + delta--] = dest->elems[is--];
1246           if (delta == 0)
1247             break;
1248         }
1249       else
1250         {
1251           /* Slide from the bottom.  */
1252           dest->elems[id + delta] = dest->elems[id];
1253           if (--id < 0)
1254             {
1255               /* Copy remaining SRC elements.  */
1256               memcpy (dest->elems, dest->elems + sbase,
1257                       delta * sizeof (int));
1258               break;
1259             }
1260         }
1261     }
1262
1263   return REG_NOERROR;
1264 }
1265
1266 /* Insert the new element ELEM to the re_node_set* SET.
1267    SET should not already have ELEM.
1268    return -1 if an error is occured, return 1 otherwise.  */
1269
1270 static int
1271 internal_function __attribute_warn_unused_result__
1272 re_node_set_insert (re_node_set *set, int elem)
1273 {
1274   int idx;
1275   /* In case the set is empty.  */
1276   if (set->alloc == 0)
1277     {
1278       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1279         return 1;
1280       else
1281         return -1;
1282     }
1283
1284   if (BE (set->nelem, 0) == 0)
1285     {
1286       /* We already guaranteed above that set->alloc != 0.  */
1287       set->elems[0] = elem;
1288       ++set->nelem;
1289       return 1;
1290     }
1291
1292   /* Realloc if we need.  */
1293   if (set->alloc == set->nelem)
1294     {
1295       int *new_elems;
1296       set->alloc = set->alloc * 2;
1297       new_elems = re_realloc (set->elems, int, set->alloc);
1298       if (BE (new_elems == NULL, 0))
1299         return -1;
1300       set->elems = new_elems;
1301     }
1302
1303   /* Move the elements which follows the new element.  Test the
1304      first element separately to skip a check in the inner loop.  */
1305   if (elem < set->elems[0])
1306     {
1307       idx = 0;
1308       for (idx = set->nelem; idx > 0; idx--)
1309         set->elems[idx] = set->elems[idx - 1];
1310     }
1311   else
1312     {
1313       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1314         set->elems[idx] = set->elems[idx - 1];
1315     }
1316
1317   /* Insert the new element.  */
1318   set->elems[idx] = elem;
1319   ++set->nelem;
1320   return 1;
1321 }
1322
1323 /* Insert the new element ELEM to the re_node_set* SET.
1324    SET should not already have any element greater than or equal to ELEM.
1325    Return -1 if an error is occured, return 1 otherwise.  */
1326
1327 static int
1328 internal_function __attribute_warn_unused_result__
1329 re_node_set_insert_last (re_node_set *set, int elem)
1330 {
1331   /* Realloc if we need.  */
1332   if (set->alloc == set->nelem)
1333     {
1334       int *new_elems;
1335       set->alloc = (set->alloc + 1) * 2;
1336       new_elems = re_realloc (set->elems, int, set->alloc);
1337       if (BE (new_elems == NULL, 0))
1338         return -1;
1339       set->elems = new_elems;
1340     }
1341
1342   /* Insert the new element.  */
1343   set->elems[set->nelem++] = elem;
1344   return 1;
1345 }
1346
1347 /* Compare two node sets SET1 and SET2.
1348    return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1349
1350 static int
1351 internal_function __attribute ((pure))
1352 re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1353 {
1354   int i;
1355   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1356     return 0;
1357   for (i = set1->nelem ; --i >= 0 ; )
1358     if (set1->elems[i] != set2->elems[i])
1359       return 0;
1360   return 1;
1361 }
1362
1363 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1364
1365 static int
1366 internal_function __attribute ((pure))
1367 re_node_set_contains (const re_node_set *set, int elem)
1368 {
1369   unsigned int idx, right, mid;
1370   if (set->nelem <= 0)
1371     return 0;
1372
1373   /* Binary search the element.  */
1374   idx = 0;
1375   right = set->nelem - 1;
1376   while (idx < right)
1377     {
1378       mid = (idx + right) / 2;
1379       if (set->elems[mid] < elem)
1380         idx = mid + 1;
1381       else
1382         right = mid;
1383     }
1384   return set->elems[idx] == elem ? idx + 1 : 0;
1385 }
1386
1387 static void
1388 internal_function
1389 re_node_set_remove_at (re_node_set *set, int idx)
1390 {
1391   if (idx < 0 || idx >= set->nelem)
1392     return;
1393   --set->nelem;
1394   for (; idx < set->nelem; idx++)
1395     set->elems[idx] = set->elems[idx + 1];
1396 }
1397 \f
1398
1399 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1400    Or return -1, if an error will be occured.  */
1401
1402 static int
1403 internal_function
1404 re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1405 {
1406   int type = token.type;
1407   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1408     {
1409       size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1410       int *new_nexts, *new_indices;
1411       re_node_set *new_edests, *new_eclosures;
1412       re_token_t *new_nodes;
1413
1414       /* Avoid overflows in realloc.  */
1415       const size_t max_object_size = MAX (sizeof (re_token_t),
1416                                           MAX (sizeof (re_node_set),
1417                                                sizeof (int)));
1418       if (BE (SIZE_MAX / max_object_size < new_nodes_alloc, 0))
1419         return -1;
1420
1421       new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1422       if (BE (new_nodes == NULL, 0))
1423         return -1;
1424       dfa->nodes = new_nodes;
1425       new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1426       new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1427       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1428       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1429       if (BE (new_nexts == NULL || new_indices == NULL
1430               || new_edests == NULL || new_eclosures == NULL, 0))
1431         return -1;
1432       dfa->nexts = new_nexts;
1433       dfa->org_indices = new_indices;
1434       dfa->edests = new_edests;
1435       dfa->eclosures = new_eclosures;
1436       dfa->nodes_alloc = new_nodes_alloc;
1437     }
1438   dfa->nodes[dfa->nodes_len] = token;
1439   dfa->nodes[dfa->nodes_len].constraint = 0;
1440 #ifdef RE_ENABLE_I18N
1441   dfa->nodes[dfa->nodes_len].accept_mb =
1442     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1443 #endif
1444   dfa->nexts[dfa->nodes_len] = -1;
1445   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1446   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1447   return dfa->nodes_len++;
1448 }
1449
1450 static inline unsigned int
1451 internal_function
1452 calc_state_hash (const re_node_set *nodes, unsigned int context)
1453 {
1454   unsigned int hash = nodes->nelem + context;
1455   int i;
1456   for (i = 0 ; i < nodes->nelem ; i++)
1457     hash += nodes->elems[i];
1458   return hash;
1459 }
1460
1461 /* Search for the state whose node_set is equivalent to NODES.
1462    Return the pointer to the state, if we found it in the DFA.
1463    Otherwise create the new one and return it.  In case of an error
1464    return NULL and set the error code in ERR.
1465    Note: - We assume NULL as the invalid state, then it is possible that
1466            return value is NULL and ERR is REG_NOERROR.
1467          - We never return non-NULL value in case of any errors, it is for
1468            optimization.  */
1469
1470 static re_dfastate_t *
1471 internal_function __attribute_warn_unused_result__
1472 re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1473                   const re_node_set *nodes)
1474 {
1475   unsigned int hash;
1476   re_dfastate_t *new_state;
1477   struct re_state_table_entry *spot;
1478   int i;
1479   if (BE (nodes->nelem == 0, 0))
1480     {
1481       *err = REG_NOERROR;
1482       return NULL;
1483     }
1484   hash = calc_state_hash (nodes, 0);
1485   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1486
1487   for (i = 0 ; i < spot->num ; i++)
1488     {
1489       re_dfastate_t *state = spot->array[i];
1490       if (hash != state->hash)
1491         continue;
1492       if (re_node_set_compare (&state->nodes, nodes))
1493         return state;
1494     }
1495
1496   /* There are no appropriate state in the dfa, create the new one.  */
1497   new_state = create_ci_newstate (dfa, nodes, hash);
1498   if (BE (new_state == NULL, 0))
1499     *err = REG_ESPACE;
1500
1501   return new_state;
1502 }
1503
1504 /* Search for the state whose node_set is equivalent to NODES and
1505    whose context is equivalent to CONTEXT.
1506    Return the pointer to the state, if we found it in the DFA.
1507    Otherwise create the new one and return it.  In case of an error
1508    return NULL and set the error code in ERR.
1509    Note: - We assume NULL as the invalid state, then it is possible that
1510            return value is NULL and ERR is REG_NOERROR.
1511          - We never return non-NULL value in case of any errors, it is for
1512            optimization.  */
1513
1514 static re_dfastate_t *
1515 internal_function __attribute_warn_unused_result__
1516 re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1517                           const re_node_set *nodes, unsigned int context)
1518 {
1519   unsigned int hash;
1520   re_dfastate_t *new_state;
1521   struct re_state_table_entry *spot;
1522   int i;
1523   if (nodes->nelem == 0)
1524     {
1525       *err = REG_NOERROR;
1526       return NULL;
1527     }
1528   hash = calc_state_hash (nodes, context);
1529   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1530
1531   for (i = 0 ; i < spot->num ; i++)
1532     {
1533       re_dfastate_t *state = spot->array[i];
1534       if (state->hash == hash
1535           && state->context == context
1536           && re_node_set_compare (state->entrance_nodes, nodes))
1537         return state;
1538     }
1539   /* There are no appropriate state in `dfa', create the new one.  */
1540   new_state = create_cd_newstate (dfa, nodes, context, hash);
1541   if (BE (new_state == NULL, 0))
1542     *err = REG_ESPACE;
1543
1544   return new_state;
1545 }
1546
1547 /* Finish initialization of the new state NEWSTATE, and using its hash value
1548    HASH put in the appropriate bucket of DFA's state table.  Return value
1549    indicates the error code if failed.  */
1550
1551 static reg_errcode_t
1552 __attribute_warn_unused_result__
1553 register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1554                 unsigned int hash)
1555 {
1556   struct re_state_table_entry *spot;
1557   reg_errcode_t err;
1558   int i;
1559
1560   newstate->hash = hash;
1561   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1562   if (BE (err != REG_NOERROR, 0))
1563     return REG_ESPACE;
1564   for (i = 0; i < newstate->nodes.nelem; i++)
1565     {
1566       int elem = newstate->nodes.elems[i];
1567       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1568         if (re_node_set_insert_last (&newstate->non_eps_nodes, elem) < 0)
1569           return REG_ESPACE;
1570     }
1571
1572   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1573   if (BE (spot->alloc <= spot->num, 0))
1574     {
1575       int new_alloc = 2 * spot->num + 2;
1576       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1577                                               new_alloc);
1578       if (BE (new_array == NULL, 0))
1579         return REG_ESPACE;
1580       spot->array = new_array;
1581       spot->alloc = new_alloc;
1582     }
1583   spot->array[spot->num++] = newstate;
1584   return REG_NOERROR;
1585 }
1586
1587 static void
1588 free_state (re_dfastate_t *state)
1589 {
1590   re_node_set_free (&state->non_eps_nodes);
1591   re_node_set_free (&state->inveclosure);
1592   if (state->entrance_nodes != &state->nodes)
1593     {
1594       re_node_set_free (state->entrance_nodes);
1595       re_free (state->entrance_nodes);
1596     }
1597   re_node_set_free (&state->nodes);
1598   re_free (state->word_trtable);
1599   re_free (state->trtable);
1600   re_free (state);
1601 }
1602
1603 /* Create the new state which is independ of contexts.
1604    Return the new state if succeeded, otherwise return NULL.  */
1605
1606 static re_dfastate_t *
1607 internal_function __attribute_warn_unused_result__
1608 create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1609                     unsigned int hash)
1610 {
1611   int i;
1612   reg_errcode_t err;
1613   re_dfastate_t *newstate;
1614
1615   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1616   if (BE (newstate == NULL, 0))
1617     return NULL;
1618   err = re_node_set_init_copy (&newstate->nodes, nodes);
1619   if (BE (err != REG_NOERROR, 0))
1620     {
1621       re_free (newstate);
1622       return NULL;
1623     }
1624
1625   newstate->entrance_nodes = &newstate->nodes;
1626   for (i = 0 ; i < nodes->nelem ; i++)
1627     {
1628       re_token_t *node = dfa->nodes + nodes->elems[i];
1629       re_token_type_t type = node->type;
1630       if (type == CHARACTER && !node->constraint)
1631         continue;
1632 #ifdef RE_ENABLE_I18N
1633       newstate->accept_mb |= node->accept_mb;
1634 #endif /* RE_ENABLE_I18N */
1635
1636       /* If the state has the halt node, the state is a halt state.  */
1637       if (type == END_OF_RE)
1638         newstate->halt = 1;
1639       else if (type == OP_BACK_REF)
1640         newstate->has_backref = 1;
1641       else if (type == ANCHOR || node->constraint)
1642         newstate->has_constraint = 1;
1643     }
1644   err = register_state (dfa, newstate, hash);
1645   if (BE (err != REG_NOERROR, 0))
1646     {
1647       free_state (newstate);
1648       newstate = NULL;
1649     }
1650   return newstate;
1651 }
1652
1653 /* Create the new state which is depend on the context CONTEXT.
1654    Return the new state if succeeded, otherwise return NULL.  */
1655
1656 static re_dfastate_t *
1657 internal_function __attribute_warn_unused_result__
1658 create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1659                     unsigned int context, unsigned int hash)
1660 {
1661   int i, nctx_nodes = 0;
1662   reg_errcode_t err;
1663   re_dfastate_t *newstate;
1664
1665   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1666   if (BE (newstate == NULL, 0))
1667     return NULL;
1668   err = re_node_set_init_copy (&newstate->nodes, nodes);
1669   if (BE (err != REG_NOERROR, 0))
1670     {
1671       re_free (newstate);
1672       return NULL;
1673     }
1674
1675   newstate->context = context;
1676   newstate->entrance_nodes = &newstate->nodes;
1677
1678   for (i = 0 ; i < nodes->nelem ; i++)
1679     {
1680       re_token_t *node = dfa->nodes + nodes->elems[i];
1681       re_token_type_t type = node->type;
1682       unsigned int constraint = node->constraint;
1683
1684       if (type == CHARACTER && !constraint)
1685         continue;
1686 #ifdef RE_ENABLE_I18N
1687       newstate->accept_mb |= node->accept_mb;
1688 #endif /* RE_ENABLE_I18N */
1689
1690       /* If the state has the halt node, the state is a halt state.  */
1691       if (type == END_OF_RE)
1692         newstate->halt = 1;
1693       else if (type == OP_BACK_REF)
1694         newstate->has_backref = 1;
1695
1696       if (constraint)
1697         {
1698           if (newstate->entrance_nodes == &newstate->nodes)
1699             {
1700               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1701               if (BE (newstate->entrance_nodes == NULL, 0))
1702                 {
1703                   free_state (newstate);
1704                   return NULL;
1705                 }
1706               if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1707                   != REG_NOERROR)
1708                 return NULL;
1709               nctx_nodes = 0;
1710               newstate->has_constraint = 1;
1711             }
1712
1713           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1714             {
1715               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1716               ++nctx_nodes;
1717             }
1718         }
1719     }
1720   err = register_state (dfa, newstate, hash);
1721   if (BE (err != REG_NOERROR, 0))
1722     {
1723       free_state (newstate);
1724       newstate = NULL;
1725     }
1726   return  newstate;
1727 }