posix/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static void re_string_construct_common (const char *str, int len,
  22                                         re_string_t *pstr,
  23                                         RE_TRANSLATE_TYPE trans, int icase,
  24                                         const re_dfa_t *dfa) internal_function;
  25 #ifdef RE_ENABLE_I18N
  26 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
  27                                  wint_t *last_wc) internal_function;
  28 #endif /* RE_ENABLE_I18N */
  29 static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
  30                                      unsigned int hash) internal_function;
  31 static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
  32                                           const re_node_set *nodes,
  33                                           unsigned int hash) internal_function;
  34 static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
  35                                           const re_node_set *nodes,
  36                                           unsigned int context,
  37                                           unsigned int hash) internal_function;
  38 static unsigned int inline calc_state_hash (const re_node_set *nodes,
  39                                             unsigned int context) internal_function;
  40 \f
  41 /* Functions for string operation.  */
  42
  43 /* This function allocate the buffers.  It is necessary to call
  44    re_string_reconstruct before using the object.  */
  45
  46 static reg_errcode_t
  47 re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
  48      re_string_t *pstr;
  49      const char *str;
  50      int len, init_len, icase;
  51      RE_TRANSLATE_TYPE trans;
  52      const re_dfa_t *dfa;
  53 {
  54   reg_errcode_t ret;
  55   int init_buf_len;
  56
  57   /* Ensure at least one character fits into the buffers.  */
  58   if (init_len < dfa->mb_cur_max)
  59     init_len = dfa->mb_cur_max;
  60   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  61   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  62
  63   ret = re_string_realloc_buffers (pstr, init_buf_len);
  64   if (BE (ret != REG_NOERROR, 0))
  65     return ret;
  66
  67   pstr->word_char = dfa->word_char;
  68   pstr->word_ops_used = dfa->word_ops_used;
  69   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  70   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  71   pstr->valid_raw_len = pstr->valid_len;
  72   return REG_NOERROR;
  73 }
  74
  75 /* This function allocate the buffers, and initialize them.  */
  76
  77 static reg_errcode_t
  78 re_string_construct (pstr, str, len, trans, icase, dfa)
  79      re_string_t *pstr;
  80      const char *str;
  81      int len, icase;
  82      RE_TRANSLATE_TYPE trans;
  83      const re_dfa_t *dfa;
  84 {
  85   reg_errcode_t ret;
  86   memset (pstr, '\0', sizeof (re_string_t));
  87   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  88
  89   if (len > 0)
  90     {
  91       ret = re_string_realloc_buffers (pstr, len + 1);
  92       if (BE (ret != REG_NOERROR, 0))
  93         return ret;
  94     }
  95   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  96
  97   if (icase)
  98     {
  99 #ifdef RE_ENABLE_I18N
 100       if (dfa->mb_cur_max > 1)
 101         {
 102           while (1)
 103             {
 104               ret = build_wcs_upper_buffer (pstr);
 105               if (BE (ret != REG_NOERROR, 0))
 106                 return ret;
 107               if (pstr->valid_raw_len >= len)
 108                 break;
 109               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
 110                 break;
 111               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
 112               if (BE (ret != REG_NOERROR, 0))
 113                 return ret;
 114             }
 115         }
 116       else
 117 #endif /* RE_ENABLE_I18N  */
 118         build_upper_buffer (pstr);
 119     }
 120   else
 121     {
 122 #ifdef RE_ENABLE_I18N
 123       if (dfa->mb_cur_max > 1)
 124         build_wcs_buffer (pstr);
 125       else
 126 #endif /* RE_ENABLE_I18N  */
 127         {
 128           if (trans != NULL)
 129             re_string_translate_buffer (pstr);
 130           else
 131             {
 132               pstr->valid_len = pstr->bufs_len;
 133               pstr->valid_raw_len = pstr->bufs_len;
 134             }
 135         }
 136     }
 137
 138   return REG_NOERROR;
 139 }
 140
 141 /* Helper functions for re_string_allocate, and re_string_construct.  */
 142
 143 static reg_errcode_t
 144 re_string_realloc_buffers (pstr, new_buf_len)
 145      re_string_t *pstr;
 146      int new_buf_len;
 147 {
 148 #ifdef RE_ENABLE_I18N
 149   if (pstr->mb_cur_max > 1)
 150     {
 151       wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
 152       if (BE (new_array == NULL, 0))
 153         return REG_ESPACE;
 154       pstr->wcs = new_array;
 155       if (pstr->offsets != NULL)
 156         {
 157           int *new_array = re_realloc (pstr->offsets, int, new_buf_len);
 158           if (BE (new_array == NULL, 0))
 159             return REG_ESPACE;
 160           pstr->offsets = new_array;
 161         }
 162     }
 163 #endif /* RE_ENABLE_I18N  */
 164   if (pstr->mbs_allocated)
 165     {
 166       unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
 167                                              new_buf_len);
 168       if (BE (new_array == NULL, 0))
 169         return REG_ESPACE;
 170       pstr->mbs = new_array;
 171     }
 172   pstr->bufs_len = new_buf_len;
 173   return REG_NOERROR;
 174 }
 175
 176
 177 static void
 178 re_string_construct_common (str, len, pstr, trans, icase, dfa)
 179      const char *str;
 180      int len;
 181      re_string_t *pstr;
 182      RE_TRANSLATE_TYPE trans;
 183      int icase;
 184      const re_dfa_t *dfa;
 185 {
 186   pstr->raw_mbs = (const unsigned char *) str;
 187   pstr->len = len;
 188   pstr->raw_len = len;
 189   pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
 190   pstr->icase = icase ? 1 : 0;
 191   pstr->mbs_allocated = (trans != NULL || icase);
 192   pstr->mb_cur_max = dfa->mb_cur_max;
 193   pstr->is_utf8 = dfa->is_utf8;
 194   pstr->map_notascii = dfa->map_notascii;
 195   pstr->stop = pstr->len;
 196   pstr->raw_stop = pstr->stop;
 197 }
 198
 199 #ifdef RE_ENABLE_I18N
 200
 201 /* Build wide character buffer PSTR->WCS.
 202    If the byte sequence of the string are:
 203      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 204    Then wide character buffer will be:
 205      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 206    We use WEOF for padding, they indicate that the position isn't
 207    a first byte of a multibyte character.
 208
 209    Note that this function assumes PSTR->VALID_LEN elements are already
 210    built and starts from PSTR->VALID_LEN.  */
 211
 212 static void
 213 build_wcs_buffer (pstr)
 214      re_string_t *pstr;
 215 {
 216 #ifdef _LIBC
 217   unsigned char buf[pstr->mb_cur_max];
 218 #else
 219   unsigned char buf[64];
 220 #endif
 221   mbstate_t prev_st;
 222   int byte_idx, end_idx, mbclen, remain_len;
 223
 224   /* Build the buffers from pstr->valid_len to either pstr->len or
 225      pstr->bufs_len.  */
 226   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 227   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 228     {
 229       wchar_t wc;
 230       const char *p;
 231
 232       remain_len = end_idx - byte_idx;
 233       prev_st = pstr->cur_state;
 234       /* Apply the translation if we need.  */
 235       if (BE (pstr->trans != NULL, 0))
 236         {
 237           int i, ch;
 238
 239           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 240             {
 241               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 242               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 243             }
 244           p = (const char *) buf;
 245         }
 246       else
 247         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 248       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 249       if (BE (mbclen == (size_t) -2, 0))
 250         {
 251           /* The buffer doesn't have enough space, finish to build.  */
 252           pstr->cur_state = prev_st;
 253           break;
 254         }
 255       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 256         {
 257           /* We treat these cases as a singlebyte character.  */
 258           mbclen = 1;
 259           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 260           if (BE (pstr->trans != NULL, 0))
 261             wc = pstr->trans[wc];
 262           pstr->cur_state = prev_st;
 263         }
 264
 265       /* Write wide character and padding.  */
 266       pstr->wcs[byte_idx++] = wc;
 267       /* Write paddings.  */
 268       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 269         pstr->wcs[byte_idx++] = WEOF;
 270     }
 271   pstr->valid_len = byte_idx;
 272   pstr->valid_raw_len = byte_idx;
 273 }
 274
 275 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 276    but for REG_ICASE.  */
 277
 278 static int
 279 build_wcs_upper_buffer (pstr)
 280      re_string_t *pstr;
 281 {
 282   mbstate_t prev_st;
 283   int src_idx, byte_idx, end_idx, mbclen, remain_len;
 284 #ifdef _LIBC
 285   unsigned char buf[pstr->mb_cur_max];
 286 #else
 287   unsigned char buf[64];
 288 #endif
 289
 290   byte_idx = pstr->valid_len;
 291   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 292
 293   /* The following optimization assumes that ASCII characters can be
 294      mapped to wide characters with a simple cast.  */
 295   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 296     {
 297       while (byte_idx < end_idx)
 298         {
 299           wchar_t wc;
 300
 301           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 302               && mbsinit (&pstr->cur_state))
 303             {
 304               /* In case of a singlebyte character.  */
 305               pstr->mbs[byte_idx]
 306                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 307               /* The next step uses the assumption that wchar_t is encoded
 308                  ASCII-safe: all ASCII values can be converted like this.  */
 309               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 310               ++byte_idx;
 311               continue;
 312             }
 313
 314           remain_len = end_idx - byte_idx;
 315           prev_st = pstr->cur_state;
 316           mbclen = mbrtowc (&wc,
 317                             ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 318                              + byte_idx), remain_len, &pstr->cur_state);
 319           if (BE (mbclen > 0, 1))
 320             {
 321               wchar_t wcu = wc;
 322               if (iswlower (wc))
 323                 {
 324                   int mbcdlen;
 325
 326                   wcu = towupper (wc);
 327                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 328                   if (BE (mbclen == mbcdlen, 1))
 329                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 330                   else
 331                     {
 332                       src_idx = byte_idx;
 333                       goto offsets_needed;
 334                     }
 335                 }
 336               else
 337                 memcpy (pstr->mbs + byte_idx,
 338                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 339               pstr->wcs[byte_idx++] = wcu;
 340               /* Write paddings.  */
 341               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 342                 pstr->wcs[byte_idx++] = WEOF;
 343             }
 344           else if (mbclen == (size_t) -1 || mbclen == 0)
 345             {
 346               /* It is an invalid character or '\0'.  Just use the byte.  */
 347               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 348               pstr->mbs[byte_idx] = ch;
 349               /* And also cast it to wide char.  */
 350               pstr->wcs[byte_idx++] = (wchar_t) ch;
 351               if (BE (mbclen == (size_t) -1, 0))
 352                 pstr->cur_state = prev_st;
 353             }
 354           else
 355             {
 356               /* The buffer doesn't have enough space, finish to build.  */
 357               pstr->cur_state = prev_st;
 358               break;
 359             }
 360         }
 361       pstr->valid_len = byte_idx;
 362       pstr->valid_raw_len = byte_idx;
 363       return REG_NOERROR;
 364     }
 365   else
 366     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 367       {
 368         wchar_t wc;
 369         const char *p;
 370       offsets_needed:
 371         remain_len = end_idx - byte_idx;
 372         prev_st = pstr->cur_state;
 373         if (BE (pstr->trans != NULL, 0))
 374           {
 375             int i, ch;
 376
 377             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 378               {
 379                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 380                 buf[i] = pstr->trans[ch];
 381               }
 382             p = (const char *) buf;
 383           }
 384         else
 385           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 386         mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 387         if (BE (mbclen > 0, 1))
 388           {
 389             wchar_t wcu = wc;
 390             if (iswlower (wc))
 391               {
 392                 int mbcdlen;
 393
 394                 wcu = towupper (wc);
 395                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 396                 if (BE (mbclen == mbcdlen, 1))
 397                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 398                 else
 399                   {
 400                     int i;
 401
 402                     if (byte_idx + mbcdlen > pstr->bufs_len)
 403                       {
 404                         pstr->cur_state = prev_st;
 405                         break;
 406                       }
 407
 408                     if (pstr->offsets == NULL)
 409                       {
 410                         pstr->offsets = re_malloc (int, pstr->bufs_len);
 411
 412                         if (pstr->offsets == NULL)
 413                           return REG_ESPACE;
 414                       }
 415                     if (!pstr->offsets_needed)
 416                       {
 417                         for (i = 0; i < byte_idx; ++i)
 418                           pstr->offsets[i] = i;
 419                         pstr->offsets_needed = 1;
 420                       }
 421
 422                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 423                     pstr->wcs[byte_idx] = wcu;
 424                     pstr->offsets[byte_idx] = src_idx;
 425                     for (i = 1; i < mbcdlen; ++i)
 426                       {
 427                         pstr->offsets[byte_idx + i]
 428                           = src_idx + (i < mbclen ? i : mbclen - 1);
 429                         pstr->wcs[byte_idx + i] = WEOF;
 430                       }
 431                     pstr->len += mbcdlen - mbclen;
 432                     if (pstr->raw_stop > src_idx)
 433                       pstr->stop += mbcdlen - mbclen;
 434                     end_idx = (pstr->bufs_len > pstr->len)
 435                               ? pstr->len : pstr->bufs_len;
 436                     byte_idx += mbcdlen;
 437                     src_idx += mbclen;
 438                     continue;
 439                   }
 440               }
 441             else
 442               memcpy (pstr->mbs + byte_idx, p, mbclen);
 443
 444             if (BE (pstr->offsets_needed != 0, 0))
 445               {
 446                 int i;
 447                 for (i = 0; i < mbclen; ++i)
 448                   pstr->offsets[byte_idx + i] = src_idx + i;
 449               }
 450             src_idx += mbclen;
 451
 452             pstr->wcs[byte_idx++] = wcu;
 453             /* Write paddings.  */
 454             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 455               pstr->wcs[byte_idx++] = WEOF;
 456           }
 457         else if (mbclen == (size_t) -1 || mbclen == 0)
 458           {
 459             /* It is an invalid character or '\0'.  Just use the byte.  */
 460             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 461
 462             if (BE (pstr->trans != NULL, 0))
 463               ch = pstr->trans [ch];
 464             pstr->mbs[byte_idx] = ch;
 465
 466             if (BE (pstr->offsets_needed != 0, 0))
 467               pstr->offsets[byte_idx] = src_idx;
 468             ++src_idx;
 469
 470             /* And also cast it to wide char.  */
 471             pstr->wcs[byte_idx++] = (wchar_t) ch;
 472             if (BE (mbclen == (size_t) -1, 0))
 473               pstr->cur_state = prev_st;
 474           }
 475         else
 476           {
 477             /* The buffer doesn't have enough space, finish to build.  */
 478             pstr->cur_state = prev_st;
 479             break;
 480           }
 481       }
 482   pstr->valid_len = byte_idx;
 483   pstr->valid_raw_len = src_idx;
 484   return REG_NOERROR;
 485 }
 486
 487 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 488    Return the index.  */
 489
 490 static int
 491 re_string_skip_chars (pstr, new_raw_idx, last_wc)
 492      re_string_t *pstr;
 493      int new_raw_idx;
 494      wint_t *last_wc;
 495 {
 496   mbstate_t prev_st;
 497   int rawbuf_idx, mbclen;
 498   wchar_t wc = 0;
 499
 500   /* Skip the characters which are not necessary to check.  */
 501   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 502        rawbuf_idx < new_raw_idx;)
 503     {
 504       int remain_len;
 505       remain_len = pstr->len - rawbuf_idx;
 506       prev_st = pstr->cur_state;
 507       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
 508                         remain_len, &pstr->cur_state);
 509       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 510         {
 511           /* We treat these cases as a singlebyte character.  */
 512           mbclen = 1;
 513           pstr->cur_state = prev_st;
 514         }
 515       /* Then proceed the next character.  */
 516       rawbuf_idx += mbclen;
 517     }
 518   *last_wc = (wint_t) wc;
 519   return rawbuf_idx;
 520 }
 521 #endif /* RE_ENABLE_I18N  */
 522
 523 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 524    This function is used in case of REG_ICASE.  */
 525
 526 static void
 527 build_upper_buffer (pstr)
 528      re_string_t *pstr;
 529 {
 530   int char_idx, end_idx;
 531   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 532
 533   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 534     {
 535       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 536       if (BE (pstr->trans != NULL, 0))
 537         ch = pstr->trans[ch];
 538       if (islower (ch))
 539         pstr->mbs[char_idx] = toupper (ch);
 540       else
 541         pstr->mbs[char_idx] = ch;
 542     }
 543   pstr->valid_len = char_idx;
 544   pstr->valid_raw_len = char_idx;
 545 }
 546
 547 /* Apply TRANS to the buffer in PSTR.  */
 548
 549 static void
 550 re_string_translate_buffer (pstr)
 551      re_string_t *pstr;
 552 {
 553   int buf_idx, end_idx;
 554   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 555
 556   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 557     {
 558       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 559       pstr->mbs[buf_idx] = pstr->trans[ch];
 560     }
 561
 562   pstr->valid_len = buf_idx;
 563   pstr->valid_raw_len = buf_idx;
 564 }
 565
 566 /* This function re-construct the buffers.
 567    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 568    convert to upper case in case of REG_ICASE, apply translation.  */
 569
 570 static reg_errcode_t
 571 re_string_reconstruct (pstr, idx, eflags)
 572      re_string_t *pstr;
 573      int idx, eflags;
 574 {
 575   int offset = idx - pstr->raw_mbs_idx;
 576   if (BE (offset < 0, 0))
 577     {
 578       /* Reset buffer.  */
 579 #ifdef RE_ENABLE_I18N
 580       if (pstr->mb_cur_max > 1)
 581         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 582 #endif /* RE_ENABLE_I18N */
 583       pstr->len = pstr->raw_len;
 584       pstr->stop = pstr->raw_stop;
 585       pstr->valid_len = 0;
 586       pstr->raw_mbs_idx = 0;
 587       pstr->valid_raw_len = 0;
 588       pstr->offsets_needed = 0;
 589       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 590                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 591       if (!pstr->mbs_allocated)
 592         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 593       offset = idx;
 594     }
 595
 596   if (BE (offset != 0, 1))
 597     {
 598       /* Are the characters which are already checked remain?  */
 599       if (BE (offset < pstr->valid_raw_len, 1)
 600 #ifdef RE_ENABLE_I18N
 601           /* Handling this would enlarge the code too much.
 602              Accept a slowdown in that case.  */
 603           && pstr->offsets_needed == 0
 604 #endif
 605          )
 606         {
 607           /* Yes, move them to the front of the buffer.  */
 608           pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
 609 #ifdef RE_ENABLE_I18N
 610           if (pstr->mb_cur_max > 1)
 611             memmove (pstr->wcs, pstr->wcs + offset,
 612                      (pstr->valid_len - offset) * sizeof (wint_t));
 613 #endif /* RE_ENABLE_I18N */
 614           if (BE (pstr->mbs_allocated, 0))
 615             memmove (pstr->mbs, pstr->mbs + offset,
 616                      pstr->valid_len - offset);
 617           pstr->valid_len -= offset;
 618           pstr->valid_raw_len -= offset;
 619 #if DEBUG
 620           assert (pstr->valid_len > 0);
 621 #endif
 622         }
 623       else
 624         {
 625           /* No, skip all characters until IDX.  */
 626 #ifdef RE_ENABLE_I18N
 627           if (BE (pstr->offsets_needed, 0))
 628             {
 629               pstr->len = pstr->raw_len - idx + offset;
 630               pstr->stop = pstr->raw_stop - idx + offset;
 631               pstr->offsets_needed = 0;
 632             }
 633 #endif
 634           pstr->valid_len = 0;
 635           pstr->valid_raw_len = 0;
 636 #ifdef RE_ENABLE_I18N
 637           if (pstr->mb_cur_max > 1)
 638             {
 639               int wcs_idx;
 640               wint_t wc = WEOF;
 641
 642               if (pstr->is_utf8)
 643                 {
 644                   const unsigned char *raw, *p, *q, *end;
 645
 646                   /* Special case UTF-8.  Multi-byte chars start with any
 647                      byte other than 0x80 - 0xbf.  */
 648                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 649                   end = raw + (offset - pstr->mb_cur_max);
 650                   for (p = raw + offset - 1; p >= end; --p)
 651                     if ((*p & 0xc0) != 0x80)
 652                       {
 653                         mbstate_t cur_state;
 654                         wchar_t wc2;
 655                         int mlen = raw + pstr->len - p;
 656                         unsigned char buf[6];
 657
 658                         q = p;
 659                         if (BE (pstr->trans != NULL, 0))
 660                           {
 661                             int i = mlen < 6 ? mlen : 6;
 662                             while (--i >= 0)
 663                               buf[i] = pstr->trans[p[i]];
 664                             q = buf;
 665                           }
 666                         /* XXX Don't use mbrtowc, we know which conversion
 667                            to use (UTF-8 -> UCS4).  */
 668                         memset (&cur_state, 0, sizeof (cur_state));
 669                         mlen = mbrtowc (&wc2, p, mlen, &cur_state)
 670                                - (raw + offset - p);
 671                         if (mlen >= 0)
 672                           {
 673                             memset (&pstr->cur_state, '\0',
 674                                     sizeof (mbstate_t));
 675                             pstr->valid_len = mlen;
 676                             wc = wc2;
 677                           }
 678                         break;
 679                       }
 680                 }
 681
 682               if (wc == WEOF)
 683                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 684               if (BE (pstr->valid_len, 0))
 685                 {
 686                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 687                     pstr->wcs[wcs_idx] = WEOF;
 688                   if (pstr->mbs_allocated)
 689                     memset (pstr->mbs, 255, pstr->valid_len);
 690                 }
 691               pstr->valid_raw_len = pstr->valid_len;
 692               pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 693                                     && IS_WIDE_WORD_CHAR (wc))
 694                                    ? CONTEXT_WORD
 695                                    : ((IS_WIDE_NEWLINE (wc)
 696                                        && pstr->newline_anchor)
 697                                       ? CONTEXT_NEWLINE : 0));
 698             }
 699           else
 700 #endif /* RE_ENABLE_I18N */
 701             {
 702               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 703               if (pstr->trans)
 704                 c = pstr->trans[c];
 705               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 706                                    ? CONTEXT_WORD
 707                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 708                                       ? CONTEXT_NEWLINE : 0));
 709             }
 710         }
 711       if (!BE (pstr->mbs_allocated, 0))
 712         pstr->mbs += offset;
 713     }
 714   pstr->raw_mbs_idx = idx;
 715   pstr->len -= offset;
 716   pstr->stop -= offset;
 717
 718   /* Then build the buffers.  */
 719 #ifdef RE_ENABLE_I18N
 720   if (pstr->mb_cur_max > 1)
 721     {
 722       if (pstr->icase)
 723         {
 724           int ret = build_wcs_upper_buffer (pstr);
 725           if (BE (ret != REG_NOERROR, 0))
 726             return ret;
 727         }
 728       else
 729         build_wcs_buffer (pstr);
 730     }
 731   else
 732 #endif /* RE_ENABLE_I18N */
 733   if (BE (pstr->mbs_allocated, 0))
 734     {
 735       if (pstr->icase)
 736         build_upper_buffer (pstr);
 737       else if (pstr->trans != NULL)
 738         re_string_translate_buffer (pstr);
 739     }
 740   else
 741     pstr->valid_len = pstr->len;
 742
 743   pstr->cur_idx = 0;
 744   return REG_NOERROR;
 745 }
 746
 747 static unsigned char
 748 re_string_peek_byte_case (pstr, idx)
 749      const re_string_t *pstr;
 750      int idx;
 751 {
 752   int ch, off;
 753
 754   /* Handle the common (easiest) cases first.  */
 755   if (BE (!pstr->mbs_allocated, 1))
 756     return re_string_peek_byte (pstr, idx);
 757
 758 #ifdef RE_ENABLE_I18N
 759   if (pstr->mb_cur_max > 1
 760       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 761     return re_string_peek_byte (pstr, idx);
 762 #endif
 763
 764   off = pstr->cur_idx + idx;
 765 #ifdef RE_ENABLE_I18N
 766   if (pstr->offsets_needed)
 767     off = pstr->offsets[off];
 768 #endif
 769
 770   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 771
 772 #ifdef RE_ENABLE_I18N
 773   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 774      this function returns CAPITAL LETTER I instead of first byte of
 775      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 776      since peek_byte_case doesn't advance cur_idx in any way.  */
 777   if (pstr->offsets_needed && !isascii (ch))
 778     return re_string_peek_byte (pstr, idx);
 779 #endif
 780
 781   return ch;
 782 }
 783
 784 static unsigned char
 785 re_string_fetch_byte_case (pstr)
 786      re_string_t *pstr;
 787 {
 788   if (BE (!pstr->mbs_allocated, 1))
 789     return re_string_fetch_byte (pstr);
 790
 791 #ifdef RE_ENABLE_I18N
 792   if (pstr->offsets_needed)
 793     {
 794       int off, ch;
 795
 796       /* For tr_TR.UTF-8 [[:islower:]] there is
 797          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 798          in that case the whole multi-byte character and return
 799          the original letter.  On the other side, with
 800          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 801          anything else would complicate things too much.  */
 802
 803       if (!re_string_first_byte (pstr, pstr->cur_idx))
 804         return re_string_fetch_byte (pstr);
 805
 806       off = pstr->offsets[pstr->cur_idx];
 807       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 808
 809       if (! isascii (ch))
 810         return re_string_fetch_byte (pstr);
 811
 812       re_string_skip_bytes (pstr,
 813                             re_string_char_size_at (pstr, pstr->cur_idx));
 814       return ch;
 815     }
 816 #endif
 817
 818   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 819 }
 820
 821 static void
 822 re_string_destruct (pstr)
 823      re_string_t *pstr;
 824 {
 825 #ifdef RE_ENABLE_I18N
 826   re_free (pstr->wcs);
 827   re_free (pstr->offsets);
 828 #endif /* RE_ENABLE_I18N  */
 829   if (pstr->mbs_allocated)
 830     re_free (pstr->mbs);
 831 }
 832
 833 /* Return the context at IDX in INPUT.  */
 834
 835 static unsigned int
 836 re_string_context_at (input, idx, eflags)
 837      const re_string_t *input;
 838      int idx, eflags;
 839 {
 840   int c;
 841   if (BE (idx < 0, 0))
 842     /* In this case, we use the value stored in input->tip_context,
 843        since we can't know the character in input->mbs[-1] here.  */
 844     return input->tip_context;
 845   if (BE (idx == input->len, 0))
 846     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 847             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 848 #ifdef RE_ENABLE_I18N
 849   if (input->mb_cur_max > 1)
 850     {
 851       wint_t wc;
 852       int wc_idx = idx;
 853       while(input->wcs[wc_idx] == WEOF)
 854         {
 855 #ifdef DEBUG
 856           /* It must not happen.  */
 857           assert (wc_idx >= 0);
 858 #endif
 859           --wc_idx;
 860           if (wc_idx < 0)
 861             return input->tip_context;
 862         }
 863       wc = input->wcs[wc_idx];
 864       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 865         return CONTEXT_WORD;
 866       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 867               ? CONTEXT_NEWLINE : 0);
 868     }
 869   else
 870 #endif
 871     {
 872       c = re_string_byte_at (input, idx);
 873       if (bitset_contain (input->word_char, c))
 874         return CONTEXT_WORD;
 875       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 876     }
 877 }
 878 \f
 879 /* Functions for set operation.  */
 880
 881 static reg_errcode_t
 882 re_node_set_alloc (set, size)
 883      re_node_set *set;
 884      int size;
 885 {
 886   set->alloc = size;
 887   set->nelem = 0;
 888   set->elems = re_malloc (int, size);
 889   if (BE (set->elems == NULL, 0))
 890     return REG_ESPACE;
 891   return REG_NOERROR;
 892 }
 893
 894 static reg_errcode_t
 895 re_node_set_init_1 (set, elem)
 896      re_node_set *set;
 897      int elem;
 898 {
 899   set->alloc = 1;
 900   set->nelem = 1;
 901   set->elems = re_malloc (int, 1);
 902   if (BE (set->elems == NULL, 0))
 903     {
 904       set->alloc = set->nelem = 0;
 905       return REG_ESPACE;
 906     }
 907   set->elems[0] = elem;
 908   return REG_NOERROR;
 909 }
 910
 911 static reg_errcode_t
 912 re_node_set_init_2 (set, elem1, elem2)
 913      re_node_set *set;
 914      int elem1, elem2;
 915 {
 916   set->alloc = 2;
 917   set->elems = re_malloc (int, 2);
 918   if (BE (set->elems == NULL, 0))
 919     return REG_ESPACE;
 920   if (elem1 == elem2)
 921     {
 922       set->nelem = 1;
 923       set->elems[0] = elem1;
 924     }
 925   else
 926     {
 927       set->nelem = 2;
 928       if (elem1 < elem2)
 929         {
 930           set->elems[0] = elem1;
 931           set->elems[1] = elem2;
 932         }
 933       else
 934         {
 935           set->elems[0] = elem2;
 936           set->elems[1] = elem1;
 937         }
 938     }
 939   return REG_NOERROR;
 940 }
 941
 942 static reg_errcode_t
 943 re_node_set_init_copy (dest, src)
 944      re_node_set *dest;
 945      const re_node_set *src;
 946 {
 947   dest->nelem = src->nelem;
 948   if (src->nelem > 0)
 949     {
 950       dest->alloc = dest->nelem;
 951       dest->elems = re_malloc (int, dest->alloc);
 952       if (BE (dest->elems == NULL, 0))
 953         {
 954           dest->alloc = dest->nelem = 0;
 955           return REG_ESPACE;
 956         }
 957       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
 958     }
 959   else
 960     re_node_set_init_empty (dest);
 961   return REG_NOERROR;
 962 }
 963
 964 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
 965    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
 966    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
 967
 968 static reg_errcode_t
 969 re_node_set_add_intersect (dest, src1, src2)
 970      re_node_set *dest;
 971      const re_node_set *src1, *src2;
 972 {
 973   int i1, i2, is, id, delta, sbase;
 974   if (src1->nelem == 0 || src2->nelem == 0)
 975     return REG_NOERROR;
 976
 977   /* We need dest->nelem + 2 * elems_in_intersection; this is a
 978      conservative estimate.  */
 979   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
 980     {
 981       int new_alloc = src1->nelem + src2->nelem + dest->alloc;
 982       int *new_elems = re_realloc (dest->elems, int, new_alloc);
 983       if (BE (new_elems == NULL, 0))
 984         return REG_ESPACE;
 985       dest->elems = new_elems;
 986       dest->alloc = new_alloc;
 987     }
 988
 989   /* Find the items in the intersection of SRC1 and SRC2, and copy
 990      into the top of DEST those that are not already in DEST itself.  */
 991   sbase = dest->nelem + src1->nelem + src2->nelem;
 992   i1 = src1->nelem - 1;
 993   i2 = src2->nelem - 1;
 994   id = dest->nelem - 1;
 995   for (;;)
 996     {
 997       if (src1->elems[i1] == src2->elems[i2])
 998         {
 999           /* Try to find the item in DEST.  Maybe we could binary search?  */
1000           while (id >= 0 && dest->elems[id] > src1->elems[i1])
1001             --id;
1002
1003           if (id < 0 || dest->elems[id] != src1->elems[i1])
1004             dest->elems[--sbase] = src1->elems[i1];
1005
1006           if (--i1 < 0 || --i2 < 0)
1007             break;
1008         }
1009
1010       /* Lower the highest of the two items.  */
1011       else if (src1->elems[i1] < src2->elems[i2])
1012         {
1013           if (--i2 < 0)
1014             break;
1015         }
1016       else
1017         {
1018           if (--i1 < 0)
1019             break;
1020         }
1021     }
1022
1023   id = dest->nelem - 1;
1024   is = dest->nelem + src1->nelem + src2->nelem - 1;
1025   delta = is - sbase + 1;
1026
1027   /* Now copy.  When DELTA becomes zero, the remaining
1028      DEST elements are already in place; this is more or
1029      less the same loop that is in re_node_set_merge.  */
1030   dest->nelem += delta;
1031   if (delta > 0 && id >= 0)
1032     for (;;)
1033       {
1034         if (dest->elems[is] > dest->elems[id])
1035           {
1036             /* Copy from the top.  */
1037             dest->elems[id + delta--] = dest->elems[is--];
1038             if (delta == 0)
1039               break;
1040           }
1041         else
1042           {
1043             /* Slide from the bottom.  */
1044             dest->elems[id + delta] = dest->elems[id];
1045             if (--id < 0)
1046               break;
1047           }
1048       }
1049
1050   /* Copy remaining SRC elements.  */
1051   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1052
1053   return REG_NOERROR;
1054 }
1055
1056 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1057    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1058
1059 static reg_errcode_t
1060 re_node_set_init_union (dest, src1, src2)
1061      re_node_set *dest;
1062      const re_node_set *src1, *src2;
1063 {
1064   int i1, i2, id;
1065   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1066     {
1067       dest->alloc = src1->nelem + src2->nelem;
1068       dest->elems = re_malloc (int, dest->alloc);
1069       if (BE (dest->elems == NULL, 0))
1070         return REG_ESPACE;
1071     }
1072   else
1073     {
1074       if (src1 != NULL && src1->nelem > 0)
1075         return re_node_set_init_copy (dest, src1);
1076       else if (src2 != NULL && src2->nelem > 0)
1077         return re_node_set_init_copy (dest, src2);
1078       else
1079         re_node_set_init_empty (dest);
1080       return REG_NOERROR;
1081     }
1082   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1083     {
1084       if (src1->elems[i1] > src2->elems[i2])
1085         {
1086           dest->elems[id++] = src2->elems[i2++];
1087           continue;
1088         }
1089       if (src1->elems[i1] == src2->elems[i2])
1090         ++i2;
1091       dest->elems[id++] = src1->elems[i1++];
1092     }
1093   if (i1 < src1->nelem)
1094     {
1095       memcpy (dest->elems + id, src1->elems + i1,
1096              (src1->nelem - i1) * sizeof (int));
1097       id += src1->nelem - i1;
1098     }
1099   else if (i2 < src2->nelem)
1100     {
1101       memcpy (dest->elems + id, src2->elems + i2,
1102              (src2->nelem - i2) * sizeof (int));
1103       id += src2->nelem - i2;
1104     }
1105   dest->nelem = id;
1106   return REG_NOERROR;
1107 }
1108
1109 /* Calculate the union set of the sets DEST and SRC. And store it to
1110    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1111
1112 static reg_errcode_t
1113 re_node_set_merge (dest, src)
1114      re_node_set *dest;
1115      const re_node_set *src;
1116 {
1117   int is, id, sbase, delta;
1118   if (src == NULL || src->nelem == 0)
1119     return REG_NOERROR;
1120   if (dest->alloc < 2 * src->nelem + dest->nelem)
1121     {
1122       int new_alloc = 2 * (src->nelem + dest->alloc);
1123       int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1124       if (BE (new_buffer == NULL, 0))
1125         return REG_ESPACE;
1126       dest->elems = new_buffer;
1127       dest->alloc = new_alloc;
1128     }
1129
1130   if (BE (dest->nelem == 0, 0))
1131     {
1132       dest->nelem = src->nelem;
1133       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1134       return REG_NOERROR;
1135     }
1136
1137   /* Copy into the top of DEST the items of SRC that are not
1138      found in DEST.  Maybe we could binary search in DEST?  */
1139   for (sbase = dest->nelem + 2 * src->nelem,
1140        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1141     {
1142       if (dest->elems[id] == src->elems[is])
1143         is--, id--;
1144       else if (dest->elems[id] < src->elems[is])
1145         dest->elems[--sbase] = src->elems[is--];
1146       else /* if (dest->elems[id] > src->elems[is]) */
1147         --id;
1148     }
1149
1150   if (is >= 0)
1151     {
1152       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1153       sbase -= is + 1;
1154       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1155     }
1156
1157   id = dest->nelem - 1;
1158   is = dest->nelem + 2 * src->nelem - 1;
1159   delta = is - sbase + 1;
1160   if (delta == 0)
1161     return REG_NOERROR;
1162
1163   /* Now copy.  When DELTA becomes zero, the remaining
1164      DEST elements are already in place.  */
1165   dest->nelem += delta;
1166   for (;;)
1167     {
1168       if (dest->elems[is] > dest->elems[id])
1169         {
1170           /* Copy from the top.  */
1171           dest->elems[id + delta--] = dest->elems[is--];
1172           if (delta == 0)
1173             break;
1174         }
1175       else
1176         {
1177           /* Slide from the bottom.  */
1178           dest->elems[id + delta] = dest->elems[id];
1179           if (--id < 0)
1180             {
1181               /* Copy remaining SRC elements.  */
1182               memcpy (dest->elems, dest->elems + sbase,
1183                       delta * sizeof (int));
1184               break;
1185             }
1186         }
1187     }
1188
1189   return REG_NOERROR;
1190 }
1191
1192 /* Insert the new element ELEM to the re_node_set* SET.
1193    SET should not already have ELEM.
1194    return -1 if an error is occured, return 1 otherwise.  */
1195
1196 static int
1197 re_node_set_insert (set, elem)
1198      re_node_set *set;
1199      int elem;
1200 {
1201   int idx;
1202   /* In case the set is empty.  */
1203   if (set->alloc == 0)
1204     {
1205       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1206         return 1;
1207       else
1208         return -1;
1209     }
1210
1211   if (BE (set->nelem, 0) == 0)
1212     {
1213       /* We already guaranteed above that set->alloc != 0.  */
1214       set->elems[0] = elem;
1215       ++set->nelem;
1216       return 1;
1217     }
1218
1219   /* Realloc if we need.  */
1220   if (set->alloc == set->nelem)
1221     {
1222       int *new_array;
1223       set->alloc = set->alloc * 2;
1224       new_array = re_realloc (set->elems, int, set->alloc);
1225       if (BE (new_array == NULL, 0))
1226         return -1;
1227       set->elems = new_array;
1228     }
1229
1230   /* Move the elements which follows the new element.  Test the
1231      first element separately to skip a check in the inner loop.  */
1232   if (elem < set->elems[0])
1233     {
1234       idx = 0;
1235       for (idx = set->nelem; idx > 0; idx--)
1236         set->elems[idx] = set->elems[idx - 1];
1237     }
1238   else
1239     {
1240       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1241         set->elems[idx] = set->elems[idx - 1];
1242     }
1243
1244   /* Insert the new element.  */
1245   set->elems[idx] = elem;
1246   ++set->nelem;
1247   return 1;
1248 }
1249
1250 /* Insert the new element ELEM to the re_node_set* SET.
1251    SET should not already have any element greater than or equal to ELEM.
1252    Return -1 if an error is occured, return 1 otherwise.  */
1253
1254 static int
1255 re_node_set_insert_last (set, elem)
1256      re_node_set *set;
1257      int elem;
1258 {
1259   /* Realloc if we need.  */
1260   if (set->alloc == set->nelem)
1261     {
1262       int *new_array;
1263       set->alloc = (set->alloc + 1) * 2;
1264       new_array = re_realloc (set->elems, int, set->alloc);
1265       if (BE (new_array == NULL, 0))
1266         return -1;
1267       set->elems = new_array;
1268     }
1269
1270   /* Insert the new element.  */
1271   set->elems[set->nelem++] = elem;
1272   return 1;
1273 }
1274
1275 /* Compare two node sets SET1 and SET2.
1276    return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1277
1278 static int
1279 re_node_set_compare (set1, set2)
1280      const re_node_set *set1, *set2;
1281 {
1282   int i;
1283   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1284     return 0;
1285   for (i = set1->nelem ; --i >= 0 ; )
1286     if (set1->elems[i] != set2->elems[i])
1287       return 0;
1288   return 1;
1289 }
1290
1291 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1292
1293 static int
1294 re_node_set_contains (set, elem)
1295      const re_node_set *set;
1296      int elem;
1297 {
1298   unsigned int idx, right, mid;
1299   if (set->nelem <= 0)
1300     return 0;
1301
1302   /* Binary search the element.  */
1303   idx = 0;
1304   right = set->nelem - 1;
1305   while (idx < right)
1306     {
1307       mid = (idx + right) / 2;
1308       if (set->elems[mid] < elem)
1309         idx = mid + 1;
1310       else
1311         right = mid;
1312     }
1313   return set->elems[idx] == elem ? idx + 1 : 0;
1314 }
1315
1316 static void
1317 re_node_set_remove_at (set, idx)
1318      re_node_set *set;
1319      int idx;
1320 {
1321   if (idx < 0 || idx >= set->nelem)
1322     return;
1323   --set->nelem;
1324   for (; idx < set->nelem; idx++)
1325     set->elems[idx] = set->elems[idx + 1];
1326 }
1327 \f
1328
1329 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1330    Or return -1, if an error will be occured.  */
1331
1332 static int
1333 re_dfa_add_node (dfa, token, mode)
1334      re_dfa_t *dfa;
1335      re_token_t token;
1336      int mode;
1337 {
1338   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1339     {
1340       int new_nodes_alloc = dfa->nodes_alloc * 2;
1341       re_token_t *new_array = re_realloc (dfa->nodes, re_token_t,
1342                                           new_nodes_alloc);
1343       if (BE (new_array == NULL, 0))
1344         return -1;
1345       dfa->nodes = new_array;
1346       if (mode)
1347         {
1348           int *new_nexts, *new_indices;
1349           re_node_set *new_edests, *new_eclosures, *new_inveclosures;
1350
1351           new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1352           new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1353           new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1354           new_eclosures = re_realloc (dfa->eclosures, re_node_set,
1355                                       new_nodes_alloc);
1356           new_inveclosures = re_realloc (dfa->inveclosures, re_node_set,
1357                                          new_nodes_alloc);
1358           if (BE (new_nexts == NULL || new_indices == NULL
1359                   || new_edests == NULL || new_eclosures == NULL
1360                   || new_inveclosures == NULL, 0))
1361             return -1;
1362           dfa->nexts = new_nexts;
1363           dfa->org_indices = new_indices;
1364           dfa->edests = new_edests;
1365           dfa->eclosures = new_eclosures;
1366           dfa->inveclosures = new_inveclosures;
1367         }
1368       dfa->nodes_alloc = new_nodes_alloc;
1369     }
1370   dfa->nodes[dfa->nodes_len] = token;
1371   dfa->nodes[dfa->nodes_len].opt_subexp = 0;
1372   dfa->nodes[dfa->nodes_len].duplicated = 0;
1373   dfa->nodes[dfa->nodes_len].constraint = 0;
1374   return dfa->nodes_len++;
1375 }
1376
1377 static unsigned int inline
1378 calc_state_hash (nodes, context)
1379      const re_node_set *nodes;
1380      unsigned int context;
1381 {
1382   unsigned int hash = nodes->nelem + context;
1383   int i;
1384   for (i = 0 ; i < nodes->nelem ; i++)
1385     hash += nodes->elems[i];
1386   return hash;
1387 }
1388
1389 /* Search for the state whose node_set is equivalent to NODES.
1390    Return the pointer to the state, if we found it in the DFA.
1391    Otherwise create the new one and return it.  In case of an error
1392    return NULL and set the error code in ERR.
1393    Note: - We assume NULL as the invalid state, then it is possible that
1394            return value is NULL and ERR is REG_NOERROR.
1395          - We never return non-NULL value in case of any errors, it is for
1396            optimization.  */
1397
1398 static re_dfastate_t*
1399 re_acquire_state (err, dfa, nodes)
1400      reg_errcode_t *err;
1401      re_dfa_t *dfa;
1402      const re_node_set *nodes;
1403 {
1404   unsigned int hash;
1405   re_dfastate_t *new_state;
1406   struct re_state_table_entry *spot;
1407   int i;
1408   if (BE (nodes->nelem == 0, 0))
1409     {
1410       *err = REG_NOERROR;
1411       return NULL;
1412     }
1413   hash = calc_state_hash (nodes, 0);
1414   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1415
1416   for (i = 0 ; i < spot->num ; i++)
1417     {
1418       re_dfastate_t *state = spot->array[i];
1419       if (hash != state->hash)
1420         continue;
1421       if (re_node_set_compare (&state->nodes, nodes))
1422         return state;
1423     }
1424
1425   /* There are no appropriate state in the dfa, create the new one.  */
1426   new_state = create_ci_newstate (dfa, nodes, hash);
1427   if (BE (new_state != NULL, 1))
1428     return new_state;
1429   else
1430     {
1431       *err = REG_ESPACE;
1432       return NULL;
1433     }
1434 }
1435
1436 /* Search for the state whose node_set is equivalent to NODES and
1437    whose context is equivalent to CONTEXT.
1438    Return the pointer to the state, if we found it in the DFA.
1439    Otherwise create the new one and return it.  In case of an error
1440    return NULL and set the error code in ERR.
1441    Note: - We assume NULL as the invalid state, then it is possible that
1442            return value is NULL and ERR is REG_NOERROR.
1443          - We never return non-NULL value in case of any errors, it is for
1444            optimization.  */
1445
1446 static re_dfastate_t*
1447 re_acquire_state_context (err, dfa, nodes, context)
1448      reg_errcode_t *err;
1449      re_dfa_t *dfa;
1450      const re_node_set *nodes;
1451      unsigned int context;
1452 {
1453   unsigned int hash;
1454   re_dfastate_t *new_state;
1455   struct re_state_table_entry *spot;
1456   int i;
1457   if (nodes->nelem == 0)
1458     {
1459       *err = REG_NOERROR;
1460       return NULL;
1461     }
1462   hash = calc_state_hash (nodes, context);
1463   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1464
1465   for (i = 0 ; i < spot->num ; i++)
1466     {
1467       re_dfastate_t *state = spot->array[i];
1468       if (state->hash == hash
1469           && state->context == context
1470           && re_node_set_compare (state->entrance_nodes, nodes))
1471         return state;
1472     }
1473   /* There are no appropriate state in `dfa', create the new one.  */
1474   new_state = create_cd_newstate (dfa, nodes, context, hash);
1475   if (BE (new_state != NULL, 1))
1476     return new_state;
1477   else
1478     {
1479       *err = REG_ESPACE;
1480       return NULL;
1481     }
1482 }
1483
1484 /* Finish initialization of the new state NEWSTATE, and using its hash value
1485    HASH put in the appropriate bucket of DFA's state table.  Return value
1486    indicates the error code if failed.  */
1487
1488 static reg_errcode_t
1489 register_state (dfa, newstate, hash)
1490      re_dfa_t *dfa;
1491      re_dfastate_t *newstate;
1492      unsigned int hash;
1493 {
1494   struct re_state_table_entry *spot;
1495   reg_errcode_t err;
1496   int i;
1497
1498   newstate->hash = hash;
1499   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1500   if (BE (err != REG_NOERROR, 0))
1501     return REG_ESPACE;
1502   for (i = 0; i < newstate->nodes.nelem; i++)
1503     {
1504       int elem = newstate->nodes.elems[i];
1505       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1506         re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1507     }
1508
1509   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1510   if (BE (spot->alloc <= spot->num, 0))
1511     {
1512       int new_alloc = 2 * spot->num + 2;
1513       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1514                                               new_alloc);
1515       if (BE (new_array == NULL, 0))
1516         return REG_ESPACE;
1517       spot->array = new_array;
1518       spot->alloc = new_alloc;
1519     }
1520   spot->array[spot->num++] = newstate;
1521   return REG_NOERROR;
1522 }
1523
1524 /* Create the new state which is independ of contexts.
1525    Return the new state if succeeded, otherwise return NULL.  */
1526
1527 static re_dfastate_t *
1528 create_ci_newstate (dfa, nodes, hash)
1529      re_dfa_t *dfa;
1530      const re_node_set *nodes;
1531      unsigned int hash;
1532 {
1533   int i;
1534   reg_errcode_t err;
1535   re_dfastate_t *newstate;
1536
1537   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1538   if (BE (newstate == NULL, 0))
1539     return NULL;
1540   err = re_node_set_init_copy (&newstate->nodes, nodes);
1541   if (BE (err != REG_NOERROR, 0))
1542     {
1543       re_free (newstate);
1544       return NULL;
1545     }
1546
1547   newstate->entrance_nodes = &newstate->nodes;
1548   for (i = 0 ; i < nodes->nelem ; i++)
1549     {
1550       re_token_t *node = dfa->nodes + nodes->elems[i];
1551       re_token_type_t type = node->type;
1552       if (type == CHARACTER && !node->constraint)
1553         continue;
1554
1555       /* If the state has the halt node, the state is a halt state.  */
1556       else if (type == END_OF_RE)
1557         newstate->halt = 1;
1558 #ifdef RE_ENABLE_I18N
1559       else if (type == COMPLEX_BRACKET
1560                || type == OP_UTF8_PERIOD
1561                || (type == OP_PERIOD && dfa->mb_cur_max > 1))
1562         newstate->accept_mb = 1;
1563 #endif /* RE_ENABLE_I18N */
1564       else if (type == OP_BACK_REF)
1565         newstate->has_backref = 1;
1566       else if (type == ANCHOR || node->constraint)
1567         newstate->has_constraint = 1;
1568     }
1569   err = register_state (dfa, newstate, hash);
1570   if (BE (err != REG_NOERROR, 0))
1571     {
1572       free_state (newstate);
1573       newstate = NULL;
1574     }
1575   return newstate;
1576 }
1577
1578 /* Create the new state which is depend on the context CONTEXT.
1579    Return the new state if succeeded, otherwise return NULL.  */
1580
1581 static re_dfastate_t *
1582 create_cd_newstate (dfa, nodes, context, hash)
1583      re_dfa_t *dfa;
1584      const re_node_set *nodes;
1585      unsigned int context, hash;
1586 {
1587   int i, nctx_nodes = 0;
1588   reg_errcode_t err;
1589   re_dfastate_t *newstate;
1590
1591   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1592   if (BE (newstate == NULL, 0))
1593     return NULL;
1594   err = re_node_set_init_copy (&newstate->nodes, nodes);
1595   if (BE (err != REG_NOERROR, 0))
1596     {
1597       re_free (newstate);
1598       return NULL;
1599     }
1600
1601   newstate->context = context;
1602   newstate->entrance_nodes = &newstate->nodes;
1603
1604   for (i = 0 ; i < nodes->nelem ; i++)
1605     {
1606       unsigned int constraint = 0;
1607       re_token_t *node = dfa->nodes + nodes->elems[i];
1608       re_token_type_t type = node->type;
1609       if (node->constraint)
1610         constraint = node->constraint;
1611
1612       if (type == CHARACTER && !constraint)
1613         continue;
1614       /* If the state has the halt node, the state is a halt state.  */
1615       else if (type == END_OF_RE)
1616         newstate->halt = 1;
1617 #ifdef RE_ENABLE_I18N
1618       else if (type == COMPLEX_BRACKET
1619                || type == OP_UTF8_PERIOD
1620                || (type == OP_PERIOD && dfa->mb_cur_max > 1))
1621         newstate->accept_mb = 1;
1622 #endif /* RE_ENABLE_I18N */
1623       else if (type == OP_BACK_REF)
1624         newstate->has_backref = 1;
1625       else if (type == ANCHOR)
1626         constraint = node->opr.ctx_type;
1627
1628       if (constraint)
1629         {
1630           if (newstate->entrance_nodes == &newstate->nodes)
1631             {
1632               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1633               if (BE (newstate->entrance_nodes == NULL, 0))
1634                 {
1635                   free_state (newstate);
1636                   return NULL;
1637                 }
1638               re_node_set_init_copy (newstate->entrance_nodes, nodes);
1639               nctx_nodes = 0;
1640               newstate->has_constraint = 1;
1641             }
1642
1643           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1644             {
1645               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1646               ++nctx_nodes;
1647             }
1648         }
1649     }
1650   err = register_state (dfa, newstate, hash);
1651   if (BE (err != REG_NOERROR, 0))
1652     {
1653       free_state (newstate);
1654       newstate = NULL;
1655     }
1656   return  newstate;
1657 }
1658
1659 static void
1660 free_state (state)
1661      re_dfastate_t *state;
1662 {
1663   re_node_set_free (&state->non_eps_nodes);
1664   re_node_set_free (&state->inveclosure);
1665   if (state->entrance_nodes != &state->nodes)
1666     {
1667       re_node_set_free (state->entrance_nodes);
1668       re_free (state->entrance_nodes);
1669     }
1670   re_node_set_free (&state->nodes);
1671   re_free (state->trtable);
1672   re_free (state);
1673 }