posix/regex_internal.c

   1 /* Extended regular expression matching and search library.
   2    Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4    Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
   5
   6    The GNU C Library is free software; you can redistribute it and/or
   7    modify it under the terms of the GNU Lesser General Public
   8    License as published by the Free Software Foundation; either
   9    version 2.1 of the License, or (at your option) any later version.
  10
  11    The GNU C Library is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14    Lesser General Public License for more details.
  15
  16    You should have received a copy of the GNU Lesser General Public
  17    License along with the GNU C Library; if not, write to the Free
  18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  19    02111-1307 USA.  */
  20
  21 static void re_string_construct_common (const char *str, int len,
  22                                         re_string_t *pstr,
  23                                         RE_TRANSLATE_TYPE trans, int icase,
  24                                         const re_dfa_t *dfa) internal_function;
  25 #ifdef RE_ENABLE_I18N
  26 static int re_string_skip_chars (re_string_t *pstr, int new_raw_idx,
  27                                  wint_t *last_wc) internal_function;
  28 #endif /* RE_ENABLE_I18N */
  29 static reg_errcode_t register_state (re_dfa_t *dfa, re_dfastate_t *newstate,
  30                                      unsigned int hash) internal_function;
  31 static re_dfastate_t *create_ci_newstate (re_dfa_t *dfa,
  32                                           const re_node_set *nodes,
  33                                           unsigned int hash) internal_function;
  34 static re_dfastate_t *create_cd_newstate (re_dfa_t *dfa,
  35                                           const re_node_set *nodes,
  36                                           unsigned int context,
  37                                           unsigned int hash) internal_function;
  38 static unsigned int inline calc_state_hash (const re_node_set *nodes,
  39                                             unsigned int context) internal_function;
  40 \f
  41 /* Functions for string operation.  */
  42
  43 /* This function allocate the buffers.  It is necessary to call
  44    re_string_reconstruct before using the object.  */
  45
  46 static reg_errcode_t
  47 re_string_allocate (pstr, str, len, init_len, trans, icase, dfa)
  48      re_string_t *pstr;
  49      const char *str;
  50      int len, init_len, icase;
  51      RE_TRANSLATE_TYPE trans;
  52      const re_dfa_t *dfa;
  53 {
  54   reg_errcode_t ret;
  55   int init_buf_len;
  56
  57   /* Ensure at least one character fits into the buffers.  */
  58   if (init_len < dfa->mb_cur_max)
  59     init_len = dfa->mb_cur_max;
  60   init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  61   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  62
  63   ret = re_string_realloc_buffers (pstr, init_buf_len);
  64   if (BE (ret != REG_NOERROR, 0))
  65     return ret;
  66
  67   pstr->word_char = dfa->word_char;
  68   pstr->word_ops_used = dfa->word_ops_used;
  69   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  70   pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  71   pstr->valid_raw_len = pstr->valid_len;
  72   return REG_NOERROR;
  73 }
  74
  75 /* This function allocate the buffers, and initialize them.  */
  76
  77 static reg_errcode_t
  78 re_string_construct (pstr, str, len, trans, icase, dfa)
  79      re_string_t *pstr;
  80      const char *str;
  81      int len, icase;
  82      RE_TRANSLATE_TYPE trans;
  83      const re_dfa_t *dfa;
  84 {
  85   reg_errcode_t ret;
  86   memset (pstr, '\0', sizeof (re_string_t));
  87   re_string_construct_common (str, len, pstr, trans, icase, dfa);
  88
  89   if (len > 0)
  90     {
  91       ret = re_string_realloc_buffers (pstr, len + 1);
  92       if (BE (ret != REG_NOERROR, 0))
  93         return ret;
  94     }
  95   pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  96
  97   if (icase)
  98     {
  99 #ifdef RE_ENABLE_I18N
 100       if (dfa->mb_cur_max > 1)
 101         {
 102           while (1)
 103             {
 104               ret = build_wcs_upper_buffer (pstr);
 105               if (BE (ret != REG_NOERROR, 0))
 106                 return ret;
 107               if (pstr->valid_raw_len >= len)
 108                 break;
 109               if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
 110                 break;
 111               ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
 112               if (BE (ret != REG_NOERROR, 0))
 113                 return ret;
 114             }
 115         }
 116       else
 117 #endif /* RE_ENABLE_I18N  */
 118         build_upper_buffer (pstr);
 119     }
 120   else
 121     {
 122 #ifdef RE_ENABLE_I18N
 123       if (dfa->mb_cur_max > 1)
 124         build_wcs_buffer (pstr);
 125       else
 126 #endif /* RE_ENABLE_I18N  */
 127         {
 128           if (trans != NULL)
 129             re_string_translate_buffer (pstr);
 130           else
 131             {
 132               pstr->valid_len = pstr->bufs_len;
 133               pstr->valid_raw_len = pstr->bufs_len;
 134             }
 135         }
 136     }
 137
 138   return REG_NOERROR;
 139 }
 140
 141 /* Helper functions for re_string_allocate, and re_string_construct.  */
 142
 143 static reg_errcode_t
 144 re_string_realloc_buffers (pstr, new_buf_len)
 145      re_string_t *pstr;
 146      int new_buf_len;
 147 {
 148 #ifdef RE_ENABLE_I18N
 149   if (pstr->mb_cur_max > 1)
 150     {
 151       wint_t *new_array = re_realloc (pstr->wcs, wint_t, new_buf_len);
 152       if (BE (new_array == NULL, 0))
 153         return REG_ESPACE;
 154       pstr->wcs = new_array;
 155       if (pstr->offsets != NULL)
 156         {
 157           int *new_array = re_realloc (pstr->offsets, int, new_buf_len);
 158           if (BE (new_array == NULL, 0))
 159             return REG_ESPACE;
 160           pstr->offsets = new_array;
 161         }
 162     }
 163 #endif /* RE_ENABLE_I18N  */
 164   if (pstr->mbs_allocated)
 165     {
 166       unsigned char *new_array = re_realloc (pstr->mbs, unsigned char,
 167                                              new_buf_len);
 168       if (BE (new_array == NULL, 0))
 169         return REG_ESPACE;
 170       pstr->mbs = new_array;
 171     }
 172   pstr->bufs_len = new_buf_len;
 173   return REG_NOERROR;
 174 }
 175
 176
 177 static void
 178 re_string_construct_common (str, len, pstr, trans, icase, dfa)
 179      const char *str;
 180      int len;
 181      re_string_t *pstr;
 182      RE_TRANSLATE_TYPE trans;
 183      int icase;
 184      const re_dfa_t *dfa;
 185 {
 186   pstr->raw_mbs = (const unsigned char *) str;
 187   pstr->len = len;
 188   pstr->raw_len = len;
 189   pstr->trans = (unsigned RE_TRANSLATE_TYPE) trans;
 190   pstr->icase = icase ? 1 : 0;
 191   pstr->mbs_allocated = (trans != NULL || icase);
 192   pstr->mb_cur_max = dfa->mb_cur_max;
 193   pstr->is_utf8 = dfa->is_utf8;
 194   pstr->map_notascii = dfa->map_notascii;
 195   pstr->stop = pstr->len;
 196   pstr->raw_stop = pstr->stop;
 197 }
 198
 199 #ifdef RE_ENABLE_I18N
 200
 201 /* Build wide character buffer PSTR->WCS.
 202    If the byte sequence of the string are:
 203      <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
 204    Then wide character buffer will be:
 205      <wc1>   , WEOF    , <wc2>   , WEOF    , <wc3>
 206    We use WEOF for padding, they indicate that the position isn't
 207    a first byte of a multibyte character.
 208
 209    Note that this function assumes PSTR->VALID_LEN elements are already
 210    built and starts from PSTR->VALID_LEN.  */
 211
 212 static void
 213 build_wcs_buffer (pstr)
 214      re_string_t *pstr;
 215 {
 216 #ifdef _LIBC
 217   unsigned char buf[MB_LEN_MAX];
 218   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 219 #else
 220   unsigned char buf[64];
 221 #endif
 222   mbstate_t prev_st;
 223   int byte_idx, end_idx, remain_len;
 224   size_t mbclen;
 225
 226   /* Build the buffers from pstr->valid_len to either pstr->len or
 227      pstr->bufs_len.  */
 228   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 229   for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
 230     {
 231       wchar_t wc;
 232       const char *p;
 233
 234       remain_len = end_idx - byte_idx;
 235       prev_st = pstr->cur_state;
 236       /* Apply the translation if we need.  */
 237       if (BE (pstr->trans != NULL, 0))
 238         {
 239           int i, ch;
 240
 241           for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 242             {
 243               ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
 244               buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
 245             }
 246           p = (const char *) buf;
 247         }
 248       else
 249         p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
 250       mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 251       if (BE (mbclen == (size_t) -2, 0))
 252         {
 253           /* The buffer doesn't have enough space, finish to build.  */
 254           pstr->cur_state = prev_st;
 255           break;
 256         }
 257       else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
 258         {
 259           /* We treat these cases as a singlebyte character.  */
 260           mbclen = 1;
 261           wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 262           if (BE (pstr->trans != NULL, 0))
 263             wc = pstr->trans[wc];
 264           pstr->cur_state = prev_st;
 265         }
 266
 267       /* Write wide character and padding.  */
 268       pstr->wcs[byte_idx++] = wc;
 269       /* Write paddings.  */
 270       for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 271         pstr->wcs[byte_idx++] = WEOF;
 272     }
 273   pstr->valid_len = byte_idx;
 274   pstr->valid_raw_len = byte_idx;
 275 }
 276
 277 /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
 278    but for REG_ICASE.  */
 279
 280 static int
 281 build_wcs_upper_buffer (pstr)
 282      re_string_t *pstr;
 283 {
 284   mbstate_t prev_st;
 285   int src_idx, byte_idx, end_idx, remain_len;
 286   size_t mbclen;
 287 #ifdef _LIBC
 288   char buf[MB_LEN_MAX];
 289   assert (MB_LEN_MAX >= pstr->mb_cur_max);
 290 #else
 291   char buf[64];
 292 #endif
 293
 294   byte_idx = pstr->valid_len;
 295   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 296
 297   /* The following optimization assumes that ASCII characters can be
 298      mapped to wide characters with a simple cast.  */
 299   if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
 300     {
 301       while (byte_idx < end_idx)
 302         {
 303           wchar_t wc;
 304
 305           if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
 306               && mbsinit (&pstr->cur_state))
 307             {
 308               /* In case of a singlebyte character.  */
 309               pstr->mbs[byte_idx]
 310                 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
 311               /* The next step uses the assumption that wchar_t is encoded
 312                  ASCII-safe: all ASCII values can be converted like this.  */
 313               pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
 314               ++byte_idx;
 315               continue;
 316             }
 317
 318           remain_len = end_idx - byte_idx;
 319           prev_st = pstr->cur_state;
 320           mbclen = mbrtowc (&wc,
 321                             ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
 322                              + byte_idx), remain_len, &pstr->cur_state);
 323           if (BE (mbclen + 2 > 2, 1))
 324             {
 325               wchar_t wcu = wc;
 326               if (iswlower (wc))
 327                 {
 328                   size_t mbcdlen;
 329
 330                   wcu = towupper (wc);
 331                   mbcdlen = wcrtomb (buf, wcu, &prev_st);
 332                   if (BE (mbclen == mbcdlen, 1))
 333                     memcpy (pstr->mbs + byte_idx, buf, mbclen);
 334                   else
 335                     {
 336                       src_idx = byte_idx;
 337                       goto offsets_needed;
 338                     }
 339                 }
 340               else
 341                 memcpy (pstr->mbs + byte_idx,
 342                         pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
 343               pstr->wcs[byte_idx++] = wcu;
 344               /* Write paddings.  */
 345               for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 346                 pstr->wcs[byte_idx++] = WEOF;
 347             }
 348           else if (mbclen == (size_t) -1 || mbclen == 0)
 349             {
 350               /* It is an invalid character or '\0'.  Just use the byte.  */
 351               int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
 352               pstr->mbs[byte_idx] = ch;
 353               /* And also cast it to wide char.  */
 354               pstr->wcs[byte_idx++] = (wchar_t) ch;
 355               if (BE (mbclen == (size_t) -1, 0))
 356                 pstr->cur_state = prev_st;
 357             }
 358           else
 359             {
 360               /* The buffer doesn't have enough space, finish to build.  */
 361               pstr->cur_state = prev_st;
 362               break;
 363             }
 364         }
 365       pstr->valid_len = byte_idx;
 366       pstr->valid_raw_len = byte_idx;
 367       return REG_NOERROR;
 368     }
 369   else
 370     for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
 371       {
 372         wchar_t wc;
 373         const char *p;
 374       offsets_needed:
 375         remain_len = end_idx - byte_idx;
 376         prev_st = pstr->cur_state;
 377         if (BE (pstr->trans != NULL, 0))
 378           {
 379             int i, ch;
 380
 381             for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
 382               {
 383                 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
 384                 buf[i] = pstr->trans[ch];
 385               }
 386             p = (const char *) buf;
 387           }
 388         else
 389           p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
 390         mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
 391         if (BE (mbclen + 2 > 2, 1))
 392           {
 393             wchar_t wcu = wc;
 394             if (iswlower (wc))
 395               {
 396                 size_t mbcdlen;
 397
 398                 wcu = towupper (wc);
 399                 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
 400                 if (BE (mbclen == mbcdlen, 1))
 401                   memcpy (pstr->mbs + byte_idx, buf, mbclen);
 402                 else if (mbcdlen != (size_t) -1)
 403                   {
 404                     size_t i;
 405
 406                     if (byte_idx + mbcdlen > pstr->bufs_len)
 407                       {
 408                         pstr->cur_state = prev_st;
 409                         break;
 410                       }
 411
 412                     if (pstr->offsets == NULL)
 413                       {
 414                         pstr->offsets = re_malloc (int, pstr->bufs_len);
 415
 416                         if (pstr->offsets == NULL)
 417                           return REG_ESPACE;
 418                       }
 419                     if (!pstr->offsets_needed)
 420                       {
 421                         for (i = 0; i < (size_t) byte_idx; ++i)
 422                           pstr->offsets[i] = i;
 423                         pstr->offsets_needed = 1;
 424                       }
 425
 426                     memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
 427                     pstr->wcs[byte_idx] = wcu;
 428                     pstr->offsets[byte_idx] = src_idx;
 429                     for (i = 1; i < mbcdlen; ++i)
 430                       {
 431                         pstr->offsets[byte_idx + i]
 432                           = src_idx + (i < mbclen ? i : mbclen - 1);
 433                         pstr->wcs[byte_idx + i] = WEOF;
 434                       }
 435                     pstr->len += mbcdlen - mbclen;
 436                     if (pstr->raw_stop > src_idx)
 437                       pstr->stop += mbcdlen - mbclen;
 438                     end_idx = (pstr->bufs_len > pstr->len)
 439                               ? pstr->len : pstr->bufs_len;
 440                     byte_idx += mbcdlen;
 441                     src_idx += mbclen;
 442                     continue;
 443                   }
 444                 else
 445                   memcpy (pstr->mbs + byte_idx, p, mbclen);
 446               }
 447             else
 448               memcpy (pstr->mbs + byte_idx, p, mbclen);
 449
 450             if (BE (pstr->offsets_needed != 0, 0))
 451               {
 452                 size_t i;
 453                 for (i = 0; i < mbclen; ++i)
 454                   pstr->offsets[byte_idx + i] = src_idx + i;
 455               }
 456             src_idx += mbclen;
 457
 458             pstr->wcs[byte_idx++] = wcu;
 459             /* Write paddings.  */
 460             for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
 461               pstr->wcs[byte_idx++] = WEOF;
 462           }
 463         else if (mbclen == (size_t) -1 || mbclen == 0)
 464           {
 465             /* It is an invalid character or '\0'.  Just use the byte.  */
 466             int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
 467
 468             if (BE (pstr->trans != NULL, 0))
 469               ch = pstr->trans [ch];
 470             pstr->mbs[byte_idx] = ch;
 471
 472             if (BE (pstr->offsets_needed != 0, 0))
 473               pstr->offsets[byte_idx] = src_idx;
 474             ++src_idx;
 475
 476             /* And also cast it to wide char.  */
 477             pstr->wcs[byte_idx++] = (wchar_t) ch;
 478             if (BE (mbclen == (size_t) -1, 0))
 479               pstr->cur_state = prev_st;
 480           }
 481         else
 482           {
 483             /* The buffer doesn't have enough space, finish to build.  */
 484             pstr->cur_state = prev_st;
 485             break;
 486           }
 487       }
 488   pstr->valid_len = byte_idx;
 489   pstr->valid_raw_len = src_idx;
 490   return REG_NOERROR;
 491 }
 492
 493 /* Skip characters until the index becomes greater than NEW_RAW_IDX.
 494    Return the index.  */
 495
 496 static int
 497 re_string_skip_chars (pstr, new_raw_idx, last_wc)
 498      re_string_t *pstr;
 499      int new_raw_idx;
 500      wint_t *last_wc;
 501 {
 502   mbstate_t prev_st;
 503   int rawbuf_idx;
 504   size_t mbclen;
 505   wchar_t wc = 0;
 506
 507   /* Skip the characters which are not necessary to check.  */
 508   for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
 509        rawbuf_idx < new_raw_idx;)
 510     {
 511       int remain_len;
 512       remain_len = pstr->len - rawbuf_idx;
 513       prev_st = pstr->cur_state;
 514       mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
 515                         remain_len, &pstr->cur_state);
 516       if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
 517         {
 518           /* We treat these cases as a singlebyte character.  */
 519           mbclen = 1;
 520           pstr->cur_state = prev_st;
 521         }
 522       /* Then proceed the next character.  */
 523       rawbuf_idx += mbclen;
 524     }
 525   *last_wc = (wint_t) wc;
 526   return rawbuf_idx;
 527 }
 528 #endif /* RE_ENABLE_I18N  */
 529
 530 /* Build the buffer PSTR->MBS, and apply the translation if we need.
 531    This function is used in case of REG_ICASE.  */
 532
 533 static void
 534 build_upper_buffer (pstr)
 535      re_string_t *pstr;
 536 {
 537   int char_idx, end_idx;
 538   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 539
 540   for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
 541     {
 542       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
 543       if (BE (pstr->trans != NULL, 0))
 544         ch = pstr->trans[ch];
 545       if (islower (ch))
 546         pstr->mbs[char_idx] = toupper (ch);
 547       else
 548         pstr->mbs[char_idx] = ch;
 549     }
 550   pstr->valid_len = char_idx;
 551   pstr->valid_raw_len = char_idx;
 552 }
 553
 554 /* Apply TRANS to the buffer in PSTR.  */
 555
 556 static void
 557 re_string_translate_buffer (pstr)
 558      re_string_t *pstr;
 559 {
 560   int buf_idx, end_idx;
 561   end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
 562
 563   for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
 564     {
 565       int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
 566       pstr->mbs[buf_idx] = pstr->trans[ch];
 567     }
 568
 569   pstr->valid_len = buf_idx;
 570   pstr->valid_raw_len = buf_idx;
 571 }
 572
 573 /* This function re-construct the buffers.
 574    Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
 575    convert to upper case in case of REG_ICASE, apply translation.  */
 576
 577 static reg_errcode_t
 578 re_string_reconstruct (pstr, idx, eflags)
 579      re_string_t *pstr;
 580      int idx, eflags;
 581 {
 582   int offset = idx - pstr->raw_mbs_idx;
 583   if (BE (offset < 0, 0))
 584     {
 585       /* Reset buffer.  */
 586 #ifdef RE_ENABLE_I18N
 587       if (pstr->mb_cur_max > 1)
 588         memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
 589 #endif /* RE_ENABLE_I18N */
 590       pstr->len = pstr->raw_len;
 591       pstr->stop = pstr->raw_stop;
 592       pstr->valid_len = 0;
 593       pstr->raw_mbs_idx = 0;
 594       pstr->valid_raw_len = 0;
 595       pstr->offsets_needed = 0;
 596       pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
 597                            : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
 598       if (!pstr->mbs_allocated)
 599         pstr->mbs = (unsigned char *) pstr->raw_mbs;
 600       offset = idx;
 601     }
 602
 603   if (BE (offset != 0, 1))
 604     {
 605       /* Are the characters which are already checked remain?  */
 606       if (BE (offset < pstr->valid_raw_len, 1)
 607 #ifdef RE_ENABLE_I18N
 608           /* Handling this would enlarge the code too much.
 609              Accept a slowdown in that case.  */
 610           && pstr->offsets_needed == 0
 611 #endif
 612          )
 613         {
 614           /* Yes, move them to the front of the buffer.  */
 615           pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
 616 #ifdef RE_ENABLE_I18N
 617           if (pstr->mb_cur_max > 1)
 618             memmove (pstr->wcs, pstr->wcs + offset,
 619                      (pstr->valid_len - offset) * sizeof (wint_t));
 620 #endif /* RE_ENABLE_I18N */
 621           if (BE (pstr->mbs_allocated, 0))
 622             memmove (pstr->mbs, pstr->mbs + offset,
 623                      pstr->valid_len - offset);
 624           pstr->valid_len -= offset;
 625           pstr->valid_raw_len -= offset;
 626 #if DEBUG
 627           assert (pstr->valid_len > 0);
 628 #endif
 629         }
 630       else
 631         {
 632           /* No, skip all characters until IDX.  */
 633 #ifdef RE_ENABLE_I18N
 634           if (BE (pstr->offsets_needed, 0))
 635             {
 636               pstr->len = pstr->raw_len - idx + offset;
 637               pstr->stop = pstr->raw_stop - idx + offset;
 638               pstr->offsets_needed = 0;
 639             }
 640 #endif
 641           pstr->valid_len = 0;
 642           pstr->valid_raw_len = 0;
 643 #ifdef RE_ENABLE_I18N
 644           if (pstr->mb_cur_max > 1)
 645             {
 646               int wcs_idx;
 647               wint_t wc = WEOF;
 648
 649               if (pstr->is_utf8)
 650                 {
 651                   const unsigned char *raw, *p, *q, *end;
 652
 653                   /* Special case UTF-8.  Multi-byte chars start with any
 654                      byte other than 0x80 - 0xbf.  */
 655                   raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 656                   end = raw + (offset - pstr->mb_cur_max);
 657                   for (p = raw + offset - 1; p >= end; --p)
 658                     if ((*p & 0xc0) != 0x80)
 659                       {
 660                         mbstate_t cur_state;
 661                         wchar_t wc2;
 662                         int mlen = raw + pstr->len - p;
 663                         unsigned char buf[6];
 664
 665                         q = p;
 666                         if (BE (pstr->trans != NULL, 0))
 667                           {
 668                             int i = mlen < 6 ? mlen : 6;
 669                             while (--i >= 0)
 670                               buf[i] = pstr->trans[p[i]];
 671                             q = buf;
 672                           }
 673                         /* XXX Don't use mbrtowc, we know which conversion
 674                            to use (UTF-8 -> UCS4).  */
 675                         memset (&cur_state, 0, sizeof (cur_state));
 676                         mlen = (mbrtowc (&wc2, (const char *) p, mlen,
 677                                          &cur_state)
 678                                 - (raw + offset - p));
 679                         if (mlen >= 0)
 680                           {
 681                             memset (&pstr->cur_state, '\0',
 682                                     sizeof (mbstate_t));
 683                             pstr->valid_len = mlen;
 684                             wc = wc2;
 685                           }
 686                         break;
 687                       }
 688                 }
 689
 690               if (wc == WEOF)
 691                 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 692               if (BE (pstr->valid_len, 0))
 693                 {
 694                   for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 695                     pstr->wcs[wcs_idx] = WEOF;
 696                   if (pstr->mbs_allocated)
 697                     memset (pstr->mbs, 255, pstr->valid_len);
 698                 }
 699               pstr->valid_raw_len = pstr->valid_len;
 700               pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 701                                     && IS_WIDE_WORD_CHAR (wc))
 702                                    ? CONTEXT_WORD
 703                                    : ((IS_WIDE_NEWLINE (wc)
 704                                        && pstr->newline_anchor)
 705                                       ? CONTEXT_NEWLINE : 0));
 706             }
 707           else
 708 #endif /* RE_ENABLE_I18N */
 709             {
 710               int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
 711               if (pstr->trans)
 712                 c = pstr->trans[c];
 713               pstr->tip_context = (bitset_contain (pstr->word_char, c)
 714                                    ? CONTEXT_WORD
 715                                    : ((IS_NEWLINE (c) && pstr->newline_anchor)
 716                                       ? CONTEXT_NEWLINE : 0));
 717             }
 718         }
 719       if (!BE (pstr->mbs_allocated, 0))
 720         pstr->mbs += offset;
 721     }
 722   pstr->raw_mbs_idx = idx;
 723   pstr->len -= offset;
 724   pstr->stop -= offset;
 725
 726   /* Then build the buffers.  */
 727 #ifdef RE_ENABLE_I18N
 728   if (pstr->mb_cur_max > 1)
 729     {
 730       if (pstr->icase)
 731         {
 732           int ret = build_wcs_upper_buffer (pstr);
 733           if (BE (ret != REG_NOERROR, 0))
 734             return ret;
 735         }
 736       else
 737         build_wcs_buffer (pstr);
 738     }
 739   else
 740 #endif /* RE_ENABLE_I18N */
 741   if (BE (pstr->mbs_allocated, 0))
 742     {
 743       if (pstr->icase)
 744         build_upper_buffer (pstr);
 745       else if (pstr->trans != NULL)
 746         re_string_translate_buffer (pstr);
 747     }
 748   else
 749     pstr->valid_len = pstr->len;
 750
 751   pstr->cur_idx = 0;
 752   return REG_NOERROR;
 753 }
 754
 755 static unsigned char
 756 re_string_peek_byte_case (pstr, idx)
 757      const re_string_t *pstr;
 758      int idx;
 759 {
 760   int ch, off;
 761
 762   /* Handle the common (easiest) cases first.  */
 763   if (BE (!pstr->mbs_allocated, 1))
 764     return re_string_peek_byte (pstr, idx);
 765
 766 #ifdef RE_ENABLE_I18N
 767   if (pstr->mb_cur_max > 1
 768       && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
 769     return re_string_peek_byte (pstr, idx);
 770 #endif
 771
 772   off = pstr->cur_idx + idx;
 773 #ifdef RE_ENABLE_I18N
 774   if (pstr->offsets_needed)
 775     off = pstr->offsets[off];
 776 #endif
 777
 778   ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 779
 780 #ifdef RE_ENABLE_I18N
 781   /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
 782      this function returns CAPITAL LETTER I instead of first byte of
 783      DOTLESS SMALL LETTER I.  The latter would confuse the parser,
 784      since peek_byte_case doesn't advance cur_idx in any way.  */
 785   if (pstr->offsets_needed && !isascii (ch))
 786     return re_string_peek_byte (pstr, idx);
 787 #endif
 788
 789   return ch;
 790 }
 791
 792 static unsigned char
 793 re_string_fetch_byte_case (pstr)
 794      re_string_t *pstr;
 795 {
 796   if (BE (!pstr->mbs_allocated, 1))
 797     return re_string_fetch_byte (pstr);
 798
 799 #ifdef RE_ENABLE_I18N
 800   if (pstr->offsets_needed)
 801     {
 802       int off, ch;
 803
 804       /* For tr_TR.UTF-8 [[:islower:]] there is
 805          [[: CAPITAL LETTER I WITH DOT lower:]] in mbs.  Skip
 806          in that case the whole multi-byte character and return
 807          the original letter.  On the other side, with
 808          [[: DOTLESS SMALL LETTER I return [[:I, as doing
 809          anything else would complicate things too much.  */
 810
 811       if (!re_string_first_byte (pstr, pstr->cur_idx))
 812         return re_string_fetch_byte (pstr);
 813
 814       off = pstr->offsets[pstr->cur_idx];
 815       ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
 816
 817       if (! isascii (ch))
 818         return re_string_fetch_byte (pstr);
 819
 820       re_string_skip_bytes (pstr,
 821                             re_string_char_size_at (pstr, pstr->cur_idx));
 822       return ch;
 823     }
 824 #endif
 825
 826   return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
 827 }
 828
 829 static void
 830 re_string_destruct (pstr)
 831      re_string_t *pstr;
 832 {
 833 #ifdef RE_ENABLE_I18N
 834   re_free (pstr->wcs);
 835   re_free (pstr->offsets);
 836 #endif /* RE_ENABLE_I18N  */
 837   if (pstr->mbs_allocated)
 838     re_free (pstr->mbs);
 839 }
 840
 841 /* Return the context at IDX in INPUT.  */
 842
 843 static unsigned int
 844 re_string_context_at (input, idx, eflags)
 845      const re_string_t *input;
 846      int idx, eflags;
 847 {
 848   int c;
 849   if (BE (idx < 0, 0))
 850     /* In this case, we use the value stored in input->tip_context,
 851        since we can't know the character in input->mbs[-1] here.  */
 852     return input->tip_context;
 853   if (BE (idx == input->len, 0))
 854     return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
 855             : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
 856 #ifdef RE_ENABLE_I18N
 857   if (input->mb_cur_max > 1)
 858     {
 859       wint_t wc;
 860       int wc_idx = idx;
 861       while(input->wcs[wc_idx] == WEOF)
 862         {
 863 #ifdef DEBUG
 864           /* It must not happen.  */
 865           assert (wc_idx >= 0);
 866 #endif
 867           --wc_idx;
 868           if (wc_idx < 0)
 869             return input->tip_context;
 870         }
 871       wc = input->wcs[wc_idx];
 872       if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
 873         return CONTEXT_WORD;
 874       return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
 875               ? CONTEXT_NEWLINE : 0);
 876     }
 877   else
 878 #endif
 879     {
 880       c = re_string_byte_at (input, idx);
 881       if (bitset_contain (input->word_char, c))
 882         return CONTEXT_WORD;
 883       return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
 884     }
 885 }
 886 \f
 887 /* Functions for set operation.  */
 888
 889 static reg_errcode_t
 890 re_node_set_alloc (set, size)
 891      re_node_set *set;
 892      int size;
 893 {
 894   set->alloc = size;
 895   set->nelem = 0;
 896   set->elems = re_malloc (int, size);
 897   if (BE (set->elems == NULL, 0))
 898     return REG_ESPACE;
 899   return REG_NOERROR;
 900 }
 901
 902 static reg_errcode_t
 903 re_node_set_init_1 (set, elem)
 904      re_node_set *set;
 905      int elem;
 906 {
 907   set->alloc = 1;
 908   set->nelem = 1;
 909   set->elems = re_malloc (int, 1);
 910   if (BE (set->elems == NULL, 0))
 911     {
 912       set->alloc = set->nelem = 0;
 913       return REG_ESPACE;
 914     }
 915   set->elems[0] = elem;
 916   return REG_NOERROR;
 917 }
 918
 919 static reg_errcode_t
 920 re_node_set_init_2 (set, elem1, elem2)
 921      re_node_set *set;
 922      int elem1, elem2;
 923 {
 924   set->alloc = 2;
 925   set->elems = re_malloc (int, 2);
 926   if (BE (set->elems == NULL, 0))
 927     return REG_ESPACE;
 928   if (elem1 == elem2)
 929     {
 930       set->nelem = 1;
 931       set->elems[0] = elem1;
 932     }
 933   else
 934     {
 935       set->nelem = 2;
 936       if (elem1 < elem2)
 937         {
 938           set->elems[0] = elem1;
 939           set->elems[1] = elem2;
 940         }
 941       else
 942         {
 943           set->elems[0] = elem2;
 944           set->elems[1] = elem1;
 945         }
 946     }
 947   return REG_NOERROR;
 948 }
 949
 950 static reg_errcode_t
 951 re_node_set_init_copy (dest, src)
 952      re_node_set *dest;
 953      const re_node_set *src;
 954 {
 955   dest->nelem = src->nelem;
 956   if (src->nelem > 0)
 957     {
 958       dest->alloc = dest->nelem;
 959       dest->elems = re_malloc (int, dest->alloc);
 960       if (BE (dest->elems == NULL, 0))
 961         {
 962           dest->alloc = dest->nelem = 0;
 963           return REG_ESPACE;
 964         }
 965       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
 966     }
 967   else
 968     re_node_set_init_empty (dest);
 969   return REG_NOERROR;
 970 }
 971
 972 /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
 973    DEST. Return value indicate the error code or REG_NOERROR if succeeded.
 974    Note: We assume dest->elems is NULL, when dest->alloc is 0.  */
 975
 976 static reg_errcode_t
 977 re_node_set_add_intersect (dest, src1, src2)
 978      re_node_set *dest;
 979      const re_node_set *src1, *src2;
 980 {
 981   int i1, i2, is, id, delta, sbase;
 982   if (src1->nelem == 0 || src2->nelem == 0)
 983     return REG_NOERROR;
 984
 985   /* We need dest->nelem + 2 * elems_in_intersection; this is a
 986      conservative estimate.  */
 987   if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
 988     {
 989       int new_alloc = src1->nelem + src2->nelem + dest->alloc;
 990       int *new_elems = re_realloc (dest->elems, int, new_alloc);
 991       if (BE (new_elems == NULL, 0))
 992         return REG_ESPACE;
 993       dest->elems = new_elems;
 994       dest->alloc = new_alloc;
 995     }
 996
 997   /* Find the items in the intersection of SRC1 and SRC2, and copy
 998      into the top of DEST those that are not already in DEST itself.  */
 999   sbase = dest->nelem + src1->nelem + src2->nelem;
1000   i1 = src1->nelem - 1;
1001   i2 = src2->nelem - 1;
1002   id = dest->nelem - 1;
1003   for (;;)
1004     {
1005       if (src1->elems[i1] == src2->elems[i2])
1006         {
1007           /* Try to find the item in DEST.  Maybe we could binary search?  */
1008           while (id >= 0 && dest->elems[id] > src1->elems[i1])
1009             --id;
1010
1011           if (id < 0 || dest->elems[id] != src1->elems[i1])
1012             dest->elems[--sbase] = src1->elems[i1];
1013
1014           if (--i1 < 0 || --i2 < 0)
1015             break;
1016         }
1017
1018       /* Lower the highest of the two items.  */
1019       else if (src1->elems[i1] < src2->elems[i2])
1020         {
1021           if (--i2 < 0)
1022             break;
1023         }
1024       else
1025         {
1026           if (--i1 < 0)
1027             break;
1028         }
1029     }
1030
1031   id = dest->nelem - 1;
1032   is = dest->nelem + src1->nelem + src2->nelem - 1;
1033   delta = is - sbase + 1;
1034
1035   /* Now copy.  When DELTA becomes zero, the remaining
1036      DEST elements are already in place; this is more or
1037      less the same loop that is in re_node_set_merge.  */
1038   dest->nelem += delta;
1039   if (delta > 0 && id >= 0)
1040     for (;;)
1041       {
1042         if (dest->elems[is] > dest->elems[id])
1043           {
1044             /* Copy from the top.  */
1045             dest->elems[id + delta--] = dest->elems[is--];
1046             if (delta == 0)
1047               break;
1048           }
1049         else
1050           {
1051             /* Slide from the bottom.  */
1052             dest->elems[id + delta] = dest->elems[id];
1053             if (--id < 0)
1054               break;
1055           }
1056       }
1057
1058   /* Copy remaining SRC elements.  */
1059   memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1060
1061   return REG_NOERROR;
1062 }
1063
1064 /* Calculate the union set of the sets SRC1 and SRC2. And store it to
1065    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1066
1067 static reg_errcode_t
1068 re_node_set_init_union (dest, src1, src2)
1069      re_node_set *dest;
1070      const re_node_set *src1, *src2;
1071 {
1072   int i1, i2, id;
1073   if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1074     {
1075       dest->alloc = src1->nelem + src2->nelem;
1076       dest->elems = re_malloc (int, dest->alloc);
1077       if (BE (dest->elems == NULL, 0))
1078         return REG_ESPACE;
1079     }
1080   else
1081     {
1082       if (src1 != NULL && src1->nelem > 0)
1083         return re_node_set_init_copy (dest, src1);
1084       else if (src2 != NULL && src2->nelem > 0)
1085         return re_node_set_init_copy (dest, src2);
1086       else
1087         re_node_set_init_empty (dest);
1088       return REG_NOERROR;
1089     }
1090   for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1091     {
1092       if (src1->elems[i1] > src2->elems[i2])
1093         {
1094           dest->elems[id++] = src2->elems[i2++];
1095           continue;
1096         }
1097       if (src1->elems[i1] == src2->elems[i2])
1098         ++i2;
1099       dest->elems[id++] = src1->elems[i1++];
1100     }
1101   if (i1 < src1->nelem)
1102     {
1103       memcpy (dest->elems + id, src1->elems + i1,
1104              (src1->nelem - i1) * sizeof (int));
1105       id += src1->nelem - i1;
1106     }
1107   else if (i2 < src2->nelem)
1108     {
1109       memcpy (dest->elems + id, src2->elems + i2,
1110              (src2->nelem - i2) * sizeof (int));
1111       id += src2->nelem - i2;
1112     }
1113   dest->nelem = id;
1114   return REG_NOERROR;
1115 }
1116
1117 /* Calculate the union set of the sets DEST and SRC. And store it to
1118    DEST. Return value indicate the error code or REG_NOERROR if succeeded.  */
1119
1120 static reg_errcode_t
1121 re_node_set_merge (dest, src)
1122      re_node_set *dest;
1123      const re_node_set *src;
1124 {
1125   int is, id, sbase, delta;
1126   if (src == NULL || src->nelem == 0)
1127     return REG_NOERROR;
1128   if (dest->alloc < 2 * src->nelem + dest->nelem)
1129     {
1130       int new_alloc = 2 * (src->nelem + dest->alloc);
1131       int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1132       if (BE (new_buffer == NULL, 0))
1133         return REG_ESPACE;
1134       dest->elems = new_buffer;
1135       dest->alloc = new_alloc;
1136     }
1137
1138   if (BE (dest->nelem == 0, 0))
1139     {
1140       dest->nelem = src->nelem;
1141       memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1142       return REG_NOERROR;
1143     }
1144
1145   /* Copy into the top of DEST the items of SRC that are not
1146      found in DEST.  Maybe we could binary search in DEST?  */
1147   for (sbase = dest->nelem + 2 * src->nelem,
1148        is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1149     {
1150       if (dest->elems[id] == src->elems[is])
1151         is--, id--;
1152       else if (dest->elems[id] < src->elems[is])
1153         dest->elems[--sbase] = src->elems[is--];
1154       else /* if (dest->elems[id] > src->elems[is]) */
1155         --id;
1156     }
1157
1158   if (is >= 0)
1159     {
1160       /* If DEST is exhausted, the remaining items of SRC must be unique.  */
1161       sbase -= is + 1;
1162       memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1163     }
1164
1165   id = dest->nelem - 1;
1166   is = dest->nelem + 2 * src->nelem - 1;
1167   delta = is - sbase + 1;
1168   if (delta == 0)
1169     return REG_NOERROR;
1170
1171   /* Now copy.  When DELTA becomes zero, the remaining
1172      DEST elements are already in place.  */
1173   dest->nelem += delta;
1174   for (;;)
1175     {
1176       if (dest->elems[is] > dest->elems[id])
1177         {
1178           /* Copy from the top.  */
1179           dest->elems[id + delta--] = dest->elems[is--];
1180           if (delta == 0)
1181             break;
1182         }
1183       else
1184         {
1185           /* Slide from the bottom.  */
1186           dest->elems[id + delta] = dest->elems[id];
1187           if (--id < 0)
1188             {
1189               /* Copy remaining SRC elements.  */
1190               memcpy (dest->elems, dest->elems + sbase,
1191                       delta * sizeof (int));
1192               break;
1193             }
1194         }
1195     }
1196
1197   return REG_NOERROR;
1198 }
1199
1200 /* Insert the new element ELEM to the re_node_set* SET.
1201    SET should not already have ELEM.
1202    return -1 if an error is occured, return 1 otherwise.  */
1203
1204 static int
1205 re_node_set_insert (set, elem)
1206      re_node_set *set;
1207      int elem;
1208 {
1209   int idx;
1210   /* In case the set is empty.  */
1211   if (set->alloc == 0)
1212     {
1213       if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1214         return 1;
1215       else
1216         return -1;
1217     }
1218
1219   if (BE (set->nelem, 0) == 0)
1220     {
1221       /* We already guaranteed above that set->alloc != 0.  */
1222       set->elems[0] = elem;
1223       ++set->nelem;
1224       return 1;
1225     }
1226
1227   /* Realloc if we need.  */
1228   if (set->alloc == set->nelem)
1229     {
1230       int *new_array;
1231       set->alloc = set->alloc * 2;
1232       new_array = re_realloc (set->elems, int, set->alloc);
1233       if (BE (new_array == NULL, 0))
1234         return -1;
1235       set->elems = new_array;
1236     }
1237
1238   /* Move the elements which follows the new element.  Test the
1239      first element separately to skip a check in the inner loop.  */
1240   if (elem < set->elems[0])
1241     {
1242       idx = 0;
1243       for (idx = set->nelem; idx > 0; idx--)
1244         set->elems[idx] = set->elems[idx - 1];
1245     }
1246   else
1247     {
1248       for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1249         set->elems[idx] = set->elems[idx - 1];
1250     }
1251
1252   /* Insert the new element.  */
1253   set->elems[idx] = elem;
1254   ++set->nelem;
1255   return 1;
1256 }
1257
1258 /* Insert the new element ELEM to the re_node_set* SET.
1259    SET should not already have any element greater than or equal to ELEM.
1260    Return -1 if an error is occured, return 1 otherwise.  */
1261
1262 static int
1263 re_node_set_insert_last (set, elem)
1264      re_node_set *set;
1265      int elem;
1266 {
1267   /* Realloc if we need.  */
1268   if (set->alloc == set->nelem)
1269     {
1270       int *new_array;
1271       set->alloc = (set->alloc + 1) * 2;
1272       new_array = re_realloc (set->elems, int, set->alloc);
1273       if (BE (new_array == NULL, 0))
1274         return -1;
1275       set->elems = new_array;
1276     }
1277
1278   /* Insert the new element.  */
1279   set->elems[set->nelem++] = elem;
1280   return 1;
1281 }
1282
1283 /* Compare two node sets SET1 and SET2.
1284    return 1 if SET1 and SET2 are equivalent, return 0 otherwise.  */
1285
1286 static int
1287 re_node_set_compare (set1, set2)
1288      const re_node_set *set1, *set2;
1289 {
1290   int i;
1291   if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1292     return 0;
1293   for (i = set1->nelem ; --i >= 0 ; )
1294     if (set1->elems[i] != set2->elems[i])
1295       return 0;
1296   return 1;
1297 }
1298
1299 /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise.  */
1300
1301 static int
1302 re_node_set_contains (set, elem)
1303      const re_node_set *set;
1304      int elem;
1305 {
1306   unsigned int idx, right, mid;
1307   if (set->nelem <= 0)
1308     return 0;
1309
1310   /* Binary search the element.  */
1311   idx = 0;
1312   right = set->nelem - 1;
1313   while (idx < right)
1314     {
1315       mid = (idx + right) / 2;
1316       if (set->elems[mid] < elem)
1317         idx = mid + 1;
1318       else
1319         right = mid;
1320     }
1321   return set->elems[idx] == elem ? idx + 1 : 0;
1322 }
1323
1324 static void
1325 re_node_set_remove_at (set, idx)
1326      re_node_set *set;
1327      int idx;
1328 {
1329   if (idx < 0 || idx >= set->nelem)
1330     return;
1331   --set->nelem;
1332   for (; idx < set->nelem; idx++)
1333     set->elems[idx] = set->elems[idx + 1];
1334 }
1335 \f
1336
1337 /* Add the token TOKEN to dfa->nodes, and return the index of the token.
1338    Or return -1, if an error will be occured.  */
1339
1340 static int
1341 re_dfa_add_node (dfa, token)
1342      re_dfa_t *dfa;
1343      re_token_t token;
1344 {
1345   int type = token.type;
1346   if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1347     {
1348       int new_nodes_alloc = dfa->nodes_alloc * 2;
1349       int *new_nexts, *new_indices;
1350       re_node_set *new_edests, *new_eclosures;
1351
1352       re_token_t *new_array = re_realloc (dfa->nodes, re_token_t,
1353                                           new_nodes_alloc);
1354       if (BE (new_array == NULL, 0))
1355         return -1;
1356       dfa->nodes = new_array;
1357       new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1358       new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1359       new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1360       new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1361       if (BE (new_nexts == NULL || new_indices == NULL
1362               || new_edests == NULL || new_eclosures == NULL, 0))
1363         return -1;
1364       dfa->nexts = new_nexts;
1365       dfa->org_indices = new_indices;
1366       dfa->edests = new_edests;
1367       dfa->eclosures = new_eclosures;
1368       dfa->nodes_alloc = new_nodes_alloc;
1369     }
1370   dfa->nodes[dfa->nodes_len] = token;
1371   dfa->nodes[dfa->nodes_len].constraint = 0;
1372 #ifdef RE_ENABLE_I18N
1373   dfa->nodes[dfa->nodes_len].accept_mb =
1374     (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
1375 #endif
1376   dfa->nexts[dfa->nodes_len] = -1;
1377   re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1378   re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1379   return dfa->nodes_len++;
1380 }
1381
1382 static unsigned int inline
1383 calc_state_hash (nodes, context)
1384      const re_node_set *nodes;
1385      unsigned int context;
1386 {
1387   unsigned int hash = nodes->nelem + context;
1388   int i;
1389   for (i = 0 ; i < nodes->nelem ; i++)
1390     hash += nodes->elems[i];
1391   return hash;
1392 }
1393
1394 /* Search for the state whose node_set is equivalent to NODES.
1395    Return the pointer to the state, if we found it in the DFA.
1396    Otherwise create the new one and return it.  In case of an error
1397    return NULL and set the error code in ERR.
1398    Note: - We assume NULL as the invalid state, then it is possible that
1399            return value is NULL and ERR is REG_NOERROR.
1400          - We never return non-NULL value in case of any errors, it is for
1401            optimization.  */
1402
1403 static re_dfastate_t*
1404 re_acquire_state (err, dfa, nodes)
1405      reg_errcode_t *err;
1406      re_dfa_t *dfa;
1407      const re_node_set *nodes;
1408 {
1409   unsigned int hash;
1410   re_dfastate_t *new_state;
1411   struct re_state_table_entry *spot;
1412   int i;
1413   if (BE (nodes->nelem == 0, 0))
1414     {
1415       *err = REG_NOERROR;
1416       return NULL;
1417     }
1418   hash = calc_state_hash (nodes, 0);
1419   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1420
1421   for (i = 0 ; i < spot->num ; i++)
1422     {
1423       re_dfastate_t *state = spot->array[i];
1424       if (hash != state->hash)
1425         continue;
1426       if (re_node_set_compare (&state->nodes, nodes))
1427         return state;
1428     }
1429
1430   /* There are no appropriate state in the dfa, create the new one.  */
1431   new_state = create_ci_newstate (dfa, nodes, hash);
1432   if (BE (new_state != NULL, 1))
1433     return new_state;
1434   else
1435     {
1436       *err = REG_ESPACE;
1437       return NULL;
1438     }
1439 }
1440
1441 /* Search for the state whose node_set is equivalent to NODES and
1442    whose context is equivalent to CONTEXT.
1443    Return the pointer to the state, if we found it in the DFA.
1444    Otherwise create the new one and return it.  In case of an error
1445    return NULL and set the error code in ERR.
1446    Note: - We assume NULL as the invalid state, then it is possible that
1447            return value is NULL and ERR is REG_NOERROR.
1448          - We never return non-NULL value in case of any errors, it is for
1449            optimization.  */
1450
1451 static re_dfastate_t*
1452 re_acquire_state_context (err, dfa, nodes, context)
1453      reg_errcode_t *err;
1454      re_dfa_t *dfa;
1455      const re_node_set *nodes;
1456      unsigned int context;
1457 {
1458   unsigned int hash;
1459   re_dfastate_t *new_state;
1460   struct re_state_table_entry *spot;
1461   int i;
1462   if (nodes->nelem == 0)
1463     {
1464       *err = REG_NOERROR;
1465       return NULL;
1466     }
1467   hash = calc_state_hash (nodes, context);
1468   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1469
1470   for (i = 0 ; i < spot->num ; i++)
1471     {
1472       re_dfastate_t *state = spot->array[i];
1473       if (state->hash == hash
1474           && state->context == context
1475           && re_node_set_compare (state->entrance_nodes, nodes))
1476         return state;
1477     }
1478   /* There are no appropriate state in `dfa', create the new one.  */
1479   new_state = create_cd_newstate (dfa, nodes, context, hash);
1480   if (BE (new_state != NULL, 1))
1481     return new_state;
1482   else
1483     {
1484       *err = REG_ESPACE;
1485       return NULL;
1486     }
1487 }
1488
1489 /* Finish initialization of the new state NEWSTATE, and using its hash value
1490    HASH put in the appropriate bucket of DFA's state table.  Return value
1491    indicates the error code if failed.  */
1492
1493 static reg_errcode_t
1494 register_state (dfa, newstate, hash)
1495      re_dfa_t *dfa;
1496      re_dfastate_t *newstate;
1497      unsigned int hash;
1498 {
1499   struct re_state_table_entry *spot;
1500   reg_errcode_t err;
1501   int i;
1502
1503   newstate->hash = hash;
1504   err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1505   if (BE (err != REG_NOERROR, 0))
1506     return REG_ESPACE;
1507   for (i = 0; i < newstate->nodes.nelem; i++)
1508     {
1509       int elem = newstate->nodes.elems[i];
1510       if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1511         re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1512     }
1513
1514   spot = dfa->state_table + (hash & dfa->state_hash_mask);
1515   if (BE (spot->alloc <= spot->num, 0))
1516     {
1517       int new_alloc = 2 * spot->num + 2;
1518       re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1519                                               new_alloc);
1520       if (BE (new_array == NULL, 0))
1521         return REG_ESPACE;
1522       spot->array = new_array;
1523       spot->alloc = new_alloc;
1524     }
1525   spot->array[spot->num++] = newstate;
1526   return REG_NOERROR;
1527 }
1528
1529 /* Create the new state which is independ of contexts.
1530    Return the new state if succeeded, otherwise return NULL.  */
1531
1532 static re_dfastate_t *
1533 create_ci_newstate (dfa, nodes, hash)
1534      re_dfa_t *dfa;
1535      const re_node_set *nodes;
1536      unsigned int hash;
1537 {
1538   int i;
1539   reg_errcode_t err;
1540   re_dfastate_t *newstate;
1541
1542   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1543   if (BE (newstate == NULL, 0))
1544     return NULL;
1545   err = re_node_set_init_copy (&newstate->nodes, nodes);
1546   if (BE (err != REG_NOERROR, 0))
1547     {
1548       re_free (newstate);
1549       return NULL;
1550     }
1551
1552   newstate->entrance_nodes = &newstate->nodes;
1553   for (i = 0 ; i < nodes->nelem ; i++)
1554     {
1555       re_token_t *node = dfa->nodes + nodes->elems[i];
1556       re_token_type_t type = node->type;
1557       if (type == CHARACTER && !node->constraint)
1558         continue;
1559 #ifdef RE_ENABLE_I18N
1560       newstate->accept_mb |= node->accept_mb;
1561 #endif /* RE_ENABLE_I18N */
1562
1563       /* If the state has the halt node, the state is a halt state.  */
1564       if (type == END_OF_RE)
1565         newstate->halt = 1;
1566       else if (type == OP_BACK_REF)
1567         newstate->has_backref = 1;
1568       else if (type == ANCHOR || node->constraint)
1569         newstate->has_constraint = 1;
1570     }
1571   err = register_state (dfa, newstate, hash);
1572   if (BE (err != REG_NOERROR, 0))
1573     {
1574       free_state (newstate);
1575       newstate = NULL;
1576     }
1577   return newstate;
1578 }
1579
1580 /* Create the new state which is depend on the context CONTEXT.
1581    Return the new state if succeeded, otherwise return NULL.  */
1582
1583 static re_dfastate_t *
1584 create_cd_newstate (dfa, nodes, context, hash)
1585      re_dfa_t *dfa;
1586      const re_node_set *nodes;
1587      unsigned int context, hash;
1588 {
1589   int i, nctx_nodes = 0;
1590   reg_errcode_t err;
1591   re_dfastate_t *newstate;
1592
1593   newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1594   if (BE (newstate == NULL, 0))
1595     return NULL;
1596   err = re_node_set_init_copy (&newstate->nodes, nodes);
1597   if (BE (err != REG_NOERROR, 0))
1598     {
1599       re_free (newstate);
1600       return NULL;
1601     }
1602
1603   newstate->context = context;
1604   newstate->entrance_nodes = &newstate->nodes;
1605
1606   for (i = 0 ; i < nodes->nelem ; i++)
1607     {
1608       unsigned int constraint = 0;
1609       re_token_t *node = dfa->nodes + nodes->elems[i];
1610       re_token_type_t type = node->type;
1611       if (node->constraint)
1612         constraint = node->constraint;
1613
1614       if (type == CHARACTER && !constraint)
1615         continue;
1616 #ifdef RE_ENABLE_I18N
1617       newstate->accept_mb |= node->accept_mb;
1618 #endif /* RE_ENABLE_I18N */
1619
1620       /* If the state has the halt node, the state is a halt state.  */
1621       if (type == END_OF_RE)
1622         newstate->halt = 1;
1623       else if (type == OP_BACK_REF)
1624         newstate->has_backref = 1;
1625       else if (type == ANCHOR)
1626         constraint = node->opr.ctx_type;
1627
1628       if (constraint)
1629         {
1630           if (newstate->entrance_nodes == &newstate->nodes)
1631             {
1632               newstate->entrance_nodes = re_malloc (re_node_set, 1);
1633               if (BE (newstate->entrance_nodes == NULL, 0))
1634                 {
1635                   free_state (newstate);
1636                   return NULL;
1637                 }
1638               re_node_set_init_copy (newstate->entrance_nodes, nodes);
1639               nctx_nodes = 0;
1640               newstate->has_constraint = 1;
1641             }
1642
1643           if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1644             {
1645               re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1646               ++nctx_nodes;
1647             }
1648         }
1649     }
1650   err = register_state (dfa, newstate, hash);
1651   if (BE (err != REG_NOERROR, 0))
1652     {
1653       free_state (newstate);
1654       newstate = NULL;
1655     }
1656   return  newstate;
1657 }
1658
1659 static void
1660 free_state (state)
1661      re_dfastate_t *state;
1662 {
1663   re_node_set_free (&state->non_eps_nodes);
1664   re_node_set_free (&state->inveclosure);
1665   if (state->entrance_nodes != &state->nodes)
1666     {
1667       re_node_set_free (state->entrance_nodes);
1668       re_free (state->entrance_nodes);
1669     }
1670   re_node_set_free (&state->nodes);
1671   re_free (state->word_trtable);
1672   re_free (state->trtable);
1673   re_free (state);
1674 }