bin/grep/regex/tre-fastmatch.c

   1 /* $FreeBSD$ */
   2
   3 /*-
   4  * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
   5  * Copyright (C) 2008-2011 Gabor Kovesdan <gabor@FreeBSD.org>
   6  * All rights reserved.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions
  10  * are met:
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in the
  15  *    documentation and/or other materials provided with the distribution.
  16  *
  17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27  * SUCH DAMAGE.
  28  */
  29
  30 #include "glue.h"
  31
  32 #include <ctype.h>
  33 #include <limits.h>
  34 #include <regex.h>
  35 #include <stdbool.h>
  36 #include <stdlib.h>
  37 #include <string.h>
  38 #ifdef TRE_WCHAR
  39 #include <wchar.h>
  40 #include <wctype.h>
  41 #endif
  42
  43 #include "hashtable.h"
  44 #include "tre-fastmatch.h"
  45
  46 static int      fastcmp(const fastmatch_t *fg, const void *data,
  47                         tre_str_type_t type);
  48
  49 /*
  50  * Clean up if pattern compilation fails.
  51  */
  52 #define FAIL_COMP(errcode)                                              \
  53   {                                                                     \
  54     if (fg->pattern)                                                    \
  55       free(fg->pattern);                                                \
  56     if (fg->wpattern)                                                   \
  57       free(fg->wpattern);                                               \
  58     if (fg->qsBc_table)                                                 \
  59       hashtable_free(fg->qsBc_table);                                   \
  60     fg = NULL;                                                          \
  61     return errcode;                                                     \
  62   }
  63
  64 /*
  65  * Skips n characters in the input string and assigns the start
  66  * address to startptr. Note: as per IEEE Std 1003.1-2008
  67  * matching is based on bit pattern not character representations
  68  * so we can handle MB strings as byte sequences just like
  69  * SB strings.
  70  */
  71 #define SKIP_CHARS(n)                                                   \
  72   switch (type)                                                         \
  73     {                                                                   \
  74       case STR_WIDE:                                                    \
  75         startptr = str_wide + n;                                        \
  76         break;                                                          \
  77       default:                                                          \
  78         startptr = str_byte + n;                                        \
  79     }
  80
  81 /*
  82  * Converts the wide string pattern to SB/MB string and stores
  83  * it in fg->pattern. Sets fg->len to the byte length of the
  84  * converted string.
  85  */
  86 #define STORE_MBS_PAT                                                   \
  87   {                                                                     \
  88     size_t siz;                                                         \
  89                                                                         \
  90     siz = wcstombs(NULL, fg->wpattern, 0);                              \
  91     if (siz == (size_t)-1)                                              \
  92       return REG_BADPAT;                                                \
  93     fg->len = siz;                                                      \
  94     fg->pattern = malloc(siz + 1);                                      \
  95     if (fg->pattern == NULL)                                            \
  96       return REG_ESPACE;                                                \
  97     wcstombs(fg->pattern, fg->wpattern, siz);                           \
  98     fg->pattern[siz] = '\0';                                            \
  99   }                                                                     \
 100
 101 #define CONV_MBS_PAT(src, dest, destsz)                                 \
 102   {                                                                     \
 103     destsz = wcstombs(NULL, src, 0);                                    \
 104     if (destsz == (size_t)-1)                                           \
 105       return REG_BADPAT;                                                \
 106     dest = malloc(destsz + 1);                                          \
 107     if (dest == NULL)                                                   \
 108       return REG_ESPACE;                                                \
 109     wcstombs(dest, src, destsz);                                        \
 110     dest[destsz] = '\0';                                                \
 111   }                                                                     \
 112
 113 #define IS_OUT_OF_BOUNDS                                                \
 114   ((!fg->reversed                                                       \
 115     ? ((type == STR_WIDE) ? ((j + fg->wlen) > len)                      \
 116                           : ((j + fg->len) > len))                      \
 117     : (j < 0)))
 118
 119 /*
 120  * Checks whether the new position after shifting in the input string
 121  * is out of the bounds and break out from the loop if so.
 122  */
 123 #define CHECKBOUNDS                                                     \
 124   if (IS_OUT_OF_BOUNDS)                                                 \
 125     break;                                                              \
 126
 127 /*
 128  * Shifts in the input string after a mismatch. The position of the
 129  * mismatch is stored in the mismatch variable.
 130  */
 131 #define SHIFT                                                           \
 132   CHECKBOUNDS;                                                          \
 133                                                                         \
 134   {                                                                     \
 135     int r = -1;                                                         \
 136     unsigned int bc = 0, gs = 0, ts;                                    \
 137                                                                         \
 138     switch (type)                                                       \
 139       {                                                                 \
 140         case STR_WIDE:                                                  \
 141           if (!fg->hasdot)                                              \
 142             {                                                           \
 143               if (u != 0 && (unsigned)mismatch == fg->wlen - 1 - shift) \
 144                 mismatch -= u;                                          \
 145               v = fg->wlen - 1 - mismatch;                              \
 146               r = hashtable_get(fg->qsBc_table,                         \
 147                 &str_wide[!fg->reversed ? (size_t)j + fg->wlen          \
 148                           : (size_t)j - 1], &bc);                       \
 149               gs = fg->bmGs[mismatch];                                  \
 150             }                                                           \
 151             bc = (r == HASH_OK) ? bc : fg->defBc;                       \
 152             DPRINT(("tre_fast_match: mismatch on character" CHF ", "    \
 153                     "BC %d, GS %d\n",                                   \
 154                     ((const tre_char_t *)startptr)[mismatch + 1],       \
 155                     bc, gs));                                           \
 156             break;                                                      \
 157         default:                                                        \
 158           if (!fg->hasdot)                                              \
 159             {                                                           \
 160               if (u != 0 && (unsigned)mismatch == fg->len - 1 - shift)  \
 161                 mismatch -= u;                                          \
 162               v = fg->len - 1 - mismatch;                               \
 163               gs = fg->sbmGs[mismatch];                                 \
 164             }                                                           \
 165           bc = fg->qsBc[((const unsigned char *)str_byte)               \
 166                         [!fg->reversed ? (size_t)j + fg->len            \
 167                          : (size_t)j - 1]];                             \
 168           DPRINT(("tre_fast_match: mismatch on character %c, "          \
 169                  "BC %d, GS %d\n",                                      \
 170                  ((const unsigned char *)startptr)[mismatch + 1],       \
 171                  bc, gs));                                              \
 172       }                                                                 \
 173     if (fg->hasdot)                                                     \
 174       shift = bc;                                                       \
 175     else                                                                \
 176       {                                                                 \
 177         ts = (u >= v) ? (u - v) : 0;                                    \
 178         shift = MAX(ts, bc);                                            \
 179         shift = MAX(shift, gs);                                         \
 180         if (shift == gs)                                                \
 181           u = MIN((type == STR_WIDE ? fg->wlen : fg->len) - shift, v);  \
 182         else                                                            \
 183           {                                                             \
 184             if (ts < bc)                                                \
 185               shift = MAX(shift, u + 1);                                \
 186             u = 0;                                                      \
 187           }                                                             \
 188       }                                                                 \
 189       DPRINT(("tre_fast_match: shifting %u characters\n", shift));      \
 190       j = !fg->reversed ? j + shift : j - shift;                        \
 191   }
 192
 193 /*
 194  * Normal Quick Search would require a shift based on the position the
 195  * next character after the comparison is within the pattern.  With
 196  * wildcards, the position of the last dot effects the maximum shift
 197  * distance.
 198  * The closer to the end the wild card is the slower the search.
 199  *
 200  * Examples:
 201  * Pattern    Max shift
 202  * -------    ---------
 203  * this               5
 204  * .his               4
 205  * t.is               3
 206  * th.s               2
 207  * thi.               1
 208  */
 209
 210 /*
 211  * Fills in the bad character shift array for SB/MB strings.
 212  */
 213 #define FILL_QSBC                                                       \
 214   if (fg->reversed)                                                     \
 215     {                                                                   \
 216       _FILL_QSBC_REVERSED                                               \
 217     }                                                                   \
 218   else                                                                  \
 219     {                                                                   \
 220       _FILL_QSBC                                                        \
 221     }
 222
 223 #define _FILL_QSBC                                                      \
 224   for (unsigned int i = 0; i <= UCHAR_MAX; i++)                         \
 225     fg->qsBc[i] = fg->len - hasdot;                                     \
 226   for (unsigned int i = hasdot + 1; i < fg->len; i++)                   \
 227     {                                                                   \
 228       fg->qsBc[(unsigned char)fg->pattern[i]] = fg->len - i;            \
 229       DPRINT(("BC shift for char %c is %zu\n", fg->pattern[i],          \
 230              fg->len - i));                                             \
 231       if (fg->icase)                                                    \
 232         {                                                               \
 233           char c = islower((unsigned char)fg->pattern[i]) ?             \
 234                    toupper((unsigned char)fg->pattern[i]) :             \
 235                    tolower((unsigned char)fg->pattern[i]);              \
 236           fg->qsBc[(unsigned char)c] = fg->len - i;                     \
 237           DPRINT(("BC shift for char %c is %zu\n", c, fg->len - i));    \
 238         }                                                               \
 239     }
 240
 241 #define _FILL_QSBC_REVERSED                                             \
 242   for (unsigned int i = 0; i <= UCHAR_MAX; i++)                         \
 243     fg->qsBc[i] = firstdot + 1;                                         \
 244   for (int i = firstdot - 1; i >= 0; i--)                               \
 245     {                                                                   \
 246       fg->qsBc[(unsigned char)fg->pattern[i]] = i + 1;                  \
 247       DPRINT(("Reverse BC shift for char %c is %d\n", fg->pattern[i],   \
 248              i + 1));                                                   \
 249       if (fg->icase)                                                    \
 250         {                                                               \
 251           char c = islower((unsigned char)fg->pattern[i]) ?             \
 252                    toupper((unsigned char)fg->pattern[i]) :             \
 253                    tolower((unsigned char)fg->pattern[i]);              \
 254           fg->qsBc[(unsigned char)c] = i + 1;                           \
 255           DPRINT(("Reverse BC shift for char %c is %d\n", c, i + 1));   \
 256         }                                                               \
 257     }
 258
 259 /*
 260  * Fills in the bad character shifts into a hastable for wide strings.
 261  * With wide characters it is not possible any more to use a normal
 262  * array because there are too many characters and we could not
 263  * provide enough memory. Fortunately, we only have to store distinct
 264  * values for so many characters as the number of distinct characters
 265  * in the pattern, so we can store them in a hashtable and store a
 266  * default shift value for the rest.
 267  */
 268 #define FILL_QSBC_WIDE                                                  \
 269   if (fg->reversed)                                                     \
 270     {                                                                   \
 271       _FILL_QSBC_WIDE_REVERSED                                          \
 272     }                                                                   \
 273   else                                                                  \
 274     {                                                                   \
 275       _FILL_QSBC_WIDE                                                   \
 276     }
 277
 278 #define _FILL_QSBC_WIDE                                                 \
 279   /* Adjust the shift based on location of the last dot ('.'). */       \
 280   fg->defBc = fg->wlen - whasdot;                                       \
 281                                                                         \
 282   /* Preprocess pattern. */                                             \
 283   fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4),       \
 284                                   sizeof(tre_char_t), sizeof(int));     \
 285   if (!fg->qsBc_table)                                                  \
 286     FAIL_COMP(REG_ESPACE);                                              \
 287   for (unsigned int i = whasdot + 1; i < fg->wlen; i++)                 \
 288     {                                                                   \
 289       int k = fg->wlen - i;                                             \
 290       int r;                                                            \
 291                                                                         \
 292       r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k);          \
 293       if ((r == HASH_FAIL) || (r == HASH_FULL))                         \
 294         FAIL_COMP(REG_ESPACE);                                          \
 295       DPRINT(("BC shift for wide char " CHF " is %d\n", fg->wpattern[i],\
 296              k));                                                       \
 297       if (fg->icase)                                                    \
 298         {                                                               \
 299           tre_char_t wc = iswlower(fg->wpattern[i]) ?                   \
 300             towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]);      \
 301           r = hashtable_put(fg->qsBc_table, &wc, &k);                   \
 302           if ((r == HASH_FAIL) || (r == HASH_FULL))                     \
 303             FAIL_COMP(REG_ESPACE);                                      \
 304           DPRINT(("BC shift for wide char " CHF " is %d\n", wc, k));    \
 305         }                                                               \
 306     }
 307
 308 #define _FILL_QSBC_WIDE_REVERSED                                        \
 309   /* Adjust the shift based on location of the last dot ('.'). */       \
 310   fg->defBc = (size_t)wfirstdot;                                        \
 311                                                                         \
 312   /* Preprocess pattern. */                                             \
 313   fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4),       \
 314                                   sizeof(tre_char_t), sizeof(int));     \
 315   if (!fg->qsBc_table)                                                  \
 316     FAIL_COMP(REG_ESPACE);                                              \
 317   for (int i = wfirstdot - 1; i >= 0; i--)                              \
 318     {                                                                   \
 319       int k = i + 1;                                                    \
 320       int r;                                                            \
 321                                                                         \
 322       r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k);          \
 323       if ((r == HASH_FAIL) || (r == HASH_FULL))                         \
 324         FAIL_COMP(REG_ESPACE);                                          \
 325       DPRINT(("Reverse BC shift for wide char " CHF " is %d\n",         \
 326              fg->wpattern[i], k));                                      \
 327       if (fg->icase)                                                    \
 328         {                                                               \
 329           tre_char_t wc = iswlower(fg->wpattern[i]) ?                   \
 330             towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]);      \
 331           r = hashtable_put(fg->qsBc_table, &wc, &k);                   \
 332           if ((r == HASH_FAIL) || (r == HASH_FULL))                     \
 333             FAIL_COMP(REG_ESPACE);                                      \
 334           DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", wc, \
 335                  k));                                                   \
 336         }                                                               \
 337     }
 338
 339 #ifdef _GREP_DEBUG
 340 #define DPRINT_BMGS(len, fmt_str, sh)                                   \
 341   for (unsigned int i = 0; i < len; i++)                                \
 342     DPRINT((fmt_str, i, sh[i]));
 343 #else
 344 #define DPRINT_BMGS(len, fmt_str, sh)                                   \
 345   do { } while(/*CONSTCOND*/0)
 346 #endif
 347
 348 /*
 349  * Fills in the good suffix table for SB/MB strings.
 350  */
 351 #define FILL_BMGS                                                       \
 352   if (fg->len > 0 && !fg->hasdot)                                       \
 353     {                                                                   \
 354       fg->sbmGs = malloc(fg->len * sizeof(*fg->sbmGs));                 \
 355       if (!fg->sbmGs)                                                   \
 356         return REG_ESPACE;                                              \
 357       if (fg->len == 1)                                                 \
 358         fg->sbmGs[0] = 1;                                               \
 359       else                                                              \
 360         _FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false);             \
 361       DPRINT_BMGS(fg->len, "GS shift for pos %d is %d\n", fg->sbmGs);   \
 362     }
 363
 364 /*
 365  * Fills in the good suffix table for wide strings.
 366  */
 367 #define FILL_BMGS_WIDE                                                  \
 368   if (fg->wlen > 0 && !fg->hasdot)                                      \
 369     {                                                                   \
 370       fg->bmGs = malloc(fg->wlen * sizeof(*fg->bmGs));                  \
 371       if (!fg->bmGs)                                                    \
 372         return REG_ESPACE;                                              \
 373       if (fg->wlen == 1)                                                \
 374         fg->bmGs[0] = 1;                                                \
 375       else                                                              \
 376         _FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true);             \
 377       DPRINT_BMGS(fg->wlen, "GS shift (wide) for pos %d is %d\n",       \
 378                   fg->bmGs);                                            \
 379     }
 380
 381 #define _FILL_BMGS(arr, pat, plen, wide)                                \
 382   {                                                                     \
 383     char *p;                                                            \
 384     tre_char_t *wp;                                                     \
 385                                                                         \
 386     if (wide)                                                           \
 387       {                                                                 \
 388         if (fg->icase)                                                  \
 389           {                                                             \
 390             wp = malloc(plen * sizeof(tre_char_t));                     \
 391             if (wp == NULL)                                             \
 392               return REG_ESPACE;                                        \
 393             for (unsigned int i = 0; i < plen; i++)                     \
 394               wp[i] = towlower(pat[i]);                                 \
 395             _CALC_BMGS(arr, wp, plen);                                  \
 396             free(wp);                                                   \
 397           }                                                             \
 398         else                                                            \
 399           _CALC_BMGS(arr, pat, plen);                                   \
 400       }                                                                 \
 401     else                                                                \
 402       {                                                                 \
 403         if (fg->icase)                                                  \
 404           {                                                             \
 405             p = malloc(plen);                                           \
 406             if (p == NULL)                                              \
 407               return REG_ESPACE;                                        \
 408             for (unsigned int i = 0; i < plen; i++)                     \
 409               p[i] = tolower((unsigned char)pat[i]);                    \
 410             _CALC_BMGS(arr, p, plen);                                   \
 411             free(p);                                                    \
 412           }                                                             \
 413         else                                                            \
 414           _CALC_BMGS(arr, pat, plen);                                   \
 415       }                                                                 \
 416   }
 417
 418 #define _CALC_BMGS(arr, pat, plen)                                      \
 419   {                                                                     \
 420     int f = 0, g;                                                       \
 421                                                                         \
 422     int *suff = malloc(plen * sizeof(int));                             \
 423     if (suff == NULL)                                                   \
 424       return REG_ESPACE;                                                \
 425                                                                         \
 426     suff[plen - 1] = plen;                                              \
 427     g = plen - 1;                                                       \
 428     for (int i = plen - 2; i >= 0; i--)                                 \
 429       {                                                                 \
 430         if (i > g && suff[i + plen - 1 - f] < i - g)                    \
 431           suff[i] = suff[i + plen - 1 - f];                             \
 432         else                                                            \
 433           {                                                             \
 434             if (i < g)                                                  \
 435               g = i;                                                    \
 436             f = i;                                                      \
 437             while (g >= 0 && pat[g] == pat[g + plen - 1 - f])           \
 438               g--;                                                      \
 439             suff[i] = f - g;                                            \
 440           }                                                             \
 441       }                                                                 \
 442                                                                         \
 443     for (unsigned int i = 0; i < plen; i++)                             \
 444       arr[i] = plen;                                                    \
 445     g = 0;                                                              \
 446     for (int i = plen - 1; i >= 0; i--)                                 \
 447       if (suff[i] == i + 1)                                             \
 448         for(; (unsigned long)g < plen - 1 - i; g++)                     \
 449           if (arr[g] == plen)                                           \
 450             arr[g] = plen - 1 - i;                                      \
 451     for (unsigned int i = 0; i <= plen - 2; i++)                        \
 452       arr[plen - 1 - suff[i]] = plen - 1 - i;                           \
 453                                                                         \
 454     free(suff);                                                 \
 455   }
 456
 457 /*
 458  * Copies the pattern pat having length n to p and stores
 459  * the size in l.
 460  */
 461 #define SAVE_PATTERN(src, srclen, dst, dstlen)                          \
 462   dstlen = srclen;                                                      \
 463   dst = malloc((dstlen + 1) * sizeof(tre_char_t));                      \
 464   if (dst == NULL)                                                      \
 465     return REG_ESPACE;                                                  \
 466   if (dstlen > 0)                                                       \
 467     memcpy(dst, src, dstlen * sizeof(tre_char_t));                      \
 468   dst[dstlen] = TRE_CHAR('\0');
 469
 470 /*
 471  * Initializes pattern compiling.
 472  */
 473 #define INIT_COMP                                                       \
 474   /* Initialize. */                                                     \
 475   memset(fg, 0, sizeof(*fg));                                           \
 476   fg->icase = (cflags & REG_ICASE);                                     \
 477   fg->word = (cflags & REG_WORD);                                       \
 478   fg->newline = (cflags & REG_NEWLINE);                                 \
 479   fg->nosub = (cflags & REG_NOSUB);                                     \
 480                                                                         \
 481   /* Cannot handle REG_ICASE with MB string */                          \
 482   if (fg->icase && (TRE_MB_CUR_MAX > 1) && n > 0)                       \
 483     {                                                                   \
 484       DPRINT(("Cannot use fast matcher for MBS with REG_ICASE\n"));     \
 485       return REG_BADPAT;                                                \
 486     }
 487
 488 /*
 489  * Checks whether we have a 0-length pattern that will match
 490  * anything. If literal is set to false, the EOL anchor is also
 491  * taken into account.
 492  */
 493 #define CHECK_MATCHALL(literal)                                         \
 494   if (!literal && n == 1 && pat[0] == TRE_CHAR('$'))                    \
 495     {                                                                   \
 496       n--;                                                              \
 497       fg->eol = true;                                                   \
 498     }                                                                   \
 499                                                                         \
 500   if (n == 0)                                                           \
 501     {                                                                   \
 502       fg->matchall = true;                                              \
 503       fg->pattern = malloc(sizeof(char));                               \
 504       if (!fg->pattern)                                                 \
 505         FAIL_COMP(REG_ESPACE);                                          \
 506       fg->pattern[0] = '\0';                                            \
 507       fg->wpattern = malloc(sizeof(tre_char_t));                        \
 508       if (!fg->wpattern)                                                \
 509         FAIL_COMP(REG_ESPACE);                                          \
 510       fg->wpattern[0] = TRE_CHAR('\0');                                 \
 511       DPRINT(("Matching every input\n"));                               \
 512       return REG_OK;                                                    \
 513     }
 514
 515 /*
 516  * Returns: REG_OK on success, error code otherwise
 517  */
 518 int
 519 tre_compile_literal(fastmatch_t *fg, const tre_char_t *pat, size_t n,
 520                     int cflags)
 521 {
 522   size_t hasdot = 0, whasdot = 0;
 523   ssize_t firstdot = -1, wfirstdot = -1;
 524
 525   INIT_COMP;
 526
 527   CHECK_MATCHALL(true);
 528
 529   /* Cannot handle word boundaries with MB string */
 530   if (fg->word && (TRE_MB_CUR_MAX > 1))
 531     return REG_BADPAT;
 532
 533 #ifdef TRE_WCHAR
 534   SAVE_PATTERN(pat, n, fg->wpattern, fg->wlen);
 535   STORE_MBS_PAT;
 536 #else
 537   SAVE_PATTERN(pat, n, fg->pattern, fg->len);
 538 #endif
 539
 540   DPRINT(("tre_compile_literal: pattern: %s, len %zu, icase: %c, word: %c, "
 541          "newline %c\n", fg->pattern, fg->len, fg->icase ? 'y' : 'n',
 542          fg->word ? 'y' : 'n', fg->newline ? 'y' : 'n'));
 543
 544   FILL_QSBC;
 545   FILL_BMGS;
 546 #ifdef TRE_WCHAR
 547   FILL_QSBC_WIDE;
 548   FILL_BMGS_WIDE;
 549 #endif
 550
 551   return REG_OK;
 552 }
 553
 554 /*
 555  * Returns: REG_OK on success, error code otherwise
 556  */
 557 int
 558 tre_compile_fast(fastmatch_t *fg, const tre_char_t *pat, size_t n,
 559                  int cflags)
 560 {
 561   tre_char_t *tmp;
 562   size_t pos = 0, hasdot = 0, whasdot = 0;
 563   ssize_t firstdot = -1, wfirstdot = -1;
 564   bool escaped = false;
 565   bool *_escmap = NULL;
 566
 567   INIT_COMP;
 568
 569   /* Remove beginning-of-line character ('^'). */
 570   if (pat[0] == TRE_CHAR('^'))
 571     {
 572       fg->bol = true;
 573       n--;
 574       pat++;
 575     }
 576
 577   CHECK_MATCHALL(false);
 578
 579   /* Handle word-boundary matching when GNU extensions are enabled */
 580   if ((cflags & REG_GNU) && (n >= 14) &&
 581       (memcmp(pat, TRE_CHAR("[[:<:]]"), 7 * sizeof(tre_char_t)) == 0) &&
 582       (memcmp(pat + n - 7, TRE_CHAR("[[:>:]]"),
 583               7 * sizeof(tre_char_t)) == 0))
 584     {
 585       n -= 14;
 586       pat += 7;
 587       fg->word = true;
 588     }
 589
 590   /* Cannot handle word boundaries with MB string */
 591   if (fg->word && (TRE_MB_CUR_MAX > 1))
 592     return REG_BADPAT;
 593
 594   tmp = malloc((n + 1) * sizeof(tre_char_t));
 595   if (tmp == NULL)
 596     return REG_ESPACE;
 597
 598 /* Copies the char into the stored pattern and skips to the next char. */
 599 #define STORE_CHAR                                                      \
 600   do                                                                    \
 601     {                                                                   \
 602       tmp[pos++] = pat[i];                                              \
 603       escaped = false;                                                  \
 604       continue;                                                         \
 605     } while (0)
 606
 607   /* Traverse the input pattern for processing */
 608   for (unsigned int i = 0; i < n; i++)
 609     {
 610       switch (pat[i])
 611         {
 612           case TRE_CHAR('\\'):
 613             if (escaped)
 614               STORE_CHAR;
 615             else if (i == n - 1)
 616               goto badpat;
 617             else
 618               escaped = true;
 619             continue;
 620           case TRE_CHAR('['):
 621             if (escaped)
 622               STORE_CHAR;
 623             else
 624               goto badpat;
 625             continue;
 626           case TRE_CHAR('*'):
 627             if (escaped || (!(cflags & REG_EXTENDED) && (i == 0)))
 628               STORE_CHAR;
 629             else
 630               goto badpat;
 631             continue;
 632           case TRE_CHAR('+'):
 633           case TRE_CHAR('?'):
 634             if ((cflags & REG_EXTENDED) && (i == 0))
 635               goto badpat;
 636             else if ((cflags & REG_EXTENDED) ^ !escaped)
 637               STORE_CHAR;
 638             else
 639               goto badpat;
 640             continue;
 641           case TRE_CHAR('.'):
 642             if (escaped)
 643               {
 644                 if (!_escmap)
 645                   _escmap = calloc(n, sizeof(bool));
 646                 if (!_escmap)
 647                   {
 648                     free(tmp);
 649                     return REG_ESPACE;
 650                   }
 651                 _escmap[i] = true;
 652                 STORE_CHAR;
 653               }
 654             else
 655               {
 656                 whasdot = i;
 657                 if (wfirstdot == -1)
 658                         wfirstdot = i;
 659                 STORE_CHAR;
 660               }
 661             continue;
 662           case TRE_CHAR('^'):
 663             STORE_CHAR;
 664             continue;
 665           case TRE_CHAR('$'):
 666             if (!escaped && (i == n - 1))
 667               fg->eol = true;
 668             else
 669               STORE_CHAR;
 670             continue;
 671           case TRE_CHAR('('):
 672             if ((cflags & REG_EXTENDED) ^ escaped)
 673               goto badpat;
 674             else
 675               STORE_CHAR;
 676             continue;
 677           case TRE_CHAR('{'):
 678             if (!(cflags & REG_EXTENDED) ^ escaped)
 679               STORE_CHAR;
 680             else if (!(cflags & REG_EXTENDED) && (i == 0))
 681               STORE_CHAR;
 682             else if ((cflags & REG_EXTENDED) && (i == 0))
 683               continue;
 684             else
 685               goto badpat;
 686             continue;
 687           case TRE_CHAR('|'):
 688             if ((cflags & REG_EXTENDED) ^ escaped)
 689               goto badpat;
 690             else
 691               STORE_CHAR;
 692             continue;
 693           default:
 694             if (escaped)
 695               goto badpat;
 696             else
 697               STORE_CHAR;
 698             continue;
 699         }
 700       continue;
 701 badpat:
 702       free(tmp);
 703       DPRINT(("tre_compile_fast: compilation of pattern failed, falling"
 704               "back to NFA\n"));
 705       return REG_BADPAT;
 706     }
 707
 708   fg->hasdot = wfirstdot > -1;
 709
 710   /*
 711    * The pattern has been processed and copied to tmp as a literal string
 712    * with escapes, anchors (^$) and the word boundary match character
 713    * classes stripped out.
 714    */
 715 #ifdef TRE_WCHAR
 716   SAVE_PATTERN(tmp, pos, fg->wpattern, fg->wlen);
 717   fg->wescmap = _escmap;
 718   STORE_MBS_PAT;
 719
 720   /*
 721    * The position of dots and escaped dots is different in the MB string
 722    * than in to the wide string so traverse the converted string, as well,
 723    * to store these positions.
 724    */
 725   if (fg->hasdot || (fg->wescmap != NULL))
 726     {
 727       if (fg->wescmap != NULL)
 728         {
 729           fg->escmap = calloc(fg->len, sizeof(bool));
 730           if (fg->escmap == NULL)
 731             {
 732               tre_free_fast(fg);
 733               return REG_ESPACE;
 734             }
 735         }
 736
 737       escaped = false;
 738       char *_checkpat = NULL;
 739       size_t _checklen = 0;
 740       unsigned int escofs = 0;
 741       /*
 742        * Make a copy here of the original pattern, because fg->pattern has
 743        * already been stripped of all escape sequences in the above processing.
 744        * This is necessary if we wish to later treat fg->escmap as an actual,
 745        * functional replacement of fg->wescmap.
 746        */
 747       CONV_MBS_PAT(pat, _checkpat, _checklen);
 748       for (unsigned int i = 0; i < n; i++)
 749         if (_checkpat[i] == '\\')
 750           {
 751             escaped = !escaped;
 752             if (escaped)
 753               ++escofs;
 754           }
 755         else if (_checkpat[i] == '.' && fg->escmap != NULL && escaped)
 756           {
 757             fg->escmap[i - escofs] = true;
 758             escaped = false;
 759           }
 760         else if (_checkpat[i] == '.' && !escaped)
 761           {
 762             hasdot = i;
 763             if (firstdot == -1)
 764               firstdot = i;
 765           }
 766         else
 767           escaped = false;
 768       free(_checkpat);
 769     }
 770 #else
 771   SAVE_PATTERN(tmp, pos, fg->pattern, fg->len);
 772   fg->escmap = _escmap;
 773 #endif
 774
 775   free(tmp);
 776
 777   DPRINT(("tre_compile_fast: pattern: %s, len %zu, bol %c, eol %c, "
 778          "icase: %c, word: %c, newline %c\n", fg->pattern, fg->len,
 779          fg->bol ? 'y' : 'n', fg->eol ? 'y' : 'n',
 780          fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n',
 781          fg->newline ? 'y' : 'n'));
 782
 783   /* Check whether reverse QS algorithm is more efficient */
 784   if ((wfirstdot > -1) && (fg->wlen - whasdot + 1 < (size_t)wfirstdot) &&
 785       fg->nosub)
 786     {
 787       fg->reversed = true;
 788       DPRINT(("tre_compile_fast: using reverse QS algorithm\n"));
 789     }
 790
 791   FILL_QSBC;
 792   FILL_BMGS;
 793 #ifdef TRE_WCHAR
 794   FILL_QSBC_WIDE;
 795   FILL_BMGS_WIDE;
 796 #endif
 797
 798   return REG_OK;
 799 }
 800
 801 #define _SHIFT_ONE                                                      \
 802   {                                                                     \
 803     shift = 1;                                                          \
 804     j = !fg->reversed ? j + shift : j - shift;                          \
 805     continue;                                                           \
 806   }
 807
 808 #define _BBOUND_COND                                                    \
 809   ((type == STR_WIDE) ?                                                 \
 810     ((j == 0) || !(tre_isalnum(str_wide[j - 1]) ||                      \
 811       (str_wide[j - 1] == TRE_CHAR('_')))) :                            \
 812     ((j == 0) || !(tre_isalnum(str_byte[j - 1]) ||                      \
 813       (str_byte[j - 1] == '_'))))
 814
 815 #define _EBOUND_COND                                                    \
 816   ((type == STR_WIDE) ?                                                 \
 817     ((j + fg->wlen == len) || !(tre_isalnum(str_wide[j + fg->wlen]) ||  \
 818       (str_wide[j + fg->wlen] == TRE_CHAR('_')))) :                     \
 819     ((j + fg->len == len) || !(tre_isalnum(str_byte[j + fg->len]) ||    \
 820       (str_byte[j + fg->len] == '_'))))
 821
 822 /*
 823  * Condition to check whether the match on position j is on a
 824  * word boundary.
 825  */
 826 #define IS_ON_WORD_BOUNDARY                                             \
 827   (_BBOUND_COND && _EBOUND_COND)
 828
 829 /*
 830  * Checks word boundary and shifts one if match is not on a
 831  * boundary.
 832  */
 833 #define CHECK_WORD_BOUNDARY                                             \
 834     if (!IS_ON_WORD_BOUNDARY)                                           \
 835       _SHIFT_ONE;
 836
 837 #define _BOL_COND                                                       \
 838   ((j == 0) || ((type == STR_WIDE) ? (str_wide[j - 1] == TRE_CHAR('\n'))\
 839                                    : (str_byte[j - 1] == '\n')))
 840
 841 /*
 842  * Checks BOL anchor and shifts one if match is not on a
 843  * boundary.
 844  */
 845 #define CHECK_BOL_ANCHOR                                                \
 846     if (!_BOL_COND)                                                     \
 847       _SHIFT_ONE;
 848
 849 #define _EOL_COND                                                       \
 850   ((type == STR_WIDE)                                                   \
 851     ? ((j + fg->wlen == len) ||                                         \
 852                 (str_wide[j + fg->wlen] == TRE_CHAR('\n')))             \
 853     : ((j + fg->len == len) || (str_byte[j + fg->wlen] == '\n')))
 854
 855 /*
 856  * Checks EOL anchor and shifts one if match is not on a
 857  * boundary.
 858  */
 859 #define CHECK_EOL_ANCHOR                                                \
 860     if (!_EOL_COND)                                                     \
 861       _SHIFT_ONE;
 862
 863 /*
 864  * Executes matching of the precompiled pattern on the input string.
 865  * Returns REG_OK or REG_NOMATCH depending on if we find a match or not.
 866  */
 867 int
 868 tre_match_fast(const fastmatch_t *fg, const void *data, size_t len,
 869     tre_str_type_t type, int nmatch, regmatch_t pmatch[], int eflags)
 870 {
 871   unsigned int shift, u = 0, v = 0;
 872   ssize_t j = 0;
 873   int ret = REG_NOMATCH;
 874   int mismatch;
 875   const char *str_byte = data;
 876   const void *startptr = NULL;
 877   const tre_char_t *str_wide = data;
 878
 879   /* Calculate length if unspecified. */
 880   if (len == (size_t)-1)
 881     switch (type)
 882       {
 883         case STR_WIDE:
 884           len = tre_strlen(str_wide);
 885           break;
 886         default:
 887           len = strlen(str_byte);
 888           break;
 889       }
 890
 891   /* Shortcut for empty pattern */
 892   if (fg->matchall)
 893     {
 894       if (!fg->nosub && nmatch >= 1)
 895         {
 896           pmatch[0].rm_so = 0;
 897           pmatch[0].rm_eo = len;
 898         }
 899       if (fg->bol && fg->eol)
 900         return (len == 0) ? REG_OK : REG_NOMATCH;
 901       else
 902         return REG_OK;
 903     }
 904
 905   /* No point in going farther if we do not have enough data. */
 906   switch (type)
 907     {
 908       case STR_WIDE:
 909         if (len < fg->wlen)
 910           return ret;
 911         shift = fg->wlen;
 912         break;
 913       default:
 914         if (len < fg->len)
 915           return ret;
 916         shift = fg->len;
 917     }
 918
 919   /*
 920    * REG_NOTBOL means not anchoring ^ to the beginning of the line, so we
 921    * can shift one because there can't be a match at the beginning.
 922    */
 923   if (fg->bol && (eflags & REG_NOTBOL))
 924     j = 1;
 925
 926   /*
 927    * Like above, we cannot have a match at the very end when anchoring to
 928    * the end and REG_NOTEOL is specified.
 929    */
 930   if (fg->eol && (eflags & REG_NOTEOL))
 931     len--;
 932
 933   if (fg->reversed)
 934     j = len - (type == STR_WIDE ? fg->wlen : fg->len);
 935
 936
 937   /* Only try once at the beginning or ending of the line. */
 938   if ((fg->bol || fg->eol) && !fg->newline && !(eflags & REG_NOTBOL) &&
 939       !(eflags & REG_NOTEOL))
 940     {
 941       /* Simple text comparison. */
 942       if (!((fg->bol && fg->eol) &&
 943           (type == STR_WIDE ? (len != fg->wlen) : (len != fg->len))))
 944         {
 945           /* Determine where in data to start search at. */
 946           j = fg->eol ? len - (type == STR_WIDE ? fg->wlen : fg->len) : 0;
 947           SKIP_CHARS(j);
 948           mismatch = fastcmp(fg, startptr, type);
 949           if (mismatch == REG_OK)
 950             {
 951               if (fg->word && !IS_ON_WORD_BOUNDARY)
 952                 return ret;
 953               if (!fg->nosub && nmatch >= 1)
 954                 {
 955                   pmatch[0].rm_so = j;
 956                   pmatch[0].rm_eo = j + (type == STR_WIDE ? fg->wlen : fg->len);
 957                 }
 958               return REG_OK;
 959             }
 960         }
 961     }
 962   else
 963     {
 964       /* Quick Search / Turbo Boyer-Moore algorithm. */
 965       do
 966         {
 967           SKIP_CHARS(j);
 968           mismatch = fastcmp(fg, startptr, type);
 969           if (mismatch == REG_OK)
 970             {
 971               if (fg->word)
 972                 CHECK_WORD_BOUNDARY;
 973               if (fg->bol)
 974                 CHECK_BOL_ANCHOR;
 975               if (fg->eol)
 976                 CHECK_EOL_ANCHOR;
 977               if (!fg->nosub && nmatch >= 1)
 978                 {
 979                   pmatch[0].rm_so = j;
 980                   pmatch[0].rm_eo = j + ((type == STR_WIDE) ? fg->wlen : fg->len);
 981                 }
 982               return REG_OK;
 983             }
 984           else if (mismatch > 0)
 985             return mismatch;
 986           mismatch = -mismatch - 1;
 987           SHIFT;
 988         } while (!IS_OUT_OF_BOUNDS);
 989     }
 990     return ret;
 991 }
 992
 993 /*
 994  * Frees the resources that were allocated when the pattern was compiled.
 995  */
 996 void
 997 tre_free_fast(fastmatch_t *fg)
 998 {
 999
1000   DPRINT(("tre_fast_free: freeing structures for pattern %s\n",
1001          fg->pattern));
1002
1003 #ifdef TRE_WCHAR
1004   hashtable_free(fg->qsBc_table);
1005   if (!fg->hasdot)
1006     free(fg->bmGs);
1007   if (fg->wescmap)
1008     free(fg->wescmap);
1009   free(fg->wpattern);
1010 #endif
1011   if (!fg->hasdot)
1012     free(fg->sbmGs);
1013   if (fg->escmap)
1014     free(fg->escmap);
1015   free(fg->pattern);
1016 }
1017
1018 /*
1019  * Returns:     -(i + 1) on failure (position that it failed with minus sign)
1020  *              error code on error
1021  *              REG_OK on success
1022  */
1023 static inline int
1024 fastcmp(const fastmatch_t *fg, const void *data, tre_str_type_t type)
1025 {
1026   const char *str_byte = data;
1027   const char *pat_byte = fg->pattern;
1028   const tre_char_t *str_wide = data;
1029   const tre_char_t *pat_wide = fg->wpattern;
1030   const bool *escmap = (type == STR_WIDE) ? fg->wescmap : fg->escmap;
1031   size_t len = (type == STR_WIDE) ? fg->wlen : fg->len;
1032   int ret = REG_OK;
1033
1034   /* Compare the pattern and the input char-by-char from the last position. */
1035   for (int i = len - 1; i >= 0; i--) {
1036     switch (type)
1037       {
1038         case STR_WIDE:
1039
1040           /* Check dot */
1041           if (fg->hasdot && pat_wide[i] == TRE_CHAR('.') &&
1042               (!escmap || !escmap[i]) &&
1043               (!fg->newline || (str_wide[i] != TRE_CHAR('\n'))))
1044             continue;
1045
1046           /* Compare */
1047           if (fg->icase ? (towlower(pat_wide[i]) == towlower(str_wide[i]))
1048                     : (pat_wide[i] == str_wide[i]))
1049             continue;
1050           break;
1051         default:
1052           /* Check dot */
1053           if (fg->hasdot && pat_byte[i] == '.' &&
1054               (!escmap || !escmap[i]) &&
1055               (!fg->newline || (str_byte[i] != '\n')))
1056             continue;
1057
1058           /* Compare */
1059           if (fg->icase ? (tolower((unsigned char)pat_byte[i]) == tolower((unsigned char)str_byte[i]))
1060                     : (pat_byte[i] == str_byte[i]))
1061           continue;
1062       }
1063     DPRINT(("fastcmp: mismatch at position %d\n", i));
1064     ret = -(i + 1);
1065     break;
1066   }
1067   return ret;
1068 }