kernel: print release number in boot banner
[unleashed.git] / bin / grep / regex / tre-fastmatch.c
blob6bf7c446529e823ab4cd19da1ec76089bb4bc40a
1 /* $FreeBSD$ */
3 /*-
4 * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
5 * Copyright (C) 2008-2011 Gabor Kovesdan <gabor@FreeBSD.org>
6 * All rights reserved.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
30 #include "glue.h"
32 #include <ctype.h>
33 #include <limits.h>
34 #include <regex.h>
35 #include <stdbool.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #ifdef TRE_WCHAR
39 #include <wchar.h>
40 #include <wctype.h>
41 #endif
43 #include "hashtable.h"
44 #include "tre-fastmatch.h"
46 static int fastcmp(const fastmatch_t *fg, const void *data,
47 tre_str_type_t type);
50 * Clean up if pattern compilation fails.
52 #define FAIL_COMP(errcode) \
53 { \
54 if (fg->pattern) \
55 free(fg->pattern); \
56 if (fg->wpattern) \
57 free(fg->wpattern); \
58 if (fg->qsBc_table) \
59 hashtable_free(fg->qsBc_table); \
60 fg = NULL; \
61 return errcode; \
65 * Skips n characters in the input string and assigns the start
66 * address to startptr. Note: as per IEEE Std 1003.1-2008
67 * matching is based on bit pattern not character representations
68 * so we can handle MB strings as byte sequences just like
69 * SB strings.
71 #define SKIP_CHARS(n) \
72 switch (type) \
73 { \
74 case STR_WIDE: \
75 startptr = str_wide + n; \
76 break; \
77 default: \
78 startptr = str_byte + n; \
82 * Converts the wide string pattern to SB/MB string and stores
83 * it in fg->pattern. Sets fg->len to the byte length of the
84 * converted string.
86 #define STORE_MBS_PAT \
87 { \
88 size_t siz; \
90 siz = wcstombs(NULL, fg->wpattern, 0); \
91 if (siz == (size_t)-1) \
92 return REG_BADPAT; \
93 fg->len = siz; \
94 fg->pattern = malloc(siz + 1); \
95 if (fg->pattern == NULL) \
96 return REG_ESPACE; \
97 wcstombs(fg->pattern, fg->wpattern, siz); \
98 fg->pattern[siz] = '\0'; \
99 } \
101 #define CONV_MBS_PAT(src, dest, destsz) \
103 destsz = wcstombs(NULL, src, 0); \
104 if (destsz == (size_t)-1) \
105 return REG_BADPAT; \
106 dest = malloc(destsz + 1); \
107 if (dest == NULL) \
108 return REG_ESPACE; \
109 wcstombs(dest, src, destsz); \
110 dest[destsz] = '\0'; \
113 #define IS_OUT_OF_BOUNDS \
114 ((!fg->reversed \
115 ? ((type == STR_WIDE) ? ((j + fg->wlen) > len) \
116 : ((j + fg->len) > len)) \
117 : (j < 0)))
120 * Checks whether the new position after shifting in the input string
121 * is out of the bounds and break out from the loop if so.
123 #define CHECKBOUNDS \
124 if (IS_OUT_OF_BOUNDS) \
125 break; \
128 * Shifts in the input string after a mismatch. The position of the
129 * mismatch is stored in the mismatch variable.
131 #define SHIFT \
132 CHECKBOUNDS; \
135 int r = -1; \
136 unsigned int bc = 0, gs = 0, ts; \
138 switch (type) \
140 case STR_WIDE: \
141 if (!fg->hasdot) \
143 if (u != 0 && (unsigned)mismatch == fg->wlen - 1 - shift) \
144 mismatch -= u; \
145 v = fg->wlen - 1 - mismatch; \
146 r = hashtable_get(fg->qsBc_table, \
147 &str_wide[!fg->reversed ? (size_t)j + fg->wlen \
148 : (size_t)j - 1], &bc); \
149 gs = fg->bmGs[mismatch]; \
151 bc = (r == HASH_OK) ? bc : fg->defBc; \
152 DPRINT(("tre_fast_match: mismatch on character" CHF ", " \
153 "BC %d, GS %d\n", \
154 ((const tre_char_t *)startptr)[mismatch + 1], \
155 bc, gs)); \
156 break; \
157 default: \
158 if (!fg->hasdot) \
160 if (u != 0 && (unsigned)mismatch == fg->len - 1 - shift) \
161 mismatch -= u; \
162 v = fg->len - 1 - mismatch; \
163 gs = fg->sbmGs[mismatch]; \
165 bc = fg->qsBc[((const unsigned char *)str_byte) \
166 [!fg->reversed ? (size_t)j + fg->len \
167 : (size_t)j - 1]]; \
168 DPRINT(("tre_fast_match: mismatch on character %c, " \
169 "BC %d, GS %d\n", \
170 ((const unsigned char *)startptr)[mismatch + 1], \
171 bc, gs)); \
173 if (fg->hasdot) \
174 shift = bc; \
175 else \
177 ts = (u >= v) ? (u - v) : 0; \
178 shift = MAX(ts, bc); \
179 shift = MAX(shift, gs); \
180 if (shift == gs) \
181 u = MIN((type == STR_WIDE ? fg->wlen : fg->len) - shift, v); \
182 else \
184 if (ts < bc) \
185 shift = MAX(shift, u + 1); \
186 u = 0; \
189 DPRINT(("tre_fast_match: shifting %u characters\n", shift)); \
190 j = !fg->reversed ? j + shift : j - shift; \
194 * Normal Quick Search would require a shift based on the position the
195 * next character after the comparison is within the pattern. With
196 * wildcards, the position of the last dot effects the maximum shift
197 * distance.
198 * The closer to the end the wild card is the slower the search.
200 * Examples:
201 * Pattern Max shift
202 * ------- ---------
203 * this 5
204 * .his 4
205 * t.is 3
206 * th.s 2
207 * thi. 1
211 * Fills in the bad character shift array for SB/MB strings.
213 #define FILL_QSBC \
214 if (fg->reversed) \
216 _FILL_QSBC_REVERSED \
218 else \
220 _FILL_QSBC \
223 #define _FILL_QSBC \
224 for (unsigned int i = 0; i <= UCHAR_MAX; i++) \
225 fg->qsBc[i] = fg->len - hasdot; \
226 for (unsigned int i = hasdot + 1; i < fg->len; i++) \
228 fg->qsBc[(unsigned char)fg->pattern[i]] = fg->len - i; \
229 DPRINT(("BC shift for char %c is %zu\n", fg->pattern[i], \
230 fg->len - i)); \
231 if (fg->icase) \
233 char c = islower((unsigned char)fg->pattern[i]) ? \
234 toupper((unsigned char)fg->pattern[i]) : \
235 tolower((unsigned char)fg->pattern[i]); \
236 fg->qsBc[(unsigned char)c] = fg->len - i; \
237 DPRINT(("BC shift for char %c is %zu\n", c, fg->len - i)); \
241 #define _FILL_QSBC_REVERSED \
242 for (unsigned int i = 0; i <= UCHAR_MAX; i++) \
243 fg->qsBc[i] = firstdot + 1; \
244 for (int i = firstdot - 1; i >= 0; i--) \
246 fg->qsBc[(unsigned char)fg->pattern[i]] = i + 1; \
247 DPRINT(("Reverse BC shift for char %c is %d\n", fg->pattern[i], \
248 i + 1)); \
249 if (fg->icase) \
251 char c = islower((unsigned char)fg->pattern[i]) ? \
252 toupper((unsigned char)fg->pattern[i]) : \
253 tolower((unsigned char)fg->pattern[i]); \
254 fg->qsBc[(unsigned char)c] = i + 1; \
255 DPRINT(("Reverse BC shift for char %c is %d\n", c, i + 1)); \
260 * Fills in the bad character shifts into a hastable for wide strings.
261 * With wide characters it is not possible any more to use a normal
262 * array because there are too many characters and we could not
263 * provide enough memory. Fortunately, we only have to store distinct
264 * values for so many characters as the number of distinct characters
265 * in the pattern, so we can store them in a hashtable and store a
266 * default shift value for the rest.
268 #define FILL_QSBC_WIDE \
269 if (fg->reversed) \
271 _FILL_QSBC_WIDE_REVERSED \
273 else \
275 _FILL_QSBC_WIDE \
278 #define _FILL_QSBC_WIDE \
279 /* Adjust the shift based on location of the last dot ('.'). */ \
280 fg->defBc = fg->wlen - whasdot; \
282 /* Preprocess pattern. */ \
283 fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4), \
284 sizeof(tre_char_t), sizeof(int)); \
285 if (!fg->qsBc_table) \
286 FAIL_COMP(REG_ESPACE); \
287 for (unsigned int i = whasdot + 1; i < fg->wlen; i++) \
289 int k = fg->wlen - i; \
290 int r; \
292 r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \
293 if ((r == HASH_FAIL) || (r == HASH_FULL)) \
294 FAIL_COMP(REG_ESPACE); \
295 DPRINT(("BC shift for wide char " CHF " is %d\n", fg->wpattern[i],\
296 k)); \
297 if (fg->icase) \
299 tre_char_t wc = iswlower(fg->wpattern[i]) ? \
300 towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]); \
301 r = hashtable_put(fg->qsBc_table, &wc, &k); \
302 if ((r == HASH_FAIL) || (r == HASH_FULL)) \
303 FAIL_COMP(REG_ESPACE); \
304 DPRINT(("BC shift for wide char " CHF " is %d\n", wc, k)); \
308 #define _FILL_QSBC_WIDE_REVERSED \
309 /* Adjust the shift based on location of the last dot ('.'). */ \
310 fg->defBc = (size_t)wfirstdot; \
312 /* Preprocess pattern. */ \
313 fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4), \
314 sizeof(tre_char_t), sizeof(int)); \
315 if (!fg->qsBc_table) \
316 FAIL_COMP(REG_ESPACE); \
317 for (int i = wfirstdot - 1; i >= 0; i--) \
319 int k = i + 1; \
320 int r; \
322 r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \
323 if ((r == HASH_FAIL) || (r == HASH_FULL)) \
324 FAIL_COMP(REG_ESPACE); \
325 DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", \
326 fg->wpattern[i], k)); \
327 if (fg->icase) \
329 tre_char_t wc = iswlower(fg->wpattern[i]) ? \
330 towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]); \
331 r = hashtable_put(fg->qsBc_table, &wc, &k); \
332 if ((r == HASH_FAIL) || (r == HASH_FULL)) \
333 FAIL_COMP(REG_ESPACE); \
334 DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", wc, \
335 k)); \
339 #ifdef _GREP_DEBUG
340 #define DPRINT_BMGS(len, fmt_str, sh) \
341 for (unsigned int i = 0; i < len; i++) \
342 DPRINT((fmt_str, i, sh[i]));
343 #else
344 #define DPRINT_BMGS(len, fmt_str, sh) \
345 do { } while(/*CONSTCOND*/0)
346 #endif
349 * Fills in the good suffix table for SB/MB strings.
351 #define FILL_BMGS \
352 if (fg->len > 0 && !fg->hasdot) \
354 fg->sbmGs = malloc(fg->len * sizeof(*fg->sbmGs)); \
355 if (!fg->sbmGs) \
356 return REG_ESPACE; \
357 if (fg->len == 1) \
358 fg->sbmGs[0] = 1; \
359 else \
360 _FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false); \
361 DPRINT_BMGS(fg->len, "GS shift for pos %d is %d\n", fg->sbmGs); \
365 * Fills in the good suffix table for wide strings.
367 #define FILL_BMGS_WIDE \
368 if (fg->wlen > 0 && !fg->hasdot) \
370 fg->bmGs = malloc(fg->wlen * sizeof(*fg->bmGs)); \
371 if (!fg->bmGs) \
372 return REG_ESPACE; \
373 if (fg->wlen == 1) \
374 fg->bmGs[0] = 1; \
375 else \
376 _FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true); \
377 DPRINT_BMGS(fg->wlen, "GS shift (wide) for pos %d is %d\n", \
378 fg->bmGs); \
381 #define _FILL_BMGS(arr, pat, plen, wide) \
383 char *p; \
384 tre_char_t *wp; \
386 if (wide) \
388 if (fg->icase) \
390 wp = malloc(plen * sizeof(tre_char_t)); \
391 if (wp == NULL) \
392 return REG_ESPACE; \
393 for (unsigned int i = 0; i < plen; i++) \
394 wp[i] = towlower(pat[i]); \
395 _CALC_BMGS(arr, wp, plen); \
396 free(wp); \
398 else \
399 _CALC_BMGS(arr, pat, plen); \
401 else \
403 if (fg->icase) \
405 p = malloc(plen); \
406 if (p == NULL) \
407 return REG_ESPACE; \
408 for (unsigned int i = 0; i < plen; i++) \
409 p[i] = tolower((unsigned char)pat[i]); \
410 _CALC_BMGS(arr, p, plen); \
411 free(p); \
413 else \
414 _CALC_BMGS(arr, pat, plen); \
418 #define _CALC_BMGS(arr, pat, plen) \
420 int f = 0, g; \
422 int *suff = malloc(plen * sizeof(int)); \
423 if (suff == NULL) \
424 return REG_ESPACE; \
426 suff[plen - 1] = plen; \
427 g = plen - 1; \
428 for (int i = plen - 2; i >= 0; i--) \
430 if (i > g && suff[i + plen - 1 - f] < i - g) \
431 suff[i] = suff[i + plen - 1 - f]; \
432 else \
434 if (i < g) \
435 g = i; \
436 f = i; \
437 while (g >= 0 && pat[g] == pat[g + plen - 1 - f]) \
438 g--; \
439 suff[i] = f - g; \
443 for (unsigned int i = 0; i < plen; i++) \
444 arr[i] = plen; \
445 g = 0; \
446 for (int i = plen - 1; i >= 0; i--) \
447 if (suff[i] == i + 1) \
448 for(; (unsigned long)g < plen - 1 - i; g++) \
449 if (arr[g] == plen) \
450 arr[g] = plen - 1 - i; \
451 for (unsigned int i = 0; i <= plen - 2; i++) \
452 arr[plen - 1 - suff[i]] = plen - 1 - i; \
454 free(suff); \
458 * Copies the pattern pat having length n to p and stores
459 * the size in l.
461 #define SAVE_PATTERN(src, srclen, dst, dstlen) \
462 dstlen = srclen; \
463 dst = malloc((dstlen + 1) * sizeof(tre_char_t)); \
464 if (dst == NULL) \
465 return REG_ESPACE; \
466 if (dstlen > 0) \
467 memcpy(dst, src, dstlen * sizeof(tre_char_t)); \
468 dst[dstlen] = TRE_CHAR('\0');
471 * Initializes pattern compiling.
473 #define INIT_COMP \
474 /* Initialize. */ \
475 memset(fg, 0, sizeof(*fg)); \
476 fg->icase = (cflags & REG_ICASE); \
477 fg->word = (cflags & REG_WORD); \
478 fg->newline = (cflags & REG_NEWLINE); \
479 fg->nosub = (cflags & REG_NOSUB); \
481 /* Cannot handle REG_ICASE with MB string */ \
482 if (fg->icase && (TRE_MB_CUR_MAX > 1) && n > 0) \
484 DPRINT(("Cannot use fast matcher for MBS with REG_ICASE\n")); \
485 return REG_BADPAT; \
489 * Checks whether we have a 0-length pattern that will match
490 * anything. If literal is set to false, the EOL anchor is also
491 * taken into account.
493 #define CHECK_MATCHALL(literal) \
494 if (!literal && n == 1 && pat[0] == TRE_CHAR('$')) \
496 n--; \
497 fg->eol = true; \
500 if (n == 0) \
502 fg->matchall = true; \
503 fg->pattern = malloc(sizeof(char)); \
504 if (!fg->pattern) \
505 FAIL_COMP(REG_ESPACE); \
506 fg->pattern[0] = '\0'; \
507 fg->wpattern = malloc(sizeof(tre_char_t)); \
508 if (!fg->wpattern) \
509 FAIL_COMP(REG_ESPACE); \
510 fg->wpattern[0] = TRE_CHAR('\0'); \
511 DPRINT(("Matching every input\n")); \
512 return REG_OK; \
516 * Returns: REG_OK on success, error code otherwise
519 tre_compile_literal(fastmatch_t *fg, const tre_char_t *pat, size_t n,
520 int cflags)
522 size_t hasdot = 0, whasdot = 0;
523 ssize_t firstdot = -1, wfirstdot = -1;
525 INIT_COMP;
527 CHECK_MATCHALL(true);
529 /* Cannot handle word boundaries with MB string */
530 if (fg->word && (TRE_MB_CUR_MAX > 1))
531 return REG_BADPAT;
533 #ifdef TRE_WCHAR
534 SAVE_PATTERN(pat, n, fg->wpattern, fg->wlen);
535 STORE_MBS_PAT;
536 #else
537 SAVE_PATTERN(pat, n, fg->pattern, fg->len);
538 #endif
540 DPRINT(("tre_compile_literal: pattern: %s, len %zu, icase: %c, word: %c, "
541 "newline %c\n", fg->pattern, fg->len, fg->icase ? 'y' : 'n',
542 fg->word ? 'y' : 'n', fg->newline ? 'y' : 'n'));
544 FILL_QSBC;
545 FILL_BMGS;
546 #ifdef TRE_WCHAR
547 FILL_QSBC_WIDE;
548 FILL_BMGS_WIDE;
549 #endif
551 return REG_OK;
555 * Returns: REG_OK on success, error code otherwise
558 tre_compile_fast(fastmatch_t *fg, const tre_char_t *pat, size_t n,
559 int cflags)
561 tre_char_t *tmp;
562 size_t pos = 0, hasdot = 0, whasdot = 0;
563 ssize_t firstdot = -1, wfirstdot = -1;
564 bool escaped = false;
565 bool *_escmap = NULL;
567 INIT_COMP;
569 /* Remove beginning-of-line character ('^'). */
570 if (pat[0] == TRE_CHAR('^'))
572 fg->bol = true;
573 n--;
574 pat++;
577 CHECK_MATCHALL(false);
579 /* Handle word-boundary matching when GNU extensions are enabled */
580 if ((cflags & REG_GNU) && (n >= 14) &&
581 (memcmp(pat, TRE_CHAR("[[:<:]]"), 7 * sizeof(tre_char_t)) == 0) &&
582 (memcmp(pat + n - 7, TRE_CHAR("[[:>:]]"),
583 7 * sizeof(tre_char_t)) == 0))
585 n -= 14;
586 pat += 7;
587 fg->word = true;
590 /* Cannot handle word boundaries with MB string */
591 if (fg->word && (TRE_MB_CUR_MAX > 1))
592 return REG_BADPAT;
594 tmp = malloc((n + 1) * sizeof(tre_char_t));
595 if (tmp == NULL)
596 return REG_ESPACE;
598 /* Copies the char into the stored pattern and skips to the next char. */
599 #define STORE_CHAR \
600 do \
602 tmp[pos++] = pat[i]; \
603 escaped = false; \
604 continue; \
605 } while (0)
607 /* Traverse the input pattern for processing */
608 for (unsigned int i = 0; i < n; i++)
610 switch (pat[i])
612 case TRE_CHAR('\\'):
613 if (escaped)
614 STORE_CHAR;
615 else if (i == n - 1)
616 goto badpat;
617 else
618 escaped = true;
619 continue;
620 case TRE_CHAR('['):
621 if (escaped)
622 STORE_CHAR;
623 else
624 goto badpat;
625 continue;
626 case TRE_CHAR('*'):
627 if (escaped || (!(cflags & REG_EXTENDED) && (i == 0)))
628 STORE_CHAR;
629 else
630 goto badpat;
631 continue;
632 case TRE_CHAR('+'):
633 case TRE_CHAR('?'):
634 if ((cflags & REG_EXTENDED) && (i == 0))
635 goto badpat;
636 else if ((cflags & REG_EXTENDED) ^ !escaped)
637 STORE_CHAR;
638 else
639 goto badpat;
640 continue;
641 case TRE_CHAR('.'):
642 if (escaped)
644 if (!_escmap)
645 _escmap = calloc(n, sizeof(bool));
646 if (!_escmap)
648 free(tmp);
649 return REG_ESPACE;
651 _escmap[i] = true;
652 STORE_CHAR;
654 else
656 whasdot = i;
657 if (wfirstdot == -1)
658 wfirstdot = i;
659 STORE_CHAR;
661 continue;
662 case TRE_CHAR('^'):
663 STORE_CHAR;
664 continue;
665 case TRE_CHAR('$'):
666 if (!escaped && (i == n - 1))
667 fg->eol = true;
668 else
669 STORE_CHAR;
670 continue;
671 case TRE_CHAR('('):
672 if ((cflags & REG_EXTENDED) ^ escaped)
673 goto badpat;
674 else
675 STORE_CHAR;
676 continue;
677 case TRE_CHAR('{'):
678 if (!(cflags & REG_EXTENDED) ^ escaped)
679 STORE_CHAR;
680 else if (!(cflags & REG_EXTENDED) && (i == 0))
681 STORE_CHAR;
682 else if ((cflags & REG_EXTENDED) && (i == 0))
683 continue;
684 else
685 goto badpat;
686 continue;
687 case TRE_CHAR('|'):
688 if ((cflags & REG_EXTENDED) ^ escaped)
689 goto badpat;
690 else
691 STORE_CHAR;
692 continue;
693 default:
694 if (escaped)
695 goto badpat;
696 else
697 STORE_CHAR;
698 continue;
700 continue;
701 badpat:
702 free(tmp);
703 DPRINT(("tre_compile_fast: compilation of pattern failed, falling"
704 "back to NFA\n"));
705 return REG_BADPAT;
708 fg->hasdot = wfirstdot > -1;
711 * The pattern has been processed and copied to tmp as a literal string
712 * with escapes, anchors (^$) and the word boundary match character
713 * classes stripped out.
715 #ifdef TRE_WCHAR
716 SAVE_PATTERN(tmp, pos, fg->wpattern, fg->wlen);
717 fg->wescmap = _escmap;
718 STORE_MBS_PAT;
721 * The position of dots and escaped dots is different in the MB string
722 * than in to the wide string so traverse the converted string, as well,
723 * to store these positions.
725 if (fg->hasdot || (fg->wescmap != NULL))
727 if (fg->wescmap != NULL)
729 fg->escmap = calloc(fg->len, sizeof(bool));
730 if (fg->escmap == NULL)
732 tre_free_fast(fg);
733 return REG_ESPACE;
737 escaped = false;
738 char *_checkpat = NULL;
739 size_t _checklen = 0;
740 unsigned int escofs = 0;
742 * Make a copy here of the original pattern, because fg->pattern has
743 * already been stripped of all escape sequences in the above processing.
744 * This is necessary if we wish to later treat fg->escmap as an actual,
745 * functional replacement of fg->wescmap.
747 CONV_MBS_PAT(pat, _checkpat, _checklen);
748 for (unsigned int i = 0; i < n; i++)
749 if (_checkpat[i] == '\\')
751 escaped = !escaped;
752 if (escaped)
753 ++escofs;
755 else if (_checkpat[i] == '.' && fg->escmap != NULL && escaped)
757 fg->escmap[i - escofs] = true;
758 escaped = false;
760 else if (_checkpat[i] == '.' && !escaped)
762 hasdot = i;
763 if (firstdot == -1)
764 firstdot = i;
766 else
767 escaped = false;
768 free(_checkpat);
770 #else
771 SAVE_PATTERN(tmp, pos, fg->pattern, fg->len);
772 fg->escmap = _escmap;
773 #endif
775 free(tmp);
777 DPRINT(("tre_compile_fast: pattern: %s, len %zu, bol %c, eol %c, "
778 "icase: %c, word: %c, newline %c\n", fg->pattern, fg->len,
779 fg->bol ? 'y' : 'n', fg->eol ? 'y' : 'n',
780 fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n',
781 fg->newline ? 'y' : 'n'));
783 /* Check whether reverse QS algorithm is more efficient */
784 if ((wfirstdot > -1) && (fg->wlen - whasdot + 1 < (size_t)wfirstdot) &&
785 fg->nosub)
787 fg->reversed = true;
788 DPRINT(("tre_compile_fast: using reverse QS algorithm\n"));
791 FILL_QSBC;
792 FILL_BMGS;
793 #ifdef TRE_WCHAR
794 FILL_QSBC_WIDE;
795 FILL_BMGS_WIDE;
796 #endif
798 return REG_OK;
801 #define _SHIFT_ONE \
803 shift = 1; \
804 j = !fg->reversed ? j + shift : j - shift; \
805 continue; \
808 #define _BBOUND_COND \
809 ((type == STR_WIDE) ? \
810 ((j == 0) || !(tre_isalnum(str_wide[j - 1]) || \
811 (str_wide[j - 1] == TRE_CHAR('_')))) : \
812 ((j == 0) || !(tre_isalnum(str_byte[j - 1]) || \
813 (str_byte[j - 1] == '_'))))
815 #define _EBOUND_COND \
816 ((type == STR_WIDE) ? \
817 ((j + fg->wlen == len) || !(tre_isalnum(str_wide[j + fg->wlen]) || \
818 (str_wide[j + fg->wlen] == TRE_CHAR('_')))) : \
819 ((j + fg->len == len) || !(tre_isalnum(str_byte[j + fg->len]) || \
820 (str_byte[j + fg->len] == '_'))))
823 * Condition to check whether the match on position j is on a
824 * word boundary.
826 #define IS_ON_WORD_BOUNDARY \
827 (_BBOUND_COND && _EBOUND_COND)
830 * Checks word boundary and shifts one if match is not on a
831 * boundary.
833 #define CHECK_WORD_BOUNDARY \
834 if (!IS_ON_WORD_BOUNDARY) \
835 _SHIFT_ONE;
837 #define _BOL_COND \
838 ((j == 0) || ((type == STR_WIDE) ? (str_wide[j - 1] == TRE_CHAR('\n'))\
839 : (str_byte[j - 1] == '\n')))
842 * Checks BOL anchor and shifts one if match is not on a
843 * boundary.
845 #define CHECK_BOL_ANCHOR \
846 if (!_BOL_COND) \
847 _SHIFT_ONE;
849 #define _EOL_COND \
850 ((type == STR_WIDE) \
851 ? ((j + fg->wlen == len) || \
852 (str_wide[j + fg->wlen] == TRE_CHAR('\n'))) \
853 : ((j + fg->len == len) || (str_byte[j + fg->wlen] == '\n')))
856 * Checks EOL anchor and shifts one if match is not on a
857 * boundary.
859 #define CHECK_EOL_ANCHOR \
860 if (!_EOL_COND) \
861 _SHIFT_ONE;
864 * Executes matching of the precompiled pattern on the input string.
865 * Returns REG_OK or REG_NOMATCH depending on if we find a match or not.
868 tre_match_fast(const fastmatch_t *fg, const void *data, size_t len,
869 tre_str_type_t type, int nmatch, regmatch_t pmatch[], int eflags)
871 unsigned int shift, u = 0, v = 0;
872 ssize_t j = 0;
873 int ret = REG_NOMATCH;
874 int mismatch;
875 const char *str_byte = data;
876 const void *startptr = NULL;
877 const tre_char_t *str_wide = data;
879 /* Calculate length if unspecified. */
880 if (len == (size_t)-1)
881 switch (type)
883 case STR_WIDE:
884 len = tre_strlen(str_wide);
885 break;
886 default:
887 len = strlen(str_byte);
888 break;
891 /* Shortcut for empty pattern */
892 if (fg->matchall)
894 if (!fg->nosub && nmatch >= 1)
896 pmatch[0].rm_so = 0;
897 pmatch[0].rm_eo = len;
899 if (fg->bol && fg->eol)
900 return (len == 0) ? REG_OK : REG_NOMATCH;
901 else
902 return REG_OK;
905 /* No point in going farther if we do not have enough data. */
906 switch (type)
908 case STR_WIDE:
909 if (len < fg->wlen)
910 return ret;
911 shift = fg->wlen;
912 break;
913 default:
914 if (len < fg->len)
915 return ret;
916 shift = fg->len;
920 * REG_NOTBOL means not anchoring ^ to the beginning of the line, so we
921 * can shift one because there can't be a match at the beginning.
923 if (fg->bol && (eflags & REG_NOTBOL))
924 j = 1;
927 * Like above, we cannot have a match at the very end when anchoring to
928 * the end and REG_NOTEOL is specified.
930 if (fg->eol && (eflags & REG_NOTEOL))
931 len--;
933 if (fg->reversed)
934 j = len - (type == STR_WIDE ? fg->wlen : fg->len);
937 /* Only try once at the beginning or ending of the line. */
938 if ((fg->bol || fg->eol) && !fg->newline && !(eflags & REG_NOTBOL) &&
939 !(eflags & REG_NOTEOL))
941 /* Simple text comparison. */
942 if (!((fg->bol && fg->eol) &&
943 (type == STR_WIDE ? (len != fg->wlen) : (len != fg->len))))
945 /* Determine where in data to start search at. */
946 j = fg->eol ? len - (type == STR_WIDE ? fg->wlen : fg->len) : 0;
947 SKIP_CHARS(j);
948 mismatch = fastcmp(fg, startptr, type);
949 if (mismatch == REG_OK)
951 if (fg->word && !IS_ON_WORD_BOUNDARY)
952 return ret;
953 if (!fg->nosub && nmatch >= 1)
955 pmatch[0].rm_so = j;
956 pmatch[0].rm_eo = j + (type == STR_WIDE ? fg->wlen : fg->len);
958 return REG_OK;
962 else
964 /* Quick Search / Turbo Boyer-Moore algorithm. */
967 SKIP_CHARS(j);
968 mismatch = fastcmp(fg, startptr, type);
969 if (mismatch == REG_OK)
971 if (fg->word)
972 CHECK_WORD_BOUNDARY;
973 if (fg->bol)
974 CHECK_BOL_ANCHOR;
975 if (fg->eol)
976 CHECK_EOL_ANCHOR;
977 if (!fg->nosub && nmatch >= 1)
979 pmatch[0].rm_so = j;
980 pmatch[0].rm_eo = j + ((type == STR_WIDE) ? fg->wlen : fg->len);
982 return REG_OK;
984 else if (mismatch > 0)
985 return mismatch;
986 mismatch = -mismatch - 1;
987 SHIFT;
988 } while (!IS_OUT_OF_BOUNDS);
990 return ret;
994 * Frees the resources that were allocated when the pattern was compiled.
996 void
997 tre_free_fast(fastmatch_t *fg)
1000 DPRINT(("tre_fast_free: freeing structures for pattern %s\n",
1001 fg->pattern));
1003 #ifdef TRE_WCHAR
1004 hashtable_free(fg->qsBc_table);
1005 if (!fg->hasdot)
1006 free(fg->bmGs);
1007 if (fg->wescmap)
1008 free(fg->wescmap);
1009 free(fg->wpattern);
1010 #endif
1011 if (!fg->hasdot)
1012 free(fg->sbmGs);
1013 if (fg->escmap)
1014 free(fg->escmap);
1015 free(fg->pattern);
1019 * Returns: -(i + 1) on failure (position that it failed with minus sign)
1020 * error code on error
1021 * REG_OK on success
1023 static inline int
1024 fastcmp(const fastmatch_t *fg, const void *data, tre_str_type_t type)
1026 const char *str_byte = data;
1027 const char *pat_byte = fg->pattern;
1028 const tre_char_t *str_wide = data;
1029 const tre_char_t *pat_wide = fg->wpattern;
1030 const bool *escmap = (type == STR_WIDE) ? fg->wescmap : fg->escmap;
1031 size_t len = (type == STR_WIDE) ? fg->wlen : fg->len;
1032 int ret = REG_OK;
1034 /* Compare the pattern and the input char-by-char from the last position. */
1035 for (int i = len - 1; i >= 0; i--) {
1036 switch (type)
1038 case STR_WIDE:
1040 /* Check dot */
1041 if (fg->hasdot && pat_wide[i] == TRE_CHAR('.') &&
1042 (!escmap || !escmap[i]) &&
1043 (!fg->newline || (str_wide[i] != TRE_CHAR('\n'))))
1044 continue;
1046 /* Compare */
1047 if (fg->icase ? (towlower(pat_wide[i]) == towlower(str_wide[i]))
1048 : (pat_wide[i] == str_wide[i]))
1049 continue;
1050 break;
1051 default:
1052 /* Check dot */
1053 if (fg->hasdot && pat_byte[i] == '.' &&
1054 (!escmap || !escmap[i]) &&
1055 (!fg->newline || (str_byte[i] != '\n')))
1056 continue;
1058 /* Compare */
1059 if (fg->icase ? (tolower((unsigned char)pat_byte[i]) == tolower((unsigned char)str_byte[i]))
1060 : (pat_byte[i] == str_byte[i]))
1061 continue;
1063 DPRINT(("fastcmp: mismatch at position %d\n", i));
1064 ret = -(i + 1);
1065 break;
1067 return ret;