IMPORT openssh-9.8p1
[dragonfly.git] / contrib / grep / src / kwset.c
blob403af7e8d9f0d78022f2b12f38ee751db025a4a2
1 /* kwset.c - search for any of a set of keywords.
2 Copyright (C) 1989, 1998, 2000, 2005, 2007, 2009-2020 Free Software
3 Foundation, Inc.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
18 02110-1301, USA. */
20 /* Written August 1989 by Mike Haertel. */
22 /* For the Aho-Corasick algorithm, see:
23 Aho AV, Corasick MJ. Efficient string matching: an aid to
24 bibliographic search. CACM 18, 6 (1975), 333-40
25 <https://dx.doi.org/10.1145/360825.360855>, which describes the
26 failure function used below.
28 For the Boyer-Moore algorithm, see: Boyer RS, Moore JS.
29 A fast string searching algorithm. CACM 20, 10 (1977), 762-72
30 <https://dx.doi.org/10.1145/359842.359859>.
32 For a survey of more-recent string matching algorithms that might
33 help improve performance, see: Faro S, Lecroq T. The exact online
34 string matching problem: a review of the most recent results.
35 ACM Computing Surveys 45, 2 (2013), 13
36 <https://dx.doi.org/10.1145/2431211.2431212>. */
38 #include <config.h>
40 #include "kwset.h"
42 #include <stdint.h>
43 #include <sys/types.h>
44 #include "system.h"
45 #include "intprops.h"
46 #include "memchr2.h"
47 #include "obstack.h"
48 #include "xalloc.h"
49 #include "verify.h"
51 #define obstack_chunk_alloc xmalloc
52 #define obstack_chunk_free free
54 static unsigned char
55 U (char ch)
57 return to_uchar (ch);
60 /* Balanced tree of edges and labels leaving a given trie node. */
61 struct tree
63 struct tree *llink; /* Left link; MUST be first field. */
64 struct tree *rlink; /* Right link (to larger labels). */
65 struct trie *trie; /* Trie node pointed to by this edge. */
66 unsigned char label; /* Label on this edge. */
67 char balance; /* Difference in depths of subtrees. */
70 /* Node of a trie representing a set of keywords. */
71 struct trie
73 /* If an accepting node, this is either 2*W + 1 where W is the word
74 index, or is SIZE_MAX if Aho-Corasick is in use and FAIL
75 specifies where to look for more info. If not an accepting node,
76 this is zero. */
77 size_t accepting;
79 struct tree *links; /* Tree of edges leaving this node. */
80 struct trie *parent; /* Parent of this node. */
81 struct trie *next; /* List of all trie nodes in level order. */
82 struct trie *fail; /* Aho-Corasick failure function. */
83 ptrdiff_t depth; /* Depth of this node from the root. */
84 ptrdiff_t shift; /* Shift function for search failures. */
85 ptrdiff_t maxshift; /* Max shift of self and descendants. */
88 /* Structure returned opaquely to the caller, containing everything. */
89 struct kwset
91 struct obstack obstack; /* Obstack for node allocation. */
92 ptrdiff_t words; /* Number of words in the trie. */
93 struct trie *trie; /* The trie itself. */
94 ptrdiff_t mind; /* Minimum depth of an accepting node. */
95 ptrdiff_t maxd; /* Maximum depth of any node. */
96 unsigned char delta[NCHAR]; /* Delta table for rapid search. */
97 struct trie *next[NCHAR]; /* Table of children of the root. */
98 char *target; /* Target string if there's only one. */
99 ptrdiff_t *shift; /* Used in Boyer-Moore search for one
100 string. */
101 char const *trans; /* Character translation table. */
103 /* This helps to match a terminal byte, which is the first byte
104 for Aho-Corasick, and the last byte for Boyer-More. If all the
105 patterns have the same terminal byte (after translation via TRANS
106 if TRANS is nonnull), then this is that byte as an unsigned char.
107 Otherwise this is -1 if there is disagreement among the strings
108 about terminal bytes, and -2 if there are no terminal bytes and
109 no disagreement because all the patterns are empty. */
110 int gc1;
112 /* This helps to match a terminal byte. If 0 <= GC1HELP, B is
113 terminal when B == GC1 || B == GC1HELP (note that GC1 == GCHELP
114 is common here). This is typically faster than evaluating
115 to_uchar (TRANS[B]) == GC1. */
116 int gc1help;
118 /* If the string has two or more bytes, this is the penultimate byte,
119 after translation via TRANS if TRANS is nonnull. This variable
120 is used only by Boyer-Moore. */
121 char gc2;
123 /* kwsexec implementation. */
124 ptrdiff_t (*kwsexec) (kwset_t, char const *, ptrdiff_t,
125 struct kwsmatch *, bool);
128 /* Use TRANS to transliterate C. A null TRANS does no transliteration. */
129 static inline char
130 tr (char const *trans, char c)
132 return trans ? trans[U(c)] : c;
135 static ptrdiff_t acexec (kwset_t, char const *, ptrdiff_t,
136 struct kwsmatch *, bool);
137 static ptrdiff_t bmexec (kwset_t, char const *, ptrdiff_t,
138 struct kwsmatch *, bool);
140 /* Return a newly allocated keyword set. A nonnull TRANS specifies a
141 table of character translations to be applied to all pattern and
142 search text. */
143 kwset_t
144 kwsalloc (char const *trans)
146 struct kwset *kwset = xmalloc (sizeof *kwset);
148 obstack_init (&kwset->obstack);
149 kwset->words = 0;
150 kwset->trie = obstack_alloc (&kwset->obstack, sizeof *kwset->trie);
151 kwset->trie->accepting = 0;
152 kwset->trie->links = NULL;
153 kwset->trie->parent = NULL;
154 kwset->trie->next = NULL;
155 kwset->trie->fail = NULL;
156 kwset->trie->depth = 0;
157 kwset->trie->shift = 0;
158 kwset->mind = PTRDIFF_MAX;
159 kwset->maxd = -1;
160 kwset->target = NULL;
161 kwset->trans = trans;
162 kwset->kwsexec = acexec;
164 return kwset;
167 /* This upper bound is valid for CHAR_BIT >= 4 and
168 exact for CHAR_BIT in { 4..11, 13, 15, 17, 19 }. */
169 enum { DEPTH_SIZE = CHAR_BIT + CHAR_BIT / 2 };
171 /* Add the given string to the contents of the keyword set. */
172 void
173 kwsincr (kwset_t kwset, char const *text, ptrdiff_t len)
175 assume (0 <= len);
176 struct trie *trie = kwset->trie;
177 char const *trans = kwset->trans;
178 bool reverse = kwset->kwsexec == bmexec;
180 if (reverse)
181 text += len;
183 /* Descend the trie (built of keywords) character-by-character,
184 installing new nodes when necessary. */
185 while (len--)
187 unsigned char uc = reverse ? *--text : *text++;
188 unsigned char label = trans ? trans[uc] : uc;
190 /* Descend the tree of outgoing links for this trie node,
191 looking for the current character and keeping track
192 of the path followed. */
193 struct tree *cur = trie->links;
194 struct tree *links[DEPTH_SIZE];
195 enum { L, R } dirs[DEPTH_SIZE];
196 links[0] = (struct tree *) &trie->links;
197 dirs[0] = L;
198 ptrdiff_t depth = 1;
200 while (cur && label != cur->label)
202 links[depth] = cur;
203 if (label < cur->label)
204 dirs[depth++] = L, cur = cur->llink;
205 else
206 dirs[depth++] = R, cur = cur->rlink;
209 /* The current character doesn't have an outgoing link at
210 this trie node, so build a new trie node and install
211 a link in the current trie node's tree. */
212 if (!cur)
214 cur = obstack_alloc (&kwset->obstack, sizeof *cur);
215 cur->llink = NULL;
216 cur->rlink = NULL;
217 cur->trie = obstack_alloc (&kwset->obstack, sizeof *cur->trie);
218 cur->trie->accepting = 0;
219 cur->trie->links = NULL;
220 cur->trie->parent = trie;
221 cur->trie->next = NULL;
222 cur->trie->fail = NULL;
223 cur->trie->depth = trie->depth + 1;
224 cur->trie->shift = 0;
225 cur->label = label;
226 cur->balance = 0;
228 /* Install the new tree node in its parent. */
229 if (dirs[--depth] == L)
230 links[depth]->llink = cur;
231 else
232 links[depth]->rlink = cur;
234 /* Back up the tree fixing the balance flags. */
235 while (depth && !links[depth]->balance)
237 if (dirs[depth] == L)
238 --links[depth]->balance;
239 else
240 ++links[depth]->balance;
241 --depth;
244 /* Rebalance the tree by pointer rotations if necessary. */
245 if (depth && ((dirs[depth] == L && --links[depth]->balance)
246 || (dirs[depth] == R && ++links[depth]->balance)))
248 struct tree *t, *r, *l, *rl, *lr;
250 switch (links[depth]->balance)
252 case (char) -2:
253 switch (dirs[depth + 1])
255 case L:
256 r = links[depth], t = r->llink, rl = t->rlink;
257 t->rlink = r, r->llink = rl;
258 t->balance = r->balance = 0;
259 break;
260 case R:
261 r = links[depth], l = r->llink, t = l->rlink;
262 rl = t->rlink, lr = t->llink;
263 t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
264 l->balance = t->balance != 1 ? 0 : -1;
265 r->balance = t->balance != (char) -1 ? 0 : 1;
266 t->balance = 0;
267 break;
268 default:
269 abort ();
271 break;
272 case 2:
273 switch (dirs[depth + 1])
275 case R:
276 l = links[depth], t = l->rlink, lr = t->llink;
277 t->llink = l, l->rlink = lr;
278 t->balance = l->balance = 0;
279 break;
280 case L:
281 l = links[depth], r = l->rlink, t = r->llink;
282 lr = t->llink, rl = t->rlink;
283 t->llink = l, l->rlink = lr, t->rlink = r, r->llink = rl;
284 l->balance = t->balance != 1 ? 0 : -1;
285 r->balance = t->balance != (char) -1 ? 0 : 1;
286 t->balance = 0;
287 break;
288 default:
289 abort ();
291 break;
292 default:
293 abort ();
296 if (dirs[depth - 1] == L)
297 links[depth - 1]->llink = t;
298 else
299 links[depth - 1]->rlink = t;
303 trie = cur->trie;
306 /* Mark the node finally reached as accepting, encoding the
307 index number of this word in the keyword set so far. */
308 if (!trie->accepting)
310 size_t words = kwset->words;
311 trie->accepting = 2 * words + 1;
313 ++kwset->words;
315 /* Keep track of the longest and shortest string of the keyword set. */
316 if (trie->depth < kwset->mind)
317 kwset->mind = trie->depth;
318 if (trie->depth > kwset->maxd)
319 kwset->maxd = trie->depth;
322 ptrdiff_t
323 kwswords (kwset_t kwset)
325 return kwset->words;
328 /* Enqueue the trie nodes referenced from the given tree in the
329 given queue. */
330 static void
331 enqueue (struct tree *tree, struct trie **last)
333 if (!tree)
334 return;
335 enqueue (tree->llink, last);
336 enqueue (tree->rlink, last);
337 (*last) = (*last)->next = tree->trie;
340 /* Compute the Aho-Corasick failure function for the trie nodes referenced
341 from the given tree, given the failure function for their parent as
342 well as a last resort failure node. */
343 static void
344 treefails (struct tree const *tree, struct trie const *fail,
345 struct trie *recourse, bool reverse)
347 struct tree *cur;
349 if (!tree)
350 return;
352 treefails (tree->llink, fail, recourse, reverse);
353 treefails (tree->rlink, fail, recourse, reverse);
355 /* Find, in the chain of fails going back to the root, the first
356 node that has a descendant on the current label. */
357 while (fail)
359 cur = fail->links;
360 while (cur && tree->label != cur->label)
361 if (tree->label < cur->label)
362 cur = cur->llink;
363 else
364 cur = cur->rlink;
365 if (cur)
367 tree->trie->fail = cur->trie;
368 if (!reverse && cur->trie->accepting && !tree->trie->accepting)
369 tree->trie->accepting = SIZE_MAX;
370 return;
372 fail = fail->fail;
375 tree->trie->fail = recourse;
378 /* Set delta entries for the links of the given tree such that
379 the preexisting delta value is larger than the current depth. */
380 static void
381 treedelta (struct tree const *tree, ptrdiff_t depth, unsigned char delta[])
383 if (!tree)
384 return;
385 treedelta (tree->llink, depth, delta);
386 treedelta (tree->rlink, depth, delta);
387 if (depth < delta[tree->label])
388 delta[tree->label] = depth;
391 /* Return true if A has every label in B. */
392 static bool _GL_ATTRIBUTE_PURE
393 hasevery (struct tree const *a, struct tree const *b)
395 if (!b)
396 return true;
397 if (!hasevery (a, b->llink))
398 return false;
399 if (!hasevery (a, b->rlink))
400 return false;
401 while (a && b->label != a->label)
402 if (b->label < a->label)
403 a = a->llink;
404 else
405 a = a->rlink;
406 return !!a;
409 /* Compute a vector, indexed by character code, of the trie nodes
410 referenced from the given tree. */
411 static void
412 treenext (struct tree const *tree, struct trie *next[])
414 if (!tree)
415 return;
416 treenext (tree->llink, next);
417 treenext (tree->rlink, next);
418 next[tree->label] = tree->trie;
421 /* Prepare a built keyword set for use. */
422 void
423 kwsprep (kwset_t kwset)
425 char const *trans = kwset->trans;
426 ptrdiff_t i;
427 unsigned char deltabuf[NCHAR];
428 unsigned char *delta = trans ? deltabuf : kwset->delta;
429 struct trie *curr, *last;
431 /* Use Boyer-Moore if just one pattern, Aho-Corasick otherwise. */
432 bool reverse = kwset->words == 1;
434 if (reverse)
436 kwset_t new_kwset;
438 /* Enqueue the immediate descendants in the level order queue. */
439 for (curr = last = kwset->trie; curr; curr = curr->next)
440 enqueue (curr->links, &last);
442 /* Looking for just one string. Extract it from the trie. */
443 kwset->target = obstack_alloc (&kwset->obstack, kwset->mind);
444 for (i = 0, curr = kwset->trie; i < kwset->mind; ++i)
446 kwset->target[i] = curr->links->label;
447 curr = curr->next;
450 new_kwset = kwsalloc (kwset->trans);
451 new_kwset->kwsexec = bmexec;
452 kwsincr (new_kwset, kwset->target, kwset->mind);
453 obstack_free (&kwset->obstack, NULL);
454 *kwset = *new_kwset;
455 free (new_kwset);
458 /* Initial values for the delta table; will be changed later. The
459 delta entry for a given character is the smallest depth of any
460 node at which an outgoing edge is labeled by that character. */
461 memset (delta, MIN (kwset->mind, UCHAR_MAX), sizeof deltabuf);
463 /* Traverse the nodes of the trie in level order, simultaneously
464 computing the delta table, failure function, and shift function. */
465 for (curr = last = kwset->trie; curr; curr = curr->next)
467 /* Enqueue the immediate descendants in the level order queue. */
468 enqueue (curr->links, &last);
470 /* Update the delta table for the descendants of this node. */
471 treedelta (curr->links, curr->depth, delta);
473 /* Compute the failure function for the descendants of this node. */
474 treefails (curr->links, curr->fail, kwset->trie, reverse);
476 if (reverse)
478 curr->shift = kwset->mind;
479 curr->maxshift = kwset->mind;
481 /* Update the shifts at each node in the current node's chain
482 of fails back to the root. */
483 struct trie *fail;
484 for (fail = curr->fail; fail; fail = fail->fail)
486 /* If the current node has some outgoing edge that the fail
487 doesn't, then the shift at the fail should be no larger
488 than the difference of their depths. */
489 if (!hasevery (fail->links, curr->links))
490 if (curr->depth - fail->depth < fail->shift)
491 fail->shift = curr->depth - fail->depth;
493 /* If the current node is accepting then the shift at the
494 fail and its descendants should be no larger than the
495 difference of their depths. */
496 if (curr->accepting && fail->maxshift > curr->depth - fail->depth)
497 fail->maxshift = curr->depth - fail->depth;
502 if (reverse)
504 /* Traverse the trie in level order again, fixing up all nodes whose
505 shift exceeds their inherited maxshift. */
506 for (curr = kwset->trie->next; curr; curr = curr->next)
508 if (curr->maxshift > curr->parent->maxshift)
509 curr->maxshift = curr->parent->maxshift;
510 if (curr->shift > curr->maxshift)
511 curr->shift = curr->maxshift;
515 /* Create a vector, indexed by character code, of the outgoing links
516 from the root node. Accumulate GC1 and GC1HELP. */
517 struct trie *nextbuf[NCHAR];
518 struct trie **next = trans ? nextbuf : kwset->next;
519 memset (next, 0, sizeof nextbuf);
520 treenext (kwset->trie->links, next);
521 int gc1 = -2;
522 int gc1help = -1;
523 for (i = 0; i < NCHAR; i++)
525 int ti = i;
526 if (trans)
528 ti = U(trans[i]);
529 kwset->next[i] = next[ti];
531 if (kwset->next[i])
533 if (gc1 < -1)
535 gc1 = ti;
536 gc1help = i;
538 else if (gc1 == ti)
539 gc1help = gc1help == ti ? i : -1;
540 else if (i == ti && gc1 == gc1help)
541 gc1help = i;
542 else
543 gc1 = -1;
546 kwset->gc1 = gc1;
547 kwset->gc1help = gc1help;
549 if (reverse)
551 /* Looking for just one string. Extract it from the trie. */
552 kwset->target = obstack_alloc (&kwset->obstack, kwset->mind);
553 for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i)
555 kwset->target[i] = curr->links->label;
556 curr = curr->next;
559 if (kwset->mind > 1)
561 /* Looking for the delta2 shift that might be made after a
562 backwards match has failed. Extract it from the trie. */
563 kwset->shift
564 = obstack_alloc (&kwset->obstack,
565 sizeof *kwset->shift * (kwset->mind - 1));
566 for (i = 0, curr = kwset->trie->next; i < kwset->mind - 1; ++i)
568 kwset->shift[i] = curr->shift;
569 curr = curr->next;
572 /* The penultimate byte. */
573 kwset->gc2 = tr (trans, kwset->target[kwset->mind - 2]);
577 /* Fix things up for any translation table. */
578 if (trans)
579 for (i = 0; i < NCHAR; ++i)
580 kwset->delta[i] = delta[U(trans[i])];
583 /* Delta2 portion of a Boyer-Moore search. *TP is the string text
584 pointer; it is updated in place. EP is the end of the string text,
585 and SP the end of the pattern. LEN is the pattern length; it must
586 be at least 2. TRANS, if nonnull, is the input translation table.
587 GC1 and GC2 are the last and second-from last bytes of the pattern,
588 transliterated by TRANS; the caller precomputes them for
589 efficiency. If D1 is nonnull, it is a delta1 table for shifting *TP
590 when failing. KWSET->shift says how much to shift. */
591 static inline bool
592 bm_delta2_search (char const **tpp, char const *ep, char const *sp,
593 ptrdiff_t len,
594 char const *trans, char gc1, char gc2,
595 unsigned char const *d1, kwset_t kwset)
597 char const *tp = *tpp;
598 ptrdiff_t d = len, skip = 0;
600 while (true)
602 ptrdiff_t i = 2;
603 if (tr (trans, tp[-2]) == gc2)
605 while (++i <= d)
606 if (tr (trans, tp[-i]) != tr (trans, sp[-i]))
607 break;
608 if (i > d)
610 for (i = d + skip + 1; i <= len; ++i)
611 if (tr (trans, tp[-i]) != tr (trans, sp[-i]))
612 break;
613 if (i > len)
615 *tpp = tp - len;
616 return true;
621 tp += d = kwset->shift[i - 2];
622 if (tp > ep)
623 break;
624 if (tr (trans, tp[-1]) != gc1)
626 if (d1)
627 tp += d1[U(tp[-1])];
628 break;
630 skip = i - 1;
633 *tpp = tp;
634 return false;
637 /* Return the address of the first byte in the buffer S (of size N)
638 that matches the terminal byte specified by KWSET, or NULL if there
639 is no match. KWSET->gc1 should be nonnegative. */
640 static char const *
641 memchr_kwset (char const *s, ptrdiff_t n, kwset_t kwset)
643 char const *slim = s + n;
644 if (kwset->gc1help < 0)
646 for (; s < slim; s++)
647 if (kwset->next[U(*s)])
648 return s;
650 else
652 int small_heuristic = 2;
653 size_t small_bytes = small_heuristic * sizeof (unsigned long int);
654 while (s < slim)
656 if (kwset->next[U(*s)])
657 return s;
658 s++;
659 if ((uintptr_t) s % small_bytes == 0)
660 return memchr2 (s, kwset->gc1, kwset->gc1help, slim - s);
663 return NULL;
666 /* Fast Boyer-Moore search (inlinable version). */
667 static inline ptrdiff_t _GL_ATTRIBUTE_PURE
668 bmexec_trans (kwset_t kwset, char const *text, ptrdiff_t size)
670 assume (0 <= size);
671 unsigned char const *d1;
672 char const *ep, *sp, *tp;
673 int d;
674 ptrdiff_t len = kwset->mind;
675 char const *trans = kwset->trans;
677 if (len == 0)
678 return 0;
679 if (len > size)
680 return -1;
681 if (len == 1)
683 tp = memchr_kwset (text, size, kwset);
684 return tp ? tp - text : -1;
687 d1 = kwset->delta;
688 sp = kwset->target + len;
689 tp = text + len;
690 char gc1 = kwset->gc1;
691 char gc2 = kwset->gc2;
693 /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */
694 ptrdiff_t len12;
695 if (!INT_MULTIPLY_WRAPV (len, 12, &len12) && len12 < size)
696 /* 11 is not a bug, the initial offset happens only once. */
697 for (ep = text + size - 11 * len; tp <= ep; )
699 char const *tp0 = tp;
700 d = d1[U(tp[-1])], tp += d;
701 d = d1[U(tp[-1])], tp += d;
702 if (d != 0)
704 d = d1[U(tp[-1])], tp += d;
705 d = d1[U(tp[-1])], tp += d;
706 d = d1[U(tp[-1])], tp += d;
707 if (d != 0)
709 d = d1[U(tp[-1])], tp += d;
710 d = d1[U(tp[-1])], tp += d;
711 d = d1[U(tp[-1])], tp += d;
712 if (d != 0)
714 d = d1[U(tp[-1])], tp += d;
715 d = d1[U(tp[-1])], tp += d;
717 /* As a heuristic, prefer memchr to seeking by
718 delta1 when the latter doesn't advance much. */
719 int advance_heuristic = 16 * sizeof (long);
720 if (advance_heuristic <= tp - tp0)
721 continue;
722 tp--;
723 tp = memchr_kwset (tp, text + size - tp, kwset);
724 if (! tp)
725 return -1;
726 tp++;
727 if (ep <= tp)
728 break;
732 if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, d1, kwset))
733 return tp - text;
736 /* Now only a few characters are left to search. Carefully avoid
737 ever producing an out-of-bounds pointer. */
738 ep = text + size;
739 d = d1[U(tp[-1])];
740 while (d <= ep - tp)
742 d = d1[U((tp += d)[-1])];
743 if (d != 0)
744 continue;
745 if (bm_delta2_search (&tp, ep, sp, len, trans, gc1, gc2, NULL, kwset))
746 return tp - text;
749 return -1;
752 /* Fast Boyer-Moore search. */
753 static ptrdiff_t
754 bmexec (kwset_t kwset, char const *text, ptrdiff_t size,
755 struct kwsmatch *kwsmatch, bool longest)
757 /* Help the compiler inline in two ways, depending on whether
758 kwset->trans is null. */
759 ptrdiff_t ret = (IGNORE_DUPLICATE_BRANCH_WARNING
760 (kwset->trans
761 ? bmexec_trans (kwset, text, size)
762 : bmexec_trans (kwset, text, size)));
763 if (0 <= ret)
765 kwsmatch->index = 0;
766 kwsmatch->offset[0] = ret;
767 kwsmatch->size[0] = kwset->mind;
770 return ret;
773 /* Hairy multiple string search with the Aho-Corasick algorithm.
774 (inlinable version) */
775 static inline ptrdiff_t
776 acexec_trans (kwset_t kwset, char const *text, ptrdiff_t len,
777 struct kwsmatch *kwsmatch, bool longest)
779 struct trie const *trie, *accept;
780 char const *tp, *left, *lim;
781 struct tree const *tree;
782 char const *trans;
784 /* Initialize register copies and look for easy ways out. */
785 if (len < kwset->mind)
786 return -1;
787 trans = kwset->trans;
788 trie = kwset->trie;
789 lim = text + len;
790 tp = text;
792 if (!trie->accepting)
794 unsigned char c;
795 int gc1 = kwset->gc1;
797 while (true)
799 if (gc1 < 0)
801 while (! (trie = kwset->next[c = tr (trans, *tp++)]))
802 if (tp >= lim)
803 return -1;
805 else
807 tp = memchr_kwset (tp, lim - tp, kwset);
808 if (!tp)
809 return -1;
810 c = tr (trans, *tp++);
811 trie = kwset->next[c];
814 while (true)
816 if (trie->accepting)
817 goto match;
818 if (tp >= lim)
819 return -1;
820 c = tr (trans, *tp++);
822 for (tree = trie->links; c != tree->label; )
824 tree = c < tree->label ? tree->llink : tree->rlink;
825 if (! tree)
827 trie = trie->fail;
828 if (!trie)
830 trie = kwset->next[c];
831 if (trie)
832 goto have_trie;
833 if (tp >= lim)
834 return -1;
835 goto next_c;
837 if (trie->accepting)
839 --tp;
840 goto match;
842 tree = trie->links;
845 trie = tree->trie;
846 have_trie:;
848 next_c:;
852 match:
853 accept = trie;
854 while (accept->accepting == SIZE_MAX)
855 accept = accept->fail;
856 left = tp - accept->depth;
858 /* Try left-most longest match. */
859 if (longest)
861 while (tp < lim)
863 struct trie const *accept1;
864 char const *left1;
865 unsigned char c = tr (trans, *tp++);
869 tree = trie->links;
870 while (tree && c != tree->label)
871 tree = c < tree->label ? tree->llink : tree->rlink;
873 while (!tree && (trie = trie->fail) && accept->depth <= trie->depth);
875 if (!tree)
876 break;
877 trie = tree->trie;
878 if (trie->accepting)
880 accept1 = trie;
881 while (accept1->accepting == SIZE_MAX)
882 accept1 = accept1->fail;
883 left1 = tp - accept1->depth;
884 if (left1 <= left)
886 left = left1;
887 accept = accept1;
893 kwsmatch->index = accept->accepting / 2;
894 kwsmatch->offset[0] = left - text;
895 kwsmatch->size[0] = accept->depth;
897 return left - text;
900 /* Hairy multiple string search with Aho-Corasick algorithm. */
901 static ptrdiff_t
902 acexec (kwset_t kwset, char const *text, ptrdiff_t size,
903 struct kwsmatch *kwsmatch, bool longest)
905 assume (0 <= size);
906 /* Help the compiler inline in two ways, depending on whether
907 kwset->trans is null. */
908 return (IGNORE_DUPLICATE_BRANCH_WARNING
909 (kwset->trans
910 ? acexec_trans (kwset, text, size, kwsmatch, longest)
911 : acexec_trans (kwset, text, size, kwsmatch, longest)));
914 /* Find the first instance of a KWSET member in TEXT, which has SIZE bytes.
915 Return the offset (into TEXT) of the first byte of the matching substring,
916 or -1 if no match is found. Upon a match, store details in
917 *KWSMATCH: index of matched keyword, start offset (same as the return
918 value), and length. If LONGEST, find the longest match; otherwise
919 any match will do. */
920 ptrdiff_t
921 kwsexec (kwset_t kwset, char const *text, ptrdiff_t size,
922 struct kwsmatch *kwsmatch, bool longest)
924 return kwset->kwsexec (kwset, text, size, kwsmatch, longest);
927 /* Free the components of the given keyword set. */
928 void
929 kwsfree (kwset_t kwset)
931 obstack_free (&kwset->obstack, NULL);
932 free (kwset);