1 /* kwset.c - search for any of a set of keywords.
2 Copyright (C) 1989, 1998, 2000, 2005, 2007, 2009-2015 Free Software
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
20 /* Written August 1989 by Mike Haertel.
21 The author may be reached (Email) at the address mike@ai.mit.edu,
22 or (US mail) as Mike Haertel c/o Free Software Foundation. */
24 /* The algorithm implemented by these routines bears a startling resemblance
25 to one discovered by Beate Commentz-Walter, although it is not identical.
26 See: Commentz-Walter B. A string matching algorithm fast on the average.
27 Lecture Notes in Computer Science 71 (1979), 118-32
28 <http://dx.doi.org/10.1007/3-540-09510-1_10>.
29 See also: Aho AV, Corasick MJ. Efficient string matching: an aid to
30 bibliographic search. CACM 18, 6 (1975), 333-40
31 <http://dx.doi.org/10.1145/360825.360855>, which describes the
32 failure function used below. */
40 #include <sys/types.h>
46 #define link kwset_link
51 # define malloc xmalloc
54 #define NCHAR (UCHAR_MAX + 1)
55 #define obstack_chunk_alloc malloc
56 #define obstack_chunk_free free
58 #define U(c) (to_uchar (c))
60 /* Balanced tree of edges and labels leaving a given trie node. */
63 struct tree
*llink
; /* Left link; MUST be first field. */
64 struct tree
*rlink
; /* Right link (to larger labels). */
65 struct trie
*trie
; /* Trie node pointed to by this edge. */
66 unsigned char label
; /* Label on this edge. */
67 char balance
; /* Difference in depths of subtrees. */
70 /* Node of a trie representing a set of reversed keywords. */
73 size_t accepting
; /* Word index of accepted word, or zero. */
74 struct tree
*links
; /* Tree of edges leaving this node. */
75 struct trie
*parent
; /* Parent of this node. */
76 struct trie
*next
; /* List of all trie nodes in level order. */
77 struct trie
*fail
; /* Aho-Corasick failure function. */
78 int depth
; /* Depth of this node from the root. */
79 int shift
; /* Shift function for search failures. */
80 int maxshift
; /* Max shift of self and descendants. */
83 /* Structure returned opaquely to the caller, containing everything. */
86 struct obstack obstack
; /* Obstack for node allocation. */
87 ptrdiff_t words
; /* Number of words in the trie. */
88 struct trie
*trie
; /* The trie itself. */
89 int mind
; /* Minimum depth of an accepting node. */
90 int maxd
; /* Maximum depth of any node. */
91 unsigned char delta
[NCHAR
]; /* Delta table for rapid search. */
92 struct trie
*next
[NCHAR
]; /* Table of children of the root. */
93 char *target
; /* Target string if there's only one. */
94 int *shift
; /* Used in Boyer-Moore search for one string. */
95 char const *trans
; /* Character translation table. */
97 /* If there's only one string, this is the string's last byte,
98 translated via TRANS if TRANS is nonnull. */
101 /* Likewise for the string's penultimate byte, if it has two or more
105 /* If there's only one string, this helps to match the string's last byte.
106 If GC1HELP is negative, only GC1 matches the string's last byte;
107 otherwise at least two bytes match, and B matches if TRANS[B] == GC1.
108 If GC1HELP is in the range 0..(NCHAR - 1), there are exactly two
109 such matches, and GC1HELP is the other match after conversion to
110 unsigned char. If GC1HELP is at least NCHAR, there are three or
111 more such matches; e.g., Greek has three sigma characters that
112 all match when case-folding. */
116 /* Use TRANS to transliterate C. A null TRANS does no transliteration. */
118 tr (char const *trans
, char c
)
120 return trans
? trans
[U(c
)] : c
;
123 /* Allocate and initialize a keyword set object, returning an opaque
126 kwsalloc (char const *trans
)
128 struct kwset
*kwset
= xmalloc (sizeof *kwset
);
130 obstack_init (&kwset
->obstack
);
132 kwset
->trie
= obstack_alloc (&kwset
->obstack
, sizeof *kwset
->trie
);
133 kwset
->trie
->accepting
= 0;
134 kwset
->trie
->links
= NULL
;
135 kwset
->trie
->parent
= NULL
;
136 kwset
->trie
->next
= NULL
;
137 kwset
->trie
->fail
= NULL
;
138 kwset
->trie
->depth
= 0;
139 kwset
->trie
->shift
= 0;
140 kwset
->mind
= INT_MAX
;
142 kwset
->target
= NULL
;
143 kwset
->trans
= trans
;
148 /* This upper bound is valid for CHAR_BIT >= 4 and
149 exact for CHAR_BIT in { 4..11, 13, 15, 17, 19 }. */
150 #define DEPTH_SIZE (CHAR_BIT + CHAR_BIT/2)
152 /* Add the given string to the contents of the keyword set. */
154 kwsincr (kwset_t kwset
, char const *text
, size_t len
)
156 struct trie
*trie
= kwset
->trie
;
157 char const *trans
= kwset
->trans
;
161 /* Descend the trie (built of reversed keywords) character-by-character,
162 installing new nodes when necessary. */
165 unsigned char uc
= *--text
;
166 unsigned char label
= trans
? trans
[uc
] : uc
;
168 /* Descend the tree of outgoing links for this trie node,
169 looking for the current character and keeping track
170 of the path followed. */
171 struct tree
*link
= trie
->links
;
172 struct tree
*links
[DEPTH_SIZE
];
173 enum { L
, R
} dirs
[DEPTH_SIZE
];
174 links
[0] = (struct tree
*) &trie
->links
;
178 while (link
&& label
!= link
->label
)
181 if (label
< link
->label
)
182 dirs
[depth
++] = L
, link
= link
->llink
;
184 dirs
[depth
++] = R
, link
= link
->rlink
;
187 /* The current character doesn't have an outgoing link at
188 this trie node, so build a new trie node and install
189 a link in the current trie node's tree. */
192 link
= obstack_alloc (&kwset
->obstack
, sizeof *link
);
195 link
->trie
= obstack_alloc (&kwset
->obstack
, sizeof *link
->trie
);
196 link
->trie
->accepting
= 0;
197 link
->trie
->links
= NULL
;
198 link
->trie
->parent
= trie
;
199 link
->trie
->next
= NULL
;
200 link
->trie
->fail
= NULL
;
201 link
->trie
->depth
= trie
->depth
+ 1;
202 link
->trie
->shift
= 0;
206 /* Install the new tree node in its parent. */
207 if (dirs
[--depth
] == L
)
208 links
[depth
]->llink
= link
;
210 links
[depth
]->rlink
= link
;
212 /* Back up the tree fixing the balance flags. */
213 while (depth
&& !links
[depth
]->balance
)
215 if (dirs
[depth
] == L
)
216 --links
[depth
]->balance
;
218 ++links
[depth
]->balance
;
222 /* Rebalance the tree by pointer rotations if necessary. */
223 if (depth
&& ((dirs
[depth
] == L
&& --links
[depth
]->balance
)
224 || (dirs
[depth
] == R
&& ++links
[depth
]->balance
)))
226 struct tree
*t
, *r
, *l
, *rl
, *lr
;
228 switch (links
[depth
]->balance
)
231 switch (dirs
[depth
+ 1])
234 r
= links
[depth
], t
= r
->llink
, rl
= t
->rlink
;
235 t
->rlink
= r
, r
->llink
= rl
;
236 t
->balance
= r
->balance
= 0;
239 r
= links
[depth
], l
= r
->llink
, t
= l
->rlink
;
240 rl
= t
->rlink
, lr
= t
->llink
;
241 t
->llink
= l
, l
->rlink
= lr
, t
->rlink
= r
, r
->llink
= rl
;
242 l
->balance
= t
->balance
!= 1 ? 0 : -1;
243 r
->balance
= t
->balance
!= (char) -1 ? 0 : 1;
251 switch (dirs
[depth
+ 1])
254 l
= links
[depth
], t
= l
->rlink
, lr
= t
->llink
;
255 t
->llink
= l
, l
->rlink
= lr
;
256 t
->balance
= l
->balance
= 0;
259 l
= links
[depth
], r
= l
->rlink
, t
= r
->llink
;
260 lr
= t
->llink
, rl
= t
->rlink
;
261 t
->llink
= l
, l
->rlink
= lr
, t
->rlink
= r
, r
->llink
= rl
;
262 l
->balance
= t
->balance
!= 1 ? 0 : -1;
263 r
->balance
= t
->balance
!= (char) -1 ? 0 : 1;
274 if (dirs
[depth
- 1] == L
)
275 links
[depth
- 1]->llink
= t
;
277 links
[depth
- 1]->rlink
= t
;
284 /* Mark the node we finally reached as accepting, encoding the
285 index number of this word in the keyword set so far. */
286 if (!trie
->accepting
)
287 trie
->accepting
= 1 + 2 * kwset
->words
;
290 /* Keep track of the longest and shortest string of the keyword set. */
291 if (trie
->depth
< kwset
->mind
)
292 kwset
->mind
= trie
->depth
;
293 if (trie
->depth
> kwset
->maxd
)
294 kwset
->maxd
= trie
->depth
;
297 /* Enqueue the trie nodes referenced from the given tree in the
300 enqueue (struct tree
*tree
, struct trie
**last
)
304 enqueue(tree
->llink
, last
);
305 enqueue(tree
->rlink
, last
);
306 (*last
) = (*last
)->next
= tree
->trie
;
309 /* Compute the Aho-Corasick failure function for the trie nodes referenced
310 from the given tree, given the failure function for their parent as
311 well as a last resort failure node. */
313 treefails (struct tree
const *tree
, struct trie
const *fail
,
314 struct trie
*recourse
)
321 treefails(tree
->llink
, fail
, recourse
);
322 treefails(tree
->rlink
, fail
, recourse
);
324 /* Find, in the chain of fails going back to the root, the first
325 node that has a descendant on the current label. */
329 while (link
&& tree
->label
!= link
->label
)
330 if (tree
->label
< link
->label
)
336 tree
->trie
->fail
= link
->trie
;
342 tree
->trie
->fail
= recourse
;
345 /* Set delta entries for the links of the given tree such that
346 the preexisting delta value is larger than the current depth. */
348 treedelta (struct tree
const *tree
,
350 unsigned char delta
[])
354 treedelta(tree
->llink
, depth
, delta
);
355 treedelta(tree
->rlink
, depth
, delta
);
356 if (depth
< delta
[tree
->label
])
357 delta
[tree
->label
] = depth
;
360 /* Return true if A has every label in B. */
361 static int _GL_ATTRIBUTE_PURE
362 hasevery (struct tree
const *a
, struct tree
const *b
)
366 if (!hasevery(a
, b
->llink
))
368 if (!hasevery(a
, b
->rlink
))
370 while (a
&& b
->label
!= a
->label
)
371 if (b
->label
< a
->label
)
378 /* Compute a vector, indexed by character code, of the trie nodes
379 referenced from the given tree. */
381 treenext (struct tree
const *tree
, struct trie
*next
[])
385 treenext(tree
->llink
, next
);
386 treenext(tree
->rlink
, next
);
387 next
[tree
->label
] = tree
->trie
;
390 /* Compute the shift for each trie node, as well as the delta
391 table and next cache for the given keyword set. */
393 kwsprep (kwset_t kwset
)
395 char const *trans
= kwset
->trans
;
397 unsigned char deltabuf
[NCHAR
];
398 unsigned char *delta
= trans
? deltabuf
: kwset
->delta
;
400 /* Initial values for the delta table; will be changed later. The
401 delta entry for a given character is the smallest depth of any
402 node at which an outgoing edge is labeled by that character. */
403 memset (delta
, MIN (kwset
->mind
, UCHAR_MAX
), sizeof deltabuf
);
405 /* Traverse the nodes of the trie in level order, simultaneously
406 computing the delta table, failure function, and shift function. */
407 struct trie
*curr
, *last
;
408 for (curr
= last
= kwset
->trie
; curr
; curr
= curr
->next
)
410 /* Enqueue the immediate descendants in the level order queue. */
411 enqueue (curr
->links
, &last
);
413 curr
->shift
= kwset
->mind
;
414 curr
->maxshift
= kwset
->mind
;
416 /* Update the delta table for the descendants of this node. */
417 treedelta (curr
->links
, curr
->depth
, delta
);
419 /* Compute the failure function for the descendants of this node. */
420 treefails (curr
->links
, curr
->fail
, kwset
->trie
);
422 /* Update the shifts at each node in the current node's chain
423 of fails back to the root. */
425 for (fail
= curr
->fail
; fail
; fail
= fail
->fail
)
427 /* If the current node has some outgoing edge that the fail
428 doesn't, then the shift at the fail should be no larger
429 than the difference of their depths. */
430 if (!hasevery (fail
->links
, curr
->links
))
431 if (curr
->depth
- fail
->depth
< fail
->shift
)
432 fail
->shift
= curr
->depth
- fail
->depth
;
434 /* If the current node is accepting then the shift at the
435 fail and its descendants should be no larger than the
436 difference of their depths. */
437 if (curr
->accepting
&& fail
->maxshift
> curr
->depth
- fail
->depth
)
438 fail
->maxshift
= curr
->depth
- fail
->depth
;
442 /* Traverse the trie in level order again, fixing up all nodes whose
443 shift exceeds their inherited maxshift. */
444 for (curr
= kwset
->trie
->next
; curr
; curr
= curr
->next
)
446 if (curr
->maxshift
> curr
->parent
->maxshift
)
447 curr
->maxshift
= curr
->parent
->maxshift
;
448 if (curr
->shift
> curr
->maxshift
)
449 curr
->shift
= curr
->maxshift
;
452 /* Create a vector, indexed by character code, of the outgoing links
453 from the root node. */
454 struct trie
*nextbuf
[NCHAR
];
455 struct trie
**next
= trans
? nextbuf
: kwset
->next
;
456 memset (next
, 0, sizeof nextbuf
);
457 treenext (kwset
->trie
->links
, next
);
459 for (i
= 0; i
< NCHAR
; ++i
)
460 kwset
->next
[i
] = next
[U(trans
[i
])];
462 /* Check if we can use the simple boyer-moore algorithm, instead
463 of the hairy commentz-walter algorithm. */
464 if (kwset
->words
== 1)
466 /* Looking for just one string. Extract it from the trie. */
467 kwset
->target
= obstack_alloc (&kwset
->obstack
, kwset
->mind
);
468 for (i
= kwset
->mind
- 1, curr
= kwset
->trie
; i
>= 0; --i
)
470 kwset
->target
[i
] = curr
->links
->label
;
473 /* Looking for the delta2 shift that we might make after a
474 backwards match has failed. Extract it from the trie. */
478 = obstack_alloc (&kwset
->obstack
,
479 sizeof *kwset
->shift
* (kwset
->mind
- 1));
480 for (i
= 0, curr
= kwset
->trie
->next
; i
< kwset
->mind
- 1; ++i
)
482 kwset
->shift
[i
] = curr
->shift
;
487 char gc1
= tr (trans
, kwset
->target
[kwset
->mind
- 1]);
489 /* Set GC1HELP according to whether exactly one, exactly two, or
490 three-or-more characters match GC1. */
494 char const *equiv1
= memchr (trans
, gc1
, NCHAR
);
495 char const *equiv2
= memchr (equiv1
+ 1, gc1
,
496 trans
+ NCHAR
- (equiv1
+ 1));
498 gc1help
= (memchr (equiv2
+ 1, gc1
, trans
+ NCHAR
- (equiv2
+ 1))
500 : U(gc1
) ^ (equiv1
- trans
) ^ (equiv2
- trans
));
504 kwset
->gc1help
= gc1help
;
506 kwset
->gc2
= tr (trans
, kwset
->target
[kwset
->mind
- 2]);
509 /* Fix things up for any translation table. */
511 for (i
= 0; i
< NCHAR
; ++i
)
512 kwset
->delta
[i
] = delta
[U(trans
[i
])];
515 /* Delta2 portion of a Boyer-Moore search. *TP is the string text
516 pointer; it is updated in place. EP is the end of the string text,
517 and SP the end of the pattern. LEN is the pattern length; it must
518 be at least 2. TRANS, if nonnull, is the input translation table.
519 GC1 and GC2 are the last and second-from last bytes of the pattern,
520 transliterated by TRANS; the caller precomputes them for
521 efficiency. If D1 is nonnull, it is a delta1 table for shifting *TP
522 when failing. KWSET->shift says how much to shift. */
524 bm_delta2_search (char const **tpp
, char const *ep
, char const *sp
, int len
,
525 char const *trans
, char gc1
, char gc2
,
526 unsigned char const *d1
, kwset_t kwset
)
528 char const *tp
= *tpp
;
529 int d
= len
, skip
= 0;
534 if (tr (trans
, tp
[-2]) == gc2
)
537 if (tr (trans
, tp
[-i
]) != tr (trans
, sp
[-i
]))
541 for (i
= d
+ skip
+ 1; i
<= len
; ++i
)
542 if (tr (trans
, tp
[-i
]) != tr (trans
, sp
[-i
]))
552 tp
+= d
= kwset
->shift
[i
- 2];
555 if (tr (trans
, tp
[-1]) != gc1
)
568 /* Return the address of the first byte in the buffer S (of size N)
569 that matches the last byte specified by KWSET, a singleton. */
571 memchr_kwset (char const *s
, size_t n
, kwset_t kwset
)
573 if (kwset
->gc1help
< 0)
574 return memchr (s
, kwset
->gc1
, n
);
575 int small_heuristic
= 2;
576 int small
= (- (uintptr_t) s
% sizeof (long)
577 + small_heuristic
* sizeof (long));
578 size_t ntrans
= kwset
->gc1help
< NCHAR
&& small
< n
? small
: n
;
579 char const *slim
= s
+ ntrans
;
580 for (; s
< slim
; s
++)
581 if (kwset
->trans
[U(*s
)] == kwset
->gc1
)
584 return n
== 0 ? NULL
: memchr2 (s
, kwset
->gc1
, kwset
->gc1help
, n
);
587 /* Fast Boyer-Moore search (inlinable version). */
588 static inline size_t _GL_ATTRIBUTE_PURE
589 bmexec_trans (kwset_t kwset
, char const *text
, size_t size
)
591 unsigned char const *d1
;
592 char const *ep
, *sp
, *tp
;
594 int len
= kwset
->mind
;
595 char const *trans
= kwset
->trans
;
603 tp
= memchr_kwset (text
, size
, kwset
);
604 return tp
? tp
- text
: -1;
608 sp
= kwset
->target
+ len
;
610 char gc1
= kwset
->gc1
;
611 char gc2
= kwset
->gc2
;
613 /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */
615 /* 11 is not a bug, the initial offset happens only once. */
616 for (ep
= text
+ size
- 11 * len
; tp
<= ep
; )
618 char const *tp0
= tp
;
619 d
= d1
[U(tp
[-1])], tp
+= d
;
620 d
= d1
[U(tp
[-1])], tp
+= d
;
623 d
= d1
[U(tp
[-1])], tp
+= d
;
624 d
= d1
[U(tp
[-1])], tp
+= d
;
625 d
= d1
[U(tp
[-1])], tp
+= d
;
628 d
= d1
[U(tp
[-1])], tp
+= d
;
629 d
= d1
[U(tp
[-1])], tp
+= d
;
630 d
= d1
[U(tp
[-1])], tp
+= d
;
633 d
= d1
[U(tp
[-1])], tp
+= d
;
634 d
= d1
[U(tp
[-1])], tp
+= d
;
636 /* As a heuristic, prefer memchr to seeking by
637 delta1 when the latter doesn't advance much. */
638 int advance_heuristic
= 16 * sizeof (long);
639 if (advance_heuristic
<= tp
- tp0
)
642 tp
= memchr_kwset (tp
, text
+ size
- tp
, kwset
);
651 if (bm_delta2_search (&tp
, ep
, sp
, len
, trans
, gc1
, gc2
, d1
, kwset
))
655 /* Now we have only a few characters left to search. We
656 carefully avoid ever producing an out-of-bounds pointer. */
661 d
= d1
[U((tp
+= d
)[-1])];
664 if (bm_delta2_search (&tp
, ep
, sp
, len
, trans
, gc1
, gc2
, NULL
, kwset
))
671 /* Fast Boyer-Moore search. */
673 bmexec (kwset_t kwset
, char const *text
, size_t size
)
675 /* Help the compiler inline bmexec_trans in two ways, depending on
676 whether kwset->trans is null. */
678 ? bmexec_trans (kwset
, text
, size
)
679 : bmexec_trans (kwset
, text
, size
));
682 /* Hairy multiple string search. */
683 static size_t _GL_ARG_NONNULL ((4))
684 cwexec (kwset_t kwset
, char const *text
, size_t len
, struct kwsmatch
*kwsmatch
)
686 struct trie
* const *next
;
687 struct trie
const *trie
;
688 struct trie
const *accept
;
689 char const *beg
, *lim
, *mch
, *lmch
;
691 unsigned char const *delta
;
693 char const *end
, *qlim
;
694 struct tree
const *tree
;
701 /* Initialize register copies and look for easy ways out. */
702 if (len
< kwset
->mind
)
705 delta
= kwset
->delta
;
706 trans
= kwset
->trans
;
709 if ((d
= kwset
->mind
) != 0)
713 mch
= text
, accept
= kwset
->trie
;
717 if (len
>= 4 * kwset
->mind
)
718 qlim
= lim
- 4 * kwset
->mind
;
722 while (lim
- end
>= d
)
724 if (qlim
&& end
<= qlim
)
727 while ((d
= delta
[c
= *end
]) && end
< qlim
)
730 end
+= delta
[U(*end
)];
731 end
+= delta
[U(*end
)];
736 d
= delta
[c
= (end
+= d
)[-1]];
749 unsigned char uc
= *--beg
;
750 c
= trans
? trans
[uc
] : uc
;
752 while (tree
&& c
!= tree
->label
)
776 /* Given a known match, find the longest possible match anchored
777 at or before its starting point. This is nearly a verbatim
778 copy of the preceding main search loops. */
779 if (lim
- mch
> kwset
->maxd
)
780 lim
= mch
+ kwset
->maxd
;
783 while (lim
- end
>= d
)
785 if ((d
= delta
[c
= (end
+= d
)[-1]]) != 0)
788 if (!(trie
= next
[c
]))
793 if (trie
->accepting
&& beg
<= mch
)
801 unsigned char uc
= *--beg
;
802 c
= trans
? trans
[uc
] : uc
;
804 while (tree
&& c
!= tree
->label
)
812 if (trie
->accepting
&& beg
<= mch
)
831 kwsmatch
->index
= accept
->accepting
/ 2;
832 kwsmatch
->offset
[0] = mch
- text
;
833 kwsmatch
->size
[0] = accept
->depth
;
838 /* Search TEXT for a match of any member of KWSET.
839 Return the offset (into TEXT) of the first byte of the matching substring,
840 or (size_t) -1 if no match is found. Upon a match, store details in
841 *KWSMATCH: index of matched keyword, start offset (same as the return
842 value), and length. */
844 kwsexec (kwset_t kwset
, char const *text
, size_t size
,
845 struct kwsmatch
*kwsmatch
)
847 if (kwset
->words
== 1)
849 size_t ret
= bmexec (kwset
, text
, size
);
850 if (ret
!= (size_t) -1)
853 kwsmatch
->offset
[0] = ret
;
854 kwsmatch
->size
[0] = kwset
->mind
;
859 return cwexec (kwset
, text
, size
, kwsmatch
);
862 /* Free the components of the given keyword set. */
864 kwsfree (kwset_t kwset
)
866 obstack_free (&kwset
->obstack
, NULL
);