1 /* Byte-wise substring search, using the Two-Way algorithm.
2 Copyright (C) 2008-2020 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Written by Eric Blake <ebb9@byu.net>, 2008.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License along
17 with this program; if not, see <https://www.gnu.org/licenses/>. */
19 /* Before including this file, you need to include <config.h> and
20 <string.h>, and define:
21 RETURN_TYPE A macro that expands to the return type.
22 AVAILABLE(h, h_l, j, n_l)
23 A macro that returns nonzero if there are
24 at least N_L bytes left starting at H[J].
25 H is 'unsigned char *', H_L, J, and N_L
26 are 'size_t'; H_L is an lvalue. For
27 NUL-terminated searches, H_L can be
28 modified each iteration to avoid having
29 to compute the end of H up front.
31 For case-insensitivity, you may optionally define:
32 CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L
33 characters of P1 and P2 are equal.
34 CANON_ELEMENT(c) A macro that canonicalizes an element right after
35 it has been fetched from one of the two strings.
36 The argument is an 'unsigned char'; the result
37 must be an 'unsigned char' as well.
39 This file undefines the macros documented above, and defines
40 LONG_NEEDLE_THRESHOLD.
46 /* We use the Two-Way string matching algorithm (also known as
47 Chrochemore-Perrin), which guarantees linear complexity with
48 constant space. Additionally, for long needles, we also use a bad
49 character shift table similar to the Boyer-Moore algorithm to
50 achieve improved (potentially sub-linear) performance.
52 See https://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260,
53 https://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm,
54 https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.34.6641&rep=rep1&type=pdf
57 /* Point at which computing a bad-byte shift table is likely to be
58 worthwhile. Small needles should not compute a table, since it
59 adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
60 speedup no greater than a factor of NEEDLE_LEN. The larger the
61 needle, the better the potential performance gain. On the other
62 hand, on non-POSIX systems with CHAR_BIT larger than eight, the
63 memory required for the table is prohibitive. */
65 # define LONG_NEEDLE_THRESHOLD 32U
67 # define LONG_NEEDLE_THRESHOLD SIZE_MAX
71 # define MAX(a, b) ((a < b) ? (b) : (a))
75 # define CANON_ELEMENT(c) c
78 # define CMP_FUNC memcmp
81 /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
82 Return the index of the first byte in the right half, and set
83 *PERIOD to the global period of the right half.
85 The global period of a string is the smallest index (possibly its
86 length) at which all remaining bytes in the string are repetitions
87 of the prefix (the last repetition may be a subset of the prefix).
89 When NEEDLE is factored into two halves, a local period is the
90 length of the smallest word that shares a suffix with the left half
91 and shares a prefix with the right half. All factorizations of a
92 non-empty NEEDLE have a local period of at least 1 and no greater
95 A critical factorization has the property that the local period
96 equals the global period. All strings have at least one critical
97 factorization with the left half smaller than the global period.
98 And while some strings have more than one critical factorization,
99 it is provable that with an ordered alphabet, at least one of the
100 critical factorizations corresponds to a maximal suffix.
102 Given an ordered alphabet, a critical factorization can be computed
103 in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
104 shorter of two ordered maximal suffixes. The ordered maximal
105 suffixes are determined by lexicographic comparison while tracking
108 critical_factorization (const unsigned char *needle
, size_t needle_len
,
111 /* Index of last byte of left half, or SIZE_MAX. */
112 size_t max_suffix
, max_suffix_rev
;
113 size_t j
; /* Index into NEEDLE for current candidate suffix. */
114 size_t k
; /* Offset into current period. */
115 size_t p
; /* Intermediate period. */
116 unsigned char a
, b
; /* Current comparison bytes. */
118 /* Special case NEEDLE_LEN of 1 or 2 (all callers already filtered
119 out 0-length needles. */
123 return needle_len
- 1;
127 0 <= j < NEEDLE_LEN - 1
128 -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
129 min(max_suffix, max_suffix_rev) < global period of NEEDLE
130 1 <= p <= global period of NEEDLE
131 p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
135 /* Perform lexicographic search. */
136 max_suffix
= SIZE_MAX
;
139 while (j
+ k
< needle_len
)
141 a
= CANON_ELEMENT (needle
[j
+ k
]);
142 b
= CANON_ELEMENT (needle
[max_suffix
+ k
]);
145 /* Suffix is smaller, period is entire prefix so far. */
152 /* Advance through repetition of the current period. */
163 /* Suffix is larger, start over from current location. */
170 /* Perform reverse lexicographic search. */
171 max_suffix_rev
= SIZE_MAX
;
174 while (j
+ k
< needle_len
)
176 a
= CANON_ELEMENT (needle
[j
+ k
]);
177 b
= CANON_ELEMENT (needle
[max_suffix_rev
+ k
]);
180 /* Suffix is smaller, period is entire prefix so far. */
183 p
= j
- max_suffix_rev
;
187 /* Advance through repetition of the current period. */
198 /* Suffix is larger, start over from current location. */
199 max_suffix_rev
= j
++;
204 /* Choose the shorter suffix. Return the index of the first byte of
205 the right half, rather than the last byte of the left half.
207 For some examples, 'banana' has two critical factorizations, both
208 exposed by the two lexicographic extreme suffixes of 'anana' and
209 'nana', where both suffixes have a period of 2. On the other
210 hand, with 'aab' and 'bba', both strings have a single critical
211 factorization of the last byte, with the suffix having a period
212 of 1. While the maximal lexicographic suffix of 'aab' is 'b',
213 the maximal lexicographic suffix of 'bba' is 'ba', which is not a
214 critical factorization. Conversely, the maximal reverse
215 lexicographic suffix of 'a' works for 'bba', but not 'ab' for
216 'aab'. The shorter suffix of the two will always be a critical
218 if (max_suffix_rev
+ 1 < max_suffix
+ 1)
219 return max_suffix
+ 1;
221 return max_suffix_rev
+ 1;
224 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
225 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
226 method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
227 Performance is guaranteed to be linear, with an initialization cost
228 of 2 * NEEDLE_LEN comparisons.
230 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
231 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
232 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
233 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */
235 two_way_short_needle (const unsigned char *haystack
, size_t haystack_len
,
236 const unsigned char *needle
, size_t needle_len
)
238 size_t i
; /* Index into current byte of NEEDLE. */
239 size_t j
; /* Index into current window of HAYSTACK. */
240 size_t period
; /* The period of the right half of needle. */
241 size_t suffix
; /* The index of the right half of needle. */
243 /* Factor the needle into two halves, such that the left half is
244 smaller than the global period, and the right half is
245 periodic (with a period as large as NEEDLE_LEN - suffix). */
246 suffix
= critical_factorization (needle
, needle_len
, &period
);
248 /* Perform the search. Each iteration compares the right half
250 if (CMP_FUNC (needle
, needle
+ period
, suffix
) == 0)
252 /* Entire needle is periodic; a mismatch in the left half can
253 only advance by the period, so use memory to avoid rescanning
254 known occurrences of the period in the right half. */
257 while (AVAILABLE (haystack
, haystack_len
, j
, needle_len
))
259 /* Scan for matches in right half. */
260 i
= MAX (suffix
, memory
);
261 while (i
< needle_len
&& (CANON_ELEMENT (needle
[i
])
262 == CANON_ELEMENT (haystack
[i
+ j
])))
266 /* Scan for matches in left half. */
268 while (memory
< i
+ 1 && (CANON_ELEMENT (needle
[i
])
269 == CANON_ELEMENT (haystack
[i
+ j
])))
271 if (i
+ 1 < memory
+ 1)
272 return (RETURN_TYPE
) (haystack
+ j
);
273 /* No match, so remember how many repetitions of period
274 on the right half were scanned. */
276 memory
= needle_len
- period
;
287 /* The two halves of needle are distinct; no extra memory is
288 required, and any mismatch results in a maximal shift. */
289 period
= MAX (suffix
, needle_len
- suffix
) + 1;
291 while (AVAILABLE (haystack
, haystack_len
, j
, needle_len
))
293 /* Scan for matches in right half. */
295 while (i
< needle_len
&& (CANON_ELEMENT (needle
[i
])
296 == CANON_ELEMENT (haystack
[i
+ j
])))
300 /* Scan for matches in left half. */
302 while (i
!= SIZE_MAX
&& (CANON_ELEMENT (needle
[i
])
303 == CANON_ELEMENT (haystack
[i
+ j
])))
306 return (RETURN_TYPE
) (haystack
+ j
);
316 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
317 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
318 method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
319 Performance is guaranteed to be linear, with an initialization cost
320 of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
322 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
323 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
324 and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
325 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
326 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
327 sublinear performance is not possible. */
329 two_way_long_needle (const unsigned char *haystack
, size_t haystack_len
,
330 const unsigned char *needle
, size_t needle_len
)
332 size_t i
; /* Index into current byte of NEEDLE. */
333 size_t j
; /* Index into current window of HAYSTACK. */
334 size_t period
; /* The period of the right half of needle. */
335 size_t suffix
; /* The index of the right half of needle. */
336 size_t shift_table
[1U << CHAR_BIT
]; /* See below. */
338 /* Factor the needle into two halves, such that the left half is
339 smaller than the global period, and the right half is
340 periodic (with a period as large as NEEDLE_LEN - suffix). */
341 suffix
= critical_factorization (needle
, needle_len
, &period
);
343 /* Populate shift_table. For each possible byte value c,
344 shift_table[c] is the distance from the last occurrence of c to
345 the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
346 shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */
347 for (i
= 0; i
< 1U << CHAR_BIT
; i
++)
348 shift_table
[i
] = needle_len
;
349 for (i
= 0; i
< needle_len
; i
++)
350 shift_table
[CANON_ELEMENT (needle
[i
])] = needle_len
- i
- 1;
352 /* Perform the search. Each iteration compares the right half
354 if (CMP_FUNC (needle
, needle
+ period
, suffix
) == 0)
356 /* Entire needle is periodic; a mismatch in the left half can
357 only advance by the period, so use memory to avoid rescanning
358 known occurrences of the period in the right half. */
362 while (AVAILABLE (haystack
, haystack_len
, j
, needle_len
))
364 /* Check the last byte first; if it does not match, then
365 shift to the next possible match location. */
366 shift
= shift_table
[CANON_ELEMENT (haystack
[j
+ needle_len
- 1])];
369 if (memory
&& shift
< period
)
371 /* Since needle is periodic, but the last period has
372 a byte out of place, there can be no match until
373 after the mismatch. */
374 shift
= needle_len
- period
;
380 /* Scan for matches in right half. The last byte has
381 already been matched, by virtue of the shift table. */
382 i
= MAX (suffix
, memory
);
383 while (i
< needle_len
- 1 && (CANON_ELEMENT (needle
[i
])
384 == CANON_ELEMENT (haystack
[i
+ j
])))
386 if (needle_len
- 1 <= i
)
388 /* Scan for matches in left half. */
390 while (memory
< i
+ 1 && (CANON_ELEMENT (needle
[i
])
391 == CANON_ELEMENT (haystack
[i
+ j
])))
393 if (i
+ 1 < memory
+ 1)
394 return (RETURN_TYPE
) (haystack
+ j
);
395 /* No match, so remember how many repetitions of period
396 on the right half were scanned. */
398 memory
= needle_len
- period
;
409 /* The two halves of needle are distinct; no extra memory is
410 required, and any mismatch results in a maximal shift. */
412 period
= MAX (suffix
, needle_len
- suffix
) + 1;
414 while (AVAILABLE (haystack
, haystack_len
, j
, needle_len
))
416 /* Check the last byte first; if it does not match, then
417 shift to the next possible match location. */
418 shift
= shift_table
[CANON_ELEMENT (haystack
[j
+ needle_len
- 1])];
424 /* Scan for matches in right half. The last byte has
425 already been matched, by virtue of the shift table. */
427 while (i
< needle_len
- 1 && (CANON_ELEMENT (needle
[i
])
428 == CANON_ELEMENT (haystack
[i
+ j
])))
430 if (needle_len
- 1 <= i
)
432 /* Scan for matches in left half. */
434 while (i
!= SIZE_MAX
&& (CANON_ELEMENT (needle
[i
])
435 == CANON_ELEMENT (haystack
[i
+ j
])))
438 return (RETURN_TYPE
) (haystack
+ j
);