1 // Copyright 2012 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
7 // stringFinder efficiently finds strings in a source text. It's implemented
8 // using the Boyer-Moore string search algorithm:
9 // http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
10 // http://www.cs.utexas.edu/~moore/publications/fstrpos.pdf (note: this aged
11 // document uses 1-based indexing)
12 type stringFinder
struct {
13 // pattern is the string that we are searching for in the text.
16 // badCharSkip[b] contains the distance between the last byte of pattern
17 // and the rightmost occurrence of b in pattern. If b is not in pattern,
18 // badCharSkip[b] is len(pattern).
20 // Whenever a mismatch is found with byte b in the text, we can safely
21 // shift the matching frame at least badCharSkip[b] until the next time
22 // the matching char could be in alignment.
25 // goodSuffixSkip[i] defines how far we can shift the matching frame given
26 // that the suffix pattern[i+1:] matches, but the byte pattern[i] does
27 // not. There are two cases to consider:
29 // 1. The matched suffix occurs elsewhere in pattern (with a different
30 // byte preceding it that we might possibly match). In this case, we can
31 // shift the matching frame to align with the next suffix chunk. For
32 // example, the pattern "mississi" has the suffix "issi" next occurring
33 // (in right-to-left order) at index 1, so goodSuffixSkip[3] ==
34 // shift+len(suffix) == 3+4 == 7.
36 // 2. If the matched suffix does not occur elsewhere in pattern, then the
37 // matching frame may share part of its prefix with the end of the
38 // matching suffix. In this case, goodSuffixSkip[i] will contain how far
39 // to shift the frame to align this portion of the prefix to the
40 // suffix. For example, in the pattern "abcxxxabc", when the first
41 // mismatch from the back is found to be in position 3, the matching
42 // suffix "xxabc" is not found elsewhere in the pattern. However, its
43 // rightmost "abc" (at position 6) is a prefix of the whole pattern, so
44 // goodSuffixSkip[3] == shift+len(suffix) == 6+5 == 11.
48 func makeStringFinder(pattern
string) *stringFinder
{
51 goodSuffixSkip
: make([]int, len(pattern
)),
53 // last is the index of the last character in the pattern.
54 last
:= len(pattern
) - 1
56 // Build bad character table.
57 // Bytes not in the pattern can skip one pattern's length.
58 for i
:= range f
.badCharSkip
{
59 f
.badCharSkip
[i
] = len(pattern
)
61 // The loop condition is < instead of <= so that the last byte does not
62 // have a zero distance to itself. Finding this byte out of place implies
63 // that it is not in the last position.
64 for i
:= 0; i
< last
; i
++ {
65 f
.badCharSkip
[pattern
[i
]] = last
- i
68 // Build good suffix table.
69 // First pass: set each value to the next index which starts a prefix of
72 for i
:= last
; i
>= 0; i
-- {
73 if HasPrefix(pattern
, pattern
[i
+1:]) {
76 // lastPrefix is the shift, and (last-i) is len(suffix).
77 f
.goodSuffixSkip
[i
] = lastPrefix
+ last
- i
79 // Second pass: find repeats of pattern's suffix starting from the front.
80 for i
:= 0; i
< last
; i
++ {
81 lenSuffix
:= longestCommonSuffix(pattern
, pattern
[1:i
+1])
82 if pattern
[i
-lenSuffix
] != pattern
[last
-lenSuffix
] {
83 // (last-i) is the shift, and lenSuffix is len(suffix).
84 f
.goodSuffixSkip
[last
-lenSuffix
] = lenSuffix
+ last
- i
91 func longestCommonSuffix(a
, b
string) (i
int) {
92 for ; i
< len(a
) && i
< len(b
); i
++ {
93 if a
[len(a
)-1-i
] != b
[len(b
)-1-i
] {
100 // next returns the index in text of the first occurrence of the pattern. If
101 // the pattern is not found, it returns -1.
102 func (f
*stringFinder
) next(text
string) int {
103 i
:= len(f
.pattern
) - 1
105 // Compare backwards from the end until the first unmatching character.
106 j
:= len(f
.pattern
) - 1
107 for j
>= 0 && text
[i
] == f
.pattern
[j
] {
112 return i
+ 1 // match
114 i
+= max(f
.badCharSkip
[text
[i
]], f
.goodSuffixSkip
[j
])
119 func max(a
, b
int) int {