1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // The suffixarray package implements substring search in logarithmic time
6 // using an in-memory suffix array.
10 // // create index for some data
11 // index := suffixarray.New(data)
13 // // lookup byte slice s
14 // offsets1 := index.Lookup(s, -1) // the list of all indices where s occurs in data
15 // offsets2 := index.Lookup(s, 3) // the list of at most 3 indices where s occurs in data
25 // BUG(gri): For larger data (10MB) which contains very long (say 100000)
26 // contiguous sequences of identical bytes, index creation time will be extremely slow.
28 // TODO(gri): Use a more sophisticated algorithm to create the suffix array.
31 // Index implements a suffix array for fast substring search.
34 sa
[]int // suffix array for data
38 // New creates a new Index for data.
39 // Index creation time is approximately O(N*log(N)) for N = len(data).
41 func New(data
[]byte) *Index
{
42 sa
:= make([]int, len(data
))
43 for i
, _
:= range sa
{
47 sort
.Sort((*index
)(x
))
52 func (x
*Index
) at(i
int) []byte {
53 return x
.data
[x
.sa
[i
]:]
57 // Binary search according to "A Method of Programming", E.W. Dijkstra.
58 func (x
*Index
) search(s
[]byte) int {
60 // i < j for non-empty x
62 // 0 <= i < j <= len(x.sa) && (x.at(i) <= s < x.at(j) || (s is not in x))
63 h
:= i
+ (j
-i
)/2 // i < h < j
64 if bytes
.Compare(x
.at(h
), s
) <= 0 {
66 } else { // s < x.at(h)
70 // i+1 == j for non-empty x
75 // Lookup returns an unsorted list of at most n indices where the byte string s
76 // occurs in the indexed data. If n < 0, all occurrences are returned.
77 // The result is nil if s is empty, s is not found, or n == 0.
78 // Lookup time is O((log(N) + len(result))*len(s)) where N is the
79 // size of the indexed data.
81 func (x
*Index
) Lookup(s
[]byte, n
int) []int {
82 var res vector
.IntVector
84 if len(s
) > 0 && n
!= 0 {
85 // find matching suffix index i
87 // x.at(i) <= s < x.at(i+1)
89 // ignore the first suffix if it is < s
90 if i
< len(x
.sa
) && bytes
.Compare(x
.at(i
), s
) < 0 {
94 // collect the following suffixes with matching prefixes
95 for (n
< 0 ||
len(res
) < n
) && i
< len(x
.sa
) && bytes
.HasPrefix(x
.at(i
), s
) {
105 // index is used to hide the sort.Interface
108 func (x
*index
) Len() int { return len(x
.sa
) }
109 func (x
*index
) Less(i
, j
int) bool { return bytes
.Compare(x
.at(i
), x
.at(j
)) < 0 }
110 func (x
*index
) Swap(i
, j
int) { x
.sa
[i
], x
.sa
[j
] = x
.sa
[j
], x
.sa
[i
] }
111 func (a
*index
) at(i
int) []byte { return a
.data
[a
.sa
[i
]:] }