Merge from mainline (167278:168000).
[official-gcc/graphite-test-results.git] / libgo / go / index / suffixarray / suffixarray.go
blob0a17472962fadb036d35848c955c55c47ff15cc8
1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // The suffixarray package implements substring search in logarithmic time
6 // using an in-memory suffix array.
7 //
8 // Example use:
9 //
10 // // create index for some data
11 // index := suffixarray.New(data)
13 // // lookup byte slice s
14 // offsets1 := index.Lookup(s, -1) // the list of all indices where s occurs in data
15 // offsets2 := index.Lookup(s, 3) // the list of at most 3 indices where s occurs in data
17 package suffixarray
19 import (
20 "bytes"
21 "container/vector"
22 "sort"
25 // BUG(gri): For larger data (10MB) which contains very long (say 100000)
26 // contiguous sequences of identical bytes, index creation time will be extremely slow.
28 // TODO(gri): Use a more sophisticated algorithm to create the suffix array.
31 // Index implements a suffix array for fast substring search.
32 type Index struct {
33 data []byte
34 sa []int // suffix array for data
38 // New creates a new Index for data.
39 // Index creation time is approximately O(N*log(N)) for N = len(data).
41 func New(data []byte) *Index {
42 sa := make([]int, len(data))
43 for i, _ := range sa {
44 sa[i] = i
46 x := &Index{data, sa}
47 sort.Sort((*index)(x))
48 return x
52 func (x *Index) at(i int) []byte {
53 return x.data[x.sa[i]:]
57 // Binary search according to "A Method of Programming", E.W. Dijkstra.
58 func (x *Index) search(s []byte) int {
59 i, j := 0, len(x.sa)
60 // i < j for non-empty x
61 for i+1 < j {
62 // 0 <= i < j <= len(x.sa) && (x.at(i) <= s < x.at(j) || (s is not in x))
63 h := i + (j-i)/2 // i < h < j
64 if bytes.Compare(x.at(h), s) <= 0 {
65 i = h
66 } else { // s < x.at(h)
67 j = h
70 // i+1 == j for non-empty x
71 return i
75 // Lookup returns an unsorted list of at most n indices where the byte string s
76 // occurs in the indexed data. If n < 0, all occurrences are returned.
77 // The result is nil if s is empty, s is not found, or n == 0.
78 // Lookup time is O((log(N) + len(result))*len(s)) where N is the
79 // size of the indexed data.
81 func (x *Index) Lookup(s []byte, n int) []int {
82 var res vector.IntVector
84 if len(s) > 0 && n != 0 {
85 // find matching suffix index i
86 i := x.search(s)
87 // x.at(i) <= s < x.at(i+1)
89 // ignore the first suffix if it is < s
90 if i < len(x.sa) && bytes.Compare(x.at(i), s) < 0 {
91 i++
94 // collect the following suffixes with matching prefixes
95 for (n < 0 || len(res) < n) && i < len(x.sa) && bytes.HasPrefix(x.at(i), s) {
96 res.Push(x.sa[i])
97 i++
101 return res
105 // index is used to hide the sort.Interface
106 type index Index
108 func (x *index) Len() int { return len(x.sa) }
109 func (x *index) Less(i, j int) bool { return bytes.Compare(x.at(i), x.at(j)) < 0 }
110 func (x *index) Swap(i, j int) { x.sa[i], x.sa[j] = x.sa[j], x.sa[i] }
111 func (a *index) at(i int) []byte { return a.data[a.sa[i]:] }