1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
9 // An index stored on disk has the format:
11 // "csearch index 1\n"
14 // list of posting lists
19 // The list of paths is a sorted sequence of NUL-terminated file or directory names.
20 // The index covers the file trees rooted at those paths.
21 // The list ends with an empty name ("\x00").
23 // The list of names is a sorted sequence of NUL-terminated file names.
24 // The initial entry in the list corresponds to file #0,
25 // the next to file #1, and so on. The list ends with an
26 // empty name ("\x00").
28 // The list of posting lists are a sequence of posting lists.
29 // Each posting list has the form:
34 // The trigram gives the 3 byte trigram that this list describes. The
35 // delta list is a sequence of varint-encoded deltas between file
36 // IDs, ending with a zero delta. For example, the delta list [2,5,1,1,0]
37 // encodes the file ID list 1, 6, 7, 8. The delta list [0] would
38 // encode the empty file ID list, but empty posting lists are usually
39 // not recorded at all. The list of posting lists ends with an entry
40 // with trigram "\xff\xff\xff" and a delta list consisting a single zero.
42 // The indexes enable efficient random access to the lists. The name
43 // index is a sequence of 4-byte big-endian values listing the byte
44 // offset in the name list where each name begins. The posting list
45 // index is a sequence of index entries describing each successive
46 // posting list. Each index entry has the form:
52 // Index entries are only written for the non-empty posting lists,
53 // so finding the posting list for a specific trigram requires a
54 // binary search over the posting list index. In practice, the majority
55 // of the possible trigrams are never seen, so omitting the missing
56 // ones represents a significant storage savings.
58 // The trailer has the form:
60 // offset of path list [4]
61 // offset of name list [4]
62 // offset of posting lists [4]
63 // offset of name index [4]
64 // offset of posting list index [4]
65 // "\ncsearch trailr\n"
78 magic
= "csearch index 1\n"
79 trailerMagic
= "\ncsearch trailr\n"
82 // An Index implements read-only access to a trigram index.
96 const postEntrySize
= 3 + 4 + 4
98 func Open(file
string) *Index
{
100 if len(mm
.d
) < 4*4+len(trailerMagic
) ||
string(mm
.d
[len(mm
.d
)-len(trailerMagic
):]) != trailerMagic
{
103 n
:= uint32(len(mm
.d
) - len(trailerMagic
) - 5*4)
104 ix
:= &Index
{data
: mm
}
106 ix
.pathData
= ix
.uint32(n
)
107 ix
.nameData
= ix
.uint32(n
+ 4)
108 ix
.postData
= ix
.uint32(n
+ 8)
109 ix
.nameIndex
= ix
.uint32(n
+ 12)
110 ix
.postIndex
= ix
.uint32(n
+ 16)
111 ix
.numName
= int((ix
.postIndex
-ix
.nameIndex
)/4) - 1
112 ix
.numPost
= int((n
- ix
.postIndex
) / postEntrySize
)
116 func (ix
*Index
) Close() {
117 if err
:= syscall
.Munmap(ix
.data
.orig
); err
!= nil {
118 log
.Fatalf("munmap: %v", err
)
123 // slice returns the slice of index data starting at the given byte offset.
124 // If n >= 0, the slice must have length at least n and is truncated to length n.
125 func (ix
*Index
) slice(off
uint32, n
int) []byte {
127 if uint32(o
) != off || n
>= 0 && o
+n
> len(ix
.data
.d
) {
133 return ix
.data
.d
[o
: o
+n
]
136 // uint32 returns the uint32 value at the given offset in the index data.
137 func (ix
*Index
) uint32(off
uint32) uint32 {
138 return binary
.BigEndian
.Uint32(ix
.slice(off
, 4))
141 // uvarint returns the varint value at the given offset in the index data.
142 func (ix
*Index
) uvarint(off
uint32) uint32 {
143 v
, n
:= binary
.Uvarint(ix
.slice(off
, -1))
150 // Paths returns the list of indexed paths.
151 func (ix
*Index
) Paths() []string {
159 x
= append(x
, string(s
))
160 off
+= uint32(len(s
) + 1)
165 // NameBytes returns the name corresponding to the given fileid.
166 func (ix
*Index
) NameBytes(fileid
uint32) []byte {
167 off
:= ix
.uint32(ix
.nameIndex
+ 4*fileid
)
168 return ix
.str(ix
.nameData
+ off
)
171 func (ix
*Index
) str(off
uint32) []byte {
172 str
:= ix
.slice(off
, -1)
173 i
:= bytes
.IndexByte(str
, '\x00')
180 // Name returns the name corresponding to the given fileid.
181 func (ix
*Index
) Name(fileid
uint32) string {
182 return string(ix
.NameBytes(fileid
))
185 // listAt returns the index list entry at the given offset.
186 func (ix
*Index
) listAt(off
uint32) (trigram
, count
, offset
uint32) {
187 d
:= ix
.slice(ix
.postIndex
+off
, postEntrySize
)
188 trigram
= uint32(d
[0])<<16 |
uint32(d
[1])<<8 |
uint32(d
[2])
189 count
= binary
.BigEndian
.Uint32(d
[3:])
190 offset
= binary
.BigEndian
.Uint32(d
[3+4:])
194 func (ix
*Index
) dumpPosting() {
195 d
:= ix
.slice(ix
.postIndex
, postEntrySize
*ix
.numPost
)
196 for i
:= 0; i
< ix
.numPost
; i
++ {
197 j
:= i
* postEntrySize
198 t
:= uint32(d
[j
])<<16 |
uint32(d
[j
+1])<<8 |
uint32(d
[j
+2])
199 count
:= int(binary
.BigEndian
.Uint32(d
[j
+3:]))
200 offset
:= binary
.BigEndian
.Uint32(d
[j
+3+4:])
201 log
.Printf("%#x: %d at %d", t
, count
, offset
)
205 func (ix
*Index
) findList(trigram
uint32) (count
int, offset
uint32) {
207 d
:= ix
.slice(ix
.postIndex
, postEntrySize
*ix
.numPost
)
208 i
:= sort
.Search(ix
.numPost
, func(i
int) bool {
210 t
:= uint32(d
[i
])<<16 |
uint32(d
[i
+1])<<8 |
uint32(d
[i
+2])
217 t
:= uint32(d
[i
])<<16 |
uint32(d
[i
+1])<<8 |
uint32(d
[i
+2])
221 count
= int(binary
.BigEndian
.Uint32(d
[i
+3:]))
222 offset
= binary
.BigEndian
.Uint32(d
[i
+3+4:])
226 type postReader
struct {
235 func (r
*postReader
) init(ix
*Index
, trigram
uint32, restrict
[]uint32) {
236 count
, offset
:= ix
.findList(trigram
)
243 r
.fileid
= ^uint32(0)
244 r
.d
= ix
.slice(ix
.postData
+offset
+3, -1)
245 r
.restrict
= restrict
248 func (r
*postReader
) max() int {
252 func (r
*postReader
) next() bool {
255 delta64
, n
:= binary
.Uvarint(r
.d
)
256 delta
:= uint32(delta64
)
257 if n
<= 0 || delta
== 0 {
262 if r
.restrict
!= nil {
264 for i
< len(r
.restrict
) && r
.restrict
[i
] < r
.fileid
{
267 r
.restrict
= r
.restrict
[i
:]
268 if len(r
.restrict
) == 0 || r
.restrict
[0] != r
.fileid
{
274 // list should end with terminating 0 delta
275 if r
.d
!= nil && (len(r
.d
) == 0 || r
.d
[0] != 0) {
278 r
.fileid
= ^uint32(0)
282 func (ix
*Index
) PostingList(trigram
uint32) []uint32 {
283 return ix
.postingList(trigram
, nil)
286 func (ix
*Index
) postingList(trigram
uint32, restrict
[]uint32) []uint32 {
288 r
.init(ix
, trigram
, restrict
)
289 return myPostingList(r
.d
, r
.max(), restrict
)
292 func (ix
*Index
) PostingAnd(list
[]uint32, trigram
uint32) []uint32 {
293 return ix
.postingAnd(list
, trigram
, nil)
296 func (ix
*Index
) postingAnd(list
[]uint32, trigram
uint32, restrict
[]uint32) []uint32 {
298 r
.init(ix
, trigram
, restrict
)
299 return myPostingAnd(r
.d
, r
.max(), list
, restrict
)
302 func (ix
*Index
) PostingOr(list
[]uint32, trigram
uint32) []uint32 {
303 return ix
.postingOr(list
, trigram
, nil)
306 func (ix
*Index
) postingOr(list
[]uint32, trigram
uint32, restrict
[]uint32) []uint32 {
308 r
.init(ix
, trigram
, restrict
)
309 return myPostingOr(r
.d
, r
.max(), list
, restrict
)
312 func (ix
*Index
) PostingQuery(q
*Query
) []uint32 {
313 return ix
.postingQuery(q
, nil)
316 // Implements sort.Interface
317 type trigramCnt
struct {
323 type trigramCnts
[]trigramCnt
325 func (t trigramCnts
) Len() int {
329 func (t trigramCnts
) Less(i
, j
int) bool {
330 return t
[i
].count
< t
[j
].count
333 func (t trigramCnts
) Swap(i
, j
int) {
334 t
[i
], t
[j
] = t
[j
], t
[i
]
337 func (ix
*Index
) postingQuery(q
*Query
, restrict
[]uint32) (ret
[]uint32) {
346 list
= make([]uint32, ix
.numName
)
347 for i
:= range list
{
352 // "Query planner": we first sort the posting lists by their
353 // length (ascending)
354 withCount
:= make(trigramCnts
, len(q
.Trigram
))
355 for idx
, t
:= range q
.Trigram
{
356 tri
:= uint32(t
[0])<<16 |
uint32(t
[1])<<8 |
uint32(t
[2])
357 count
, _
:= ix
.findList(tri
)
358 withCount
[idx
] = trigramCnt
{tri
, count
, 0}
363 for idx
, t
:= range withCount
{
364 previous
:= len(list
)
366 list
= ix
.postingList(t
.trigram
, restrict
)
368 list
= ix
.postingAnd(list
, t
.trigram
, restrict
)
373 withCount
[idx
].listcnt
= len(list
)
375 minIdx
:= 0.70 * float32(len(withCount
))
376 if (previous
-len(list
)) < 10 && stoppedAt
== 0 && float32(idx
) > minIdx
{
377 stoppedAt
= len(list
)
380 if previous
> 0 && (previous
-len(list
)) < 10 {
381 //fmt.Printf("difference is %d, break!\n", previous - len(list))
386 for _
, sub
:= range q
.Sub
{
390 list
= ix
.postingQuery(sub
, list
)
396 for _
, t
:= range q
.Trigram
{
397 tri
:= uint32(t
[0])<<16 |
uint32(t
[1])<<8 |
uint32(t
[2])
399 list
= ix
.postingList(tri
, restrict
)
401 list
= ix
.postingOr(list
, tri
, restrict
)
404 for _
, sub
:= range q
.Sub
{
405 list1
:= ix
.postingQuery(sub
, restrict
)
406 list
= mergeOr(list
, list1
)
412 func mergeOr(l1
, l2
[]uint32) []uint32 {
416 for i
< len(l1
) || j
< len(l2
) {
418 case j
== len(l2
) ||
(i
< len(l1
) && l1
[i
] < l2
[j
]):
421 case i
== len(l1
) ||
(j
< len(l2
) && l1
[i
] > l2
[j
]):
433 func corrupt(file
string) {
434 log
.Fatal("corrupt index: remove " + file
)
437 // An mmapData is mmap'ed read-only data from a file.
438 type mmapData
struct {
444 // mmap maps the given file into memory.
445 func mmap(file
string) mmapData
{
446 f
, err
:= os
.Open(file
)
453 // File returns the name of the index file to use.
454 // It is either $CSEARCHINDEX or $HOME/.csearchindex.
456 f
:= os
.Getenv("CSEARCHINDEX")
461 if runtime
.GOOS
== "windows" {
462 home
= os
.Getenv("HOMEPATH")
464 home
= os
.Getenv("HOME")
466 return home
+ "/.csearchindex"