fix server-rendered results by generating result pages into a buffer and reading...
[debiancodesearch.git] / index / read.go
blob49e3d6f3bc5a4ea485a55fe7aa4dd50c8a013d4b
1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 package index
7 // Index format.
8 //
9 // An index stored on disk has the format:
11 // "csearch index 1\n"
12 // list of paths
13 // list of names
14 // list of posting lists
15 // name index
16 // posting list index
17 // trailer
19 // The list of paths is a sorted sequence of NUL-terminated file or directory names.
20 // The index covers the file trees rooted at those paths.
21 // The list ends with an empty name ("\x00").
23 // The list of names is a sorted sequence of NUL-terminated file names.
24 // The initial entry in the list corresponds to file #0,
25 // the next to file #1, and so on. The list ends with an
26 // empty name ("\x00").
28 // The list of posting lists are a sequence of posting lists.
29 // Each posting list has the form:
31 // trigram [3]
32 // deltas [v]...
34 // The trigram gives the 3 byte trigram that this list describes. The
35 // delta list is a sequence of varint-encoded deltas between file
36 // IDs, ending with a zero delta. For example, the delta list [2,5,1,1,0]
37 // encodes the file ID list 1, 6, 7, 8. The delta list [0] would
38 // encode the empty file ID list, but empty posting lists are usually
39 // not recorded at all. The list of posting lists ends with an entry
40 // with trigram "\xff\xff\xff" and a delta list consisting a single zero.
42 // The indexes enable efficient random access to the lists. The name
43 // index is a sequence of 4-byte big-endian values listing the byte
44 // offset in the name list where each name begins. The posting list
45 // index is a sequence of index entries describing each successive
46 // posting list. Each index entry has the form:
48 // trigram [3]
49 // file count [4]
50 // offset [4]
52 // Index entries are only written for the non-empty posting lists,
53 // so finding the posting list for a specific trigram requires a
54 // binary search over the posting list index. In practice, the majority
55 // of the possible trigrams are never seen, so omitting the missing
56 // ones represents a significant storage savings.
58 // The trailer has the form:
60 // offset of path list [4]
61 // offset of name list [4]
62 // offset of posting lists [4]
63 // offset of name index [4]
64 // offset of posting list index [4]
65 // "\ncsearch trailr\n"
67 import (
68 "bytes"
69 "encoding/binary"
70 "log"
71 "os"
72 "runtime"
73 "sort"
74 "syscall"
77 const (
78 magic = "csearch index 1\n"
79 trailerMagic = "\ncsearch trailr\n"
82 // An Index implements read-only access to a trigram index.
83 type Index struct {
84 File string
85 Verbose bool
86 data mmapData
87 pathData uint32
88 nameData uint32
89 postData uint32
90 nameIndex uint32
91 postIndex uint32
92 numName int
93 numPost int
96 const postEntrySize = 3 + 4 + 4
98 func Open(file string) *Index {
99 mm := mmap(file)
100 if len(mm.d) < 4*4+len(trailerMagic) || string(mm.d[len(mm.d)-len(trailerMagic):]) != trailerMagic {
101 corrupt(file)
103 n := uint32(len(mm.d) - len(trailerMagic) - 5*4)
104 ix := &Index{data: mm}
105 ix.File = file
106 ix.pathData = ix.uint32(n)
107 ix.nameData = ix.uint32(n + 4)
108 ix.postData = ix.uint32(n + 8)
109 ix.nameIndex = ix.uint32(n + 12)
110 ix.postIndex = ix.uint32(n + 16)
111 ix.numName = int((ix.postIndex-ix.nameIndex)/4) - 1
112 ix.numPost = int((n - ix.postIndex) / postEntrySize)
113 return ix
116 func (ix *Index) Close() {
117 if err := syscall.Munmap(ix.data.orig); err != nil {
118 log.Fatalf("munmap: %v", err)
120 ix.data.f.Close()
123 // slice returns the slice of index data starting at the given byte offset.
124 // If n >= 0, the slice must have length at least n and is truncated to length n.
125 func (ix *Index) slice(off uint32, n int) []byte {
126 o := int(off)
127 if uint32(o) != off || n >= 0 && o+n > len(ix.data.d) {
128 corrupt(ix.File)
130 if n < 0 {
131 return ix.data.d[o:]
133 return ix.data.d[o : o+n]
136 // uint32 returns the uint32 value at the given offset in the index data.
137 func (ix *Index) uint32(off uint32) uint32 {
138 return binary.BigEndian.Uint32(ix.slice(off, 4))
141 // uvarint returns the varint value at the given offset in the index data.
142 func (ix *Index) uvarint(off uint32) uint32 {
143 v, n := binary.Uvarint(ix.slice(off, -1))
144 if n <= 0 {
145 corrupt(ix.File)
147 return uint32(v)
150 // Paths returns the list of indexed paths.
151 func (ix *Index) Paths() []string {
152 off := ix.pathData
153 var x []string
154 for {
155 s := ix.str(off)
156 if len(s) == 0 {
157 break
159 x = append(x, string(s))
160 off += uint32(len(s) + 1)
162 return x
165 // NameBytes returns the name corresponding to the given fileid.
166 func (ix *Index) NameBytes(fileid uint32) []byte {
167 off := ix.uint32(ix.nameIndex + 4*fileid)
168 return ix.str(ix.nameData + off)
171 func (ix *Index) str(off uint32) []byte {
172 str := ix.slice(off, -1)
173 i := bytes.IndexByte(str, '\x00')
174 if i < 0 {
175 corrupt(ix.File)
177 return str[:i]
180 // Name returns the name corresponding to the given fileid.
181 func (ix *Index) Name(fileid uint32) string {
182 return string(ix.NameBytes(fileid))
185 // listAt returns the index list entry at the given offset.
186 func (ix *Index) listAt(off uint32) (trigram, count, offset uint32) {
187 d := ix.slice(ix.postIndex+off, postEntrySize)
188 trigram = uint32(d[0])<<16 | uint32(d[1])<<8 | uint32(d[2])
189 count = binary.BigEndian.Uint32(d[3:])
190 offset = binary.BigEndian.Uint32(d[3+4:])
191 return
194 func (ix *Index) dumpPosting() {
195 d := ix.slice(ix.postIndex, postEntrySize*ix.numPost)
196 for i := 0; i < ix.numPost; i++ {
197 j := i * postEntrySize
198 t := uint32(d[j])<<16 | uint32(d[j+1])<<8 | uint32(d[j+2])
199 count := int(binary.BigEndian.Uint32(d[j+3:]))
200 offset := binary.BigEndian.Uint32(d[j+3+4:])
201 log.Printf("%#x: %d at %d", t, count, offset)
205 func (ix *Index) findList(trigram uint32) (count int, offset uint32) {
206 // binary search
207 d := ix.slice(ix.postIndex, postEntrySize*ix.numPost)
208 i := sort.Search(ix.numPost, func(i int) bool {
209 i *= postEntrySize
210 t := uint32(d[i])<<16 | uint32(d[i+1])<<8 | uint32(d[i+2])
211 return t >= trigram
213 if i >= ix.numPost {
214 return 0, 0
216 i *= postEntrySize
217 t := uint32(d[i])<<16 | uint32(d[i+1])<<8 | uint32(d[i+2])
218 if t != trigram {
219 return 0, 0
221 count = int(binary.BigEndian.Uint32(d[i+3:]))
222 offset = binary.BigEndian.Uint32(d[i+3+4:])
223 return
226 type postReader struct {
227 ix *Index
228 count int
229 offset uint32
230 fileid uint32
231 d []byte
232 restrict []uint32
235 func (r *postReader) init(ix *Index, trigram uint32, restrict []uint32) {
236 count, offset := ix.findList(trigram)
237 if count == 0 {
238 return
240 r.ix = ix
241 r.count = count
242 r.offset = offset
243 r.fileid = ^uint32(0)
244 r.d = ix.slice(ix.postData+offset+3, -1)
245 r.restrict = restrict
248 func (r *postReader) max() int {
249 return int(r.count)
252 func (r *postReader) next() bool {
253 for r.count > 0 {
254 r.count--
255 delta64, n := binary.Uvarint(r.d)
256 delta := uint32(delta64)
257 if n <= 0 || delta == 0 {
258 corrupt(r.ix.File)
260 r.d = r.d[n:]
261 r.fileid += delta
262 if r.restrict != nil {
263 i := 0
264 for i < len(r.restrict) && r.restrict[i] < r.fileid {
267 r.restrict = r.restrict[i:]
268 if len(r.restrict) == 0 || r.restrict[0] != r.fileid {
269 continue
272 return true
274 // list should end with terminating 0 delta
275 if r.d != nil && (len(r.d) == 0 || r.d[0] != 0) {
276 corrupt(r.ix.File)
278 r.fileid = ^uint32(0)
279 return false
282 func (ix *Index) PostingList(trigram uint32) []uint32 {
283 return ix.postingList(trigram, nil)
286 func (ix *Index) postingList(trigram uint32, restrict []uint32) []uint32 {
287 var r postReader
288 r.init(ix, trigram, restrict)
289 return myPostingList(r.d, r.max(), restrict)
292 func (ix *Index) PostingAnd(list []uint32, trigram uint32) []uint32 {
293 return ix.postingAnd(list, trigram, nil)
296 func (ix *Index) postingAnd(list []uint32, trigram uint32, restrict []uint32) []uint32 {
297 var r postReader
298 r.init(ix, trigram, restrict)
299 return myPostingAnd(r.d, r.max(), list, restrict)
302 func (ix *Index) PostingOr(list []uint32, trigram uint32) []uint32 {
303 return ix.postingOr(list, trigram, nil)
306 func (ix *Index) postingOr(list []uint32, trigram uint32, restrict []uint32) []uint32 {
307 var r postReader
308 r.init(ix, trigram, restrict)
309 return myPostingOr(r.d, r.max(), list, restrict)
312 func (ix *Index) PostingQuery(q *Query) []uint32 {
313 return ix.postingQuery(q, nil)
316 // Implements sort.Interface
317 type trigramCnt struct {
318 trigram uint32
319 count int
320 listcnt int
323 type trigramCnts []trigramCnt
325 func (t trigramCnts) Len() int {
326 return len(t)
329 func (t trigramCnts) Less(i, j int) bool {
330 return t[i].count < t[j].count
333 func (t trigramCnts) Swap(i, j int) {
334 t[i], t[j] = t[j], t[i]
337 func (ix *Index) postingQuery(q *Query, restrict []uint32) (ret []uint32) {
338 var list []uint32
339 switch q.Op {
340 case QNone:
341 // nothing
342 case QAll:
343 if restrict != nil {
344 return restrict
346 list = make([]uint32, ix.numName)
347 for i := range list {
348 list[i] = uint32(i)
350 return list
351 case QAnd:
352 // "Query planner": we first sort the posting lists by their
353 // length (ascending)
354 withCount := make(trigramCnts, len(q.Trigram))
355 for idx, t := range q.Trigram {
356 tri := uint32(t[0])<<16 | uint32(t[1])<<8 | uint32(t[2])
357 count, _ := ix.findList(tri)
358 withCount[idx] = trigramCnt{tri, count, 0}
360 sort.Sort(withCount)
362 stoppedAt := 0
363 for idx, t := range withCount {
364 previous := len(list)
365 if list == nil {
366 list = ix.postingList(t.trigram, restrict)
367 } else {
368 list = ix.postingAnd(list, t.trigram, restrict)
370 if len(list) == 0 {
371 return nil
373 withCount[idx].listcnt = len(list)
374 if previous > 0 {
375 minIdx := 0.70 * float32(len(withCount))
376 if (previous-len(list)) < 10 && stoppedAt == 0 && float32(idx) > minIdx {
377 stoppedAt = len(list)
380 if previous > 0 && (previous-len(list)) < 10 {
381 //fmt.Printf("difference is %d, break!\n", previous - len(list))
382 break
386 for _, sub := range q.Sub {
387 if list == nil {
388 list = restrict
390 list = ix.postingQuery(sub, list)
391 if len(list) == 0 {
392 return nil
395 case QOr:
396 for _, t := range q.Trigram {
397 tri := uint32(t[0])<<16 | uint32(t[1])<<8 | uint32(t[2])
398 if list == nil {
399 list = ix.postingList(tri, restrict)
400 } else {
401 list = ix.postingOr(list, tri, restrict)
404 for _, sub := range q.Sub {
405 list1 := ix.postingQuery(sub, restrict)
406 list = mergeOr(list, list1)
409 return list
412 func mergeOr(l1, l2 []uint32) []uint32 {
413 var l []uint32
414 i := 0
415 j := 0
416 for i < len(l1) || j < len(l2) {
417 switch {
418 case j == len(l2) || (i < len(l1) && l1[i] < l2[j]):
419 l = append(l, l1[i])
421 case i == len(l1) || (j < len(l2) && l1[i] > l2[j]):
422 l = append(l, l2[j])
424 case l1[i] == l2[j]:
425 l = append(l, l1[i])
430 return l
433 func corrupt(file string) {
434 log.Fatal("corrupt index: remove " + file)
437 // An mmapData is mmap'ed read-only data from a file.
438 type mmapData struct {
439 f *os.File
440 d []byte
441 orig []byte
444 // mmap maps the given file into memory.
445 func mmap(file string) mmapData {
446 f, err := os.Open(file)
447 if err != nil {
448 log.Fatal(err)
450 return mmapFile(f)
453 // File returns the name of the index file to use.
454 // It is either $CSEARCHINDEX or $HOME/.csearchindex.
455 func File() string {
456 f := os.Getenv("CSEARCHINDEX")
457 if f != "" {
458 return f
460 var home string
461 if runtime.GOOS == "windows" {
462 home = os.Getenv("HOMEPATH")
463 } else {
464 home = os.Getenv("HOME")
466 return home + "/.csearchindex"