index/read.go

   1 // Copyright 2011 The Go Authors.  All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package index
   6
   7 // Index format.
   8 //
   9 // An index stored on disk has the format:
  10 //
  11 //      "csearch index 1\n"
  12 //      list of paths
  13 //      list of names
  14 //      list of posting lists
  15 //      name index
  16 //      posting list index
  17 //      trailer
  18 //
  19 // The list of paths is a sorted sequence of NUL-terminated file or directory names.
  20 // The index covers the file trees rooted at those paths.
  21 // The list ends with an empty name ("\x00").
  22 //
  23 // The list of names is a sorted sequence of NUL-terminated file names.
  24 // The initial entry in the list corresponds to file #0,
  25 // the next to file #1, and so on.  The list ends with an
  26 // empty name ("\x00").
  27 //
  28 // The list of posting lists is a sequence of posting lists.
  29 // Each posting list has the form:
  30 //
  31 //      trigram [3]
  32 //      deltas [v]...
  33 //
  34 // The trigram gives the 3 byte trigram that this list describes.  The
  35 // delta list is a sequence of varint-encoded deltas between file
  36 // IDs, ending with a zero delta.  For example, the delta list [2,5,1,1,0]
  37 // encodes the file ID list 1, 6, 7, 8.  The delta list [0] would
  38 // encode the empty file ID list, but empty posting lists are usually
  39 // not recorded at all.  The list of posting lists ends with an entry
  40 // with trigram "\xff\xff\xff" and a delta list consisting of a single zero.
  41 //
  42 // The indexes enable efficient random access to the lists.  The name
  43 // index is a sequence of 4-byte big-endian values listing the byte
  44 // offset in the name list where each name begins.  The posting list
  45 // index is a sequence of index entries describing each successive
  46 // posting list.  Each index entry has the form:
  47 //
  48 //      trigram [3]
  49 //      file count [4]
  50 //      offset [4]
  51 //
  52 // Index entries are only written for the non-empty posting lists,
  53 // so finding the posting list for a specific trigram requires a
  54 // binary search over the posting list index.  In practice, the majority
  55 // of the possible trigrams are never seen, so omitting the missing
  56 // ones represents a significant storage savings.
  57 //
  58 // The trailer has the form:
  59 //
  60 //      offset of path list [4]
  61 //      offset of name list [4]
  62 //      offset of posting lists [4]
  63 //      offset of name index [4]
  64 //      offset of posting list index [4]
  65 //      "\ncsearch trailr\n"
  66
  67 import (
  68         "bytes"
  69         "encoding/binary"
  70         "log"
  71         "os"
  72         "runtime"
  73         "sort"
  74         "syscall"
  75 )
  76
const (
	// magic is the header string that the index format places at the
	// start of an index file.
	magic        = "csearch index 1\n"
	// trailerMagic is the marker string that ends an index file; Open
	// checks for it before reading the trailer offsets.
	trailerMagic = "\ncsearch trailr\n"
)
  81
// An Index implements read-only access to a trigram index.
// The offset fields are read from the file trailer by Open.
type Index struct {
	File      string   // name of the index file; set by Open and named in corruption reports
	Verbose   bool     // not read in this part of the file; NOTE(review): presumably enables verbose logging — confirm
	data      mmapData // mapped contents of the index file
	pathData  uint32   // offset of the path list
	nameData  uint32   // offset of the name list
	postData  uint32   // offset of the posting lists
	nameIndex uint32   // offset of the name index
	postIndex uint32   // offset of the posting list index
	numName   int      // name count, derived in Open from the size of the name index
	numPost   int      // posting list index entry count, derived in Open
}
  95
// postEntrySize is the size in bytes of one posting list index entry:
// a 3-byte trigram, a 4-byte file count, and a 4-byte offset.
const postEntrySize = 3 + 4 + 4
  97
  98 func Open(file string) *Index {
  99         mm := mmap(file)
 100         if len(mm.d) < 4*4+len(trailerMagic) || string(mm.d[len(mm.d)-len(trailerMagic):]) != trailerMagic {
 101                 corrupt(file)
 102         }
 103         n := uint32(len(mm.d) - len(trailerMagic) - 5*4)
 104         ix := &Index{data: mm}
 105         ix.File = file
 106         ix.pathData = ix.uint32(n)
 107         ix.nameData = ix.uint32(n + 4)
 108         ix.postData = ix.uint32(n + 8)
 109         ix.nameIndex = ix.uint32(n + 12)
 110         ix.postIndex = ix.uint32(n + 16)
 111         ix.numName = int((ix.postIndex-ix.nameIndex)/4) - 1
 112         ix.numPost = int((n - ix.postIndex) / postEntrySize)
 113         return ix
 114 }
 115
 116 func (ix *Index) Close() {
 117         if err := syscall.Munmap(ix.data.orig); err != nil {
 118                 log.Fatalf("munmap: %v", err)
 119         }
 120         ix.data.f.Close()
 121 }
 122
 123 // slice returns the slice of index data starting at the given byte offset.
 124 // If n >= 0, the slice must have length at least n and is truncated to length n.
 125 func (ix *Index) slice(off uint32, n int) []byte {
 126         o := int(off)
 127         if uint32(o) != off || n >= 0 && o+n > len(ix.data.d) {
 128                 corrupt(ix.File)
 129         }
 130         if n < 0 {
 131                 return ix.data.d[o:]
 132         }
 133         return ix.data.d[o : o+n]
 134 }
 135
 136 // uint32 returns the uint32 value at the given offset in the index data.
 137 func (ix *Index) uint32(off uint32) uint32 {
 138         return binary.BigEndian.Uint32(ix.slice(off, 4))
 139 }
 140
 141 // uvarint returns the varint value at the given offset in the index data.
 142 func (ix *Index) uvarint(off uint32) uint32 {
 143         v, n := binary.Uvarint(ix.slice(off, -1))
 144         if n <= 0 {
 145                 corrupt(ix.File)
 146         }
 147         return uint32(v)
 148 }
 149
 150 // Paths returns the list of indexed paths.
 151 func (ix *Index) Paths() []string {
 152         off := ix.pathData
 153         var x []string
 154         for {
 155                 s := ix.str(off)
 156                 if len(s) == 0 {
 157                         break
 158                 }
 159                 x = append(x, string(s))
 160                 off += uint32(len(s) + 1)
 161         }
 162         return x
 163 }
 164
 165 // NameBytes returns the name corresponding to the given fileid.
 166 func (ix *Index) NameBytes(fileid uint32) []byte {
 167         off := ix.uint32(ix.nameIndex + 4*fileid)
 168         return ix.str(ix.nameData + off)
 169 }
 170
 171 func (ix *Index) str(off uint32) []byte {
 172         str := ix.slice(off, -1)
 173         i := bytes.IndexByte(str, '\x00')
 174         if i < 0 {
 175                 corrupt(ix.File)
 176         }
 177         return str[:i]
 178 }
 179
 180 // Name returns the name corresponding to the given fileid.
 181 func (ix *Index) Name(fileid uint32) string {
 182         return string(ix.NameBytes(fileid))
 183 }
 184
 185 // listAt returns the index list entry at the given offset.
 186 func (ix *Index) listAt(off uint32) (trigram, count, offset uint32) {
 187         d := ix.slice(ix.postIndex+off, postEntrySize)
 188         trigram = uint32(d[0])<<16 | uint32(d[1])<<8 | uint32(d[2])
 189         count = binary.BigEndian.Uint32(d[3:])
 190         offset = binary.BigEndian.Uint32(d[3+4:])
 191         return
 192 }
 193
 194 func (ix *Index) dumpPosting() {
 195         d := ix.slice(ix.postIndex, postEntrySize*ix.numPost)
 196         for i := 0; i < ix.numPost; i++ {
 197                 j := i * postEntrySize
 198                 t := uint32(d[j])<<16 | uint32(d[j+1])<<8 | uint32(d[j+2])
 199                 count := int(binary.BigEndian.Uint32(d[j+3:]))
 200                 offset := binary.BigEndian.Uint32(d[j+3+4:])
 201                 log.Printf("%#x: %d at %d", t, count, offset)
 202         }
 203 }
 204
 205 func (ix *Index) findList(trigram uint32) (count int, offset uint32) {
 206         // binary search
 207         d := ix.slice(ix.postIndex, postEntrySize*ix.numPost)
 208         i := sort.Search(ix.numPost, func(i int) bool {
 209                 i *= postEntrySize
 210                 t := uint32(d[i])<<16 | uint32(d[i+1])<<8 | uint32(d[i+2])
 211                 return t >= trigram
 212         })
 213         if i >= ix.numPost {
 214                 return 0, 0
 215         }
 216         i *= postEntrySize
 217         t := uint32(d[i])<<16 | uint32(d[i+1])<<8 | uint32(d[i+2])
 218         if t != trigram {
 219                 return 0, 0
 220         }
 221         count = int(binary.BigEndian.Uint32(d[i+3:]))
 222         offset = binary.BigEndian.Uint32(d[i+3+4:])
 223         return
 224 }
 225
// A postReader iterates over the file IDs of one posting list.
// It is set up by init and advanced by next.
type postReader struct {
	ix       *Index   // index being read; left nil by init when the trigram has no list
	count    int      // entries not yet consumed by next
	offset   uint32   // posting list offset as returned by findList
	fileid   uint32   // current file ID; ^uint32(0) before the first and after the last entry
	d        []byte   // undecoded remainder of the delta list
	restrict []uint32 // if non-nil, next only yields IDs present here; NOTE(review): next scans it forward, so it presumably must be ascending — confirm
}
 234
 235 func (r *postReader) init(ix *Index, trigram uint32, restrict []uint32) {
 236         count, offset := ix.findList(trigram)
 237         if count == 0 {
 238                 return
 239         }
 240         r.ix = ix
 241         r.count = count
 242         r.offset = offset
 243         r.fileid = ^uint32(0)
 244         r.d = ix.slice(ix.postData+offset+3, -1)
 245         r.restrict = restrict
 246 }
 247
 248 func (r *postReader) max() int {
 249         return int(r.count)
 250 }
 251
 252 func (r *postReader) next() bool {
 253         for r.count > 0 {
 254                 r.count--
 255                 delta64, n := binary.Uvarint(r.d)
 256                 delta := uint32(delta64)
 257                 if n <= 0 || delta == 0 {
 258                         corrupt(r.ix.File)
 259                 }
 260                 r.d = r.d[n:]
 261                 r.fileid += delta
 262                 if r.restrict != nil {
 263                         i := 0
 264                         for i < len(r.restrict) && r.restrict[i] < r.fileid {
 265                                 i++
 266                         }
 267                         r.restrict = r.restrict[i:]
 268                         if len(r.restrict) == 0 || r.restrict[0] != r.fileid {
 269                                 continue
 270                         }
 271                 }
 272                 return true
 273         }
 274         // list should end with terminating 0 delta
 275         if r.d != nil && (len(r.d) == 0 || r.d[0] != 0) {
 276                 corrupt(r.ix.File)
 277         }
 278         r.fileid = ^uint32(0)
 279         return false
 280 }
 281
// PostingList returns the result of postingList for trigram with no
// restriction list.
// NOTE(review): presumably the IDs of the files containing trigram — the
// decoding lives in myPostingList, which is defined elsewhere.
func (ix *Index) PostingList(trigram uint32) []uint32 {
	return ix.postingList(trigram, nil)
}
 285
 286 func (ix *Index) postingList(trigram uint32, restrict []uint32) []uint32 {
 287         var r postReader
 288         r.init(ix, trigram, restrict)
 289         return myPostingList(r.d, r.max(), restrict)
 290 }
 291
// PostingAnd returns the result of postingAnd for list and trigram with
// no restriction list.
// NOTE(review): presumably the intersection of list with the posting list
// for trigram — the merge lives in myPostingAnd, defined elsewhere.
func (ix *Index) PostingAnd(list []uint32, trigram uint32) []uint32 {
	return ix.postingAnd(list, trigram, nil)
}
 295
 296 func (ix *Index) postingAnd(list []uint32, trigram uint32, restrict []uint32) []uint32 {
 297         var r postReader
 298         r.init(ix, trigram, restrict)
 299         return myPostingAnd(r.d, r.max(), list, restrict)
 300 }
 301
// PostingOr returns the result of postingOr for list and trigram with no
// restriction list.
// NOTE(review): presumably the union of list with the posting list for
// trigram — the merge lives in myPostingOr, defined elsewhere.
func (ix *Index) PostingOr(list []uint32, trigram uint32) []uint32 {
	return ix.postingOr(list, trigram, nil)
}
 305
 306 func (ix *Index) postingOr(list []uint32, trigram uint32, restrict []uint32) []uint32 {
 307         var r postReader
 308         r.init(ix, trigram, restrict)
 309         return myPostingOr(r.d, r.max(), list, restrict)
 310 }
 311
// PostingQuery evaluates q against the index with no restriction list and
// returns the resulting list of file IDs.
func (ix *Index) PostingQuery(q *Query) []uint32 {
	return ix.postingQuery(q, nil)
}
 315
 316 // Implements sort.Interface
 317 type trigramCnt struct {
 318         trigram uint32
 319         count   int
 320         listcnt int
 321 }
 322
 323 type trigramCnts []trigramCnt
 324
 325 func (t trigramCnts) Len() int {
 326         return len(t)
 327 }
 328
 329 func (t trigramCnts) Less(i, j int) bool {
 330         return t[i].count < t[j].count
 331 }
 332
 333 func (t trigramCnts) Swap(i, j int) {
 334         t[i], t[j] = t[j], t[i]
 335 }
 336
 337 func (ix *Index) postingQuery(q *Query, restrict []uint32) (ret []uint32) {
 338         var list []uint32
 339         switch q.Op {
 340         case QNone:
 341                 // nothing
 342         case QAll:
 343                 if restrict != nil {
 344                         return restrict
 345                 }
 346                 list = make([]uint32, ix.numName)
 347                 for i := range list {
 348                         list[i] = uint32(i)
 349                 }
 350                 return list
 351         case QAnd:
 352                 // "Query planner": we first sort the posting lists by their
 353                 // length (ascending)
 354                 withCount := make(trigramCnts, len(q.Trigram))
 355                 for idx, t := range q.Trigram {
 356                         tri := uint32(t[0])<<16 | uint32(t[1])<<8 | uint32(t[2])
 357                         count, _ := ix.findList(tri)
 358                         withCount[idx] = trigramCnt{tri, count, 0}
 359                 }
 360                 sort.Sort(withCount)
 361
 362                 stoppedAt := 0
 363                 for idx, t := range withCount {
 364                         previous := len(list)
 365                         if list == nil {
 366                                 list = ix.postingList(t.trigram, restrict)
 367                         } else {
 368                                 list = ix.postingAnd(list, t.trigram, restrict)
 369                         }
 370                         if len(list) == 0 {
 371                                 return nil
 372                         }
 373                         withCount[idx].listcnt = len(list)
 374                         if previous > 0 {
 375                                 minIdx := 0.70 * float32(len(withCount))
 376                                 if (previous-len(list)) < 10 && stoppedAt == 0 && float32(idx) > minIdx {
 377                                         stoppedAt = len(list)
 378                                 }
 379                         }
 380                         if previous > 0 && (previous-len(list)) < 10 {
 381                                 //fmt.Printf("difference is %d, break!\n", previous - len(list))
 382                                 break
 383                         }
 384                 }
 385
 386                 for _, sub := range q.Sub {
 387                         if list == nil {
 388                                 list = restrict
 389                         }
 390                         list = ix.postingQuery(sub, list)
 391                         if len(list) == 0 {
 392                                 return nil
 393                         }
 394                 }
 395         case QOr:
 396                 for _, t := range q.Trigram {
 397                         tri := uint32(t[0])<<16 | uint32(t[1])<<8 | uint32(t[2])
 398                         if list == nil {
 399                                 list = ix.postingList(tri, restrict)
 400                         } else {
 401                                 list = ix.postingOr(list, tri, restrict)
 402                         }
 403                 }
 404                 for _, sub := range q.Sub {
 405                         list1 := ix.postingQuery(sub, restrict)
 406                         list = mergeOr(list, list1)
 407                 }
 408         }
 409         return list
 410 }
 411
 412 func mergeOr(l1, l2 []uint32) []uint32 {
 413         var l []uint32
 414         i := 0
 415         j := 0
 416         for i < len(l1) || j < len(l2) {
 417                 switch {
 418                 case j == len(l2) || (i < len(l1) && l1[i] < l2[j]):
 419                         l = append(l, l1[i])
 420                         i++
 421                 case i == len(l1) || (j < len(l2) && l1[i] > l2[j]):
 422                         l = append(l, l2[j])
 423                         j++
 424                 case l1[i] == l2[j]:
 425                         l = append(l, l1[i])
 426                         i++
 427                         j++
 428                 }
 429         }
 430         return l
 431 }
 432
 433 func corrupt(file string) {
 434         log.Fatal("corrupt index: remove " + file)
 435 }
 436
// An mmapData is mmap'ed read-only data from a file.
type mmapData struct {
	f    *os.File // the mapped file; closed by Index.Close
	d    []byte   // the bytes that Index reads as index data
	orig []byte   // the slice handed to syscall.Munmap by Index.Close; NOTE(review): presumably the full mapping, with d a view of it — confirm against mmapFile
}
 443
 444 // mmap maps the given file into memory.
 445 func mmap(file string) mmapData {
 446         f, err := os.Open(file)
 447         if err != nil {
 448                 log.Fatal(err)
 449         }
 450         return mmapFile(f)
 451 }
 452
 453 // File returns the name of the index file to use.
 454 // It is either $CSEARCHINDEX or $HOME/.csearchindex.
 455 func File() string {
 456         f := os.Getenv("CSEARCHINDEX")
 457         if f != "" {
 458                 return f
 459         }
 460         var home string
 461         if runtime.GOOS == "windows" {
 462                 home = os.Getenv("HOMEPATH")
 463         } else {
 464                 home = os.Getenv("HOME")
 465         }
 466         return home + "/.csearchindex"
 467 }