index/write.go

   1 // Copyright 2011 The Go Authors.  All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package index
   6
   7 import (
   8         "errors"
   9         "io"
  10         "io/ioutil"
  11         "log"
  12         "os"
  13         "strings"
  14         "unsafe"
  15
  16         "code.google.com/p/codesearch/sparse"
  17 )
  18
  19 // Index writing.  See read.go for details of on-disk format.
  20 //
  21 // It would suffice to make a single large list of (trigram, file#) pairs
  22 // while processing the files one at a time, sort that list by trigram,
  23 // and then create the posting lists from subsequences of the list.
  24 // However, we do not assume that the entire index fits in memory.
  25 // Instead, we sort and flush the list to a new temporary file each time
  26 // it reaches its maximum in-memory size, and then at the end we
  27 // create the final posting lists by merging the temporary files as we
  28 // read them back in.
  29 //
  30 // It would also be useful to be able to create an index for a subset
  31 // of the files and then merge that index into an existing one.  This would
  32 // allow incremental updating of an existing index when a directory changes.
  33 // But we have not implemented that.
  34
  35 // sortPost sorts the postentry list.
  36 // The list is already sorted by fileid (bottom 32 bits)
  37 // and the top 8 bits are always zero, so there are only
  38 // 24 bits to sort.  Run two rounds of 12-bit radix sort.
  39 const sortK = 12
  40
  41 // An IndexWriter creates an on-disk index corresponding to a set of files.
  42 type IndexWriter struct {
  43         LogSkip bool // log information about skipped files
  44         Verbose bool // log status using package log
  45
  46         trigram *sparse.Set // trigrams for the current file
  47         buf     [8]byte     // scratch buffer
  48
  49         paths []string
  50
  51         nameData   *bufWriter // temp file holding list of names
  52         nameLen    uint32     // number of bytes written to nameData
  53         nameIndex  *bufWriter // temp file holding name index
  54         numName    int        // number of names written
  55         totalBytes int64
  56
  57         post      []postEntry // list of (trigram, file#) pairs
  58         postFile  []*os.File  // flushed post entries
  59         postIndex *bufWriter  // temp file holding posting list index
  60
  61         inbuf []byte     // input buffer
  62         main  *bufWriter // main index file
  63
  64         sortTmp []postEntry
  65         sortN   [1 << sortK]int
  66 }
  67
  68 const npost = 64 << 20 / 8 // 64 MB worth of post entries
  69
  70 // Create returns a new IndexWriter that will write the index to file.
  71 func Create(file string) *IndexWriter {
  72         return &IndexWriter{
  73                 // 1 << 24 = 16777216, how many numbers can be represented by 3 uint8_t’s.
  74                 trigram:   sparse.NewSet(1 << 24),
  75                 nameData:  bufCreate(""),
  76                 nameIndex: bufCreate(""),
  77                 postIndex: bufCreate(""),
  78                 main:      bufCreate(file),
  79                 post:      make([]postEntry, 0, npost),
  80                 inbuf:     make([]byte, 16384),
  81         }
  82 }
  83
  84 // A postEntry is an in-memory (trigram, file#) pair.
  85 type postEntry uint64
  86
  87 func (p postEntry) trigram() uint32 {
  88         return uint32(p >> 32)
  89 }
  90
  91 func (p postEntry) fileid() uint32 {
  92         return uint32(p)
  93 }
  94
  95 func makePostEntry(trigram, fileid uint32) postEntry {
  96         return postEntry(trigram)<<32 | postEntry(fileid)
  97 }
  98
  99 // Tuning constants for detecting text files.
 100 // A file is assumed not to be text files (and thus not indexed)
 101 // if it contains an invalid UTF-8 sequences, if it is longer than maxFileLength
 102 // bytes, if it contains a line longer than maxLineLen bytes,
 103 // or if it contains more than maxTextTrigrams distinct trigrams.
 104 const (
 105         maxFileLen      = 1 << 30
 106         maxLineLen      = 2000
 107         maxTextTrigrams = 20000
 108 )
 109
 110 // AddPaths adds the given paths to the index's list of paths.
 111 func (ix *IndexWriter) AddPaths(paths []string) {
 112         ix.paths = append(ix.paths, paths...)
 113 }
 114
 115 // AddFile adds the file with the given name (opened using os.Open)
 116 // to the index.  It logs errors using package log.
 117 func (ix *IndexWriter) AddFile(name string, indexname string) error {
 118         f, err := os.Open(name)
 119         if err != nil {
 120                 log.Print(err)
 121                 return err
 122         }
 123         defer f.Close()
 124         return ix.Add(indexname, f)
 125 }
 126
 127 // Add adds the file f to the index under the given name.
 128 // It logs errors using package log.
 129 func (ix *IndexWriter) Add(name string, f io.Reader) error {
 130         ix.trigram.Reset()
 131         var (
 132                 c       = byte(0)
 133                 i       = 0
 134                 buf     = ix.inbuf[:0]
 135                 tv      = uint32(0)
 136                 n       = int64(0)
 137                 linelen = 0
 138         )
 139         for {
 140                 tv = (tv << 8) & (1<<24 - 1)
 141                 if i >= len(buf) {
 142                         n, err := f.Read(buf[:cap(buf)])
 143                         if n == 0 {
 144                                 if err != nil {
 145                                         if err == io.EOF {
 146                                                 break
 147                                         }
 148                                         log.Printf("%s: %v\n", name, err)
 149                                         return err
 150                                 }
 151                                 log.Printf("%s: 0-length read\n", name)
 152                                 return errors.New("0-length read")
 153                         }
 154                         buf = buf[:n]
 155                         i = 0
 156                 }
 157                 c = buf[i]
 158                 i++
 159                 tv |= uint32(c)
 160                 if n++; n >= 3 {
 161                         ix.trigram.Add(tv)
 162                 }
 163                 if !validUTF8((tv>>8)&0xFF, tv&0xFF) {
 164                         if ix.LogSkip {
 165                                 log.Printf("%s: invalid UTF-8, ignoring\n", name)
 166                         }
 167                         return errors.New("invalid UTF-8, ignoring")
 168                 }
 169                 if n > maxFileLen {
 170                         if ix.LogSkip {
 171                                 log.Printf("%s: too long, ignoring\n", name)
 172                         }
 173                         return errors.New("too long, ignoring")
 174                 }
 175                 if linelen++; linelen > maxLineLen {
 176                         if ix.LogSkip {
 177                                 log.Printf("%s: very long lines, ignoring\n", name)
 178                         }
 179                         return errors.New("very long lines, ignoring")
 180                 }
 181                 if c == '\n' {
 182                         linelen = 0
 183                 }
 184         }
 185         if ix.trigram.Len() > maxTextTrigrams {
 186                 if ix.LogSkip {
 187                         log.Printf("%s: too many trigrams, probably not text, ignoring\n", name)
 188                 }
 189                 return errors.New("too many trigrams, probably not text, ignoring")
 190         }
 191         ix.totalBytes += n
 192
 193         if ix.Verbose {
 194                 log.Printf("%d %d %s\n", n, ix.trigram.Len(), name)
 195         }
 196
 197         fileid := ix.addName(name)
 198         for _, trigram := range ix.trigram.Dense() {
 199                 if len(ix.post) >= cap(ix.post) {
 200                         ix.flushPost()
 201                 }
 202                 ix.post = append(ix.post, makePostEntry(trigram, fileid))
 203         }
 204
 205         return nil
 206 }
 207
 208 // Flush flushes the index entry to the target file.
 209 func (ix *IndexWriter) Flush() {
 210         ix.addName("")
 211
 212         var off [5]uint32
 213         ix.main.writeString(magic)
 214         off[0] = ix.main.offset()
 215         for _, p := range ix.paths {
 216                 ix.main.writeString(p)
 217                 ix.main.writeString("\x00")
 218         }
 219         ix.main.writeString("\x00")
 220         off[1] = ix.main.offset()
 221         copyFile(ix.main, ix.nameData)
 222         off[2] = ix.main.offset()
 223         ix.mergePost(ix.main)
 224         off[3] = ix.main.offset()
 225         copyFile(ix.main, ix.nameIndex)
 226         off[4] = ix.main.offset()
 227         copyFile(ix.main, ix.postIndex)
 228         for _, v := range off {
 229                 ix.main.writeUint32(v)
 230         }
 231         ix.main.writeString(trailerMagic)
 232
 233         os.Remove(ix.nameData.name)
 234         for _, f := range ix.postFile {
 235                 os.Remove(f.Name())
 236         }
 237         os.Remove(ix.nameIndex.name)
 238         os.Remove(ix.postIndex.name)
 239
 240         log.Printf("%d data bytes, %d index bytes", ix.totalBytes, ix.main.offset())
 241
 242         ix.main.flush()
 243 }
 244
 245 func copyFile(dst, src *bufWriter) {
 246         dst.flush()
 247         _, err := io.Copy(dst.file, src.finish())
 248         if err != nil {
 249                 log.Fatalf("copying %s to %s: %v", src.name, dst.name, err)
 250         }
 251 }
 252
 253 // addName adds the file with the given name to the index.
 254 // It returns the assigned file ID number.
 255 func (ix *IndexWriter) addName(name string) uint32 {
 256         if strings.Contains(name, "\x00") {
 257                 log.Fatalf("%q: file has NUL byte in name", name)
 258         }
 259
 260         ix.nameIndex.writeUint32(ix.nameData.offset())
 261         ix.nameData.writeString(name)
 262         ix.nameData.writeByte(0)
 263         id := ix.numName
 264         ix.numName++
 265         return uint32(id)
 266 }
 267
 268 // flushPost writes ix.post to a new temporary file and
 269 // clears the slice.
 270 func (ix *IndexWriter) flushPost() {
 271         w, err := ioutil.TempFile("", "csearch-index")
 272         if err != nil {
 273                 log.Fatal(err)
 274         }
 275         if ix.Verbose {
 276                 log.Printf("flush %d entries to %s", len(ix.post), w.Name())
 277         }
 278         ix.sortPost(ix.post)
 279
 280         // Write the raw ix.post array to disk as is.
 281         // This process is the one reading it back in, so byte order is not a concern.
 282         data := (*[npost * 8]byte)(unsafe.Pointer(&ix.post[0]))[:len(ix.post)*8]
 283         if n, err := w.Write(data); err != nil || n < len(data) {
 284                 if err != nil {
 285                         log.Fatal(err)
 286                 }
 287                 log.Fatalf("short write writing %s", w.Name())
 288         }
 289
 290         ix.post = ix.post[:0]
 291         w.Seek(0, 0)
 292         ix.postFile = append(ix.postFile, w)
 293 }
 294
 295 // mergePost reads the flushed index entries and merges them
 296 // into posting lists, writing the resulting lists to out.
 297 func (ix *IndexWriter) mergePost(out *bufWriter) {
 298         var h postHeap
 299
 300         log.Printf("merge %d files + mem", len(ix.postFile))
 301         for _, f := range ix.postFile {
 302                 h.addFile(f)
 303         }
 304         ix.sortPost(ix.post)
 305         h.addMem(ix.post)
 306
 307         npost := 0
 308         e := h.next()
 309         offset0 := out.offset()
 310         for {
 311                 npost++
 312                 offset := out.offset() - offset0
 313                 trigram := e.trigram()
 314                 ix.buf[0] = byte(trigram >> 16)
 315                 ix.buf[1] = byte(trigram >> 8)
 316                 ix.buf[2] = byte(trigram)
 317
 318                 // posting list
 319                 fileid := ^uint32(0)
 320                 nfile := uint32(0)
 321                 out.write(ix.buf[:3])
 322                 for ; e.trigram() == trigram && trigram != 1<<24-1; e = h.next() {
 323                         out.writeUvarint(e.fileid() - fileid)
 324                         fileid = e.fileid()
 325                         nfile++
 326                 }
 327                 out.writeUvarint(0)
 328
 329                 // index entry
 330                 ix.postIndex.write(ix.buf[:3])
 331                 ix.postIndex.writeUint32(nfile)
 332                 ix.postIndex.writeUint32(offset)
 333
 334                 if trigram == 1<<24-1 {
 335                         break
 336                 }
 337         }
 338 }
 339
 340 // A postChunk represents a chunk of post entries flushed to disk or
 341 // still in memory.
 342 type postChunk struct {
 343         e postEntry   // next entry
 344         m []postEntry // remaining entries after e
 345 }
 346
 347 const postBuf = 4096
 348
 349 // A postHeap is a heap (priority queue) of postChunks.
 350 type postHeap struct {
 351         ch []*postChunk
 352 }
 353
 354 func (h *postHeap) addFile(f *os.File) {
 355         data := mmapFile(f).d
 356         m := (*[npost]postEntry)(unsafe.Pointer(&data[0]))[:len(data)/8]
 357         h.addMem(m)
 358 }
 359
 360 func (h *postHeap) addMem(x []postEntry) {
 361         h.add(&postChunk{m: x})
 362 }
 363
 364 // step reads the next entry from ch and saves it in ch.e.
 365 // It returns false if ch is over.
 366 func (h *postHeap) step(ch *postChunk) bool {
 367         old := ch.e
 368         m := ch.m
 369         if len(m) == 0 {
 370                 return false
 371         }
 372         ch.e = postEntry(m[0])
 373         m = m[1:]
 374         ch.m = m
 375         if old >= ch.e {
 376                 panic("bad sort")
 377         }
 378         return true
 379 }
 380
 381 // add adds the chunk to the postHeap.
 382 // All adds must be called before the first call to next.
 383 func (h *postHeap) add(ch *postChunk) {
 384         if len(ch.m) > 0 {
 385                 ch.e = ch.m[0]
 386                 ch.m = ch.m[1:]
 387                 h.push(ch)
 388         }
 389 }
 390
 391 // empty reports whether the postHeap is empty.
 392 func (h *postHeap) empty() bool {
 393         return len(h.ch) == 0
 394 }
 395
 396 // next returns the next entry from the postHeap.
 397 // It returns a postEntry with trigram == 1<<24 - 1 if h is empty.
 398 func (h *postHeap) next() postEntry {
 399         if len(h.ch) == 0 {
 400                 return makePostEntry(1<<24-1, 0)
 401         }
 402         ch := h.ch[0]
 403         e := ch.e
 404         m := ch.m
 405         if len(m) == 0 {
 406                 h.pop()
 407         } else {
 408                 ch.e = m[0]
 409                 ch.m = m[1:]
 410                 h.siftDown(0)
 411         }
 412         return e
 413 }
 414
 415 func (h *postHeap) pop() *postChunk {
 416         ch := h.ch[0]
 417         n := len(h.ch) - 1
 418         h.ch[0] = h.ch[n]
 419         h.ch = h.ch[:n]
 420         if n > 1 {
 421                 h.siftDown(0)
 422         }
 423         return ch
 424 }
 425
 426 func (h *postHeap) push(ch *postChunk) {
 427         n := len(h.ch)
 428         h.ch = append(h.ch, ch)
 429         if len(h.ch) >= 2 {
 430                 h.siftUp(n)
 431         }
 432 }
 433
 434 func (h *postHeap) siftDown(i int) {
 435         ch := h.ch
 436         for {
 437                 j1 := 2*i + 1
 438                 if j1 >= len(ch) {
 439                         break
 440                 }
 441                 j := j1
 442                 if j2 := j1 + 1; j2 < len(ch) && ch[j1].e >= ch[j2].e {
 443                         j = j2
 444                 }
 445                 if ch[i].e < ch[j].e {
 446                         break
 447                 }
 448                 ch[i], ch[j] = ch[j], ch[i]
 449                 i = j
 450         }
 451 }
 452
 453 func (h *postHeap) siftUp(j int) {
 454         ch := h.ch
 455         for {
 456                 i := (j - 1) / 2
 457                 if i == j || ch[i].e < ch[j].e {
 458                         break
 459                 }
 460                 ch[i], ch[j] = ch[j], ch[i]
 461         }
 462 }
 463
 464 // A bufWriter is a convenience wrapper: a closeable bufio.Writer.
 465 type bufWriter struct {
 466         name string
 467         file *os.File
 468         buf  []byte
 469         tmp  [8]byte
 470 }
 471
 472 // bufCreate creates a new file with the given name and returns a
 473 // corresponding bufWriter.  If name is empty, bufCreate uses a
 474 // temporary file.
 475 func bufCreate(name string) *bufWriter {
 476         var (
 477                 f   *os.File
 478                 err error
 479         )
 480         if name != "" {
 481                 f, err = os.Create(name)
 482         } else {
 483                 f, err = ioutil.TempFile("", "csearch")
 484         }
 485         if err != nil {
 486                 log.Fatal(err)
 487         }
 488         return &bufWriter{
 489                 name: f.Name(),
 490                 buf:  make([]byte, 0, 256<<10),
 491                 file: f,
 492         }
 493 }
 494
 495 func (b *bufWriter) write(x []byte) {
 496         n := cap(b.buf) - len(b.buf)
 497         if len(x) > n {
 498                 b.flush()
 499                 if len(x) >= cap(b.buf) {
 500                         if _, err := b.file.Write(x); err != nil {
 501                                 log.Fatalf("writing %s: %v", b.name, err)
 502                         }
 503                         return
 504                 }
 505         }
 506         b.buf = append(b.buf, x...)
 507 }
 508
 509 func (b *bufWriter) writeByte(x byte) {
 510         if len(b.buf) >= cap(b.buf) {
 511                 b.flush()
 512         }
 513         b.buf = append(b.buf, x)
 514 }
 515
 516 func (b *bufWriter) writeString(s string) {
 517         n := cap(b.buf) - len(b.buf)
 518         if len(s) > n {
 519                 b.flush()
 520                 if len(s) >= cap(b.buf) {
 521                         if _, err := b.file.WriteString(s); err != nil {
 522                                 log.Fatalf("writing %s: %v", b.name, err)
 523                         }
 524                         return
 525                 }
 526         }
 527         b.buf = append(b.buf, s...)
 528 }
 529
 530 // offset returns the current write offset.
 531 func (b *bufWriter) offset() uint32 {
 532         off, _ := b.file.Seek(0, 1)
 533         off += int64(len(b.buf))
 534         if int64(uint32(off)) != off {
 535                 log.Fatalf("index is larger than 4GB")
 536         }
 537         return uint32(off)
 538 }
 539
 540 func (b *bufWriter) flush() {
 541         if len(b.buf) == 0 {
 542                 return
 543         }
 544         _, err := b.file.Write(b.buf)
 545         if err != nil {
 546                 log.Fatalf("writing %s: %v", b.name, err)
 547         }
 548         b.buf = b.buf[:0]
 549 }
 550
 551 // finish flushes the file to disk and returns an open file ready for reading.
 552 func (b *bufWriter) finish() *os.File {
 553         b.flush()
 554         f := b.file
 555         f.Seek(0, 0)
 556         return f
 557 }
 558
 559 func (b *bufWriter) writeTrigram(t uint32) {
 560         if cap(b.buf)-len(b.buf) < 3 {
 561                 b.flush()
 562         }
 563         b.buf = append(b.buf, byte(t>>16), byte(t>>8), byte(t))
 564 }
 565
 566 func (b *bufWriter) writeUint32(x uint32) {
 567         if cap(b.buf)-len(b.buf) < 4 {
 568                 b.flush()
 569         }
 570         b.buf = append(b.buf, byte(x>>24), byte(x>>16), byte(x>>8), byte(x))
 571 }
 572
 573 func (b *bufWriter) writeUvarint(x uint32) {
 574         if cap(b.buf)-len(b.buf) < 5 {
 575                 b.flush()
 576         }
 577         switch {
 578         case x < 1<<7:
 579                 b.buf = append(b.buf, byte(x))
 580         case x < 1<<14:
 581                 b.buf = append(b.buf, byte(x|0x80), byte(x>>7))
 582         case x < 1<<21:
 583                 b.buf = append(b.buf, byte(x|0x80), byte(x>>7|0x80), byte(x>>14))
 584         case x < 1<<28:
 585                 b.buf = append(b.buf, byte(x|0x80), byte(x>>7|0x80), byte(x>>14|0x80), byte(x>>21))
 586         default:
 587                 b.buf = append(b.buf, byte(x|0x80), byte(x>>7|0x80), byte(x>>14|0x80), byte(x>>21|0x80), byte(x>>28))
 588         }
 589 }
 590
 591 // validUTF8 reports whether the byte pair can appear in a
 592 // valid sequence of UTF-8-encoded code points.
 593 func validUTF8(c1, c2 uint32) bool {
 594         switch {
 595         case c1 < 0x80:
 596                 // 1-byte, must be followed by 1-byte or first of multi-byte
 597                 return c2 < 0x80 || 0xc0 <= c2 && c2 < 0xf8
 598         case c1 < 0xc0:
 599                 // continuation byte, can be followed by nearly anything
 600                 return c2 < 0xf8
 601         case c1 < 0xf8:
 602                 // first of multi-byte, must be followed by continuation byte
 603                 return 0x80 <= c2 && c2 < 0xc0
 604         }
 605         return false
 606 }
 607
 608 func (ix *IndexWriter) sortPost(post []postEntry) {
 609         if len(post) > len(ix.sortTmp) {
 610                 ix.sortTmp = make([]postEntry, len(post))
 611         }
 612         tmp := ix.sortTmp[:len(post)]
 613
 614         const k = sortK
 615         for i := range ix.sortN {
 616                 ix.sortN[i] = 0
 617         }
 618         for _, p := range post {
 619                 r := uintptr(p>>32) & (1<<k - 1)
 620                 ix.sortN[r]++
 621         }
 622         tot := 0
 623         for i, count := range ix.sortN {
 624                 ix.sortN[i] = tot
 625                 tot += count
 626         }
 627         for _, p := range post {
 628                 r := uintptr(p>>32) & (1<<k - 1)
 629                 o := ix.sortN[r]
 630                 ix.sortN[r]++
 631                 tmp[o] = p
 632         }
 633         tmp, post = post, tmp
 634
 635         for i := range ix.sortN {
 636                 ix.sortN[i] = 0
 637         }
 638         for _, p := range post {
 639                 r := uintptr(p>>(32+k)) & (1<<k - 1)
 640                 ix.sortN[r]++
 641         }
 642         tot = 0
 643         for i, count := range ix.sortN {
 644                 ix.sortN[i] = tot
 645                 tot += count
 646         }
 647         for _, p := range post {
 648                 r := uintptr(p>>(32+k)) & (1<<k - 1)
 649                 o := ix.sortN[r]
 650                 ix.sortN[r]++
 651                 tmp[o] = p
 652         }
 653 }