1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
16 "github.com/google/codesearch/sparse"
19 // Index writing. See read.go for details of on-disk format.
21 // It would suffice to make a single large list of (trigram, file#) pairs
22 // while processing the files one at a time, sort that list by trigram,
23 // and then create the posting lists from subsequences of the list.
24 // However, we do not assume that the entire index fits in memory.
25 // Instead, we sort and flush the list to a new temporary file each time
26 // it reaches its maximum in-memory size, and then at the end we
27 // create the final posting lists by merging the temporary files as we
30 // It would also be useful to be able to create an index for a subset
31 // of the files and then merge that index into an existing one. This would
32 // allow incremental updating of an existing index when a directory changes.
33 // But we have not implemented that.
35 // sortPost sorts the postentry list.
36 // The list is already sorted by fileid (bottom 32 bits)
37 // and the top 8 bits are always zero, so there are only
38 // 24 bits to sort. Run two rounds of 12-bit radix sort.
41 // An IndexWriter creates an on-disk index corresponding to a set of files.
42 type IndexWriter
struct {
43 LogSkip
bool // log information about skipped files
44 Verbose
bool // log status using package log
46 trigram
*sparse
.Set
// trigrams for the current file
47 buf
[8]byte // scratch buffer
51 nameData
*bufWriter
// temp file holding list of names
52 nameLen
uint32 // number of bytes written to nameData
53 nameIndex
*bufWriter
// temp file holding name index
54 numName
int // number of names written
57 post
[]postEntry
// list of (trigram, file#) pairs
58 postFile
[]*os
.File
// flushed post entries
59 postIndex
*bufWriter
// temp file holding posting list index
61 inbuf
[]byte // input buffer
62 main
*bufWriter
// main index file
68 const npost
= 64 << 20 / 8 // 64 MB worth of post entries
70 // Create returns a new IndexWriter that will write the index to file.
71 func Create(file
string) *IndexWriter
{
73 // 1 << 24 = 16777216, how many numbers can be represented by 3 uint8_t’s.
74 trigram
: sparse
.NewSet(1 << 24),
75 nameData
: bufCreate(""),
76 nameIndex
: bufCreate(""),
77 postIndex
: bufCreate(""),
78 main
: bufCreate(file
),
79 post
: make([]postEntry
, 0, npost
),
80 inbuf
: make([]byte, 16384),
84 // A postEntry is an in-memory (trigram, file#) pair.
87 func (p postEntry
) trigram() uint32 {
88 return uint32(p
>> 32)
91 func (p postEntry
) fileid() uint32 {
95 func makePostEntry(trigram
, fileid
uint32) postEntry
{
96 return postEntry(trigram
)<<32 |
postEntry(fileid
)
99 // Tuning constants for detecting text files.
100 // A file is assumed not to be text files (and thus not indexed)
101 // if it contains an invalid UTF-8 sequences, if it is longer than maxFileLength
102 // bytes, if it contains a line longer than maxLineLen bytes,
103 // or if it contains more than maxTextTrigrams distinct trigrams.
107 maxTextTrigrams
= 20000
110 // AddPaths adds the given paths to the index's list of paths.
111 func (ix
*IndexWriter
) AddPaths(paths
[]string) {
112 ix
.paths
= append(ix
.paths
, paths
...)
115 // AddFile adds the file with the given name (opened using os.Open)
116 // to the index. It logs errors using package log.
117 func (ix
*IndexWriter
) AddFile(name
string, indexname
string) error
{
118 f
, err
:= os
.Open(name
)
124 return ix
.Add(indexname
, f
)
127 // Add adds the file f to the index under the given name.
128 // It logs errors using package log.
129 func (ix
*IndexWriter
) Add(name
string, f io
.Reader
) error
{
140 tv
= (tv
<< 8) & (1<<24 - 1)
142 n
, err
:= f
.Read(buf
[:cap(buf
)])
148 log
.Printf("%s: %v\n", name
, err
)
151 log
.Printf("%s: 0-length read\n", name
)
152 return errors
.New("0-length read")
163 if !validUTF8((tv
>>8)&0xFF, tv
&0xFF) {
165 log
.Printf("%s: invalid UTF-8, ignoring\n", name
)
167 return errors
.New("invalid UTF-8, ignoring")
171 log
.Printf("%s: too long, ignoring\n", name
)
173 return errors
.New("too long, ignoring")
175 if linelen
++; linelen
> maxLineLen
{
177 log
.Printf("%s: very long lines, ignoring\n", name
)
179 return errors
.New("very long lines, ignoring")
185 if ix
.trigram
.Len() > maxTextTrigrams
{
187 log
.Printf("%s: too many trigrams, probably not text, ignoring\n", name
)
189 return errors
.New("too many trigrams, probably not text, ignoring")
194 log
.Printf("%d %d %s\n", n
, ix
.trigram
.Len(), name
)
197 fileid
:= ix
.addName(name
)
198 for _
, trigram
:= range ix
.trigram
.Dense() {
199 if len(ix
.post
) >= cap(ix
.post
) {
202 ix
.post
= append(ix
.post
, makePostEntry(trigram
, fileid
))
208 // Flush flushes the index entry to the target file.
209 func (ix
*IndexWriter
) Flush() {
213 ix
.main
.writeString(magic
)
214 off
[0] = ix
.main
.offset()
215 for _
, p
:= range ix
.paths
{
216 ix
.main
.writeString(p
)
217 ix
.main
.writeString("\x00")
219 ix
.main
.writeString("\x00")
220 off
[1] = ix
.main
.offset()
221 copyFile(ix
.main
, ix
.nameData
)
222 off
[2] = ix
.main
.offset()
223 ix
.mergePost(ix
.main
)
224 off
[3] = ix
.main
.offset()
225 copyFile(ix
.main
, ix
.nameIndex
)
226 off
[4] = ix
.main
.offset()
227 copyFile(ix
.main
, ix
.postIndex
)
228 for _
, v
:= range off
{
229 ix
.main
.writeUint32(v
)
231 ix
.main
.writeString(trailerMagic
)
233 os
.Remove(ix
.nameData
.name
)
234 for _
, f
:= range ix
.postFile
{
237 os
.Remove(ix
.nameIndex
.name
)
238 os
.Remove(ix
.postIndex
.name
)
240 log
.Printf("%d data bytes, %d index bytes", ix
.totalBytes
, ix
.main
.offset())
245 func copyFile(dst
, src
*bufWriter
) {
247 _
, err
:= io
.Copy(dst
.file
, src
.finish())
249 log
.Fatalf("copying %s to %s: %v", src
.name
, dst
.name
, err
)
253 // addName adds the file with the given name to the index.
254 // It returns the assigned file ID number.
255 func (ix
*IndexWriter
) addName(name
string) uint32 {
256 if strings
.Contains(name
, "\x00") {
257 log
.Fatalf("%q: file has NUL byte in name", name
)
260 ix
.nameIndex
.writeUint32(ix
.nameData
.offset())
261 ix
.nameData
.writeString(name
)
262 ix
.nameData
.writeByte(0)
268 // flushPost writes ix.post to a new temporary file and
270 func (ix
*IndexWriter
) flushPost() {
271 w
, err
:= ioutil
.TempFile("", "csearch-index")
276 log
.Printf("flush %d entries to %s", len(ix
.post
), w
.Name())
280 // Write the raw ix.post array to disk as is.
281 // This process is the one reading it back in, so byte order is not a concern.
282 data
:= (*[npost
* 8]byte)(unsafe
.Pointer(&ix
.post
[0]))[:len(ix
.post
)*8]
283 if n
, err
:= w
.Write(data
); err
!= nil || n
< len(data
) {
287 log
.Fatalf("short write writing %s", w
.Name())
290 ix
.post
= ix
.post
[:0]
292 ix
.postFile
= append(ix
.postFile
, w
)
295 // mergePost reads the flushed index entries and merges them
296 // into posting lists, writing the resulting lists to out.
297 func (ix
*IndexWriter
) mergePost(out
*bufWriter
) {
300 log
.Printf("merge %d files + mem", len(ix
.postFile
))
301 for _
, f
:= range ix
.postFile
{
309 offset0
:= out
.offset()
312 offset
:= out
.offset() - offset0
313 trigram
:= e
.trigram()
314 ix
.buf
[0] = byte(trigram
>> 16)
315 ix
.buf
[1] = byte(trigram
>> 8)
316 ix
.buf
[2] = byte(trigram
)
321 out
.write(ix
.buf
[:3])
322 for ; e
.trigram() == trigram
&& trigram
!= 1<<24-1; e
= h
.next() {
323 out
.writeUvarint(e
.fileid() - fileid
)
330 ix
.postIndex
.write(ix
.buf
[:3])
331 ix
.postIndex
.writeUint32(nfile
)
332 ix
.postIndex
.writeUint32(offset
)
334 if trigram
== 1<<24-1 {
340 // A postChunk represents a chunk of post entries flushed to disk or
342 type postChunk
struct {
343 e postEntry
// next entry
344 m
[]postEntry
// remaining entries after e
349 // A postHeap is a heap (priority queue) of postChunks.
350 type postHeap
struct {
354 func (h
*postHeap
) addFile(f
*os
.File
) {
355 data
:= mmapFile(f
).d
356 m
:= (*[npost
]postEntry
)(unsafe
.Pointer(&data
[0]))[:len(data
)/8]
360 func (h
*postHeap
) addMem(x
[]postEntry
) {
361 h
.add(&postChunk
{m
: x
})
364 // step reads the next entry from ch and saves it in ch.e.
365 // It returns false if ch is over.
366 func (h
*postHeap
) step(ch
*postChunk
) bool {
372 ch
.e
= postEntry(m
[0])
381 // add adds the chunk to the postHeap.
382 // All adds must be called before the first call to next.
383 func (h
*postHeap
) add(ch
*postChunk
) {
391 // empty reports whether the postHeap is empty.
392 func (h
*postHeap
) empty() bool {
393 return len(h
.ch
) == 0
396 // next returns the next entry from the postHeap.
397 // It returns a postEntry with trigram == 1<<24 - 1 if h is empty.
398 func (h
*postHeap
) next() postEntry
{
400 return makePostEntry(1<<24-1, 0)
415 func (h
*postHeap
) pop() *postChunk
{
426 func (h
*postHeap
) push(ch
*postChunk
) {
428 h
.ch
= append(h
.ch
, ch
)
434 func (h
*postHeap
) siftDown(i
int) {
442 if j2
:= j1
+ 1; j2
< len(ch
) && ch
[j1
].e
>= ch
[j2
].e
{
445 if ch
[i
].e
< ch
[j
].e
{
448 ch
[i
], ch
[j
] = ch
[j
], ch
[i
]
453 func (h
*postHeap
) siftUp(j
int) {
457 if i
== j || ch
[i
].e
< ch
[j
].e
{
460 ch
[i
], ch
[j
] = ch
[j
], ch
[i
]
464 // A bufWriter is a convenience wrapper: a closeable bufio.Writer.
465 type bufWriter
struct {
472 // bufCreate creates a new file with the given name and returns a
473 // corresponding bufWriter. If name is empty, bufCreate uses a
475 func bufCreate(name
string) *bufWriter
{
481 f
, err
= os
.Create(name
)
483 f
, err
= ioutil
.TempFile("", "csearch")
490 buf
: make([]byte, 0, 256<<10),
495 func (b
*bufWriter
) write(x
[]byte) {
496 n
:= cap(b
.buf
) - len(b
.buf
)
499 if len(x
) >= cap(b
.buf
) {
500 if _
, err
:= b
.file
.Write(x
); err
!= nil {
501 log
.Fatalf("writing %s: %v", b
.name
, err
)
506 b
.buf
= append(b
.buf
, x
...)
509 func (b
*bufWriter
) writeByte(x
byte) {
510 if len(b
.buf
) >= cap(b
.buf
) {
513 b
.buf
= append(b
.buf
, x
)
516 func (b
*bufWriter
) writeString(s
string) {
517 n
:= cap(b
.buf
) - len(b
.buf
)
520 if len(s
) >= cap(b
.buf
) {
521 if _
, err
:= b
.file
.WriteString(s
); err
!= nil {
522 log
.Fatalf("writing %s: %v", b
.name
, err
)
527 b
.buf
= append(b
.buf
, s
...)
530 // offset returns the current write offset.
531 func (b
*bufWriter
) offset() uint32 {
532 off
, _
:= b
.file
.Seek(0, 1)
533 off
+= int64(len(b
.buf
))
534 if int64(uint32(off
)) != off
{
535 log
.Fatalf("index is larger than 4GB")
540 func (b
*bufWriter
) flush() {
544 _
, err
:= b
.file
.Write(b
.buf
)
546 log
.Fatalf("writing %s: %v", b
.name
, err
)
551 // finish flushes the file to disk and returns an open file ready for reading.
552 func (b
*bufWriter
) finish() *os
.File
{
559 func (b
*bufWriter
) writeTrigram(t
uint32) {
560 if cap(b
.buf
)-len(b
.buf
) < 3 {
563 b
.buf
= append(b
.buf
, byte(t
>>16), byte(t
>>8), byte(t
))
566 func (b
*bufWriter
) writeUint32(x
uint32) {
567 if cap(b
.buf
)-len(b
.buf
) < 4 {
570 b
.buf
= append(b
.buf
, byte(x
>>24), byte(x
>>16), byte(x
>>8), byte(x
))
573 func (b
*bufWriter
) writeUvarint(x
uint32) {
574 if cap(b
.buf
)-len(b
.buf
) < 5 {
579 b
.buf
= append(b
.buf
, byte(x
))
581 b
.buf
= append(b
.buf
, byte(x|
0x80), byte(x
>>7))
583 b
.buf
= append(b
.buf
, byte(x|
0x80), byte(x
>>7|
0x80), byte(x
>>14))
585 b
.buf
= append(b
.buf
, byte(x|
0x80), byte(x
>>7|
0x80), byte(x
>>14|
0x80), byte(x
>>21))
587 b
.buf
= append(b
.buf
, byte(x|
0x80), byte(x
>>7|
0x80), byte(x
>>14|
0x80), byte(x
>>21|
0x80), byte(x
>>28))
591 // validUTF8 reports whether the byte pair can appear in a
592 // valid sequence of UTF-8-encoded code points.
593 func validUTF8(c1
, c2
uint32) bool {
596 // 1-byte, must be followed by 1-byte or first of multi-byte
597 return c2
< 0x80 ||
0xc0 <= c2
&& c2
< 0xf8
599 // continuation byte, can be followed by nearly anything
602 // first of multi-byte, must be followed by continuation byte
603 return 0x80 <= c2
&& c2
< 0xc0
608 func (ix
*IndexWriter
) sortPost(post
[]postEntry
) {
609 if len(post
) > len(ix
.sortTmp
) {
610 ix
.sortTmp
= make([]postEntry
, len(post
))
612 tmp
:= ix
.sortTmp
[:len(post
)]
615 for i
:= range ix
.sortN
{
618 for _
, p
:= range post
{
619 r
:= uintptr(p
>>32) & (1<<k
- 1)
623 for i
, count
:= range ix
.sortN
{
627 for _
, p
:= range post
{
628 r
:= uintptr(p
>>32) & (1<<k
- 1)
633 tmp
, post
= post
, tmp
635 for i
:= range ix
.sortN
{
638 for _
, p
:= range post
{
639 r
:= uintptr(p
>>(32+k
)) & (1<<k
- 1)
643 for i
, count
:= range ix
.sortN
{
647 for _
, p
:= range post
{
648 r
:= uintptr(p
>>(32+k
)) & (1<<k
- 1)