1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
9 // To merge two indexes A and B (newer) into a combined index C:
11 // Load the path list from B and determine for each path the docid ranges
12 // that it will replace in A.
14 // Read A's and B's name lists together, merging them into C's name list.
15 // Discard the identified ranges from A during the merge. Also during the merge,
16 // record the mapping from A's docids to C's docids, and also the mapping from
17 // B's docids to C's docids. Both mappings can be summarized in a table like
21 // 25-34 maps to 40-49
23 // The number of ranges will be at most the combined number of paths.
24 // Also during the merge, write the name index to a temporary file as usual.
26 // Now merge the posting lists (this is why they begin with the trigram).
27 // During the merge, translate the docid numbers to the new C docid space.
28 // Also during the merge, write the posting list index to a temporary file as usual.
30 // Copy the name index and posting list index into C's index and write the trailer.
31 // Rename C's index onto the new index.
39 // An idrange records that the half-open interval [lo, hi) maps to [new, new+hi-lo).
44 type postIndex
struct {
50 // Merge creates a new index in the file dst that corresponds to merging
51 // the two indices src1 and src2. If both src1 and src2 claim responsibility
52 // for a path, src2 is assumed to be newer and is given preference.
53 func Merge(dst
, src1
, src2
string) {
60 var i1
, i2
, new uint32
61 var map1
, map2
[]idrange
62 for _
, path
:= range paths2
{
63 // Determine range shadowed by this path.
65 for i1
< uint32(ix1
.numName
) && ix1
.Name(i1
) < path
{
69 limit
:= path
[:len(path
)-1] + string(path
[len(path
)-1]+1)
70 for i1
< uint32(ix1
.numName
) && ix1
.Name(i1
) < limit
{
75 // Record range before the shadow.
77 map1
= append(map1
, idrange
{old
, lo
, new})
81 // Determine range defined by this path.
82 // Because we are iterating over the ix2 paths,
83 // there can't be gaps, so it must start at i2.
84 if i2
< uint32(ix2
.numName
) && ix2
.Name(i2
) < path
{
85 panic("merge: inconsistent index")
88 for i2
< uint32(ix2
.numName
) && ix2
.Name(i2
) < limit
{
93 map2
= append(map2
, idrange
{lo
, hi
, new})
98 if i1
< uint32(ix1
.numName
) {
99 map1
= append(map1
, idrange
{i1
, uint32(ix1
.numName
), new})
100 new += uint32(ix1
.numName
) - i1
102 if i2
< uint32(ix2
.numName
) {
103 panic("merge: inconsistent index")
107 ix3
:= bufCreate(dst
)
108 ix3
.writeString(magic
)
110 // Merged list of paths.
111 pathData
:= ix3
.offset()
114 last
:= "\x00" // not a prefix of anything
115 for mi1
< len(paths1
) || mi2
< len(paths2
) {
117 if mi2
>= len(paths2
) || mi1
< len(paths1
) && paths1
[mi1
] <= paths2
[mi2
] {
124 if strings
.HasPrefix(p
, last
) {
129 ix3
.writeString("\x00")
131 ix3
.writeString("\x00")
133 // Merged list of names.
134 nameData
:= ix3
.offset()
135 nameIndexFile
:= bufCreate("")
140 if mi1
< len(map1
) && map1
[mi1
].new == new {
141 for i
:= map1
[mi1
].lo
; i
< map1
[mi1
].hi
; i
++ {
143 nameIndexFile
.writeUint32(ix3
.offset() - nameData
)
144 ix3
.writeString(name
)
145 ix3
.writeString("\x00")
149 } else if mi2
< len(map2
) && map2
[mi2
].new == new {
150 for i
:= map2
[mi2
].lo
; i
< map2
[mi2
].hi
; i
++ {
152 nameIndexFile
.writeUint32(ix3
.offset() - nameData
)
153 ix3
.writeString(name
)
154 ix3
.writeString("\x00")
159 panic("merge: inconsistent index")
162 if new*4 != nameIndexFile
.offset() {
163 panic("merge: inconsistent index")
165 nameIndexFile
.writeUint32(ix3
.offset())
167 // Merged list of posting lists.
168 postData
:= ix3
.offset()
176 if r1
.trigram
< r2
.trigram
{
177 w
.trigram(r1
.trigram
)
183 } else if r2
.trigram
< r1
.trigram
{
184 w
.trigram(r2
.trigram
)
191 if r1
.trigram
== ^uint32(0) {
194 w
.trigram(r1
.trigram
)
197 for r1
.fileid
< ^uint32(0) || r2
.fileid
< ^uint32(0) {
198 if r1
.fileid
< r2
.fileid
{
201 } else if r2
.fileid
< r1
.fileid
{
205 panic("merge: inconsistent index")
215 nameIndex
:= ix3
.offset()
216 copyFile(ix3
, nameIndexFile
)
218 // Posting list index
219 postIndex
:= ix3
.offset()
220 copyFile(ix3
, w
.postIndexFile
)
222 ix3
.writeUint32(pathData
)
223 ix3
.writeUint32(nameData
)
224 ix3
.writeUint32(postData
)
225 ix3
.writeUint32(nameIndex
)
226 ix3
.writeUint32(postIndex
)
227 ix3
.writeString(trailerMagic
)
230 os
.Remove(nameIndexFile
.name
)
231 os
.Remove(w
.postIndexFile
.name
)
234 // src1 and src2 must not cover the same files. dst will contain an index that
235 // contains src1 and src2.
236 func Concat(dst
, src1
, src2
string) {
240 ix3
:= bufCreate(dst
)
241 ix3
.writeString(magic
)
243 // Merged list of paths.
244 pathData
:= ix3
.offset()
245 ix3
.writeString("\x00")
247 // Merged list of names.
248 nameData
:= ix3
.offset()
249 nameIndexFile
:= bufCreate("")
250 for i
:= 0; i
< ix1
.numName
; i
++ {
251 nameIndexFile
.writeUint32(ix3
.offset() - nameData
)
252 ix3
.writeString(ix1
.Name(uint32(i
)))
253 ix3
.writeString("\x00")
255 for i
:= 0; i
< ix2
.numName
; i
++ {
256 nameIndexFile
.writeUint32(ix3
.offset() - nameData
)
257 ix3
.writeString(ix2
.Name(uint32(i
)))
258 ix3
.writeString("\x00")
261 nameIndexFile
.writeUint32(ix3
.offset())
263 // Merged list of posting lists.
264 postData
:= ix3
.offset()
270 r1
.init(ix1
, []idrange
{{lo
: 0, hi
: uint32(ix1
.numName
), new: 0}})
271 r2
.init(ix2
, []idrange
{{lo
: 0, hi
: uint32(ix2
.numName
), new: uint32(ix1
.numName
)}})
273 if r1
.trigram
< r2
.trigram
{
274 w
.trigram(r1
.trigram
)
280 } else if r2
.trigram
< r1
.trigram
{
281 w
.trigram(r2
.trigram
)
288 if r1
.trigram
== ^uint32(0) {
291 w
.trigram(r1
.trigram
)
305 nameIndex
:= ix3
.offset()
306 copyFile(ix3
, nameIndexFile
)
308 // Posting list index
309 postIndex
:= ix3
.offset()
310 copyFile(ix3
, w
.postIndexFile
)
312 ix3
.writeUint32(pathData
)
313 ix3
.writeUint32(nameData
)
314 ix3
.writeUint32(postData
)
315 ix3
.writeUint32(nameIndex
)
316 ix3
.writeUint32(postIndex
)
317 ix3
.writeString(trailerMagic
)
320 os
.Remove(nameIndexFile
.name
)
321 os
.Remove(w
.postIndexFile
.name
)
324 type postMapReader
struct {
337 func (r
*postMapReader
) init(ix
*Index
, idmap
[]idrange
) {
340 r
.trigram
= ^uint32(0)
344 func (r
*postMapReader
) nextTrigram() {
349 func (r
*postMapReader
) load() {
350 if r
.triNum
>= uint32(r
.ix
.numPost
) {
351 r
.trigram
= ^uint32(0)
353 r
.fileid
= ^uint32(0)
356 r
.trigram
, r
.count
, r
.offset
= r
.ix
.listAt(r
.triNum
* postEntrySize
)
358 r
.fileid
= ^uint32(0)
361 r
.d
= r
.ix
.slice(r
.ix
.postData
+r
.offset
+3, -1)
366 func (r
*postMapReader
) nextId() bool {
369 delta64
, n
:= binary
.Uvarint(r
.d
)
370 delta
:= uint32(delta64
)
371 if n
<= 0 || delta
== 0 {
376 for r
.i
< len(r
.idmap
) && r
.idmap
[r
.i
].hi
<= r
.oldid
{
379 if r
.i
>= len(r
.idmap
) {
383 if r
.oldid
< r
.idmap
[r
.i
].lo
{
386 r
.fileid
= r
.idmap
[r
.i
].new + r
.oldid
- r
.idmap
[r
.i
].lo
390 r
.fileid
= ^uint32(0)
394 // Directly writes the entire posting list to w.
395 // Useful to avoid function call overhead, and also expects to be called from
396 // ConcatN only (i.e. takes shortcuts that may break usage of idmap other than
397 // what ConcatN does).
398 func (r
*postMapReader
) writePostingList(w
*postDataWriter
) {
403 // The first entry is fully decoded and re-encoded because w may be in the
404 // middle of a posting list and we need to delta-encode _in relation to_
407 delta64
, n
:= binary
.Uvarint(r
.d
)
408 delta
:= uint32(delta64
)
409 if n
<= 0 || delta
== 0 {
413 r
.oldid
= r
.idmap
[0].new + ^uint32(0) + delta
417 // If the posting list has only one entry, we’re done now.
419 r
.fileid
= ^uint32(0)
423 // TODO: myPostingLast finds the amount of bytes and the last value. In the
424 // long run, we may want to store the last value of a posting list so that
425 // we don’t need to read the entire thing to copy it properly. The overhead
426 // is 2 * sizeof(uint32) * num_posting_lists. calculate that once we have
428 last
, totalbytes
:= myPostingLast(r
.d
, r
.count
, r
.oldid
)
429 w
.out
.write(r
.d
[:totalbytes
])
430 w
.last
= uint32(last
)
431 r
.fileid
= ^uint32(0)
434 type postDataWriter
struct {
436 postIndexFile
*bufWriter
444 func (w
*postDataWriter
) init(out
*bufWriter
) {
446 w
.postIndexFile
= bufCreate("")
447 w
.base
= out
.offset()
450 func (w
*postDataWriter
) trigram(t
uint32) {
451 w
.offset
= w
.out
.offset()
457 func (w
*postDataWriter
) fileid(id
uint32) {
459 w
.out
.writeTrigram(w
.t
)
461 w
.out
.writeUvarint(id
- w
.last
)
466 func (w
*postDataWriter
) endTrigram() {
470 w
.out
.writeUvarint(0)
471 w
.postIndexFile
.writeTrigram(w
.t
)
472 w
.postIndexFile
.writeUint32(w
.count
)
473 w
.postIndexFile
.writeUint32(w
.offset
- w
.base
)