fix server-rendered results by generating result pages into a buffer and reading...
[debiancodesearch.git] / index / merge.go
blob66e36fd3e0ce937e8c6db84cf401834edc9e70e2
1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 package index
7 // Merging indexes.
8 //
9 // To merge two indexes A and B (newer) into a combined index C:
11 // Load the path list from B and determine for each path the docid ranges
12 // that it will replace in A.
14 // Read A's and B's name lists together, merging them into C's name list.
15 // Discard the identified ranges from A during the merge. Also during the merge,
16 // record the mapping from A's docids to C's docids, and also the mapping from
17 // B's docids to C's docids. Both mappings can be summarized in a table like
19 // 10-14 map to 20-24
20 // 15-24 is deleted
21 // 25-34 maps to 40-49
23 // The number of ranges will be at most the combined number of paths.
24 // Also during the merge, write the name index to a temporary file as usual.
26 // Now merge the posting lists (this is why they begin with the trigram).
27 // During the merge, translate the docid numbers to the new C docid space.
28 // Also during the merge, write the posting list index to a temporary file as usual.
30 // Copy the name index and posting list index into C's index and write the trailer.
31 // Rename C's index onto the new index.
33 import (
34 "encoding/binary"
35 "os"
36 "strings"
39 // An idrange records that the half-open interval [lo, hi) maps to [new, new+hi-lo).
40 type idrange struct {
41 lo, hi, new uint32
44 type postIndex struct {
45 tri uint32
46 count uint32
47 offset uint32
50 // Merge creates a new index in the file dst that corresponds to merging
51 // the two indices src1 and src2. If both src1 and src2 claim responsibility
52 // for a path, src2 is assumed to be newer and is given preference.
53 func Merge(dst, src1, src2 string) {
54 ix1 := Open(src1)
55 ix2 := Open(src2)
56 paths1 := ix1.Paths()
57 paths2 := ix2.Paths()
59 // Build docid maps.
60 var i1, i2, new uint32
61 var map1, map2 []idrange
62 for _, path := range paths2 {
63 // Determine range shadowed by this path.
64 old := i1
65 for i1 < uint32(ix1.numName) && ix1.Name(i1) < path {
66 i1++
68 lo := i1
69 limit := path[:len(path)-1] + string(path[len(path)-1]+1)
70 for i1 < uint32(ix1.numName) && ix1.Name(i1) < limit {
71 i1++
73 hi := i1
75 // Record range before the shadow.
76 if old < lo {
77 map1 = append(map1, idrange{old, lo, new})
78 new += lo - old
81 // Determine range defined by this path.
82 // Because we are iterating over the ix2 paths,
83 // there can't be gaps, so it must start at i2.
84 if i2 < uint32(ix2.numName) && ix2.Name(i2) < path {
85 panic("merge: inconsistent index")
87 lo = i2
88 for i2 < uint32(ix2.numName) && ix2.Name(i2) < limit {
89 i2++
91 hi = i2
92 if lo < hi {
93 map2 = append(map2, idrange{lo, hi, new})
94 new += hi - lo
98 if i1 < uint32(ix1.numName) {
99 map1 = append(map1, idrange{i1, uint32(ix1.numName), new})
100 new += uint32(ix1.numName) - i1
102 if i2 < uint32(ix2.numName) {
103 panic("merge: inconsistent index")
105 numName := new
107 ix3 := bufCreate(dst)
108 ix3.writeString(magic)
110 // Merged list of paths.
111 pathData := ix3.offset()
112 mi1 := 0
113 mi2 := 0
114 last := "\x00" // not a prefix of anything
115 for mi1 < len(paths1) || mi2 < len(paths2) {
116 var p string
117 if mi2 >= len(paths2) || mi1 < len(paths1) && paths1[mi1] <= paths2[mi2] {
118 p = paths1[mi1]
119 mi1++
120 } else {
121 p = paths2[mi2]
122 mi2++
124 if strings.HasPrefix(p, last) {
125 continue
127 last = p
128 ix3.writeString(p)
129 ix3.writeString("\x00")
131 ix3.writeString("\x00")
133 // Merged list of names.
134 nameData := ix3.offset()
135 nameIndexFile := bufCreate("")
136 new = 0
137 mi1 = 0
138 mi2 = 0
139 for new < numName {
140 if mi1 < len(map1) && map1[mi1].new == new {
141 for i := map1[mi1].lo; i < map1[mi1].hi; i++ {
142 name := ix1.Name(i)
143 nameIndexFile.writeUint32(ix3.offset() - nameData)
144 ix3.writeString(name)
145 ix3.writeString("\x00")
146 new++
148 mi1++
149 } else if mi2 < len(map2) && map2[mi2].new == new {
150 for i := map2[mi2].lo; i < map2[mi2].hi; i++ {
151 name := ix2.Name(i)
152 nameIndexFile.writeUint32(ix3.offset() - nameData)
153 ix3.writeString(name)
154 ix3.writeString("\x00")
155 new++
157 mi2++
158 } else {
159 panic("merge: inconsistent index")
162 if new*4 != nameIndexFile.offset() {
163 panic("merge: inconsistent index")
165 nameIndexFile.writeUint32(ix3.offset())
167 // Merged list of posting lists.
168 postData := ix3.offset()
169 var r1 postMapReader
170 var r2 postMapReader
171 var w postDataWriter
172 r1.init(ix1, map1)
173 r2.init(ix2, map2)
174 w.init(ix3)
175 for {
176 if r1.trigram < r2.trigram {
177 w.trigram(r1.trigram)
178 for r1.nextId() {
179 w.fileid(r1.fileid)
181 r1.nextTrigram()
182 w.endTrigram()
183 } else if r2.trigram < r1.trigram {
184 w.trigram(r2.trigram)
185 for r2.nextId() {
186 w.fileid(r2.fileid)
188 r2.nextTrigram()
189 w.endTrigram()
190 } else {
191 if r1.trigram == ^uint32(0) {
192 break
194 w.trigram(r1.trigram)
195 r1.nextId()
196 r2.nextId()
197 for r1.fileid < ^uint32(0) || r2.fileid < ^uint32(0) {
198 if r1.fileid < r2.fileid {
199 w.fileid(r1.fileid)
200 r1.nextId()
201 } else if r2.fileid < r1.fileid {
202 w.fileid(r2.fileid)
203 r2.nextId()
204 } else {
205 panic("merge: inconsistent index")
208 r1.nextTrigram()
209 r2.nextTrigram()
210 w.endTrigram()
214 // Name index
215 nameIndex := ix3.offset()
216 copyFile(ix3, nameIndexFile)
218 // Posting list index
219 postIndex := ix3.offset()
220 copyFile(ix3, w.postIndexFile)
222 ix3.writeUint32(pathData)
223 ix3.writeUint32(nameData)
224 ix3.writeUint32(postData)
225 ix3.writeUint32(nameIndex)
226 ix3.writeUint32(postIndex)
227 ix3.writeString(trailerMagic)
228 ix3.flush()
230 os.Remove(nameIndexFile.name)
231 os.Remove(w.postIndexFile.name)
234 // src1 and src2 must not cover the same files. dst will contain an index that
235 // contains src1 and src2.
236 func Concat(dst, src1, src2 string) {
237 ix1 := Open(src1)
238 ix2 := Open(src2)
240 ix3 := bufCreate(dst)
241 ix3.writeString(magic)
243 // Merged list of paths.
244 pathData := ix3.offset()
245 ix3.writeString("\x00")
247 // Merged list of names.
248 nameData := ix3.offset()
249 nameIndexFile := bufCreate("")
250 for i := 0; i < ix1.numName; i++ {
251 nameIndexFile.writeUint32(ix3.offset() - nameData)
252 ix3.writeString(ix1.Name(uint32(i)))
253 ix3.writeString("\x00")
255 for i := 0; i < ix2.numName; i++ {
256 nameIndexFile.writeUint32(ix3.offset() - nameData)
257 ix3.writeString(ix2.Name(uint32(i)))
258 ix3.writeString("\x00")
261 nameIndexFile.writeUint32(ix3.offset())
263 // Merged list of posting lists.
264 postData := ix3.offset()
265 var r1 postMapReader
266 var r2 postMapReader
267 var w postDataWriter
269 w.init(ix3)
270 r1.init(ix1, []idrange{{lo: 0, hi: uint32(ix1.numName), new: 0}})
271 r2.init(ix2, []idrange{{lo: 0, hi: uint32(ix2.numName), new: uint32(ix1.numName)}})
272 for {
273 if r1.trigram < r2.trigram {
274 w.trigram(r1.trigram)
275 for r1.nextId() {
276 w.fileid(r1.fileid)
278 r1.nextTrigram()
279 w.endTrigram()
280 } else if r2.trigram < r1.trigram {
281 w.trigram(r2.trigram)
282 for r2.nextId() {
283 w.fileid(r2.fileid)
285 r2.nextTrigram()
286 w.endTrigram()
287 } else {
288 if r1.trigram == ^uint32(0) {
289 break
291 w.trigram(r1.trigram)
292 for r1.nextId() {
293 w.fileid(r1.fileid)
295 for r2.nextId() {
296 w.fileid(r2.fileid)
298 r1.nextTrigram()
299 r2.nextTrigram()
300 w.endTrigram()
304 // Name index
305 nameIndex := ix3.offset()
306 copyFile(ix3, nameIndexFile)
308 // Posting list index
309 postIndex := ix3.offset()
310 copyFile(ix3, w.postIndexFile)
312 ix3.writeUint32(pathData)
313 ix3.writeUint32(nameData)
314 ix3.writeUint32(postData)
315 ix3.writeUint32(nameIndex)
316 ix3.writeUint32(postIndex)
317 ix3.writeString(trailerMagic)
318 ix3.flush()
320 os.Remove(nameIndexFile.name)
321 os.Remove(w.postIndexFile.name)
324 type postMapReader struct {
325 ix *Index
326 idmap []idrange
327 triNum uint32
328 trigram uint32
329 count uint32
330 offset uint32
331 d []byte
332 oldid uint32
333 fileid uint32
334 i int
337 func (r *postMapReader) init(ix *Index, idmap []idrange) {
338 r.ix = ix
339 r.idmap = idmap
340 r.trigram = ^uint32(0)
341 r.load()
344 func (r *postMapReader) nextTrigram() {
345 r.triNum++
346 r.load()
349 func (r *postMapReader) load() {
350 if r.triNum >= uint32(r.ix.numPost) {
351 r.trigram = ^uint32(0)
352 r.count = 0
353 r.fileid = ^uint32(0)
354 return
356 r.trigram, r.count, r.offset = r.ix.listAt(r.triNum * postEntrySize)
357 if r.count == 0 {
358 r.fileid = ^uint32(0)
359 return
361 r.d = r.ix.slice(r.ix.postData+r.offset+3, -1)
362 r.oldid = ^uint32(0)
363 r.i = 0
366 func (r *postMapReader) nextId() bool {
367 for r.count > 0 {
368 r.count--
369 delta64, n := binary.Uvarint(r.d)
370 delta := uint32(delta64)
371 if n <= 0 || delta == 0 {
372 corrupt(r.ix.File)
374 r.d = r.d[n:]
375 r.oldid += delta
376 for r.i < len(r.idmap) && r.idmap[r.i].hi <= r.oldid {
377 r.i++
379 if r.i >= len(r.idmap) {
380 r.count = 0
381 break
383 if r.oldid < r.idmap[r.i].lo {
384 continue
386 r.fileid = r.idmap[r.i].new + r.oldid - r.idmap[r.i].lo
387 return true
390 r.fileid = ^uint32(0)
391 return false
394 // Directly writes the entire posting list to w.
395 // Useful to avoid function call overhead, and also expects to be called from
396 // ConcatN only (i.e. takes shortcuts that may break usage of idmap other than
397 // what ConcatN does).
398 func (r *postMapReader) writePostingList(w *postDataWriter) {
399 if r.count == 0 {
400 return
403 // The first entry is fully decoded and re-encoded because w may be in the
404 // middle of a posting list and we need to delta-encode _in relation to_
405 // the last entry.
406 r.count--
407 delta64, n := binary.Uvarint(r.d)
408 delta := uint32(delta64)
409 if n <= 0 || delta == 0 {
410 corrupt(r.ix.File)
412 r.d = r.d[n:]
413 r.oldid = r.idmap[0].new + ^uint32(0) + delta
414 w.fileid(r.oldid)
415 w.count += r.count
417 // If the posting list has only one entry, we’re done now.
418 if r.count == 0 {
419 r.fileid = ^uint32(0)
420 return
423 // TODO: myPostingLast finds the amount of bytes and the last value. In the
424 // long run, we may want to store the last value of a posting list so that
425 // we don’t need to read the entire thing to copy it properly. The overhead
426 // is 2 * sizeof(uint32) * num_posting_lists. calculate that once we have
427 // the entire index.
428 last, totalbytes := myPostingLast(r.d, r.count, r.oldid)
429 w.out.write(r.d[:totalbytes])
430 w.last = uint32(last)
431 r.fileid = ^uint32(0)
434 type postDataWriter struct {
435 out *bufWriter
436 postIndexFile *bufWriter
437 buf [10]byte
438 base uint32
439 count, offset uint32
440 last uint32
441 t uint32
444 func (w *postDataWriter) init(out *bufWriter) {
445 w.out = out
446 w.postIndexFile = bufCreate("")
447 w.base = out.offset()
450 func (w *postDataWriter) trigram(t uint32) {
451 w.offset = w.out.offset()
452 w.count = 0
453 w.t = t
454 w.last = ^uint32(0)
457 func (w *postDataWriter) fileid(id uint32) {
458 if w.count == 0 {
459 w.out.writeTrigram(w.t)
461 w.out.writeUvarint(id - w.last)
462 w.last = id
463 w.count++
466 func (w *postDataWriter) endTrigram() {
467 if w.count == 0 {
468 return
470 w.out.writeUvarint(0)
471 w.postIndexFile.writeTrigram(w.t)
472 w.postIndexFile.writeUint32(w.count)
473 w.postIndexFile.writeUint32(w.offset - w.base)