[honey] Fix reading doclength off end of db
[xapian.git] / xapian-core / backends / honey / honey_alldocspostlist.cc
blobed6ff730f96fdf43b185d189aa3907a90f3cd9cd
1 /** @file honey_alldocspostlist.cc
2 * @brief A PostList which iterates over all documents in a HoneyDatabase.
3 */
4 /* Copyright (C) 2006,2007,2008,2009,2018 Olly Betts
5 * Copyright (C) 2008 Lemur Consulting Ltd
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 #include <config.h>
23 #include "honey_alldocspostlist.h"
25 #include "honey_database.h"
27 #include "debuglog.h"
28 #include "str.h"
29 #include "wordaccess.h"
31 #include <string>
33 using namespace Honey;
34 using namespace std;
36 HoneyAllDocsPostList::HoneyAllDocsPostList(const HoneyDatabase* db,
37 Xapian::doccount doccount_)
38 : LeafPostList(string()),
39 cursor(db->get_postlist_cursor()),
40 doccount(doccount_)
42 LOGCALL_CTOR(DB, "HoneyAllDocsPostList", db | doccount_);
43 cursor->find_entry_ge(string("\0\xe0", 2));
46 HoneyAllDocsPostList::~HoneyAllDocsPostList()
48 delete cursor;
51 Xapian::doccount
52 HoneyAllDocsPostList::get_termfreq() const
54 LOGCALL(DB, Xapian::doccount, "HoneyAllDocsPostList::get_termfreq", NO_ARGS);
55 RETURN(doccount);
58 Xapian::termcount
59 HoneyAllDocsPostList::get_doclength() const
61 LOGCALL(DB, Xapian::termcount, "HoneyAllDocsPostList::get_doclength", NO_ARGS);
62 RETURN(reader.get_doclength());
65 Xapian::docid
66 HoneyAllDocsPostList::get_docid() const
68 return reader.get_docid();
71 Xapian::termcount
72 HoneyAllDocsPostList::get_wdf() const
74 LOGCALL(DB, Xapian::termcount, "HoneyAllDocsPostList::get_wdf", NO_ARGS);
75 AssertParanoid(!at_end());
76 RETURN(1);
79 bool
80 HoneyAllDocsPostList::at_end() const
82 return cursor == NULL;
85 PostList*
86 HoneyAllDocsPostList::next(double)
88 Assert(cursor);
89 if (!reader.at_end()) {
90 if (reader.next()) return NULL;
91 cursor->next();
94 if (!cursor->after_end()) {
95 if (reader.update(cursor)) {
96 if (!reader.at_end()) return NULL;
100 // We've reached the end.
101 delete cursor;
102 cursor = NULL;
103 return NULL;
106 PostList*
107 HoneyAllDocsPostList::skip_to(Xapian::docid did, double)
109 if (rare(!cursor)) {
110 // No-op if already at_end.
111 return NULL;
114 Assert(!reader.at_end());
116 if (reader.skip_to(did))
117 return NULL;
119 if (cursor->find_entry_ge(make_doclenchunk_key(did))) {
120 // Exact match.
121 if (rare(!reader.update(cursor))) {
122 // Shouldn't be possible.
123 Assert(false);
125 if (reader.skip_to(did)) return NULL;
126 // The chunk's last docid is did, so skip_to() should always succeed.
127 Assert(false);
128 } else if (!cursor->after_end()) {
129 if (reader.update(cursor)) {
130 if (reader.skip_to(did)) return NULL;
131 // The chunk's last docid is >= did, so skip_to() should always
132 // succeed.
133 Assert(false);
137 // We've reached the end.
138 delete cursor;
139 cursor = NULL;
140 return NULL;
143 PostList*
144 HoneyAllDocsPostList::check(Xapian::docid did, double, bool& valid)
146 if (rare(!cursor)) {
147 // Already at_end.
148 valid = true;
149 return NULL;
152 if (!reader.at_end()) {
153 // Check for the requested docid in the current block.
154 if (reader.skip_to(did)) {
155 valid = true;
156 return NULL;
160 // Try moving to the appropriate chunk.
161 if (!cursor->find_entry_ge(make_doclenchunk_key(did))) {
162 // We're in a chunk which might contain the docid.
163 if (reader.update(cursor)) {
164 if (reader.skip_to(did)) {
165 valid = true;
166 return NULL;
169 valid = false;
170 return NULL;
173 // We had an exact match for a chunk starting with specified docid.
174 Assert(!cursor->after_end());
175 if (!reader.update(cursor)) {
176 // We found the exact key we built so it must be a doclen chunk.
177 // Therefore reader.update() "can't possibly fail".
178 Assert(false);
181 valid = true;
182 return NULL;
185 string
186 HoneyAllDocsPostList::get_description() const
188 string desc = "HoneyAllDocsPostList(did=";
189 desc += str(get_docid());
190 desc += ",doccount=";
191 desc += str(doccount);
192 desc += ')';
193 return desc;
196 namespace Honey {
198 bool
199 DocLenChunkReader::read_doclen(const unsigned char* q)
201 switch (width) {
202 case 1:
203 doclen = *q;
204 return doclen != 0xff;
205 case 2:
206 doclen = unaligned_read2(q);
207 return doclen != 0xffff;
208 case 3:
209 // q - 1 is always a valid byte - either the leading byte holding
210 // the data width, or else the last byte of the previous value.
211 // unaligned_read4() uses bigendian order, so we just need to mask
212 // off the most significant byte.
213 doclen = unaligned_read4(q - 1) & 0xffffff;
214 return doclen != 0xffffff;
215 default:
216 doclen = unaligned_read4(q);
217 return doclen != 0xffffffff;
221 bool
222 DocLenChunkReader::update(HoneyCursor* cursor)
224 Xapian::docid last_did = docid_from_key(cursor->current_key);
225 if (!last_did) return false;
227 cursor->read_tag();
229 size_t len = cursor->current_tag.size();
230 if (rare(len == 0))
231 throw Xapian::DatabaseCorruptError("Doclen data chunk is empty");
233 p = reinterpret_cast<const unsigned char*>(cursor->current_tag.data());
234 end = p + len;
235 width = *p++;
236 if (((width - 8) &~ 0x18) != 0) {
237 throw Xapian::DatabaseCorruptError("Invalid doclen width - currently "
238 "8, 16, 24 and 32 are supported");
240 width /= 8;
241 if ((len - 1) % width != 0)
242 throw Xapian::DatabaseCorruptError("Doclen data chunk has junk at end");
243 Xapian::docid first_did = last_did - (len - 1) / width + 1;
245 did = first_did;
246 if (!read_doclen(p)) {
247 // The first doclen value shouldn't be missing.
248 throw Xapian::DatabaseCorruptError("Invalid first doclen value");
250 return true;
253 bool
254 DocLenChunkReader::next()
256 do {
257 p += width;
258 if (p == end) {
259 p = NULL;
260 return false;
263 ++did;
264 } while (!read_doclen(p));
265 return true;
268 bool
269 DocLenChunkReader::skip_to(Xapian::docid target)
271 if (p == NULL)
272 return false;
274 if (target <= did)
275 return true;
277 Xapian::docid delta = target - did;
278 if (delta >= Xapian::docid(end - p) / width) {
279 p = NULL;
280 return false;
283 did = target;
284 p += delta * width;
286 return read_doclen(p) || next();
289 // FIXME: Add check() method, which doesn't advance when read_doclen() returns
290 // false?
292 bool
293 DocLenChunkReader::find_doclength(Xapian::docid target)
295 if (target < did)
296 return false;
298 Xapian::docid delta = target - did;
299 Assert(width > 0);
300 if (delta >= Xapian::docid(end - p) / width) {
301 return false;
304 return read_doclen(p + delta * width);