myhtmlparse.cc: Remove unused header.
[xapian.git] / xapian-core / api / compactor.cc
blob4270067d6536e8f5f880e39865fc969c2fce4678
1 /** @file compactor.cc
2 * @brief Compact a database, or merge and compact several.
3 */
4 /* Copyright (C) 2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013 Olly Betts
5 * Copyright (C) 2008 Lemur Consulting Ltd
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
20 * USA
23 #include <config.h>
25 #include <xapian/compactor.h>
27 #include "safeerrno.h"
29 #include <algorithm>
30 #include <fstream>
32 #include <cstdlib>
33 #include <cstring>
34 #include <ctime>
35 #include "safesysstat.h"
36 #include <sys/types.h>
38 #include "safeunistd.h"
39 #include "safefcntl.h"
41 #include "noreturn.h"
42 #include "omassert.h"
43 #include "filetests.h"
44 #include "fileutils.h"
45 #include "posixy_wrapper.h"
46 #include "stringutils.h"
47 #include "str.h"
49 #ifdef XAPIAN_HAS_GLASS_BACKEND
50 #include "backends/glass/glass_compact.h"
51 #include "backends/glass/glass_version.h"
52 #endif
53 #ifdef XAPIAN_HAS_CHERT_BACKEND
54 #include "backends/chert/chert_compact.h"
55 #include "backends/chert/chert_version.h"
56 #endif
58 #include <xapian/database.h>
59 #include <xapian/error.h>
61 using namespace std;
63 class CmpByFirstUsed {
64 const vector<pair<Xapian::docid, Xapian::docid> > & used_ranges;
66 public:
67 CmpByFirstUsed(const vector<pair<Xapian::docid, Xapian::docid> > & ur)
68 : used_ranges(ur) { }
70 bool operator()(size_t a, size_t b) {
71 return used_ranges[a].first < used_ranges[b].first;
/** Backend names for use in messages, indexed by backend code.
 *
 *  Entry 0 has no name (NULL); entries 1 and 2 are "chert" and "glass".
 *  Both the strings and the table itself are read-only.
 */
static const char * const backend_names[] = {
    NULL,
    "chert",
    "glass",
};
/// Kind of stub destination: none, a stub file, or a stub directory.
enum {
    STUB_NO,
    STUB_FILE,
    STUB_DIR
};
83 namespace Xapian {
85 class Compactor::Internal : public Xapian::Internal::intrusive_base {
86 friend class Compactor;
88 string destdir;
89 bool renumber;
90 bool multipass;
91 int compact_to_stub;
92 size_t block_size;
93 compaction_level compaction;
95 Xapian::docid tot_off;
96 Xapian::docid last_docid;
98 enum { UNKNOWN, CHERT, GLASS } backend;
100 string first_source;
102 vector<string> sources;
103 vector<Xapian::docid> offset;
104 vector<pair<Xapian::docid, Xapian::docid> > used_ranges;
105 public:
106 Internal()
107 : renumber(true), multipass(false),
108 block_size(8192), compaction(FULL), tot_off(0),
109 last_docid(0), backend(UNKNOWN)
113 void set_destdir(const string & destdir_);
115 void add_source(const string & srcdir);
117 void compact(Xapian::Compactor & compactor);
120 Compactor::Compactor() : internal(new Compactor::Internal()) { }
122 Compactor::~Compactor() { }
124 void
125 Compactor::set_block_size(size_t block_size)
127 internal->block_size = block_size;
130 void
131 Compactor::set_renumber(bool renumber)
133 internal->renumber = renumber;
136 void
137 Compactor::set_multipass(bool multipass)
139 internal->multipass = multipass;
142 void
143 Compactor::set_compaction_level(compaction_level compaction)
145 internal->compaction = compaction;
148 void
149 Compactor::set_destdir(const string & destdir)
151 internal->set_destdir(destdir);
154 void
155 Compactor::add_source(const string & srcdir)
157 internal->add_source(srcdir);
160 void
161 Compactor::compact()
163 internal->compact(*this);
166 void
167 Compactor::set_status(const string & table, const string & status)
169 (void)table;
170 (void)status;
173 string
174 Compactor::resolve_duplicate_metadata(const string & key,
175 size_t num_tags, const std::string tags[])
177 (void)key;
178 (void)num_tags;
179 return tags[0];
184 XAPIAN_NORETURN(
185 static void
186 backend_mismatch(const string &dbpath1, int backend1,
187 const string &dbpath2, int backend2)
189 static void
190 backend_mismatch(const string &dbpath1, int backend1,
191 const string &dbpath2, int backend2)
193 string msg = "All databases must be the same type ('";
194 msg += dbpath1;
195 msg += "' is ";
196 msg += backend_names[backend1];
197 msg += ", but '";
198 msg += dbpath2;
199 msg += "' is ";
200 msg += backend_names[backend2];
201 msg += ')';
202 throw Xapian::InvalidArgumentError(msg);
205 namespace Xapian {
207 void
208 Compactor::Internal::set_destdir(const string & destdir_) {
209 destdir = destdir_;
210 compact_to_stub = STUB_NO;
211 if (file_exists(destdir)) {
212 // Stub file.
213 compact_to_stub = STUB_FILE;
214 } else if (file_exists(destdir + "/XAPIANDB")) {
215 // Stub directory.
216 compact_to_stub = STUB_DIR;
220 void
221 Compactor::Internal::add_source(const string & srcdir)
223 // Check destdir isn't the same as any source directory, unless it is a
224 // stub database.
225 if (!compact_to_stub && srcdir == destdir) {
226 throw Xapian::InvalidArgumentError("destination may not be the same as any source directory, unless it is a stub database");
229 struct stat sb;
230 if (stat(srcdir.c_str(), &sb) == 0) {
231 bool is_stub = false;
232 string file = srcdir;
233 if (S_ISREG(sb.st_mode)) {
234 // Stub database file.
235 is_stub = true;
236 } else if (S_ISDIR(sb.st_mode)) {
237 file += "/XAPIANDB";
238 if (stat(file.c_str(), &sb) == 0 && S_ISREG(sb.st_mode)) {
239 // Stub database directory.
240 is_stub = true;
243 if (is_stub) {
244 ifstream stub(file.c_str());
245 string line;
246 unsigned int line_no = 0;
247 while (getline(stub, line)) {
248 ++line_no;
249 if (line.empty() || line[0] == '#')
250 continue;
251 string::size_type space = line.find(' ');
252 if (space == string::npos) space = line.size();
254 string type(line, 0, space);
255 line.erase(0, space + 1);
257 if (type == "auto" || type == "chert" || type == "glass") {
258 resolve_relative_path(line, file);
259 add_source(line);
260 continue;
263 if (type == "remote" || type == "inmemory") {
264 string msg = "Can't compact stub entry of type '";
265 msg += type;
266 msg += '\'';
267 throw Xapian::InvalidOperationError(msg);
270 if (type == "flint") {
271 throw Xapian::DatabaseError("Flint backend no longer supported");
273 throw Xapian::DatabaseError("Bad line in stub file");
275 return;
279 if (file_exists(string(srcdir) + "/iamchert")) {
280 if (backend == UNKNOWN) {
281 backend = CHERT;
282 } else if (backend != CHERT) {
283 backend_mismatch(first_source, backend, srcdir, CHERT);
285 } else if (file_exists(string(srcdir) + "/iamglass")) {
286 if (backend == UNKNOWN) {
287 backend = GLASS;
288 } else if (backend != GLASS) {
289 backend_mismatch(first_source, backend, srcdir, GLASS);
291 } else if (file_exists(string(srcdir) + "/iamflint")) {
292 throw Xapian::DatabaseError("Flint backend no longer supported");
293 } else if (file_exists(string(srcdir) + "/iambrass")) {
294 throw Xapian::DatabaseError("Brass backend no longer supported");
295 } else {
296 string msg = srcdir;
297 msg += ": not a chert or glass database";
298 throw Xapian::InvalidArgumentError(msg);
301 if (first_source.empty())
302 first_source = srcdir;
304 Xapian::Database db(srcdir);
305 Xapian::docid first = 0, last = 0;
307 // "Empty" databases might have spelling or synonym data so can't
308 // just be completely ignored.
309 Xapian::doccount num_docs = db.get_doccount();
310 if (num_docs != 0) {
311 Xapian::PostingIterator it = db.postlist_begin(string());
312 // This test should never fail, since db.get_doccount() is
313 // non-zero!
314 Assert(it != db.postlist_end(string()));
315 first = *it;
317 if (renumber && first) {
318 // Prune any unused docids off the start of this source
319 // database.
321 // tot_off could wrap here, but it's unsigned, so that's
322 // OK.
323 tot_off -= (first - 1);
326 // There may be unused documents at the end of the range.
327 // Binary chop using skip_to to find the last actually used
328 // document id.
329 last = db.get_lastdocid();
330 Xapian::docid last_lbound = first + num_docs - 1;
331 while (last_lbound < last) {
332 Xapian::docid mid;
333 mid = last_lbound + (last - last_lbound + 1) / 2;
334 it.skip_to(mid);
335 if (it == db.postlist_end(string())) {
336 last = mid - 1;
337 it = db.postlist_begin(string());
338 continue;
340 last_lbound = *it;
343 offset.push_back(tot_off);
344 if (renumber)
345 tot_off += last;
346 else if (last_docid < db.get_lastdocid())
347 last_docid = db.get_lastdocid();
348 used_ranges.push_back(make_pair(first, last));
350 sources.push_back(string(srcdir) + '/');
353 void
354 Compactor::Internal::compact(Xapian::Compactor & compactor)
356 if (renumber)
357 last_docid = tot_off;
359 if (!renumber && sources.size() > 1) {
360 // We want to process the sources in ascending order of first
361 // docid. So we create a vector "order" with ascending integers
362 // and then sort so the indirected order is right. Then we reorder
363 // the vectors into that order and check the ranges are disjoint.
364 vector<size_t> order;
365 order.reserve(sources.size());
366 for (size_t i = 0; i < sources.size(); ++i)
367 order.push_back(i);
369 sort(order.begin(), order.end(), CmpByFirstUsed(used_ranges));
371 // Reorder the vectors to be in ascending of first docid, and
372 // set all the offsets to 0.
373 vector<string> sources_(sources.size());
374 vector<pair<Xapian::docid, Xapian::docid> > used_ranges_;
375 used_ranges_.reserve(sources.size());
377 Xapian::docid last_start = 0, last_end = 0;
378 for (size_t j = 0; j != order.size(); ++j) {
379 size_t n = order[j];
381 swap(sources_[j], sources[n]);
382 used_ranges_.push_back(used_ranges[n]);
384 const pair<Xapian::docid, Xapian::docid> p = used_ranges[n];
385 // Skip empty databases.
386 if (p.first == 0 && p.second == 0)
387 continue;
388 // Check for overlap with the previous database's range.
389 if (p.first <= last_end) {
390 string msg = "when merging databases, --no-renumber is only currently supported if the databases have disjoint ranges of used document ids: ";
391 msg += sources[order[j - 1]];
392 msg += " has range ";
393 msg += str(last_start);
394 msg += '-';
395 msg += str(last_end);
396 msg += ", ";
397 msg += sources[n];
398 msg += " has range ";
399 msg += str(p.first);
400 msg += '-';
401 msg += str(p.second);
402 throw Xapian::InvalidOperationError(msg);
404 last_start = p.first;
405 last_end = p.second;
408 swap(sources, sources_);
409 swap(used_ranges, used_ranges_);
412 string stub_file;
413 if (compact_to_stub) {
414 stub_file = destdir;
415 if (compact_to_stub == STUB_DIR) {
416 stub_file += "/XAPIANDB";
417 destdir += '/';
418 } else {
419 destdir += '_';
421 size_t sfx = destdir.size();
422 time_t now = time(NULL);
423 while (true) {
424 destdir.resize(sfx);
425 destdir += str(now++);
426 if (mkdir(destdir.c_str(), 0755) == 0)
427 break;
428 if (errno != EEXIST) {
429 string msg = destdir;
430 msg += ": mkdir failed";
431 throw Xapian::DatabaseError(msg, errno);
434 } else {
435 // If the destination database directory doesn't exist, create it.
436 if (mkdir(destdir.c_str(), 0755) < 0) {
437 // Check why mkdir failed. It's ok if the directory already
438 // exists, but we also get EEXIST if there's an existing file with
439 // that name.
440 if (errno == EEXIST) {
441 if (dir_exists(destdir))
442 errno = 0;
443 else
444 errno = EEXIST; // dir_exists() might have changed it
446 if (errno) {
447 string msg = destdir;
448 msg += ": cannot create directory";
449 throw Xapian::DatabaseError(msg, errno);
454 if (backend == CHERT) {
455 #ifdef XAPIAN_HAS_CHERT_BACKEND
456 compact_chert(compactor, destdir.c_str(), sources, offset, block_size,
457 compaction, multipass, last_docid);
458 #else
459 (void)compactor;
460 throw Xapian::FeatureUnavailableError("Chert backend disabled at build time");
461 #endif
462 } else if (backend == GLASS) {
463 #ifdef XAPIAN_HAS_GLASS_BACKEND
464 compact_glass(compactor, destdir.c_str(), sources, offset, block_size,
465 compaction, multipass, last_docid);
466 #else
467 (void)compactor;
468 throw Xapian::FeatureUnavailableError("Glass backend disabled at build time");
469 #endif
472 // Create the version file ("iamchert", etc).
474 // This file contains a UUID, and we want the copy to have a fresh
475 // UUID since its revision counter is reset to 1.
476 if (backend == CHERT) {
477 #ifdef XAPIAN_HAS_CHERT_BACKEND
478 ChertVersion(destdir).create();
479 #else
480 // Handled above.
481 exit(1);
482 #endif
485 if (compact_to_stub) {
486 string new_stub_file = destdir;
487 new_stub_file += "/new_stub.tmp";
489 ofstream new_stub(new_stub_file.c_str());
490 #ifndef __WIN32__
491 size_t slash = destdir.find_last_of('/');
492 #else
493 size_t slash = destdir.find_last_of("/\\");
494 #endif
495 new_stub << "auto " << destdir.substr(slash + 1) << '\n';
497 if (posixy_rename(new_stub_file.c_str(), stub_file.c_str()) < 0) {
498 // FIXME: try to clean up?
499 string msg = "Cannot rename '";
500 msg += new_stub_file;
501 msg += "' to '";
502 msg += stub_file;
503 msg += '\'';
504 throw Xapian::DatabaseError(msg, errno);