2 * @brief Compact a database, or merge and compact several.
4 /* Copyright (C) 2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013 Olly Betts
5 * Copyright (C) 2008 Lemur Consulting Ltd
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
25 #include <xapian/compactor.h>
27 #include "safeerrno.h"
35 #include "safesysstat.h"
36 #include <sys/types.h>
38 #include "safeunistd.h"
39 #include "safefcntl.h"
43 #include "filetests.h"
44 #include "fileutils.h"
45 #include "posixy_wrapper.h"
46 #include "stringutils.h"
49 #ifdef XAPIAN_HAS_GLASS_BACKEND
50 #include "backends/glass/glass_compact.h"
51 #include "backends/glass/glass_version.h"
53 #ifdef XAPIAN_HAS_CHERT_BACKEND
54 #include "backends/chert/chert_compact.h"
55 #include "backends/chert/chert_version.h"
58 #include <xapian/database.h>
59 #include <xapian/error.h>
// Comparison functor over indices into a vector of (first, last) docid
// ranges: index a orders before index b when used_ranges[a].first is less
// than used_ranges[b].first.  It is handed to sort() in
// Compactor::Internal::compact() to order sources by first used docid.
63 class CmpByFirstUsed
{
// Reference to the ranges being compared (held by reference, not copied).
64 const vector
<pair
<Xapian::docid
, Xapian::docid
> > & used_ranges
;
// Construct from the vector of ranges which the indices refer to.
67 CmpByFirstUsed(const vector
<pair
<Xapian::docid
, Xapian::docid
> > & ur
)
// Return true if the range at index a starts before the range at index b.
70 bool operator()(size_t a
, size_t b
) {
71 return used_ranges
[a
].first
< used_ranges
[b
].first
;
// Table of backend name strings.  backend_mismatch() indexes it with the
// backend values it is passed when building its error message.  (The
// initialiser list is not visible in this view.)
75 static const char * backend_names
[] = {
// Kinds of destination recorded in compact_to_stub by
// Compactor::Internal::set_destdir().  STUB_NO is the first enumerator and
// so has value 0, which lets compact_to_stub be tested as a boolean.
81 enum { STUB_NO
, STUB_FILE
, STUB_DIR
};
// Private implementation class behind Xapian::Compactor.  The public
// Compactor methods store their settings in, or forward to, an instance of
// this class.
85 class Compactor::Internal
: public Xapian::Internal::intrusive_base
{
// Compactor writes the settings members directly.
86 friend class Compactor
;
// Compaction level; set by Compactor::set_compaction_level().
93 compaction_level compaction
;
// Running docid offset; add_source() records its current value in `offset`
// for each source.
95 Xapian::docid tot_off
;
// Last docid value passed on to the backend compaction routine.
96 Xapian::docid last_docid
;
// Backend of the sources added so far; UNKNOWN until add_source() has
// identified one.
98 enum { UNKNOWN
, CHERT
, GLASS
} backend
;
// Source database paths; add_source() appends each with a trailing '/'.
102 vector
<string
> sources
;
// Docid offset recorded for each source by add_source().
103 vector
<Xapian::docid
> offset
;
// (first, last) docid pair recorded for each source by add_source().
104 vector
<pair
<Xapian::docid
, Xapian::docid
> > used_ranges
;
// Constructor initialiser list: renumbering on, multipass off, 8192 byte
// blocks, FULL compaction, zero offsets and no backend chosen yet.
107 : renumber(true), multipass(false),
108 block_size(8192), compaction(FULL
), tot_off(0),
109 last_docid(0), backend(UNKNOWN
)
// Set the destination path and work out whether it is a stub.
113 void set_destdir(const string
& destdir_
);
// Validate and register a source database to be compacted or merged.
115 void add_source(const string
& srcdir
);
// Perform the compaction; `compactor` is passed on to the backend routine.
117 void compact(Xapian::Compactor
& compactor
);
120 Compactor::Compactor() : internal(new Compactor::Internal()) { }
122 Compactor::~Compactor() { }
// Record the requested block size in the internal object.
// @param block_size  Value stored as internal->block_size.
125 Compactor::set_block_size(size_t block_size
)
127 internal
->block_size
= block_size
;
// Record whether document ids should be renumbered.
// @param renumber  Value stored as internal->renumber.
131 Compactor::set_renumber(bool renumber
)
133 internal
->renumber
= renumber
;
// Record the multipass setting.
// @param multipass  Value stored as internal->multipass.
137 Compactor::set_multipass(bool multipass
)
139 internal
->multipass
= multipass
;
// Record the requested compaction level.
// @param compaction  Value stored as internal->compaction.
143 Compactor::set_compaction_level(compaction_level compaction
)
145 internal
->compaction
= compaction
;
// Set the destination; forwards to Internal::set_destdir().
// @param destdir  Destination path, passed through unchanged.
149 Compactor::set_destdir(const string
& destdir
)
151 internal
->set_destdir(destdir
);
// Add a source database; forwards to Internal::add_source(), so any
// exception thrown there propagates to the caller.
// @param srcdir  Source path, passed through unchanged.
155 Compactor::add_source(const string
& srcdir
)
157 internal
->add_source(srcdir
);
// Body of a Compactor member whose signature is not visible in this view:
// it delegates to Internal::compact(), passing this Compactor object.
163 internal
->compact(*this);
// Takes a table name and a status string.  The body is not visible in this
// view, so its behaviour can't be described from here.
167 Compactor::set_status(const string
& table
, const string
& status
)
// Takes a metadata key together with an array of num_tags tag values.  The
// body is not visible in this view.
// NOTE(review): presumably returns the tag value to keep for `key` - confirm
// against the declaration in xapian/compactor.h.
174 Compactor::resolve_duplicate_metadata(const string
& key
,
175 size_t num_tags
, const std::string tags
[])
// Forward declaration of backend_mismatch(), which is defined immediately
// below with the same parameter list.
186 backend_mismatch(const string
&dbpath1
, int backend1
,
187 const string
&dbpath2
, int backend2
)
// Report that two source databases use different backends.
//
// Builds a message starting "All databases must be the same type" which
// includes the backend_names[] entries for backend1 and backend2, then
// throws Xapian::InvalidArgumentError with it.
//
// @param dbpath1, backend1  Path and backend value of the first database.
// @param dbpath2, backend2  Path and backend value of the second database.
190 backend_mismatch(const string
&dbpath1
, int backend1
,
191 const string
&dbpath2
, int backend2
)
193 string msg
= "All databases must be the same type ('";
// backend1/backend2 are used directly as indices into backend_names.
196 msg
+= backend_names
[backend1
];
200 msg
+= backend_names
[backend2
];
202 throw Xapian::InvalidArgumentError(msg
);
// Classify the destination and record the result in compact_to_stub.
// (The statement storing destdir_ into destdir is not visible in this view.)
//
// @param destdir_  Destination path.
208 Compactor::Internal::set_destdir(const string
& destdir_
) {
// Default: the destination is not a stub.
210 compact_to_stub
= STUB_NO
;
// file_exists(destdir) succeeding means the destination is a stub file.
211 if (file_exists(destdir
)) {
213 compact_to_stub
= STUB_FILE
;
// Otherwise, an "XAPIANDB" file inside destdir means a stub directory.
214 } else if (file_exists(destdir
+ "/XAPIANDB")) {
216 compact_to_stub
= STUB_DIR
;
// Register a source database for compaction.
//
// From what is visible here, the function:
//  - rejects a source equal to the destination unless the destination is a
//    stub;
//  - if srcdir is a stub database (a file, or a directory containing a stub
//    file), reads the stub line by line and handles each entry by type;
//  - otherwise identifies the backend from its version file and checks that
//    it matches the backend of any earlier sources;
//  - opens the database, works out the range of docids in use, and appends
//    to offset, used_ranges and sources.
//
// @param srcdir  Path of the source database or stub.
// Throws Xapian::InvalidArgumentError, Xapian::InvalidOperationError or
// Xapian::DatabaseError as shown at the individual throw sites below.
221 Compactor::Internal::add_source(const string
& srcdir
)
223 // Check destdir isn't the same as any source directory, unless it is a stub database.
// compact_to_stub is STUB_NO (0) when the destination is not a stub.
225 if (!compact_to_stub
&& srcdir
== destdir
) {
226 throw Xapian::InvalidArgumentError("destination may not be the same as any source directory, unless it is a stub database");
// Inspect srcdir to see whether it is a stub file or stub directory.
230 if (stat(srcdir
.c_str(), &sb
) == 0) {
231 bool is_stub
= false;
232 string file
= srcdir
;
233 if (S_ISREG(sb
.st_mode
)) {
234 // Stub database file.
236 } else if (S_ISDIR(sb
.st_mode
)) {
// Re-stat `file` (its adjustment for the directory case is not visible in
// this view) and require a regular file.
238 if (stat(file
.c_str(), &sb
) == 0 && S_ISREG(sb
.st_mode
)) {
239 // Stub database directory.
// Read the stub file one line at a time.
244 ifstream
stub(file
.c_str());
246 unsigned int line_no
= 0;
247 while (getline(stub
, line
)) {
// Blank lines and lines starting with '#' are tested for here.
249 if (line
.empty() || line
[0] == '#')
// Split the line at the first space: the type comes before it, the rest
// follows.  With no space, the whole line is the type.
251 string::size_type space
= line
.find(' ');
252 if (space
== string::npos
) space
= line
.size();
254 string
type(line
, 0, space
);
// erase() clamps the count, so space + 1 is safe even when space is
// line.size().
255 line
.erase(0, space
+ 1);
// Local database entries: resolve the path relative to the stub file.
257 if (type
== "auto" || type
== "chert" || type
== "glass") {
258 resolve_relative_path(line
, file
);
// Entry types which can't be compacted.
263 if (type
== "remote" || type
== "inmemory") {
264 string msg
= "Can't compact stub entry of type '";
267 throw Xapian::InvalidOperationError(msg
);
270 if (type
== "flint") {
271 throw Xapian::DatabaseError("Flint backend no longer supported");
// Any other type is rejected as a malformed stub line.
273 throw Xapian::DatabaseError("Bad line in stub file");
// Not a stub: identify the backend by which version file is present.
279 if (file_exists(string(srcdir
) + "/iamchert")) {
280 if (backend
== UNKNOWN
) {
// A different backend was already chosen by an earlier source.
282 } else if (backend
!= CHERT
) {
283 backend_mismatch(first_source
, backend
, srcdir
, CHERT
);
285 } else if (file_exists(string(srcdir
) + "/iamglass")) {
286 if (backend
== UNKNOWN
) {
288 } else if (backend
!= GLASS
) {
289 backend_mismatch(first_source
, backend
, srcdir
, GLASS
);
// Backends which are recognised but no longer supported.
291 } else if (file_exists(string(srcdir
) + "/iamflint")) {
292 throw Xapian::DatabaseError("Flint backend no longer supported");
293 } else if (file_exists(string(srcdir
) + "/iambrass")) {
294 throw Xapian::DatabaseError("Brass backend no longer supported");
// No known version file was found.
297 msg
+= ": not a chert or glass database";
298 throw Xapian::InvalidArgumentError(msg
);
// Remember the first source so later mismatch errors can name it.
301 if (first_source
.empty())
302 first_source
= srcdir
;
// Open the source to find out which docids it uses.
304 Xapian::Database
db(srcdir
);
// (0, 0) is the range compact() treats as an empty database.
305 Xapian::docid first
= 0, last
= 0;
307 // "Empty" databases might have spelling or synonym data so can't
308 // just be completely ignored.
309 Xapian::doccount num_docs
= db
.get_doccount();
// The posting list for the empty term is used to walk the docids in use.
311 Xapian::PostingIterator it
= db
.postlist_begin(string());
312 // This test should never fail, since db.get_doccount() is
314 Assert(it
!= db
.postlist_end(string()));
317 if (renumber
&& first
) {
318 // Prune any unused docids off the start of this source
321 // tot_off could wrap here, but it's unsigned, so that's
323 tot_off
-= (first
- 1);
326 // There may be unused documents at the end of the range.
327 // Binary chop using skip_to to find the last actually used
329 last
= db
.get_lastdocid();
// Lower bound on the last used docid: num_docs documents starting at
// `first` need at least this many ids.
330 Xapian::docid last_lbound
= first
+ num_docs
- 1;
331 while (last_lbound
< last
) {
// Midpoint rounded up, so mid is always greater than last_lbound.
333 mid
= last_lbound
+ (last
- last_lbound
+ 1) / 2;
// Reaching the end means no docid at or after the probed position is in
// use; the iterator is then restarted from the beginning.
335 if (it
== db
.postlist_end(string())) {
337 it
= db
.postlist_begin(string());
// Record the offset to apply to this source's docids.
343 offset
.push_back(tot_off
);
// Track the highest last docid seen across sources (the condition this is
// the else-branch of is not visible in this view).
346 else if (last_docid
< db
.get_lastdocid())
347 last_docid
= db
.get_lastdocid();
348 used_ranges
.push_back(make_pair(first
, last
));
// Sources are stored with a trailing '/'.
350 sources
.push_back(string(srcdir
) + '/');
// Run the compaction over all sources added so far.
//
// From what is visible here, the function:
//  - when not renumbering and there is more than one source, sorts the
//    sources by first used docid and rejects overlapping docid ranges;
//  - when compacting to a stub, creates a uniquely named destination
//    directory; otherwise creates the destination directory if needed;
//  - calls the backend's compaction routine (compact_chert or
//    compact_glass);
//  - creates the version file in the destination;
//  - when compacting to a stub, writes a new stub file pointing at the new
//    database and renames it over the old stub.
//
// @param compactor  Passed on to the backend compaction routine.
// Throws Xapian::InvalidOperationError, Xapian::DatabaseError or
// Xapian::FeatureUnavailableError as shown at the throw sites below.
354 Compactor::Internal::compact(Xapian::Compactor
& compactor
)
357 last_docid
= tot_off
;
359 if (!renumber
&& sources
.size() > 1) {
360 // We want to process the sources in ascending order of first
361 // docid. So we create a vector "order" with ascending integers
362 // and then sort so the indirected order is right. Then we reorder
363 // the vectors into that order and check the ranges are disjoint.
364 vector
<size_t> order
;
365 order
.reserve(sources
.size());
366 for (size_t i
= 0; i
< sources
.size(); ++i
)
369 sort(order
.begin(), order
.end(), CmpByFirstUsed(used_ranges
));
371 // Reorder the vectors to be in ascending order of first docid, and
372 // set all the offsets to 0.
// sources_ starts as sources.size() empty strings, which are then swapped
// with the real entries.
373 vector
<string
> sources_(sources
.size());
374 vector
<pair
<Xapian::docid
, Xapian::docid
> > used_ranges_
;
375 used_ranges_
.reserve(sources
.size());
// Range of the previous non-empty database, for the overlap check.
377 Xapian::docid last_start
= 0, last_end
= 0;
378 for (size_t j
= 0; j
!= order
.size(); ++j
) {
// NOTE(review): n is declared on a line not visible here; presumably it is
// order[j] - confirm.
// Swapping rather than copying leaves an empty string behind in sources[n].
381 swap(sources_
[j
], sources
[n
]);
382 used_ranges_
.push_back(used_ranges
[n
]);
384 const pair
<Xapian::docid
, Xapian::docid
> p
= used_ranges
[n
];
385 // Skip empty databases.
386 if (p
.first
== 0 && p
.second
== 0)
388 // Check for overlap with the previous database's range.
389 if (p
.first
<= last_end
) {
390 string msg
= "when merging databases, --no-renumber is only currently supported if the databases have disjoint ranges of used document ids: ";
// NOTE(review): entries of sources are swapped into sources_ as they are
// processed, so sources[order[j - 1]] looks like it has already been
// replaced by an empty string when this message is built.  It may also not
// be the database that last_start/last_end describe if an empty database
// was skipped in between - confirm.
391 msg
+= sources
[order
[j
- 1]];
392 msg
+= " has range ";
393 msg
+= str(last_start
);
395 msg
+= str(last_end
);
398 msg
+= " has range ";
401 msg
+= str(p
.second
);
402 throw Xapian::InvalidOperationError(msg
);
404 last_start
= p
.first
;
// Install the reordered vectors.
408 swap(sources
, sources_
);
409 swap(used_ranges
, used_ranges_
);
// Compacting to a stub: the new database goes in a fresh directory and the
// stub is updated to point at it afterwards.
413 if (compact_to_stub
) {
415 if (compact_to_stub
== STUB_DIR
) {
416 stub_file
+= "/XAPIANDB";
// NOTE(review): the enclosing loop is not visible here; it appears to try
// successive names built from destdir plus a timestamp until mkdir
// succeeds - confirm.
// sfx remembers the length of destdir before the suffix is appended.
421 size_t sfx
= destdir
.size();
422 time_t now
= time(NULL
);
425 destdir
+= str(now
++);
426 if (mkdir(destdir
.c_str(), 0755) == 0)
// Only EEXIST is tolerated; any other mkdir failure is fatal.
428 if (errno
!= EEXIST
) {
429 string msg
= destdir
;
430 msg
+= ": mkdir failed";
431 throw Xapian::DatabaseError(msg
, errno
);
435 // If the destination database directory doesn't exist, create it.
436 if (mkdir(destdir
.c_str(), 0755) < 0) {
437 // Check why mkdir failed. It's ok if the directory already
438 // exists, but we also get EEXIST if there's an existing file with
440 if (errno
== EEXIST
) {
441 if (dir_exists(destdir
))
444 errno
= EEXIST
; // dir_exists() might have changed it
447 string msg
= destdir
;
448 msg
+= ": cannot create directory";
449 throw Xapian::DatabaseError(msg
, errno
);
// Dispatch to the compaction routine for the sources' backend.
454 if (backend
== CHERT
) {
455 #ifdef XAPIAN_HAS_CHERT_BACKEND
456 compact_chert(compactor
, destdir
.c_str(), sources
, offset
, block_size
,
457 compaction
, multipass
, last_docid
);
// Reached when chert support is not compiled in (the #else/#endif lines are
// not visible in this view).
460 throw Xapian::FeatureUnavailableError("Chert backend disabled at build time");
462 } else if (backend
== GLASS
) {
463 #ifdef XAPIAN_HAS_GLASS_BACKEND
464 compact_glass(compactor
, destdir
.c_str(), sources
, offset
, block_size
,
465 compaction
, multipass
, last_docid
);
468 throw Xapian::FeatureUnavailableError("Glass backend disabled at build time");
472 // Create the version file ("iamchert", etc).
474 // This file contains a UUID, and we want the copy to have a fresh
475 // UUID since its revision counter is reset to 1.
476 if (backend
== CHERT
) {
477 #ifdef XAPIAN_HAS_CHERT_BACKEND
478 ChertVersion(destdir
).create();
// Write a replacement stub under a temporary name, then rename it over the
// existing stub.
485 if (compact_to_stub
) {
486 string new_stub_file
= destdir
;
487 new_stub_file
+= "/new_stub.tmp";
489 ofstream
new_stub(new_stub_file
.c_str());
// NOTE(review): the two definitions of `slash` below are presumably
// alternatives selected by preprocessor conditionals which are not visible
// here; the second one also accepts '\\' as a separator - confirm.
491 size_t slash
= destdir
.find_last_of('/');
493 size_t slash
= destdir
.find_last_of("/\\");
// The stub entry uses only the last path component of destdir.
495 new_stub
<< "auto " << destdir
.substr(slash
+ 1) << '\n';
497 if (posixy_rename(new_stub_file
.c_str(), stub_file
.c_str()) < 0) {
498 // FIXME: try to clean up?
499 string msg
= "Cannot rename '";
500 msg
+= new_stub_file
;
504 throw Xapian::DatabaseError(msg
, errno
);