/** @file
 * @brief Compact a database, or merge and compact several.
 */
/* Copyright (C) 2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2015,2016 Olly Betts
 * Copyright (C) 2008 Lemur Consulting Ltd
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 */
25 #include <xapian/compactor.h>
27 #include "safeerrno.h"
35 #include "safesysstat.h"
36 #include <sys/types.h>
38 #include "safeunistd.h"
39 #include "safefcntl.h"
41 #include "backends/backends.h"
42 #include "backends/database.h"
44 #include "leafpostlist.h"
47 #include "filetests.h"
48 #include "fileutils.h"
50 #include "stringutils.h"
53 #ifdef XAPIAN_HAS_GLASS_BACKEND
54 #include "backends/glass/glass_database.h"
55 #include "backends/glass/glass_version.h"
57 #ifdef XAPIAN_HAS_CHERT_BACKEND
58 #include "backends/chert/chert_database.h"
59 #include "backends/chert/chert_version.h"
62 #include <xapian/constants.h>
63 #include <xapian/database.h>
64 #include <xapian/error.h>
68 class CmpByFirstUsed
{
69 const vector
<pair
<Xapian::docid
, Xapian::docid
> > & used_ranges
;
72 CmpByFirstUsed(const vector
<pair
<Xapian::docid
, Xapian::docid
> > & ur
)
75 bool operator()(size_t a
, size_t b
) {
76 return used_ranges
[a
].first
< used_ranges
[b
].first
;
82 class Compactor::Internal
: public Xapian::Internal::intrusive_base
{
83 friend class Compactor
;
85 string destdir_compat
;
89 vector
<string
> srcdirs_compat
;
92 Internal() : block_size(8192), flags(FULL
) { }
95 Compactor::Compactor() : internal(new Compactor::Internal()) { }
97 Compactor::~Compactor() { }
100 Compactor::set_block_size(size_t block_size
)
102 internal
->block_size
= block_size
;
106 Compactor::set_flags_(unsigned flags
, unsigned mask
)
108 internal
->flags
= (internal
->flags
& mask
) | flags
;
112 Compactor::set_destdir(const string
& destdir
)
114 internal
->destdir_compat
= destdir
;
118 Compactor::add_source(const string
& srcdir
)
120 internal
->srcdirs_compat
.push_back(srcdir
);
126 Xapian::Database src
;
127 for (auto srcdir
: internal
->srcdirs_compat
) {
128 src
.add_database(Xapian::Database(srcdir
));
130 src
.compact(internal
->destdir_compat
, internal
->flags
,
131 internal
->block_size
, *this);
135 Compactor::set_status(const string
& table
, const string
& status
)
142 Compactor::resolve_duplicate_metadata(const string
& key
,
143 size_t num_tags
, const std::string tags
[])
154 backend_mismatch(const Xapian::Database
& db
, int backend1
,
155 const string
&dbpath2
, int backend2
)
158 backend_mismatch(const Xapian::Database
& db
, int backend1
,
159 const string
&dbpath2
, int backend2
)
162 db
.internal
[0]->get_backend_info(&dbpath1
);
163 string msg
= "All databases must be the same type ('";
166 msg
+= backend_name(backend1
);
170 msg
+= backend_name(backend2
);
172 throw Xapian::InvalidArgumentError(msg
);
178 Database::compact_(const string
* output_ptr
, int fd
, unsigned flags
,
180 Xapian::Compactor
* compactor
) const
182 LOGCALL_VOID(API
, "Database::compact_", output_ptr
| fd
| flags
| block_size
| compactor
);
184 bool renumber
= !(flags
& DBCOMPACT_NO_RENUMBER
);
186 enum { STUB_NO
, STUB_FILE
, STUB_DIR
} compact_to_stub
= STUB_NO
;
189 // We need a modifiable destdir in this function.
190 destdir
= *output_ptr
;
191 if (!(flags
& DBCOMPACT_SINGLE_FILE
)) {
192 if (file_exists(destdir
)) {
194 compact_to_stub
= STUB_FILE
;
195 } else if (file_exists(destdir
+ "/XAPIANDB")) {
197 compact_to_stub
= STUB_DIR
;
201 // Single file is implied when writing to a file descriptor.
202 flags
|= DBCOMPACT_SINGLE_FILE
;
205 int backend
= BACKEND_UNKNOWN
;
206 for (const auto& it
: internal
) {
208 int type
= it
->get_backend_info(&srcdir
);
209 // Check destdir isn't the same as any source directory, unless it
210 // is a stub database.
211 if (!compact_to_stub
&& srcdir
== destdir
)
212 throw Xapian::InvalidArgumentError("destination may not be the same as any source database, unless it is a stub database");
216 if (backend
!= type
&& backend
!= BACKEND_UNKNOWN
) {
217 backend_mismatch(*this, backend
, srcdir
, type
);
222 throw Xapian::DatabaseError("Only chert and glass databases can be compacted");
226 Xapian::docid tot_off
= 0;
227 Xapian::docid last_docid
= 0;
229 vector
<Xapian::docid
> offset
;
230 vector
<pair
<Xapian::docid
, Xapian::docid
> > used_ranges
;
231 vector
<Xapian::Database::Internal
*> internals
;
232 offset
.reserve(internal
.size());
233 used_ranges
.reserve(internal
.size());
234 internals
.reserve(internal
.size());
236 for (const auto& i
: internal
) {
237 Xapian::Database::Internal
* db
= i
.get();
238 internals
.push_back(db
);
240 Xapian::docid first
= 0, last
= 0;
242 // "Empty" databases might have spelling or synonym data so can't
243 // just be completely ignored.
244 Xapian::doccount num_docs
= db
->get_doccount();
246 db
->get_used_docid_range(first
, last
);
248 if (renumber
&& first
) {
249 // Prune any unused docids off the start of this source
252 // tot_off could wrap here, but it's unsigned, so that's
254 tot_off
-= (first
- 1);
257 #ifdef XAPIAN_ASSERTIONS
258 LeafPostList
* pl
= db
->open_post_list(string());
260 // This test should never fail, since db->get_doccount() is
262 Assert(!pl
->at_end());
263 AssertEq(pl
->get_docid(), first
);
264 AssertRel(last
,>=,first
);
266 Assert(!pl
->at_end());
267 AssertEq(pl
->get_docid(), last
);
269 Assert(pl
->at_end());
274 offset
.push_back(tot_off
);
277 else if (last_docid
< db
->get_lastdocid())
278 last_docid
= db
->get_lastdocid();
279 used_ranges
.push_back(make_pair(first
, last
));
283 last_docid
= tot_off
;
285 if (!renumber
&& internal
.size() > 1) {
286 // We want to process the sources in ascending order of first
287 // docid. So we create a vector "order" with ascending integers
288 // and then sort so the indirected order is right. Then we reorder
289 // the vectors into that order and check the ranges are disjoint.
290 vector
<size_t> order
;
291 order
.reserve(internal
.size());
292 for (size_t i
= 0; i
< internal
.size(); ++i
)
295 sort(order
.begin(), order
.end(), CmpByFirstUsed(used_ranges
));
297 // Reorder the vectors to be in ascending of first docid, and
298 // set all the offsets to 0.
299 vector
<Xapian::Database::Internal
*> internals_
;
300 internals_
.reserve(internal
.size());
301 vector
<pair
<Xapian::docid
, Xapian::docid
> > used_ranges_
;
302 used_ranges_
.reserve(internal
.size());
304 Xapian::docid last_start
= 0, last_end
= 0;
305 for (size_t j
= 0; j
!= order
.size(); ++j
) {
308 internals_
.push_back(internals
[n
]);
309 used_ranges_
.push_back(used_ranges
[n
]);
311 const pair
<Xapian::docid
, Xapian::docid
> p
= used_ranges
[n
];
312 // Skip empty databases.
313 if (p
.first
== 0 && p
.second
== 0)
315 // Check for overlap with the previous database's range.
316 if (p
.first
<= last_end
) {
318 string msg
= "when merging databases, --no-renumber is only currently supported if the databases have disjoint ranges of used document ids: ";
319 internals_
[j
- 1]->get_backend_info(&tmp
);
321 msg
+= " has range ";
322 msg
+= str(last_start
);
324 msg
+= str(last_end
);
326 internals_
[j
]->get_backend_info(&tmp
);
328 msg
+= " has range ";
331 msg
+= str(p
.second
);
332 throw Xapian::InvalidOperationError(msg
);
334 last_start
= p
.first
;
338 swap(internals
, internals_
);
339 swap(used_ranges
, used_ranges_
);
343 if (compact_to_stub
) {
345 if (compact_to_stub
== STUB_DIR
) {
346 stub_file
+= "/XAPIANDB";
351 size_t sfx
= destdir
.size();
352 time_t now
= time(NULL
);
355 destdir
+= str(now
++);
356 if (mkdir(destdir
.c_str(), 0755) == 0)
358 if (errno
!= EEXIST
) {
359 string msg
= destdir
;
360 msg
+= ": mkdir failed";
361 throw Xapian::DatabaseError(msg
, errno
);
364 } else if (!(flags
& Xapian::DBCOMPACT_SINGLE_FILE
)) {
365 // If the destination database directory doesn't exist, create it.
366 if (mkdir(destdir
.c_str(), 0755) < 0) {
367 // Check why mkdir failed. It's ok if the directory already
368 // exists, but we also get EEXIST if there's an existing file with
370 if (errno
== EEXIST
) {
371 if (dir_exists(destdir
))
374 errno
= EEXIST
; // dir_exists() might have changed it
377 string msg
= destdir
;
378 msg
+= ": cannot create directory";
379 throw Xapian::DatabaseError(msg
, errno
);
384 #if defined XAPIAN_HAS_CHERT_BACKEND || defined XAPIAN_HAS_GLASS_BACKEND
385 Xapian::Compactor::compaction_level compaction
=
386 static_cast<Xapian::Compactor::compaction_level
>(flags
& (Xapian::Compactor::STANDARD
|Xapian::Compactor::FULL
|Xapian::Compactor::FULLER
));
392 if (backend
== BACKEND_CHERT
) {
393 #ifdef XAPIAN_HAS_CHERT_BACKEND
394 ChertDatabase::compact(compactor
, destdir
.c_str(), internals
, offset
,
395 block_size
, compaction
, flags
, last_docid
);
397 // Create the version file ("iamchert").
399 // This file contains a UUID, and we want the copy to have a fresh
400 // UUID since its revision counter is reset to 1.
401 ChertVersion(destdir
).create();
404 throw Xapian::FeatureUnavailableError("Chert backend disabled at build time");
406 } else if (backend
== BACKEND_GLASS
) {
407 #ifdef XAPIAN_HAS_GLASS_BACKEND
409 GlassDatabase::compact(compactor
, destdir
.c_str(), 0,
411 block_size
, compaction
, flags
, last_docid
);
413 GlassDatabase::compact(compactor
, NULL
, fd
,
415 block_size
, compaction
, flags
, last_docid
);
420 throw Xapian::FeatureUnavailableError("Glass backend disabled at build time");
424 if (compact_to_stub
) {
425 string new_stub_file
= destdir
;
426 new_stub_file
+= "/new_stub.tmp";
428 ofstream
new_stub(new_stub_file
.c_str());
430 size_t slash
= destdir
.find_last_of('/');
432 size_t slash
= destdir
.find_last_of("/\\");
434 new_stub
<< "auto " << destdir
.substr(slash
+ 1) << '\n';
436 if (!io_tmp_rename(new_stub_file
, stub_file
)) {
437 string msg
= "Cannot rename '";
438 msg
+= new_stub_file
;
442 throw Xapian::DatabaseError(msg
, errno
);