2 * @brief Compact a database, or merge and compact several.
4 /* Copyright (C) 2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2015,2016,2017,2018 Olly Betts
5 * Copyright (C) 2008 Lemur Consulting Ltd
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License as
9 * published by the Free Software Foundation; either version 2 of the
10 * License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
25 #include <xapian/compactor.h>
27 #include "safeerrno.h"
35 #include "safesysstat.h"
36 #include <sys/types.h>
38 #include "safeunistd.h"
39 #include "safefcntl.h"
41 #include "backends/backends.h"
42 #include "backends/databaseinternal.h"
44 #include "leafpostlist.h"
46 #include "filetests.h"
47 #include "fileutils.h"
49 #include "stringutils.h"
52 #ifdef XAPIAN_HAS_GLASS_BACKEND
53 #include "backends/glass/glass_database.h"
54 #include "backends/glass/glass_version.h"
57 #ifdef XAPIAN_HAS_HONEY_BACKEND
58 #include "backends/honey/honey_database.h"
59 #include "backends/honey/honey_version.h"
62 #include "backends/multi/multi_database.h"
64 #include <xapian/constants.h>
65 #include <xapian/database.h>
66 #include <xapian/error.h>
70 class CmpByFirstUsed
{
71 const vector
<pair
<Xapian::docid
, Xapian::docid
> > & used_ranges
;
75 CmpByFirstUsed(const vector
<pair
<Xapian::docid
, Xapian::docid
> > & ur
)
78 bool operator()(size_t a
, size_t b
) const {
79 return used_ranges
[a
].first
< used_ranges
[b
].first
;
85 Compactor::~Compactor() { }
88 Compactor::set_status(const string
& table
, const string
& status
)
95 Compactor::resolve_duplicate_metadata(const string
& key
,
96 size_t num_tags
, const std::string tags
[])
107 backend_mismatch(const Xapian::Database::Internal
* db
, int backend1
,
108 const string
&dbpath2
, int backend2
)
111 db
->get_backend_info(&dbpath1
);
112 string msg
= "All databases must be the same type ('";
115 msg
+= backend_name(backend1
);
119 msg
+= backend_name(backend2
);
121 throw Xapian::InvalidArgumentError(msg
);
127 Database::compact_(const string
* output_ptr
, int fd
, unsigned flags
,
129 Xapian::Compactor
* compactor
) const
131 LOGCALL_VOID(API
, "Database::compact_", output_ptr
| fd
| flags
| block_size
| compactor
);
133 bool renumber
= !(flags
& DBCOMPACT_NO_RENUMBER
);
135 enum { STUB_NO
, STUB_FILE
, STUB_DIR
} compact_to_stub
= STUB_NO
;
138 // We need a modifiable destdir in this function.
139 destdir
= *output_ptr
;
140 if (!(flags
& DBCOMPACT_SINGLE_FILE
)) {
141 if (file_exists(destdir
)) {
143 compact_to_stub
= STUB_FILE
;
144 } else if (file_exists(destdir
+ "/XAPIANDB")) {
146 compact_to_stub
= STUB_DIR
;
150 // Single file is implied when writing to a file descriptor.
151 flags
|= DBCOMPACT_SINGLE_FILE
;
154 auto n_shards
= internal
->size();
155 Xapian::docid tot_off
= 0;
156 Xapian::docid last_docid
= 0;
158 vector
<Xapian::docid
> offset
;
159 vector
<pair
<Xapian::docid
, Xapian::docid
> > used_ranges
;
160 vector
<const Xapian::Database::Internal
*> internals
;
161 offset
.reserve(n_shards
);
162 used_ranges
.reserve(n_shards
);
163 internals
.reserve(n_shards
);
166 auto multi_db
= static_cast<MultiDatabase
*>(internal
.get());
167 for (auto&& db
: multi_db
->shards
) {
168 internals
.push_back(db
);
171 internals
.push_back(internal
.get());
174 int backend
= BACKEND_UNKNOWN
;
175 for (auto&& shard
: internals
) {
177 int type
= shard
->get_backend_info(&srcdir
);
178 // Check destdir isn't the same as any source directory, unless it
179 // is a stub database or we're compacting to an fd.
180 if (!compact_to_stub
&& !destdir
.empty() && srcdir
== destdir
) {
181 throw InvalidArgumentError("destination may not be the same as "
182 "any source database, unless it is a "
187 if (backend
!= type
&& backend
!= BACKEND_UNKNOWN
) {
188 backend_mismatch(internals
[0], backend
, srcdir
, type
);
193 if (backend
!= type
&& backend
!= BACKEND_UNKNOWN
) {
194 backend_mismatch(internals
[0], backend
, srcdir
, type
);
199 throw DatabaseError("Only glass and honey databases can be "
203 Xapian::docid first
= 0, last
= 0;
205 // "Empty" databases might have spelling or synonym data so can't
206 // just be completely ignored.
207 Xapian::doccount num_docs
= shard
->get_doccount();
209 shard
->get_used_docid_range(first
, last
);
211 if (renumber
&& first
) {
212 // Prune any unused docids off the start of this source
215 // tot_off could wrap here, but it's unsigned, so that's
217 tot_off
-= (first
- 1);
220 #ifdef XAPIAN_ASSERTIONS
221 PostList
* pl
= shard
->open_post_list(string());
223 // This test should never fail, since shard->get_doccount() is
225 Assert(!pl
->at_end());
226 AssertEq(pl
->get_docid(), first
);
227 AssertRel(last
,>=,first
);
229 Assert(!pl
->at_end());
230 AssertEq(pl
->get_docid(), last
);
232 Assert(pl
->at_end());
237 offset
.push_back(tot_off
);
240 else if (last_docid
< shard
->get_lastdocid())
241 last_docid
= shard
->get_lastdocid();
242 used_ranges
.push_back(make_pair(first
, last
));
246 last_docid
= tot_off
;
248 if (!renumber
&& n_shards
> 1) {
249 // We want to process the sources in ascending order of first
250 // docid. So we create a vector "order" with ascending integers
251 // and then sort so the indirected order is right.
252 vector
<size_t> order
;
253 order
.reserve(n_shards
);
254 for (size_t i
= 0; i
< n_shards
; ++i
)
257 sort(order
.begin(), order
.end(), CmpByFirstUsed(used_ranges
));
259 // Now use order to reorder internals to be in ascending order by first
260 // docid, and while we're at it check the ranges are disjoint.
261 vector
<const Xapian::Database::Internal
*> internals_
;
262 internals_
.reserve(n_shards
);
263 vector
<pair
<Xapian::docid
, Xapian::docid
> > used_ranges_
;
264 used_ranges_
.reserve(n_shards
);
266 Xapian::docid last_start
= 0, last_end
= 0;
267 for (size_t j
= 0; j
!= order
.size(); ++j
) {
270 internals_
.push_back(internals
[n
]);
271 used_ranges_
.push_back(used_ranges
[n
]);
273 const pair
<Xapian::docid
, Xapian::docid
> p
= used_ranges
[n
];
274 // Skip empty databases.
275 if (p
.first
== 0 && p
.second
== 0)
277 // Check for overlap with the previous database's range.
278 if (p
.first
<= last_end
) {
280 string msg
= "when merging databases, --no-renumber is only currently supported if the databases have disjoint ranges of used document ids: ";
281 internals_
[j
- 1]->get_backend_info(&tmp
);
283 msg
+= " has range ";
284 msg
+= str(last_start
);
286 msg
+= str(last_end
);
288 internals_
[j
]->get_backend_info(&tmp
);
290 msg
+= " has range ";
293 msg
+= str(p
.second
);
294 throw Xapian::InvalidOperationError(msg
);
296 last_start
= p
.first
;
300 swap(internals
, internals_
);
301 swap(used_ranges
, used_ranges_
);
305 if (compact_to_stub
) {
307 if (compact_to_stub
== STUB_DIR
) {
308 stub_file
+= "/XAPIANDB";
313 size_t sfx
= destdir
.size();
314 time_t now
= time(NULL
);
317 destdir
+= str(now
++);
318 if (mkdir(destdir
.c_str(), 0755) == 0)
320 if (errno
!= EEXIST
) {
321 string msg
= destdir
;
322 msg
+= ": mkdir failed";
323 throw Xapian::DatabaseError(msg
, errno
);
326 } else if (!(flags
& Xapian::DBCOMPACT_SINGLE_FILE
)) {
327 // If the destination database directory doesn't exist, create it.
328 if (mkdir(destdir
.c_str(), 0755) < 0) {
329 // Check why mkdir failed. It's ok if the directory already
330 // exists, but we also get EEXIST if there's an existing file with
332 if (errno
== EEXIST
) {
333 if (dir_exists(destdir
))
336 errno
= EEXIST
; // dir_exists() might have changed it
339 string msg
= destdir
;
340 msg
+= ": cannot create directory";
341 throw Xapian::DatabaseError(msg
, errno
);
346 #if defined XAPIAN_HAS_GLASS_BACKEND
347 Xapian::Compactor::compaction_level compaction
=
348 static_cast<Xapian::Compactor::compaction_level
>(flags
& (Xapian::Compactor::STANDARD
|Xapian::Compactor::FULL
|Xapian::Compactor::FULLER
));
354 auto output_backend
= flags
& Xapian::DB_BACKEND_MASK_
;
355 if (backend
== BACKEND_GLASS
) {
356 switch (output_backend
) {
358 case Xapian::DB_BACKEND_GLASS
:
359 #ifdef XAPIAN_HAS_GLASS_BACKEND
361 GlassDatabase::compact(compactor
, destdir
.c_str(), 0,
363 block_size
, compaction
, flags
,
366 GlassDatabase::compact(compactor
, NULL
, fd
,
368 block_size
, compaction
, flags
,
375 throw Xapian::FeatureUnavailableError("Glass backend disabled "
378 case Xapian::DB_BACKEND_HONEY
:
379 #ifdef XAPIAN_HAS_HONEY_BACKEND
381 HoneyDatabase::compact(compactor
, destdir
.c_str(), 0,
382 Xapian::DB_BACKEND_GLASS
,
384 block_size
, compaction
, flags
,
387 HoneyDatabase::compact(compactor
, NULL
, fd
,
388 Xapian::DB_BACKEND_GLASS
,
390 block_size
, compaction
, flags
,
397 throw Xapian::FeatureUnavailableError("Honey backend disabled "
401 throw Xapian::UnimplementedError("Glass can only be "
402 "compacted to itself or "
405 } else if (backend
== BACKEND_HONEY
) {
406 switch (output_backend
) {
408 case Xapian::DB_BACKEND_HONEY
:
409 #ifdef XAPIAN_HAS_HONEY_BACKEND
411 HoneyDatabase::compact(compactor
, destdir
.c_str(), 0,
412 Xapian::DB_BACKEND_HONEY
,
414 block_size
, compaction
, flags
,
417 HoneyDatabase::compact(compactor
, NULL
, fd
,
418 Xapian::DB_BACKEND_HONEY
,
420 block_size
, compaction
, flags
,
427 throw Xapian::FeatureUnavailableError("Honey backend disabled "
431 throw Xapian::UnimplementedError("Honey can only be "
432 "compacted to itself");
436 if (compact_to_stub
) {
437 string new_stub_file
= destdir
;
438 new_stub_file
+= "/new_stub.tmp";
440 ofstream
new_stub(new_stub_file
.c_str());
441 size_t slash
= destdir
.find_last_of(DIR_SEPS
);
442 new_stub
<< "auto " << destdir
.substr(slash
+ 1) << '\n';
444 if (!io_tmp_rename(new_stub_file
, stub_file
)) {
445 string msg
= "Cannot rename '";
446 msg
+= new_stub_file
;
450 throw Xapian::DatabaseError(msg
, errno
);