1 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. See the AUTHORS file for names of contributors.
5 // We recover the contents of the descriptor from the other files we find.
6 // (1) Any log files are first converted to tables
7 // (2) We scan every table to compute
8 // (a) smallest/largest for the table
9 // (b) largest sequence number in the table
10 // (3) We generate descriptor contents:
11 // - log number is set to zero
12 // - next-file-number is set to 1 + largest file number we found
13 // - last-sequence-number is set to largest sequence# found across
14 // all tables (see 2b)
15 // - compaction pointers are cleared
16 // - every table file is added at level 0
18 // Possible optimization 1:
19 // (a) Compute total size and use to pick appropriate max-level M
20 // (b) Sort tables by largest sequence# in the table
21 // (c) For each table: if it overlaps earlier table, place in level-0,
22 // else place in level-M.
23 // Possible optimization 2:
24 // Store per-table metadata (smallest, largest, largest-seq#, ...)
25 // in the table's meta section to speed up ScanTable.
27 #include "db/builder.h"
28 #include "db/db_impl.h"
29 #include "db/dbformat.h"
30 #include "db/filename.h"
31 #include "db/log_reader.h"
32 #include "db/log_writer.h"
33 #include "db/memtable.h"
34 #include "db/table_cache.h"
35 #include "db/version_edit.h"
36 #include "db/write_batch_internal.h"
37 #include "leveldb/comparator.h"
38 #include "leveldb/db.h"
39 #include "leveldb/env.h"
// Constructs a Repairer for the database named `dbname` using the caller's
// `options`.
//
// The visible initializers wrap the caller's comparator and filter policy
// (icmp_, ipolicy_), derive options_ through SanitizeOptions(), and start
// next_file_number_ at 1.
// NOTE(review): the body reads dbname_, whose initializer is not visible in
// this excerpt -- confirm it is initialized before use.
47 Repairer(const std::string
& dbname
, const Options
& options
)
50 icmp_(options
.comparator
),
51 ipolicy_(options
.filter_policy
),
// options_ is built from pointers to icmp_ and ipolicy_, so it depends on
// those two members being initialized first.
52 options_(SanitizeOptions(dbname
, &icmp_
, &ipolicy_
, options
)),
// True when SanitizeOptions() returned an info_log different from the
// caller's; presumably gates the delete of that object -- TODO confirm.
53 owns_info_log_(options_
.info_log
!= options
.info_log
),
// Same comparison for the block cache.
54 owns_cache_(options_
.block_cache
!= options
.block_cache
),
55 next_file_number_(1) {
56 // TableCache can be small since we expect each table to be opened once.
// The cache is heap-allocated here with a third argument of 10.
57 table_cache_
= new TableCache(dbname_
, &options_
, 10);
// Cleanup fragment: frees the info log and block cache held in options_.
// NOTE(review): the enclosing function header and any conditions around these
// deletes are not visible in this excerpt. Presumably this is the destructor
// and the deletes are guarded by owns_info_log_ / owns_cache_ -- confirm.
63 delete options_
.info_log
;
66 delete options_
.block_cache
;
// Repair driver fragment. The function header is not visible in this excerpt;
// RepairDB() calls a method named Run() on a Repairer, which is presumably
// this one -- TODO confirm.
//
// Visible steps, in order: FindFiles(), ConvertLogFilesToTables(),
// WriteDescriptor(), then a summary log line. Any status checks between the
// steps are not visible here.
71 Status status
= FindFiles();
73 ConvertLogFilesToTables();
75 status
= WriteDescriptor();
// Total the file sizes of every entry in tables_ for the summary message.
78 unsigned long long bytes
= 0;
79 for (size_t i
= 0; i
< tables_
.size(); i
++) {
80 bytes
+= tables_
[i
].meta
.file_size
;
// Summary line written to the info log.
82 Log(options_
.info_log
,
83 "**** Repaired leveldb %s; "
84 "recovered %d files; %llu bytes. "
85 "Some data may have been lost. "
// The table count is narrowed to int to match the %d conversion above.
88 static_cast<int>(tables_
.size()),
// Field of the per-table record stored in tables_ (accessed elsewhere in this
// file as t.max_sequence / tables_[i].max_sequence). The enclosing struct
// header is not visible in this excerpt.
97 SequenceNumber max_sequence
;
// Database name; passed to the *FileName() helpers to build file paths.
100 std::string
const dbname_
;
// Comparator built from options.comparator in the constructor.
102 InternalKeyComparator
const icmp_
;
// Filter policy built from options.filter_policy in the constructor.
103 InternalFilterPolicy
const ipolicy_
;
// Result of SanitizeOptions() applied to the caller's options.
104 Options
const options_
;
// Allocated with `new` in the constructor.
107 TableCache
* table_cache_
;
// File names (not numbers) of descriptor files found by the directory scan.
110 std::vector
<std::string
> manifests_
;
// Numbers of table files found by the directory scan, plus the numbers of
// tables produced by ConvertLogToTable().
111 std::vector
<uint64_t> table_numbers_
;
// Numbers of log files found by the directory scan.
112 std::vector
<uint64_t> logs_
;
// Tables accepted for the new descriptor (filled by ScanTable / RepairTable,
// consumed by WriteDescriptor).
113 std::vector
<TableInfo
> tables_
;
// Starts at 1, is raised above every file number parsed during the directory
// scan, and is post-incremented whenever a new file number is needed.
114 uint64_t next_file_number_
;
// Directory scan fragment. The function header is not visible in this
// excerpt; presumably this is FindFiles(), the first step of the repair
// driver -- TODO confirm.
//
// Lists the children of dbname_, classifies each parsable file name, and
// records manifests, log numbers and table numbers in the matching members.
117 std::vector
<std::string
> filenames
;
118 Status status
= env_
->GetChildren(dbname_
, &filenames
);
// An empty directory listing is reported as an I/O error.
122 if (filenames
.empty()) {
123 return Status::IOError(dbname_
, "repair found no files");
// `number` and `type` are outputs of ParseFileName(); their declarations are
// not visible in this excerpt.
128 for (size_t i
= 0; i
< filenames
.size(); i
++) {
129 if (ParseFileName(filenames
[i
], &number
, &type
)) {
// Descriptor files are remembered by name so they can be archived later.
130 if (type
== kDescriptorFile
) {
131 manifests_
.push_back(filenames
[i
]);
// Keep next_file_number_ strictly greater than every file number seen.
133 if (number
+ 1 > next_file_number_
) {
134 next_file_number_
= number
+ 1;
136 if (type
== kLogFile
) {
137 logs_
.push_back(number
);
138 } else if (type
== kTableFile
) {
139 table_numbers_
.push_back(number
);
141 // Ignore other files
// Converts every log file recorded in logs_ into a table via
// ConvertLogToTable(), then archives the log file.
//
// A conversion failure is only logged ("ignoring conversion error"); it does
// not stop the loop.
// NOTE(review): the condition guarding the Log() call is not visible in this
// excerpt -- presumably it fires only when `status` is not ok; confirm.
149 void ConvertLogFilesToTables() {
150 for (size_t i
= 0; i
< logs_
.size(); i
++) {
// The path is computed before conversion and used for archiving afterwards.
151 std::string logname
= LogFileName(dbname_
, logs_
[i
]);
152 Status status
= ConvertLogToTable(logs_
[i
]);
154 Log(options_
.info_log
, "Log #%llu: ignoring conversion error: %s",
155 (unsigned long long) logs_
[i
],
156 status
.ToString().c_str());
// The log file is moved out of the way via ArchiveFile().
158 ArchiveFile(logname
);
// Replays log file number `log` into a fresh MemTable and writes that
// memtable out as a new table file via BuildTable().
//
// Visible behavior: corruption is reported through LogReporter and logged,
// failed batch inserts are logged and replaced by Status::OK() so reading
// continues, and the new table's number is appended to table_numbers_ when
// its file_size is non-zero.
// NOTE(review): several declarations (record, scratch, batch, counter, meta),
// the early-return / cleanup paths and the return statement are not visible
// in this excerpt.
162 Status
ConvertLogToTable(uint64_t log
) {
// Reporter handed to log::Reader; it forwards corruption reports to the
// info log.
163 struct LogReporter
: public log::Reader::Reporter
{
167 virtual void Corruption(size_t bytes
, const Status
& s
) {
168 // We print error messages for corruption, but continue repairing.
// `bytes` is narrowed to int to match the %d conversion.
169 Log(info_log
, "Log #%llu: dropping %d bytes; %s",
170 (unsigned long long) lognum
,
171 static_cast<int>(bytes
),
172 s
.ToString().c_str());
// Open the log file for sequential reading.
177 std::string logname
= LogFileName(dbname_
, log
);
178 SequentialFile
* lfile
;
179 Status status
= env_
->NewSequentialFile(logname
, &lfile
);
184 // Create the log reader.
185 LogReporter reporter
;
187 reporter
.info_log
= options_
.info_log
;
188 reporter
.lognum
= log
;
189 // We intentionally make log::Reader do checksumming so that
190 // corruptions cause entire commits to be skipped instead of
191 // propagating bad information (like overly large sequence
// NOTE(review): the comment above says checksumming is intended, but the
// third argument passed to log::Reader below is `false` (annotated
// "do not checksum"). Comment and code disagree -- confirm which is intended.
193 log::Reader
reader(lfile
, &reporter
, false/*do not checksum*/,
194 0/*initial_offset*/);
196 // Read all the records and add to a memtable
200 MemTable
* mem
= new MemTable(icmp_
);
203 while (reader
.ReadRecord(&record
, &scratch
)) {
// Records shorter than 12 bytes are reported as corruption (presumably the
// minimum write-batch header size -- TODO confirm).
204 if (record
.size() < 12) {
206 record
.size(), Status::Corruption("log record too small"));
// Load the record into `batch` and apply it to the memtable.
209 WriteBatchInternal::SetContents(&batch
, record
);
210 status
= WriteBatchInternal::InsertInto(&batch
, mem
);
// `counter` accumulates the number of operations in applied batches; it is
// reported in the final log line.
212 counter
+= WriteBatchInternal::Count(&batch
);
214 Log(options_
.info_log
, "Log #%llu: ignoring %s",
215 (unsigned long long) log
,
216 status
.ToString().c_str());
217 status
= Status::OK(); // Keep going with rest of file
222 // Do not record a version edit for this conversion to a Table
223 // since ExtractMetaData() will also generate edits.
// Allocate a fresh file number for the output table.
225 meta
.number
= next_file_number_
++;
226 Iterator
* iter
= mem
->NewIterator();
227 status
= BuildTable(dbname_
, env_
, options_
, table_cache_
, iter
, &meta
);
// Only tables with a non-zero file size are queued for ScanTable().
232 if (meta
.file_size
> 0) {
233 table_numbers_
.push_back(meta
.number
);
236 Log(options_
.info_log
, "Log #%llu: %d ops saved to Table #%llu %s",
237 (unsigned long long) log
,
239 (unsigned long long) meta
.number
,
240 status
.ToString().c_str());
244 void ExtractMetaData() {
245 for (size_t i
= 0; i
< table_numbers_
.size(); i
++) {
246 ScanTable(table_numbers_
[i
]);
// Returns an iterator over the table described by `meta`, obtained from
// table_cache_ using the table's number and file size.
//
// Checksum verification follows options_.paranoid_checks.
// NOTE(review): the declaration of `r` is not visible in this excerpt;
// presumably it is a read-options object -- confirm its type and defaults.
250 Iterator
* NewTableIterator(const FileMetaData
& meta
) {
251 // Same as compaction iterators: if paranoid_checks are on, turn
252 // on checksum verification.
254 r
.verify_checksums
= options_
.paranoid_checks
;
255 return table_cache_
->NewIterator(r
, meta
.number
, meta
.file_size
);
// Scans table file `number` to collect its metadata (file size, smallest and
// largest key, largest sequence number).
//
// Visible outcomes: the table is either appended to tables_, handed to
// RepairTable(), or -- when its size cannot be read -- archived and dropped.
// NOTE(review): the declarations of `t`, the entry counter and the
// first-key flag, plus most of the if/else structure, are not visible in this
// excerpt; the comments below describe only what the visible statements do.
258 void ScanTable(uint64_t number
) {
260 t
.meta
.number
= number
;
// First try the name produced by TableFileName().
261 std::string fname
= TableFileName(dbname_
, number
);
262 Status status
= env_
->GetFileSize(fname
, &t
.meta
.file_size
);
264 // Try alternate file name.
// `fname` is overwritten, so later uses refer to the alternate name.
265 fname
= SSTTableFileName(dbname_
, number
);
266 Status s2
= env_
->GetFileSize(fname
, &t
.meta
.file_size
);
// Presumably executed only when s2 is ok -- TODO confirm.
268 status
= Status::OK();
// Drop path: both candidate names are archived and the failure is logged.
272 ArchiveFile(TableFileName(dbname_
, number
));
273 ArchiveFile(SSTTableFileName(dbname_
, number
));
274 Log(options_
.info_log
, "Table #%llu: dropped: %s",
275 (unsigned long long) t
.meta
.number
,
276 status
.ToString().c_str());
280 // Extract metadata by scanning through table.
282 Iterator
* iter
= NewTableIterator(t
.meta
);
284 ParsedInternalKey parsed
;
286 for (iter
->SeekToFirst(); iter
->Valid(); iter
->Next()) {
287 Slice key
= iter
->key();
// Keys that fail to parse are logged.
288 if (!ParseInternalKey(key
, &parsed
)) {
289 Log(options_
.info_log
, "Table #%llu: unparsable key %s",
290 (unsigned long long) t
.meta
.number
,
291 EscapeString(key
).c_str());
// Presumably `smallest` is set only for the first parsable key (the guard is
// not visible); `largest` is overwritten with the current key.
298 t
.meta
.smallest
.DecodeFrom(key
);
300 t
.meta
.largest
.DecodeFrom(key
);
// Track the largest sequence number seen in this table.
301 if (parsed
.sequence
> t
.max_sequence
) {
302 t
.max_sequence
= parsed
.sequence
;
// An iterator error replaces the status computed so far.
305 if (!iter
->status().ok()) {
306 status
= iter
->status();
309 Log(options_
.info_log
, "Table #%llu: %d entries %s",
310 (unsigned long long) t
.meta
.number
,
312 status
.ToString().c_str());
// Presumably: keep the table when the scan succeeded, otherwise try to
// repair it (the if/else around these two statements is not visible).
315 tables_
.push_back(t
);
317 RepairTable(fname
, t
); // RepairTable archives input file.
// Rebuilds the table in file `src` by copying every entry its iterator yields
// into a new table file, then renames the new file over the table's name.
//
// `t` is taken by value; the copy pushed onto tables_ has its file_size
// updated from the builder, and no other field of `t` is reassigned in the
// visible code.
// NOTE(review): the declarations of `file` and `counter`, the early return,
// the archiving of `src` and the cleanup of builder/iterator/file are not
// visible in this excerpt.
321 void RepairTable(const std::string
& src
, TableInfo t
) {
322 // We will copy src contents to a new table and then rename the
323 // new table over the source.
// The copy is written under a freshly allocated file number.
326 std::string copy
= TableFileName(dbname_
, next_file_number_
++);
328 Status s
= env_
->NewWritableFile(copy
, &file
);
332 TableBuilder
* builder
= new TableBuilder(options_
, file
);
// Copy every entry the iterator can deliver.
335 Iterator
* iter
= NewTableIterator(t
.meta
);
337 for (iter
->SeekToFirst(); iter
->Valid(); iter
->Next()) {
338 builder
->Add(iter
->key(), iter
->value());
345 builder
->Abandon(); // Nothing to save
347 s
= builder
->Finish();
349 t
.meta
.file_size
= builder
->FileSize();
// Install the copy only if at least one entry was copied and all steps so
// far succeeded.
361 if (counter
> 0 && s
.ok()) {
// The rename target is always built with TableFileName(), whatever name
// `src` had.
362 std::string orig
= TableFileName(dbname_
, t
.meta
.number
);
363 s
= env_
->RenameFile(copy
, orig
);
365 Log(options_
.info_log
, "Table #%llu: %d entries repaired",
366 (unsigned long long) t
.meta
.number
, counter
);
367 tables_
.push_back(t
);
// Presumably reached only on failure, to remove the partial copy -- confirm.
371 env_
->DeleteFile(copy
);
// Writes a new descriptor (manifest) describing every table in tables_ and
// installs it as descriptor number 1.
//
// Visible steps: create a temp file, fill edit_ (comparator name, log number
// 0, next file number, largest sequence number, one level-0 file entry per
// table), append the encoded edit as one log record, close the file, archive
// the old manifests, rename the temp file into place and update the current
// file pointer via SetCurrentFile().
// NOTE(review): the declarations of `file` and `record`, the status checks
// between steps and the return statement are not visible in this excerpt.
375 Status
WriteDescriptor() {
376 std::string tmp
= TempFileName(dbname_
, 1);
378 Status status
= env_
->NewWritableFile(tmp
, &file
);
// The largest sequence number across all tables becomes the last sequence.
383 SequenceNumber max_sequence
= 0;
384 for (size_t i
= 0; i
< tables_
.size(); i
++) {
385 if (max_sequence
< tables_
[i
].max_sequence
) {
386 max_sequence
= tables_
[i
].max_sequence
;
390 edit_
.SetComparatorName(icmp_
.user_comparator()->Name());
391 edit_
.SetLogNumber(0);
392 edit_
.SetNextFile(next_file_number_
);
393 edit_
.SetLastSequence(max_sequence
);
// Every table is registered with a first argument of 0 (the level).
395 for (size_t i
= 0; i
< tables_
.size(); i
++) {
396 // TODO(opt): separate out into multiple levels
397 const TableInfo
& t
= tables_
[i
];
398 edit_
.AddFile(0, t
.meta
.number
, t
.meta
.file_size
,
399 t
.meta
.smallest
, t
.meta
.largest
);
402 //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str());
// The encoded edit is appended to the temp file as a single log record.
404 log::Writer
log(file
);
406 edit_
.EncodeTo(&record
);
407 status
= log
.AddRecord(record
);
410 status
= file
->Close();
// Presumably the failure path: remove the temp file -- confirm the guard.
416 env_
->DeleteFile(tmp
);
418 // Discard older manifests
// manifests_ holds bare file names, so the directory is prepended here.
419 for (size_t i
= 0; i
< manifests_
.size(); i
++) {
420 ArchiveFile(dbname_
+ "/" + manifests_
[i
]);
423 // Install new manifest
424 status
= env_
->RenameFile(tmp
, DescriptorFileName(dbname_
, 1));
426 status
= SetCurrentFile(env_
, dbname_
, 1);
// Presumably reached only when the rename failed -- confirm the guard.
428 env_
->DeleteFile(tmp
);
// Moves `fname` into a "lost" subdirectory next to it, keeping the base name,
// and logs the outcome of the rename.
//
// Errors are not returned: the CreateDir() result is ignored and the rename
// status is only logged.
// NOTE(review): the declaration of `new_dir` and the condition around the
// assign() call are not visible in this excerpt.
434 void ArchiveFile(const std::string
& fname
) {
435 // Move into another directory. E.g., for
// Locate the last '/' to split directory from base name; NULL if none.
439 const char* slash
= strrchr(fname
.c_str(), '/');
// Directory part = everything before the last slash. Presumably executed
// only when `slash` is non-NULL -- confirm. The pointer subtraction relies
// on c_str() and data() referring to the same buffer.
442 new_dir
.assign(fname
.data(), slash
- fname
.data());
444 new_dir
.append("/lost");
445 env_
->CreateDir(new_dir
); // Ignore error
446 std::string new_file
= new_dir
;
447 new_file
.append("/");
// Base name: the text after the last slash, or all of fname when there is
// no slash.
448 new_file
.append((slash
== NULL
) ? fname
.c_str() : slash
+ 1);
449 Status s
= env_
->RenameFile(fname
, new_file
);
450 Log(options_
.info_log
, "Archiving %s: %s\n",
451 fname
.c_str(), s
.ToString().c_str());
456 Status
RepairDB(const std::string
& dbname
, const Options
& options
) {
457 Repairer
repairer(dbname
, options
);
458 return repairer
.Run();
461 } // namespace leveldb