2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Rusty Russell 2009
8 ** NOTE! The following LGPL license applies to the tdb
9 ** library. This does NOT imply that all of Samba is released
12 This library is free software; you can redistribute it and/or
13 modify it under the terms of the GNU Lesser General Public
14 License as published by the Free Software Foundation; either
15 version 3 of the License, or (at your option) any later version.
17 This library is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 Lesser General Public License for more details.
22 You should have received a copy of the GNU Lesser General Public
23 License along with this library; if not, see <http://www.gnu.org/licenses/>.
25 #include "tdb1_private.h"
27 /* Since we opened it, these shouldn't fail unless it's recent corruption. */
28 static bool tdb1_check_header(struct tdb1_context
*tdb
, tdb1_off_t
*recovery
)
30 struct tdb1_header hdr
;
33 if (tdb
->methods
->tdb1_read(tdb
, 0, &hdr
, sizeof(hdr
), 0) == -1)
35 if (strcmp(hdr
.magic_food
, TDB1_MAGIC_FOOD
) != 0)
39 if (hdr
.version
!= TDB1_VERSION
)
42 if (hdr
.rwlocks
!= 0 && hdr
.rwlocks
!= TDB1_HASH_RWLOCK_MAGIC
)
45 tdb1_header_hash(tdb
, &h1
, &h2
);
46 if (hdr
.magic1_hash
&& hdr
.magic2_hash
&&
47 (hdr
.magic1_hash
!= h1
|| hdr
.magic2_hash
!= h2
))
50 if (hdr
.hash_size
== 0)
53 if (hdr
.hash_size
!= tdb
->header
.hash_size
)
56 if (hdr
.recovery_start
!= 0 &&
57 hdr
.recovery_start
< TDB1_DATA_START(tdb
->header
.hash_size
))
60 *recovery
= hdr
.recovery_start
;
64 tdb
->last_error
= tdb_logerr(tdb
, TDB_ERR_CORRUPT
, TDB_LOG_ERROR
,
65 "Header is corrupt\n");
69 /* Generic record header check. */
70 static bool tdb1_check_record(struct tdb1_context
*tdb
,
72 const struct tdb1_record
*rec
)
76 /* Check rec->next: 0 or points to record offset, aligned. */
77 if (rec
->next
> 0 && rec
->next
< TDB1_DATA_START(tdb
->header
.hash_size
)){
78 tdb_logerr(tdb
, TDB_ERR_CORRUPT
, TDB_LOG_ERROR
,
79 "Record offset %d too small next %d\n",
83 if (rec
->next
+ sizeof(*rec
) < rec
->next
) {
84 tdb_logerr(tdb
, TDB_ERR_CORRUPT
, TDB_LOG_ERROR
,
85 "Record offset %d too large next %d\n",
89 if ((rec
->next
% TDB1_ALIGNMENT
) != 0) {
90 tdb_logerr(tdb
, TDB_ERR_CORRUPT
, TDB_LOG_ERROR
,
91 "Record offset %d misaligned next %d\n",
95 if (tdb
->methods
->tdb1_oob(tdb
, rec
->next
+sizeof(*rec
), 0))
98 /* Check rec_len: similar to rec->next, implies next record. */
99 if ((rec
->rec_len
% TDB1_ALIGNMENT
) != 0) {
100 tdb_logerr(tdb
, TDB_ERR_CORRUPT
, TDB_LOG_ERROR
,
101 "Record offset %d misaligned length %d\n",
105 /* Must fit tailer. */
106 if (rec
->rec_len
< sizeof(tailer
)) {
107 tdb_logerr(tdb
, TDB_ERR_CORRUPT
, TDB_LOG_ERROR
,
108 "Record offset %d too short length %d\n",
112 /* OOB allows "right at the end" access, so this works for last rec. */
113 if (tdb
->methods
->tdb1_oob(tdb
, off
+sizeof(*rec
)+rec
->rec_len
, 0))
117 if (tdb1_ofs_read(tdb
, off
+sizeof(*rec
)+rec
->rec_len
-sizeof(tailer
),
120 if (tailer
!= sizeof(*rec
) + rec
->rec_len
) {
121 tdb_logerr(tdb
, TDB_ERR_CORRUPT
, TDB_LOG_ERROR
,
122 "Record offset %d invalid tailer\n", off
);
129 tdb
->last_error
= TDB_ERR_CORRUPT
;
133 /* Grab some bytes: may copy if can't use mmap.
134 Caller has already done bounds check. */
135 static TDB1_DATA
get_bytes(struct tdb1_context
*tdb
,
136 tdb1_off_t off
, tdb1_len_t len
)
142 if (tdb
->transaction
== NULL
&& tdb
->map_ptr
!= NULL
)
143 d
.dptr
= (unsigned char *)tdb
->map_ptr
+ off
;
145 d
.dptr
= tdb1_alloc_read(tdb
, off
, d
.dsize
);
149 /* Frees data if we're not able to simply use mmap. */
150 static void put_bytes(struct tdb1_context
*tdb
, TDB1_DATA d
)
152 if (tdb
->transaction
== NULL
&& tdb
->map_ptr
!= NULL
)
/* We use the excellent Jenkins lookup3 hash; this is based on hash_word2.
 * See: http://burtleburtle.net/bob/c/lookup3.c
 */
#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))

/* Mix one 32-bit key into the two running hash values *pc and *pb
 * (lookup3's final() mixing applied to a single word). */
static void jhash(uint32_t key, uint32_t *pc, uint32_t *pb)
{
	uint32_t a, b, c;

	/* Set up the internal state */
	a = b = c = 0xdeadbeef + *pc;
	c += *pb;
	a += key;
	c ^= b; c -= rot(b,14);
	a ^= c; a -= rot(c,11);
	b ^= a; b -= rot(a,25);
	c ^= b; c -= rot(b,16);
	a ^= c; a -= rot(c,4);
	b ^= a; b -= rot(a,14);
	c ^= b; c -= rot(b,24);
	*pc = c; *pb = b;
}
180 We want to check that all free records are in the free list
181 (only once), and all free list entries are free records. Similarly
182 for each hash chain of used records.
184 Doing that naively (without walking hash chains, since we want to be
185 linear) means keeping a list of records which have been seen in each
186 hash chain, and another of records pointed to (ie. next pointers
187 from records and the initial hash chain heads). These two lists
188 should be equal. This will take 8 bytes per record, and require
191 So instead, we record each offset in a bitmap such a way that
192 recording it twice will cancel out. Since each offset should appear
193 exactly twice, the bitmap should be zero at the end.
195 The approach was inspired by Bloom Filters (see Wikipedia). For
196 each value, we flip K bits in a bitmap of size N. The number of
197 distinct arrangements is:
201 Of course, not all arrangements are actually distinct, but testing
202 shows this formula to be close enough.
204 So, if K == 8 and N == 256, the probability of two things flipping the same
205 bits is 1 in 409,663,695,276,000.
207 Given that ldb uses a hash size of 10000, using 32 bytes per hash chain
208 (320k) seems reasonable.
211 #define BITMAP_BITS 256
/* Toggle bit idx in the bitmap: flipping the same bit twice cancels out,
 * which is exactly the property record_offset() relies on. */
static void bit_flip(unsigned char bits[], unsigned int idx)
{
	bits[idx / CHAR_BIT] ^= (1 << (idx % CHAR_BIT));
}
218 /* We record offsets in a bitmap for the particular chain it should be in. */
219 static void record_offset(unsigned char bits
[], tdb1_off_t off
)
221 uint32_t h1
= off
, h2
= 0;
224 /* We get two good hash values out of jhash2, so we use both. Then
225 * we keep going to produce further hash values. */
226 for (i
= 0; i
< NUM_HASHES
/ 2; i
++) {
227 jhash(off
, &h1
, &h2
);
228 bit_flip(bits
, h1
% BITMAP_BITS
);
229 bit_flip(bits
, h2
% BITMAP_BITS
);
234 /* Check that an in-use record is valid. */
235 static bool tdb1_check_used_record(struct tdb1_context
*tdb
,
237 const struct tdb1_record
*rec
,
238 unsigned char **hashes
,
239 int (*check
)(TDB1_DATA
, TDB1_DATA
, void *),
244 if (!tdb1_check_record(tdb
, off
, rec
))
247 /* key + data + tailer must fit in record */
248 if (rec
->key_len
+ rec
->data_len
+ sizeof(tdb1_off_t
) > rec
->rec_len
) {
249 tdb
->last_error
= tdb_logerr(tdb
, TDB_ERR_CORRUPT
, TDB_LOG_ERROR
,
250 "Record offset %d too short for contents\n", off
);
254 key
= get_bytes(tdb
, off
+ sizeof(*rec
), rec
->key_len
);
258 if (tdb
->hash_fn(&key
) != rec
->full_hash
) {
259 tdb
->last_error
= tdb_logerr(tdb
, TDB_ERR_CORRUPT
, TDB_LOG_ERROR
,
260 "Record offset %d has incorrect hash\n", off
);
264 /* Mark this offset as a known value for this hash bucket. */
265 record_offset(hashes
[TDB1_BUCKET(rec
->full_hash
)+1], off
);
266 /* And similarly if the next pointer is valid. */
268 record_offset(hashes
[TDB1_BUCKET(rec
->full_hash
)+1], rec
->next
);
270 /* If they supply a check function and this record isn't dead,
271 get data and feed it. */
272 if (check
&& rec
->magic
!= TDB1_DEAD_MAGIC
) {
273 data
= get_bytes(tdb
, off
+ sizeof(*rec
) + rec
->key_len
,
278 if (check(key
, data
, private_data
) == -1)
280 put_bytes(tdb
, data
);
287 put_bytes(tdb
, data
);
293 /* Check that an unused record is valid. */
294 static bool tdb1_check_free_record(struct tdb1_context
*tdb
,
296 const struct tdb1_record
*rec
,
297 unsigned char **hashes
)
299 if (!tdb1_check_record(tdb
, off
, rec
))
302 /* Mark this offset as a known value for the free list. */
303 record_offset(hashes
[0], off
);
304 /* And similarly if the next pointer is valid. */
306 record_offset(hashes
[0], rec
->next
);
310 /* Slow, but should be very rare. */
311 size_t tdb1_dead_space(struct tdb1_context
*tdb
, tdb1_off_t off
)
315 for (len
= 0; off
+ len
< tdb
->map_size
; len
++) {
317 if (tdb
->methods
->tdb1_read(tdb
, off
, &c
, 1, 0))
319 if (c
!= 0 && c
!= 0x42)
325 int tdb1_check(struct tdb1_context
*tdb
,
326 int (*check
)(TDB1_DATA key
, TDB1_DATA data
, void *private_data
),
330 unsigned char **hashes
;
331 tdb1_off_t off
, recovery_start
;
332 struct tdb1_record rec
;
333 bool found_recovery
= false;
337 /* Read-only databases use no locking at all: it's best-effort.
338 * We may have a write lock already, so skip that case too. */
339 if (tdb
->read_only
|| tdb
->allrecord_lock
.count
!= 0) {
342 if (tdb1_lockall_read(tdb
) == -1)
347 /* Make sure we know true size of the underlying file. */
348 tdb
->methods
->tdb1_oob(tdb
, tdb
->map_size
+ 1, 1);
350 /* Header must be OK: also gets us the recovery ptr, if any. */
351 if (!tdb1_check_header(tdb
, &recovery_start
))
354 /* We should have the whole header, too. */
355 if (tdb
->map_size
< TDB1_DATA_START(tdb
->header
.hash_size
)) {
356 tdb
->last_error
= tdb_logerr(tdb
, TDB_ERR_CORRUPT
, TDB_LOG_ERROR
,
357 "File too short for hashes\n");
361 /* One big malloc: pointers then bit arrays. */
362 hashes
= (unsigned char **)calloc(
363 1, sizeof(hashes
[0]) * (1+tdb
->header
.hash_size
)
364 + BITMAP_BITS
/ CHAR_BIT
* (1+tdb
->header
.hash_size
));
366 tdb
->last_error
= TDB_ERR_OOM
;
370 /* Initialize pointers */
371 hashes
[0] = (unsigned char *)(&hashes
[1+tdb
->header
.hash_size
]);
372 for (h
= 1; h
< 1+tdb
->header
.hash_size
; h
++)
373 hashes
[h
] = hashes
[h
-1] + BITMAP_BITS
/ CHAR_BIT
;
375 /* Freelist and hash headers are all in a row: read them. */
376 for (h
= 0; h
< 1+tdb
->header
.hash_size
; h
++) {
377 if (tdb1_ofs_read(tdb
, TDB1_FREELIST_TOP
+ h
*sizeof(tdb1_off_t
),
381 record_offset(hashes
[h
], off
);
384 /* For each record, read it in and check it's ok. */
385 for (off
= TDB1_DATA_START(tdb
->header
.hash_size
);
387 off
+= sizeof(rec
) + rec
.rec_len
) {
388 if (tdb
->methods
->tdb1_read(tdb
, off
, &rec
, sizeof(rec
),
389 TDB1_DOCONV()) == -1)
393 case TDB1_DEAD_MAGIC
:
394 if (!tdb1_check_used_record(tdb
, off
, &rec
, hashes
,
395 check
, private_data
))
398 case TDB1_FREE_MAGIC
:
399 if (!tdb1_check_free_record(tdb
, off
, &rec
, hashes
))
402 /* If we crash after ftruncate, we can get zeroes or fill. */
403 case TDB1_RECOVERY_INVALID_MAGIC
:
405 if (recovery_start
== off
) {
406 found_recovery
= true;
409 dead
= tdb1_dead_space(tdb
, off
);
410 if (dead
< sizeof(rec
))
413 tdb_logerr(tdb
, TDB_SUCCESS
, TDB_LOG_WARNING
,
414 "Dead space at %d-%d (of %u)\n",
415 off
, off
+ dead
, tdb
->map_size
);
416 rec
.rec_len
= dead
- sizeof(rec
);
418 case TDB1_RECOVERY_MAGIC
:
419 if (recovery_start
!= off
) {
420 tdb
->last_error
= tdb_logerr(tdb
, TDB_ERR_CORRUPT
, TDB_LOG_ERROR
,
421 "Unexpected recovery record at offset %d\n",
425 found_recovery
= true;
429 tdb
->last_error
= tdb_logerr(tdb
, TDB_ERR_CORRUPT
, TDB_LOG_ERROR
,
430 "Bad magic 0x%x at offset %d\n",
436 /* Now, hashes should all be empty: each record exists and is referred
437 * to by one other. */
438 for (h
= 0; h
< 1+tdb
->header
.hash_size
; h
++) {
440 for (i
= 0; i
< BITMAP_BITS
/ CHAR_BIT
; i
++) {
441 if (hashes
[h
][i
] != 0) {
442 tdb
->last_error
= tdb_logerr(tdb
, TDB_ERR_CORRUPT
, TDB_LOG_ERROR
,
443 "Hashes do not match records\n");
449 /* We must have found recovery area if there was one. */
450 if (recovery_start
!= 0 && !found_recovery
) {
451 tdb
->last_error
= tdb_logerr(tdb
, TDB_ERR_CORRUPT
, TDB_LOG_ERROR
,
452 "Expected a recovery area at %u\n",
459 tdb1_unlockall_read(tdb
);
467 tdb1_unlockall_read(tdb
);