tdb: fix tdb_check() on other-endian tdbs.
[Samba/id10ts.git] / lib / tdb / common / check.c
blobfd3c0a91f38348a17291d6fd3ef623ec288947d7
1 /*
2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Rusty Russell 2009
8 ** NOTE! The following LGPL license applies to the tdb
9 ** library. This does NOT imply that all of Samba is released
10 ** under the LGPL
12 This library is free software; you can redistribute it and/or
13 modify it under the terms of the GNU Lesser General Public
14 License as published by the Free Software Foundation; either
15 version 3 of the License, or (at your option) any later version.
17 This library is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 Lesser General Public License for more details.
22 You should have received a copy of the GNU Lesser General Public
23 License along with this library; if not, see <http://www.gnu.org/licenses/>.
25 #include "tdb_private.h"
27 /* Since we opened it, these shouldn't fail unless it's recent corruption. */
28 static bool tdb_check_header(struct tdb_context *tdb, tdb_off_t *recovery)
30 struct tdb_header hdr;
32 if (tdb->methods->tdb_read(tdb, 0, &hdr, sizeof(hdr), 0) == -1)
33 return false;
34 if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0)
35 goto corrupt;
37 CONVERT(hdr);
38 if (hdr.version != TDB_VERSION)
39 goto corrupt;
41 if (hdr.rwlocks != 0)
42 goto corrupt;
44 if (hdr.hash_size == 0)
45 goto corrupt;
47 if (hdr.hash_size != tdb->header.hash_size)
48 goto corrupt;
50 if (hdr.recovery_start != 0 &&
51 hdr.recovery_start < TDB_DATA_START(tdb->header.hash_size))
52 goto corrupt;
54 *recovery = hdr.recovery_start;
55 return true;
57 corrupt:
58 tdb->ecode = TDB_ERR_CORRUPT;
59 TDB_LOG((tdb, TDB_DEBUG_ERROR, "Header is corrupt\n"));
60 return false;
/* Generic record header check, shared by used and free records.
 * Verifies the next pointer, record length alignment and bounds, and the
 * tailer (which stores the record's total length for coalescing).
 * Returns false with tdb->ecode = TDB_ERR_CORRUPT on any failure. */
static bool tdb_check_record(struct tdb_context *tdb,
			     tdb_off_t off,
			     const struct tdb_record *rec)
{
	tdb_off_t tailer;

	/* Check rec->next: 0 or points to record offset, aligned. */
	if (rec->next > 0 && rec->next < TDB_DATA_START(tdb->header.hash_size)){
		TDB_LOG((tdb, TDB_DEBUG_ERROR,
			 "Record offset %d too small next %d\n",
			 off, rec->next));
		goto corrupt;
	}
	/* Unsigned wraparound check: next + header must not overflow. */
	if (rec->next + sizeof(*rec) < rec->next) {
		TDB_LOG((tdb, TDB_DEBUG_ERROR,
			 "Record offset %d too large next %d\n",
			 off, rec->next));
		goto corrupt;
	}
	if ((rec->next % TDB_ALIGNMENT) != 0) {
		TDB_LOG((tdb, TDB_DEBUG_ERROR,
			 "Record offset %d misaligned next %d\n",
			 off, rec->next));
		goto corrupt;
	}
	/* The record pointed at by next must fit inside the file. */
	if (tdb->methods->tdb_oob(tdb, rec->next+sizeof(*rec), 0))
		goto corrupt;

	/* Check rec_len: similar to rec->next, implies next record. */
	if ((rec->rec_len % TDB_ALIGNMENT) != 0) {
		TDB_LOG((tdb, TDB_DEBUG_ERROR,
			 "Record offset %d misaligned length %d\n",
			 off, rec->rec_len));
		goto corrupt;
	}
	/* Must fit tailer. */
	if (rec->rec_len < sizeof(tailer)) {
		TDB_LOG((tdb, TDB_DEBUG_ERROR,
			 "Record offset %d too short length %d\n",
			 off, rec->rec_len));
		goto corrupt;
	}
	/* OOB allows "right at the end" access, so this works for last rec. */
	if (tdb->methods->tdb_oob(tdb, off+sizeof(*rec)+rec->rec_len, 0))
		goto corrupt;

	/* Check tailer: last word of the record holds header + data length. */
	if (tdb_ofs_read(tdb, off+sizeof(*rec)+rec->rec_len-sizeof(tailer),
			 &tailer) == -1)
		goto corrupt;
	if (tailer != sizeof(*rec) + rec->rec_len) {
		TDB_LOG((tdb, TDB_DEBUG_ERROR,
			 "Record offset %d invalid tailer\n", off));
		goto corrupt;
	}

	return true;

corrupt:
	tdb->ecode = TDB_ERR_CORRUPT;
	return false;
}
127 /* Grab some bytes: may copy if can't use mmap.
128 Caller has already done bounds check. */
129 static TDB_DATA get_bytes(struct tdb_context *tdb,
130 tdb_off_t off, tdb_len_t len)
132 TDB_DATA d;
134 d.dsize = len;
136 if (tdb->transaction == NULL && tdb->map_ptr != NULL)
137 d.dptr = (unsigned char *)tdb->map_ptr + off;
138 else
139 d.dptr = tdb_alloc_read(tdb, off, d.dsize);
140 return d;
143 /* Frees data if we're not able to simply use mmap. */
144 static void put_bytes(struct tdb_context *tdb, TDB_DATA d)
146 if (tdb->transaction == NULL && tdb->map_ptr != NULL)
147 return;
148 free(d.dptr);
/* We use the excellent Jenkins lookup3 hash; this is based on hash_word2.
 * See: http://burtleburtle.net/bob/c/lookup3.c
 */
static inline uint32_t rot(uint32_t x, unsigned int k)
{
	return (x << k) | (x >> (32 - k));
}

/* Mix one 32-bit key into the running state (*pc, *pb); both are
 * updated in place, giving two usable hash values per call. */
static void hash(uint32_t key, uint32_t *pc, uint32_t *pb)
{
	uint32_t a, b, c;

	/* Set up the internal state, seeded by the previous outputs. */
	a = b = c = 0xdeadbeef + *pc;
	c += *pb;
	a += key;

	/* Final mixing, exactly as lookup3's final(a,b,c). */
	c ^= b; c -= rot(b, 14);
	a ^= c; a -= rot(c, 11);
	b ^= a; b -= rot(a, 25);
	c ^= b; c -= rot(b, 16);
	a ^= c; a -= rot(c, 4);
	b ^= a; b -= rot(a, 14);
	c ^= b; c -= rot(b, 24);

	*pc = c;
	*pb = b;
}
174 We want to check that all free records are in the free list
175 (only once), and all free list entries are free records. Similarly
176 for each hash chain of used records.
178 Doing that naively (without walking hash chains, since we want to be
179 linear) means keeping a list of records which have been seen in each
180 hash chain, and another of records pointed to (ie. next pointers
181 from records and the initial hash chain heads). These two lists
182 should be equal. This will take 8 bytes per record, and require
183 sorting at the end.
185 So instead, we record each offset in a bitmap such a way that
186 recording it twice will cancel out. Since each offset should appear
187 exactly twice, the bitmap should be zero at the end.
189 The approach was inspired by Bloom Filters (see Wikipedia). For
190 each value, we flip K bits in a bitmap of size N. The number of
191 distinct arrangements is:
193 N! / (K! * (N-K)!)
195 Of course, not all arrangements are actually distinct, but testing
196 shows this formula to be close enough.
198 So, if K == 8 and N == 256, the probability of two things flipping the same
199 bits is 1 in 409,663,695,276,000.
201 Given that ldb uses a hash size of 10000, using 32 bytes per hash chain
202 (320k) seems reasonable.
#define NUM_HASHES 8
#define BITMAP_BITS 256

/* Toggle one bit in the bitmap: flipping the same index twice cancels. */
static void bit_flip(unsigned char bits[], unsigned int idx)
{
	unsigned int byte = idx / CHAR_BIT;
	unsigned char mask = 1 << (idx % CHAR_BIT);

	bits[byte] ^= mask;
}
212 /* We record offsets in a bitmap for the particular chain it should be in. */
213 static void record_offset(unsigned char bits[], tdb_off_t off)
215 uint32_t h1 = off, h2 = 0;
216 unsigned int i;
218 /* We get two good hash values out of jhash2, so we use both. Then
219 * we keep going to produce further hash values. */
220 for (i = 0; i < NUM_HASHES / 2; i++) {
221 hash(off, &h1, &h2);
222 bit_flip(bits, h1 % BITMAP_BITS);
223 bit_flip(bits, h2 % BITMAP_BITS);
224 h2++;
228 /* Check that an in-use record is valid. */
229 static bool tdb_check_used_record(struct tdb_context *tdb,
230 tdb_off_t off,
231 const struct tdb_record *rec,
232 unsigned char **hashes,
233 int (*check)(TDB_DATA, TDB_DATA, void *),
234 void *private_data)
236 TDB_DATA key, data;
238 if (!tdb_check_record(tdb, off, rec))
239 return false;
241 /* key + data + tailer must fit in record */
242 if (rec->key_len + rec->data_len + sizeof(tdb_off_t) > rec->rec_len) {
243 TDB_LOG((tdb, TDB_DEBUG_ERROR,
244 "Record offset %d too short for contents\n", off));
245 return false;
248 key = get_bytes(tdb, off + sizeof(*rec), rec->key_len);
249 if (!key.dptr)
250 return false;
252 if (tdb->hash_fn(&key) != rec->full_hash) {
253 TDB_LOG((tdb, TDB_DEBUG_ERROR,
254 "Record offset %d has incorrect hash\n", off));
255 goto fail_put_key;
258 /* Mark this offset as a known value for this hash bucket. */
259 record_offset(hashes[BUCKET(rec->full_hash)+1], off);
260 /* And similarly if the next pointer is valid. */
261 if (rec->next)
262 record_offset(hashes[BUCKET(rec->full_hash)+1], rec->next);
264 /* If they supply a check function and this record isn't dead,
265 get data and feed it. */
266 if (check && rec->magic != TDB_DEAD_MAGIC) {
267 data = get_bytes(tdb, off + sizeof(*rec) + rec->key_len,
268 rec->data_len);
269 if (!data.dptr)
270 goto fail_put_key;
272 if (check(key, data, private_data) == -1)
273 goto fail_put_data;
274 put_bytes(tdb, data);
277 put_bytes(tdb, key);
278 return true;
280 fail_put_data:
281 put_bytes(tdb, data);
282 fail_put_key:
283 put_bytes(tdb, key);
284 return false;
287 /* Check that an unused record is valid. */
288 static bool tdb_check_free_record(struct tdb_context *tdb,
289 tdb_off_t off,
290 const struct tdb_record *rec,
291 unsigned char **hashes)
293 if (!tdb_check_record(tdb, off, rec))
294 return false;
296 /* Mark this offset as a known value for the free list. */
297 record_offset(hashes[0], off);
298 /* And similarly if the next pointer is valid. */
299 if (rec->next)
300 record_offset(hashes[0], rec->next);
301 return true;
304 /* Slow, but should be very rare. */
305 static size_t dead_space(struct tdb_context *tdb, tdb_off_t off)
307 size_t len;
309 for (len = 0; off + len < tdb->map_size; len++) {
310 char c;
311 if (tdb->methods->tdb_read(tdb, off, &c, 1, 0))
312 return 0;
313 if (c != 0 && c != 0x42)
314 break;
316 return len;
/* Check the integrity of the whole database in one linear pass.
 * Every record's header/tailer is verified; live records are optionally
 * fed to check(key, data, private_data) (which aborts the scan by
 * returning -1).  Hash-chain and free-list membership is verified with
 * the cancelling-bitmap scheme described above: every offset must be
 * recorded exactly twice (once as a record, once as a reference to it),
 * leaving all bitmaps zero.  Returns 0 if consistent, -1 on error or
 * corruption (tdb->ecode set). */
int tdb_check(struct tdb_context *tdb,
	      int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
	      void *private_data)
{
	unsigned int h;
	unsigned char **hashes;
	tdb_off_t off, recovery_start;
	struct tdb_record rec;
	bool found_recovery = false;
	tdb_len_t dead;
	bool locked;

	/* Read-only databases use no locking at all: it's best-effort.
	 * We may have a write lock already, so skip that case too. */
	if (tdb->read_only || tdb->allrecord_lock.count != 0) {
		locked = false;
	} else {
		if (tdb_lockall_read(tdb) == -1)
			return -1;
		locked = true;
	}

	/* Make sure we know true size of the underlying file. */
	tdb->methods->tdb_oob(tdb, tdb->map_size + 1, 1);

	/* Header must be OK: also gets us the recovery ptr, if any. */
	if (!tdb_check_header(tdb, &recovery_start))
		goto unlock;

	/* We should have the whole header, too. */
	if (tdb->map_size < TDB_DATA_START(tdb->header.hash_size)) {
		tdb->ecode = TDB_ERR_CORRUPT;
		TDB_LOG((tdb, TDB_DEBUG_ERROR, "File too short for hashes\n"));
		goto unlock;
	}

	/* One big malloc: pointers then bit arrays.  Slot 0 is the free
	 * list, slots 1..hash_size are the hash buckets. */
	hashes = (unsigned char **)calloc(
			1, sizeof(hashes[0]) * (1+tdb->header.hash_size)
			+ BITMAP_BITS / CHAR_BIT * (1+tdb->header.hash_size));
	if (!hashes) {
		tdb->ecode = TDB_ERR_OOM;
		goto unlock;
	}

	/* Initialize pointers: bitmaps live just past the pointer array. */
	hashes[0] = (unsigned char *)(&hashes[1+tdb->header.hash_size]);
	for (h = 1; h < 1+tdb->header.hash_size; h++)
		hashes[h] = hashes[h-1] + BITMAP_BITS / CHAR_BIT;

	/* Freelist and hash headers are all in a row: read them.
	 * Each chain head counts as one reference into its bitmap. */
	for (h = 0; h < 1+tdb->header.hash_size; h++) {
		if (tdb_ofs_read(tdb, FREELIST_TOP + h*sizeof(tdb_off_t),
				 &off) == -1)
			goto free;
		if (off)
			record_offset(hashes[h], off);
	}

	/* For each record, read it in and check it's ok.  rec.rec_len from
	 * the previous iteration advances us to the next record. */
	for (off = TDB_DATA_START(tdb->header.hash_size);
	     off < tdb->map_size;
	     off += sizeof(rec) + rec.rec_len) {
		if (tdb->methods->tdb_read(tdb, off, &rec, sizeof(rec),
					   DOCONV()) == -1)
			goto free;
		switch (rec.magic) {
		case TDB_MAGIC:
		case TDB_DEAD_MAGIC:
			if (!tdb_check_used_record(tdb, off, &rec, hashes,
						   check, private_data))
				goto free;
			break;
		case TDB_FREE_MAGIC:
			if (!tdb_check_free_record(tdb, off, &rec, hashes))
				goto free;
			break;
		/* If we crash after ftruncate, we can get zeroes or fill. */
		case TDB_RECOVERY_INVALID_MAGIC:
		case 0x42424242:
			if (recovery_start == off) {
				found_recovery = true;
				break;
			}
			/* Not the recovery area: tolerate a run of dead
			 * bytes, as long as a whole header would fit. */
			dead = dead_space(tdb, off);
			if (dead < sizeof(rec))
				goto corrupt;

			TDB_LOG((tdb, TDB_DEBUG_ERROR,
				 "Dead space at %d-%d (of %u)\n",
				 off, off + dead, tdb->map_size));
			/* Fake up rec_len so the loop skips the dead run. */
			rec.rec_len = dead - sizeof(rec);
			break;
		case TDB_RECOVERY_MAGIC:
			if (recovery_start != off) {
				TDB_LOG((tdb, TDB_DEBUG_ERROR,
					"Unexpected recovery record at offset %d\n",
					off));
				goto free;
			}
			found_recovery = true;
			break;
		default: ;
		corrupt:
			tdb->ecode = TDB_ERR_CORRUPT;
			TDB_LOG((tdb, TDB_DEBUG_ERROR,
				 "Bad magic 0x%x at offset %d\n",
				 rec.magic, off));
			goto free;
		}
	}

	/* Now, hashes should all be empty: each record exists and is referred
	 * to by one other. */
	for (h = 0; h < 1+tdb->header.hash_size; h++) {
		unsigned int i;
		for (i = 0; i < BITMAP_BITS / CHAR_BIT; i++) {
			if (hashes[h][i] != 0) {
				tdb->ecode = TDB_ERR_CORRUPT;
				TDB_LOG((tdb, TDB_DEBUG_ERROR,
					 "Hashes do not match records\n"));
				goto free;
			}
		}
	}

	/* We must have found recovery area if there was one. */
	if (recovery_start != 0 && !found_recovery) {
		TDB_LOG((tdb, TDB_DEBUG_ERROR,
			 "Expected a recovery area at %u\n",
			 recovery_start));
		goto free;
	}

	free(hashes);
	if (locked) {
		tdb_unlockall_read(tdb);
	}
	return 0;

free:
	free(hashes);
unlock:
	if (locked) {
		tdb_unlockall_read(tdb);
	}
	return -1;
}