s4:lib/stream: make use of smb_len_tcp()
[Samba.git] / lib / tdb2 / tdb1_check.c
bloba8e54b2ee092f4cb7e1866818551e4e6177f1e5b
1 /*
2 Unix SMB/CIFS implementation.
4 trivial database library
6 Copyright (C) Rusty Russell 2009
8 ** NOTE! The following LGPL license applies to the tdb
9 ** library. This does NOT imply that all of Samba is released
10 ** under the LGPL
12 This library is free software; you can redistribute it and/or
13 modify it under the terms of the GNU Lesser General Public
14 License as published by the Free Software Foundation; either
15 version 3 of the License, or (at your option) any later version.
17 This library is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 Lesser General Public License for more details.
22 You should have received a copy of the GNU Lesser General Public
23 License along with this library; if not, see <http://www.gnu.org/licenses/>.
25 #include "tdb1_private.h"
/* Since we opened it, these shouldn't fail unless it's recent corruption.
 *
 * Validate the on-disk header at offset 0 against what we expect and
 * against the in-memory copy in tdb->tdb1.header.
 *
 * On success, stores the recovery-area offset from the header (0 if none)
 * in *recovery and returns true.  On a read failure returns false without
 * touching last_error; on any mismatch sets tdb->last_error to
 * TDB_ERR_CORRUPT and returns false.
 */
static bool tdb1_check_header(struct tdb_context *tdb, tdb1_off_t *recovery)
{
	struct tdb1_header hdr;
	uint32_t h1, h2;

	if (tdb->tdb1.io->tdb1_read(tdb, 0, &hdr, sizeof(hdr), 0) == -1)
		return false;
	/* Magic string check is byte-order independent, so do it first. */
	if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0)
		goto corrupt;

	/* NOTE(review): presumably converts header fields to host byte
	 * order when the file is other-endian — confirm in tdb1_private.h */
	TDB1_CONV(hdr);
	if (hdr.version != TDB1_VERSION)
		goto corrupt;

	if (hdr.rwlocks != 0 && hdr.rwlocks != TDB1_HASH_RWLOCK_MAGIC)
		goto corrupt;

	/* Only enforce the header hashes when both are set: zero means
	 * the file predates them or they were never written. */
	tdb1_header_hash(tdb, &h1, &h2);
	if (hdr.magic1_hash && hdr.magic2_hash &&
	    (hdr.magic1_hash != h1 || hdr.magic2_hash != h2))
		goto corrupt;

	if (hdr.hash_size == 0)
		goto corrupt;

	/* On-disk hash size must match the one we opened with. */
	if (hdr.hash_size != tdb->tdb1.header.hash_size)
		goto corrupt;

	/* A recovery area, if recorded, must lie past the hash table. */
	if (hdr.recovery_start != 0 &&
	    hdr.recovery_start < TDB1_DATA_START(tdb->tdb1.header.hash_size))
		goto corrupt;

	*recovery = hdr.recovery_start;
	return true;

corrupt:
	tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
				     "Header is corrupt\n");
	return false;
}
/* Generic record header check.
 *
 * Validates the parts common to used and free records: the next pointer,
 * the record length, and the tailer that mirrors the record's total size.
 * Returns true if the header looks sane; on failure sets
 * tdb->last_error = TDB_ERR_CORRUPT and returns false.
 */
static bool tdb1_check_record(struct tdb_context *tdb,
			      tdb1_off_t off,
			      const struct tdb1_record *rec)
{
	tdb1_off_t tailer;

	/* Check rec->next: 0 or points to record offset, aligned. */
	if (rec->next > 0 && rec->next < TDB1_DATA_START(tdb->tdb1.header.hash_size)){
		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
			   "Record offset %d too small next %d\n",
			   off, rec->next);
		goto corrupt;
	}
	/* Wrap check: next + header size must not overflow. */
	if (rec->next + sizeof(*rec) < rec->next) {
		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
			   "Record offset %d too large next %d\n",
			   off, rec->next);
		goto corrupt;
	}
	if ((rec->next % TDB1_ALIGNMENT) != 0) {
		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
			   "Record offset %d misaligned next %d\n",
			   off, rec->next);
		goto corrupt;
	}
	/* The record pointed to by next must lie within the file. */
	if (tdb->tdb1.io->tdb1_oob(tdb, rec->next+sizeof(*rec), 0))
		goto corrupt;

	/* Check rec_len: similar to rec->next, implies next record. */
	if ((rec->rec_len % TDB1_ALIGNMENT) != 0) {
		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
			   "Record offset %d misaligned length %d\n",
			   off, rec->rec_len);
		goto corrupt;
	}
	/* Must fit tailer. */
	if (rec->rec_len < sizeof(tailer)) {
		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
			   "Record offset %d too short length %d\n",
			   off, rec->rec_len);
		goto corrupt;
	}
	/* OOB allows "right at the end" access, so this works for last rec. */
	if (tdb->tdb1.io->tdb1_oob(tdb, off+sizeof(*rec)+rec->rec_len, 0))
		goto corrupt;

	/* Check tailer: the last word of the record stores its total
	 * size (header + body), used when coalescing free records. */
	if (tdb1_ofs_read(tdb, off+sizeof(*rec)+rec->rec_len-sizeof(tailer),
			  &tailer) == -1)
		goto corrupt;
	if (tailer != sizeof(*rec) + rec->rec_len) {
		tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
			   "Record offset %d invalid tailer\n", off);
		goto corrupt;
	}

	return true;

corrupt:
	tdb->last_error = TDB_ERR_CORRUPT;
	return false;
}
133 /* Grab some bytes: may copy if can't use mmap.
134 Caller has already done bounds check. */
135 static TDB_DATA get_bytes(struct tdb_context *tdb,
136 tdb1_off_t off, tdb1_len_t len)
138 TDB_DATA d;
140 d.dsize = len;
142 if (tdb->tdb1.transaction == NULL && tdb->file->map_ptr != NULL)
143 d.dptr = (unsigned char *)tdb->file->map_ptr + off;
144 else
145 d.dptr = tdb1_alloc_read(tdb, off, d.dsize);
146 return d;
149 /* Frees data if we're not able to simply use mmap. */
150 static void put_bytes(struct tdb_context *tdb, TDB_DATA d)
152 if (tdb->tdb1.transaction == NULL && tdb->file->map_ptr != NULL)
153 return;
154 free(d.dptr);
/* We use the excellent Jenkins lookup3 hash; this is based on hash_word2.
 * See: http://burtleburtle.net/bob/c/lookup3.c
 */
#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
static void jhash(uint32_t key, uint32_t *pc, uint32_t *pb)
{
	uint32_t x, y, z;

	/* Seed the internal state from the caller-supplied values. */
	x = y = z = 0xdeadbeef + *pc;
	z += *pb;

	/* Mix in the single word of input. */
	x += key;

	/* lookup3's final() mixing: the order of these steps matters. */
	z ^= y;  z -= rot(y, 14);
	x ^= z;  x -= rot(z, 11);
	y ^= x;  y -= rot(x, 25);
	z ^= y;  z -= rot(y, 16);
	x ^= z;  x -= rot(z, 4);
	y ^= x;  y -= rot(x, 14);
	z ^= y;  z -= rot(y, 24);

	/* Hand back both well-mixed hash values. */
	*pc = z;
	*pb = y;
}
180 We want to check that all free records are in the free list
181 (only once), and all free list entries are free records. Similarly
182 for each hash chain of used records.
184 Doing that naively (without walking hash chains, since we want to be
185 linear) means keeping a list of records which have been seen in each
186 hash chain, and another of records pointed to (ie. next pointers
187 from records and the initial hash chain heads). These two lists
188 should be equal. This will take 8 bytes per record, and require
189 sorting at the end.
191 So instead, we record each offset in a bitmap in such a way that
192 recording it twice will cancel out. Since each offset should appear
193 exactly twice, the bitmap should be zero at the end.
195 The approach was inspired by Bloom Filters (see Wikipedia). For
196 each value, we flip K bits in a bitmap of size N. The number of
197 distinct arrangements is:
199 N! / (K! * (N-K)!)
201 Of course, not all arrangements are actually distinct, but testing
202 shows this formula to be close enough.
204 So, if K == 8 and N == 256, the probability of two things flipping the same
205 bits is 1 in 409,663,695,276,000.
207 Given that ldb uses a hash size of 10000, using 32 bytes per hash chain
208 (320k) seems reasonable.
210 #define NUM_HASHES 8
211 #define BITMAP_BITS 256
/* Toggle bit number idx within the bitmap. */
static void bit_flip(unsigned char bits[], unsigned int idx)
{
	unsigned int byte = idx / CHAR_BIT;
	unsigned char mask = (unsigned char)(1 << (idx % CHAR_BIT));

	bits[byte] ^= mask;
}
218 /* We record offsets in a bitmap for the particular chain it should be in. */
219 static void record_offset(unsigned char bits[], tdb1_off_t off)
221 uint32_t h1 = off, h2 = 0;
222 unsigned int i;
224 /* We get two good hash values out of jhash2, so we use both. Then
225 * we keep going to produce further hash values. */
226 for (i = 0; i < NUM_HASHES / 2; i++) {
227 jhash(off, &h1, &h2);
228 bit_flip(bits, h1 % BITMAP_BITS);
229 bit_flip(bits, h2 % BITMAP_BITS);
230 h2++;
/* Check that an in-use record is valid.
 *
 * Verifies the generic record header, that key+data+tailer fit inside
 * rec_len, and that the stored full_hash matches the key.  Marks the
 * record (and its next pointer, if any) in the bitmap for its hash
 * bucket.  If the caller supplied a check callback and the record is
 * not dead, the key/data pair is fed to it; a non-TDB_SUCCESS return
 * is propagated via tdb->last_error.
 *
 * Returns true if the record passed all checks.
 */
static bool tdb1_check_used_record(struct tdb_context *tdb,
				   tdb1_off_t off,
				   const struct tdb1_record *rec,
				   unsigned char **hashes,
				   enum TDB_ERROR (*check)(TDB_DATA, TDB_DATA,
							   void *),
				   void *private_data)
{
	TDB_DATA key, data;

	if (!tdb1_check_record(tdb, off, rec))
		return false;

	/* key + data + tailer must fit in record */
	if (rec->key_len + rec->data_len + sizeof(tdb1_off_t) > rec->rec_len) {
		tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
					"Record offset %d too short for contents\n", off);
		return false;
	}

	/* Key bytes immediately follow the record header. */
	key = get_bytes(tdb, off + sizeof(*rec), rec->key_len);
	if (!key.dptr)
		return false;

	if ((uint32_t)tdb_hash(tdb, key.dptr, key.dsize) != rec->full_hash) {
		tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
					"Record offset %d has incorrect hash\n", off);
		goto fail_put_key;
	}

	/* Mark this offset as a known value for this hash bucket.
	 * (+1 because hashes[0] is reserved for the free list.) */
	record_offset(hashes[TDB1_BUCKET(rec->full_hash)+1], off);
	/* And similarly if the next pointer is valid. */
	if (rec->next)
		record_offset(hashes[TDB1_BUCKET(rec->full_hash)+1], rec->next);

	/* If they supply a check function and this record isn't dead,
	   get data and feed it. */
	if (check && rec->magic != TDB1_DEAD_MAGIC) {
		enum TDB_ERROR ecode;

		/* Data bytes follow the key. */
		data = get_bytes(tdb, off + sizeof(*rec) + rec->key_len,
				 rec->data_len);
		if (!data.dptr)
			goto fail_put_key;

		ecode = check(key, data, private_data);
		if (ecode != TDB_SUCCESS) {
			tdb->last_error = ecode;
			goto fail_put_data;
		}
		put_bytes(tdb, data);
	}

	put_bytes(tdb, key);
	return true;

/* goto-based cleanup: release in reverse order of acquisition. */
fail_put_data:
	put_bytes(tdb, data);
fail_put_key:
	put_bytes(tdb, key);
	return false;
}
299 /* Check that an unused record is valid. */
300 static bool tdb1_check_free_record(struct tdb_context *tdb,
301 tdb1_off_t off,
302 const struct tdb1_record *rec,
303 unsigned char **hashes)
305 if (!tdb1_check_record(tdb, off, rec))
306 return false;
308 /* Mark this offset as a known value for the free list. */
309 record_offset(hashes[0], off);
310 /* And similarly if the next pointer is valid. */
311 if (rec->next)
312 record_offset(hashes[0], rec->next);
313 return true;
316 /* Slow, but should be very rare. */
317 size_t tdb1_dead_space(struct tdb_context *tdb, tdb1_off_t off)
319 size_t len;
321 for (len = 0; off + len < tdb->file->map_size; len++) {
322 char c;
323 if (tdb->tdb1.io->tdb1_read(tdb, off, &c, 1, 0))
324 return 0;
325 if (c != 0 && c != 0x42)
326 break;
328 return len;
/* Walk the whole database checking structural integrity.
 *
 * Takes a read lock on the whole file (unless one is already held),
 * validates the header, then walks every record linearly.  Each record
 * offset and each reference to it (hash-chain head, free-list head, or
 * a record's next pointer) is XOR-folded into a per-chain bitmap; if
 * everything is consistent, every bitmap cancels back to zero.
 *
 * check, if non-NULL, is called with each live key/data pair;
 * private_data is passed through to it.
 *
 * Returns 0 on success, -1 on failure (tdb->last_error set).
 */
int tdb1_check(struct tdb_context *tdb,
	       enum TDB_ERROR (*check)(TDB_DATA key, TDB_DATA data, void *),
	       void *private_data)
{
	unsigned int h;
	unsigned char **hashes;
	tdb1_off_t off, recovery_start;
	struct tdb1_record rec;
	bool found_recovery = false;
	tdb1_len_t dead;
	bool locked;
	size_t alloc_len;

	/* We may have a write lock already, so don't re-lock. */
	if (tdb->file->allrecord_lock.count != 0) {
		locked = false;
	} else {
		if (tdb_lockall_read(tdb) != TDB_SUCCESS)
			return -1;
		locked = true;
	}

	/* Make sure we know true size of the underlying file. */
	tdb->tdb1.io->tdb1_oob(tdb, tdb->file->map_size + 1, 1);

	/* Header must be OK: also gets us the recovery ptr, if any. */
	if (!tdb1_check_header(tdb, &recovery_start))
		goto unlock;

	/* We should have the whole header, too. */
	if (tdb->file->map_size < TDB1_DATA_START(tdb->tdb1.header.hash_size)) {
		tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
					"File too short for hashes\n");
		goto unlock;
	}

	/* One big malloc: pointers then bit arrays.
	 * Slot 0 is the free list; slots 1..hash_size are the buckets. */
	alloc_len = sizeof(hashes[0]) * (1+tdb->tdb1.header.hash_size)
		+ BITMAP_BITS / CHAR_BIT * (1+tdb->tdb1.header.hash_size);
	hashes = (unsigned char **)calloc(1, alloc_len);
	if (!hashes) {
		tdb->last_error = tdb_logerr(tdb, TDB_ERR_OOM, TDB_LOG_ERROR,
					     "tdb_check: could not allocate %zu",
					     alloc_len);
		goto unlock;
	}

	/* Initialize pointers */
	hashes[0] = (unsigned char *)(&hashes[1+tdb->tdb1.header.hash_size]);
	for (h = 1; h < 1+tdb->tdb1.header.hash_size; h++)
		hashes[h] = hashes[h-1] + BITMAP_BITS / CHAR_BIT;

	/* Freelist and hash headers are all in a row: read them. */
	for (h = 0; h < 1+tdb->tdb1.header.hash_size; h++) {
		if (tdb1_ofs_read(tdb, TDB1_FREELIST_TOP + h*sizeof(tdb1_off_t),
				  &off) == -1)
			goto free;
		if (off)
			record_offset(hashes[h], off);
	}

	/* For each record, read it in and check it's ok.
	 * Note the step uses rec.rec_len, which the dead-space case below
	 * may overwrite to skip a gap. */
	for (off = TDB1_DATA_START(tdb->tdb1.header.hash_size);
	     off < tdb->file->map_size;
	     off += sizeof(rec) + rec.rec_len) {
		if (tdb->tdb1.io->tdb1_read(tdb, off, &rec, sizeof(rec),
					   TDB1_DOCONV()) == -1)
			goto free;
		switch (rec.magic) {
		case TDB1_MAGIC:
		case TDB1_DEAD_MAGIC:
			if (!tdb1_check_used_record(tdb, off, &rec, hashes,
						    check, private_data))
				goto free;
			break;
		case TDB1_FREE_MAGIC:
			if (!tdb1_check_free_record(tdb, off, &rec, hashes))
				goto free;
			break;
		/* If we crash after ftruncate, we can get zeroes or fill. */
		case TDB1_RECOVERY_INVALID_MAGIC:
		case 0x42424242:
			if (recovery_start == off) {
				found_recovery = true;
				break;
			}
			/* Tolerate a run of filler bytes, but warn. */
			dead = tdb1_dead_space(tdb, off);
			if (dead < sizeof(rec))
				goto corrupt;

			tdb_logerr(tdb, TDB_SUCCESS, TDB_LOG_WARNING,
				   "Dead space at %d-%d (of %u)\n",
				   off, off + dead, tdb->file->map_size);
			/* Fake a rec_len so the loop step skips the gap. */
			rec.rec_len = dead - sizeof(rec);
			break;
		case TDB1_RECOVERY_MAGIC:
			if (recovery_start != off) {
				tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
					"Unexpected recovery record at offset %d\n",
					off);
				goto free;
			}
			found_recovery = true;
			break;
		default: ;
		corrupt:
			tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
						"Bad magic 0x%x at offset %d\n",
						rec.magic, off);
			goto free;
		}
	}

	/* Now, hashes should all be empty: each record exists and is referred
	 * to by one other. */
	for (h = 0; h < 1+tdb->tdb1.header.hash_size; h++) {
		unsigned int i;
		for (i = 0; i < BITMAP_BITS / CHAR_BIT; i++) {
			if (hashes[h][i] != 0) {
				tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
							"Hashes do not match records\n");
				goto free;
			}
		}
	}

	/* We must have found recovery area if there was one. */
	if (recovery_start != 0 && !found_recovery) {
		tdb->last_error = tdb_logerr(tdb, TDB_ERR_CORRUPT, TDB_LOG_ERROR,
					     "Expected a recovery area at %u\n",
					     recovery_start);
		goto free;
	}

	free(hashes);
	if (locked) {
		tdb_unlockall_read(tdb);
	}
	return 0;

/* goto-based cleanup: free the bitmaps, then drop our lock if we took it. */
free:
	free(hashes);
unlock:
	if (locked) {
		tdb_unlockall_read(tdb);
	}
	return -1;
}