tdb: Reduce freelist contention
[Samba.git] / lib / tdb / common / tdb.c
blobba1c98edbe6df56720da992238e54946195708ed
 /*
   Unix SMB/CIFS implementation.

   trivial database library

   Copyright (C) Andrew Tridgell              1999-2005
   Copyright (C) Paul `Rusty' Russell		   2000
   Copyright (C) Jeremy Allison			   2000-2003

     ** NOTE! The following LGPL license applies to the tdb
     ** library. This does NOT imply that all of Samba is released
     ** under the LGPL

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 3 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/

28 #include "tdb_private.h"
30 _PUBLIC_ TDB_DATA tdb_null;
33 non-blocking increment of the tdb sequence number if the tdb has been opened using
34 the TDB_SEQNUM flag
36 _PUBLIC_ void tdb_increment_seqnum_nonblock(struct tdb_context *tdb)
38 tdb_off_t seqnum=0;
40 if (!(tdb->flags & TDB_SEQNUM)) {
41 return;
44 /* we ignore errors from this, as we have no sane way of
45 dealing with them.
47 tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
48 seqnum++;
49 tdb_ofs_write(tdb, TDB_SEQNUM_OFS, &seqnum);
53 increment the tdb sequence number if the tdb has been opened using
54 the TDB_SEQNUM flag
56 static void tdb_increment_seqnum(struct tdb_context *tdb)
58 if (!(tdb->flags & TDB_SEQNUM)) {
59 return;
62 if (tdb_nest_lock(tdb, TDB_SEQNUM_OFS, F_WRLCK,
63 TDB_LOCK_WAIT|TDB_LOCK_PROBE) != 0) {
64 return;
67 tdb_increment_seqnum_nonblock(tdb);
69 tdb_nest_unlock(tdb, TDB_SEQNUM_OFS, F_WRLCK, false);
72 static int tdb_key_compare(TDB_DATA key, TDB_DATA data, void *private_data)
74 return memcmp(data.dptr, key.dptr, data.dsize);
77 /* Returns 0 on fail. On success, return offset of record, and fills
78 in rec */
79 static tdb_off_t tdb_find(struct tdb_context *tdb, TDB_DATA key, uint32_t hash,
80 struct tdb_record *r)
82 tdb_off_t rec_ptr;
84 /* read in the hash top */
85 if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
86 return 0;
88 /* keep looking until we find the right record */
89 while (rec_ptr) {
90 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
91 return 0;
93 if (!TDB_DEAD(r) && hash==r->full_hash
94 && key.dsize==r->key_len
95 && tdb_parse_data(tdb, key, rec_ptr + sizeof(*r),
96 r->key_len, tdb_key_compare,
97 NULL) == 0) {
98 return rec_ptr;
100 /* detect tight infinite loop */
101 if (rec_ptr == r->next) {
102 tdb->ecode = TDB_ERR_CORRUPT;
103 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_find: loop detected.\n"));
104 return 0;
106 rec_ptr = r->next;
108 tdb->ecode = TDB_ERR_NOEXIST;
109 return 0;
112 /* As tdb_find, but if you succeed, keep the lock */
113 tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype,
114 struct tdb_record *rec)
116 uint32_t rec_ptr;
118 if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
119 return 0;
120 if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
121 tdb_unlock(tdb, BUCKET(hash), locktype);
122 return rec_ptr;
125 static TDB_DATA _tdb_fetch(struct tdb_context *tdb, TDB_DATA key);
127 static int tdb_update_hash_cmp(TDB_DATA key, TDB_DATA data, void *private_data)
129 TDB_DATA *dbuf = (TDB_DATA *)private_data;
131 if (dbuf->dsize != data.dsize) {
132 return -1;
134 if (memcmp(dbuf->dptr, data.dptr, data.dsize) != 0) {
135 return -1;
137 return 0;
140 /* update an entry in place - this only works if the new data size
141 is <= the old data size and the key exists.
142 on failure return -1.
144 static int tdb_update_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, TDB_DATA dbuf)
146 struct tdb_record rec;
147 tdb_off_t rec_ptr;
149 /* find entry */
150 if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
151 return -1;
153 /* it could be an exact duplicate of what is there - this is
154 * surprisingly common (eg. with a ldb re-index). */
155 if (rec.key_len == key.dsize &&
156 rec.data_len == dbuf.dsize &&
157 rec.full_hash == hash &&
158 tdb_parse_record(tdb, key, tdb_update_hash_cmp, &dbuf) == 0) {
159 return 0;
162 /* must be long enough key, data and tailer */
163 if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off_t)) {
164 tdb->ecode = TDB_SUCCESS; /* Not really an error */
165 return -1;
168 if (tdb->methods->tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
169 dbuf.dptr, dbuf.dsize) == -1)
170 return -1;
172 if (dbuf.dsize != rec.data_len) {
173 /* update size */
174 rec.data_len = dbuf.dsize;
175 return tdb_rec_write(tdb, rec_ptr, &rec);
178 return 0;
181 /* find an entry in the database given a key */
182 /* If an entry doesn't exist tdb_err will be set to
183 * TDB_ERR_NOEXIST. If a key has no data attached
184 * then the TDB_DATA will have zero length but
185 * a non-zero pointer
187 static TDB_DATA _tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
189 tdb_off_t rec_ptr;
190 struct tdb_record rec;
191 TDB_DATA ret;
192 uint32_t hash;
194 /* find which hash bucket it is in */
195 hash = tdb->hash_fn(&key);
196 if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
197 return tdb_null;
199 ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
200 rec.data_len);
201 ret.dsize = rec.data_len;
202 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
203 return ret;
206 _PUBLIC_ TDB_DATA tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
208 TDB_DATA ret = _tdb_fetch(tdb, key);
210 tdb_trace_1rec_retrec(tdb, "tdb_fetch", key, ret);
211 return ret;
215 * Find an entry in the database and hand the record's data to a parsing
216 * function. The parsing function is executed under the chain read lock, so it
217 * should be fast and should not block on other syscalls.
219 * DON'T CALL OTHER TDB CALLS FROM THE PARSER, THIS MIGHT LEAD TO SEGFAULTS.
221 * For mmapped tdb's that do not have a transaction open it points the parsing
222 * function directly at the mmap area, it avoids the malloc/memcpy in this
223 * case. If a transaction is open or no mmap is available, it has to do
224 * malloc/read/parse/free.
226 * This is interesting for all readers of potentially large data structures in
227 * the tdb records, ldb indexes being one example.
229 * Return -1 if the record was not found.
232 _PUBLIC_ int tdb_parse_record(struct tdb_context *tdb, TDB_DATA key,
233 int (*parser)(TDB_DATA key, TDB_DATA data,
234 void *private_data),
235 void *private_data)
237 tdb_off_t rec_ptr;
238 struct tdb_record rec;
239 int ret;
240 uint32_t hash;
242 /* find which hash bucket it is in */
243 hash = tdb->hash_fn(&key);
245 if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
246 /* record not found */
247 tdb_trace_1rec_ret(tdb, "tdb_parse_record", key, -1);
248 tdb->ecode = TDB_ERR_NOEXIST;
249 return -1;
251 tdb_trace_1rec_ret(tdb, "tdb_parse_record", key, 0);
253 ret = tdb_parse_data(tdb, key, rec_ptr + sizeof(rec) + rec.key_len,
254 rec.data_len, parser, private_data);
256 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
258 return ret;
261 /* check if an entry in the database exists
263 note that 1 is returned if the key is found and 0 is returned if not found
264 this doesn't match the conventions in the rest of this module, but is
265 compatible with gdbm
267 static int tdb_exists_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
269 struct tdb_record rec;
271 if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
272 return 0;
273 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
274 return 1;
277 _PUBLIC_ int tdb_exists(struct tdb_context *tdb, TDB_DATA key)
279 uint32_t hash = tdb->hash_fn(&key);
280 int ret;
282 ret = tdb_exists_hash(tdb, key, hash);
283 tdb_trace_1rec_ret(tdb, "tdb_exists", key, ret);
284 return ret;
287 /* actually delete an entry in the database given the offset */
288 int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct tdb_record *rec)
290 tdb_off_t last_ptr, i;
291 struct tdb_record lastrec;
293 if (tdb->read_only || tdb->traverse_read) return -1;
295 if (((tdb->traverse_write != 0) && (!TDB_DEAD(rec))) ||
296 tdb_write_lock_record(tdb, rec_ptr) == -1) {
297 /* Someone traversing here: mark it as dead */
298 rec->magic = TDB_DEAD_MAGIC;
299 return tdb_rec_write(tdb, rec_ptr, rec);
301 if (tdb_write_unlock_record(tdb, rec_ptr) != 0)
302 return -1;
304 /* find previous record in hash chain */
305 if (tdb_ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
306 return -1;
307 for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
308 if (tdb_rec_read(tdb, i, &lastrec) == -1)
309 return -1;
311 /* unlink it: next ptr is at start of record. */
312 if (last_ptr == 0)
313 last_ptr = TDB_HASH_TOP(rec->full_hash);
314 if (tdb_ofs_write(tdb, last_ptr, &rec->next) == -1)
315 return -1;
317 /* recover the space */
318 if (tdb_free(tdb, rec_ptr, rec) == -1)
319 return -1;
320 return 0;
323 static int tdb_count_dead(struct tdb_context *tdb, uint32_t hash)
325 int res = 0;
326 tdb_off_t rec_ptr;
327 struct tdb_record rec;
329 /* read in the hash top */
330 if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
331 return 0;
333 while (rec_ptr) {
334 if (tdb_rec_read(tdb, rec_ptr, &rec) == -1)
335 return 0;
337 if (rec.magic == TDB_DEAD_MAGIC) {
338 res += 1;
340 rec_ptr = rec.next;
342 return res;
346 * Purge all DEAD records from a hash chain
348 int tdb_purge_dead(struct tdb_context *tdb, uint32_t hash)
350 int res = -1;
351 struct tdb_record rec;
352 tdb_off_t rec_ptr;
354 if (tdb_lock_nonblock(tdb, -1, F_WRLCK) == -1) {
356 * Don't block the freelist if not strictly necessary
358 return -1;
361 /* read in the hash top */
362 if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
363 goto fail;
365 while (rec_ptr) {
366 tdb_off_t next;
368 if (tdb_rec_read(tdb, rec_ptr, &rec) == -1) {
369 goto fail;
372 next = rec.next;
374 if (rec.magic == TDB_DEAD_MAGIC
375 && tdb_do_delete(tdb, rec_ptr, &rec) == -1) {
376 goto fail;
378 rec_ptr = next;
380 res = 0;
381 fail:
382 tdb_unlock(tdb, -1, F_WRLCK);
383 return res;
386 /* delete an entry in the database given a key */
387 static int tdb_delete_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
389 tdb_off_t rec_ptr;
390 struct tdb_record rec;
391 int ret;
393 rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec);
394 if (rec_ptr == 0) {
395 return -1;
398 if (tdb->max_dead_records != 0) {
400 uint32_t magic = TDB_DEAD_MAGIC;
403 * Allow for some dead records per hash chain, mainly for
404 * tdb's with a very high create/delete rate like locking.tdb.
407 if (tdb_count_dead(tdb, hash) >= tdb->max_dead_records) {
409 * Don't let the per-chain freelist grow too large,
410 * delete all existing dead records
412 tdb_purge_dead(tdb, hash);
416 * Just mark the record as dead.
418 ret = tdb_ofs_write(
419 tdb, rec_ptr + offsetof(struct tdb_record, magic),
420 &magic);
422 else {
423 ret = tdb_do_delete(tdb, rec_ptr, &rec);
426 if (ret == 0) {
427 tdb_increment_seqnum(tdb);
430 if (tdb_unlock(tdb, BUCKET(hash), F_WRLCK) != 0)
431 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_delete: WARNING tdb_unlock failed!\n"));
432 return ret;
435 _PUBLIC_ int tdb_delete(struct tdb_context *tdb, TDB_DATA key)
437 uint32_t hash = tdb->hash_fn(&key);
438 int ret;
440 ret = tdb_delete_hash(tdb, key, hash);
441 tdb_trace_1rec_ret(tdb, "tdb_delete", key, ret);
442 return ret;
446 * See if we have a dead record around with enough space
448 tdb_off_t tdb_find_dead(struct tdb_context *tdb, uint32_t hash,
449 struct tdb_record *r, tdb_len_t length,
450 tdb_off_t *p_last_ptr)
452 tdb_off_t rec_ptr, last_ptr;
453 tdb_off_t best_rec_ptr = 0;
454 tdb_off_t best_last_ptr = 0;
455 struct tdb_record best = { .rec_len = UINT32_MAX };
457 length += sizeof(tdb_off_t); /* tailer */
459 last_ptr = TDB_HASH_TOP(hash);
461 /* read in the hash top */
462 if (tdb_ofs_read(tdb, last_ptr, &rec_ptr) == -1)
463 return 0;
465 /* keep looking until we find the right record */
466 while (rec_ptr) {
467 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
468 return 0;
470 if (TDB_DEAD(r) && (r->rec_len >= length) &&
471 (r->rec_len < best.rec_len)) {
472 best_rec_ptr = rec_ptr;
473 best_last_ptr = last_ptr;
474 best = *r;
476 last_ptr = rec_ptr;
477 rec_ptr = r->next;
480 if (best.rec_len == UINT32_MAX) {
481 return 0;
484 *r = best;
485 *p_last_ptr = best_last_ptr;
486 return best_rec_ptr;
489 static int _tdb_store(struct tdb_context *tdb, TDB_DATA key,
490 TDB_DATA dbuf, int flag, uint32_t hash)
492 struct tdb_record rec;
493 tdb_off_t rec_ptr;
494 int ret = -1;
496 /* check for it existing, on insert. */
497 if (flag == TDB_INSERT) {
498 if (tdb_exists_hash(tdb, key, hash)) {
499 tdb->ecode = TDB_ERR_EXISTS;
500 goto fail;
502 } else {
503 /* first try in-place update, on modify or replace. */
504 if (tdb_update_hash(tdb, key, hash, dbuf) == 0) {
505 goto done;
507 if (tdb->ecode == TDB_ERR_NOEXIST &&
508 flag == TDB_MODIFY) {
509 /* if the record doesn't exist and we are in TDB_MODIFY mode then
510 we should fail the store */
511 goto fail;
514 /* reset the error code potentially set by the tdb_update() */
515 tdb->ecode = TDB_SUCCESS;
517 /* delete any existing record - if it doesn't exist we don't
518 care. Doing this first reduces fragmentation, and avoids
519 coalescing with `allocated' block before it's updated. */
520 if (flag != TDB_INSERT)
521 tdb_delete_hash(tdb, key, hash);
523 if (tdb->max_dead_records != 0) {
524 tdb_off_t last_ptr;
526 * Allow for some dead records per hash chain, look if we can
527 * find one that can hold the new record. We need enough space
528 * for key, data and tailer. If we find one, we don't have to
529 * consult the central freelist.
531 rec_ptr = tdb_find_dead(tdb, hash, &rec,
532 key.dsize + dbuf.dsize,
533 &last_ptr);
535 if (rec_ptr != 0) {
536 rec.key_len = key.dsize;
537 rec.data_len = dbuf.dsize;
538 rec.full_hash = hash;
539 rec.magic = TDB_MAGIC;
540 if (tdb_rec_write(tdb, rec_ptr, &rec) == -1
541 || tdb->methods->tdb_write(
542 tdb, rec_ptr + sizeof(rec),
543 key.dptr, key.dsize) == -1
544 || tdb->methods->tdb_write(
545 tdb, rec_ptr + sizeof(rec) + key.dsize,
546 dbuf.dptr, dbuf.dsize) == -1) {
547 goto fail;
549 goto done;
553 /* we have to allocate some space */
554 rec_ptr = tdb_allocate(tdb, hash, key.dsize + dbuf.dsize, &rec);
556 if (rec_ptr == 0) {
557 goto fail;
560 /* Read hash top into next ptr */
561 if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
562 goto fail;
564 rec.key_len = key.dsize;
565 rec.data_len = dbuf.dsize;
566 rec.full_hash = hash;
567 rec.magic = TDB_MAGIC;
569 /* write out and point the top of the hash chain at it */
570 if (tdb_rec_write(tdb, rec_ptr, &rec) == -1
571 || tdb->methods->tdb_write(tdb, rec_ptr+sizeof(rec),
572 key.dptr, key.dsize) == -1
573 || tdb->methods->tdb_write(tdb, rec_ptr+sizeof(rec)+key.dsize,
574 dbuf.dptr, dbuf.dsize) == -1
575 || tdb_ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
576 /* Need to tdb_unallocate() here */
577 goto fail;
580 done:
581 ret = 0;
582 fail:
583 if (ret == 0) {
584 tdb_increment_seqnum(tdb);
586 return ret;
589 /* store an element in the database, replacing any existing element
590 with the same key
592 return 0 on success, -1 on failure
594 _PUBLIC_ int tdb_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
596 uint32_t hash;
597 int ret;
599 if (tdb->read_only || tdb->traverse_read) {
600 tdb->ecode = TDB_ERR_RDONLY;
601 tdb_trace_2rec_flag_ret(tdb, "tdb_store", key, dbuf, flag, -1);
602 return -1;
605 /* find which hash bucket it is in */
606 hash = tdb->hash_fn(&key);
607 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
608 return -1;
610 ret = _tdb_store(tdb, key, dbuf, flag, hash);
611 tdb_trace_2rec_flag_ret(tdb, "tdb_store", key, dbuf, flag, ret);
612 tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
613 return ret;
616 /* Append to an entry. Create if not exist. */
617 _PUBLIC_ int tdb_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf)
619 uint32_t hash;
620 TDB_DATA dbuf;
621 int ret = -1;
623 /* find which hash bucket it is in */
624 hash = tdb->hash_fn(&key);
625 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
626 return -1;
628 dbuf = _tdb_fetch(tdb, key);
630 if (dbuf.dptr == NULL) {
631 dbuf.dptr = (unsigned char *)malloc(new_dbuf.dsize);
632 } else {
633 unsigned int new_len = dbuf.dsize + new_dbuf.dsize;
634 unsigned char *new_dptr;
636 /* realloc '0' is special: don't do that. */
637 if (new_len == 0)
638 new_len = 1;
639 new_dptr = (unsigned char *)realloc(dbuf.dptr, new_len);
640 if (new_dptr == NULL) {
641 free(dbuf.dptr);
643 dbuf.dptr = new_dptr;
646 if (dbuf.dptr == NULL) {
647 tdb->ecode = TDB_ERR_OOM;
648 goto failed;
651 memcpy(dbuf.dptr + dbuf.dsize, new_dbuf.dptr, new_dbuf.dsize);
652 dbuf.dsize += new_dbuf.dsize;
654 ret = _tdb_store(tdb, key, dbuf, 0, hash);
655 tdb_trace_2rec_retrec(tdb, "tdb_append", key, new_dbuf, dbuf);
657 failed:
658 tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
659 SAFE_FREE(dbuf.dptr);
660 return ret;
665 return the name of the current tdb file
666 useful for external logging functions
668 _PUBLIC_ const char *tdb_name(struct tdb_context *tdb)
670 return tdb->name;
674 return the underlying file descriptor being used by tdb, or -1
675 useful for external routines that want to check the device/inode
676 of the fd
678 _PUBLIC_ int tdb_fd(struct tdb_context *tdb)
680 return tdb->fd;
684 return the current logging function
685 useful for external tdb routines that wish to log tdb errors
687 _PUBLIC_ tdb_log_func tdb_log_fn(struct tdb_context *tdb)
689 return tdb->log.log_fn;
694 get the tdb sequence number. Only makes sense if the writers opened
695 with TDB_SEQNUM set. Note that this sequence number will wrap quite
696 quickly, so it should only be used for a 'has something changed'
697 test, not for code that relies on the count of the number of changes
698 made. If you want a counter then use a tdb record.
700 The aim of this sequence number is to allow for a very lightweight
701 test of a possible tdb change.
703 _PUBLIC_ int tdb_get_seqnum(struct tdb_context *tdb)
705 tdb_off_t seqnum=0;
707 tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
708 return seqnum;
711 _PUBLIC_ int tdb_hash_size(struct tdb_context *tdb)
713 return tdb->hash_size;
716 _PUBLIC_ size_t tdb_map_size(struct tdb_context *tdb)
718 return tdb->map_size;
721 _PUBLIC_ int tdb_get_flags(struct tdb_context *tdb)
723 return tdb->flags;
726 _PUBLIC_ void tdb_add_flags(struct tdb_context *tdb, unsigned flags)
728 if ((flags & TDB_ALLOW_NESTING) &&
729 (flags & TDB_DISALLOW_NESTING)) {
730 tdb->ecode = TDB_ERR_NESTING;
731 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_add_flags: "
732 "allow_nesting and disallow_nesting are not allowed together!"));
733 return;
736 if (flags & TDB_ALLOW_NESTING) {
737 tdb->flags &= ~TDB_DISALLOW_NESTING;
739 if (flags & TDB_DISALLOW_NESTING) {
740 tdb->flags &= ~TDB_ALLOW_NESTING;
743 tdb->flags |= flags;
746 _PUBLIC_ void tdb_remove_flags(struct tdb_context *tdb, unsigned flags)
748 if ((flags & TDB_ALLOW_NESTING) &&
749 (flags & TDB_DISALLOW_NESTING)) {
750 tdb->ecode = TDB_ERR_NESTING;
751 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_remove_flags: "
752 "allow_nesting and disallow_nesting are not allowed together!"));
753 return;
756 if (flags & TDB_ALLOW_NESTING) {
757 tdb->flags |= TDB_DISALLOW_NESTING;
759 if (flags & TDB_DISALLOW_NESTING) {
760 tdb->flags |= TDB_ALLOW_NESTING;
763 tdb->flags &= ~flags;
768 enable sequence number handling on an open tdb
770 _PUBLIC_ void tdb_enable_seqnum(struct tdb_context *tdb)
772 tdb->flags |= TDB_SEQNUM;
777 add a region of the file to the freelist. Length is the size of the region in bytes,
778 which includes the free list header that needs to be added
780 static int tdb_free_region(struct tdb_context *tdb, tdb_off_t offset, ssize_t length)
782 struct tdb_record rec;
783 if (length <= sizeof(rec)) {
784 /* the region is not worth adding */
785 return 0;
787 if (length + offset > tdb->map_size) {
788 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_free_region: adding region beyond end of file\n"));
789 return -1;
791 memset(&rec,'\0',sizeof(rec));
792 rec.rec_len = length - sizeof(rec);
793 if (tdb_free(tdb, offset, &rec) == -1) {
794 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_free_region: failed to add free record\n"));
795 return -1;
797 return 0;
801 wipe the entire database, deleting all records. This can be done
802 very fast by using a allrecord lock. The entire data portion of the
803 file becomes a single entry in the freelist.
805 This code carefully steps around the recovery area, leaving it alone
807 _PUBLIC_ int tdb_wipe_all(struct tdb_context *tdb)
809 int i;
810 tdb_off_t offset = 0;
811 ssize_t data_len;
812 tdb_off_t recovery_head;
813 tdb_len_t recovery_size = 0;
815 if (tdb_lockall(tdb) != 0) {
816 return -1;
819 tdb_trace(tdb, "tdb_wipe_all");
821 /* see if the tdb has a recovery area, and remember its size
822 if so. We don't want to lose this as otherwise each
823 tdb_wipe_all() in a transaction will increase the size of
824 the tdb by the size of the recovery area */
825 if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
826 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_wipe_all: failed to read recovery head\n"));
827 goto failed;
830 if (recovery_head != 0) {
831 struct tdb_record rec;
832 if (tdb->methods->tdb_read(tdb, recovery_head, &rec, sizeof(rec), DOCONV()) == -1) {
833 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_wipe_all: failed to read recovery record\n"));
834 return -1;
836 recovery_size = rec.rec_len + sizeof(rec);
839 /* wipe the hashes */
840 for (i=0;i<tdb->hash_size;i++) {
841 if (tdb_ofs_write(tdb, TDB_HASH_TOP(i), &offset) == -1) {
842 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write hash %d\n", i));
843 goto failed;
847 /* wipe the freelist */
848 if (tdb_ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
849 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write freelist\n"));
850 goto failed;
853 /* add all the rest of the file to the freelist, possibly leaving a gap
854 for the recovery area */
855 if (recovery_size == 0) {
856 /* the simple case - the whole file can be used as a freelist */
857 data_len = (tdb->map_size - TDB_DATA_START(tdb->hash_size));
858 if (tdb_free_region(tdb, TDB_DATA_START(tdb->hash_size), data_len) != 0) {
859 goto failed;
861 } else {
862 /* we need to add two freelist entries - one on either
863 side of the recovery area
865 Note that we cannot shift the recovery area during
866 this operation. Only the transaction.c code may
867 move the recovery area or we risk subtle data
868 corruption
870 data_len = (recovery_head - TDB_DATA_START(tdb->hash_size));
871 if (tdb_free_region(tdb, TDB_DATA_START(tdb->hash_size), data_len) != 0) {
872 goto failed;
874 /* and the 2nd free list entry after the recovery area - if any */
875 data_len = tdb->map_size - (recovery_head+recovery_size);
876 if (tdb_free_region(tdb, recovery_head+recovery_size, data_len) != 0) {
877 goto failed;
881 tdb_increment_seqnum_nonblock(tdb);
883 if (tdb_unlockall(tdb) != 0) {
884 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to unlock\n"));
885 goto failed;
888 return 0;
890 failed:
891 tdb_unlockall(tdb);
892 return -1;
/* State shared between tdb_repack() and its traverse callback. */
struct traverse_state {
	bool error;			/* set by repack_traverse() when a store fails */
	struct tdb_context *dest_db;	/* database the records are copied into */
};

901 traverse function for repacking
903 static int repack_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private_data)
905 struct traverse_state *state = (struct traverse_state *)private_data;
906 if (tdb_store(state->dest_db, key, data, TDB_INSERT) != 0) {
907 state->error = true;
908 return -1;
910 return 0;
914 repack a tdb
916 _PUBLIC_ int tdb_repack(struct tdb_context *tdb)
918 struct tdb_context *tmp_db;
919 struct traverse_state state;
921 tdb_trace(tdb, "tdb_repack");
923 if (tdb_transaction_start(tdb) != 0) {
924 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to start transaction\n"));
925 return -1;
928 tmp_db = tdb_open("tmpdb", tdb_hash_size(tdb), TDB_INTERNAL, O_RDWR|O_CREAT, 0);
929 if (tmp_db == NULL) {
930 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to create tmp_db\n"));
931 tdb_transaction_cancel(tdb);
932 return -1;
935 state.error = false;
936 state.dest_db = tmp_db;
938 if (tdb_traverse_read(tdb, repack_traverse, &state) == -1) {
939 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to traverse copying out\n"));
940 tdb_transaction_cancel(tdb);
941 tdb_close(tmp_db);
942 return -1;
945 if (state.error) {
946 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Error during traversal\n"));
947 tdb_transaction_cancel(tdb);
948 tdb_close(tmp_db);
949 return -1;
952 if (tdb_wipe_all(tdb) != 0) {
953 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to wipe database\n"));
954 tdb_transaction_cancel(tdb);
955 tdb_close(tmp_db);
956 return -1;
959 state.error = false;
960 state.dest_db = tdb;
962 if (tdb_traverse_read(tmp_db, repack_traverse, &state) == -1) {
963 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to traverse copying back\n"));
964 tdb_transaction_cancel(tdb);
965 tdb_close(tmp_db);
966 return -1;
969 if (state.error) {
970 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Error during second traversal\n"));
971 tdb_transaction_cancel(tdb);
972 tdb_close(tmp_db);
973 return -1;
976 tdb_close(tmp_db);
978 if (tdb_transaction_commit(tdb) != 0) {
979 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to commit\n"));
980 return -1;
983 return 0;
/* Even on files, we can get partial writes due to signals.

   Keeps calling write() until all "count" bytes of "buf" have been written
   to "fd". Returns true once everything is written (trivially for
   count == 0), and false as soon as write() reports an error. */
bool tdb_write_all(int fd, const void *buf, size_t count)
{
	while (count) {
		ssize_t ret;
		ret = write(fd, buf, count);
		if (ret < 0) {
			return false;
		}
		/* short write: advance past what was accepted and retry */
		buf = (const char *)buf + ret;
		count -= ret;
	}
	return true;
}

1000 bool tdb_add_off_t(tdb_off_t a, tdb_off_t b, tdb_off_t *pret)
1002 tdb_off_t ret = a + b;
1004 if ((ret < a) || (ret < b)) {
1005 return false;
1007 *pret = ret;
1008 return true;
#ifdef TDB_TRACE
/*
 * Operation tracing, compiled only when TDB_TRACE is defined. Each traced
 * call becomes one text line on tdb->tracefd: the current sequence number,
 * the operation name, any records as " <len>:<hex>" (or " NULL"), and
 * optionally " = <result>".
 */

/* Write a string to the trace fd; on failure close it and set it to -1. */
static void tdb_trace_write(struct tdb_context *tdb, const char *str)
{
	if (!tdb_write_all(tdb->tracefd, str, strlen(str))) {
		close(tdb->tracefd);
		tdb->tracefd = -1;
	}
}

/* Start a trace line with the current sequence number. */
static void tdb_trace_start(struct tdb_context *tdb)
{
	tdb_off_t seqnum=0;
	char msg[sizeof(tdb_off_t) * 4 + 1];

	tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
	snprintf(msg, sizeof(msg), "%u ", seqnum);
	tdb_trace_write(tdb, msg);
}

/* Terminate a trace line that carries no result. */
static void tdb_trace_end(struct tdb_context *tdb)
{
	tdb_trace_write(tdb, "\n");
}

/* Terminate a trace line with " = <ret>". */
static void tdb_trace_end_ret(struct tdb_context *tdb, int ret)
{
	char msg[sizeof(ret) * 4 + 4];
	snprintf(msg, sizeof(msg), " = %i\n", ret);
	tdb_trace_write(tdb, msg);
}

/* Emit one record as " <len>:<hex bytes>", or " NULL" for a NULL dptr. */
static void tdb_trace_record(struct tdb_context *tdb, TDB_DATA rec)
{
	char msg[20 + rec.dsize*2], *p;
	unsigned int i;

	/* We differentiate zero-length records from non-existent ones. */
	if (rec.dptr == NULL) {
		tdb_trace_write(tdb, " NULL");
		return;
	}

	/* snprintf here is purely cargo-cult programming. */
	p = msg;
	p += snprintf(p, sizeof(msg), " %zu:", rec.dsize);
	for (i = 0; i < rec.dsize; i++) {
		/* 3 = two hex digits plus the terminating NUL; a limit of 2
		   would store only one digit and cut the string short */
		p += snprintf(p, 3, "%02x", rec.dptr[i]);
	}

	tdb_trace_write(tdb, msg);
}

void tdb_trace(struct tdb_context *tdb, const char *op)
{
	tdb_trace_start(tdb);
	tdb_trace_write(tdb, op);
	tdb_trace_end(tdb);
}

/* As tdb_trace(), but with a caller-supplied sequence number. */
void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op)
{
	char msg[sizeof(tdb_off_t) * 4 + 1];

	snprintf(msg, sizeof(msg), "%u ", seqnum);
	tdb_trace_write(tdb, msg);
	tdb_trace_write(tdb, op);
	tdb_trace_end(tdb);
}

void tdb_trace_open(struct tdb_context *tdb, const char *op,
		    unsigned hash_size, unsigned tdb_flags, unsigned open_flags)
{
	char msg[128];

	snprintf(msg, sizeof(msg),
		 "%s %u 0x%x 0x%x", op, hash_size, tdb_flags, open_flags);
	tdb_trace_start(tdb);
	tdb_trace_write(tdb, msg);
	tdb_trace_end(tdb);
}

void tdb_trace_ret(struct tdb_context *tdb, const char *op, int ret)
{
	tdb_trace_start(tdb);
	tdb_trace_write(tdb, op);
	tdb_trace_end_ret(tdb, ret);
}

void tdb_trace_retrec(struct tdb_context *tdb, const char *op, TDB_DATA ret)
{
	tdb_trace_start(tdb);
	tdb_trace_write(tdb, op);
	tdb_trace_write(tdb, " =");
	tdb_trace_record(tdb, ret);
	tdb_trace_end(tdb);
}

void tdb_trace_1rec(struct tdb_context *tdb, const char *op,
		    TDB_DATA rec)
{
	tdb_trace_start(tdb);
	tdb_trace_write(tdb, op);
	tdb_trace_record(tdb, rec);
	tdb_trace_end(tdb);
}

void tdb_trace_1rec_ret(struct tdb_context *tdb, const char *op,
			TDB_DATA rec, int ret)
{
	tdb_trace_start(tdb);
	tdb_trace_write(tdb, op);
	tdb_trace_record(tdb, rec);
	tdb_trace_end_ret(tdb, ret);
}

void tdb_trace_1rec_retrec(struct tdb_context *tdb, const char *op,
			   TDB_DATA rec, TDB_DATA ret)
{
	tdb_trace_start(tdb);
	tdb_trace_write(tdb, op);
	tdb_trace_record(tdb, rec);
	tdb_trace_write(tdb, " =");
	tdb_trace_record(tdb, ret);
	tdb_trace_end(tdb);
}

void tdb_trace_2rec_flag_ret(struct tdb_context *tdb, const char *op,
			     TDB_DATA rec1, TDB_DATA rec2, unsigned flag,
			     int ret)
{
	char msg[1 + sizeof(ret) * 4];

	snprintf(msg, sizeof(msg), " %#x", flag);
	tdb_trace_start(tdb);
	tdb_trace_write(tdb, op);
	tdb_trace_record(tdb, rec1);
	tdb_trace_record(tdb, rec2);
	tdb_trace_write(tdb, msg);
	tdb_trace_end_ret(tdb, ret);
}

void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op,
			   TDB_DATA rec1, TDB_DATA rec2, TDB_DATA ret)
{
	tdb_trace_start(tdb);
	tdb_trace_write(tdb, op);
	tdb_trace_record(tdb, rec1);
	tdb_trace_record(tdb, rec2);
	tdb_trace_write(tdb, " =");
	tdb_trace_record(tdb, ret);
	tdb_trace_end(tdb);
}
#endif