lib/tdb/common/tdb.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Andrew Tridgell              1999-2005
   7    Copyright (C) Paul `Rusty' Russell              2000
   8    Copyright (C) Jeremy Allison                    2000-2003
   9
  10      ** NOTE! The following LGPL license applies to the tdb
  11      ** library. This does NOT imply that all of Samba is released
  12      ** under the LGPL
  13
  14    This library is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU Lesser General Public
  16    License as published by the Free Software Foundation; either
  17    version 3 of the License, or (at your option) any later version.
  18
  19    This library is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    Lesser General Public License for more details.
  23
  24    You should have received a copy of the GNU Lesser General Public
  25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  26 */
  27
  28 #include "tdb_private.h"
  29
  30 _PUBLIC_ TDB_DATA tdb_null; /* no initializer, so zero-filled; _tdb_fetch() returns it when the key lookup fails */
  31
  32 /*
  33   non-blocking increment of the tdb sequence number if the tdb has been opened using
  34   the TDB_SEQNUM flag
  35 */
  36 _PUBLIC_ void tdb_increment_seqnum_nonblock(struct tdb_context *tdb)
  37 {
  38         tdb_off_t seqnum=0;
  39
  40         if (!(tdb->flags & TDB_SEQNUM)) {
  41                 return;
  42         }
  43
  44         /* we ignore errors from this, as we have no sane way of
  45            dealing with them.
  46         */
  47         tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
  48         seqnum++;
  49         tdb_ofs_write(tdb, TDB_SEQNUM_OFS, &seqnum);
  50 }
  51
  52 /*
  53   increment the tdb sequence number if the tdb has been opened using
  54   the TDB_SEQNUM flag
  55 */
  56 static void tdb_increment_seqnum(struct tdb_context *tdb)
  57 {
  58         if (!(tdb->flags & TDB_SEQNUM)) {
  59                 return;
  60         }
  61
  62         if (tdb_nest_lock(tdb, TDB_SEQNUM_OFS, F_WRLCK,
  63                           TDB_LOCK_WAIT|TDB_LOCK_PROBE) != 0) {
  64                 return;
  65         }
  66
  67         tdb_increment_seqnum_nonblock(tdb);
  68
  69         tdb_nest_unlock(tdb, TDB_SEQNUM_OFS, F_WRLCK, false);
  70 }
  71
  72 static int tdb_key_compare(TDB_DATA key, TDB_DATA data, void *private_data)
  73 {
  74         return memcmp(data.dptr, key.dptr, data.dsize);
  75 }
  76
  77 /* Returns 0 on fail.  On success, return offset of record, and fills
  78    in rec */
  79 static tdb_off_t tdb_find(struct tdb_context *tdb, TDB_DATA key, uint32_t hash,
  80                         struct tdb_record *r)
  81 {
  82         tdb_off_t rec_ptr;
  83
  84         /* read in the hash top */
  85         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
  86                 return 0;
  87
  88         /* keep looking until we find the right record */
  89         while (rec_ptr) {
  90                 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
  91                         return 0;
  92
  93                 if (!TDB_DEAD(r) && hash==r->full_hash
  94                     && key.dsize==r->key_len
  95                     && tdb_parse_data(tdb, key, rec_ptr + sizeof(*r),
  96                                       r->key_len, tdb_key_compare,
  97                                       NULL) == 0) {
  98                         return rec_ptr;
  99                 }
 100                 /* detect tight infinite loop */
 101                 if (rec_ptr == r->next) {
 102                         tdb->ecode = TDB_ERR_CORRUPT;
 103                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_find: loop detected.\n"));
 104                         return 0;
 105                 }
 106                 rec_ptr = r->next;
 107         }
 108         tdb->ecode = TDB_ERR_NOEXIST;
 109         return 0;
 110 }
 111
 112 /* As tdb_find, but if you succeed, keep the lock */
 113 tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype,
 114                            struct tdb_record *rec)
 115 {
 116         uint32_t rec_ptr;
 117
 118         if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
 119                 return 0;
 120         if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
 121                 tdb_unlock(tdb, BUCKET(hash), locktype);
 122         return rec_ptr;
 123 }
 124
 125 static TDB_DATA _tdb_fetch(struct tdb_context *tdb, TDB_DATA key);
 126
 127 /* update an entry in place - this only works if the new data size
 128    is <= the old data size and the key exists.
 129    on failure return -1.
 130 */
 131 static int tdb_update_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, TDB_DATA dbuf)
 132 {
 133         struct tdb_record rec;
 134         tdb_off_t rec_ptr;
 135
 136         /* find entry */
 137         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
 138                 return -1;
 139
 140         /* it could be an exact duplicate of what is there - this is
 141          * surprisingly common (eg. with a ldb re-index). */
 142         if (rec.key_len == key.dsize &&
 143             rec.data_len == dbuf.dsize &&
 144             rec.full_hash == hash) {
 145                 TDB_DATA data = _tdb_fetch(tdb, key);
 146                 if (data.dsize == dbuf.dsize &&
 147                     memcmp(data.dptr, dbuf.dptr, data.dsize) == 0) {
 148                         if (data.dptr) {
 149                                 free(data.dptr);
 150                         }
 151                         return 0;
 152                 }
 153                 if (data.dptr) {
 154                         free(data.dptr);
 155                 }
 156         }
 157
 158         /* must be long enough key, data and tailer */
 159         if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off_t)) {
 160                 tdb->ecode = TDB_SUCCESS; /* Not really an error */
 161                 return -1;
 162         }
 163
 164         if (tdb->methods->tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
 165                       dbuf.dptr, dbuf.dsize) == -1)
 166                 return -1;
 167
 168         if (dbuf.dsize != rec.data_len) {
 169                 /* update size */
 170                 rec.data_len = dbuf.dsize;
 171                 return tdb_rec_write(tdb, rec_ptr, &rec);
 172         }
 173
 174         return 0;
 175 }
 176
 177 /* find an entry in the database given a key */
 178 /* If an entry doesn't exist tdb_err will be set to
 179  * TDB_ERR_NOEXIST. If a key has no data attached
 180  * then the TDB_DATA will have zero length but
 181  * a non-zero pointer
 182  */
 183 static TDB_DATA _tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
 184 {
 185         tdb_off_t rec_ptr;
 186         struct tdb_record rec;
 187         TDB_DATA ret;
 188         uint32_t hash;
 189
 190         /* find which hash bucket it is in */
 191         hash = tdb->hash_fn(&key);
 192         if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
 193                 return tdb_null;
 194
 195         ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
 196                                   rec.data_len);
 197         ret.dsize = rec.data_len;
 198         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 199         return ret;
 200 }
 201
 202 _PUBLIC_ TDB_DATA tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
 203 {
 204         TDB_DATA ret = _tdb_fetch(tdb, key);
 205
 206         tdb_trace_1rec_retrec(tdb, "tdb_fetch", key, ret);
 207         return ret;
 208 }
 209
 210 /*
 211  * Find an entry in the database and hand the record's data to a parsing
 212  * function. The parsing function is executed under the chain read lock, so it
 213  * should be fast and should not block on other syscalls.
 214  *
 215  * DON'T CALL OTHER TDB CALLS FROM THE PARSER, THIS MIGHT LEAD TO SEGFAULTS.
 216  *
 217  * For mmapped tdb's that do not have a transaction open it points the parsing
 218  * function directly at the mmap area, it avoids the malloc/memcpy in this
 219  * case. If a transaction is open or no mmap is available, it has to do
 220  * malloc/read/parse/free.
 221  *
 222  * This is interesting for all readers of potentially large data structures in
 223  * the tdb records, ldb indexes being one example.
 224  *
 225  * Return -1 if the record was not found.
 226  */
 227
 228 _PUBLIC_ int tdb_parse_record(struct tdb_context *tdb, TDB_DATA key,
 229                      int (*parser)(TDB_DATA key, TDB_DATA data,
 230                                    void *private_data),
 231                      void *private_data)
 232 {
 233         tdb_off_t rec_ptr;
 234         struct tdb_record rec;
 235         int ret;
 236         uint32_t hash;
 237
 238         /* find which hash bucket it is in */
 239         hash = tdb->hash_fn(&key);
 240
 241         if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
 242                 /* record not found */
 243                 tdb_trace_1rec_ret(tdb, "tdb_parse_record", key, -1);
 244                 tdb->ecode = TDB_ERR_NOEXIST;
 245                 return -1;
 246         }
 247         tdb_trace_1rec_ret(tdb, "tdb_parse_record", key, 0);
 248
 249         ret = tdb_parse_data(tdb, key, rec_ptr + sizeof(rec) + rec.key_len,
 250                              rec.data_len, parser, private_data);
 251
 252         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 253
 254         return ret;
 255 }
 256
 257 /* check if an entry in the database exists
 258
 259    note that 1 is returned if the key is found and 0 is returned if not found
 260    this doesn't match the conventions in the rest of this module, but is
 261    compatible with gdbm
 262 */
 263 static int tdb_exists_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
 264 {
 265         struct tdb_record rec;
 266
 267         if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
 268                 return 0;
 269         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 270         return 1;
 271 }
 272
 273 _PUBLIC_ int tdb_exists(struct tdb_context *tdb, TDB_DATA key)
 274 {
 275         uint32_t hash = tdb->hash_fn(&key);
 276         int ret;
 277
 278         ret = tdb_exists_hash(tdb, key, hash);
 279         tdb_trace_1rec_ret(tdb, "tdb_exists", key, ret);
 280         return ret;
 281 }
 282
 283 /* actually delete an entry in the database given the offset */
 284 int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct tdb_record *rec)
 285 {
 286         tdb_off_t last_ptr, i;
 287         struct tdb_record lastrec;
 288
 289         if (tdb->read_only || tdb->traverse_read) return -1;
 290
 291         if (((tdb->traverse_write != 0) && (!TDB_DEAD(rec))) ||
 292             tdb_write_lock_record(tdb, rec_ptr) == -1) {
 293                 /* Someone traversing here: mark it as dead */
 294                 rec->magic = TDB_DEAD_MAGIC;
 295                 return tdb_rec_write(tdb, rec_ptr, rec);
 296         }
 297         if (tdb_write_unlock_record(tdb, rec_ptr) != 0)
 298                 return -1;
 299
 300         /* find previous record in hash chain */
 301         if (tdb_ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
 302                 return -1;
 303         for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
 304                 if (tdb_rec_read(tdb, i, &lastrec) == -1)
 305                         return -1;
 306
 307         /* unlink it: next ptr is at start of record. */
 308         if (last_ptr == 0)
 309                 last_ptr = TDB_HASH_TOP(rec->full_hash);
 310         if (tdb_ofs_write(tdb, last_ptr, &rec->next) == -1)
 311                 return -1;
 312
 313         /* recover the space */
 314         if (tdb_free(tdb, rec_ptr, rec) == -1)
 315                 return -1;
 316         return 0;
 317 }
 318
 319 static int tdb_count_dead(struct tdb_context *tdb, uint32_t hash)
 320 {
 321         int res = 0;
 322         tdb_off_t rec_ptr;
 323         struct tdb_record rec;
 324
 325         /* read in the hash top */
 326         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 327                 return 0;
 328
 329         while (rec_ptr) {
 330                 if (tdb_rec_read(tdb, rec_ptr, &rec) == -1)
 331                         return 0;
 332
 333                 if (rec.magic == TDB_DEAD_MAGIC) {
 334                         res += 1;
 335                 }
 336                 rec_ptr = rec.next;
 337         }
 338         return res;
 339 }
 340
 341 /*
 342  * Purge all DEAD records from a hash chain
 343  */
 344 static int tdb_purge_dead(struct tdb_context *tdb, uint32_t hash)
 345 {
 346         int res = -1;
 347         struct tdb_record rec;
 348         tdb_off_t rec_ptr;
 349
 350         if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
 351                 return -1;
 352         }
 353
 354         /* read in the hash top */
 355         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 356                 goto fail;
 357
 358         while (rec_ptr) {
 359                 tdb_off_t next;
 360
 361                 if (tdb_rec_read(tdb, rec_ptr, &rec) == -1) {
 362                         goto fail;
 363                 }
 364
 365                 next = rec.next;
 366
 367                 if (rec.magic == TDB_DEAD_MAGIC
 368                     && tdb_do_delete(tdb, rec_ptr, &rec) == -1) {
 369                         goto fail;
 370                 }
 371                 rec_ptr = next;
 372         }
 373         res = 0;
 374  fail:
 375         tdb_unlock(tdb, -1, F_WRLCK);
 376         return res;
 377 }
 378
 379 /* delete an entry in the database given a key */
 380 static int tdb_delete_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
 381 {
 382         tdb_off_t rec_ptr;
 383         struct tdb_record rec;
 384         int ret;
 385
 386         if (tdb->max_dead_records != 0) {
 387
 388                 /*
 389                  * Allow for some dead records per hash chain, mainly for
 390                  * tdb's with a very high create/delete rate like locking.tdb.
 391                  */
 392
 393                 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
 394                         return -1;
 395
 396                 if (tdb_count_dead(tdb, hash) >= tdb->max_dead_records) {
 397                         /*
 398                          * Don't let the per-chain freelist grow too large,
 399                          * delete all existing dead records
 400                          */
 401                         tdb_purge_dead(tdb, hash);
 402                 }
 403
 404                 if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
 405                         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
 406                         return -1;
 407                 }
 408
 409                 /*
 410                  * Just mark the record as dead.
 411                  */
 412                 rec.magic = TDB_DEAD_MAGIC;
 413                 ret = tdb_rec_write(tdb, rec_ptr, &rec);
 414         }
 415         else {
 416                 if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK,
 417                                                    &rec)))
 418                         return -1;
 419
 420                 ret = tdb_do_delete(tdb, rec_ptr, &rec);
 421         }
 422
 423         if (ret == 0) {
 424                 tdb_increment_seqnum(tdb);
 425         }
 426
 427         if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
 428                 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_delete: WARNING tdb_unlock failed!\n"));
 429         return ret;
 430 }
 431
 432 _PUBLIC_ int tdb_delete(struct tdb_context *tdb, TDB_DATA key)
 433 {
 434         uint32_t hash = tdb->hash_fn(&key);
 435         int ret;
 436
 437         ret = tdb_delete_hash(tdb, key, hash);
 438         tdb_trace_1rec_ret(tdb, "tdb_delete", key, ret);
 439         return ret;
 440 }
 441
 442 /*
 443  * See if we have a dead record around with enough space
 444  */
 445 static tdb_off_t tdb_find_dead(struct tdb_context *tdb, uint32_t hash,
 446                                struct tdb_record *r, tdb_len_t length)
 447 {
 448         tdb_off_t rec_ptr;
 449
 450         /* read in the hash top */
 451         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 452                 return 0;
 453
 454         /* keep looking until we find the right record */
 455         while (rec_ptr) {
 456                 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
 457                         return 0;
 458
 459                 if (TDB_DEAD(r) && r->rec_len >= length) {
 460                         /*
 461                          * First fit for simple coding, TODO: change to best
 462                          * fit
 463                          */
 464                         return rec_ptr;
 465                 }
 466                 rec_ptr = r->next;
 467         }
 468         return 0;
 469 }
 470
 471 static int _tdb_store(struct tdb_context *tdb, TDB_DATA key,
 472                        TDB_DATA dbuf, int flag, uint32_t hash)
 473 {
 474         struct tdb_record rec;
 475         tdb_off_t rec_ptr;
 476         char *p = NULL;
 477         int ret = -1;
 478
 479         /* check for it existing, on insert. */
 480         if (flag == TDB_INSERT) {
 481                 if (tdb_exists_hash(tdb, key, hash)) {
 482                         tdb->ecode = TDB_ERR_EXISTS;
 483                         goto fail;
 484                 }
 485         } else {
 486                 /* first try in-place update, on modify or replace. */
 487                 if (tdb_update_hash(tdb, key, hash, dbuf) == 0) {
 488                         goto done;
 489                 }
 490                 if (tdb->ecode == TDB_ERR_NOEXIST &&
 491                     flag == TDB_MODIFY) {
 492                         /* if the record doesn't exist and we are in TDB_MODIFY mode then
 493                          we should fail the store */
 494                         goto fail;
 495                 }
 496         }
 497         /* reset the error code potentially set by the tdb_update() */
 498         tdb->ecode = TDB_SUCCESS;
 499
 500         /* delete any existing record - if it doesn't exist we don't
 501            care.  Doing this first reduces fragmentation, and avoids
 502            coalescing with `allocated' block before it's updated. */
 503         if (flag != TDB_INSERT)
 504                 tdb_delete_hash(tdb, key, hash);
 505
 506         /* Copy key+value *before* allocating free space in case malloc
 507            fails and we are left with a dead spot in the tdb. */
 508
 509         if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
 510                 tdb->ecode = TDB_ERR_OOM;
 511                 goto fail;
 512         }
 513
 514         memcpy(p, key.dptr, key.dsize);
 515         if (dbuf.dsize)
 516                 memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
 517
 518         if (tdb->max_dead_records != 0) {
 519                 /*
 520                  * Allow for some dead records per hash chain, look if we can
 521                  * find one that can hold the new record. We need enough space
 522                  * for key, data and tailer. If we find one, we don't have to
 523                  * consult the central freelist.
 524                  */
 525                 rec_ptr = tdb_find_dead(
 526                         tdb, hash, &rec,
 527                         key.dsize + dbuf.dsize + sizeof(tdb_off_t));
 528
 529                 if (rec_ptr != 0) {
 530                         rec.key_len = key.dsize;
 531                         rec.data_len = dbuf.dsize;
 532                         rec.full_hash = hash;
 533                         rec.magic = TDB_MAGIC;
 534                         if (tdb_rec_write(tdb, rec_ptr, &rec) == -1
 535                             || tdb->methods->tdb_write(
 536                                     tdb, rec_ptr + sizeof(rec),
 537                                     p, key.dsize + dbuf.dsize) == -1) {
 538                                 goto fail;
 539                         }
 540                         goto done;
 541                 }
 542         }
 543
 544         /*
 545          * We have to allocate some space from the freelist, so this means we
 546          * have to lock it. Use the chance to purge all the DEAD records from
 547          * the hash chain under the freelist lock.
 548          */
 549
 550         if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
 551                 goto fail;
 552         }
 553
 554         if ((tdb->max_dead_records != 0)
 555             && (tdb_purge_dead(tdb, hash) == -1)) {
 556                 tdb_unlock(tdb, -1, F_WRLCK);
 557                 goto fail;
 558         }
 559
 560         /* we have to allocate some space */
 561         rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec);
 562
 563         tdb_unlock(tdb, -1, F_WRLCK);
 564
 565         if (rec_ptr == 0) {
 566                 goto fail;
 567         }
 568
 569         /* Read hash top into next ptr */
 570         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
 571                 goto fail;
 572
 573         rec.key_len = key.dsize;
 574         rec.data_len = dbuf.dsize;
 575         rec.full_hash = hash;
 576         rec.magic = TDB_MAGIC;
 577
 578         /* write out and point the top of the hash chain at it */
 579         if (tdb_rec_write(tdb, rec_ptr, &rec) == -1
 580             || tdb->methods->tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
 581             || tdb_ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
 582                 /* Need to tdb_unallocate() here */
 583                 goto fail;
 584         }
 585
 586  done:
 587         ret = 0;
 588  fail:
 589         if (ret == 0) {
 590                 tdb_increment_seqnum(tdb);
 591         }
 592
 593         SAFE_FREE(p);
 594         return ret;
 595 }
 596
 597 /* store an element in the database, replacing any existing element
 598    with the same key
 599
 600    return 0 on success, -1 on failure
 601 */
 602 _PUBLIC_ int tdb_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
 603 {
 604         uint32_t hash;
 605         int ret;
 606
 607         if (tdb->read_only || tdb->traverse_read) {
 608                 tdb->ecode = TDB_ERR_RDONLY;
 609                 tdb_trace_2rec_flag_ret(tdb, "tdb_store", key, dbuf, flag, -1);
 610                 return -1;
 611         }
 612
 613         /* find which hash bucket it is in */
 614         hash = tdb->hash_fn(&key);
 615         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
 616                 return -1;
 617
 618         ret = _tdb_store(tdb, key, dbuf, flag, hash);
 619         tdb_trace_2rec_flag_ret(tdb, "tdb_store", key, dbuf, flag, ret);
 620         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
 621         return ret;
 622 }
 623
 624 /* Append to an entry. Create if not exist. */
 625 _PUBLIC_ int tdb_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf)
 626 {
 627         uint32_t hash;
 628         TDB_DATA dbuf;
 629         int ret = -1;
 630
 631         /* find which hash bucket it is in */
 632         hash = tdb->hash_fn(&key);
 633         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
 634                 return -1;
 635
 636         dbuf = _tdb_fetch(tdb, key);
 637
 638         if (dbuf.dptr == NULL) {
 639                 dbuf.dptr = (unsigned char *)malloc(new_dbuf.dsize);
 640         } else {
 641                 unsigned int new_len = dbuf.dsize + new_dbuf.dsize;
 642                 unsigned char *new_dptr;
 643
 644                 /* realloc '0' is special: don't do that. */
 645                 if (new_len == 0)
 646                         new_len = 1;
 647                 new_dptr = (unsigned char *)realloc(dbuf.dptr, new_len);
 648                 if (new_dptr == NULL) {
 649                         free(dbuf.dptr);
 650                 }
 651                 dbuf.dptr = new_dptr;
 652         }
 653
 654         if (dbuf.dptr == NULL) {
 655                 tdb->ecode = TDB_ERR_OOM;
 656                 goto failed;
 657         }
 658
 659         memcpy(dbuf.dptr + dbuf.dsize, new_dbuf.dptr, new_dbuf.dsize);
 660         dbuf.dsize += new_dbuf.dsize;
 661
 662         ret = _tdb_store(tdb, key, dbuf, 0, hash);
 663         tdb_trace_2rec_retrec(tdb, "tdb_append", key, new_dbuf, dbuf);
 664
 665 failed:
 666         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
 667         SAFE_FREE(dbuf.dptr);
 668         return ret;
 669 }
 670
 671
 672 /*
 673   return the name of the current tdb file
 674   useful for external logging functions
 675 */
 676 _PUBLIC_ const char *tdb_name(struct tdb_context *tdb)
 677 {
 678         return tdb->name;
 679 }
 680
 681 /*
 682   return the underlying file descriptor being used by tdb, or -1
 683   useful for external routines that want to check the device/inode
 684   of the fd
 685 */
 686 _PUBLIC_ int tdb_fd(struct tdb_context *tdb)
 687 {
 688         return tdb->fd;
 689 }
 690
 691 /*
 692   return the current logging function
 693   useful for external tdb routines that wish to log tdb errors
 694 */
 695 _PUBLIC_ tdb_log_func tdb_log_fn(struct tdb_context *tdb)
 696 {
 697         return tdb->log.log_fn;
 698 }
 699
 700
 701 /*
 702   get the tdb sequence number. Only makes sense if the writers opened
 703   with TDB_SEQNUM set. Note that this sequence number will wrap quite
 704   quickly, so it should only be used for a 'has something changed'
 705   test, not for code that relies on the count of the number of changes
 706   made. If you want a counter then use a tdb record.
 707
 708   The aim of this sequence number is to allow for a very lightweight
 709   test of a possible tdb change.
 710 */
 711 _PUBLIC_ int tdb_get_seqnum(struct tdb_context *tdb)
 712 {
 713         tdb_off_t seqnum=0;
 714
 715         tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
 716         return seqnum;
 717 }
 718
 719 _PUBLIC_ int tdb_hash_size(struct tdb_context *tdb)
 720 {
 721         return tdb->header.hash_size;
 722 }
 723
 724 _PUBLIC_ size_t tdb_map_size(struct tdb_context *tdb)
 725 {
 726         return tdb->map_size;
 727 }
 728
 729 _PUBLIC_ int tdb_get_flags(struct tdb_context *tdb)
 730 {
 731         return tdb->flags;
 732 }
 733
 734 _PUBLIC_ void tdb_add_flags(struct tdb_context *tdb, unsigned flags)
 735 {
 736         if ((flags & TDB_ALLOW_NESTING) &&
 737             (flags & TDB_DISALLOW_NESTING)) {
 738                 tdb->ecode = TDB_ERR_NESTING;
 739                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_add_flags: "
 740                         "allow_nesting and disallow_nesting are not allowed together!"));
 741                 return;
 742         }
 743
 744         if (flags & TDB_ALLOW_NESTING) {
 745                 tdb->flags &= ~TDB_DISALLOW_NESTING;
 746         }
 747         if (flags & TDB_DISALLOW_NESTING) {
 748                 tdb->flags &= ~TDB_ALLOW_NESTING;
 749         }
 750
 751         tdb->flags |= flags;
 752 }
 753
 754 _PUBLIC_ void tdb_remove_flags(struct tdb_context *tdb, unsigned flags)
 755 {
 756         if ((flags & TDB_ALLOW_NESTING) &&
 757             (flags & TDB_DISALLOW_NESTING)) {
 758                 tdb->ecode = TDB_ERR_NESTING;
 759                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_remove_flags: "
 760                         "allow_nesting and disallow_nesting are not allowed together!"));
 761                 return;
 762         }
 763
 764         if (flags & TDB_ALLOW_NESTING) {
 765                 tdb->flags |= TDB_DISALLOW_NESTING;
 766         }
 767         if (flags & TDB_DISALLOW_NESTING) {
 768                 tdb->flags |= TDB_ALLOW_NESTING;
 769         }
 770
 771         tdb->flags &= ~flags;
 772 }
 773
 774
 775 /*
 776   enable sequence number handling on an open tdb
 777 */
 778 _PUBLIC_ void tdb_enable_seqnum(struct tdb_context *tdb)
 779 {
 780         tdb->flags |= TDB_SEQNUM;
 781 }
 782
 783
 784 /*
 785   add a region of the file to the freelist. Length is the size of the region in bytes,
 786   which includes the free list header that needs to be added
 787  */
 788 static int tdb_free_region(struct tdb_context *tdb, tdb_off_t offset, ssize_t length)
 789 {
 790         struct tdb_record rec;
 791         if (length <= sizeof(rec)) {
 792                 /* the region is not worth adding */
 793                 return 0;
 794         }
 795         if (length + offset > tdb->map_size) {
 796                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_free_region: adding region beyond end of file\n"));
 797                 return -1;
 798         }
 799         memset(&rec,'\0',sizeof(rec));
 800         rec.rec_len = length - sizeof(rec);
 801         if (tdb_free(tdb, offset, &rec) == -1) {
 802                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_free_region: failed to add free record\n"));
 803                 return -1;
 804         }
 805         return 0;
 806 }
 807
 808 /*
 809   wipe the entire database, deleting all records. This can be done
 810   very fast by using a allrecord lock. The entire data portion of the
 811   file becomes a single entry in the freelist.
 812
 813   This code carefully steps around the recovery area, leaving it alone
 814  */
 815 _PUBLIC_ int tdb_wipe_all(struct tdb_context *tdb)
 816 {
 817         int i;
 818         tdb_off_t offset = 0;
 819         ssize_t data_len;
 820         tdb_off_t recovery_head;
 821         tdb_len_t recovery_size = 0;
 822
 823         if (tdb_lockall(tdb) != 0) {
 824                 return -1;
 825         }
 826
 827         tdb_trace(tdb, "tdb_wipe_all");
 828
 829         /* see if the tdb has a recovery area, and remember its size
 830            if so. We don't want to lose this as otherwise each
 831            tdb_wipe_all() in a transaction will increase the size of
 832            the tdb by the size of the recovery area */
 833         if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
 834                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_wipe_all: failed to read recovery head\n"));
 835                 goto failed;
 836         }
 837
 838         if (recovery_head != 0) {
 839                 struct tdb_record rec;
 840                 if (tdb->methods->tdb_read(tdb, recovery_head, &rec, sizeof(rec), DOCONV()) == -1) {
 841                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_wipe_all: failed to read recovery record\n"));
 842                         return -1;
 843                 }
 844                 recovery_size = rec.rec_len + sizeof(rec);
 845         }
 846
 847         /* wipe the hashes */
 848         for (i=0;i<tdb->header.hash_size;i++) {
 849                 if (tdb_ofs_write(tdb, TDB_HASH_TOP(i), &offset) == -1) {
 850                         TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write hash %d\n", i));
 851                         goto failed;
 852                 }
 853         }
 854
 855         /* wipe the freelist */
 856         if (tdb_ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
 857                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write freelist\n"));
 858                 goto failed;
 859         }
 860
 861         /* add all the rest of the file to the freelist, possibly leaving a gap
 862            for the recovery area */
 863         if (recovery_size == 0) {
 864                 /* the simple case - the whole file can be used as a freelist */
 865                 data_len = (tdb->map_size - TDB_DATA_START(tdb->header.hash_size));
 866                 if (tdb_free_region(tdb, TDB_DATA_START(tdb->header.hash_size), data_len) != 0) {
 867                         goto failed;
 868                 }
 869         } else {
 870                 /* we need to add two freelist entries - one on either
 871                    side of the recovery area
 872
 873                    Note that we cannot shift the recovery area during
 874                    this operation. Only the transaction.c code may
 875                    move the recovery area or we risk subtle data
 876                    corruption
 877                 */
 878                 data_len = (recovery_head - TDB_DATA_START(tdb->header.hash_size));
 879                 if (tdb_free_region(tdb, TDB_DATA_START(tdb->header.hash_size), data_len) != 0) {
 880                         goto failed;
 881                 }
 882                 /* and the 2nd free list entry after the recovery area - if any */
 883                 data_len = tdb->map_size - (recovery_head+recovery_size);
 884                 if (tdb_free_region(tdb, recovery_head+recovery_size, data_len) != 0) {
 885                         goto failed;
 886                 }
 887         }
 888
 889         tdb_increment_seqnum_nonblock(tdb);
 890
 891         if (tdb_unlockall(tdb) != 0) {
 892                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to unlock\n"));
 893                 goto failed;
 894         }
 895
 896         return 0;
 897
 898 failed:
 899         tdb_unlockall(tdb);
 900         return -1;
 901 }
 902
 903 struct traverse_state { /* context handed to repack_traverse() through its private_data pointer */
 904         bool error; /* set to true by repack_traverse() when storing a record into dest_db fails */
 905         struct tdb_context *dest_db; /* database that repack_traverse() stores each visited record into */
 906 };
 907
 908 /*
 909   traverse function for repacking
 910  */
 911 static int repack_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private_data)
 912 {
 913         struct traverse_state *state = (struct traverse_state *)private_data;
 914         if (tdb_store(state->dest_db, key, data, TDB_INSERT) != 0) {
 915                 state->error = true;
 916                 return -1;
 917         }
 918         return 0;
 919 }
 920
 921 /*
 922   repack a tdb
 923  */
 924 _PUBLIC_ int tdb_repack(struct tdb_context *tdb)
 925 {
 926         struct tdb_context *tmp_db;
 927         struct traverse_state state;
 928
 929         tdb_trace(tdb, "tdb_repack");
 930
 931         if (tdb_transaction_start(tdb) != 0) {
 932                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to start transaction\n"));
 933                 return -1;
 934         }
 935
 936         tmp_db = tdb_open("tmpdb", tdb_hash_size(tdb), TDB_INTERNAL, O_RDWR|O_CREAT, 0);
 937         if (tmp_db == NULL) {
 938                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to create tmp_db\n"));
 939                 tdb_transaction_cancel(tdb);
 940                 return -1;
 941         }
 942
 943         state.error = false;
 944         state.dest_db = tmp_db;
 945
 946         if (tdb_traverse_read(tdb, repack_traverse, &state) == -1) {
 947                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to traverse copying out\n"));
 948                 tdb_transaction_cancel(tdb);
 949                 tdb_close(tmp_db);
 950                 return -1;
 951         }
 952
 953         if (state.error) {
 954                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Error during traversal\n"));
 955                 tdb_transaction_cancel(tdb);
 956                 tdb_close(tmp_db);
 957                 return -1;
 958         }
 959
 960         if (tdb_wipe_all(tdb) != 0) {
 961                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to wipe database\n"));
 962                 tdb_transaction_cancel(tdb);
 963                 tdb_close(tmp_db);
 964                 return -1;
 965         }
 966
 967         state.error = false;
 968         state.dest_db = tdb;
 969
 970         if (tdb_traverse_read(tmp_db, repack_traverse, &state) == -1) {
 971                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to traverse copying back\n"));
 972                 tdb_transaction_cancel(tdb);
 973                 tdb_close(tmp_db);
 974                 return -1;
 975         }
 976
 977         if (state.error) {
 978                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Error during second traversal\n"));
 979                 tdb_transaction_cancel(tdb);
 980                 tdb_close(tmp_db);
 981                 return -1;
 982         }
 983
 984         tdb_close(tmp_db);
 985
 986         if (tdb_transaction_commit(tdb) != 0) {
 987                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to commit\n"));
 988                 return -1;
 989         }
 990
 991         return 0;
 992 }
 993
 994 /* Even on files, we can get partial writes due to signals. */
 995 bool tdb_write_all(int fd, const void *buf, size_t count)
 996 {
 997         while (count) {
 998                 ssize_t ret;
 999                 ret = write(fd, buf, count);
1000                 if (ret < 0)
1001                         return false;
1002                 buf = (const char *)buf + ret;
1003                 count -= ret;
1004         }
1005         return true;
1006 }
1007
1008 #ifdef TDB_TRACE
1009 static void tdb_trace_write(struct tdb_context *tdb, const char *str)
1010 {
1011         if (!tdb_write_alltdb->tracefd, str, strlen(str)) {
1012                 close(tdb->tracefd);
1013                 tdb->tracefd = -1;
1014         }
1015 }
1016
1017 static void tdb_trace_start(struct tdb_context *tdb)
1018 {
1019         tdb_off_t seqnum=0;
1020         char msg[sizeof(tdb_off_t) * 4 + 1];
1021
1022         tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
1023         snprintf(msg, sizeof(msg), "%u ", seqnum);
1024         tdb_trace_write(tdb, msg);
1025 }
1026
1027 static void tdb_trace_end(struct tdb_context *tdb)
1028 { /* finish the current trace line */
1029         tdb_trace_write(tdb, "\n"); /* a lone newline terminates the line */
1030 }
1031
/*
  finish a trace line with the integer result of the traced
  operation, written as " = <ret>" followed by a newline
 */
static void tdb_trace_end_ret(struct tdb_context *tdb, int ret)
{
        char tail[sizeof(ret) * 4 + 4];

        snprintf(tail, sizeof(tail), " = %i\n", ret);

        tdb_trace_write(tdb, tail);
}
1038
1039 static void tdb_trace_record(struct tdb_context *tdb, TDB_DATA rec)
1040 {
1041         char msg[20 + rec.dsize*2], *p;
1042         unsigned int i;
1043
1044         /* We differentiate zero-length records from non-existent ones. */
1045         if (rec.dptr == NULL) {
1046                 tdb_trace_write(tdb, " NULL");
1047                 return;
1048         }
1049
1050         /* snprintf here is purely cargo-cult programming. */
1051         p = msg;
1052         p += snprintf(p, sizeof(msg), " %zu:", rec.dsize);
1053         for (i = 0; i < rec.dsize; i++)
1054                 p += snprintf(p, 2, "%02x", rec.dptr[i]);
1055
1056         tdb_trace_write(tdb, msg);
1057 }
1058
1059 void tdb_trace(struct tdb_context *tdb, const char *op)
1060 { /* write one complete trace line: "<seqnum> <op>" plus newline */
1061         tdb_trace_start(tdb); /* "<seqnum> " prefix, read from TDB_SEQNUM_OFS */
1062         tdb_trace_write(tdb, op);
1063         tdb_trace_end(tdb); /* newline */
1064 }
1065
1066 void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op)
1067 {
1068         char msg[sizeof(tdb_off_t) * 4 + 1];
1069
1070         snprintf(msg, sizeof(msg), "%u ", seqnum);
1071         tdb_trace_write(tdb, msg);
1072         tdb_trace_write(tdb, op);
1073         tdb_trace_end(tdb);
1074 }
1075
/*
  write one trace line describing an open call:
  "<seqnum> <op> <hash_size> 0x<tdb_flags> 0x<open_flags>"
  (flags in hex) followed by a newline
 */
void tdb_trace_open(struct tdb_context *tdb, const char *op,
                    unsigned hash_size, unsigned tdb_flags, unsigned open_flags)
{
        char details[128];

        tdb_trace_start(tdb);

        snprintf(details, sizeof(details),
                 "%s %u 0x%x 0x%x", op, hash_size, tdb_flags, open_flags);
        tdb_trace_write(tdb, details);

        tdb_trace_end(tdb);
}
1087
1088 void tdb_trace_ret(struct tdb_context *tdb, const char *op, int ret)
1089 { /* write one trace line: "<seqnum> <op> = <ret>" plus newline */
1090         tdb_trace_start(tdb); /* "<seqnum> " prefix */
1091         tdb_trace_write(tdb, op);
1092         tdb_trace_end_ret(tdb, ret); /* " = <ret>" and the newline */
1093 }
1094
1095 void tdb_trace_retrec(struct tdb_context *tdb, const char *op, TDB_DATA ret)
1096 { /* write one trace line for an operation whose result is a record: "<seqnum> <op> =<ret record>" plus newline */
1097         tdb_trace_start(tdb); /* "<seqnum> " prefix */
1098         tdb_trace_write(tdb, op);
1099         tdb_trace_write(tdb, " ="); /* no trailing space: tdb_trace_record() output starts with one */
1100         tdb_trace_record(tdb, ret);
1101         tdb_trace_end(tdb); /* newline */
1102 }
1103
1104 void tdb_trace_1rec(struct tdb_context *tdb, const char *op,
1105                     TDB_DATA rec)
1106 { /* write one trace line for an operation taking one record: "<seqnum> <op><rec>" plus newline */
1107         tdb_trace_start(tdb); /* "<seqnum> " prefix */
1108         tdb_trace_write(tdb, op);
1109         tdb_trace_record(tdb, rec); /* record text as formatted by tdb_trace_record() */
1110         tdb_trace_end(tdb); /* newline */
1111 }
1112
1113 void tdb_trace_1rec_ret(struct tdb_context *tdb, const char *op,
1114                         TDB_DATA rec, int ret)
1115 { /* write one trace line for an operation taking one record and returning an int: "<seqnum> <op><rec> = <ret>" plus newline */
1116         tdb_trace_start(tdb); /* "<seqnum> " prefix */
1117         tdb_trace_write(tdb, op);
1118         tdb_trace_record(tdb, rec); /* record text as formatted by tdb_trace_record() */
1119         tdb_trace_end_ret(tdb, ret); /* " = <ret>" and the newline */
1120 }
1121
1122 void tdb_trace_1rec_retrec(struct tdb_context *tdb, const char *op,
1123                            TDB_DATA rec, TDB_DATA ret)
1124 { /* write one trace line for an operation taking one record and returning a record: "<seqnum> <op><rec> =<ret record>" plus newline */
1125         tdb_trace_start(tdb); /* "<seqnum> " prefix */
1126         tdb_trace_write(tdb, op);
1127         tdb_trace_record(tdb, rec); /* argument record */
1128         tdb_trace_write(tdb, " ="); /* separates the argument from the result */
1129         tdb_trace_record(tdb, ret); /* result record */
1130         tdb_trace_end(tdb); /* newline */
1131 }
1132
1133 void tdb_trace_2rec_flag_ret(struct tdb_context *tdb, const char *op,
1134                              TDB_DATA rec1, TDB_DATA rec2, unsigned flag,
1135                              int ret)
1136 {
1137         char msg[1 + sizeof(ret) * 4];
1138
1139         snprintf(msg, sizeof(msg), " %#x", flag);
1140         tdb_trace_start(tdb);
1141         tdb_trace_write(tdb, op);
1142         tdb_trace_record(tdb, rec1);
1143         tdb_trace_record(tdb, rec2);
1144         tdb_trace_write(tdb, msg);
1145         tdb_trace_end_ret(tdb, ret);
1146 }
1147
1148 void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op,
1149                            TDB_DATA rec1, TDB_DATA rec2, TDB_DATA ret)
1150 { /* write one trace line for an operation taking two records and returning a record: "<seqnum> <op><rec1><rec2> =<ret record>" plus newline */
1151         tdb_trace_start(tdb); /* "<seqnum> " prefix */
1152         tdb_trace_write(tdb, op);
1153         tdb_trace_record(tdb, rec1); /* first argument record */
1154         tdb_trace_record(tdb, rec2); /* second argument record */
1155         tdb_trace_write(tdb, " ="); /* separates the arguments from the result */
1156         tdb_trace_record(tdb, ret); /* result record */
1157         tdb_trace_end(tdb); /* newline */
1158 }
1159 #endif