lib/tdb/common/tdb.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Andrew Tridgell              1999-2005
   7    Copyright (C) Paul `Rusty' Russell              2000
   8    Copyright (C) Jeremy Allison                    2000-2003
   9
  10      ** NOTE! The following LGPL license applies to the tdb
  11      ** library. This does NOT imply that all of Samba is released
  12      ** under the LGPL
  13
  14    This library is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU Lesser General Public
  16    License as published by the Free Software Foundation; either
  17    version 3 of the License, or (at your option) any later version.
  18
  19    This library is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    Lesser General Public License for more details.
  23
  24    You should have received a copy of the GNU Lesser General Public
  25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  26 */
  27
  28 #include "tdb_private.h"
  29
/*
  shared TDB_DATA value with static storage and no explicit initializer;
  _tdb_fetch() returns it when no matching record is found
*/
_PUBLIC_ TDB_DATA tdb_null;
  31
  32 /*
  33   non-blocking increment of the tdb sequence number if the tdb has been opened using
  34   the TDB_SEQNUM flag
  35 */
  36 _PUBLIC_ void tdb_increment_seqnum_nonblock(struct tdb_context *tdb)
  37 {
  38         tdb_off_t seqnum=0;
  39
  40         if (!(tdb->flags & TDB_SEQNUM)) {
  41                 return;
  42         }
  43
  44         /* we ignore errors from this, as we have no sane way of
  45            dealing with them.
  46         */
  47         tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
  48         seqnum++;
  49         tdb_ofs_write(tdb, TDB_SEQNUM_OFS, &seqnum);
  50 }
  51
  52 /*
  53   increment the tdb sequence number if the tdb has been opened using
  54   the TDB_SEQNUM flag
  55 */
  56 static void tdb_increment_seqnum(struct tdb_context *tdb)
  57 {
  58         if (!(tdb->flags & TDB_SEQNUM)) {
  59                 return;
  60         }
  61
  62         if (tdb_nest_lock(tdb, TDB_SEQNUM_OFS, F_WRLCK,
  63                           TDB_LOCK_WAIT|TDB_LOCK_PROBE) != 0) {
  64                 return;
  65         }
  66
  67         tdb_increment_seqnum_nonblock(tdb);
  68
  69         tdb_nest_unlock(tdb, TDB_SEQNUM_OFS, F_WRLCK, false);
  70 }
  71
  72 static int tdb_key_compare(TDB_DATA key, TDB_DATA data, void *private_data)
  73 {
  74         return memcmp(data.dptr, key.dptr, data.dsize);
  75 }
  76
  77 /* Returns 0 on fail.  On success, return offset of record, and fills
  78    in rec */
  79 static tdb_off_t tdb_find(struct tdb_context *tdb, TDB_DATA key, uint32_t hash,
  80                         struct tdb_record *r)
  81 {
  82         tdb_off_t rec_ptr;
  83
  84         /* read in the hash top */
  85         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
  86                 return 0;
  87
  88         /* keep looking until we find the right record */
  89         while (rec_ptr) {
  90                 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
  91                         return 0;
  92
  93                 if (!TDB_DEAD(r) && hash==r->full_hash
  94                     && key.dsize==r->key_len
  95                     && tdb_parse_data(tdb, key, rec_ptr + sizeof(*r),
  96                                       r->key_len, tdb_key_compare,
  97                                       NULL) == 0) {
  98                         return rec_ptr;
  99                 }
 100                 /* detect tight infinite loop */
 101                 if (rec_ptr == r->next) {
 102                         tdb->ecode = TDB_ERR_CORRUPT;
 103                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_find: loop detected.\n"));
 104                         return 0;
 105                 }
 106                 rec_ptr = r->next;
 107         }
 108         tdb->ecode = TDB_ERR_NOEXIST;
 109         return 0;
 110 }
 111
 112 /* As tdb_find, but if you succeed, keep the lock */
 113 tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype,
 114                            struct tdb_record *rec)
 115 {
 116         uint32_t rec_ptr;
 117
 118         if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
 119                 return 0;
 120         if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
 121                 tdb_unlock(tdb, BUCKET(hash), locktype);
 122         return rec_ptr;
 123 }
 124
/* forward declaration; the definition appears further down in this file */
static TDB_DATA _tdb_fetch(struct tdb_context *tdb, TDB_DATA key);
 126
 127 static int tdb_update_hash_cmp(TDB_DATA key, TDB_DATA data, void *private_data)
 128 {
 129         TDB_DATA *dbuf = (TDB_DATA *)private_data;
 130
 131         if (dbuf->dsize != data.dsize) {
 132                 return -1;
 133         }
 134         if (memcmp(dbuf->dptr, data.dptr, data.dsize) != 0) {
 135                 return -1;
 136         }
 137         return 0;
 138 }
 139
 140 /* update an entry in place - this only works if the new data size
 141    is <= the old data size and the key exists.
 142    on failure return -1.
 143 */
 144 static int tdb_update_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, TDB_DATA dbuf)
 145 {
 146         struct tdb_record rec;
 147         tdb_off_t rec_ptr;
 148
 149         /* find entry */
 150         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
 151                 return -1;
 152
 153         /* it could be an exact duplicate of what is there - this is
 154          * surprisingly common (eg. with a ldb re-index). */
 155         if (rec.key_len == key.dsize &&
 156             rec.data_len == dbuf.dsize &&
 157             rec.full_hash == hash &&
 158             tdb_parse_record(tdb, key, tdb_update_hash_cmp, &dbuf) == 0) {
 159                 return 0;
 160         }
 161
 162         /* must be long enough key, data and tailer */
 163         if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off_t)) {
 164                 tdb->ecode = TDB_SUCCESS; /* Not really an error */
 165                 return -1;
 166         }
 167
 168         if (tdb->methods->tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
 169                       dbuf.dptr, dbuf.dsize) == -1)
 170                 return -1;
 171
 172         if (dbuf.dsize != rec.data_len) {
 173                 /* update size */
 174                 rec.data_len = dbuf.dsize;
 175                 return tdb_rec_write(tdb, rec_ptr, &rec);
 176         }
 177
 178         return 0;
 179 }
 180
 181 /* find an entry in the database given a key */
 182 /* If an entry doesn't exist tdb_err will be set to
 183  * TDB_ERR_NOEXIST. If a key has no data attached
 184  * then the TDB_DATA will have zero length but
 185  * a non-zero pointer
 186  */
 187 static TDB_DATA _tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
 188 {
 189         tdb_off_t rec_ptr;
 190         struct tdb_record rec;
 191         TDB_DATA ret;
 192         uint32_t hash;
 193
 194         /* find which hash bucket it is in */
 195         hash = tdb->hash_fn(&key);
 196         if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
 197                 return tdb_null;
 198
 199         ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
 200                                   rec.data_len);
 201         ret.dsize = rec.data_len;
 202         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 203         return ret;
 204 }
 205
 206 _PUBLIC_ TDB_DATA tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
 207 {
 208         TDB_DATA ret = _tdb_fetch(tdb, key);
 209
 210         tdb_trace_1rec_retrec(tdb, "tdb_fetch", key, ret);
 211         return ret;
 212 }
 213
 214 /*
 215  * Find an entry in the database and hand the record's data to a parsing
 216  * function. The parsing function is executed under the chain read lock, so it
 217  * should be fast and should not block on other syscalls.
 218  *
 219  * DON'T CALL OTHER TDB CALLS FROM THE PARSER, THIS MIGHT LEAD TO SEGFAULTS.
 220  *
 221  * For mmapped tdb's that do not have a transaction open it points the parsing
 222  * function directly at the mmap area, it avoids the malloc/memcpy in this
 223  * case. If a transaction is open or no mmap is available, it has to do
 224  * malloc/read/parse/free.
 225  *
 226  * This is interesting for all readers of potentially large data structures in
 227  * the tdb records, ldb indexes being one example.
 228  *
 229  * Return -1 if the record was not found.
 230  */
 231
 232 _PUBLIC_ int tdb_parse_record(struct tdb_context *tdb, TDB_DATA key,
 233                      int (*parser)(TDB_DATA key, TDB_DATA data,
 234                                    void *private_data),
 235                      void *private_data)
 236 {
 237         tdb_off_t rec_ptr;
 238         struct tdb_record rec;
 239         int ret;
 240         uint32_t hash;
 241
 242         /* find which hash bucket it is in */
 243         hash = tdb->hash_fn(&key);
 244
 245         if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec))) {
 246                 /* record not found */
 247                 tdb_trace_1rec_ret(tdb, "tdb_parse_record", key, -1);
 248                 tdb->ecode = TDB_ERR_NOEXIST;
 249                 return -1;
 250         }
 251         tdb_trace_1rec_ret(tdb, "tdb_parse_record", key, 0);
 252
 253         ret = tdb_parse_data(tdb, key, rec_ptr + sizeof(rec) + rec.key_len,
 254                              rec.data_len, parser, private_data);
 255
 256         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 257
 258         return ret;
 259 }
 260
 261 /* check if an entry in the database exists
 262
 263    note that 1 is returned if the key is found and 0 is returned if not found
 264    this doesn't match the conventions in the rest of this module, but is
 265    compatible with gdbm
 266 */
 267 static int tdb_exists_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
 268 {
 269         struct tdb_record rec;
 270
 271         if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
 272                 return 0;
 273         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
 274         return 1;
 275 }
 276
 277 _PUBLIC_ int tdb_exists(struct tdb_context *tdb, TDB_DATA key)
 278 {
 279         uint32_t hash = tdb->hash_fn(&key);
 280         int ret;
 281
 282         ret = tdb_exists_hash(tdb, key, hash);
 283         tdb_trace_1rec_ret(tdb, "tdb_exists", key, ret);
 284         return ret;
 285 }
 286
 287 /* actually delete an entry in the database given the offset */
 288 int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct tdb_record *rec)
 289 {
 290         tdb_off_t last_ptr, i;
 291         struct tdb_record lastrec;
 292
 293         if (tdb->read_only || tdb->traverse_read) return -1;
 294
 295         if (((tdb->traverse_write != 0) && (!TDB_DEAD(rec))) ||
 296             tdb_write_lock_record(tdb, rec_ptr) == -1) {
 297                 /* Someone traversing here: mark it as dead */
 298                 rec->magic = TDB_DEAD_MAGIC;
 299                 return tdb_rec_write(tdb, rec_ptr, rec);
 300         }
 301         if (tdb_write_unlock_record(tdb, rec_ptr) != 0)
 302                 return -1;
 303
 304         /* find previous record in hash chain */
 305         if (tdb_ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
 306                 return -1;
 307         for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
 308                 if (tdb_rec_read(tdb, i, &lastrec) == -1)
 309                         return -1;
 310
 311         /* unlink it: next ptr is at start of record. */
 312         if (last_ptr == 0)
 313                 last_ptr = TDB_HASH_TOP(rec->full_hash);
 314         if (tdb_ofs_write(tdb, last_ptr, &rec->next) == -1)
 315                 return -1;
 316
 317         /* recover the space */
 318         if (tdb_free(tdb, rec_ptr, rec) == -1)
 319                 return -1;
 320         return 0;
 321 }
 322
 323 static int tdb_count_dead(struct tdb_context *tdb, uint32_t hash)
 324 {
 325         int res = 0;
 326         tdb_off_t rec_ptr;
 327         struct tdb_record rec;
 328
 329         /* read in the hash top */
 330         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 331                 return 0;
 332
 333         while (rec_ptr) {
 334                 if (tdb_rec_read(tdb, rec_ptr, &rec) == -1)
 335                         return 0;
 336
 337                 if (rec.magic == TDB_DEAD_MAGIC) {
 338                         res += 1;
 339                 }
 340                 rec_ptr = rec.next;
 341         }
 342         return res;
 343 }
 344
 345 /*
 346  * Purge all DEAD records from a hash chain
 347  */
 348 int tdb_purge_dead(struct tdb_context *tdb, uint32_t hash)
 349 {
 350         int res = -1;
 351         struct tdb_record rec;
 352         tdb_off_t rec_ptr;
 353
 354         if (tdb_lock_nonblock(tdb, -1, F_WRLCK) == -1) {
 355                 /*
 356                  * Don't block the freelist if not strictly necessary
 357                  */
 358                 return -1;
 359         }
 360
 361         /* read in the hash top */
 362         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 363                 goto fail;
 364
 365         while (rec_ptr) {
 366                 tdb_off_t next;
 367
 368                 if (tdb_rec_read(tdb, rec_ptr, &rec) == -1) {
 369                         goto fail;
 370                 }
 371
 372                 next = rec.next;
 373
 374                 if (rec.magic == TDB_DEAD_MAGIC
 375                     && tdb_do_delete(tdb, rec_ptr, &rec) == -1) {
 376                         goto fail;
 377                 }
 378                 rec_ptr = next;
 379         }
 380         res = 0;
 381  fail:
 382         tdb_unlock(tdb, -1, F_WRLCK);
 383         return res;
 384 }
 385
 386 /* delete an entry in the database given a key */
 387 static int tdb_delete_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
 388 {
 389         tdb_off_t rec_ptr;
 390         struct tdb_record rec;
 391         int ret;
 392
 393         rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec);
 394         if (rec_ptr == 0) {
 395                 return -1;
 396         }
 397
 398         if (tdb->max_dead_records != 0) {
 399
 400                 uint32_t magic = TDB_DEAD_MAGIC;
 401
 402                 /*
 403                  * Allow for some dead records per hash chain, mainly for
 404                  * tdb's with a very high create/delete rate like locking.tdb.
 405                  */
 406
 407                 if (tdb_count_dead(tdb, hash) >= tdb->max_dead_records) {
 408                         /*
 409                          * Don't let the per-chain freelist grow too large,
 410                          * delete all existing dead records
 411                          */
 412                         tdb_purge_dead(tdb, hash);
 413                 }
 414
 415                 /*
 416                  * Just mark the record as dead.
 417                  */
 418                 ret = tdb_ofs_write(
 419                         tdb, rec_ptr + offsetof(struct tdb_record, magic),
 420                         &magic);
 421         }
 422         else {
 423                 ret = tdb_do_delete(tdb, rec_ptr, &rec);
 424         }
 425
 426         if (ret == 0) {
 427                 tdb_increment_seqnum(tdb);
 428         }
 429
 430         if (tdb_unlock(tdb, BUCKET(hash), F_WRLCK) != 0)
 431                 TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_delete: WARNING tdb_unlock failed!\n"));
 432         return ret;
 433 }
 434
 435 _PUBLIC_ int tdb_delete(struct tdb_context *tdb, TDB_DATA key)
 436 {
 437         uint32_t hash = tdb->hash_fn(&key);
 438         int ret;
 439
 440         ret = tdb_delete_hash(tdb, key, hash);
 441         tdb_trace_1rec_ret(tdb, "tdb_delete", key, ret);
 442         return ret;
 443 }
 444
 445 /*
 446  * See if we have a dead record around with enough space
 447  */
 448 tdb_off_t tdb_find_dead(struct tdb_context *tdb, uint32_t hash,
 449                         struct tdb_record *r, tdb_len_t length,
 450                         tdb_off_t *p_last_ptr)
 451 {
 452         tdb_off_t rec_ptr, last_ptr;
 453         tdb_off_t best_rec_ptr = 0;
 454         tdb_off_t best_last_ptr = 0;
 455         struct tdb_record best = { .rec_len = UINT32_MAX };
 456
 457         length += sizeof(tdb_off_t); /* tailer */
 458
 459         last_ptr = TDB_HASH_TOP(hash);
 460
 461         /* read in the hash top */
 462         if (tdb_ofs_read(tdb, last_ptr, &rec_ptr) == -1)
 463                 return 0;
 464
 465         /* keep looking until we find the right record */
 466         while (rec_ptr) {
 467                 if (tdb_rec_read(tdb, rec_ptr, r) == -1)
 468                         return 0;
 469
 470                 if (TDB_DEAD(r) && (r->rec_len >= length) &&
 471                     (r->rec_len < best.rec_len)) {
 472                         best_rec_ptr = rec_ptr;
 473                         best_last_ptr = last_ptr;
 474                         best = *r;
 475                 }
 476                 last_ptr = rec_ptr;
 477                 rec_ptr = r->next;
 478         }
 479
 480         if (best.rec_len == UINT32_MAX) {
 481                 return 0;
 482         }
 483
 484         *r = best;
 485         *p_last_ptr = best_last_ptr;
 486         return best_rec_ptr;
 487 }
 488
 489 static int _tdb_store(struct tdb_context *tdb, TDB_DATA key,
 490                        TDB_DATA dbuf, int flag, uint32_t hash)
 491 {
 492         struct tdb_record rec;
 493         tdb_off_t rec_ptr;
 494         int ret = -1;
 495
 496         /* check for it existing, on insert. */
 497         if (flag == TDB_INSERT) {
 498                 if (tdb_exists_hash(tdb, key, hash)) {
 499                         tdb->ecode = TDB_ERR_EXISTS;
 500                         goto fail;
 501                 }
 502         } else {
 503                 /* first try in-place update, on modify or replace. */
 504                 if (tdb_update_hash(tdb, key, hash, dbuf) == 0) {
 505                         goto done;
 506                 }
 507                 if (tdb->ecode == TDB_ERR_NOEXIST &&
 508                     flag == TDB_MODIFY) {
 509                         /* if the record doesn't exist and we are in TDB_MODIFY mode then
 510                          we should fail the store */
 511                         goto fail;
 512                 }
 513         }
 514         /* reset the error code potentially set by the tdb_update() */
 515         tdb->ecode = TDB_SUCCESS;
 516
 517         /* delete any existing record - if it doesn't exist we don't
 518            care.  Doing this first reduces fragmentation, and avoids
 519            coalescing with `allocated' block before it's updated. */
 520         if (flag != TDB_INSERT)
 521                 tdb_delete_hash(tdb, key, hash);
 522
 523         if (tdb->max_dead_records != 0) {
 524                 tdb_off_t last_ptr;
 525                 /*
 526                  * Allow for some dead records per hash chain, look if we can
 527                  * find one that can hold the new record. We need enough space
 528                  * for key, data and tailer. If we find one, we don't have to
 529                  * consult the central freelist.
 530                  */
 531                 rec_ptr = tdb_find_dead(tdb, hash, &rec,
 532                                         key.dsize + dbuf.dsize,
 533                                         &last_ptr);
 534
 535                 if (rec_ptr != 0) {
 536                         rec.key_len = key.dsize;
 537                         rec.data_len = dbuf.dsize;
 538                         rec.full_hash = hash;
 539                         rec.magic = TDB_MAGIC;
 540                         if (tdb_rec_write(tdb, rec_ptr, &rec) == -1
 541                             || tdb->methods->tdb_write(
 542                                     tdb, rec_ptr + sizeof(rec),
 543                                     key.dptr, key.dsize) == -1
 544                             || tdb->methods->tdb_write(
 545                                     tdb, rec_ptr + sizeof(rec) + key.dsize,
 546                                     dbuf.dptr, dbuf.dsize) == -1) {
 547                                 goto fail;
 548                         }
 549                         goto done;
 550                 }
 551         }
 552
 553         /* we have to allocate some space */
 554         rec_ptr = tdb_allocate(tdb, hash, key.dsize + dbuf.dsize, &rec);
 555
 556         if (rec_ptr == 0) {
 557                 goto fail;
 558         }
 559
 560         /* Read hash top into next ptr */
 561         if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
 562                 goto fail;
 563
 564         rec.key_len = key.dsize;
 565         rec.data_len = dbuf.dsize;
 566         rec.full_hash = hash;
 567         rec.magic = TDB_MAGIC;
 568
 569         /* write out and point the top of the hash chain at it */
 570         if (tdb_rec_write(tdb, rec_ptr, &rec) == -1
 571             || tdb->methods->tdb_write(tdb, rec_ptr+sizeof(rec),
 572                                        key.dptr, key.dsize) == -1
 573             || tdb->methods->tdb_write(tdb, rec_ptr+sizeof(rec)+key.dsize,
 574                                        dbuf.dptr, dbuf.dsize) == -1
 575             || tdb_ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
 576                 /* Need to tdb_unallocate() here */
 577                 goto fail;
 578         }
 579
 580  done:
 581         ret = 0;
 582  fail:
 583         if (ret == 0) {
 584                 tdb_increment_seqnum(tdb);
 585         }
 586         return ret;
 587 }
 588
 589 /* store an element in the database, replacing any existing element
 590    with the same key
 591
 592    return 0 on success, -1 on failure
 593 */
 594 _PUBLIC_ int tdb_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
 595 {
 596         uint32_t hash;
 597         int ret;
 598
 599         if (tdb->read_only || tdb->traverse_read) {
 600                 tdb->ecode = TDB_ERR_RDONLY;
 601                 tdb_trace_2rec_flag_ret(tdb, "tdb_store", key, dbuf, flag, -1);
 602                 return -1;
 603         }
 604
 605         /* find which hash bucket it is in */
 606         hash = tdb->hash_fn(&key);
 607         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
 608                 return -1;
 609
 610         ret = _tdb_store(tdb, key, dbuf, flag, hash);
 611         tdb_trace_2rec_flag_ret(tdb, "tdb_store", key, dbuf, flag, ret);
 612         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
 613         return ret;
 614 }
 615
 616 /* Append to an entry. Create if not exist. */
 617 _PUBLIC_ int tdb_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf)
 618 {
 619         uint32_t hash;
 620         TDB_DATA dbuf;
 621         int ret = -1;
 622
 623         /* find which hash bucket it is in */
 624         hash = tdb->hash_fn(&key);
 625         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
 626                 return -1;
 627
 628         dbuf = _tdb_fetch(tdb, key);
 629
 630         if (dbuf.dptr == NULL) {
 631                 dbuf.dptr = (unsigned char *)malloc(new_dbuf.dsize);
 632         } else {
 633                 unsigned int new_len = dbuf.dsize + new_dbuf.dsize;
 634                 unsigned char *new_dptr;
 635
 636                 /* realloc '0' is special: don't do that. */
 637                 if (new_len == 0)
 638                         new_len = 1;
 639                 new_dptr = (unsigned char *)realloc(dbuf.dptr, new_len);
 640                 if (new_dptr == NULL) {
 641                         free(dbuf.dptr);
 642                 }
 643                 dbuf.dptr = new_dptr;
 644         }
 645
 646         if (dbuf.dptr == NULL) {
 647                 tdb->ecode = TDB_ERR_OOM;
 648                 goto failed;
 649         }
 650
 651         memcpy(dbuf.dptr + dbuf.dsize, new_dbuf.dptr, new_dbuf.dsize);
 652         dbuf.dsize += new_dbuf.dsize;
 653
 654         ret = _tdb_store(tdb, key, dbuf, 0, hash);
 655         tdb_trace_2rec_retrec(tdb, "tdb_append", key, new_dbuf, dbuf);
 656
 657 failed:
 658         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
 659         SAFE_FREE(dbuf.dptr);
 660         return ret;
 661 }
 662
 663
 664 /*
 665   return the name of the current tdb file
 666   useful for external logging functions
 667 */
 668 _PUBLIC_ const char *tdb_name(struct tdb_context *tdb)
 669 {
 670         return tdb->name;
 671 }
 672
 673 /*
 674   return the underlying file descriptor being used by tdb, or -1
 675   useful for external routines that want to check the device/inode
 676   of the fd
 677 */
 678 _PUBLIC_ int tdb_fd(struct tdb_context *tdb)
 679 {
 680         return tdb->fd;
 681 }
 682
 683 /*
 684   return the current logging function
 685   useful for external tdb routines that wish to log tdb errors
 686 */
 687 _PUBLIC_ tdb_log_func tdb_log_fn(struct tdb_context *tdb)
 688 {
 689         return tdb->log.log_fn;
 690 }
 691
 692
 693 /*
 694   get the tdb sequence number. Only makes sense if the writers opened
 695   with TDB_SEQNUM set. Note that this sequence number will wrap quite
 696   quickly, so it should only be used for a 'has something changed'
 697   test, not for code that relies on the count of the number of changes
 698   made. If you want a counter then use a tdb record.
 699
 700   The aim of this sequence number is to allow for a very lightweight
 701   test of a possible tdb change.
 702 */
 703 _PUBLIC_ int tdb_get_seqnum(struct tdb_context *tdb)
 704 {
 705         tdb_off_t seqnum=0;
 706
 707         tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
 708         return seqnum;
 709 }
 710
 711 _PUBLIC_ int tdb_hash_size(struct tdb_context *tdb)
 712 {
 713         return tdb->hash_size;
 714 }
 715
 716 _PUBLIC_ size_t tdb_map_size(struct tdb_context *tdb)
 717 {
 718         return tdb->map_size;
 719 }
 720
 721 _PUBLIC_ int tdb_get_flags(struct tdb_context *tdb)
 722 {
 723         return tdb->flags;
 724 }
 725
 726 _PUBLIC_ void tdb_add_flags(struct tdb_context *tdb, unsigned flags)
 727 {
 728         if ((flags & TDB_ALLOW_NESTING) &&
 729             (flags & TDB_DISALLOW_NESTING)) {
 730                 tdb->ecode = TDB_ERR_NESTING;
 731                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_add_flags: "
 732                         "allow_nesting and disallow_nesting are not allowed together!"));
 733                 return;
 734         }
 735
 736         if (flags & TDB_ALLOW_NESTING) {
 737                 tdb->flags &= ~TDB_DISALLOW_NESTING;
 738         }
 739         if (flags & TDB_DISALLOW_NESTING) {
 740                 tdb->flags &= ~TDB_ALLOW_NESTING;
 741         }
 742
 743         tdb->flags |= flags;
 744 }
 745
 746 _PUBLIC_ void tdb_remove_flags(struct tdb_context *tdb, unsigned flags)
 747 {
 748         if ((flags & TDB_ALLOW_NESTING) &&
 749             (flags & TDB_DISALLOW_NESTING)) {
 750                 tdb->ecode = TDB_ERR_NESTING;
 751                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_remove_flags: "
 752                         "allow_nesting and disallow_nesting are not allowed together!"));
 753                 return;
 754         }
 755
 756         if (flags & TDB_ALLOW_NESTING) {
 757                 tdb->flags |= TDB_DISALLOW_NESTING;
 758         }
 759         if (flags & TDB_DISALLOW_NESTING) {
 760                 tdb->flags |= TDB_ALLOW_NESTING;
 761         }
 762
 763         tdb->flags &= ~flags;
 764 }
 765
 766
 767 /*
 768   enable sequence number handling on an open tdb
 769 */
 770 _PUBLIC_ void tdb_enable_seqnum(struct tdb_context *tdb)
 771 {
 772         tdb->flags |= TDB_SEQNUM;
 773 }
 774
 775
 776 /*
 777   add a region of the file to the freelist. Length is the size of the region in bytes,
 778   which includes the free list header that needs to be added
 779  */
 780 static int tdb_free_region(struct tdb_context *tdb, tdb_off_t offset, ssize_t length)
 781 {
 782         struct tdb_record rec;
 783         if (length <= sizeof(rec)) {
 784                 /* the region is not worth adding */
 785                 return 0;
 786         }
 787         if (length + offset > tdb->map_size) {
 788                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_free_region: adding region beyond end of file\n"));
 789                 return -1;
 790         }
 791         memset(&rec,'\0',sizeof(rec));
 792         rec.rec_len = length - sizeof(rec);
 793         if (tdb_free(tdb, offset, &rec) == -1) {
 794                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_free_region: failed to add free record\n"));
 795                 return -1;
 796         }
 797         return 0;
 798 }
 799
 800 /*
 801   wipe the entire database, deleting all records. This can be done
 802   very fast by using a allrecord lock. The entire data portion of the
 803   file becomes a single entry in the freelist.
 804
 805   This code carefully steps around the recovery area, leaving it alone
 806  */
 807 _PUBLIC_ int tdb_wipe_all(struct tdb_context *tdb)
 808 {
 809         int i;
 810         tdb_off_t offset = 0;
 811         ssize_t data_len;
 812         tdb_off_t recovery_head;
 813         tdb_len_t recovery_size = 0;
 814
 815         if (tdb_lockall(tdb) != 0) {
 816                 return -1;
 817         }
 818
 819         tdb_trace(tdb, "tdb_wipe_all");
 820
 821         /* see if the tdb has a recovery area, and remember its size
 822            if so. We don't want to lose this as otherwise each
 823            tdb_wipe_all() in a transaction will increase the size of
 824            the tdb by the size of the recovery area */
 825         if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
 826                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_wipe_all: failed to read recovery head\n"));
 827                 goto failed;
 828         }
 829
 830         if (recovery_head != 0) {
 831                 struct tdb_record rec;
 832                 if (tdb->methods->tdb_read(tdb, recovery_head, &rec, sizeof(rec), DOCONV()) == -1) {
 833                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_wipe_all: failed to read recovery record\n"));
 834                         return -1;
 835                 }
 836                 recovery_size = rec.rec_len + sizeof(rec);
 837         }
 838
 839         /* wipe the hashes */
 840         for (i=0;i<tdb->hash_size;i++) {
 841                 if (tdb_ofs_write(tdb, TDB_HASH_TOP(i), &offset) == -1) {
 842                         TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write hash %d\n", i));
 843                         goto failed;
 844                 }
 845         }
 846
 847         /* wipe the freelist */
 848         if (tdb_ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
 849                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write freelist\n"));
 850                 goto failed;
 851         }
 852
 853         /* add all the rest of the file to the freelist, possibly leaving a gap
 854            for the recovery area */
 855         if (recovery_size == 0) {
 856                 /* the simple case - the whole file can be used as a freelist */
 857                 data_len = (tdb->map_size - TDB_DATA_START(tdb->hash_size));
 858                 if (tdb_free_region(tdb, TDB_DATA_START(tdb->hash_size), data_len) != 0) {
 859                         goto failed;
 860                 }
 861         } else {
 862                 /* we need to add two freelist entries - one on either
 863                    side of the recovery area
 864
 865                    Note that we cannot shift the recovery area during
 866                    this operation. Only the transaction.c code may
 867                    move the recovery area or we risk subtle data
 868                    corruption
 869                 */
 870                 data_len = (recovery_head - TDB_DATA_START(tdb->hash_size));
 871                 if (tdb_free_region(tdb, TDB_DATA_START(tdb->hash_size), data_len) != 0) {
 872                         goto failed;
 873                 }
 874                 /* and the 2nd free list entry after the recovery area - if any */
 875                 data_len = tdb->map_size - (recovery_head+recovery_size);
 876                 if (tdb_free_region(tdb, recovery_head+recovery_size, data_len) != 0) {
 877                         goto failed;
 878                 }
 879         }
 880
 881         tdb_increment_seqnum_nonblock(tdb);
 882
 883         if (tdb_unlockall(tdb) != 0) {
 884                 TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to unlock\n"));
 885                 goto failed;
 886         }
 887
 888         return 0;
 889
 890 failed:
 891         tdb_unlockall(tdb);
 892         return -1;
 893 }
 894
 895 struct traverse_state {
 896         bool error;
 897         struct tdb_context *dest_db;
 898 };
 899
 900 /*
 901   traverse function for repacking
 902  */
 903 static int repack_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private_data)
 904 {
 905         struct traverse_state *state = (struct traverse_state *)private_data;
 906         if (tdb_store(state->dest_db, key, data, TDB_INSERT) != 0) {
 907                 state->error = true;
 908                 return -1;
 909         }
 910         return 0;
 911 }
 912
 913 /*
 914   repack a tdb
 915  */
 916 _PUBLIC_ int tdb_repack(struct tdb_context *tdb)
 917 {
 918         struct tdb_context *tmp_db;
 919         struct traverse_state state;
 920
 921         tdb_trace(tdb, "tdb_repack");
 922
 923         if (tdb_transaction_start(tdb) != 0) {
 924                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to start transaction\n"));
 925                 return -1;
 926         }
 927
 928         tmp_db = tdb_open("tmpdb", tdb_hash_size(tdb), TDB_INTERNAL, O_RDWR|O_CREAT, 0);
 929         if (tmp_db == NULL) {
 930                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to create tmp_db\n"));
 931                 tdb_transaction_cancel(tdb);
 932                 return -1;
 933         }
 934
 935         state.error = false;
 936         state.dest_db = tmp_db;
 937
 938         if (tdb_traverse_read(tdb, repack_traverse, &state) == -1) {
 939                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to traverse copying out\n"));
 940                 tdb_transaction_cancel(tdb);
 941                 tdb_close(tmp_db);
 942                 return -1;
 943         }
 944
 945         if (state.error) {
 946                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Error during traversal\n"));
 947                 tdb_transaction_cancel(tdb);
 948                 tdb_close(tmp_db);
 949                 return -1;
 950         }
 951
 952         if (tdb_wipe_all(tdb) != 0) {
 953                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to wipe database\n"));
 954                 tdb_transaction_cancel(tdb);
 955                 tdb_close(tmp_db);
 956                 return -1;
 957         }
 958
 959         state.error = false;
 960         state.dest_db = tdb;
 961
 962         if (tdb_traverse_read(tmp_db, repack_traverse, &state) == -1) {
 963                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to traverse copying back\n"));
 964                 tdb_transaction_cancel(tdb);
 965                 tdb_close(tmp_db);
 966                 return -1;
 967         }
 968
 969         if (state.error) {
 970                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Error during second traversal\n"));
 971                 tdb_transaction_cancel(tdb);
 972                 tdb_close(tmp_db);
 973                 return -1;
 974         }
 975
 976         tdb_close(tmp_db);
 977
 978         if (tdb_transaction_commit(tdb) != 0) {
 979                 TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to commit\n"));
 980                 return -1;
 981         }
 982
 983         return 0;
 984 }
 985
 986 /* Even on files, we can get partial writes due to signals. */
 987 bool tdb_write_all(int fd, const void *buf, size_t count)
 988 {
 989         while (count) {
 990                 ssize_t ret;
 991                 ret = write(fd, buf, count);
 992                 if (ret < 0)
 993                         return false;
 994                 buf = (const char *)buf + ret;
 995                 count -= ret;
 996         }
 997         return true;
 998 }
 999
1000 bool tdb_add_off_t(tdb_off_t a, tdb_off_t b, tdb_off_t *pret)
1001 {
1002         tdb_off_t ret = a + b;
1003
1004         if ((ret < a) || (ret < b)) {
1005                 return false;
1006         }
1007         *pret = ret;
1008         return true;
1009 }
1010
1011 #ifdef TDB_TRACE
1012 static void tdb_trace_write(struct tdb_context *tdb, const char *str)
1013 {
1014         if (!tdb_write_all(tdb->tracefd, str, strlen(str))) {
1015                 close(tdb->tracefd);
1016                 tdb->tracefd = -1;
1017         }
1018 }
1019
1020 static void tdb_trace_start(struct tdb_context *tdb)
1021 {
1022         tdb_off_t seqnum=0;
1023         char msg[sizeof(tdb_off_t) * 4 + 1];
1024
1025         tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
1026         snprintf(msg, sizeof(msg), "%u ", seqnum);
1027         tdb_trace_write(tdb, msg);
1028 }
1029
/*
  Finish a trace line by writing the terminating newline.
 */
static void tdb_trace_end(struct tdb_context *tdb)
{
        static const char eol[] = "\n";

        tdb_trace_write(tdb, eol);
}
1034
/*
  Finish a trace line with the integer result of the traced operation,
  written as " = <ret>" followed by a newline.
 */
static void tdb_trace_end_ret(struct tdb_context *tdb, int ret)
{
        char tail[sizeof(ret) * 4 + 4];

        snprintf(tail, sizeof(tail), " = %i\n", ret);
        tdb_trace_write(tdb, tail);
}
1041
1042 static void tdb_trace_record(struct tdb_context *tdb, TDB_DATA rec)
1043 {
1044         char msg[20 + rec.dsize*2], *p;
1045         unsigned int i;
1046
1047         /* We differentiate zero-length records from non-existent ones. */
1048         if (rec.dptr == NULL) {
1049                 tdb_trace_write(tdb, " NULL");
1050                 return;
1051         }
1052
1053         /* snprintf here is purely cargo-cult programming. */
1054         p = msg;
1055         p += snprintf(p, sizeof(msg), " %zu:", rec.dsize);
1056         for (i = 0; i < rec.dsize; i++)
1057                 p += snprintf(p, 2, "%02x", rec.dptr[i]);
1058
1059         tdb_trace_write(tdb, msg);
1060 }
1061
/*
  Trace an operation that has no arguments and no result: the line
  prefix from tdb_trace_start(), then op, then a newline.
 */
void tdb_trace(struct tdb_context *tdb, const char *op)
{
        /* line prefix */
        tdb_trace_start(tdb);

        /* operation name */
        tdb_trace_write(tdb, op);

        /* end of line */
        tdb_trace_end(tdb);
}
1068
1069 void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op)
1070 {
1071         char msg[sizeof(tdb_off_t) * 4 + 1];
1072
1073         snprintf(msg, sizeof(msg), "%u ", seqnum);
1074         tdb_trace_write(tdb, msg);
1075         tdb_trace_write(tdb, op);
1076         tdb_trace_end(tdb);
1077 }
1078
/*
  Trace an open call. After the line prefix, the line holds op, the
  hash size in decimal and the tdb and open flags in hex, followed by a
  newline.
 */
void tdb_trace_open(struct tdb_context *tdb, const char *op,
                    unsigned hash_size, unsigned tdb_flags, unsigned open_flags)
{
        char details[128];

        tdb_trace_start(tdb);

        /* formatting only touches the local buffer */
        snprintf(details, sizeof(details),
                 "%s %u 0x%x 0x%x", op, hash_size, tdb_flags, open_flags);
        tdb_trace_write(tdb, details);

        tdb_trace_end(tdb);
}
1090
/*
  Trace an operation that has no arguments and an integer result: the
  line prefix, then op, then " = <ret>" and a newline.
 */
void tdb_trace_ret(struct tdb_context *tdb, const char *op, int ret)
{
        /* line prefix and operation name */
        tdb_trace_start(tdb);
        tdb_trace_write(tdb, op);

        /* result, which also ends the line */
        tdb_trace_end_ret(tdb, ret);
}
1097
1098 void tdb_trace_retrec(struct tdb_context *tdb, const char *op, TDB_DATA ret)
1099 {
1100         tdb_trace_start(tdb);
1101         tdb_trace_write(tdb, op);
1102         tdb_trace_write(tdb, " =");
1103         tdb_trace_record(tdb, ret);
1104         tdb_trace_end(tdb);
1105 }
1106
1107 void tdb_trace_1rec(struct tdb_context *tdb, const char *op,
1108                     TDB_DATA rec)
1109 {
1110         tdb_trace_start(tdb);
1111         tdb_trace_write(tdb, op);
1112         tdb_trace_record(tdb, rec);
1113         tdb_trace_end(tdb);
1114 }
1115
1116 void tdb_trace_1rec_ret(struct tdb_context *tdb, const char *op,
1117                         TDB_DATA rec, int ret)
1118 {
1119         tdb_trace_start(tdb);
1120         tdb_trace_write(tdb, op);
1121         tdb_trace_record(tdb, rec);
1122         tdb_trace_end_ret(tdb, ret);
1123 }
1124
1125 void tdb_trace_1rec_retrec(struct tdb_context *tdb, const char *op,
1126                            TDB_DATA rec, TDB_DATA ret)
1127 {
1128         tdb_trace_start(tdb);
1129         tdb_trace_write(tdb, op);
1130         tdb_trace_record(tdb, rec);
1131         tdb_trace_write(tdb, " =");
1132         tdb_trace_record(tdb, ret);
1133         tdb_trace_end(tdb);
1134 }
1135
1136 void tdb_trace_2rec_flag_ret(struct tdb_context *tdb, const char *op,
1137                              TDB_DATA rec1, TDB_DATA rec2, unsigned flag,
1138                              int ret)
1139 {
1140         char msg[1 + sizeof(ret) * 4];
1141
1142         snprintf(msg, sizeof(msg), " %#x", flag);
1143         tdb_trace_start(tdb);
1144         tdb_trace_write(tdb, op);
1145         tdb_trace_record(tdb, rec1);
1146         tdb_trace_record(tdb, rec2);
1147         tdb_trace_write(tdb, msg);
1148         tdb_trace_end_ret(tdb, ret);
1149 }
1150
1151 void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op,
1152                            TDB_DATA rec1, TDB_DATA rec2, TDB_DATA ret)
1153 {
1154         tdb_trace_start(tdb);
1155         tdb_trace_write(tdb, op);
1156         tdb_trace_record(tdb, rec1);
1157         tdb_trace_record(tdb, rec2);
1158         tdb_trace_write(tdb, " =");
1159         tdb_trace_record(tdb, ret);
1160         tdb_trace_end(tdb);
1161 }
1162 #endif