source/tdb/tdb.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3    Samba database functions
   4    Copyright (C) Andrew Tridgell              1999-2000
   5    Copyright (C) Luke Kenneth Casson Leighton      2000
   6    Copyright (C) Paul `Rusty' Russell              2000
   7    Copyright (C) Jeremy Allison                    2000
   8
   9    This program is free software; you can redistribute it and/or modify
  10    it under the terms of the GNU General Public License as published by
  11    the Free Software Foundation; either version 2 of the License, or
  12    (at your option) any later version.
  13
  14    This program is distributed in the hope that it will be useful,
  15    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17    GNU General Public License for more details.
  18
  19    You should have received a copy of the GNU General Public License
  20    along with this program; if not, write to the Free Software
  21    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  22 */
  23 #ifdef STANDALONE
  24 #if HAVE_CONFIG_H
  25 #include <config.h>
  26 #endif
  27
  28 #include <stdlib.h>
  29 #include <stdio.h>
  30 #include <fcntl.h>
  31 #include <unistd.h>
  32 #include <string.h>
  33 #include <fcntl.h>
  34 #include <errno.h>
  35 #include <sys/mman.h>
  36 #include <sys/stat.h>
  37 #include <signal.h>
  38 #include "tdb.h"
  39 #include "spinlock.h"
  40 #else
  41 #include "includes.h"
  42 #endif
  43
/* Identification string and format version for tdb files. Neither is
   referenced in this part of the file. */
#define TDB_MAGIC_FOOD "TDB file\n"
#define TDB_VERSION (0x26011967 + 6)
/* Values of list_struct.magic: TDB_MAGIC is written by tdb_allocate()
   for a record in use, TDB_FREE_MAGIC by tdb_free() for a record on
   the free list. TDB_DEAD_MAGIC is also accepted as valid by
   TDB_BAD_MAGIC(); nothing in this part of the file sets it. */
#define TDB_MAGIC (0x26011999U)
#define TDB_FREE_MAGIC (~TDB_MAGIC)
#define TDB_DEAD_MAGIC (0xFEE1DEAD)
/* tdb_allocate() rounds the length of a split-off piece up to a
   multiple of TDB_ALIGNMENT, and only splits a free record when more
   than MIN_REC_SIZE bytes would be left over. */
#define TDB_ALIGNMENT 4
#define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
/* not referenced in this part of the file */
#define DEFAULT_HASH_SIZE 131
/* tdb_expand() rounds the database size up to a multiple of this */
#define TDB_PAGE_SIZE 0x2000
/* file offset of the free list head pointer: directly after the header */
#define FREELIST_TOP (sizeof(struct tdb_header))
/* round x up to a multiple of a; only correct when a is a power of two */
#define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
/* reverse the byte order of a 32 bit value; x is evaluated several times */
#define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
/* record header checks: r is a pointer to a struct list_struct */
#define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
#define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
/* file offset of the chain head pointer for a hash value: one tdb_off
   slot per bucket, starting one slot after FREELIST_TOP */
#define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))

/* NB assumes there is a local variable called "tdb" that is the
 * current context, also takes doubly-parenthesized print-style
 * argument. Does nothing when no log function is set. */
#define TDB_LOG(x) (tdb->log_fn?((tdb->log_fn x),0) : 0)

/* lock offsets */
#define GLOBAL_LOCK 0
#define ACTIVE_LOCK 4

/* fallbacks for platforms whose headers lack these mmap definitions */
#ifndef MAP_FILE
#define MAP_FILE 0
#endif

#ifndef MAP_FAILED
#define MAP_FAILED ((void *)-1)
#endif

/* free memory if the pointer is valid and zero the pointer */
#ifndef SAFE_FREE
#define SAFE_FREE(x) do { if ((x) != NULL) {free((x)); (x)=NULL;} } while(0)
#endif

/* map a full hash value to a bucket index; like TDB_LOG this needs a
   local variable called "tdb" */
#define BUCKET(hash) ((hash) % tdb->header.hash_size)
/* file-scope TDB_DATA with no initialiser, so it is zero-initialised */
TDB_DATA tdb_null;

/* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
static TDB_CONTEXT *tdbs = NULL;
  87
  88 static int tdb_munmap(TDB_CONTEXT *tdb)
  89 {
  90         if (tdb->flags & TDB_INTERNAL)
  91                 return 0;
  92
  93 #ifdef HAVE_MMAP
  94         if (tdb->map_ptr) {
  95                 int ret = munmap(tdb->map_ptr, tdb->map_size);
  96                 if (ret != 0)
  97                         return ret;
  98         }
  99 #endif
 100         tdb->map_ptr = NULL;
 101         return 0;
 102 }
 103
 104 static void tdb_mmap(TDB_CONTEXT *tdb)
 105 {
 106         if (tdb->flags & TDB_INTERNAL)
 107                 return;
 108
 109 #ifdef HAVE_MMAP
 110         if (!(tdb->flags & TDB_NOMMAP)) {
 111                 tdb->map_ptr = mmap(NULL, tdb->map_size,
 112                                     PROT_READ|(tdb->read_only? 0:PROT_WRITE),
 113                                     MAP_SHARED|MAP_FILE, tdb->fd, 0);
 114
 115                 /*
 116                  * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
 117                  */
 118
 119                 if (tdb->map_ptr == MAP_FAILED) {
 120                         tdb->map_ptr = NULL;
 121                         TDB_LOG((tdb, 2, "tdb_mmap failed for size %d (%s)\n",
 122                                  tdb->map_size, strerror(errno)));
 123                 }
 124         } else {
 125                 tdb->map_ptr = NULL;
 126         }
 127 #else
 128         tdb->map_ptr = NULL;
 129 #endif
 130 }
 131
 132 /* Endian conversion: we only ever deal with 4 byte quantities */
 133 static void *convert(void *buf, u32 size)
 134 {
 135         u32 i, *p = buf;
 136         for (i = 0; i < size / 4; i++)
 137                 p[i] = TDB_BYTEREV(p[i]);
 138         return buf;
 139 }
 140 #define DOCONV() (tdb->flags & TDB_CONVERT)
 141 #define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)
 142
/* the body of the database is made of one list_struct for the free space
   plus a separate data list for each hash value.

   This is the header that starts every record. rec_len counts the
   bytes that follow the header, including the tailer described below
   (update_tailer() writes the tailer at header size + rec_len minus
   one tdb_off). */
struct list_struct {
        tdb_off next; /* offset of the next record in the list */
        tdb_len rec_len; /* total byte length of record */
        tdb_len key_len; /* byte length of key */
        tdb_len data_len; /* byte length of data */
        u32 full_hash; /* the full 32 bit hash of the key */
        u32 magic;   /* try to catch errors */
        /* the following union is implied:
                union {
                        char record[rec_len];
                        struct {
                                char key[key_len];
                                char data[data_len];
                        }
                        u32 totalsize; (tailer)
                }
        */
};
 163
/***************************************************************
 Allow a caller to set an "alarm" flag that tdb can check to abort
 a blocking lock on SIGALRM.

 tdb_brlock() reads the flag through this pointer whenever fcntl()
 is interrupted (EINTR) and gives up instead of retrying when the
 flag is non-zero. The pointer itself is stored, not the value.
***************************************************************/

/* flag registered by tdb_set_lock_alarm(); NULL until one is set */
static sig_atomic_t *palarm_fired;

void tdb_set_lock_alarm(sig_atomic_t *palarm)
{
        palarm_fired = palarm;
}
 175
 176 /* a byte range locking function - return 0 on success
 177    this functions locks/unlocks 1 byte at the specified offset.
 178
 179    On error, errno is also set so that errors are passed back properly
 180    through tdb_open(). */
 181 static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset,
 182                       int rw_type, int lck_type, int probe)
 183 {
 184         struct flock fl;
 185         int ret;
 186
 187         if (tdb->flags & TDB_NOLOCK)
 188                 return 0;
 189         if (tdb->read_only) {
 190                 errno = EACCES;
 191                 return -1;
 192         }
 193
 194         fl.l_type = rw_type;
 195         fl.l_whence = SEEK_SET;
 196         fl.l_start = offset;
 197         fl.l_len = 1;
 198         fl.l_pid = 0;
 199
 200         do {
 201                 ret = fcntl(tdb->fd,lck_type,&fl);
 202                 if (ret == -1 && errno == EINTR && palarm_fired && *palarm_fired)
 203                         break;
 204         } while (ret == -1 && errno == EINTR);
 205
 206         if (ret == -1) {
 207                 if (!probe && lck_type != F_SETLK) {
 208                         TDB_LOG((tdb, 5,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
 209                                  tdb->fd, offset, rw_type, lck_type));
 210                 }
 211                 /* errno set by fcntl */
 212                 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
 213         }
 214         return 0;
 215 }
 216
 217 /* lock a list in the database. list -1 is the alloc list */
 218 static int tdb_lock(TDB_CONTEXT *tdb, int list, int ltype)
 219 {
 220         if (list < -1 || list >= (int)tdb->header.hash_size) {
 221                 TDB_LOG((tdb, 0,"tdb_lock: invalid list %d for ltype=%d\n",
 222                            list, ltype));
 223                 return -1;
 224         }
 225         if (tdb->flags & TDB_NOLOCK)
 226                 return 0;
 227
 228         /* Since fcntl locks don't nest, we do a lock for the first one,
 229            and simply bump the count for future ones */
 230         if (tdb->locked[list+1].count == 0) {
 231                 if (!tdb->read_only && tdb->header.rwlocks) {
 232                         if (tdb_spinlock(tdb, list, ltype)) {
 233                                 TDB_LOG((tdb, 0, "tdb_lock spinlock failed on list ltype=%d\n",
 234                                            list, ltype));
 235                                 return -1;
 236                         }
 237                 } else if (tdb_brlock(tdb,FREELIST_TOP+4*list,ltype,F_SETLKW, 0)) {
 238                         TDB_LOG((tdb, 0,"tdb_lock failed on list %d ltype=%d (%s)\n",
 239                                            list, ltype, strerror(errno)));
 240                         return -1;
 241                 }
 242                 tdb->locked[list+1].ltype = ltype;
 243         }
 244         tdb->locked[list+1].count++;
 245         return 0;
 246 }
 247
 248 /* unlock the database: returns void because it's too late for errors. */
 249         /* changed to return int it may be interesting to know there
 250            has been an error  --simo */
 251 static int tdb_unlock(TDB_CONTEXT *tdb, int list, int ltype)
 252 {
 253         int ret = -1;
 254
 255         if (tdb->flags & TDB_NOLOCK)
 256                 return 0;
 257
 258         /* Sanity checks */
 259         if (list < -1 || list >= (int)tdb->header.hash_size) {
 260                 TDB_LOG((tdb, 0, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
 261                 return ret;
 262         }
 263
 264         if (tdb->locked[list+1].count==0) {
 265                 TDB_LOG((tdb, 0, "tdb_unlock: count is 0\n"));
 266                 return ret;
 267         }
 268
 269         if (tdb->locked[list+1].count == 1) {
 270                 /* Down to last nested lock: unlock underneath */
 271                 if (!tdb->read_only && tdb->header.rwlocks) {
 272                         ret = tdb_spinunlock(tdb, list, ltype);
 273                 } else {
 274                         ret = tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK, F_SETLKW, 0);
 275                 }
 276         } else {
 277                 ret = 0;
 278         }
 279         tdb->locked[list+1].count--;
 280
 281         if (ret)
 282                 TDB_LOG((tdb, 0,"tdb_unlock: An error occurred unlocking!\n"));
 283         return ret;
 284 }
 285
 286 /* This is based on the hash algorithm from gdbm */
 287 static u32 tdb_hash(TDB_DATA *key)
 288 {
 289         u32 value;      /* Used to compute the hash value.  */
 290         u32   i;        /* Used to cycle through random values. */
 291
 292         /* Set the initial value from the key size. */
 293         for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
 294                 value = (value + (key->dptr[i] << (i*5 % 24)));
 295
 296         return (1103515243 * value + 12345);
 297 }
 298
 299 /* check for an out of bounds access - if it is out of bounds then
 300    see if the database has been expanded by someone else and expand
 301    if necessary
 302    note that "len" is the minimum length needed for the db
 303 */
 304 static int tdb_oob(TDB_CONTEXT *tdb, tdb_off len, int probe)
 305 {
 306         struct stat st;
 307         if (len <= tdb->map_size)
 308                 return 0;
 309         if (tdb->flags & TDB_INTERNAL) {
 310                 if (!probe) {
 311                         TDB_LOG((tdb, 0,"tdb_oob len %d beyond internal malloc size %d\n",
 312                                  (int)len, (int)tdb->map_size));
 313                 }
 314                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 315         }
 316
 317         if (fstat(tdb->fd, &st) == -1)
 318                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 319
 320         if (st.st_size < (size_t)len) {
 321                 if (!probe) {
 322                         TDB_LOG((tdb, 0,"tdb_oob len %d beyond eof at %d\n",
 323                                  (int)len, (int)st.st_size));
 324                 }
 325                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 326         }
 327
 328         /* Unmap, update size, remap */
 329         if (tdb_munmap(tdb) == -1)
 330                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 331         tdb->map_size = st.st_size;
 332         tdb_mmap(tdb);
 333         return 0;
 334 }
 335
 336 /* write a lump of data at a specified offset */
 337 static int tdb_write(TDB_CONTEXT *tdb, tdb_off off, void *buf, tdb_len len)
 338 {
 339         if (tdb_oob(tdb, off + len, 0) != 0)
 340                 return -1;
 341
 342         if (tdb->map_ptr)
 343                 memcpy(off + (char *)tdb->map_ptr, buf, len);
 344 #ifdef HAVE_PWRITE
 345         else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
 346 #else
 347         else if (lseek(tdb->fd, off, SEEK_SET) != off
 348                  || write(tdb->fd, buf, len) != (ssize_t)len) {
 349 #endif
 350                 TDB_LOG((tdb, 0,"tdb_write failed at %d len=%d (%s)\n",
 351                            off, len, strerror(errno)));
 352                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 353         }
 354         return 0;
 355 }
 356
 357 /* read a lump of data at a specified offset, maybe convert */
 358 static int tdb_read(TDB_CONTEXT *tdb,tdb_off off,void *buf,tdb_len len,int cv)
 359 {
 360         if (tdb_oob(tdb, off + len, 0) != 0)
 361                 return -1;
 362
 363         if (tdb->map_ptr)
 364                 memcpy(buf, off + (char *)tdb->map_ptr, len);
 365 #ifdef HAVE_PREAD
 366         else if (pread(tdb->fd, buf, len, off) != (ssize_t)len) {
 367 #else
 368         else if (lseek(tdb->fd, off, SEEK_SET) != off
 369                  || read(tdb->fd, buf, len) != (ssize_t)len) {
 370 #endif
 371                 TDB_LOG((tdb, 0,"tdb_read failed at %d len=%d (%s)\n",
 372                            off, len, strerror(errno)));
 373                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 374         }
 375         if (cv)
 376                 convert(buf, len);
 377         return 0;
 378 }
 379
 380 /* read a lump of data, allocating the space for it */
 381 static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
 382 {
 383         char *buf;
 384
 385         if (!(buf = malloc(len))) {
 386                 TDB_LOG((tdb, 0,"tdb_alloc_read malloc failed len=%d (%s)\n",
 387                            len, strerror(errno)));
 388                 return TDB_ERRCODE(TDB_ERR_OOM, buf);
 389         }
 390         if (tdb_read(tdb, offset, buf, len, 0) == -1) {
 391                 SAFE_FREE(buf);
 392                 return NULL;
 393         }
 394         return buf;
 395 }
 396
 397 /* read/write a tdb_off */
 398 static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
 399 {
 400         return tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
 401 }
 402 static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
 403 {
 404         tdb_off off = *d;
 405         return tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
 406 }
 407
 408 /* read/write a record */
 409 static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 410 {
 411         if (tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
 412                 return -1;
 413         if (TDB_BAD_MAGIC(rec)) {
 414                 TDB_LOG((tdb, 0,"rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
 415                 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 416         }
 417         return tdb_oob(tdb, rec->next+sizeof(*rec), 0);
 418 }
 419 static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 420 {
 421         struct list_struct r = *rec;
 422         return tdb_write(tdb, offset, CONVERT(r), sizeof(r));
 423 }
 424
 425 /* read a freelist record and check for simple errors */
 426 static int rec_free_read(TDB_CONTEXT *tdb, tdb_off off, struct list_struct *rec)
 427 {
 428         if (tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
 429                 return -1;
 430
 431         if (rec->magic == TDB_MAGIC) {
 432                 /* this happens when a app is showdown while deleting a record - we should
 433                    not completely fail when this happens */
 434                 TDB_LOG((tdb, 0,"rec_free_read non-free magic at offset=%d - fixing\n",
 435                          rec->magic, off));
 436                 rec->magic = TDB_FREE_MAGIC;
 437                 if (tdb_write(tdb, off, rec, sizeof(*rec)) == -1)
 438                         return -1;
 439         }
 440
 441         if (rec->magic != TDB_FREE_MAGIC) {
 442                 TDB_LOG((tdb, 0,"rec_free_read bad magic 0x%x at offset=%d\n",
 443                            rec->magic, off));
 444                 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 445         }
 446         if (tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
 447                 return -1;
 448         return 0;
 449 }
 450
 451 /* update a record tailer (must hold allocation lock) */
 452 static int update_tailer(TDB_CONTEXT *tdb, tdb_off offset,
 453                          const struct list_struct *rec)
 454 {
 455         tdb_off totalsize;
 456
 457         /* Offset of tailer from record header */
 458         totalsize = sizeof(*rec) + rec->rec_len;
 459         return ofs_write(tdb, offset + totalsize - sizeof(tdb_off),
 460                          &totalsize);
 461 }
 462
 463 static tdb_off tdb_dump_record(TDB_CONTEXT *tdb, tdb_off offset)
 464 {
 465         struct list_struct rec;
 466         tdb_off tailer_ofs, tailer;
 467
 468         if (tdb_read(tdb, offset, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
 469                 printf("ERROR: failed to read record at %u\n", offset);
 470                 return 0;
 471         }
 472
 473         printf(" rec: offset=%u next=%d rec_len=%d key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
 474                offset, rec.next, rec.rec_len, rec.key_len, rec.data_len, rec.full_hash, rec.magic);
 475
 476         tailer_ofs = offset + sizeof(rec) + rec.rec_len - sizeof(tdb_off);
 477         if (ofs_read(tdb, tailer_ofs, &tailer) == -1) {
 478                 printf("ERROR: failed to read tailer at %u\n", tailer_ofs);
 479                 return rec.next;
 480         }
 481
 482         if (tailer != rec.rec_len + sizeof(rec)) {
 483                 printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
 484                                 (unsigned)tailer, (unsigned)(rec.rec_len + sizeof(rec)));
 485         }
 486         return rec.next;
 487 }
 488
 489 static int tdb_dump_chain(TDB_CONTEXT *tdb, int i)
 490 {
 491         tdb_off rec_ptr, top;
 492
 493         top = TDB_HASH_TOP(i);
 494
 495         if (tdb_lock(tdb, i, F_WRLCK) != 0)
 496                 return -1;
 497
 498         if (ofs_read(tdb, top, &rec_ptr) == -1)
 499                 return tdb_unlock(tdb, i, F_WRLCK);
 500
 501         if (rec_ptr)
 502                 printf("hash=%d\n", i);
 503
 504         while (rec_ptr) {
 505                 rec_ptr = tdb_dump_record(tdb, rec_ptr);
 506         }
 507
 508         return tdb_unlock(tdb, i, F_WRLCK);
 509 }
 510
 511 void tdb_dump_all(TDB_CONTEXT *tdb)
 512 {
 513         int i;
 514         for (i=0;i<tdb->header.hash_size;i++) {
 515                 tdb_dump_chain(tdb, i);
 516         }
 517         printf("freelist:\n");
 518         tdb_dump_chain(tdb, -1);
 519 }
 520
 521 int tdb_printfreelist(TDB_CONTEXT *tdb)
 522 {
 523         int ret;
 524         long total_free = 0;
 525         tdb_off offset, rec_ptr;
 526         struct list_struct rec;
 527
 528         if ((ret = tdb_lock(tdb, -1, F_WRLCK)) != 0)
 529                 return ret;
 530
 531         offset = FREELIST_TOP;
 532
 533         /* read in the freelist top */
 534         if (ofs_read(tdb, offset, &rec_ptr) == -1) {
 535                 tdb_unlock(tdb, -1, F_WRLCK);
 536                 return 0;
 537         }
 538
 539         printf("freelist top=[0x%08x]\n", rec_ptr );
 540         while (rec_ptr) {
 541                 if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
 542                         tdb_unlock(tdb, -1, F_WRLCK);
 543                         return -1;
 544                 }
 545
 546                 if (rec.magic != TDB_FREE_MAGIC) {
 547                         printf("bad magic 0x%08x in free list\n", rec.magic);
 548                         tdb_unlock(tdb, -1, F_WRLCK);
 549                         return -1;
 550                 }
 551
 552                 printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)]\n", rec.next, rec.rec_len, rec.rec_len );
 553                 total_free += rec.rec_len;
 554
 555                 /* move to the next record */
 556                 rec_ptr = rec.next;
 557         }
 558         printf("total rec_len = [0x%08x (%d)]\n", (int)total_free,
 559                (int)total_free);
 560
 561         return tdb_unlock(tdb, -1, F_WRLCK);
 562 }
 563
 564 /* Remove an element from the freelist.  Must have alloc lock. */
 565 static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
 566 {
 567         tdb_off last_ptr, i;
 568
 569         /* read in the freelist top */
 570         last_ptr = FREELIST_TOP;
 571         while (ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
 572                 if (i == off) {
 573                         /* We've found it! */
 574                         return ofs_write(tdb, last_ptr, &next);
 575                 }
 576                 /* Follow chain (next offset is at start of record) */
 577                 last_ptr = i;
 578         }
 579         TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
 580         return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 581 }
 582
/* Add an element into the freelist. Merge adjacent records if
   necessary.

   offset is the file offset of the record and rec its header; only
   rec->rec_len is read from it, and rec->rec_len, rec->magic and
   rec->next are overwritten. Failures while looking at a neighbour
   are logged and that merge is skipped. Returns 0 on success and -1
   if the lock, a tailer update or the final list update fails. */
static int tdb_free(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
{
        tdb_off right, left;

        /* Allocation and tailer lock */
        if (tdb_lock(tdb, -1, F_WRLCK) != 0)
                return -1;

        /* set an initial tailer, so if we fail we don't leave a bogus record */
        if (update_tailer(tdb, offset, rec) != 0) {
                TDB_LOG((tdb, 0, "tdb_free: upfate_tailer failed!\n"));
                goto fail;
        }

        /* Look right first (I'm an Australian, dammit) */
        right = offset + sizeof(*rec) + rec->rec_len;
        /* only if a whole header fits between "right" and the end of
           the database */
        if (right + sizeof(*rec) <= tdb->map_size) {
                struct list_struct r;

                if (tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
                        TDB_LOG((tdb, 0, "tdb_free: right read failed at %u\n", right));
                        goto left;
                }

                /* If it's free, expand to include it. */
                if (r.magic == TDB_FREE_MAGIC) {
                        if (remove_from_freelist(tdb, right, r.next) == -1) {
                                TDB_LOG((tdb, 0, "tdb_free: right free failed at %u\n", right));
                                goto left;
                        }
                        /* absorb the neighbour's header and body */
                        rec->rec_len += sizeof(r) + r.rec_len;
                }
        }

left:
        /* Look left */
        /* "left" first points at the tailer of the preceding record,
           which is the tdb_off just before our header */
        left = offset - sizeof(tdb_off);
        /* nothing to merge with unless that lies beyond the last hash
           chain head */
        if (left > TDB_HASH_TOP(tdb->header.hash_size-1)) {
                struct list_struct l;
                tdb_off leftsize;

                /* Read in tailer and jump back to header */
                if (ofs_read(tdb, left, &leftsize) == -1) {
                        TDB_LOG((tdb, 0, "tdb_free: left offset read failed at %u\n", left));
                        goto update;
                }
                left = offset - leftsize;

                /* Now read in record */
                if (tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
                        TDB_LOG((tdb, 0, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
                        goto update;
                }

                /* If it's free, expand to include it. */
                if (l.magic == TDB_FREE_MAGIC) {
                        if (remove_from_freelist(tdb, left, l.next) == -1) {
                                TDB_LOG((tdb, 0, "tdb_free: left free failed at %u\n", left));
                                goto update;
                        } else {
                                /* the merged record now starts at the
                                   left neighbour; leftsize is its
                                   total size, header included */
                                offset = left;
                                rec->rec_len += leftsize;
                        }
                }
        }

update:
        /* the tailer must reflect the possibly merged length */
        if (update_tailer(tdb, offset, rec) == -1) {
                TDB_LOG((tdb, 0, "tdb_free: update_tailer failed at %u\n", offset));
                goto fail;
        }

        /* Now, prepend to free list */
        rec->magic = TDB_FREE_MAGIC;

        /* new record points at the old head, then becomes the head */
        if (ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
            rec_write(tdb, offset, rec) == -1 ||
            ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
                TDB_LOG((tdb, 0, "tdb_free record write failed at offset=%d\n", offset));
                goto fail;
        }

        /* And we're done. */
        tdb_unlock(tdb, -1, F_WRLCK);
        return 0;

 fail:
        tdb_unlock(tdb, -1, F_WRLCK);
        return -1;
}
 675
 676
/* expand a file.  we prefer to use ftruncate, as that is what posix
  says to use for mmap expansion.

  size is the current length of the file and addition the number of
  bytes to append. The file is first extended to size+addition and
  the new region is then filled with 0x42 bytes. Returns 0 on
  success, -1 on failure. */
static int expand_file(TDB_CONTEXT *tdb, tdb_off size, tdb_off addition)
{
        char buf[1024];
#if HAVE_FTRUNCATE_EXTEND
        if (ftruncate(tdb->fd, size+addition) != 0) {
                TDB_LOG((tdb, 0, "expand_file ftruncate to %d failed (%s)\n",
                           size+addition, strerror(errno)));
                return -1;
        }
#else
        /* no extending ftruncate: grow the file by writing a single
           byte at what will be its last position */
        char b = 0;

#ifdef HAVE_PWRITE
        if (pwrite(tdb->fd,  &b, 1, (size+addition) - 1) != 1) {
#else
        if (lseek(tdb->fd, (size+addition) - 1, SEEK_SET) != (size+addition) - 1 ||
            write(tdb->fd, &b, 1) != 1) {
#endif
                TDB_LOG((tdb, 0, "expand_file to %d failed (%s)\n",
                           size+addition, strerror(errno)));
                return -1;
        }
#endif

        /* now fill the file with something. This ensures that the file isn't sparse, which would be
           very bad if we ran out of disk. This must be done with write, not via mmap */
        memset(buf, 0x42, sizeof(buf));
        while (addition) {
                /* at most one buffer per write */
                int n = addition>sizeof(buf)?sizeof(buf):addition;
#ifdef HAVE_PWRITE
                int ret = pwrite(tdb->fd, buf, n, size);
#else
                int ret;
                /* note: this failure path returns without logging */
                if (lseek(tdb->fd, size, SEEK_SET) != size)
                        return -1;
                ret = write(tdb->fd, buf, n);
#endif
                /* a short write counts as failure */
                if (ret != n) {
                        TDB_LOG((tdb, 0, "expand_file write of %d failed (%s)\n",
                                   n, strerror(errno)));
                        return -1;
                }
                addition -= n;
                size += n;
        }
        return 0;
}
 726
 727
 728 /* expand the database at least size bytes by expanding the underlying
 729    file and doing the mmap again if necessary */
 730 static int tdb_expand(TDB_CONTEXT *tdb, tdb_off size)
 731 {
 732         struct list_struct rec;
 733         tdb_off offset;
 734
 735         if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
 736                 TDB_LOG((tdb, 0, "lock failed in tdb_expand\n"));
 737                 return -1;
 738         }
 739
 740         /* must know about any previous expansions by another process */
 741         tdb_oob(tdb, tdb->map_size + 1, 1);
 742
 743         /* always make room for at least 10 more records, and round
 744            the database up to a multiple of TDB_PAGE_SIZE */
 745         size = TDB_ALIGN(tdb->map_size + size*10, TDB_PAGE_SIZE) - tdb->map_size;
 746
 747         if (!(tdb->flags & TDB_INTERNAL))
 748                 tdb_munmap(tdb);
 749
 750         /*
 751          * We must ensure the file is unmapped before doing this
 752          * to ensure consistency with systems like OpenBSD where
 753          * writes and mmaps are not consistent.
 754          */
 755
 756         /* expand the file itself */
 757         if (!(tdb->flags & TDB_INTERNAL)) {
 758                 if (expand_file(tdb, tdb->map_size, size) != 0)
 759                         goto fail;
 760         }
 761
 762         tdb->map_size += size;
 763
 764         if (tdb->flags & TDB_INTERNAL)
 765                 tdb->map_ptr = realloc(tdb->map_ptr, tdb->map_size);
 766         else {
 767                 /*
 768                  * We must ensure the file is remapped before adding the space
 769                  * to ensure consistency with systems like OpenBSD where
 770                  * writes and mmaps are not consistent.
 771                  */
 772
 773                 /* We're ok if the mmap fails as we'll fallback to read/write */
 774                 tdb_mmap(tdb);
 775         }
 776
 777         /* form a new freelist record */
 778         memset(&rec,'\0',sizeof(rec));
 779         rec.rec_len = size - sizeof(rec);
 780
 781         /* link it into the free list */
 782         offset = tdb->map_size - size;
 783         if (tdb_free(tdb, offset, &rec) == -1)
 784                 goto fail;
 785
 786         tdb_unlock(tdb, -1, F_WRLCK);
 787         return 0;
 788  fail:
 789         tdb_unlock(tdb, -1, F_WRLCK);
 790         return -1;
 791 }
 792
 793 /* allocate some space from the free list. The offset returned points
 794    to a unconnected list_struct within the database with room for at
 795    least length bytes of total data
 796
 797    0 is returned if the space could not be allocated
 798  */
 799 static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length,
 800                             struct list_struct *rec)
 801 {
 802         tdb_off rec_ptr, last_ptr, newrec_ptr;
 803         struct list_struct newrec;
 804
 805         if (tdb_lock(tdb, -1, F_WRLCK) == -1)
 806                 return 0;
 807
 808         /* Extra bytes required for tailer */
 809         length += sizeof(tdb_off);
 810
 811  again:
 812         last_ptr = FREELIST_TOP;
 813
 814         /* read in the freelist top */
 815         if (ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
 816                 goto fail;
 817
 818         /* keep looking until we find a freelist record big enough */
 819         while (rec_ptr) {
 820                 if (rec_free_read(tdb, rec_ptr, rec) == -1)
 821                         goto fail;
 822
 823                 if (rec->rec_len >= length) {
 824                         /* found it - now possibly split it up  */
 825                         if (rec->rec_len > length + MIN_REC_SIZE) {
 826                                 /* Length of left piece */
 827                                 length = TDB_ALIGN(length, TDB_ALIGNMENT);
 828
 829                                 /* Right piece to go on free list */
 830                                 newrec.rec_len = rec->rec_len
 831                                         - (sizeof(*rec) + length);
 832                                 newrec_ptr = rec_ptr + sizeof(*rec) + length;
 833
 834                                 /* And left record is shortened */
 835                                 rec->rec_len = length;
 836                         } else
 837                                 newrec_ptr = 0;
 838
 839                         /* Remove allocated record from the free list */
 840                         if (ofs_write(tdb, last_ptr, &rec->next) == -1)
 841                                 goto fail;
 842
 843                         /* Update header: do this before we drop alloc
 844                            lock, otherwise tdb_free() might try to
 845                            merge with us, thinking we're free.
 846                            (Thanks Jeremy Allison). */
 847                         rec->magic = TDB_MAGIC;
 848                         if (rec_write(tdb, rec_ptr, rec) == -1)
 849                                 goto fail;
 850
 851                         /* Did we create new block? */
 852                         if (newrec_ptr) {
 853                                 /* Update allocated record tailer (we
 854                                    shortened it). */
 855                                 if (update_tailer(tdb, rec_ptr, rec) == -1)
 856                                         goto fail;
 857
 858                                 /* Free new record */
 859                                 if (tdb_free(tdb, newrec_ptr, &newrec) == -1)
 860                                         goto fail;
 861                         }
 862
 863                         /* all done - return the new record offset */
 864                         tdb_unlock(tdb, -1, F_WRLCK);
 865                         return rec_ptr;
 866                 }
 867                 /* move to the next record */
 868                 last_ptr = rec_ptr;
 869                 rec_ptr = rec->next;
 870         }
 871         /* we didn't find enough space. See if we can expand the
 872            database and if we can then try again */
 873         if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
 874                 goto again;
 875  fail:
 876         tdb_unlock(tdb, -1, F_WRLCK);
 877         return 0;
 878 }
 879
 880 /* initialise a new database with a specified hash size */
 881 static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
 882 {
 883         struct tdb_header *newdb;
 884         int size, ret = -1;
 885
 886         /* We make it up in memory, then write it out if not internal */
 887         size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off);
 888         if (!(newdb = calloc(size, 1)))
 889                 return TDB_ERRCODE(TDB_ERR_OOM, -1);
 890
 891         /* Fill in the header */
 892         newdb->version = TDB_VERSION;
 893         newdb->hash_size = hash_size;
 894 #ifdef USE_SPINLOCKS
 895         newdb->rwlocks = size;
 896 #endif
 897         if (tdb->flags & TDB_INTERNAL) {
 898                 tdb->map_size = size;
 899                 tdb->map_ptr = (char *)newdb;
 900                 memcpy(&tdb->header, newdb, sizeof(tdb->header));
 901                 /* Convert the `ondisk' version if asked. */
 902                 CONVERT(*newdb);
 903                 return 0;
 904         }
 905         if (lseek(tdb->fd, 0, SEEK_SET) == -1)
 906                 goto fail;
 907
 908         if (ftruncate(tdb->fd, 0) == -1)
 909                 goto fail;
 910
 911         /* This creates an endian-converted header, as if read from disk */
 912         CONVERT(*newdb);
 913         memcpy(&tdb->header, newdb, sizeof(tdb->header));
 914         /* Don't endian-convert the magic food! */
 915         memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
 916         if (write(tdb->fd, newdb, size) != size)
 917                 ret = -1;
 918         else
 919                 ret = tdb_create_rwlocks(tdb->fd, hash_size);
 920
 921   fail:
 922         SAFE_FREE(newdb);
 923         return ret;
 924 }
 925
 926 /* Returns 0 on fail.  On success, return offset of record, and fills
 927    in rec */
 928 static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash,
 929                         struct list_struct *r)
 930 {
 931         tdb_off rec_ptr;
 932
 933         /* read in the hash top */
 934         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 935                 return 0;
 936
 937         /* keep looking until we find the right record */
 938         while (rec_ptr) {
 939                 if (rec_read(tdb, rec_ptr, r) == -1)
 940                         return 0;
 941
 942                 if (!TDB_DEAD(r) && hash==r->full_hash && key.dsize==r->key_len) {
 943                         char *k;
 944                         /* a very likely hit - read the key */
 945                         k = tdb_alloc_read(tdb, rec_ptr + sizeof(*r),
 946                                            r->key_len);
 947                         if (!k)
 948                                 return 0;
 949
 950                         if (memcmp(key.dptr, k, key.dsize) == 0) {
 951                                 SAFE_FREE(k);
 952                                 return rec_ptr;
 953                         }
 954                         SAFE_FREE(k);
 955                 }
 956                 rec_ptr = r->next;
 957         }
 958         return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
 959 }
 960
 961 /* If they do lockkeys, check that this hash is one they locked */
 962 static int tdb_keylocked(TDB_CONTEXT *tdb, u32 hash)
 963 {
 964         u32 i;
 965         if (!tdb->lockedkeys)
 966                 return 1;
 967         for (i = 0; i < tdb->lockedkeys[0]; i++)
 968                 if (tdb->lockedkeys[i+1] == hash)
 969                         return 1;
 970         return TDB_ERRCODE(TDB_ERR_NOLOCK, 0);
 971 }
 972
 973 /* As tdb_find, but if you succeed, keep the lock */
 974 static tdb_off tdb_find_lock(TDB_CONTEXT *tdb, TDB_DATA key, int locktype,
 975                              struct list_struct *rec)
 976 {
 977         u32 hash, rec_ptr;
 978
 979         hash = tdb_hash(&key);
 980         if (!tdb_keylocked(tdb, hash))
 981                 return 0;
 982         if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
 983                 return 0;
 984         if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
 985                 tdb_unlock(tdb, BUCKET(hash), locktype);
 986         return rec_ptr;
 987 }
 988
 989 enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb)
 990 {
 991         return tdb->ecode;
 992 }
 993
 994 static struct tdb_errname {
 995         enum TDB_ERROR ecode; const char *estring;
 996 } emap[] = { {TDB_SUCCESS, "Success"},
 997              {TDB_ERR_CORRUPT, "Corrupt database"},
 998              {TDB_ERR_IO, "IO Error"},
 999              {TDB_ERR_LOCK, "Locking error"},
1000              {TDB_ERR_OOM, "Out of memory"},
1001              {TDB_ERR_EXISTS, "Record exists"},
1002              {TDB_ERR_NOLOCK, "Lock exists on other keys"},
1003              {TDB_ERR_NOEXIST, "Record does not exist"} };
1004
1005 /* Error string for the last tdb error */
1006 const char *tdb_errorstr(TDB_CONTEXT *tdb)
1007 {
1008         u32 i;
1009         for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
1010                 if (tdb->ecode == emap[i].ecode)
1011                         return emap[i].estring;
1012         return "Invalid error code";
1013 }
1014
1015 /* update an entry in place - this only works if the new data size
1016    is <= the old data size and the key exists.
1017    on failure return -1
1018 */
1019 static int tdb_update(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf)
1020 {
1021         struct list_struct rec;
1022         tdb_off rec_ptr;
1023         int ret = -1;
1024
1025         /* find entry */
1026         if (!(rec_ptr = tdb_find_lock(tdb, key, F_WRLCK, &rec)))
1027                 return -1;
1028
1029         /* must be long enough key, data and tailer */
1030         if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off)) {
1031                 tdb->ecode = TDB_SUCCESS; /* Not really an error */
1032                 goto out;
1033         }
1034
1035         if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1036                       dbuf.dptr, dbuf.dsize) == -1)
1037                 goto out;
1038
1039         if (dbuf.dsize != rec.data_len) {
1040                 /* update size */
1041                 rec.data_len = dbuf.dsize;
1042                 ret = rec_write(tdb, rec_ptr, &rec);
1043         } else
1044                 ret = 0;
1045  out:
1046         tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK);
1047         return ret;
1048 }
1049
1050 /* find an entry in the database given a key */
1051 TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
1052 {
1053         tdb_off rec_ptr;
1054         struct list_struct rec;
1055         TDB_DATA ret;
1056
1057         /* find which hash bucket it is in */
1058         if (!(rec_ptr = tdb_find_lock(tdb,key,F_RDLCK,&rec)))
1059                 return tdb_null;
1060
1061         ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1062                                   rec.data_len);
1063         ret.dsize = rec.data_len;
1064         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1065         return ret;
1066 }
1067
1068 /* check if an entry in the database exists
1069
1070    note that 1 is returned if the key is found and 0 is returned if not found
1071    this doesn't match the conventions in the rest of this module, but is
1072    compatible with gdbm
1073 */
1074 int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key)
1075 {
1076         struct list_struct rec;
1077
1078         if (tdb_find_lock(tdb, key, F_RDLCK, &rec) == 0)
1079                 return 0;
1080         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1081         return 1;
1082 }
1083
1084 /* record lock stops delete underneath */
1085 static int lock_record(TDB_CONTEXT *tdb, tdb_off off)
1086 {
1087         return off ? tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0) : 0;
1088 }
1089 /*
1090   Write locks override our own fcntl readlocks, so check it here.
1091   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1092   an error to fail to get the lock here.
1093 */
1094
1095 static int write_lock_record(TDB_CONTEXT *tdb, tdb_off off)
1096 {
1097         struct tdb_traverse_lock *i;
1098         for (i = &tdb->travlocks; i; i = i->next)
1099                 if (i->off == off)
1100                         return -1;
1101         return tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1);
1102 }
1103
1104 /*
1105   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1106   an error to fail to get the lock here.
1107 */
1108
1109 static int write_unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1110 {
1111         return tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0);
1112 }
1113 /* fcntl locks don't stack: avoid unlocking someone else's */
1114 static int unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1115 {
1116         struct tdb_traverse_lock *i;
1117         u32 count = 0;
1118
1119         if (off == 0)
1120                 return 0;
1121         for (i = &tdb->travlocks; i; i = i->next)
1122                 if (i->off == off)
1123                         count++;
1124         return (count == 1 ? tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0) : 0);
1125 }
1126
1127 /* actually delete an entry in the database given the offset */
1128 static int do_delete(TDB_CONTEXT *tdb, tdb_off rec_ptr, struct list_struct*rec)
1129 {
1130         tdb_off last_ptr, i;
1131         struct list_struct lastrec;
1132
1133         if (tdb->read_only) return -1;
1134
1135         if (write_lock_record(tdb, rec_ptr) == -1) {
1136                 /* Someone traversing here: mark it as dead */
1137                 rec->magic = TDB_DEAD_MAGIC;
1138                 return rec_write(tdb, rec_ptr, rec);
1139         }
1140         if (write_unlock_record(tdb, rec_ptr) != 0)
1141                 return -1;
1142
1143         /* find previous record in hash chain */
1144         if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
1145                 return -1;
1146         for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
1147                 if (rec_read(tdb, i, &lastrec) == -1)
1148                         return -1;
1149
1150         /* unlink it: next ptr is at start of record. */
1151         if (last_ptr == 0)
1152                 last_ptr = TDB_HASH_TOP(rec->full_hash);
1153         if (ofs_write(tdb, last_ptr, &rec->next) == -1)
1154                 return -1;
1155
1156         /* recover the space */
1157         if (tdb_free(tdb, rec_ptr, rec) == -1)
1158                 return -1;
1159         return 0;
1160 }
1161
1162 /* Uses traverse lock: 0 = finish, -1 = error, other = record offset */
1163 static int tdb_next_lock(TDB_CONTEXT *tdb, struct tdb_traverse_lock *tlock,
1164                          struct list_struct *rec)
1165 {
1166         int want_next = (tlock->off != 0);
1167
1168         /* No traversal allows if you've called tdb_lockkeys() */
1169         if (tdb->lockedkeys)
1170                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1171
1172         /* Lock each chain from the start one. */
1173         for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
1174                 if (tdb_lock(tdb, tlock->hash, F_WRLCK) == -1)
1175                         return -1;
1176
1177                 /* No previous record?  Start at top of chain. */
1178                 if (!tlock->off) {
1179                         if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
1180                                      &tlock->off) == -1)
1181                                 goto fail;
1182                 } else {
1183                         /* Otherwise unlock the previous record. */
1184                         if (unlock_record(tdb, tlock->off) != 0)
1185                                 goto fail;
1186                 }
1187
1188                 if (want_next) {
1189                         /* We have offset of old record: grab next */
1190                         if (rec_read(tdb, tlock->off, rec) == -1)
1191                                 goto fail;
1192                         tlock->off = rec->next;
1193                 }
1194
1195                 /* Iterate through chain */
1196                 while( tlock->off) {
1197                         tdb_off current;
1198                         if (rec_read(tdb, tlock->off, rec) == -1)
1199                                 goto fail;
1200                         if (!TDB_DEAD(rec)) {
1201                                 /* Woohoo: we found one! */
1202                                 if (lock_record(tdb, tlock->off) != 0)
1203                                         goto fail;
1204                                 return tlock->off;
1205                         }
1206                         /* Try to clean dead ones from old traverses */
1207                         current = tlock->off;
1208                         tlock->off = rec->next;
1209                         if (do_delete(tdb, current, rec) != 0)
1210                                 goto fail;
1211                 }
1212                 tdb_unlock(tdb, tlock->hash, F_WRLCK);
1213                 want_next = 0;
1214         }
1215         /* We finished iteration without finding anything */
1216         return TDB_ERRCODE(TDB_SUCCESS, 0);
1217
1218  fail:
1219         tlock->off = 0;
1220         if (tdb_unlock(tdb, tlock->hash, F_WRLCK) != 0)
1221                 TDB_LOG((tdb, 0, "tdb_next_lock: On error unlock failed!\n"));
1222         return -1;
1223 }
1224
1225 /* traverse the entire database - calling fn(tdb, key, data) on each element.
1226    return -1 on error or the record count traversed
1227    if fn is NULL then it is not called
1228    a non-zero return value from fn() indicates that the traversal should stop
1229   */
1230 int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *state)
1231 {
1232         TDB_DATA key, dbuf;
1233         struct list_struct rec;
1234         struct tdb_traverse_lock tl = { NULL, 0, 0 };
1235         int ret, count = 0;
1236
1237         /* This was in the initializaton, above, but the IRIX compiler
1238          * did not like it.  crh
1239          */
1240         tl.next = tdb->travlocks.next;
1241
1242         /* fcntl locks don't stack: beware traverse inside traverse */
1243         tdb->travlocks.next = &tl;
1244
1245         /* tdb_next_lock places locks on the record returned, and its chain */
1246         while ((ret = tdb_next_lock(tdb, &tl, &rec)) > 0) {
1247                 count++;
1248                 /* now read the full record */
1249                 key.dptr = tdb_alloc_read(tdb, tl.off + sizeof(rec),
1250                                           rec.key_len + rec.data_len);
1251                 if (!key.dptr) {
1252                         ret = -1;
1253                         if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0)
1254                                 goto out;
1255                         if (unlock_record(tdb, tl.off) != 0)
1256                                 TDB_LOG((tdb, 0, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
1257                         goto out;
1258                 }
1259                 key.dsize = rec.key_len;
1260                 dbuf.dptr = key.dptr + rec.key_len;
1261                 dbuf.dsize = rec.data_len;
1262
1263                 /* Drop chain lock, call out */
1264                 if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0) {
1265                         ret = -1;
1266                         goto out;
1267                 }
1268                 if (fn && fn(tdb, key, dbuf, state)) {
1269                         /* They want us to terminate traversal */
1270                         ret = count;
1271                         if (unlock_record(tdb, tl.off) != 0) {
1272                                 TDB_LOG((tdb, 0, "tdb_traverse: unlock_record failed!\n"));;
1273                                 ret = -1;
1274                         }
1275                         tdb->travlocks.next = tl.next;
1276                         SAFE_FREE(key.dptr);
1277                         return count;
1278                 }
1279                 SAFE_FREE(key.dptr);
1280         }
1281 out:
1282         tdb->travlocks.next = tl.next;
1283         if (ret < 0)
1284                 return -1;
1285         else
1286                 return count;
1287 }
1288
1289 /* find the first entry in the database and return its key */
1290 TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
1291 {
1292         TDB_DATA key;
1293         struct list_struct rec;
1294
1295         /* release any old lock */
1296         if (unlock_record(tdb, tdb->travlocks.off) != 0)
1297                 return tdb_null;
1298         tdb->travlocks.off = tdb->travlocks.hash = 0;
1299
1300         if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
1301                 return tdb_null;
1302         /* now read the key */
1303         key.dsize = rec.key_len;
1304         key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
1305         if (tdb_unlock(tdb, BUCKET(tdb->travlocks.hash), F_WRLCK) != 0)
1306                 TDB_LOG((tdb, 0, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
1307         return key;
1308 }
1309
1310 /* find the next entry in the database, returning its key */
1311 TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA oldkey)
1312 {
1313         u32 oldhash;
1314         TDB_DATA key = tdb_null;
1315         struct list_struct rec;
1316         char *k = NULL;
1317
1318         /* Is locked key the old key?  If so, traverse will be reliable. */
1319         if (tdb->travlocks.off) {
1320                 if (tdb_lock(tdb,tdb->travlocks.hash,F_WRLCK))
1321                         return tdb_null;
1322                 if (rec_read(tdb, tdb->travlocks.off, &rec) == -1
1323                     || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
1324                                             rec.key_len))
1325                     || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
1326                         /* No, it wasn't: unlock it and start from scratch */
1327                         if (unlock_record(tdb, tdb->travlocks.off) != 0)
1328                                 return tdb_null;
1329                         if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1330                                 return tdb_null;
1331                         tdb->travlocks.off = 0;
1332                 }
1333
1334                 SAFE_FREE(k);
1335         }
1336
1337         if (!tdb->travlocks.off) {
1338                 /* No previous element: do normal find, and lock record */
1339                 tdb->travlocks.off = tdb_find_lock(tdb, oldkey, F_WRLCK, &rec);
1340                 if (!tdb->travlocks.off)
1341                         return tdb_null;
1342                 tdb->travlocks.hash = BUCKET(rec.full_hash);
1343                 if (lock_record(tdb, tdb->travlocks.off) != 0) {
1344                         TDB_LOG((tdb, 0, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
1345                         return tdb_null;
1346                 }
1347         }
1348         oldhash = tdb->travlocks.hash;
1349
1350         /* Grab next record: locks chain and returned record,
1351            unlocks old record */
1352         if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
1353                 key.dsize = rec.key_len;
1354                 key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
1355                                           key.dsize);
1356                 /* Unlock the chain of this new record */
1357                 if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1358                         TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1359         }
1360         /* Unlock the chain of old record */
1361         if (tdb_unlock(tdb, BUCKET(oldhash), F_WRLCK) != 0)
1362                 TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1363         return key;
1364 }
1365
1366 /* delete an entry in the database given a key */
1367 int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
1368 {
1369         tdb_off rec_ptr;
1370         struct list_struct rec;
1371         int ret;
1372
1373         if (!(rec_ptr = tdb_find_lock(tdb, key, F_WRLCK, &rec)))
1374                 return -1;
1375         ret = do_delete(tdb, rec_ptr, &rec);
1376         if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
1377                 TDB_LOG((tdb, 0, "tdb_delete: WARNING tdb_unlock failed!\n"));
1378         return ret;
1379 }
1380
1381 /* store an element in the database, replacing any existing element
1382    with the same key
1383
1384    return 0 on success, -1 on failure
1385 */
1386 int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
1387 {
1388         struct list_struct rec;
1389         u32 hash;
1390         tdb_off rec_ptr;
1391         char *p = NULL;
1392         int ret = 0;
1393
1394         /* find which hash bucket it is in */
1395         hash = tdb_hash(&key);
1396         if (!tdb_keylocked(tdb, hash))
1397                 return -1;
1398         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1399                 return -1;
1400
1401         /* check for it existing, on insert. */
1402         if (flag == TDB_INSERT) {
1403                 if (tdb_exists(tdb, key)) {
1404                         tdb->ecode = TDB_ERR_EXISTS;
1405                         goto fail;
1406                 }
1407         } else {
1408                 /* first try in-place update, on modify or replace. */
1409                 if (tdb_update(tdb, key, dbuf) == 0)
1410                         goto out;
1411                 if (flag == TDB_MODIFY && tdb->ecode == TDB_ERR_NOEXIST)
1412                         goto fail;
1413         }
1414         /* reset the error code potentially set by the tdb_update() */
1415         tdb->ecode = TDB_SUCCESS;
1416
1417         /* delete any existing record - if it doesn't exist we don't
1418            care.  Doing this first reduces fragmentation, and avoids
1419            coalescing with `allocated' block before it's updated. */
1420         if (flag != TDB_INSERT)
1421                 tdb_delete(tdb, key);
1422
1423         /* Copy key+value *before* allocating free space in case malloc
1424            fails and we are left with a dead spot in the tdb. */
1425
1426         if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
1427                 tdb->ecode = TDB_ERR_OOM;
1428                 goto fail;
1429         }
1430
1431         memcpy(p, key.dptr, key.dsize);
1432         memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
1433
1434         /* now we're into insert / modify / replace of a record which
1435          * we know could not be optimised by an in-place store (for
1436          * various reasons).  */
1437         if (!(rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec)))
1438                 goto fail;
1439
1440         /* Read hash top into next ptr */
1441         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1442                 goto fail;
1443
1444         rec.key_len = key.dsize;
1445         rec.data_len = dbuf.dsize;
1446         rec.full_hash = hash;
1447         rec.magic = TDB_MAGIC;
1448
1449         /* write out and point the top of the hash chain at it */
1450         if (rec_write(tdb, rec_ptr, &rec) == -1
1451             || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
1452             || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1453                 /* Need to tdb_unallocate() here */
1454                 goto fail;
1455         }
1456  out:
1457         SAFE_FREE(p);
1458         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1459         return ret;
1460 fail:
1461         ret = -1;
1462         goto out;
1463 }
1464
1465 static int tdb_already_open(dev_t device,
1466                             ino_t ino)
1467 {
1468         TDB_CONTEXT *i;
1469
1470         for (i = tdbs; i; i = i->next) {
1471                 if (i->device == device && i->inode == ino) {
1472                         return 1;
1473                 }
1474         }
1475
1476         return 0;
1477 }
1478
1479 /* open the database, creating it if necessary
1480
1481    The open_flags and mode are passed straight to the open call on the
1482    database file. A flags value of O_WRONLY is invalid. The hash size
1483    is advisory, use zero for a default value.
1484
1485    Return is NULL on error, in which case errno is also set.  Don't
1486    try to call tdb_error or tdb_errname, just do strerror(errno).
1487
1488    @param name may be NULL for internal databases. */
1489 TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
1490                       int open_flags, mode_t mode)
1491 {
1492         return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL);
1493 }
1494
1495
1496 TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
1497                          int open_flags, mode_t mode,
1498                          tdb_log_func log_fn)
1499 {
1500         TDB_CONTEXT *tdb;
1501         struct stat st;
1502         int rev = 0, locked;
1503         unsigned char *vp;
1504         u32 vertest;
1505
1506         if (!(tdb = calloc(1, sizeof *tdb))) {
1507                 /* Can't log this */
1508                 errno = ENOMEM;
1509                 goto fail;
1510         }
1511         tdb->fd = -1;
1512         tdb->name = NULL;
1513         tdb->map_ptr = NULL;
1514         tdb->lockedkeys = NULL;
1515         tdb->flags = tdb_flags;
1516         tdb->open_flags = open_flags;
1517         tdb->log_fn = log_fn;
1518
1519         if ((open_flags & O_ACCMODE) == O_WRONLY) {
1520                 TDB_LOG((tdb, 0, "tdb_open_ex: can't open tdb %s write-only\n",
1521                          name));
1522                 errno = EINVAL;
1523                 goto fail;
1524         }
1525
1526         if (hash_size == 0)
1527                 hash_size = DEFAULT_HASH_SIZE;
1528         if ((open_flags & O_ACCMODE) == O_RDONLY) {
1529                 tdb->read_only = 1;
1530                 /* read only databases don't do locking or clear if first */
1531                 tdb->flags |= TDB_NOLOCK;
1532                 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1533         }
1534
1535         /* internal databases don't mmap or lock, and start off cleared */
1536         if (tdb->flags & TDB_INTERNAL) {
1537                 tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
1538                 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1539                 if (tdb_new_database(tdb, hash_size) != 0) {
1540                         TDB_LOG((tdb, 0, "tdb_open_ex: tdb_new_database failed!"));
1541                         goto fail;
1542                 }
1543                 goto internal;
1544         }
1545
1546         if ((tdb->fd = open(name, open_flags, mode)) == -1) {
1547                 TDB_LOG((tdb, 5, "tdb_open_ex: could not open file %s: %s\n",
1548                          name, strerror(errno)));
1549                 goto fail;      /* errno set by open(2) */
1550         }
1551
1552         /* ensure there is only one process initialising at once */
1553         if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0) == -1) {
1554                 TDB_LOG((tdb, 0, "tdb_open_ex: failed to get global lock on %s: %s\n",
1555                          name, strerror(errno)));
1556                 goto fail;      /* errno set by tdb_brlock */
1557         }
1558
1559         /* we need to zero database if we are the only one with it open */
1560         if ((locked = (tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0) == 0))
1561             && (tdb_flags & TDB_CLEAR_IF_FIRST)) {
1562                 open_flags |= O_CREAT;
1563                 if (ftruncate(tdb->fd, 0) == -1) {
1564                         TDB_LOG((tdb, 0, "tdb_open_ex: "
1565                                  "failed to truncate %s: %s\n",
1566                                  name, strerror(errno)));
1567                         goto fail; /* errno set by ftruncate */
1568                 }
1569         }
1570
1571         if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
1572             || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
1573             || (tdb->header.version != TDB_VERSION
1574                 && !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
1575                 /* its not a valid database - possibly initialise it */
1576                 if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
1577                         errno = EIO; /* ie bad format or something */
1578                         goto fail;
1579                 }
1580                 rev = (tdb->flags & TDB_CONVERT);
1581         }
1582         vp = (unsigned char *)&tdb->header.version;
1583         vertest = (((u32)vp[0]) << 24) | (((u32)vp[1]) << 16) |
1584                   (((u32)vp[2]) << 8) | (u32)vp[3];
1585         tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
1586         if (!rev)
1587                 tdb->flags &= ~TDB_CONVERT;
1588         else {
1589                 tdb->flags |= TDB_CONVERT;
1590                 convert(&tdb->header, sizeof(tdb->header));
1591         }
1592         if (fstat(tdb->fd, &st) == -1)
1593                 goto fail;
1594
1595         /* Is it already in the open list?  If so, fail. */
1596         if (tdb_already_open(st.st_dev, st.st_ino)) {
1597                 TDB_LOG((tdb, 2, "tdb_open_ex: "
1598                          "%s (%d,%d) is already open in this process\n",
1599                          name, st.st_dev, st.st_ino));
1600                 errno = EBUSY;
1601                 goto fail;
1602         }
1603
1604         if (!(tdb->name = (char *)strdup(name))) {
1605                 errno = ENOMEM;
1606                 goto fail;
1607         }
1608
1609         tdb->map_size = st.st_size;
1610         tdb->device = st.st_dev;
1611         tdb->inode = st.st_ino;
1612         tdb->locked = calloc(tdb->header.hash_size+1, sizeof(tdb->locked[0]));
1613         if (!tdb->locked) {
1614                 TDB_LOG((tdb, 2, "tdb_open_ex: "
1615                          "failed to allocate lock structure for %s\n",
1616                          name));
1617                 errno = ENOMEM;
1618                 goto fail;
1619         }
1620         tdb_mmap(tdb);
1621         if (locked) {
1622                 if (!tdb->read_only)
1623                         if (tdb_clear_spinlocks(tdb) != 0) {
1624                                 TDB_LOG((tdb, 0, "tdb_open_ex: "
1625                                 "failed to clear spinlock\n"));
1626                                 goto fail;
1627                         }
1628                 if (tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0) == -1) {
1629                         TDB_LOG((tdb, 0, "tdb_open_ex: "
1630                                  "failed to take ACTIVE_LOCK on %s: %s\n",
1631                                  name, strerror(errno)));
1632                         goto fail;
1633                 }
1634         }
1635         /* leave this lock in place to indicate it's in use */
1636         if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)
1637                 goto fail;
1638
1639  internal:
1640         /* Internal (memory-only) databases skip all the code above to
1641          * do with disk files, and resume here by releasing their
1642          * global lock and hooking into the active list. */
1643         if (tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0) == -1)
1644                 goto fail;
1645         tdb->next = tdbs;
1646         tdbs = tdb;
1647         return tdb;
1648
1649  fail:
1650         { int save_errno = errno;
1651
1652         if (!tdb)
1653                 return NULL;
1654
1655         if (tdb->map_ptr) {
1656                 if (tdb->flags & TDB_INTERNAL)
1657                         SAFE_FREE(tdb->map_ptr);
1658                 else
1659                         tdb_munmap(tdb);
1660         }
1661         SAFE_FREE(tdb->name);
1662         if (tdb->fd != -1)
1663                 if (close(tdb->fd) != 0)
1664                         TDB_LOG((tdb, 5, "tdb_open_ex: failed to close tdb->fd on error!\n"));
1665         SAFE_FREE(tdb->locked);
1666         SAFE_FREE(tdb);
1667         errno = save_errno;
1668         return NULL;
1669         }
1670 }
1671
1672 /* close a database */
1673 int tdb_close(TDB_CONTEXT *tdb)
1674 {
1675         TDB_CONTEXT **i;
1676         int ret = 0;
1677
1678         if (tdb->map_ptr) {
1679                 if (tdb->flags & TDB_INTERNAL)
1680                         SAFE_FREE(tdb->map_ptr);
1681                 else
1682                         tdb_munmap(tdb);
1683         }
1684         SAFE_FREE(tdb->name);
1685         if (tdb->fd != -1)
1686                 ret = close(tdb->fd);
1687         SAFE_FREE(tdb->locked);
1688         SAFE_FREE(tdb->lockedkeys);
1689
1690         /* Remove from contexts list */
1691         for (i = &tdbs; *i; i = &(*i)->next) {
1692                 if (*i == tdb) {
1693                         *i = tdb->next;
1694                         break;
1695                 }
1696         }
1697
1698         memset(tdb, 0, sizeof(*tdb));
1699         SAFE_FREE(tdb);
1700
1701         return ret;
1702 }
1703
1704 /* lock/unlock entire database */
1705 int tdb_lockall(TDB_CONTEXT *tdb)
1706 {
1707         u32 i;
1708
1709         /* There are no locks on read-only dbs */
1710         if (tdb->read_only)
1711                 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
1712         if (tdb->lockedkeys)
1713                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1714         for (i = 0; i < tdb->header.hash_size; i++)
1715                 if (tdb_lock(tdb, i, F_WRLCK))
1716                         break;
1717
1718         /* If error, release locks we have... */
1719         if (i < tdb->header.hash_size) {
1720                 u32 j;
1721
1722                 for ( j = 0; j < i; j++)
1723                         tdb_unlock(tdb, j, F_WRLCK);
1724                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1725         }
1726
1727         return 0;
1728 }
1729 void tdb_unlockall(TDB_CONTEXT *tdb)
1730 {
1731         u32 i;
1732         for (i=0; i < tdb->header.hash_size; i++)
1733                 tdb_unlock(tdb, i, F_WRLCK);
1734 }
1735
1736 int tdb_lockkeys(TDB_CONTEXT *tdb, u32 number, TDB_DATA keys[])
1737 {
1738         u32 i, j, hash;
1739
1740         /* Can't lock more keys if already locked */
1741         if (tdb->lockedkeys)
1742                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1743         if (!(tdb->lockedkeys = malloc(sizeof(u32) * (number+1))))
1744                 return TDB_ERRCODE(TDB_ERR_OOM, -1);
1745         /* First number in array is # keys */
1746         tdb->lockedkeys[0] = number;
1747
1748         /* Insertion sort by bucket */
1749         for (i = 0; i < number; i++) {
1750                 hash = tdb_hash(&keys[i]);
1751                 for (j = 0; j < i && BUCKET(tdb->lockedkeys[j+1]) < BUCKET(hash); j++);
1752                         memmove(&tdb->lockedkeys[j+2], &tdb->lockedkeys[j+1], sizeof(u32) * (i-j));
1753                 tdb->lockedkeys[j+1] = hash;
1754         }
1755         /* Finally, lock in order */
1756         for (i = 0; i < number; i++)
1757                 if (tdb_lock(tdb, i, F_WRLCK))
1758                         break;
1759
1760         /* If error, release locks we have... */
1761         if (i < number) {
1762                 for ( j = 0; j < i; j++)
1763                         tdb_unlock(tdb, j, F_WRLCK);
1764                 SAFE_FREE(tdb->lockedkeys);
1765                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1766         }
1767         return 0;
1768 }
1769
1770 /* Unlock the keys previously locked by tdb_lockkeys() */
1771 void tdb_unlockkeys(TDB_CONTEXT *tdb)
1772 {
1773         u32 i;
1774         for (i = 0; i < tdb->lockedkeys[0]; i++)
1775                 tdb_unlock(tdb, tdb->lockedkeys[i+1], F_WRLCK);
1776         SAFE_FREE(tdb->lockedkeys);
1777 }
1778
1779 /* lock/unlock one hash chain. This is meant to be used to reduce
1780    contention - it cannot guarantee how many records will be locked */
1781 int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key)
1782 {
1783         return tdb_lock(tdb, BUCKET(tdb_hash(&key)), F_WRLCK);
1784 }
1785
1786 int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key)
1787 {
1788         return tdb_unlock(tdb, BUCKET(tdb_hash(&key)), F_WRLCK);
1789 }
1790
1791
1792 /* register a loging function */
1793 void tdb_logging_function(TDB_CONTEXT *tdb, void (*fn)(TDB_CONTEXT *, int , const char *, ...))
1794 {
1795         tdb->log_fn = fn;
1796 }
1797
1798
1799 /* reopen a tdb - this is used after a fork to ensure that we have an independent
1800    seek pointer from our parent and to re-establish locks */
1801 int tdb_reopen(TDB_CONTEXT *tdb)
1802 {
1803         struct stat st;
1804
1805         if (tdb_munmap(tdb) != 0) {
1806                 TDB_LOG((tdb, 0, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
1807                 goto fail;
1808         }
1809         if (close(tdb->fd) != 0)
1810                 TDB_LOG((tdb, 0, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
1811         tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
1812         if (tdb->fd == -1) {
1813                 TDB_LOG((tdb, 0, "tdb_reopen: open failed (%s)\n", strerror(errno)));
1814                 goto fail;
1815         }
1816         if (fstat(tdb->fd, &st) != 0) {
1817                 TDB_LOG((tdb, 0, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
1818                 goto fail;
1819         }
1820         if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
1821                 TDB_LOG((tdb, 0, "tdb_reopen: file dev/inode has changed!\n"));
1822                 goto fail;
1823         }
1824         tdb_mmap(tdb);
1825         if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1) {
1826                 TDB_LOG((tdb, 0, "tdb_reopen: failed to obtain active lock\n"));
1827                 goto fail;
1828         }
1829
1830         return 0;
1831
1832 fail:
1833         tdb_close(tdb);
1834         return -1;
1835 }
1836
1837 /* reopen all tdb's */
1838 int tdb_reopen_all(void)
1839 {
1840         TDB_CONTEXT *tdb;
1841
1842         for (tdb=tdbs; tdb; tdb = tdb->next) {
1843                 if (tdb_reopen(tdb) != 0) return -1;
1844         }
1845
1846         return 0;
1847 }