source/tdb/tdb.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3    Samba database functions
   4    Copyright (C) Andrew Tridgell              1999-2000
   5    Copyright (C) Luke Kenneth Casson Leighton      2000
   6    Copyright (C) Paul `Rusty' Russell              2000
   7    Copyright (C) Jeremy Allison                    2000-2003
   8
   9    This program is free software; you can redistribute it and/or modify
  10    it under the terms of the GNU General Public License as published by
  11    the Free Software Foundation; either version 2 of the License, or
  12    (at your option) any later version.
  13
  14    This program is distributed in the hope that it will be useful,
  15    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17    GNU General Public License for more details.
  18
  19    You should have received a copy of the GNU General Public License
  20    along with this program; if not, write to the Free Software
  21    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  22 */
  23 #ifdef STANDALONE
  24 #if HAVE_CONFIG_H
  25 #include <config.h>
  26 #endif
  27
  28 #include <stdlib.h>
  29 #include <stdio.h>
  30 #include <fcntl.h>
  31 #include <unistd.h>
  32 #include <string.h>
  33 #include <fcntl.h>
  34 #include <errno.h>
  35 #include <sys/mman.h>
  36 #include <sys/stat.h>
  37 #include <signal.h>
  38 #include "tdb.h"
  39 #include "spinlock.h"
  40 #else
  41 #include "includes.h"
  42 #endif
  43
/* magic string; presumably stored at the start of the file header - its
   use is not visible in this part of the file */
#define TDB_MAGIC_FOOD "TDB file\n"
/* version number (presumably of the on-disk format - TODO confirm) */
#define TDB_VERSION (0x26011967 + 6)
/* magic value of a record that is in use (set by tdb_allocate, accepted
   by rec_read) */
#define TDB_MAGIC (0x26011999U)
/* magic value of a record that sits on the free list */
#define TDB_FREE_MAGIC (~TDB_MAGIC)
/* second magic value accepted by rec_read; presumably marks a record that
   was deleted but not yet returned to the free list - TODO confirm */
#define TDB_DEAD_MAGIC (0xFEE1DEAD)
/* tdb_allocate rounds the length of a split-off record up to a multiple
   of this */
#define TDB_ALIGNMENT 4
/* tdb_allocate only splits a free record when more than this many bytes
   would be left over */
#define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
/* not used in the visible code; presumably the hash table size used when
   the caller does not supply one */
#define DEFAULT_HASH_SIZE 131
/* tdb_expand rounds the database size up to a multiple of this */
#define TDB_PAGE_SIZE 0x2000
/* file offset of the free list head pointer: directly after the header */
#define FREELIST_TOP (sizeof(struct tdb_header))
/* round x up to the next multiple of a; only correct when a is a power
   of two */
#define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
/* reverse the byte order of a 32 bit value; x is evaluated several times,
   so it must not have side effects */
#define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
/* true if the record header r carries the dead magic */
#define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
/* true if the record header r carries neither the in-use nor the dead
   magic */
#define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
/* file offset of the head pointer of the chain that "hash" falls into;
   the chain heads are tdb_off slots following the free list head.
   Needs a local variable called "tdb" (via BUCKET) */
#define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))
  59
  60 /* NB assumes there is a local variable called "tdb" that is the
  61  * current context, also takes doubly-parenthesized print-style
  62  * argument. */
  63 #define TDB_LOG(x) (tdb->log_fn?((tdb->log_fn x),0) : 0)
  64
  65 /* lock offsets */
  66 #define GLOBAL_LOCK 0
  67 #define ACTIVE_LOCK 4
  68
  69 #ifndef MAP_FILE
  70 #define MAP_FILE 0
  71 #endif
  72
  73 #ifndef MAP_FAILED
  74 #define MAP_FAILED ((void *)-1)
  75 #endif
  76
  77 /* free memory if the pointer is valid and zero the pointer */
  78 #ifndef SAFE_FREE
  79 #define SAFE_FREE(x) do { if ((x) != NULL) {free((x)); (x)=NULL;} } while(0)
  80 #endif
  81
  82 #define BUCKET(hash) ((hash) % tdb->header.hash_size)
  83 TDB_DATA tdb_null;
  84
  85 /* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
  86 static TDB_CONTEXT *tdbs = NULL;
  87
  88 static int tdb_munmap(TDB_CONTEXT *tdb)
  89 {
  90         if (tdb->flags & TDB_INTERNAL)
  91                 return 0;
  92
  93 #ifdef HAVE_MMAP
  94         if (tdb->map_ptr) {
  95                 int ret = munmap(tdb->map_ptr, tdb->map_size);
  96                 if (ret != 0)
  97                         return ret;
  98         }
  99 #endif
 100         tdb->map_ptr = NULL;
 101         return 0;
 102 }
 103
 104 static void tdb_mmap(TDB_CONTEXT *tdb)
 105 {
 106         if (tdb->flags & TDB_INTERNAL)
 107                 return;
 108
 109 #ifdef HAVE_MMAP
 110         if (!(tdb->flags & TDB_NOMMAP)) {
 111                 tdb->map_ptr = mmap(NULL, tdb->map_size,
 112                                     PROT_READ|(tdb->read_only? 0:PROT_WRITE),
 113                                     MAP_SHARED|MAP_FILE, tdb->fd, 0);
 114
 115                 /*
 116                  * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
 117                  */
 118
 119                 if (tdb->map_ptr == MAP_FAILED) {
 120                         tdb->map_ptr = NULL;
 121                         TDB_LOG((tdb, 2, "tdb_mmap failed for size %d (%s)\n",
 122                                  tdb->map_size, strerror(errno)));
 123                 }
 124         } else {
 125                 tdb->map_ptr = NULL;
 126         }
 127 #else
 128         tdb->map_ptr = NULL;
 129 #endif
 130 }
 131
 132 /* Endian conversion: we only ever deal with 4 byte quantities */
 133 static void *convert(void *buf, u32 size)
 134 {
 135         u32 i, *p = buf;
 136         for (i = 0; i < size / 4; i++)
 137                 p[i] = TDB_BYTEREV(p[i]);
 138         return buf;
 139 }
 140 #define DOCONV() (tdb->flags & TDB_CONVERT)
 141 #define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)
 142
 143 /* the body of the database is made of one list_struct for the free space
 144    plus a separate data list for each hash value */
 145 struct list_struct {
 146         tdb_off next; /* offset of the next record in the list */
 147         tdb_len rec_len; /* total byte length of record */
 148         tdb_len key_len; /* byte length of key */
 149         tdb_len data_len; /* byte length of data */
 150         u32 full_hash; /* the full 32 bit hash of the key */
 151         u32 magic;   /* try to catch errors */
 152         /* the following union is implied:
 153                 union {
 154                         char record[rec_len];
 155                         struct {
 156                                 char key[key_len];
 157                                 char data[data_len];
 158                         }
 159                         u32 totalsize; (tailer)
 160                 }
 161         */
 162 };
 163
 164 /***************************************************************
 165  Allow a caller to set a "alarm" flag that tdb can check to abort
 166  a blocking lock on SIGALRM.
 167 ***************************************************************/
 168
 169 static sig_atomic_t *palarm_fired;
 170
 171 void tdb_set_lock_alarm(sig_atomic_t *palarm)
 172 {
 173         palarm_fired = palarm;
 174 }
 175
 176 /* a byte range locking function - return 0 on success
 177    this functions locks/unlocks 1 byte at the specified offset.
 178
 179    On error, errno is also set so that errors are passed back properly
 180    through tdb_open(). */
 181 static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset,
 182                       int rw_type, int lck_type, int probe)
 183 {
 184         struct flock fl;
 185         int ret;
 186
 187         if (tdb->flags & TDB_NOLOCK)
 188                 return 0;
 189         if ((rw_type == F_WRLCK) && (tdb->read_only)) {
 190                 errno = EACCES;
 191                 return -1;
 192         }
 193
 194         fl.l_type = rw_type;
 195         fl.l_whence = SEEK_SET;
 196         fl.l_start = offset;
 197         fl.l_len = 1;
 198         fl.l_pid = 0;
 199
 200         do {
 201                 ret = fcntl(tdb->fd,lck_type,&fl);
 202                 if (ret == -1 && errno == EINTR && palarm_fired && *palarm_fired)
 203                         break;
 204         } while (ret == -1 && errno == EINTR);
 205
 206         if (ret == -1) {
 207                 if (!probe && lck_type != F_SETLK) {
 208                         TDB_LOG((tdb, 5,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
 209                                  tdb->fd, offset, rw_type, lck_type));
 210                 }
 211                 /* Was it an alarm timeout ? */
 212                 if (errno == EINTR && palarm_fired && *palarm_fired)
 213                         return TDB_ERRCODE(TDB_ERR_LOCK_TIMEOUT, -1);
 214                 /* Otherwise - generic lock error. */
 215                 /* errno set by fcntl */
 216                 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
 217         }
 218         return 0;
 219 }
 220
 221 /* lock a list in the database. list -1 is the alloc list */
 222 static int tdb_lock(TDB_CONTEXT *tdb, int list, int ltype)
 223 {
 224         if (list < -1 || list >= (int)tdb->header.hash_size) {
 225                 TDB_LOG((tdb, 0,"tdb_lock: invalid list %d for ltype=%d\n",
 226                            list, ltype));
 227                 return -1;
 228         }
 229         if (tdb->flags & TDB_NOLOCK)
 230                 return 0;
 231
 232         /* Since fcntl locks don't nest, we do a lock for the first one,
 233            and simply bump the count for future ones */
 234         if (tdb->locked[list+1].count == 0) {
 235                 if (!tdb->read_only && tdb->header.rwlocks) {
 236                         if (tdb_spinlock(tdb, list, ltype)) {
 237                                 TDB_LOG((tdb, 0, "tdb_lock spinlock failed on list ltype=%d\n",
 238                                            list, ltype));
 239                                 return -1;
 240                         }
 241                 } else if (tdb_brlock(tdb,FREELIST_TOP+4*list,ltype,F_SETLKW, 0)) {
 242                         TDB_LOG((tdb, 0,"tdb_lock failed on list %d ltype=%d (%s)\n",
 243                                            list, ltype, strerror(errno)));
 244                         return -1;
 245                 }
 246                 tdb->locked[list+1].ltype = ltype;
 247         }
 248         tdb->locked[list+1].count++;
 249         return 0;
 250 }
 251
 252 /* unlock the database: returns void because it's too late for errors. */
 253         /* changed to return int it may be interesting to know there
 254            has been an error  --simo */
 255 static int tdb_unlock(TDB_CONTEXT *tdb, int list, int ltype)
 256 {
 257         int ret = -1;
 258
 259         if (tdb->flags & TDB_NOLOCK)
 260                 return 0;
 261
 262         /* Sanity checks */
 263         if (list < -1 || list >= (int)tdb->header.hash_size) {
 264                 TDB_LOG((tdb, 0, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
 265                 return ret;
 266         }
 267
 268         if (tdb->locked[list+1].count==0) {
 269                 TDB_LOG((tdb, 0, "tdb_unlock: count is 0\n"));
 270                 return ret;
 271         }
 272
 273         if (tdb->locked[list+1].count == 1) {
 274                 /* Down to last nested lock: unlock underneath */
 275                 if (!tdb->read_only && tdb->header.rwlocks) {
 276                         ret = tdb_spinunlock(tdb, list, ltype);
 277                 } else {
 278                         ret = tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK, F_SETLKW, 0);
 279                 }
 280         } else {
 281                 ret = 0;
 282         }
 283         tdb->locked[list+1].count--;
 284
 285         if (ret)
 286                 TDB_LOG((tdb, 0,"tdb_unlock: An error occurred unlocking!\n"));
 287         return ret;
 288 }
 289
 290 /* This is based on the hash algorithm from gdbm */
 291 static u32 tdb_hash(TDB_DATA *key)
 292 {
 293         u32 value;      /* Used to compute the hash value.  */
 294         u32   i;        /* Used to cycle through random values. */
 295
 296         /* Set the initial value from the key size. */
 297         for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
 298                 value = (value + (key->dptr[i] << (i*5 % 24)));
 299
 300         return (1103515243 * value + 12345);
 301 }
 302
 303 /* check for an out of bounds access - if it is out of bounds then
 304    see if the database has been expanded by someone else and expand
 305    if necessary
 306    note that "len" is the minimum length needed for the db
 307 */
 308 static int tdb_oob(TDB_CONTEXT *tdb, tdb_off len, int probe)
 309 {
 310         struct stat st;
 311         if (len <= tdb->map_size)
 312                 return 0;
 313         if (tdb->flags & TDB_INTERNAL) {
 314                 if (!probe) {
 315                         TDB_LOG((tdb, 0,"tdb_oob len %d beyond internal malloc size %d\n",
 316                                  (int)len, (int)tdb->map_size));
 317                 }
 318                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 319         }
 320
 321         if (fstat(tdb->fd, &st) == -1)
 322                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 323
 324         if (st.st_size < (size_t)len) {
 325                 if (!probe) {
 326                         TDB_LOG((tdb, 0,"tdb_oob len %d beyond eof at %d\n",
 327                                  (int)len, (int)st.st_size));
 328                 }
 329                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 330         }
 331
 332         /* Unmap, update size, remap */
 333         if (tdb_munmap(tdb) == -1)
 334                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 335         tdb->map_size = st.st_size;
 336         tdb_mmap(tdb);
 337         return 0;
 338 }
 339
 340 /* write a lump of data at a specified offset */
 341 static int tdb_write(TDB_CONTEXT *tdb, tdb_off off, void *buf, tdb_len len)
 342 {
 343         if (tdb_oob(tdb, off + len, 0) != 0)
 344                 return -1;
 345
 346         if (tdb->map_ptr)
 347                 memcpy(off + (char *)tdb->map_ptr, buf, len);
 348 #ifdef HAVE_PWRITE
 349         else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
 350 #else
 351         else if (lseek(tdb->fd, off, SEEK_SET) != off
 352                  || write(tdb->fd, buf, len) != (ssize_t)len) {
 353 #endif
 354                 TDB_LOG((tdb, 0,"tdb_write failed at %d len=%d (%s)\n",
 355                            off, len, strerror(errno)));
 356                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 357         }
 358         return 0;
 359 }
 360
 361 /* read a lump of data at a specified offset, maybe convert */
 362 static int tdb_read(TDB_CONTEXT *tdb,tdb_off off,void *buf,tdb_len len,int cv)
 363 {
 364         if (tdb_oob(tdb, off + len, 0) != 0)
 365                 return -1;
 366
 367         if (tdb->map_ptr)
 368                 memcpy(buf, off + (char *)tdb->map_ptr, len);
 369 #ifdef HAVE_PREAD
 370         else if (pread(tdb->fd, buf, len, off) != (ssize_t)len) {
 371 #else
 372         else if (lseek(tdb->fd, off, SEEK_SET) != off
 373                  || read(tdb->fd, buf, len) != (ssize_t)len) {
 374 #endif
 375                 TDB_LOG((tdb, 0,"tdb_read failed at %d len=%d (%s)\n",
 376                            off, len, strerror(errno)));
 377                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 378         }
 379         if (cv)
 380                 convert(buf, len);
 381         return 0;
 382 }
 383
 384 /* read a lump of data, allocating the space for it */
 385 static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
 386 {
 387         char *buf;
 388
 389         if (!(buf = malloc(len))) {
 390                 TDB_LOG((tdb, 0,"tdb_alloc_read malloc failed len=%d (%s)\n",
 391                            len, strerror(errno)));
 392                 return TDB_ERRCODE(TDB_ERR_OOM, buf);
 393         }
 394         if (tdb_read(tdb, offset, buf, len, 0) == -1) {
 395                 SAFE_FREE(buf);
 396                 return NULL;
 397         }
 398         return buf;
 399 }
 400
 401 /* read/write a tdb_off */
 402 static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
 403 {
 404         return tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
 405 }
 406 static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
 407 {
 408         tdb_off off = *d;
 409         return tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
 410 }
 411
 412 /* read/write a record */
 413 static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 414 {
 415         if (tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
 416                 return -1;
 417         if (TDB_BAD_MAGIC(rec)) {
 418                 TDB_LOG((tdb, 0,"rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
 419                 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 420         }
 421         return tdb_oob(tdb, rec->next+sizeof(*rec), 0);
 422 }
 423 static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 424 {
 425         struct list_struct r = *rec;
 426         return tdb_write(tdb, offset, CONVERT(r), sizeof(r));
 427 }
 428
 429 /* read a freelist record and check for simple errors */
 430 static int rec_free_read(TDB_CONTEXT *tdb, tdb_off off, struct list_struct *rec)
 431 {
 432         if (tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
 433                 return -1;
 434
 435         if (rec->magic == TDB_MAGIC) {
 436                 /* this happens when a app is showdown while deleting a record - we should
 437                    not completely fail when this happens */
 438                 TDB_LOG((tdb, 0,"rec_free_read non-free magic at offset=%d - fixing\n",
 439                          rec->magic, off));
 440                 rec->magic = TDB_FREE_MAGIC;
 441                 if (tdb_write(tdb, off, rec, sizeof(*rec)) == -1)
 442                         return -1;
 443         }
 444
 445         if (rec->magic != TDB_FREE_MAGIC) {
 446                 TDB_LOG((tdb, 0,"rec_free_read bad magic 0x%x at offset=%d\n",
 447                            rec->magic, off));
 448                 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 449         }
 450         if (tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
 451                 return -1;
 452         return 0;
 453 }
 454
 455 /* update a record tailer (must hold allocation lock) */
 456 static int update_tailer(TDB_CONTEXT *tdb, tdb_off offset,
 457                          const struct list_struct *rec)
 458 {
 459         tdb_off totalsize;
 460
 461         /* Offset of tailer from record header */
 462         totalsize = sizeof(*rec) + rec->rec_len;
 463         return ofs_write(tdb, offset + totalsize - sizeof(tdb_off),
 464                          &totalsize);
 465 }
 466
 467 static tdb_off tdb_dump_record(TDB_CONTEXT *tdb, tdb_off offset)
 468 {
 469         struct list_struct rec;
 470         tdb_off tailer_ofs, tailer;
 471
 472         if (tdb_read(tdb, offset, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
 473                 printf("ERROR: failed to read record at %u\n", offset);
 474                 return 0;
 475         }
 476
 477         printf(" rec: offset=%u next=%d rec_len=%d key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
 478                offset, rec.next, rec.rec_len, rec.key_len, rec.data_len, rec.full_hash, rec.magic);
 479
 480         tailer_ofs = offset + sizeof(rec) + rec.rec_len - sizeof(tdb_off);
 481         if (ofs_read(tdb, tailer_ofs, &tailer) == -1) {
 482                 printf("ERROR: failed to read tailer at %u\n", tailer_ofs);
 483                 return rec.next;
 484         }
 485
 486         if (tailer != rec.rec_len + sizeof(rec)) {
 487                 printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
 488                                 (unsigned)tailer, (unsigned)(rec.rec_len + sizeof(rec)));
 489         }
 490         return rec.next;
 491 }
 492
 493 static int tdb_dump_chain(TDB_CONTEXT *tdb, int i)
 494 {
 495         tdb_off rec_ptr, top;
 496
 497         top = TDB_HASH_TOP(i);
 498
 499         if (tdb_lock(tdb, i, F_WRLCK) != 0)
 500                 return -1;
 501
 502         if (ofs_read(tdb, top, &rec_ptr) == -1)
 503                 return tdb_unlock(tdb, i, F_WRLCK);
 504
 505         if (rec_ptr)
 506                 printf("hash=%d\n", i);
 507
 508         while (rec_ptr) {
 509                 rec_ptr = tdb_dump_record(tdb, rec_ptr);
 510         }
 511
 512         return tdb_unlock(tdb, i, F_WRLCK);
 513 }
 514
 515 void tdb_dump_all(TDB_CONTEXT *tdb)
 516 {
 517         int i;
 518         for (i=0;i<tdb->header.hash_size;i++) {
 519                 tdb_dump_chain(tdb, i);
 520         }
 521         printf("freelist:\n");
 522         tdb_dump_chain(tdb, -1);
 523 }
 524
 525 int tdb_printfreelist(TDB_CONTEXT *tdb)
 526 {
 527         int ret;
 528         long total_free = 0;
 529         tdb_off offset, rec_ptr;
 530         struct list_struct rec;
 531
 532         if ((ret = tdb_lock(tdb, -1, F_WRLCK)) != 0)
 533                 return ret;
 534
 535         offset = FREELIST_TOP;
 536
 537         /* read in the freelist top */
 538         if (ofs_read(tdb, offset, &rec_ptr) == -1) {
 539                 tdb_unlock(tdb, -1, F_WRLCK);
 540                 return 0;
 541         }
 542
 543         printf("freelist top=[0x%08x]\n", rec_ptr );
 544         while (rec_ptr) {
 545                 if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
 546                         tdb_unlock(tdb, -1, F_WRLCK);
 547                         return -1;
 548                 }
 549
 550                 if (rec.magic != TDB_FREE_MAGIC) {
 551                         printf("bad magic 0x%08x in free list\n", rec.magic);
 552                         tdb_unlock(tdb, -1, F_WRLCK);
 553                         return -1;
 554                 }
 555
 556                 printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)]\n", rec.next, rec.rec_len, rec.rec_len );
 557                 total_free += rec.rec_len;
 558
 559                 /* move to the next record */
 560                 rec_ptr = rec.next;
 561         }
 562         printf("total rec_len = [0x%08x (%d)]\n", (int)total_free,
 563                (int)total_free);
 564
 565         return tdb_unlock(tdb, -1, F_WRLCK);
 566 }
 567
 568 /* Remove an element from the freelist.  Must have alloc lock. */
 569 static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
 570 {
 571         tdb_off last_ptr, i;
 572
 573         /* read in the freelist top */
 574         last_ptr = FREELIST_TOP;
 575         while (ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
 576                 if (i == off) {
 577                         /* We've found it! */
 578                         return ofs_write(tdb, last_ptr, &next);
 579                 }
 580                 /* Follow chain (next offset is at start of record) */
 581                 last_ptr = i;
 582         }
 583         TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
 584         return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 585 }
 586
 587 /* Add an element into the freelist. Merge adjacent records if
 588    neccessary. */
 589 static int tdb_free(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 590 {
 591         tdb_off right, left;
 592
 593         /* Allocation and tailer lock */
 594         if (tdb_lock(tdb, -1, F_WRLCK) != 0)
 595                 return -1;
 596
 597         /* set an initial tailer, so if we fail we don't leave a bogus record */
 598         if (update_tailer(tdb, offset, rec) != 0) {
 599                 TDB_LOG((tdb, 0, "tdb_free: upfate_tailer failed!\n"));
 600                 goto fail;
 601         }
 602
 603         /* Look right first (I'm an Australian, dammit) */
 604         right = offset + sizeof(*rec) + rec->rec_len;
 605         if (right + sizeof(*rec) <= tdb->map_size) {
 606                 struct list_struct r;
 607
 608                 if (tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
 609                         TDB_LOG((tdb, 0, "tdb_free: right read failed at %u\n", right));
 610                         goto left;
 611                 }
 612
 613                 /* If it's free, expand to include it. */
 614                 if (r.magic == TDB_FREE_MAGIC) {
 615                         if (remove_from_freelist(tdb, right, r.next) == -1) {
 616                                 TDB_LOG((tdb, 0, "tdb_free: right free failed at %u\n", right));
 617                                 goto left;
 618                         }
 619                         rec->rec_len += sizeof(r) + r.rec_len;
 620                 }
 621         }
 622
 623 left:
 624         /* Look left */
 625         left = offset - sizeof(tdb_off);
 626         if (left > TDB_HASH_TOP(tdb->header.hash_size-1)) {
 627                 struct list_struct l;
 628                 tdb_off leftsize;
 629
 630                 /* Read in tailer and jump back to header */
 631                 if (ofs_read(tdb, left, &leftsize) == -1) {
 632                         TDB_LOG((tdb, 0, "tdb_free: left offset read failed at %u\n", left));
 633                         goto update;
 634                 }
 635                 left = offset - leftsize;
 636
 637                 /* Now read in record */
 638                 if (tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
 639                         TDB_LOG((tdb, 0, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
 640                         goto update;
 641                 }
 642
 643                 /* If it's free, expand to include it. */
 644                 if (l.magic == TDB_FREE_MAGIC) {
 645                         if (remove_from_freelist(tdb, left, l.next) == -1) {
 646                                 TDB_LOG((tdb, 0, "tdb_free: left free failed at %u\n", left));
 647                                 goto update;
 648                         } else {
 649                                 offset = left;
 650                                 rec->rec_len += leftsize;
 651                         }
 652                 }
 653         }
 654
 655 update:
 656         if (update_tailer(tdb, offset, rec) == -1) {
 657                 TDB_LOG((tdb, 0, "tdb_free: update_tailer failed at %u\n", offset));
 658                 goto fail;
 659         }
 660
 661         /* Now, prepend to free list */
 662         rec->magic = TDB_FREE_MAGIC;
 663
 664         if (ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
 665             rec_write(tdb, offset, rec) == -1 ||
 666             ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
 667                 TDB_LOG((tdb, 0, "tdb_free record write failed at offset=%d\n", offset));
 668                 goto fail;
 669         }
 670
 671         /* And we're done. */
 672         tdb_unlock(tdb, -1, F_WRLCK);
 673         return 0;
 674
 675  fail:
 676         tdb_unlock(tdb, -1, F_WRLCK);
 677         return -1;
 678 }
 679
 680
 681 /* expand a file.  we prefer to use ftruncate, as that is what posix
 682   says to use for mmap expansion */
 683 static int expand_file(TDB_CONTEXT *tdb, tdb_off size, tdb_off addition)
 684 {
 685         char buf[1024];
 686 #if HAVE_FTRUNCATE_EXTEND
 687         if (ftruncate(tdb->fd, size+addition) != 0) {
 688                 TDB_LOG((tdb, 0, "expand_file ftruncate to %d failed (%s)\n",
 689                            size+addition, strerror(errno)));
 690                 return -1;
 691         }
 692 #else
 693         char b = 0;
 694
 695 #ifdef HAVE_PWRITE
 696         if (pwrite(tdb->fd,  &b, 1, (size+addition) - 1) != 1) {
 697 #else
 698         if (lseek(tdb->fd, (size+addition) - 1, SEEK_SET) != (size+addition) - 1 ||
 699             write(tdb->fd, &b, 1) != 1) {
 700 #endif
 701                 TDB_LOG((tdb, 0, "expand_file to %d failed (%s)\n",
 702                            size+addition, strerror(errno)));
 703                 return -1;
 704         }
 705 #endif
 706
 707         /* now fill the file with something. This ensures that the file isn't sparse, which would be
 708            very bad if we ran out of disk. This must be done with write, not via mmap */
 709         memset(buf, 0x42, sizeof(buf));
 710         while (addition) {
 711                 int n = addition>sizeof(buf)?sizeof(buf):addition;
 712 #ifdef HAVE_PWRITE
 713                 int ret = pwrite(tdb->fd, buf, n, size);
 714 #else
 715                 int ret;
 716                 if (lseek(tdb->fd, size, SEEK_SET) != size)
 717                         return -1;
 718                 ret = write(tdb->fd, buf, n);
 719 #endif
 720                 if (ret != n) {
 721                         TDB_LOG((tdb, 0, "expand_file write of %d failed (%s)\n",
 722                                    n, strerror(errno)));
 723                         return -1;
 724                 }
 725                 addition -= n;
 726                 size += n;
 727         }
 728         return 0;
 729 }
 730
 731
 732 /* expand the database at least size bytes by expanding the underlying
 733    file and doing the mmap again if necessary */
 734 static int tdb_expand(TDB_CONTEXT *tdb, tdb_off size)
 735 {
 736         struct list_struct rec;
 737         tdb_off offset;
 738
 739         if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
 740                 TDB_LOG((tdb, 0, "lock failed in tdb_expand\n"));
 741                 return -1;
 742         }
 743
 744         /* must know about any previous expansions by another process */
 745         tdb_oob(tdb, tdb->map_size + 1, 1);
 746
 747         /* always make room for at least 10 more records, and round
 748            the database up to a multiple of TDB_PAGE_SIZE */
 749         size = TDB_ALIGN(tdb->map_size + size*10, TDB_PAGE_SIZE) - tdb->map_size;
 750
 751         if (!(tdb->flags & TDB_INTERNAL))
 752                 tdb_munmap(tdb);
 753
 754         /*
 755          * We must ensure the file is unmapped before doing this
 756          * to ensure consistency with systems like OpenBSD where
 757          * writes and mmaps are not consistent.
 758          */
 759
 760         /* expand the file itself */
 761         if (!(tdb->flags & TDB_INTERNAL)) {
 762                 if (expand_file(tdb, tdb->map_size, size) != 0)
 763                         goto fail;
 764         }
 765
 766         tdb->map_size += size;
 767
 768         if (tdb->flags & TDB_INTERNAL)
 769                 tdb->map_ptr = realloc(tdb->map_ptr, tdb->map_size);
 770         else {
 771                 /*
 772                  * We must ensure the file is remapped before adding the space
 773                  * to ensure consistency with systems like OpenBSD where
 774                  * writes and mmaps are not consistent.
 775                  */
 776
 777                 /* We're ok if the mmap fails as we'll fallback to read/write */
 778                 tdb_mmap(tdb);
 779         }
 780
 781         /* form a new freelist record */
 782         memset(&rec,'\0',sizeof(rec));
 783         rec.rec_len = size - sizeof(rec);
 784
 785         /* link it into the free list */
 786         offset = tdb->map_size - size;
 787         if (tdb_free(tdb, offset, &rec) == -1)
 788                 goto fail;
 789
 790         tdb_unlock(tdb, -1, F_WRLCK);
 791         return 0;
 792  fail:
 793         tdb_unlock(tdb, -1, F_WRLCK);
 794         return -1;
 795 }
 796
/* allocate some space from the free list. The offset returned points
   to an unconnected list_struct within the database with room for at
   least length bytes of total data

   0 is returned if the space could not be allocated
 */
 803 static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length,
 804                             struct list_struct *rec)
 805 {
 806         tdb_off rec_ptr, last_ptr, newrec_ptr;
 807         struct list_struct newrec;
 808
 809         if (tdb_lock(tdb, -1, F_WRLCK) == -1)
 810                 return 0;
 811
 812         /* Extra bytes required for tailer */
 813         length += sizeof(tdb_off);
 814
 815  again:
 816         last_ptr = FREELIST_TOP;
 817
 818         /* read in the freelist top */
 819         if (ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
 820                 goto fail;
 821
 822         /* keep looking until we find a freelist record big enough */
 823         while (rec_ptr) {
 824                 if (rec_free_read(tdb, rec_ptr, rec) == -1)
 825                         goto fail;
 826
 827                 if (rec->rec_len >= length) {
 828                         /* found it - now possibly split it up  */
 829                         if (rec->rec_len > length + MIN_REC_SIZE) {
 830                                 /* Length of left piece */
 831                                 length = TDB_ALIGN(length, TDB_ALIGNMENT);
 832
 833                                 /* Right piece to go on free list */
 834                                 newrec.rec_len = rec->rec_len
 835                                         - (sizeof(*rec) + length);
 836                                 newrec_ptr = rec_ptr + sizeof(*rec) + length;
 837
 838                                 /* And left record is shortened */
 839                                 rec->rec_len = length;
 840                         } else
 841                                 newrec_ptr = 0;
 842
 843                         /* Remove allocated record from the free list */
 844                         if (ofs_write(tdb, last_ptr, &rec->next) == -1)
 845                                 goto fail;
 846
 847                         /* Update header: do this before we drop alloc
 848                            lock, otherwise tdb_free() might try to
 849                            merge with us, thinking we're free.
 850                            (Thanks Jeremy Allison). */
 851                         rec->magic = TDB_MAGIC;
 852                         if (rec_write(tdb, rec_ptr, rec) == -1)
 853                                 goto fail;
 854
 855                         /* Did we create new block? */
 856                         if (newrec_ptr) {
 857                                 /* Update allocated record tailer (we
 858                                    shortened it). */
 859                                 if (update_tailer(tdb, rec_ptr, rec) == -1)
 860                                         goto fail;
 861
 862                                 /* Free new record */
 863                                 if (tdb_free(tdb, newrec_ptr, &newrec) == -1)
 864                                         goto fail;
 865                         }
 866
 867                         /* all done - return the new record offset */
 868                         tdb_unlock(tdb, -1, F_WRLCK);
 869                         return rec_ptr;
 870                 }
 871                 /* move to the next record */
 872                 last_ptr = rec_ptr;
 873                 rec_ptr = rec->next;
 874         }
 875         /* we didn't find enough space. See if we can expand the
 876            database and if we can then try again */
 877         if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
 878                 goto again;
 879  fail:
 880         tdb_unlock(tdb, -1, F_WRLCK);
 881         return 0;
 882 }
 883
 884 /* initialise a new database with a specified hash size */
 885 static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
 886 {
 887         struct tdb_header *newdb;
 888         int size, ret = -1;
 889
 890         /* We make it up in memory, then write it out if not internal */
 891         size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off);
 892         if (!(newdb = calloc(size, 1)))
 893                 return TDB_ERRCODE(TDB_ERR_OOM, -1);
 894
 895         /* Fill in the header */
 896         newdb->version = TDB_VERSION;
 897         newdb->hash_size = hash_size;
 898 #ifdef USE_SPINLOCKS
 899         newdb->rwlocks = size;
 900 #endif
 901         if (tdb->flags & TDB_INTERNAL) {
 902                 tdb->map_size = size;
 903                 tdb->map_ptr = (char *)newdb;
 904                 memcpy(&tdb->header, newdb, sizeof(tdb->header));
 905                 /* Convert the `ondisk' version if asked. */
 906                 CONVERT(*newdb);
 907                 return 0;
 908         }
 909         if (lseek(tdb->fd, 0, SEEK_SET) == -1)
 910                 goto fail;
 911
 912         if (ftruncate(tdb->fd, 0) == -1)
 913                 goto fail;
 914
 915         /* This creates an endian-converted header, as if read from disk */
 916         CONVERT(*newdb);
 917         memcpy(&tdb->header, newdb, sizeof(tdb->header));
 918         /* Don't endian-convert the magic food! */
 919         memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
 920         if (write(tdb->fd, newdb, size) != size)
 921                 ret = -1;
 922         else
 923                 ret = tdb_create_rwlocks(tdb->fd, hash_size);
 924
 925   fail:
 926         SAFE_FREE(newdb);
 927         return ret;
 928 }
 929
 930 /* Returns 0 on fail.  On success, return offset of record, and fills
 931    in rec */
 932 static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash,
 933                         struct list_struct *r)
 934 {
 935         tdb_off rec_ptr;
 936
 937         /* read in the hash top */
 938         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 939                 return 0;
 940
 941         /* keep looking until we find the right record */
 942         while (rec_ptr) {
 943                 if (rec_read(tdb, rec_ptr, r) == -1)
 944                         return 0;
 945
 946                 if (!TDB_DEAD(r) && hash==r->full_hash && key.dsize==r->key_len) {
 947                         char *k;
 948                         /* a very likely hit - read the key */
 949                         k = tdb_alloc_read(tdb, rec_ptr + sizeof(*r),
 950                                            r->key_len);
 951                         if (!k)
 952                                 return 0;
 953
 954                         if (memcmp(key.dptr, k, key.dsize) == 0) {
 955                                 SAFE_FREE(k);
 956                                 return rec_ptr;
 957                         }
 958                         SAFE_FREE(k);
 959                 }
 960                 rec_ptr = r->next;
 961         }
 962         return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
 963 }
 964
 965 /* If they do lockkeys, check that this hash is one they locked */
 966 static int tdb_keylocked(TDB_CONTEXT *tdb, u32 hash)
 967 {
 968         u32 i;
 969         if (!tdb->lockedkeys)
 970                 return 1;
 971         for (i = 0; i < tdb->lockedkeys[0]; i++)
 972                 if (tdb->lockedkeys[i+1] == hash)
 973                         return 1;
 974         return TDB_ERRCODE(TDB_ERR_NOLOCK, 0);
 975 }
 976
 977 /* As tdb_find, but if you succeed, keep the lock */
 978 static tdb_off tdb_find_lock(TDB_CONTEXT *tdb, TDB_DATA key, int locktype,
 979                              struct list_struct *rec)
 980 {
 981         u32 hash, rec_ptr;
 982
 983         hash = tdb_hash(&key);
 984         if (!tdb_keylocked(tdb, hash))
 985                 return 0;
 986         if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
 987                 return 0;
 988         if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
 989                 tdb_unlock(tdb, BUCKET(hash), locktype);
 990         return rec_ptr;
 991 }
 992
 993 enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb)
 994 {
 995         return tdb->ecode;
 996 }
 997
 998 static struct tdb_errname {
 999         enum TDB_ERROR ecode; const char *estring;
1000 } emap[] = { {TDB_SUCCESS, "Success"},
1001              {TDB_ERR_CORRUPT, "Corrupt database"},
1002              {TDB_ERR_IO, "IO Error"},
1003              {TDB_ERR_LOCK, "Locking error"},
1004              {TDB_ERR_OOM, "Out of memory"},
1005              {TDB_ERR_EXISTS, "Record exists"},
1006              {TDB_ERR_NOLOCK, "Lock exists on other keys"},
1007              {TDB_ERR_NOEXIST, "Record does not exist"} };
1008
1009 /* Error string for the last tdb error */
1010 const char *tdb_errorstr(TDB_CONTEXT *tdb)
1011 {
1012         u32 i;
1013         for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
1014                 if (tdb->ecode == emap[i].ecode)
1015                         return emap[i].estring;
1016         return "Invalid error code";
1017 }
1018
1019 /* update an entry in place - this only works if the new data size
1020    is <= the old data size and the key exists.
1021    on failure return -1.
1022 */
1023
1024 static int tdb_update(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf)
1025 {
1026         struct list_struct rec;
1027         tdb_off rec_ptr;
1028
1029         /* find entry */
1030         if (!(rec_ptr = tdb_find(tdb, key, tdb_hash(&key), &rec)))
1031                 return -1;
1032
1033         /* must be long enough key, data and tailer */
1034         if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off)) {
1035                 tdb->ecode = TDB_SUCCESS; /* Not really an error */
1036                 return -1;
1037         }
1038
1039         if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1040                       dbuf.dptr, dbuf.dsize) == -1)
1041                 return -1;
1042
1043         if (dbuf.dsize != rec.data_len) {
1044                 /* update size */
1045                 rec.data_len = dbuf.dsize;
1046                 return rec_write(tdb, rec_ptr, &rec);
1047         }
1048
1049         return 0;
1050 }
1051
1052 /* find an entry in the database given a key */
1053 /* If an entry doesn't exist tdb_err will be set to
1054  * TDB_ERR_NOEXIST. If a key has no data attached
1055  * tdb_err will not be set. Both will return a
1056  * zero pptr and zero dsize.
1057  */
1058
1059 TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
1060 {
1061         tdb_off rec_ptr;
1062         struct list_struct rec;
1063         TDB_DATA ret;
1064
1065         /* find which hash bucket it is in */
1066         if (!(rec_ptr = tdb_find_lock(tdb,key,F_RDLCK,&rec)))
1067                 return tdb_null;
1068
1069         if (rec.data_len)
1070                 ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1071                                           rec.data_len);
1072         else
1073                 ret.dptr = NULL;
1074         ret.dsize = rec.data_len;
1075         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1076         return ret;
1077 }
1078
1079 /* check if an entry in the database exists
1080
1081    note that 1 is returned if the key is found and 0 is returned if not found
1082    this doesn't match the conventions in the rest of this module, but is
1083    compatible with gdbm
1084 */
1085 int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key)
1086 {
1087         struct list_struct rec;
1088
1089         if (tdb_find_lock(tdb, key, F_RDLCK, &rec) == 0)
1090                 return 0;
1091         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1092         return 1;
1093 }
1094
1095 /* record lock stops delete underneath */
1096 static int lock_record(TDB_CONTEXT *tdb, tdb_off off)
1097 {
1098         return off ? tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0) : 0;
1099 }
1100 /*
1101   Write locks override our own fcntl readlocks, so check it here.
1102   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1103   an error to fail to get the lock here.
1104 */
1105
1106 static int write_lock_record(TDB_CONTEXT *tdb, tdb_off off)
1107 {
1108         struct tdb_traverse_lock *i;
1109         for (i = &tdb->travlocks; i; i = i->next)
1110                 if (i->off == off)
1111                         return -1;
1112         return tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1);
1113 }
1114
1115 /*
1116   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1117   an error to fail to get the lock here.
1118 */
1119
1120 static int write_unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1121 {
1122         return tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0);
1123 }
1124 /* fcntl locks don't stack: avoid unlocking someone else's */
1125 static int unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1126 {
1127         struct tdb_traverse_lock *i;
1128         u32 count = 0;
1129
1130         if (off == 0)
1131                 return 0;
1132         for (i = &tdb->travlocks; i; i = i->next)
1133                 if (i->off == off)
1134                         count++;
1135         return (count == 1 ? tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0) : 0);
1136 }
1137
1138 /* actually delete an entry in the database given the offset */
1139 static int do_delete(TDB_CONTEXT *tdb, tdb_off rec_ptr, struct list_struct*rec)
1140 {
1141         tdb_off last_ptr, i;
1142         struct list_struct lastrec;
1143
1144         if (tdb->read_only) return -1;
1145
1146         if (write_lock_record(tdb, rec_ptr) == -1) {
1147                 /* Someone traversing here: mark it as dead */
1148                 rec->magic = TDB_DEAD_MAGIC;
1149                 return rec_write(tdb, rec_ptr, rec);
1150         }
1151         if (write_unlock_record(tdb, rec_ptr) != 0)
1152                 return -1;
1153
1154         /* find previous record in hash chain */
1155         if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
1156                 return -1;
1157         for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
1158                 if (rec_read(tdb, i, &lastrec) == -1)
1159                         return -1;
1160
1161         /* unlink it: next ptr is at start of record. */
1162         if (last_ptr == 0)
1163                 last_ptr = TDB_HASH_TOP(rec->full_hash);
1164         if (ofs_write(tdb, last_ptr, &rec->next) == -1)
1165                 return -1;
1166
1167         /* recover the space */
1168         if (tdb_free(tdb, rec_ptr, rec) == -1)
1169                 return -1;
1170         return 0;
1171 }
1172
1173 /* Uses traverse lock: 0 = finish, -1 = error, other = record offset */
1174 static int tdb_next_lock(TDB_CONTEXT *tdb, struct tdb_traverse_lock *tlock,
1175                          struct list_struct *rec)
1176 {
1177         int want_next = (tlock->off != 0);
1178
1179         /* No traversal allows if you've called tdb_lockkeys() */
1180         if (tdb->lockedkeys)
1181                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1182
1183         /* Lock each chain from the start one. */
1184         for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
1185                 if (tdb_lock(tdb, tlock->hash, F_WRLCK) == -1)
1186                         return -1;
1187
1188                 /* No previous record?  Start at top of chain. */
1189                 if (!tlock->off) {
1190                         if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
1191                                      &tlock->off) == -1)
1192                                 goto fail;
1193                 } else {
1194                         /* Otherwise unlock the previous record. */
1195                         if (unlock_record(tdb, tlock->off) != 0)
1196                                 goto fail;
1197                 }
1198
1199                 if (want_next) {
1200                         /* We have offset of old record: grab next */
1201                         if (rec_read(tdb, tlock->off, rec) == -1)
1202                                 goto fail;
1203                         tlock->off = rec->next;
1204                 }
1205
1206                 /* Iterate through chain */
1207                 while( tlock->off) {
1208                         tdb_off current;
1209                         if (rec_read(tdb, tlock->off, rec) == -1)
1210                                 goto fail;
1211                         if (!TDB_DEAD(rec)) {
1212                                 /* Woohoo: we found one! */
1213                                 if (lock_record(tdb, tlock->off) != 0)
1214                                         goto fail;
1215                                 return tlock->off;
1216                         }
1217                         /* Try to clean dead ones from old traverses */
1218                         current = tlock->off;
1219                         tlock->off = rec->next;
1220                         if (do_delete(tdb, current, rec) != 0)
1221                                 goto fail;
1222                 }
1223                 tdb_unlock(tdb, tlock->hash, F_WRLCK);
1224                 want_next = 0;
1225         }
1226         /* We finished iteration without finding anything */
1227         return TDB_ERRCODE(TDB_SUCCESS, 0);
1228
1229  fail:
1230         tlock->off = 0;
1231         if (tdb_unlock(tdb, tlock->hash, F_WRLCK) != 0)
1232                 TDB_LOG((tdb, 0, "tdb_next_lock: On error unlock failed!\n"));
1233         return -1;
1234 }
1235
1236 /* traverse the entire database - calling fn(tdb, key, data) on each element.
1237    return -1 on error or the record count traversed
1238    if fn is NULL then it is not called
1239    a non-zero return value from fn() indicates that the traversal should stop
1240   */
1241 int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *state)
1242 {
1243         TDB_DATA key, dbuf;
1244         struct list_struct rec;
1245         struct tdb_traverse_lock tl = { NULL, 0, 0 };
1246         int ret, count = 0;
1247
1248         /* This was in the initializaton, above, but the IRIX compiler
1249          * did not like it.  crh
1250          */
1251         tl.next = tdb->travlocks.next;
1252
1253         /* fcntl locks don't stack: beware traverse inside traverse */
1254         tdb->travlocks.next = &tl;
1255
1256         /* tdb_next_lock places locks on the record returned, and its chain */
1257         while ((ret = tdb_next_lock(tdb, &tl, &rec)) > 0) {
1258                 count++;
1259                 /* now read the full record */
1260                 key.dptr = tdb_alloc_read(tdb, tl.off + sizeof(rec),
1261                                           rec.key_len + rec.data_len);
1262                 if (!key.dptr) {
1263                         ret = -1;
1264                         if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0)
1265                                 goto out;
1266                         if (unlock_record(tdb, tl.off) != 0)
1267                                 TDB_LOG((tdb, 0, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
1268                         goto out;
1269                 }
1270                 key.dsize = rec.key_len;
1271                 dbuf.dptr = key.dptr + rec.key_len;
1272                 dbuf.dsize = rec.data_len;
1273
1274                 /* Drop chain lock, call out */
1275                 if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0) {
1276                         ret = -1;
1277                         goto out;
1278                 }
1279                 if (fn && fn(tdb, key, dbuf, state)) {
1280                         /* They want us to terminate traversal */
1281                         ret = count;
1282                         if (unlock_record(tdb, tl.off) != 0) {
1283                                 TDB_LOG((tdb, 0, "tdb_traverse: unlock_record failed!\n"));;
1284                                 ret = -1;
1285                         }
1286                         tdb->travlocks.next = tl.next;
1287                         SAFE_FREE(key.dptr);
1288                         return count;
1289                 }
1290                 SAFE_FREE(key.dptr);
1291         }
1292 out:
1293         tdb->travlocks.next = tl.next;
1294         if (ret < 0)
1295                 return -1;
1296         else
1297                 return count;
1298 }
1299
1300 /* find the first entry in the database and return its key */
1301 TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
1302 {
1303         TDB_DATA key;
1304         struct list_struct rec;
1305
1306         /* release any old lock */
1307         if (unlock_record(tdb, tdb->travlocks.off) != 0)
1308                 return tdb_null;
1309         tdb->travlocks.off = tdb->travlocks.hash = 0;
1310
1311         if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
1312                 return tdb_null;
1313         /* now read the key */
1314         key.dsize = rec.key_len;
1315         key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
1316         if (tdb_unlock(tdb, BUCKET(tdb->travlocks.hash), F_WRLCK) != 0)
1317                 TDB_LOG((tdb, 0, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
1318         return key;
1319 }
1320
1321 /* find the next entry in the database, returning its key */
1322 TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA oldkey)
1323 {
1324         u32 oldhash;
1325         TDB_DATA key = tdb_null;
1326         struct list_struct rec;
1327         char *k = NULL;
1328
1329         /* Is locked key the old key?  If so, traverse will be reliable. */
1330         if (tdb->travlocks.off) {
1331                 if (tdb_lock(tdb,tdb->travlocks.hash,F_WRLCK))
1332                         return tdb_null;
1333                 if (rec_read(tdb, tdb->travlocks.off, &rec) == -1
1334                     || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
1335                                             rec.key_len))
1336                     || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
1337                         /* No, it wasn't: unlock it and start from scratch */
1338                         if (unlock_record(tdb, tdb->travlocks.off) != 0)
1339                                 return tdb_null;
1340                         if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1341                                 return tdb_null;
1342                         tdb->travlocks.off = 0;
1343                 }
1344
1345                 SAFE_FREE(k);
1346         }
1347
1348         if (!tdb->travlocks.off) {
1349                 /* No previous element: do normal find, and lock record */
1350                 tdb->travlocks.off = tdb_find_lock(tdb, oldkey, F_WRLCK, &rec);
1351                 if (!tdb->travlocks.off)
1352                         return tdb_null;
1353                 tdb->travlocks.hash = BUCKET(rec.full_hash);
1354                 if (lock_record(tdb, tdb->travlocks.off) != 0) {
1355                         TDB_LOG((tdb, 0, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
1356                         return tdb_null;
1357                 }
1358         }
1359         oldhash = tdb->travlocks.hash;
1360
1361         /* Grab next record: locks chain and returned record,
1362            unlocks old record */
1363         if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
1364                 key.dsize = rec.key_len;
1365                 key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
1366                                           key.dsize);
1367                 /* Unlock the chain of this new record */
1368                 if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1369                         TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1370         }
1371         /* Unlock the chain of old record */
1372         if (tdb_unlock(tdb, BUCKET(oldhash), F_WRLCK) != 0)
1373                 TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1374         return key;
1375 }
1376
1377 /* delete an entry in the database given a key */
1378 int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
1379 {
1380         tdb_off rec_ptr;
1381         struct list_struct rec;
1382         int ret;
1383
1384         if (!(rec_ptr = tdb_find_lock(tdb, key, F_WRLCK, &rec)))
1385                 return -1;
1386         ret = do_delete(tdb, rec_ptr, &rec);
1387         if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
1388                 TDB_LOG((tdb, 0, "tdb_delete: WARNING tdb_unlock failed!\n"));
1389         return ret;
1390 }
1391
1392 /* store an element in the database, replacing any existing element
1393    with the same key
1394
1395    return 0 on success, -1 on failure
1396 */
1397 int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
1398 {
1399         struct list_struct rec;
1400         u32 hash;
1401         tdb_off rec_ptr;
1402         char *p = NULL;
1403         int ret = 0;
1404
1405         /* find which hash bucket it is in */
1406         hash = tdb_hash(&key);
1407         if (!tdb_keylocked(tdb, hash))
1408                 return -1;
1409         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1410                 return -1;
1411
1412         /* check for it existing, on insert. */
1413         if (flag == TDB_INSERT) {
1414                 if (tdb_exists(tdb, key)) {
1415                         tdb->ecode = TDB_ERR_EXISTS;
1416                         goto fail;
1417                 }
1418         } else {
1419                 /* first try in-place update, on modify or replace. */
1420                 if (tdb_update(tdb, key, dbuf) == 0)
1421                         goto out;
1422                 if (flag == TDB_MODIFY && tdb->ecode == TDB_ERR_NOEXIST)
1423                         goto fail;
1424         }
1425         /* reset the error code potentially set by the tdb_update() */
1426         tdb->ecode = TDB_SUCCESS;
1427
1428         /* delete any existing record - if it doesn't exist we don't
1429            care.  Doing this first reduces fragmentation, and avoids
1430            coalescing with `allocated' block before it's updated. */
1431         if (flag != TDB_INSERT)
1432                 tdb_delete(tdb, key);
1433
1434         /* Copy key+value *before* allocating free space in case malloc
1435            fails and we are left with a dead spot in the tdb. */
1436
1437         if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
1438                 tdb->ecode = TDB_ERR_OOM;
1439                 goto fail;
1440         }
1441
1442         memcpy(p, key.dptr, key.dsize);
1443         if (dbuf.dsize)
1444                 memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
1445
1446         /* now we're into insert / modify / replace of a record which
1447          * we know could not be optimised by an in-place store (for
1448          * various reasons).  */
1449         if (!(rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec)))
1450                 goto fail;
1451
1452         /* Read hash top into next ptr */
1453         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1454                 goto fail;
1455
1456         rec.key_len = key.dsize;
1457         rec.data_len = dbuf.dsize;
1458         rec.full_hash = hash;
1459         rec.magic = TDB_MAGIC;
1460
1461         /* write out and point the top of the hash chain at it */
1462         if (rec_write(tdb, rec_ptr, &rec) == -1
1463             || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
1464             || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1465                 /* Need to tdb_unallocate() here */
1466                 goto fail;
1467         }
1468  out:
1469         SAFE_FREE(p);
1470         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1471         return ret;
1472 fail:
1473         ret = -1;
1474         goto out;
1475 }
1476
1477 /* Attempt to append data to an entry in place - this only works if the new data size
1478    is <= the old data size and the key exists.
1479    on failure return -1. Record must be locked before calling.
1480 */
1481 static int tdb_append_inplace(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf)
1482 {
1483         struct list_struct rec;
1484         tdb_off rec_ptr;
1485
1486         /* find entry */
1487         if (!(rec_ptr = tdb_find(tdb, key, tdb_hash(&key), &rec)))
1488                 return -1;
1489
1490         /* Append of 0 is always ok. */
1491         if (new_dbuf.dsize == 0)
1492                 return 0;
1493
1494         /* must be long enough for key, old data + new data and tailer */
1495         if (rec.rec_len < key.dsize + rec.data_len + new_dbuf.dsize + sizeof(tdb_off)) {
1496                 /* No room. */
1497                 tdb->ecode = TDB_SUCCESS; /* Not really an error */
1498                 return -1;
1499         }
1500
1501         if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len + rec.data_len,
1502                       new_dbuf.dptr, new_dbuf.dsize) == -1)
1503                 return -1;
1504
1505         /* update size */
1506         rec.data_len += new_dbuf.dsize;
1507         return rec_write(tdb, rec_ptr, &rec);
1508 }
1509
1510 /* Append to an entry. Create if not exist. */
1511
1512 int tdb_append(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf)
1513 {
1514         struct list_struct rec;
1515         u32 hash;
1516         tdb_off rec_ptr;
1517         char *p = NULL;
1518         int ret = 0;
1519         size_t new_data_size = 0;
1520
1521         /* find which hash bucket it is in */
1522         hash = tdb_hash(&key);
1523         if (!tdb_keylocked(tdb, hash))
1524                 return -1;
1525         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1526                 return -1;
1527
1528         /* first try in-place. */
1529         if (tdb_append_inplace(tdb, key, new_dbuf) == 0)
1530                 goto out;
1531
1532         /* reset the error code potentially set by the tdb_append_inplace() */
1533         tdb->ecode = TDB_SUCCESS;
1534
1535         /* find entry */
1536         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
1537                 if (tdb->ecode != TDB_ERR_NOEXIST)
1538                         goto fail;
1539
1540                 /* Not found - create. */
1541
1542                 ret = tdb_store(tdb, key, new_dbuf, TDB_INSERT);
1543                 goto out;
1544         }
1545
1546         new_data_size = rec.data_len + new_dbuf.dsize;
1547
1548         /* Copy key+old_value+value *before* allocating free space in case malloc
1549            fails and we are left with a dead spot in the tdb. */
1550
1551         if (!(p = (char *)malloc(key.dsize + new_data_size))) {
1552                 tdb->ecode = TDB_ERR_OOM;
1553                 goto fail;
1554         }
1555
1556         /* Copy the key in place. */
1557         memcpy(p, key.dptr, key.dsize);
1558
1559         /* Now read the old data into place. */
1560         if (rec.data_len &&
1561                 tdb_read(tdb, rec_ptr + sizeof(rec) + rec.key_len, p + key.dsize, rec.data_len, 0) == -1)
1562                         goto fail;
1563
1564         /* Finally append the new data. */
1565         if (new_dbuf.dsize)
1566                 memcpy(p+key.dsize+rec.data_len, new_dbuf.dptr, new_dbuf.dsize);
1567
1568         /* delete any existing record - if it doesn't exist we don't
1569            care.  Doing this first reduces fragmentation, and avoids
1570            coalescing with `allocated' block before it's updated. */
1571
1572         tdb_delete(tdb, key);
1573
1574         if (!(rec_ptr = tdb_allocate(tdb, key.dsize + new_data_size, &rec)))
1575                 goto fail;
1576
1577         /* Read hash top into next ptr */
1578         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1579                 goto fail;
1580
1581         rec.key_len = key.dsize;
1582         rec.data_len = new_data_size;
1583         rec.full_hash = hash;
1584         rec.magic = TDB_MAGIC;
1585
1586         /* write out and point the top of the hash chain at it */
1587         if (rec_write(tdb, rec_ptr, &rec) == -1
1588             || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+new_data_size)==-1
1589             || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1590                 /* Need to tdb_unallocate() here */
1591                 goto fail;
1592         }
1593
1594  out:
1595         SAFE_FREE(p);
1596         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1597         return ret;
1598
1599 fail:
1600         ret = -1;
1601         goto out;
1602 }
1603
1604 static int tdb_already_open(dev_t device,
1605                             ino_t ino)
1606 {
1607         TDB_CONTEXT *i;
1608
1609         for (i = tdbs; i; i = i->next) {
1610                 if (i->device == device && i->inode == ino) {
1611                         return 1;
1612                 }
1613         }
1614
1615         return 0;
1616 }
1617
1618 /* open the database, creating it if necessary
1619
1620    The open_flags and mode are passed straight to the open call on the
1621    database file. A flags value of O_WRONLY is invalid. The hash size
1622    is advisory, use zero for a default value.
1623
1624    Return is NULL on error, in which case errno is also set.  Don't
1625    try to call tdb_error or tdb_errname, just do strerror(errno).
1626
1627    @param name may be NULL for internal databases. */
1628 TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
1629                       int open_flags, mode_t mode)
1630 {
1631         return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL);
1632 }
1633
1634
1635 TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
1636                          int open_flags, mode_t mode,
1637                          tdb_log_func log_fn)
1638 {
1639         TDB_CONTEXT *tdb;
1640         struct stat st;
1641         int rev = 0, locked;
1642         unsigned char *vp;
1643         u32 vertest;
1644
1645         if (!(tdb = calloc(1, sizeof *tdb))) {
1646                 /* Can't log this */
1647                 errno = ENOMEM;
1648                 goto fail;
1649         }
1650         tdb->fd = -1;
1651         tdb->name = NULL;
1652         tdb->map_ptr = NULL;
1653         tdb->lockedkeys = NULL;
1654         tdb->flags = tdb_flags;
1655         tdb->open_flags = open_flags;
1656         tdb->log_fn = log_fn;
1657
1658         if ((open_flags & O_ACCMODE) == O_WRONLY) {
1659                 TDB_LOG((tdb, 0, "tdb_open_ex: can't open tdb %s write-only\n",
1660                          name));
1661                 errno = EINVAL;
1662                 goto fail;
1663         }
1664
1665         if (hash_size == 0)
1666                 hash_size = DEFAULT_HASH_SIZE;
1667         if ((open_flags & O_ACCMODE) == O_RDONLY) {
1668                 tdb->read_only = 1;
1669                 /* read only databases don't do locking or clear if first */
1670                 tdb->flags |= TDB_NOLOCK;
1671                 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1672         }
1673
1674         /* internal databases don't mmap or lock, and start off cleared */
1675         if (tdb->flags & TDB_INTERNAL) {
1676                 tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
1677                 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1678                 if (tdb_new_database(tdb, hash_size) != 0) {
1679                         TDB_LOG((tdb, 0, "tdb_open_ex: tdb_new_database failed!"));
1680                         goto fail;
1681                 }
1682                 goto internal;
1683         }
1684
1685         if ((tdb->fd = open(name, open_flags, mode)) == -1) {
1686                 TDB_LOG((tdb, 5, "tdb_open_ex: could not open file %s: %s\n",
1687                          name, strerror(errno)));
1688                 goto fail;      /* errno set by open(2) */
1689         }
1690
1691         /* ensure there is only one process initialising at once */
1692         if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0) == -1) {
1693                 TDB_LOG((tdb, 0, "tdb_open_ex: failed to get global lock on %s: %s\n",
1694                          name, strerror(errno)));
1695                 goto fail;      /* errno set by tdb_brlock */
1696         }
1697
1698         /* we need to zero database if we are the only one with it open */
1699         if ((locked = (tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0) == 0))
1700             && (tdb_flags & TDB_CLEAR_IF_FIRST)) {
1701                 open_flags |= O_CREAT;
1702                 if (ftruncate(tdb->fd, 0) == -1) {
1703                         TDB_LOG((tdb, 0, "tdb_open_ex: "
1704                                  "failed to truncate %s: %s\n",
1705                                  name, strerror(errno)));
1706                         goto fail; /* errno set by ftruncate */
1707                 }
1708         }
1709
1710         if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
1711             || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
1712             || tdb->header.version != TDB_VERSION
1713             || (tdb->header.hash_size != hash_size
1714                 && !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
1715                 /* its not a valid database - possibly initialise it */
1716                 if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
1717                         errno = EIO; /* ie bad format or something */
1718                         goto fail;
1719                 }
1720                 rev = (tdb->flags & TDB_CONVERT);
1721         }
1722         vp = (unsigned char *)&tdb->header.version;
1723         vertest = (((u32)vp[0]) << 24) | (((u32)vp[1]) << 16) |
1724                   (((u32)vp[2]) << 8) | (u32)vp[3];
1725         tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
1726         if (!rev)
1727                 tdb->flags &= ~TDB_CONVERT;
1728         else {
1729                 tdb->flags |= TDB_CONVERT;
1730                 convert(&tdb->header, sizeof(tdb->header));
1731         }
1732         if (fstat(tdb->fd, &st) == -1)
1733                 goto fail;
1734
1735         /* Is it already in the open list?  If so, fail. */
1736         if (tdb_already_open(st.st_dev, st.st_ino)) {
1737                 TDB_LOG((tdb, 2, "tdb_open_ex: "
1738                          "%s (%d,%d) is already open in this process\n",
1739                          name, st.st_dev, st.st_ino));
1740                 errno = EBUSY;
1741                 goto fail;
1742         }
1743
1744         if (!(tdb->name = (char *)strdup(name))) {
1745                 errno = ENOMEM;
1746                 goto fail;
1747         }
1748
1749         tdb->map_size = st.st_size;
1750         tdb->device = st.st_dev;
1751         tdb->inode = st.st_ino;
1752         tdb->locked = calloc(tdb->header.hash_size+1, sizeof(tdb->locked[0]));
1753         if (!tdb->locked) {
1754                 TDB_LOG((tdb, 2, "tdb_open_ex: "
1755                          "failed to allocate lock structure for %s\n",
1756                          name));
1757                 errno = ENOMEM;
1758                 goto fail;
1759         }
1760         tdb_mmap(tdb);
1761         if (locked) {
1762                 if (!tdb->read_only)
1763                         if (tdb_clear_spinlocks(tdb) != 0) {
1764                                 TDB_LOG((tdb, 0, "tdb_open_ex: "
1765                                 "failed to clear spinlock\n"));
1766                                 goto fail;
1767                         }
1768                 if (tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0) == -1) {
1769                         TDB_LOG((tdb, 0, "tdb_open_ex: "
1770                                  "failed to take ACTIVE_LOCK on %s: %s\n",
1771                                  name, strerror(errno)));
1772                         goto fail;
1773                 }
1774         }
1775         /* leave this lock in place to indicate it's in use */
1776         if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)
1777                 goto fail;
1778
1779  internal:
1780         /* Internal (memory-only) databases skip all the code above to
1781          * do with disk files, and resume here by releasing their
1782          * global lock and hooking into the active list. */
1783         if (tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0) == -1)
1784                 goto fail;
1785         tdb->next = tdbs;
1786         tdbs = tdb;
1787         return tdb;
1788
1789  fail:
1790         { int save_errno = errno;
1791
1792         if (!tdb)
1793                 return NULL;
1794
1795         if (tdb->map_ptr) {
1796                 if (tdb->flags & TDB_INTERNAL)
1797                         SAFE_FREE(tdb->map_ptr);
1798                 else
1799                         tdb_munmap(tdb);
1800         }
1801         SAFE_FREE(tdb->name);
1802         if (tdb->fd != -1)
1803                 if (close(tdb->fd) != 0)
1804                         TDB_LOG((tdb, 5, "tdb_open_ex: failed to close tdb->fd on error!\n"));
1805         SAFE_FREE(tdb->locked);
1806         SAFE_FREE(tdb);
1807         errno = save_errno;
1808         return NULL;
1809         }
1810 }
1811
1812 /* close a database */
1813 int tdb_close(TDB_CONTEXT *tdb)
1814 {
1815         TDB_CONTEXT **i;
1816         int ret = 0;
1817
1818         if (tdb->map_ptr) {
1819                 if (tdb->flags & TDB_INTERNAL)
1820                         SAFE_FREE(tdb->map_ptr);
1821                 else
1822                         tdb_munmap(tdb);
1823         }
1824         SAFE_FREE(tdb->name);
1825         if (tdb->fd != -1)
1826                 ret = close(tdb->fd);
1827         SAFE_FREE(tdb->locked);
1828         SAFE_FREE(tdb->lockedkeys);
1829
1830         /* Remove from contexts list */
1831         for (i = &tdbs; *i; i = &(*i)->next) {
1832                 if (*i == tdb) {
1833                         *i = tdb->next;
1834                         break;
1835                 }
1836         }
1837
1838         memset(tdb, 0, sizeof(*tdb));
1839         SAFE_FREE(tdb);
1840
1841         return ret;
1842 }
1843
1844 /* lock/unlock entire database */
1845 int tdb_lockall(TDB_CONTEXT *tdb)
1846 {
1847         u32 i;
1848
1849         /* There are no locks on read-only dbs */
1850         if (tdb->read_only)
1851                 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
1852         if (tdb->lockedkeys)
1853                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1854         for (i = 0; i < tdb->header.hash_size; i++)
1855                 if (tdb_lock(tdb, i, F_WRLCK))
1856                         break;
1857
1858         /* If error, release locks we have... */
1859         if (i < tdb->header.hash_size) {
1860                 u32 j;
1861
1862                 for ( j = 0; j < i; j++)
1863                         tdb_unlock(tdb, j, F_WRLCK);
1864                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1865         }
1866
1867         return 0;
1868 }
1869 void tdb_unlockall(TDB_CONTEXT *tdb)
1870 {
1871         u32 i;
1872         for (i=0; i < tdb->header.hash_size; i++)
1873                 tdb_unlock(tdb, i, F_WRLCK);
1874 }
1875
1876 int tdb_lockkeys(TDB_CONTEXT *tdb, u32 number, TDB_DATA keys[])
1877 {
1878         u32 i, j, hash;
1879
1880         /* Can't lock more keys if already locked */
1881         if (tdb->lockedkeys)
1882                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1883         if (!(tdb->lockedkeys = malloc(sizeof(u32) * (number+1))))
1884                 return TDB_ERRCODE(TDB_ERR_OOM, -1);
1885         /* First number in array is # keys */
1886         tdb->lockedkeys[0] = number;
1887
1888         /* Insertion sort by bucket */
1889         for (i = 0; i < number; i++) {
1890                 hash = tdb_hash(&keys[i]);
1891                 for (j = 0; j < i && BUCKET(tdb->lockedkeys[j+1]) < BUCKET(hash); j++);
1892                         memmove(&tdb->lockedkeys[j+2], &tdb->lockedkeys[j+1], sizeof(u32) * (i-j));
1893                 tdb->lockedkeys[j+1] = hash;
1894         }
1895         /* Finally, lock in order */
1896         for (i = 0; i < number; i++)
1897                 if (tdb_lock(tdb, i, F_WRLCK))
1898                         break;
1899
1900         /* If error, release locks we have... */
1901         if (i < number) {
1902                 for ( j = 0; j < i; j++)
1903                         tdb_unlock(tdb, j, F_WRLCK);
1904                 SAFE_FREE(tdb->lockedkeys);
1905                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1906         }
1907         return 0;
1908 }
1909
1910 /* Unlock the keys previously locked by tdb_lockkeys() */
1911 void tdb_unlockkeys(TDB_CONTEXT *tdb)
1912 {
1913         u32 i;
1914         if (!tdb->lockedkeys)
1915                 return;
1916         for (i = 0; i < tdb->lockedkeys[0]; i++)
1917                 tdb_unlock(tdb, tdb->lockedkeys[i+1], F_WRLCK);
1918         SAFE_FREE(tdb->lockedkeys);
1919 }
1920
1921 /* lock/unlock one hash chain. This is meant to be used to reduce
1922    contention - it cannot guarantee how many records will be locked */
1923 int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key)
1924 {
1925         return tdb_lock(tdb, BUCKET(tdb_hash(&key)), F_WRLCK);
1926 }
1927
1928 int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key)
1929 {
1930         return tdb_unlock(tdb, BUCKET(tdb_hash(&key)), F_WRLCK);
1931 }
1932
1933 int tdb_chainlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
1934 {
1935         return tdb_lock(tdb, BUCKET(tdb_hash(&key)), F_RDLCK);
1936 }
1937
1938 int tdb_chainunlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
1939 {
1940         return tdb_unlock(tdb, BUCKET(tdb_hash(&key)), F_RDLCK);
1941 }
1942
1943
1944 /* register a loging function */
1945 void tdb_logging_function(TDB_CONTEXT *tdb, void (*fn)(TDB_CONTEXT *, int , const char *, ...))
1946 {
1947         tdb->log_fn = fn;
1948 }
1949
1950
1951 /* reopen a tdb - this is used after a fork to ensure that we have an independent
1952    seek pointer from our parent and to re-establish locks */
1953 int tdb_reopen(TDB_CONTEXT *tdb)
1954 {
1955         struct stat st;
1956
1957         if (tdb_munmap(tdb) != 0) {
1958                 TDB_LOG((tdb, 0, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
1959                 goto fail;
1960         }
1961         if (close(tdb->fd) != 0)
1962                 TDB_LOG((tdb, 0, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
1963         tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
1964         if (tdb->fd == -1) {
1965                 TDB_LOG((tdb, 0, "tdb_reopen: open failed (%s)\n", strerror(errno)));
1966                 goto fail;
1967         }
1968         if (fstat(tdb->fd, &st) != 0) {
1969                 TDB_LOG((tdb, 0, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
1970                 goto fail;
1971         }
1972         if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
1973                 TDB_LOG((tdb, 0, "tdb_reopen: file dev/inode has changed!\n"));
1974                 goto fail;
1975         }
1976         tdb_mmap(tdb);
1977         if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1) {
1978                 TDB_LOG((tdb, 0, "tdb_reopen: failed to obtain active lock\n"));
1979                 goto fail;
1980         }
1981
1982         return 0;
1983
1984 fail:
1985         tdb_close(tdb);
1986         return -1;
1987 }
1988
1989 /* reopen all tdb's */
1990 int tdb_reopen_all(void)
1991 {
1992         TDB_CONTEXT *tdb;
1993
1994         for (tdb=tdbs; tdb; tdb = tdb->next) {
1995                 if (tdb_reopen(tdb) != 0) return -1;
1996         }
1997
1998         return 0;
1999 }