source/tdb/tdb.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3    Samba database functions
   4    Copyright (C) Andrew Tridgell              1999-2000
   5    Copyright (C) Luke Kenneth Casson Leighton      2000
   6    Copyright (C) Paul `Rusty' Russell              2000
   7    Copyright (C) Jeremy Allison                    2000-2003
   8
   9    This program is free software; you can redistribute it and/or modify
  10    it under the terms of the GNU General Public License as published by
  11    the Free Software Foundation; either version 2 of the License, or
  12    (at your option) any later version.
  13
  14    This program is distributed in the hope that it will be useful,
  15    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17    GNU General Public License for more details.
  18
  19    You should have received a copy of the GNU General Public License
  20    along with this program; if not, write to the Free Software
  21    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  22 */
  23
  24
  25 /* NOTE: If you use tdbs under valgrind, and in particular if you run
  26  * tdbtorture, you may get spurious "uninitialized value" warnings.  I
  27  * think this is because valgrind doesn't understand that the mmap'd
  28  * area may be written to by other processes.  Memory can, from the
  29  * point of view of the grinded process, spontaneously become
  30  * initialized.
  31  *
  32  * I can think of a few solutions.  [mbp 20030311]
  33  *
  34  * 1 - Write suppressions for Valgrind so that it doesn't complain
  35  * about this.  Probably the most reasonable but people need to
  36  * remember to use them.
  37  *
  38  * 2 - Use IO not mmap when running under valgrind.  Not so nice.
  39  *
  40  * 3 - Use the special valgrind macros to mark memory as valid at the
  41  * right time.  Probably too hard -- the process just doesn't know.
  42  */
  43
  44 #ifdef STANDALONE
  45 #if HAVE_CONFIG_H
  46 #include <config.h>
  47 #endif
  48
  49 #include <stdlib.h>
  50 #include <stdio.h>
  51 #include <fcntl.h>
  52 #include <unistd.h>
  53 #include <string.h>
  54 #include <fcntl.h>
  55 #include <errno.h>
  56 #include <sys/mman.h>
  57 #include <sys/stat.h>
  58 #include <signal.h>
  59 #include "tdb.h"
  60 #include "spinlock.h"
  61 #else
  62 #include "includes.h"
  63 #endif
  64
  65 #define TDB_MAGIC_FOOD "TDB file\n"
  66 #define TDB_VERSION (0x26011967 + 6)
  67 #define TDB_MAGIC (0x26011999U)
  68 #define TDB_FREE_MAGIC (~TDB_MAGIC)
  69 #define TDB_DEAD_MAGIC (0xFEE1DEAD)
  70 #define TDB_ALIGNMENT 4
  71 #define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
  72 #define DEFAULT_HASH_SIZE 131
  73 #define TDB_PAGE_SIZE 0x2000
  74 #define FREELIST_TOP (sizeof(struct tdb_header))
  75 #define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
  76 #define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
  77 #define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
  78 #define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
  79 #define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))
  80 #define TDB_DATA_START(hash_size) (TDB_HASH_TOP(hash_size-1) + TDB_SPINLOCK_SIZE(hash_size))
  81
  82
  83 /* NB assumes there is a local variable called "tdb" that is the
  84  * current context, also takes doubly-parenthesized print-style
  85  * argument. */
  86 #define TDB_LOG(x) (tdb->log_fn?((tdb->log_fn x),0) : 0)
  87
  88 /* lock offsets */
  89 #define GLOBAL_LOCK 0
  90 #define ACTIVE_LOCK 4
  91
  92 #ifndef MAP_FILE
  93 #define MAP_FILE 0
  94 #endif
  95
  96 #ifndef MAP_FAILED
  97 #define MAP_FAILED ((void *)-1)
  98 #endif
  99
 100 /* free memory if the pointer is valid and zero the pointer */
 101 #ifndef SAFE_FREE
 102 #define SAFE_FREE(x) do { if ((x) != NULL) {free((x)); (x)=NULL;} } while(0)
 103 #endif
 104
 105 #define BUCKET(hash) ((hash) % tdb->header.hash_size)
 106 TDB_DATA tdb_null;
 107
 108 /* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
 109 static TDB_CONTEXT *tdbs = NULL;
 110
 111 static int tdb_munmap(TDB_CONTEXT *tdb)
 112 {
 113         if (tdb->flags & TDB_INTERNAL)
 114                 return 0;
 115
 116 #ifdef HAVE_MMAP
 117         if (tdb->map_ptr) {
 118                 int ret = munmap(tdb->map_ptr, tdb->map_size);
 119                 if (ret != 0)
 120                         return ret;
 121         }
 122 #endif
 123         tdb->map_ptr = NULL;
 124         return 0;
 125 }
 126
 127 static void tdb_mmap(TDB_CONTEXT *tdb)
 128 {
 129         if (tdb->flags & TDB_INTERNAL)
 130                 return;
 131
 132 #ifdef HAVE_MMAP
 133         if (!(tdb->flags & TDB_NOMMAP)) {
 134                 tdb->map_ptr = mmap(NULL, tdb->map_size,
 135                                     PROT_READ|(tdb->read_only? 0:PROT_WRITE),
 136                                     MAP_SHARED|MAP_FILE, tdb->fd, 0);
 137
 138                 /*
 139                  * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
 140                  */
 141
 142                 if (tdb->map_ptr == MAP_FAILED) {
 143                         tdb->map_ptr = NULL;
 144                         TDB_LOG((tdb, 2, "tdb_mmap failed for size %d (%s)\n",
 145                                  tdb->map_size, strerror(errno)));
 146                 }
 147         } else {
 148                 tdb->map_ptr = NULL;
 149         }
 150 #else
 151         tdb->map_ptr = NULL;
 152 #endif
 153 }
 154
 155 /* Endian conversion: we only ever deal with 4 byte quantities */
 156 static void *convert(void *buf, u32 size)
 157 {
 158         u32 i, *p = buf;
 159         for (i = 0; i < size / 4; i++)
 160                 p[i] = TDB_BYTEREV(p[i]);
 161         return buf;
 162 }
 163 #define DOCONV() (tdb->flags & TDB_CONVERT)
 164 #define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)
 165
 166 /* the body of the database is made of one list_struct for the free space
 167    plus a separate data list for each hash value */
 168 struct list_struct {
 169         tdb_off next; /* offset of the next record in the list */
 170         tdb_len rec_len; /* total byte length of record */
 171         tdb_len key_len; /* byte length of key */
 172         tdb_len data_len; /* byte length of data */
 173         u32 full_hash; /* the full 32 bit hash of the key */
 174         u32 magic;   /* try to catch errors */
 175         /* the following union is implied:
 176                 union {
 177                         char record[rec_len];
 178                         struct {
 179                                 char key[key_len];
 180                                 char data[data_len];
 181                         }
 182                         u32 totalsize; (tailer)
 183                 }
 184         */
 185 };
 186
 187 /***************************************************************
 188  Allow a caller to set a "alarm" flag that tdb can check to abort
 189  a blocking lock on SIGALRM.
 190 ***************************************************************/
 191
 192 static sig_atomic_t *palarm_fired;
 193
 194 void tdb_set_lock_alarm(sig_atomic_t *palarm)
 195 {
 196         palarm_fired = palarm;
 197 }
 198
 199 /* a byte range locking function - return 0 on success
 200    this functions locks/unlocks 1 byte at the specified offset.
 201
 202    On error, errno is also set so that errors are passed back properly
 203    through tdb_open(). */
 204 static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset,
 205                       int rw_type, int lck_type, int probe)
 206 {
 207         struct flock fl;
 208         int ret;
 209
 210         if (tdb->flags & TDB_NOLOCK)
 211                 return 0;
 212         if ((rw_type == F_WRLCK) && (tdb->read_only)) {
 213                 errno = EACCES;
 214                 return -1;
 215         }
 216
 217         fl.l_type = rw_type;
 218         fl.l_whence = SEEK_SET;
 219         fl.l_start = offset;
 220         fl.l_len = 1;
 221         fl.l_pid = 0;
 222
 223         do {
 224                 ret = fcntl(tdb->fd,lck_type,&fl);
 225                 if (ret == -1 && errno == EINTR && palarm_fired && *palarm_fired)
 226                         break;
 227         } while (ret == -1 && errno == EINTR);
 228
 229         if (ret == -1) {
 230                 if (!probe && lck_type != F_SETLK) {
 231                         /* Ensure error code is set for log fun to examine. */
 232                         if (errno == EINTR && palarm_fired && *palarm_fired)
 233                                 tdb->ecode = TDB_ERR_LOCK_TIMEOUT;
 234                         else
 235                                 tdb->ecode = TDB_ERR_LOCK;
 236                         TDB_LOG((tdb, 5,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
 237                                  tdb->fd, offset, rw_type, lck_type));
 238                 }
 239                 /* Was it an alarm timeout ? */
 240                 if (errno == EINTR && palarm_fired && *palarm_fired) {
 241                         TDB_LOG((tdb, 5, "tdb_brlock timed out (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
 242                                  tdb->fd, offset, rw_type, lck_type));
 243                         return TDB_ERRCODE(TDB_ERR_LOCK_TIMEOUT, -1);
 244                 }
 245                 /* Otherwise - generic lock error. */
 246                 /* errno set by fcntl */
 247                 TDB_LOG((tdb, 5, "tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d: %s\n",
 248                          tdb->fd, offset, rw_type, lck_type, strerror(errno)));
 249                 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
 250         }
 251         return 0;
 252 }
 253
 254 /* lock a list in the database. list -1 is the alloc list */
 255 static int tdb_lock(TDB_CONTEXT *tdb, int list, int ltype)
 256 {
 257         if (list < -1 || list >= (int)tdb->header.hash_size) {
 258                 TDB_LOG((tdb, 0,"tdb_lock: invalid list %d for ltype=%d\n",
 259                            list, ltype));
 260                 return -1;
 261         }
 262         if (tdb->flags & TDB_NOLOCK)
 263                 return 0;
 264
 265         /* Since fcntl locks don't nest, we do a lock for the first one,
 266            and simply bump the count for future ones */
 267         if (tdb->locked[list+1].count == 0) {
 268                 if (!tdb->read_only && tdb->header.rwlocks) {
 269                         if (tdb_spinlock(tdb, list, ltype)) {
 270                                 TDB_LOG((tdb, 0, "tdb_lock spinlock failed on list ltype=%d\n",
 271                                            list, ltype));
 272                                 return -1;
 273                         }
 274                 } else if (tdb_brlock(tdb,FREELIST_TOP+4*list,ltype,F_SETLKW, 0)) {
 275                         TDB_LOG((tdb, 0,"tdb_lock failed on list %d ltype=%d (%s)\n",
 276                                            list, ltype, strerror(errno)));
 277                         return -1;
 278                 }
 279                 tdb->locked[list+1].ltype = ltype;
 280         }
 281         tdb->locked[list+1].count++;
 282         return 0;
 283 }
 284
 285 /* unlock the database: returns void because it's too late for errors. */
 286         /* changed to return int it may be interesting to know there
 287            has been an error  --simo */
 288 static int tdb_unlock(TDB_CONTEXT *tdb, int list, int ltype)
 289 {
 290         int ret = -1;
 291
 292         if (tdb->flags & TDB_NOLOCK)
 293                 return 0;
 294
 295         /* Sanity checks */
 296         if (list < -1 || list >= (int)tdb->header.hash_size) {
 297                 TDB_LOG((tdb, 0, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
 298                 return ret;
 299         }
 300
 301         if (tdb->locked[list+1].count==0) {
 302                 TDB_LOG((tdb, 0, "tdb_unlock: count is 0\n"));
 303                 return ret;
 304         }
 305
 306         if (tdb->locked[list+1].count == 1) {
 307                 /* Down to last nested lock: unlock underneath */
 308                 if (!tdb->read_only && tdb->header.rwlocks) {
 309                         ret = tdb_spinunlock(tdb, list, ltype);
 310                 } else {
 311                         ret = tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK, F_SETLKW, 0);
 312                 }
 313         } else {
 314                 ret = 0;
 315         }
 316         tdb->locked[list+1].count--;
 317
 318         if (ret)
 319                 TDB_LOG((tdb, 0,"tdb_unlock: An error occurred unlocking!\n"));
 320         return ret;
 321 }
 322
 323 /* This is based on the hash algorithm from gdbm */
 324 static u32 tdb_hash(TDB_DATA *key)
 325 {
 326         u32 value;      /* Used to compute the hash value.  */
 327         u32   i;        /* Used to cycle through random values. */
 328
 329         /* Set the initial value from the key size. */
 330         for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
 331                 value = (value + (key->dptr[i] << (i*5 % 24)));
 332
 333         return (1103515243 * value + 12345);
 334 }
 335
 336 /* check for an out of bounds access - if it is out of bounds then
 337    see if the database has been expanded by someone else and expand
 338    if necessary
 339    note that "len" is the minimum length needed for the db
 340 */
 341 static int tdb_oob(TDB_CONTEXT *tdb, tdb_off len, int probe)
 342 {
 343         struct stat st;
 344         if (len <= tdb->map_size)
 345                 return 0;
 346         if (tdb->flags & TDB_INTERNAL) {
 347                 if (!probe) {
 348                         /* Ensure ecode is set for log fn. */
 349                         tdb->ecode = TDB_ERR_IO;
 350                         TDB_LOG((tdb, 0,"tdb_oob len %d beyond internal malloc size %d\n",
 351                                  (int)len, (int)tdb->map_size));
 352                 }
 353                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 354         }
 355
 356         if (fstat(tdb->fd, &st) == -1)
 357                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 358
 359         if (st.st_size < (size_t)len) {
 360                 if (!probe) {
 361                         /* Ensure ecode is set for log fn. */
 362                         tdb->ecode = TDB_ERR_IO;
 363                         TDB_LOG((tdb, 0,"tdb_oob len %d beyond eof at %d\n",
 364                                  (int)len, (int)st.st_size));
 365                 }
 366                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 367         }
 368
 369         /* Unmap, update size, remap */
 370         if (tdb_munmap(tdb) == -1)
 371                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 372         tdb->map_size = st.st_size;
 373         tdb_mmap(tdb);
 374         return 0;
 375 }
 376
 377 /* write a lump of data at a specified offset */
 378 static int tdb_write(TDB_CONTEXT *tdb, tdb_off off, void *buf, tdb_len len)
 379 {
 380         if (tdb_oob(tdb, off + len, 0) != 0)
 381                 return -1;
 382
 383         if (tdb->map_ptr)
 384                 memcpy(off + (char *)tdb->map_ptr, buf, len);
 385 #ifdef HAVE_PWRITE
 386         else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
 387 #else
 388         else if (lseek(tdb->fd, off, SEEK_SET) != off
 389                  || write(tdb->fd, buf, len) != (ssize_t)len) {
 390 #endif
 391                 /* Ensure ecode is set for log fn. */
 392                 tdb->ecode = TDB_ERR_IO;
 393                 TDB_LOG((tdb, 0,"tdb_write failed at %d len=%d (%s)\n",
 394                            off, len, strerror(errno)));
 395                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 396         }
 397         return 0;
 398 }
 399
 400 /* read a lump of data at a specified offset, maybe convert */
 401 static int tdb_read(TDB_CONTEXT *tdb,tdb_off off,void *buf,tdb_len len,int cv)
 402 {
 403         if (tdb_oob(tdb, off + len, 0) != 0)
 404                 return -1;
 405
 406         if (tdb->map_ptr)
 407                 memcpy(buf, off + (char *)tdb->map_ptr, len);
 408 #ifdef HAVE_PREAD
 409         else if (pread(tdb->fd, buf, len, off) != (ssize_t)len) {
 410 #else
 411         else if (lseek(tdb->fd, off, SEEK_SET) != off
 412                  || read(tdb->fd, buf, len) != (ssize_t)len) {
 413 #endif
 414                 /* Ensure ecode is set for log fn. */
 415                 tdb->ecode = TDB_ERR_IO;
 416                 TDB_LOG((tdb, 0,"tdb_read failed at %d len=%d (%s)\n",
 417                            off, len, strerror(errno)));
 418                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 419         }
 420         if (cv)
 421                 convert(buf, len);
 422         return 0;
 423 }
 424
 425 /* read a lump of data, allocating the space for it */
 426 static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
 427 {
 428         char *buf;
 429
 430         if (!(buf = malloc(len))) {
 431                 /* Ensure ecode is set for log fn. */
 432                 tdb->ecode = TDB_ERR_OOM;
 433                 TDB_LOG((tdb, 0,"tdb_alloc_read malloc failed len=%d (%s)\n",
 434                            len, strerror(errno)));
 435                 return TDB_ERRCODE(TDB_ERR_OOM, buf);
 436         }
 437         if (tdb_read(tdb, offset, buf, len, 0) == -1) {
 438                 SAFE_FREE(buf);
 439                 return NULL;
 440         }
 441         return buf;
 442 }
 443
 444 /* read/write a tdb_off */
 445 static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
 446 {
 447         return tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
 448 }
 449 static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
 450 {
 451         tdb_off off = *d;
 452         return tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
 453 }
 454
 455 /* read/write a record */
 456 static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 457 {
 458         if (tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
 459                 return -1;
 460         if (TDB_BAD_MAGIC(rec)) {
 461                 /* Ensure ecode is set for log fn. */
 462                 tdb->ecode = TDB_ERR_CORRUPT;
 463                 TDB_LOG((tdb, 0,"rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
 464                 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 465         }
 466         return tdb_oob(tdb, rec->next+sizeof(*rec), 0);
 467 }
 468 static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 469 {
 470         struct list_struct r = *rec;
 471         return tdb_write(tdb, offset, CONVERT(r), sizeof(r));
 472 }
 473
 474 /* read a freelist record and check for simple errors */
 475 static int rec_free_read(TDB_CONTEXT *tdb, tdb_off off, struct list_struct *rec)
 476 {
 477         if (tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
 478                 return -1;
 479
 480         if (rec->magic == TDB_MAGIC) {
 481                 /* this happens when a app is showdown while deleting a record - we should
 482                    not completely fail when this happens */
 483                 TDB_LOG((tdb, 0,"rec_free_read non-free magic at offset=%d - fixing\n",
 484                          rec->magic, off));
 485                 rec->magic = TDB_FREE_MAGIC;
 486                 if (tdb_write(tdb, off, rec, sizeof(*rec)) == -1)
 487                         return -1;
 488         }
 489
 490         if (rec->magic != TDB_FREE_MAGIC) {
 491                 /* Ensure ecode is set for log fn. */
 492                 tdb->ecode = TDB_ERR_CORRUPT;
 493                 TDB_LOG((tdb, 0,"rec_free_read bad magic 0x%x at offset=%d\n",
 494                            rec->magic, off));
 495                 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 496         }
 497         if (tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
 498                 return -1;
 499         return 0;
 500 }
 501
 502 /* update a record tailer (must hold allocation lock) */
 503 static int update_tailer(TDB_CONTEXT *tdb, tdb_off offset,
 504                          const struct list_struct *rec)
 505 {
 506         tdb_off totalsize;
 507
 508         /* Offset of tailer from record header */
 509         totalsize = sizeof(*rec) + rec->rec_len;
 510         return ofs_write(tdb, offset + totalsize - sizeof(tdb_off),
 511                          &totalsize);
 512 }
 513
 514 static tdb_off tdb_dump_record(TDB_CONTEXT *tdb, tdb_off offset)
 515 {
 516         struct list_struct rec;
 517         tdb_off tailer_ofs, tailer;
 518
 519         if (tdb_read(tdb, offset, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
 520                 printf("ERROR: failed to read record at %u\n", offset);
 521                 return 0;
 522         }
 523
 524         printf(" rec: offset=%u next=%d rec_len=%d key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
 525                offset, rec.next, rec.rec_len, rec.key_len, rec.data_len, rec.full_hash, rec.magic);
 526
 527         tailer_ofs = offset + sizeof(rec) + rec.rec_len - sizeof(tdb_off);
 528         if (ofs_read(tdb, tailer_ofs, &tailer) == -1) {
 529                 printf("ERROR: failed to read tailer at %u\n", tailer_ofs);
 530                 return rec.next;
 531         }
 532
 533         if (tailer != rec.rec_len + sizeof(rec)) {
 534                 printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
 535                                 (unsigned)tailer, (unsigned)(rec.rec_len + sizeof(rec)));
 536         }
 537         return rec.next;
 538 }
 539
 540 static int tdb_dump_chain(TDB_CONTEXT *tdb, int i)
 541 {
 542         tdb_off rec_ptr, top;
 543
 544         top = TDB_HASH_TOP(i);
 545
 546         if (tdb_lock(tdb, i, F_WRLCK) != 0)
 547                 return -1;
 548
 549         if (ofs_read(tdb, top, &rec_ptr) == -1)
 550                 return tdb_unlock(tdb, i, F_WRLCK);
 551
 552         if (rec_ptr)
 553                 printf("hash=%d\n", i);
 554
 555         while (rec_ptr) {
 556                 rec_ptr = tdb_dump_record(tdb, rec_ptr);
 557         }
 558
 559         return tdb_unlock(tdb, i, F_WRLCK);
 560 }
 561
 562 void tdb_dump_all(TDB_CONTEXT *tdb)
 563 {
 564         int i;
 565         for (i=0;i<tdb->header.hash_size;i++) {
 566                 tdb_dump_chain(tdb, i);
 567         }
 568         printf("freelist:\n");
 569         tdb_dump_chain(tdb, -1);
 570 }
 571
 572 int tdb_printfreelist(TDB_CONTEXT *tdb)
 573 {
 574         int ret;
 575         long total_free = 0;
 576         tdb_off offset, rec_ptr;
 577         struct list_struct rec;
 578
 579         if ((ret = tdb_lock(tdb, -1, F_WRLCK)) != 0)
 580                 return ret;
 581
 582         offset = FREELIST_TOP;
 583
 584         /* read in the freelist top */
 585         if (ofs_read(tdb, offset, &rec_ptr) == -1) {
 586                 tdb_unlock(tdb, -1, F_WRLCK);
 587                 return 0;
 588         }
 589
 590         printf("freelist top=[0x%08x]\n", rec_ptr );
 591         while (rec_ptr) {
 592                 if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
 593                         tdb_unlock(tdb, -1, F_WRLCK);
 594                         return -1;
 595                 }
 596
 597                 if (rec.magic != TDB_FREE_MAGIC) {
 598                         printf("bad magic 0x%08x in free list\n", rec.magic);
 599                         tdb_unlock(tdb, -1, F_WRLCK);
 600                         return -1;
 601                 }
 602
 603                 printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)]\n", rec.next, rec.rec_len, rec.rec_len );
 604                 total_free += rec.rec_len;
 605
 606                 /* move to the next record */
 607                 rec_ptr = rec.next;
 608         }
 609         printf("total rec_len = [0x%08x (%d)]\n", (int)total_free,
 610                (int)total_free);
 611
 612         return tdb_unlock(tdb, -1, F_WRLCK);
 613 }
 614
 615 /* Remove an element from the freelist.  Must have alloc lock. */
 616 static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
 617 {
 618         tdb_off last_ptr, i;
 619
 620         /* read in the freelist top */
 621         last_ptr = FREELIST_TOP;
 622         while (ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
 623                 if (i == off) {
 624                         /* We've found it! */
 625                         return ofs_write(tdb, last_ptr, &next);
 626                 }
 627                 /* Follow chain (next offset is at start of record) */
 628                 last_ptr = i;
 629         }
 630         TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
 631         return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 632 }
 633
 634 /* Add an element into the freelist. Merge adjacent records if
 635    neccessary. */
 636 static int tdb_free(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 637 {
 638         tdb_off right, left;
 639
 640         /* Allocation and tailer lock */
 641         if (tdb_lock(tdb, -1, F_WRLCK) != 0)
 642                 return -1;
 643
 644         /* set an initial tailer, so if we fail we don't leave a bogus record */
 645         if (update_tailer(tdb, offset, rec) != 0) {
 646                 TDB_LOG((tdb, 0, "tdb_free: upfate_tailer failed!\n"));
 647                 goto fail;
 648         }
 649
 650         /* Look right first (I'm an Australian, dammit) */
 651         right = offset + sizeof(*rec) + rec->rec_len;
 652         if (right + sizeof(*rec) <= tdb->map_size) {
 653                 struct list_struct r;
 654
 655                 if (tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
 656                         TDB_LOG((tdb, 0, "tdb_free: right read failed at %u\n", right));
 657                         goto left;
 658                 }
 659
 660                 /* If it's free, expand to include it. */
 661                 if (r.magic == TDB_FREE_MAGIC) {
 662                         if (remove_from_freelist(tdb, right, r.next) == -1) {
 663                                 TDB_LOG((tdb, 0, "tdb_free: right free failed at %u\n", right));
 664                                 goto left;
 665                         }
 666                         rec->rec_len += sizeof(r) + r.rec_len;
 667                 }
 668         }
 669
 670 left:
 671         /* Look left */
 672         left = offset - sizeof(tdb_off);
 673         if (left > TDB_DATA_START(tdb->header.hash_size)) {
 674                 struct list_struct l;
 675                 tdb_off leftsize;
 676
 677                 /* Read in tailer and jump back to header */
 678                 if (ofs_read(tdb, left, &leftsize) == -1) {
 679                         TDB_LOG((tdb, 0, "tdb_free: left offset read failed at %u\n", left));
 680                         goto update;
 681                 }
 682                 left = offset - leftsize;
 683
 684                 /* Now read in record */
 685                 if (tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
 686                         TDB_LOG((tdb, 0, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
 687                         goto update;
 688                 }
 689
 690                 /* If it's free, expand to include it. */
 691                 if (l.magic == TDB_FREE_MAGIC) {
 692                         if (remove_from_freelist(tdb, left, l.next) == -1) {
 693                                 TDB_LOG((tdb, 0, "tdb_free: left free failed at %u\n", left));
 694                                 goto update;
 695                         } else {
 696                                 offset = left;
 697                                 rec->rec_len += leftsize;
 698                         }
 699                 }
 700         }
 701
 702 update:
 703         if (update_tailer(tdb, offset, rec) == -1) {
 704                 TDB_LOG((tdb, 0, "tdb_free: update_tailer failed at %u\n", offset));
 705                 goto fail;
 706         }
 707
 708         /* Now, prepend to free list */
 709         rec->magic = TDB_FREE_MAGIC;
 710
 711         if (ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
 712             rec_write(tdb, offset, rec) == -1 ||
 713             ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
 714                 TDB_LOG((tdb, 0, "tdb_free record write failed at offset=%d\n", offset));
 715                 goto fail;
 716         }
 717
 718         /* And we're done. */
 719         tdb_unlock(tdb, -1, F_WRLCK);
 720         return 0;
 721
 722  fail:
 723         tdb_unlock(tdb, -1, F_WRLCK);
 724         return -1;
 725 }
 726
 727
 728 /* expand a file.  we prefer to use ftruncate, as that is what posix
 729   says to use for mmap expansion */
 730 static int expand_file(TDB_CONTEXT *tdb, tdb_off size, tdb_off addition)
 731 {
 732         char buf[1024];
 733 #if HAVE_FTRUNCATE_EXTEND
 734         if (ftruncate(tdb->fd, size+addition) != 0) {
 735                 TDB_LOG((tdb, 0, "expand_file ftruncate to %d failed (%s)\n",
 736                            size+addition, strerror(errno)));
 737                 return -1;
 738         }
 739 #else
 740         char b = 0;
 741
 742 #ifdef HAVE_PWRITE
 743         if (pwrite(tdb->fd,  &b, 1, (size+addition) - 1) != 1) {
 744 #else
 745         if (lseek(tdb->fd, (size+addition) - 1, SEEK_SET) != (size+addition) - 1 ||
 746             write(tdb->fd, &b, 1) != 1) {
 747 #endif
 748                 TDB_LOG((tdb, 0, "expand_file to %d failed (%s)\n",
 749                            size+addition, strerror(errno)));
 750                 return -1;
 751         }
 752 #endif
 753
 754         /* now fill the file with something. This ensures that the file isn't sparse, which would be
 755            very bad if we ran out of disk. This must be done with write, not via mmap */
 756         memset(buf, 0x42, sizeof(buf));
 757         while (addition) {
 758                 int n = addition>sizeof(buf)?sizeof(buf):addition;
 759 #ifdef HAVE_PWRITE
 760                 int ret = pwrite(tdb->fd, buf, n, size);
 761 #else
 762                 int ret;
 763                 if (lseek(tdb->fd, size, SEEK_SET) != size)
 764                         return -1;
 765                 ret = write(tdb->fd, buf, n);
 766 #endif
 767                 if (ret != n) {
 768                         TDB_LOG((tdb, 0, "expand_file write of %d failed (%s)\n",
 769                                    n, strerror(errno)));
 770                         return -1;
 771                 }
 772                 addition -= n;
 773                 size += n;
 774         }
 775         return 0;
 776 }
 777
 778
 779 /* expand the database at least size bytes by expanding the underlying
 780    file and doing the mmap again if necessary */
 781 static int tdb_expand(TDB_CONTEXT *tdb, tdb_off size)
 782 {
 783         struct list_struct rec;
 784         tdb_off offset;
 785
 786         if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
 787                 TDB_LOG((tdb, 0, "lock failed in tdb_expand\n"));
 788                 return -1;
 789         }
 790
 791         /* must know about any previous expansions by another process */
 792         tdb_oob(tdb, tdb->map_size + 1, 1);
 793
 794         /* always make room for at least 10 more records, and round
 795            the database up to a multiple of TDB_PAGE_SIZE */
 796         size = TDB_ALIGN(tdb->map_size + size*10, TDB_PAGE_SIZE) - tdb->map_size;
 797
 798         if (!(tdb->flags & TDB_INTERNAL))
 799                 tdb_munmap(tdb);
 800
 801         /*
 802          * We must ensure the file is unmapped before doing this
 803          * to ensure consistency with systems like OpenBSD where
 804          * writes and mmaps are not consistent.
 805          */
 806
 807         /* expand the file itself */
 808         if (!(tdb->flags & TDB_INTERNAL)) {
 809                 if (expand_file(tdb, tdb->map_size, size) != 0)
 810                         goto fail;
 811         }
 812
 813         tdb->map_size += size;
 814
 815         if (tdb->flags & TDB_INTERNAL)
 816                 tdb->map_ptr = realloc(tdb->map_ptr, tdb->map_size);
 817         else {
 818                 /*
 819                  * We must ensure the file is remapped before adding the space
 820                  * to ensure consistency with systems like OpenBSD where
 821                  * writes and mmaps are not consistent.
 822                  */
 823
 824                 /* We're ok if the mmap fails as we'll fallback to read/write */
 825                 tdb_mmap(tdb);
 826         }
 827
 828         /* form a new freelist record */
 829         memset(&rec,'\0',sizeof(rec));
 830         rec.rec_len = size - sizeof(rec);
 831
 832         /* link it into the free list */
 833         offset = tdb->map_size - size;
 834         if (tdb_free(tdb, offset, &rec) == -1)
 835                 goto fail;
 836
 837         tdb_unlock(tdb, -1, F_WRLCK);
 838         return 0;
 839  fail:
 840         tdb_unlock(tdb, -1, F_WRLCK);
 841         return -1;
 842 }
 843
 844 /* allocate some space from the free list. The offset returned points
 845    to a unconnected list_struct within the database with room for at
 846    least length bytes of total data
 847
 848    0 is returned if the space could not be allocated
 849  */
 850 static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length,
 851                             struct list_struct *rec)
 852 {
 853         tdb_off rec_ptr, last_ptr, newrec_ptr;
 854         struct list_struct newrec;
 855
 856         if (tdb_lock(tdb, -1, F_WRLCK) == -1)
 857                 return 0;
 858
 859         /* Extra bytes required for tailer */
 860         length += sizeof(tdb_off);
 861
 862  again:
 863         last_ptr = FREELIST_TOP;
 864
 865         /* read in the freelist top */
 866         if (ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
 867                 goto fail;
 868
 869         /* keep looking until we find a freelist record big enough */
 870         while (rec_ptr) {
 871                 if (rec_free_read(tdb, rec_ptr, rec) == -1)
 872                         goto fail;
 873
 874                 if (rec->rec_len >= length) {
 875                         /* found it - now possibly split it up  */
 876                         if (rec->rec_len > length + MIN_REC_SIZE) {
 877                                 /* Length of left piece */
 878                                 length = TDB_ALIGN(length, TDB_ALIGNMENT);
 879
 880                                 /* Right piece to go on free list */
 881                                 newrec.rec_len = rec->rec_len
 882                                         - (sizeof(*rec) + length);
 883                                 newrec_ptr = rec_ptr + sizeof(*rec) + length;
 884
 885                                 /* And left record is shortened */
 886                                 rec->rec_len = length;
 887                         } else
 888                                 newrec_ptr = 0;
 889
 890                         /* Remove allocated record from the free list */
 891                         if (ofs_write(tdb, last_ptr, &rec->next) == -1)
 892                                 goto fail;
 893
 894                         /* Update header: do this before we drop alloc
 895                            lock, otherwise tdb_free() might try to
 896                            merge with us, thinking we're free.
 897                            (Thanks Jeremy Allison). */
 898                         rec->magic = TDB_MAGIC;
 899                         if (rec_write(tdb, rec_ptr, rec) == -1)
 900                                 goto fail;
 901
 902                         /* Did we create new block? */
 903                         if (newrec_ptr) {
 904                                 /* Update allocated record tailer (we
 905                                    shortened it). */
 906                                 if (update_tailer(tdb, rec_ptr, rec) == -1)
 907                                         goto fail;
 908
 909                                 /* Free new record */
 910                                 if (tdb_free(tdb, newrec_ptr, &newrec) == -1)
 911                                         goto fail;
 912                         }
 913
 914                         /* all done - return the new record offset */
 915                         tdb_unlock(tdb, -1, F_WRLCK);
 916                         return rec_ptr;
 917                 }
 918                 /* move to the next record */
 919                 last_ptr = rec_ptr;
 920                 rec_ptr = rec->next;
 921         }
 922         /* we didn't find enough space. See if we can expand the
 923            database and if we can then try again */
 924         if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
 925                 goto again;
 926  fail:
 927         tdb_unlock(tdb, -1, F_WRLCK);
 928         return 0;
 929 }
 930
 931 /* initialise a new database with a specified hash size */
 932 static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
 933 {
 934         struct tdb_header *newdb;
 935         int size, ret = -1;
 936
 937         /* We make it up in memory, then write it out if not internal */
 938         size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off);
 939         if (!(newdb = calloc(size, 1)))
 940                 return TDB_ERRCODE(TDB_ERR_OOM, -1);
 941
 942         /* Fill in the header */
 943         newdb->version = TDB_VERSION;
 944         newdb->hash_size = hash_size;
 945 #ifdef USE_SPINLOCKS
 946         newdb->rwlocks = size;
 947 #endif
 948         if (tdb->flags & TDB_INTERNAL) {
 949                 tdb->map_size = size;
 950                 tdb->map_ptr = (char *)newdb;
 951                 memcpy(&tdb->header, newdb, sizeof(tdb->header));
 952                 /* Convert the `ondisk' version if asked. */
 953                 CONVERT(*newdb);
 954                 return 0;
 955         }
 956         if (lseek(tdb->fd, 0, SEEK_SET) == -1)
 957                 goto fail;
 958
 959         if (ftruncate(tdb->fd, 0) == -1)
 960                 goto fail;
 961
 962         /* This creates an endian-converted header, as if read from disk */
 963         CONVERT(*newdb);
 964         memcpy(&tdb->header, newdb, sizeof(tdb->header));
 965         /* Don't endian-convert the magic food! */
 966         memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
 967         if (write(tdb->fd, newdb, size) != size)
 968                 ret = -1;
 969         else
 970                 ret = tdb_create_rwlocks(tdb->fd, hash_size);
 971
 972   fail:
 973         SAFE_FREE(newdb);
 974         return ret;
 975 }
 976
 977 /* Returns 0 on fail.  On success, return offset of record, and fills
 978    in rec */
 979 static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash,
 980                         struct list_struct *r)
 981 {
 982         tdb_off rec_ptr;
 983
 984         /* read in the hash top */
 985         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 986                 return 0;
 987
 988         /* keep looking until we find the right record */
 989         while (rec_ptr) {
 990                 if (rec_read(tdb, rec_ptr, r) == -1)
 991                         return 0;
 992
 993                 if (!TDB_DEAD(r) && hash==r->full_hash && key.dsize==r->key_len) {
 994                         char *k;
 995                         /* a very likely hit - read the key */
 996                         k = tdb_alloc_read(tdb, rec_ptr + sizeof(*r),
 997                                            r->key_len);
 998                         if (!k)
 999                                 return 0;
1000
1001                         if (memcmp(key.dptr, k, key.dsize) == 0) {
1002                                 SAFE_FREE(k);
1003                                 return rec_ptr;
1004                         }
1005                         SAFE_FREE(k);
1006                 }
1007                 rec_ptr = r->next;
1008         }
1009         return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
1010 }
1011
1012 /* If they do lockkeys, check that this hash is one they locked */
1013 static int tdb_keylocked(TDB_CONTEXT *tdb, u32 hash)
1014 {
1015         u32 i;
1016         if (!tdb->lockedkeys)
1017                 return 1;
1018         for (i = 0; i < tdb->lockedkeys[0]; i++)
1019                 if (tdb->lockedkeys[i+1] == hash)
1020                         return 1;
1021         return TDB_ERRCODE(TDB_ERR_NOLOCK, 0);
1022 }
1023
1024 /* As tdb_find, but if you succeed, keep the lock */
1025 static tdb_off tdb_find_lock_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, int locktype,
1026                              struct list_struct *rec)
1027 {
1028         u32 rec_ptr;
1029
1030         if (!tdb_keylocked(tdb, hash))
1031                 return 0;
1032         if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
1033                 return 0;
1034         if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
1035                 tdb_unlock(tdb, BUCKET(hash), locktype);
1036         return rec_ptr;
1037 }
1038
1039 enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb)
1040 {
1041         return tdb->ecode;
1042 }
1043
1044 static struct tdb_errname {
1045         enum TDB_ERROR ecode; const char *estring;
1046 } emap[] = { {TDB_SUCCESS, "Success"},
1047              {TDB_ERR_CORRUPT, "Corrupt database"},
1048              {TDB_ERR_IO, "IO Error"},
1049              {TDB_ERR_LOCK, "Locking error"},
1050              {TDB_ERR_OOM, "Out of memory"},
1051              {TDB_ERR_EXISTS, "Record exists"},
1052              {TDB_ERR_NOLOCK, "Lock exists on other keys"},
1053              {TDB_ERR_NOEXIST, "Record does not exist"} };
1054
1055 /* Error string for the last tdb error */
1056 const char *tdb_errorstr(TDB_CONTEXT *tdb)
1057 {
1058         u32 i;
1059         for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
1060                 if (tdb->ecode == emap[i].ecode)
1061                         return emap[i].estring;
1062         return "Invalid error code";
1063 }
1064
1065 /* update an entry in place - this only works if the new data size
1066    is <= the old data size and the key exists.
1067    on failure return -1.
1068 */
1069
1070 static int tdb_update_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA dbuf)
1071 {
1072         struct list_struct rec;
1073         tdb_off rec_ptr;
1074
1075         /* find entry */
1076         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
1077                 return -1;
1078
1079         /* must be long enough key, data and tailer */
1080         if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off)) {
1081                 tdb->ecode = TDB_SUCCESS; /* Not really an error */
1082                 return -1;
1083         }
1084
1085         if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1086                       dbuf.dptr, dbuf.dsize) == -1)
1087                 return -1;
1088
1089         if (dbuf.dsize != rec.data_len) {
1090                 /* update size */
1091                 rec.data_len = dbuf.dsize;
1092                 return rec_write(tdb, rec_ptr, &rec);
1093         }
1094
1095         return 0;
1096 }
1097
1098 /* find an entry in the database given a key */
1099 /* If an entry doesn't exist tdb_err will be set to
1100  * TDB_ERR_NOEXIST. If a key has no data attached
1101  * tdb_err will not be set. Both will return a
1102  * zero pptr and zero dsize.
1103  */
1104
1105 TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
1106 {
1107         tdb_off rec_ptr;
1108         struct list_struct rec;
1109         TDB_DATA ret;
1110         u32 hash;
1111
1112         /* find which hash bucket it is in */
1113         hash = tdb_hash(&key);
1114         if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
1115                 return tdb_null;
1116
1117         if (rec.data_len)
1118                 ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1119                                           rec.data_len);
1120         else
1121                 ret.dptr = NULL;
1122         ret.dsize = rec.data_len;
1123         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1124         return ret;
1125 }
1126
1127 /* check if an entry in the database exists
1128
1129    note that 1 is returned if the key is found and 0 is returned if not found
1130    this doesn't match the conventions in the rest of this module, but is
1131    compatible with gdbm
1132 */
1133 static int tdb_exists_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
1134 {
1135         struct list_struct rec;
1136
1137         if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
1138                 return 0;
1139         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1140         return 1;
1141 }
1142
1143 int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key)
1144 {
1145         u32 hash = tdb_hash(&key);
1146         return tdb_exists_hash(tdb, key, hash);
1147 }
1148
1149 /* record lock stops delete underneath */
1150 static int lock_record(TDB_CONTEXT *tdb, tdb_off off)
1151 {
1152         return off ? tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0) : 0;
1153 }
1154 /*
1155   Write locks override our own fcntl readlocks, so check it here.
1156   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1157   an error to fail to get the lock here.
1158 */
1159
1160 static int write_lock_record(TDB_CONTEXT *tdb, tdb_off off)
1161 {
1162         struct tdb_traverse_lock *i;
1163         for (i = &tdb->travlocks; i; i = i->next)
1164                 if (i->off == off)
1165                         return -1;
1166         return tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1);
1167 }
1168
1169 /*
1170   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1171   an error to fail to get the lock here.
1172 */
1173
1174 static int write_unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1175 {
1176         return tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0);
1177 }
1178 /* fcntl locks don't stack: avoid unlocking someone else's */
1179 static int unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1180 {
1181         struct tdb_traverse_lock *i;
1182         u32 count = 0;
1183
1184         if (off == 0)
1185                 return 0;
1186         for (i = &tdb->travlocks; i; i = i->next)
1187                 if (i->off == off)
1188                         count++;
1189         return (count == 1 ? tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0) : 0);
1190 }
1191
1192 /* actually delete an entry in the database given the offset */
1193 static int do_delete(TDB_CONTEXT *tdb, tdb_off rec_ptr, struct list_struct*rec)
1194 {
1195         tdb_off last_ptr, i;
1196         struct list_struct lastrec;
1197
1198         if (tdb->read_only) return -1;
1199
1200         if (write_lock_record(tdb, rec_ptr) == -1) {
1201                 /* Someone traversing here: mark it as dead */
1202                 rec->magic = TDB_DEAD_MAGIC;
1203                 return rec_write(tdb, rec_ptr, rec);
1204         }
1205         if (write_unlock_record(tdb, rec_ptr) != 0)
1206                 return -1;
1207
1208         /* find previous record in hash chain */
1209         if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
1210                 return -1;
1211         for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
1212                 if (rec_read(tdb, i, &lastrec) == -1)
1213                         return -1;
1214
1215         /* unlink it: next ptr is at start of record. */
1216         if (last_ptr == 0)
1217                 last_ptr = TDB_HASH_TOP(rec->full_hash);
1218         if (ofs_write(tdb, last_ptr, &rec->next) == -1)
1219                 return -1;
1220
1221         /* recover the space */
1222         if (tdb_free(tdb, rec_ptr, rec) == -1)
1223                 return -1;
1224         return 0;
1225 }
1226
1227 /* Uses traverse lock: 0 = finish, -1 = error, other = record offset */
1228 static int tdb_next_lock(TDB_CONTEXT *tdb, struct tdb_traverse_lock *tlock,
1229                          struct list_struct *rec)
1230 {
1231         int want_next = (tlock->off != 0);
1232
1233         /* No traversal allows if you've called tdb_lockkeys() */
1234         if (tdb->lockedkeys)
1235                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1236
1237         /* Lock each chain from the start one. */
1238         for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
1239                 if (tdb_lock(tdb, tlock->hash, F_WRLCK) == -1)
1240                         return -1;
1241
1242                 /* No previous record?  Start at top of chain. */
1243                 if (!tlock->off) {
1244                         if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
1245                                      &tlock->off) == -1)
1246                                 goto fail;
1247                 } else {
1248                         /* Otherwise unlock the previous record. */
1249                         if (unlock_record(tdb, tlock->off) != 0)
1250                                 goto fail;
1251                 }
1252
1253                 if (want_next) {
1254                         /* We have offset of old record: grab next */
1255                         if (rec_read(tdb, tlock->off, rec) == -1)
1256                                 goto fail;
1257                         tlock->off = rec->next;
1258                 }
1259
1260                 /* Iterate through chain */
1261                 while( tlock->off) {
1262                         tdb_off current;
1263                         if (rec_read(tdb, tlock->off, rec) == -1)
1264                                 goto fail;
1265                         if (!TDB_DEAD(rec)) {
1266                                 /* Woohoo: we found one! */
1267                                 if (lock_record(tdb, tlock->off) != 0)
1268                                         goto fail;
1269                                 return tlock->off;
1270                         }
1271                         /* Try to clean dead ones from old traverses */
1272                         current = tlock->off;
1273                         tlock->off = rec->next;
1274                         if (!tdb->read_only &&
1275                             do_delete(tdb, current, rec) != 0)
1276                                 goto fail;
1277                 }
1278                 tdb_unlock(tdb, tlock->hash, F_WRLCK);
1279                 want_next = 0;
1280         }
1281         /* We finished iteration without finding anything */
1282         return TDB_ERRCODE(TDB_SUCCESS, 0);
1283
1284  fail:
1285         tlock->off = 0;
1286         if (tdb_unlock(tdb, tlock->hash, F_WRLCK) != 0)
1287                 TDB_LOG((tdb, 0, "tdb_next_lock: On error unlock failed!\n"));
1288         return -1;
1289 }
1290
1291 /* traverse the entire database - calling fn(tdb, key, data) on each element.
1292    return -1 on error or the record count traversed
1293    if fn is NULL then it is not called
1294    a non-zero return value from fn() indicates that the traversal should stop
1295   */
1296 int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *state)
1297 {
1298         TDB_DATA key, dbuf;
1299         struct list_struct rec;
1300         struct tdb_traverse_lock tl = { NULL, 0, 0 };
1301         int ret, count = 0;
1302
1303         /* This was in the initializaton, above, but the IRIX compiler
1304          * did not like it.  crh
1305          */
1306         tl.next = tdb->travlocks.next;
1307
1308         /* fcntl locks don't stack: beware traverse inside traverse */
1309         tdb->travlocks.next = &tl;
1310
1311         /* tdb_next_lock places locks on the record returned, and its chain */
1312         while ((ret = tdb_next_lock(tdb, &tl, &rec)) > 0) {
1313                 count++;
1314                 /* now read the full record */
1315                 key.dptr = tdb_alloc_read(tdb, tl.off + sizeof(rec),
1316                                           rec.key_len + rec.data_len);
1317                 if (!key.dptr) {
1318                         ret = -1;
1319                         if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0)
1320                                 goto out;
1321                         if (unlock_record(tdb, tl.off) != 0)
1322                                 TDB_LOG((tdb, 0, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
1323                         goto out;
1324                 }
1325                 key.dsize = rec.key_len;
1326                 dbuf.dptr = key.dptr + rec.key_len;
1327                 dbuf.dsize = rec.data_len;
1328
1329                 /* Drop chain lock, call out */
1330                 if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0) {
1331                         ret = -1;
1332                         goto out;
1333                 }
1334                 if (fn && fn(tdb, key, dbuf, state)) {
1335                         /* They want us to terminate traversal */
1336                         ret = count;
1337                         if (unlock_record(tdb, tl.off) != 0) {
1338                                 TDB_LOG((tdb, 0, "tdb_traverse: unlock_record failed!\n"));;
1339                                 ret = -1;
1340                         }
1341                         tdb->travlocks.next = tl.next;
1342                         SAFE_FREE(key.dptr);
1343                         return count;
1344                 }
1345                 SAFE_FREE(key.dptr);
1346         }
1347 out:
1348         tdb->travlocks.next = tl.next;
1349         if (ret < 0)
1350                 return -1;
1351         else
1352                 return count;
1353 }
1354
1355 /* find the first entry in the database and return its key */
1356 TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
1357 {
1358         TDB_DATA key;
1359         struct list_struct rec;
1360
1361         /* release any old lock */
1362         if (unlock_record(tdb, tdb->travlocks.off) != 0)
1363                 return tdb_null;
1364         tdb->travlocks.off = tdb->travlocks.hash = 0;
1365
1366         if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
1367                 return tdb_null;
1368         /* now read the key */
1369         key.dsize = rec.key_len;
1370         key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
1371         if (tdb_unlock(tdb, BUCKET(tdb->travlocks.hash), F_WRLCK) != 0)
1372                 TDB_LOG((tdb, 0, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
1373         return key;
1374 }
1375
1376 /* find the next entry in the database, returning its key */
1377 TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA oldkey)
1378 {
1379         u32 oldhash;
1380         TDB_DATA key = tdb_null;
1381         struct list_struct rec;
1382         char *k = NULL;
1383
1384         /* Is locked key the old key?  If so, traverse will be reliable. */
1385         if (tdb->travlocks.off) {
1386                 if (tdb_lock(tdb,tdb->travlocks.hash,F_WRLCK))
1387                         return tdb_null;
1388                 if (rec_read(tdb, tdb->travlocks.off, &rec) == -1
1389                     || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
1390                                             rec.key_len))
1391                     || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
1392                         /* No, it wasn't: unlock it and start from scratch */
1393                         if (unlock_record(tdb, tdb->travlocks.off) != 0)
1394                                 return tdb_null;
1395                         if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1396                                 return tdb_null;
1397                         tdb->travlocks.off = 0;
1398                 }
1399
1400                 SAFE_FREE(k);
1401         }
1402
1403         if (!tdb->travlocks.off) {
1404                 /* No previous element: do normal find, and lock record */
1405                 tdb->travlocks.off = tdb_find_lock_hash(tdb, oldkey, tdb_hash(&oldkey), F_WRLCK, &rec);
1406                 if (!tdb->travlocks.off)
1407                         return tdb_null;
1408                 tdb->travlocks.hash = BUCKET(rec.full_hash);
1409                 if (lock_record(tdb, tdb->travlocks.off) != 0) {
1410                         TDB_LOG((tdb, 0, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
1411                         return tdb_null;
1412                 }
1413         }
1414         oldhash = tdb->travlocks.hash;
1415
1416         /* Grab next record: locks chain and returned record,
1417            unlocks old record */
1418         if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
1419                 key.dsize = rec.key_len;
1420                 key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
1421                                           key.dsize);
1422                 /* Unlock the chain of this new record */
1423                 if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1424                         TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1425         }
1426         /* Unlock the chain of old record */
1427         if (tdb_unlock(tdb, BUCKET(oldhash), F_WRLCK) != 0)
1428                 TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1429         return key;
1430 }
1431
1432 /* delete an entry in the database given a key */
1433 static int tdb_delete_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
1434 {
1435         tdb_off rec_ptr;
1436         struct list_struct rec;
1437         int ret;
1438
1439         if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec)))
1440                 return -1;
1441         ret = do_delete(tdb, rec_ptr, &rec);
1442         if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
1443                 TDB_LOG((tdb, 0, "tdb_delete: WARNING tdb_unlock failed!\n"));
1444         return ret;
1445 }
1446
1447 int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
1448 {
1449         u32 hash = tdb_hash(&key);
1450         return tdb_delete_hash(tdb, key, hash);
1451 }
1452
1453 /* store an element in the database, replacing any existing element
1454    with the same key
1455
1456    return 0 on success, -1 on failure
1457 */
1458 int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
1459 {
1460         struct list_struct rec;
1461         u32 hash;
1462         tdb_off rec_ptr;
1463         char *p = NULL;
1464         int ret = 0;
1465
1466         /* find which hash bucket it is in */
1467         hash = tdb_hash(&key);
1468         if (!tdb_keylocked(tdb, hash))
1469                 return -1;
1470         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1471                 return -1;
1472
1473         /* check for it existing, on insert. */
1474         if (flag == TDB_INSERT) {
1475                 if (tdb_exists_hash(tdb, key, hash)) {
1476                         tdb->ecode = TDB_ERR_EXISTS;
1477                         goto fail;
1478                 }
1479         } else {
1480                 /* first try in-place update, on modify or replace. */
1481                 if (tdb_update_hash(tdb, key, hash, dbuf) == 0)
1482                         goto out;
1483                 if (flag == TDB_MODIFY && tdb->ecode == TDB_ERR_NOEXIST)
1484                         goto fail;
1485         }
1486         /* reset the error code potentially set by the tdb_update() */
1487         tdb->ecode = TDB_SUCCESS;
1488
1489         /* delete any existing record - if it doesn't exist we don't
1490            care.  Doing this first reduces fragmentation, and avoids
1491            coalescing with `allocated' block before it's updated. */
1492         if (flag != TDB_INSERT)
1493                 tdb_delete_hash(tdb, key, hash);
1494
1495         /* Copy key+value *before* allocating free space in case malloc
1496            fails and we are left with a dead spot in the tdb. */
1497
1498         if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
1499                 tdb->ecode = TDB_ERR_OOM;
1500                 goto fail;
1501         }
1502
1503         memcpy(p, key.dptr, key.dsize);
1504         if (dbuf.dsize)
1505                 memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
1506
1507         /* now we're into insert / modify / replace of a record which
1508          * we know could not be optimised by an in-place store (for
1509          * various reasons).  */
1510         if (!(rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec)))
1511                 goto fail;
1512
1513         /* Read hash top into next ptr */
1514         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1515                 goto fail;
1516
1517         rec.key_len = key.dsize;
1518         rec.data_len = dbuf.dsize;
1519         rec.full_hash = hash;
1520         rec.magic = TDB_MAGIC;
1521
1522         /* write out and point the top of the hash chain at it */
1523         if (rec_write(tdb, rec_ptr, &rec) == -1
1524             || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
1525             || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1526                 /* Need to tdb_unallocate() here */
1527                 goto fail;
1528         }
1529  out:
1530         SAFE_FREE(p);
1531         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1532         return ret;
1533 fail:
1534         ret = -1;
1535         goto out;
1536 }
1537
1538 /* Attempt to append data to an entry in place - this only works if the new data size
1539    is <= the old data size and the key exists.
1540    on failure return -1. Record must be locked before calling.
1541 */
1542 static int tdb_append_inplace(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA new_dbuf)
1543 {
1544         struct list_struct rec;
1545         tdb_off rec_ptr;
1546
1547         /* find entry */
1548         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
1549                 return -1;
1550
1551         /* Append of 0 is always ok. */
1552         if (new_dbuf.dsize == 0)
1553                 return 0;
1554
1555         /* must be long enough for key, old data + new data and tailer */
1556         if (rec.rec_len < key.dsize + rec.data_len + new_dbuf.dsize + sizeof(tdb_off)) {
1557                 /* No room. */
1558                 tdb->ecode = TDB_SUCCESS; /* Not really an error */
1559                 return -1;
1560         }
1561
1562         if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len + rec.data_len,
1563                       new_dbuf.dptr, new_dbuf.dsize) == -1)
1564                 return -1;
1565
1566         /* update size */
1567         rec.data_len += new_dbuf.dsize;
1568         return rec_write(tdb, rec_ptr, &rec);
1569 }
1570
1571 /* Append to an entry. Create if not exist. */
1572
1573 int tdb_append(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf)
1574 {
1575         struct list_struct rec;
1576         u32 hash;
1577         tdb_off rec_ptr;
1578         char *p = NULL;
1579         int ret = 0;
1580         size_t new_data_size = 0;
1581
1582         /* find which hash bucket it is in */
1583         hash = tdb_hash(&key);
1584         if (!tdb_keylocked(tdb, hash))
1585                 return -1;
1586         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1587                 return -1;
1588
1589         /* first try in-place. */
1590         if (tdb_append_inplace(tdb, key, hash, new_dbuf) == 0)
1591                 goto out;
1592
1593         /* reset the error code potentially set by the tdb_append_inplace() */
1594         tdb->ecode = TDB_SUCCESS;
1595
1596         /* find entry */
1597         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
1598                 if (tdb->ecode != TDB_ERR_NOEXIST)
1599                         goto fail;
1600
1601                 /* Not found - create. */
1602
1603                 ret = tdb_store(tdb, key, new_dbuf, TDB_INSERT);
1604                 goto out;
1605         }
1606
1607         new_data_size = rec.data_len + new_dbuf.dsize;
1608
1609         /* Copy key+old_value+value *before* allocating free space in case malloc
1610            fails and we are left with a dead spot in the tdb. */
1611
1612         if (!(p = (char *)malloc(key.dsize + new_data_size))) {
1613                 tdb->ecode = TDB_ERR_OOM;
1614                 goto fail;
1615         }
1616
1617         /* Copy the key in place. */
1618         memcpy(p, key.dptr, key.dsize);
1619
1620         /* Now read the old data into place. */
1621         if (rec.data_len &&
1622                 tdb_read(tdb, rec_ptr + sizeof(rec) + rec.key_len, p + key.dsize, rec.data_len, 0) == -1)
1623                         goto fail;
1624
1625         /* Finally append the new data. */
1626         if (new_dbuf.dsize)
1627                 memcpy(p+key.dsize+rec.data_len, new_dbuf.dptr, new_dbuf.dsize);
1628
1629         /* delete any existing record - if it doesn't exist we don't
1630            care.  Doing this first reduces fragmentation, and avoids
1631            coalescing with `allocated' block before it's updated. */
1632
1633         tdb_delete_hash(tdb, key, hash);
1634
1635         if (!(rec_ptr = tdb_allocate(tdb, key.dsize + new_data_size, &rec)))
1636                 goto fail;
1637
1638         /* Read hash top into next ptr */
1639         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1640                 goto fail;
1641
1642         rec.key_len = key.dsize;
1643         rec.data_len = new_data_size;
1644         rec.full_hash = hash;
1645         rec.magic = TDB_MAGIC;
1646
1647         /* write out and point the top of the hash chain at it */
1648         if (rec_write(tdb, rec_ptr, &rec) == -1
1649             || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+new_data_size)==-1
1650             || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1651                 /* Need to tdb_unallocate() here */
1652                 goto fail;
1653         }
1654
1655  out:
1656         SAFE_FREE(p);
1657         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1658         return ret;
1659
1660 fail:
1661         ret = -1;
1662         goto out;
1663 }
1664
1665 static int tdb_already_open(dev_t device,
1666                             ino_t ino)
1667 {
1668         TDB_CONTEXT *i;
1669
1670         for (i = tdbs; i; i = i->next) {
1671                 if (i->device == device && i->inode == ino) {
1672                         return 1;
1673                 }
1674         }
1675
1676         return 0;
1677 }
1678
1679 /* open the database, creating it if necessary
1680
1681    The open_flags and mode are passed straight to the open call on the
1682    database file. A flags value of O_WRONLY is invalid. The hash size
1683    is advisory, use zero for a default value.
1684
1685    Return is NULL on error, in which case errno is also set.  Don't
1686    try to call tdb_error or tdb_errname, just do strerror(errno).
1687
1688    @param name may be NULL for internal databases. */
1689 TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
1690                       int open_flags, mode_t mode)
1691 {
1692         return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL);
1693 }
1694
1695
1696 TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
1697                          int open_flags, mode_t mode,
1698                          tdb_log_func log_fn)
1699 {
1700         TDB_CONTEXT *tdb;
1701         struct stat st;
1702         int rev = 0, locked;
1703         unsigned char *vp;
1704         u32 vertest;
1705
1706         if (!(tdb = calloc(1, sizeof *tdb))) {
1707                 /* Can't log this */
1708                 errno = ENOMEM;
1709                 goto fail;
1710         }
1711         tdb->fd = -1;
1712         tdb->name = NULL;
1713         tdb->map_ptr = NULL;
1714         tdb->lockedkeys = NULL;
1715         tdb->flags = tdb_flags;
1716         tdb->open_flags = open_flags;
1717         tdb->log_fn = log_fn;
1718
1719         if ((open_flags & O_ACCMODE) == O_WRONLY) {
1720                 TDB_LOG((tdb, 0, "tdb_open_ex: can't open tdb %s write-only\n",
1721                          name));
1722                 errno = EINVAL;
1723                 goto fail;
1724         }
1725
1726         if (hash_size == 0)
1727                 hash_size = DEFAULT_HASH_SIZE;
1728         if ((open_flags & O_ACCMODE) == O_RDONLY) {
1729                 tdb->read_only = 1;
1730                 /* read only databases don't do locking or clear if first */
1731                 tdb->flags |= TDB_NOLOCK;
1732                 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1733         }
1734
1735         /* internal databases don't mmap or lock, and start off cleared */
1736         if (tdb->flags & TDB_INTERNAL) {
1737                 tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
1738                 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1739                 if (tdb_new_database(tdb, hash_size) != 0) {
1740                         TDB_LOG((tdb, 0, "tdb_open_ex: tdb_new_database failed!"));
1741                         goto fail;
1742                 }
1743                 goto internal;
1744         }
1745
1746         if ((tdb->fd = open(name, open_flags, mode)) == -1) {
1747                 TDB_LOG((tdb, 5, "tdb_open_ex: could not open file %s: %s\n",
1748                          name, strerror(errno)));
1749                 goto fail;      /* errno set by open(2) */
1750         }
1751
1752         /* ensure there is only one process initialising at once */
1753         if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0) == -1) {
1754                 TDB_LOG((tdb, 0, "tdb_open_ex: failed to get global lock on %s: %s\n",
1755                          name, strerror(errno)));
1756                 goto fail;      /* errno set by tdb_brlock */
1757         }
1758
1759         /* we need to zero database if we are the only one with it open */
1760         if ((locked = (tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0) == 0))
1761             && (tdb_flags & TDB_CLEAR_IF_FIRST)) {
1762                 open_flags |= O_CREAT;
1763                 if (ftruncate(tdb->fd, 0) == -1) {
1764                         TDB_LOG((tdb, 0, "tdb_open_ex: "
1765                                  "failed to truncate %s: %s\n",
1766                                  name, strerror(errno)));
1767                         goto fail; /* errno set by ftruncate */
1768                 }
1769         }
1770
1771         if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
1772             || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
1773             || (tdb->header.version != TDB_VERSION
1774                 && !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
1775                 /* its not a valid database - possibly initialise it */
1776                 if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
1777                         errno = EIO; /* ie bad format or something */
1778                         goto fail;
1779                 }
1780                 rev = (tdb->flags & TDB_CONVERT);
1781         }
1782         vp = (unsigned char *)&tdb->header.version;
1783         vertest = (((u32)vp[0]) << 24) | (((u32)vp[1]) << 16) |
1784                   (((u32)vp[2]) << 8) | (u32)vp[3];
1785         tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
1786         if (!rev)
1787                 tdb->flags &= ~TDB_CONVERT;
1788         else {
1789                 tdb->flags |= TDB_CONVERT;
1790                 convert(&tdb->header, sizeof(tdb->header));
1791         }
1792         if (fstat(tdb->fd, &st) == -1)
1793                 goto fail;
1794
1795         /* Is it already in the open list?  If so, fail. */
1796         if (tdb_already_open(st.st_dev, st.st_ino)) {
1797                 TDB_LOG((tdb, 2, "tdb_open_ex: "
1798                          "%s (%d,%d) is already open in this process\n",
1799                          name, st.st_dev, st.st_ino));
1800                 errno = EBUSY;
1801                 goto fail;
1802         }
1803
1804         if (!(tdb->name = (char *)strdup(name))) {
1805                 errno = ENOMEM;
1806                 goto fail;
1807         }
1808
1809         tdb->map_size = st.st_size;
1810         tdb->device = st.st_dev;
1811         tdb->inode = st.st_ino;
1812         tdb->locked = calloc(tdb->header.hash_size+1, sizeof(tdb->locked[0]));
1813         if (!tdb->locked) {
1814                 TDB_LOG((tdb, 2, "tdb_open_ex: "
1815                          "failed to allocate lock structure for %s\n",
1816                          name));
1817                 errno = ENOMEM;
1818                 goto fail;
1819         }
1820         tdb_mmap(tdb);
1821         if (locked) {
1822                 if (!tdb->read_only)
1823                         if (tdb_clear_spinlocks(tdb) != 0) {
1824                                 TDB_LOG((tdb, 0, "tdb_open_ex: "
1825                                 "failed to clear spinlock\n"));
1826                                 goto fail;
1827                         }
1828                 if (tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0) == -1) {
1829                         TDB_LOG((tdb, 0, "tdb_open_ex: "
1830                                  "failed to take ACTIVE_LOCK on %s: %s\n",
1831                                  name, strerror(errno)));
1832                         goto fail;
1833                 }
1834         }
1835         /* leave this lock in place to indicate it's in use */
1836         if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)
1837                 goto fail;
1838
1839  internal:
1840         /* Internal (memory-only) databases skip all the code above to
1841          * do with disk files, and resume here by releasing their
1842          * global lock and hooking into the active list. */
1843         if (tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0) == -1)
1844                 goto fail;
1845         tdb->next = tdbs;
1846         tdbs = tdb;
1847         return tdb;
1848
1849  fail:
1850         { int save_errno = errno;
1851
1852         if (!tdb)
1853                 return NULL;
1854
1855         if (tdb->map_ptr) {
1856                 if (tdb->flags & TDB_INTERNAL)
1857                         SAFE_FREE(tdb->map_ptr);
1858                 else
1859                         tdb_munmap(tdb);
1860         }
1861         SAFE_FREE(tdb->name);
1862         if (tdb->fd != -1)
1863                 if (close(tdb->fd) != 0)
1864                         TDB_LOG((tdb, 5, "tdb_open_ex: failed to close tdb->fd on error!\n"));
1865         SAFE_FREE(tdb->locked);
1866         SAFE_FREE(tdb);
1867         errno = save_errno;
1868         return NULL;
1869         }
1870 }
1871
1872 /**
1873  * Close a database.
1874  *
1875  * @returns -1 for error; 0 for success.
1876  **/
1877 int tdb_close(TDB_CONTEXT *tdb)
1878 {
1879         TDB_CONTEXT **i;
1880         int ret = 0;
1881
1882         if (tdb->map_ptr) {
1883                 if (tdb->flags & TDB_INTERNAL)
1884                         SAFE_FREE(tdb->map_ptr);
1885                 else
1886                         tdb_munmap(tdb);
1887         }
1888         SAFE_FREE(tdb->name);
1889         if (tdb->fd != -1)
1890                 ret = close(tdb->fd);
1891         SAFE_FREE(tdb->locked);
1892         SAFE_FREE(tdb->lockedkeys);
1893
1894         /* Remove from contexts list */
1895         for (i = &tdbs; *i; i = &(*i)->next) {
1896                 if (*i == tdb) {
1897                         *i = tdb->next;
1898                         break;
1899                 }
1900         }
1901
1902         memset(tdb, 0, sizeof(*tdb));
1903         SAFE_FREE(tdb);
1904
1905         return ret;
1906 }
1907
1908 /* lock/unlock entire database */
1909 int tdb_lockall(TDB_CONTEXT *tdb)
1910 {
1911         u32 i;
1912
1913         /* There are no locks on read-only dbs */
1914         if (tdb->read_only)
1915                 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
1916         if (tdb->lockedkeys)
1917                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1918         for (i = 0; i < tdb->header.hash_size; i++)
1919                 if (tdb_lock(tdb, i, F_WRLCK))
1920                         break;
1921
1922         /* If error, release locks we have... */
1923         if (i < tdb->header.hash_size) {
1924                 u32 j;
1925
1926                 for ( j = 0; j < i; j++)
1927                         tdb_unlock(tdb, j, F_WRLCK);
1928                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1929         }
1930
1931         return 0;
1932 }
1933 void tdb_unlockall(TDB_CONTEXT *tdb)
1934 {
1935         u32 i;
1936         for (i=0; i < tdb->header.hash_size; i++)
1937                 tdb_unlock(tdb, i, F_WRLCK);
1938 }
1939
1940 int tdb_lockkeys(TDB_CONTEXT *tdb, u32 number, TDB_DATA keys[])
1941 {
1942         u32 i, j, hash;
1943
1944         /* Can't lock more keys if already locked */
1945         if (tdb->lockedkeys)
1946                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1947         if (!(tdb->lockedkeys = malloc(sizeof(u32) * (number+1))))
1948                 return TDB_ERRCODE(TDB_ERR_OOM, -1);
1949         /* First number in array is # keys */
1950         tdb->lockedkeys[0] = number;
1951
1952         /* Insertion sort by bucket */
1953         for (i = 0; i < number; i++) {
1954                 hash = tdb_hash(&keys[i]);
1955                 for (j = 0; j < i && BUCKET(tdb->lockedkeys[j+1]) < BUCKET(hash); j++);
1956                         memmove(&tdb->lockedkeys[j+2], &tdb->lockedkeys[j+1], sizeof(u32) * (i-j));
1957                 tdb->lockedkeys[j+1] = hash;
1958         }
1959         /* Finally, lock in order */
1960         for (i = 0; i < number; i++)
1961                 if (tdb_lock(tdb, i, F_WRLCK))
1962                         break;
1963
1964         /* If error, release locks we have... */
1965         if (i < number) {
1966                 for ( j = 0; j < i; j++)
1967                         tdb_unlock(tdb, j, F_WRLCK);
1968                 SAFE_FREE(tdb->lockedkeys);
1969                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1970         }
1971         return 0;
1972 }
1973
1974 /* Unlock the keys previously locked by tdb_lockkeys() */
1975 void tdb_unlockkeys(TDB_CONTEXT *tdb)
1976 {
1977         u32 i;
1978         if (!tdb->lockedkeys)
1979                 return;
1980         for (i = 0; i < tdb->lockedkeys[0]; i++)
1981                 tdb_unlock(tdb, tdb->lockedkeys[i+1], F_WRLCK);
1982         SAFE_FREE(tdb->lockedkeys);
1983 }
1984
1985 /* lock/unlock one hash chain. This is meant to be used to reduce
1986    contention - it cannot guarantee how many records will be locked */
1987 int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key)
1988 {
1989         return tdb_lock(tdb, BUCKET(tdb_hash(&key)), F_WRLCK);
1990 }
1991
1992 int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key)
1993 {
1994         return tdb_unlock(tdb, BUCKET(tdb_hash(&key)), F_WRLCK);
1995 }
1996
1997 int tdb_chainlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
1998 {
1999         return tdb_lock(tdb, BUCKET(tdb_hash(&key)), F_RDLCK);
2000 }
2001
2002 int tdb_chainunlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
2003 {
2004         return tdb_unlock(tdb, BUCKET(tdb_hash(&key)), F_RDLCK);
2005 }
2006
2007
2008 /* register a loging function */
2009 void tdb_logging_function(TDB_CONTEXT *tdb, void (*fn)(TDB_CONTEXT *, int , const char *, ...))
2010 {
2011         tdb->log_fn = fn;
2012 }
2013
2014
2015 /* reopen a tdb - this is used after a fork to ensure that we have an independent
2016    seek pointer from our parent and to re-establish locks */
2017 int tdb_reopen(TDB_CONTEXT *tdb)
2018 {
2019         struct stat st;
2020
2021         if (tdb_munmap(tdb) != 0) {
2022                 TDB_LOG((tdb, 0, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
2023                 goto fail;
2024         }
2025         if (close(tdb->fd) != 0)
2026                 TDB_LOG((tdb, 0, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
2027         tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
2028         if (tdb->fd == -1) {
2029                 TDB_LOG((tdb, 0, "tdb_reopen: open failed (%s)\n", strerror(errno)));
2030                 goto fail;
2031         }
2032         if (fstat(tdb->fd, &st) != 0) {
2033                 TDB_LOG((tdb, 0, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
2034                 goto fail;
2035         }
2036         if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
2037                 TDB_LOG((tdb, 0, "tdb_reopen: file dev/inode has changed!\n"));
2038                 goto fail;
2039         }
2040         tdb_mmap(tdb);
2041         if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1) {
2042                 TDB_LOG((tdb, 0, "tdb_reopen: failed to obtain active lock\n"));
2043                 goto fail;
2044         }
2045
2046         return 0;
2047
2048 fail:
2049         tdb_close(tdb);
2050         return -1;
2051 }
2052
2053 /* reopen all tdb's */
2054 int tdb_reopen_all(void)
2055 {
2056         TDB_CONTEXT *tdb;
2057
2058         for (tdb=tdbs; tdb; tdb = tdb->next) {
2059                 if (tdb_reopen(tdb) != 0) return -1;
2060         }
2061
2062         return 0;
2063 }