source3/tdb/tdb.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3    Samba database functions
   4    Copyright (C) Andrew Tridgell              1999-2000
   5    Copyright (C) Luke Kenneth Casson Leighton      2000
   6    Copyright (C) Paul `Rusty' Russell              2000
   7    Copyright (C) Jeremy Allison                    2000-2003
   8
   9    This program is free software; you can redistribute it and/or modify
  10    it under the terms of the GNU General Public License as published by
  11    the Free Software Foundation; either version 2 of the License, or
  12    (at your option) any later version.
  13
  14    This program is distributed in the hope that it will be useful,
  15    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17    GNU General Public License for more details.
  18
  19    You should have received a copy of the GNU General Public License
  20    along with this program; if not, write to the Free Software
  21    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  22 */
  23
  24
  25 /* NOTE: If you use tdbs under valgrind, and in particular if you run
  26  * tdbtorture, you may get spurious "uninitialized value" warnings.  I
  27  * think this is because valgrind doesn't understand that the mmap'd
  28  * area may be written to by other processes.  Memory can, from the
  29  * point of view of the grinded process, spontaneously become
  30  * initialized.
  31  *
  32  * I can think of a few solutions.  [mbp 20030311]
  33  *
  34  * 1 - Write suppressions for Valgrind so that it doesn't complain
  35  * about this.  Probably the most reasonable but people need to
  36  * remember to use them.
  37  *
  38  * 2 - Use IO not mmap when running under valgrind.  Not so nice.
  39  *
  40  * 3 - Use the special valgrind macros to mark memory as valid at the
  41  * right time.  Probably too hard -- the process just doesn't know.
  42  */
  43
  44 #ifdef STANDALONE
  45 #if HAVE_CONFIG_H
  46 #include <config.h>
  47 #endif
  48
  49 #include <stdlib.h>
  50 #include <stdio.h>
  51 #include <fcntl.h>
  52 #include <unistd.h>
  53 #include <string.h>
  54 #include <fcntl.h>
  55 #include <errno.h>
  56 #include <sys/mman.h>
  57 #include <sys/stat.h>
  58 #include <signal.h>
  59 #include "tdb.h"
  60 #include "spinlock.h"
  61 #else
  62 #include "includes.h"
  63 #endif
  64
/* Identification string and format version constant (their use is outside
 * this section -- presumably they are stored in the file header; confirm
 * against tdb_open). */
#define TDB_MAGIC_FOOD "TDB file\n"
#define TDB_VERSION (0x26011967 + 6)
/* Values of list_struct.magic: a live record, a free-list record (the
 * bitwise complement of the live magic) and a dead record. */
#define TDB_MAGIC (0x26011999U)
#define TDB_FREE_MAGIC (~TDB_MAGIC)
#define TDB_DEAD_MAGIC (0xFEE1DEAD)
#define TDB_ALIGNMENT 4
/* Two record headers plus one alignment unit. */
#define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
#define DEFAULT_HASH_SIZE 131
/* tdb_expand() rounds the database size up to a multiple of this. */
#define TDB_PAGE_SIZE 0x2000
/* File offset of the free-list head pointer: it sits directly after the
 * header structure. */
#define FREELIST_TOP (sizeof(struct tdb_header))
/* Round x up to a multiple of a; only correct when a is a power of two. */
#define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
/* Reverse the byte order of a 32-bit value.  The argument is evaluated
 * several times, so it must not have side effects. */
#define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
/* Record-magic predicates; r is a pointer to a struct list_struct. */
#define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
#define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
/* File offset of the chain-head pointer for a hash value.  Goes through
 * BUCKET(), so it needs a local variable named "tdb" in scope. */
#define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))
/* Offset just past the last chain-head pointer and the spinlock area. */
#define TDB_DATA_START(hash_size) (TDB_HASH_TOP(hash_size-1) + TDB_SPINLOCK_SIZE(hash_size))
  81
  82
  83 /* NB assumes there is a local variable called "tdb" that is the
  84  * current context, also takes doubly-parenthesized print-style
  85  * argument. */
  86 #define TDB_LOG(x) (tdb->log_fn?((tdb->log_fn x),0) : 0)
  87
  88 /* lock offsets */
  89 #define GLOBAL_LOCK 0
  90 #define ACTIVE_LOCK 4
  91
  92 #ifndef MAP_FILE
  93 #define MAP_FILE 0
  94 #endif
  95
  96 #ifndef MAP_FAILED
  97 #define MAP_FAILED ((void *)-1)
  98 #endif
  99
 100 /* free memory if the pointer is valid and zero the pointer */
 101 #ifndef SAFE_FREE
 102 #define SAFE_FREE(x) do { if ((x) != NULL) {free((x)); (x)=NULL;} } while(0)
 103 #endif
 104
 105 #define BUCKET(hash) ((hash) % tdb->header.hash_size)
 106 TDB_DATA tdb_null;
 107
 108 /* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
 109 static TDB_CONTEXT *tdbs = NULL;
 110
 111 static int tdb_munmap(TDB_CONTEXT *tdb)
 112 {
 113         if (tdb->flags & TDB_INTERNAL)
 114                 return 0;
 115
 116 #ifdef HAVE_MMAP
 117         if (tdb->map_ptr) {
 118                 int ret = munmap(tdb->map_ptr, tdb->map_size);
 119                 if (ret != 0)
 120                         return ret;
 121         }
 122 #endif
 123         tdb->map_ptr = NULL;
 124         return 0;
 125 }
 126
 127 static void tdb_mmap(TDB_CONTEXT *tdb)
 128 {
 129         if (tdb->flags & TDB_INTERNAL)
 130                 return;
 131
 132 #ifdef HAVE_MMAP
 133         if (!(tdb->flags & TDB_NOMMAP)) {
 134                 tdb->map_ptr = mmap(NULL, tdb->map_size,
 135                                     PROT_READ|(tdb->read_only? 0:PROT_WRITE),
 136                                     MAP_SHARED|MAP_FILE, tdb->fd, 0);
 137
 138                 /*
 139                  * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
 140                  */
 141
 142                 if (tdb->map_ptr == MAP_FAILED) {
 143                         tdb->map_ptr = NULL;
 144                         TDB_LOG((tdb, 2, "tdb_mmap failed for size %d (%s)\n",
 145                                  tdb->map_size, strerror(errno)));
 146                 }
 147         } else {
 148                 tdb->map_ptr = NULL;
 149         }
 150 #else
 151         tdb->map_ptr = NULL;
 152 #endif
 153 }
 154
 155 /* Endian conversion: we only ever deal with 4 byte quantities */
 156 static void *convert(void *buf, u32 size)
 157 {
 158         u32 i, *p = buf;
 159         for (i = 0; i < size / 4; i++)
 160                 p[i] = TDB_BYTEREV(p[i]);
 161         return buf;
 162 }
 163 #define DOCONV() (tdb->flags & TDB_CONVERT)
 164 #define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)
 165
/* The body of the database is made of one list_struct chain for the free
   space plus a separate chain for each hash value.  This is the record
   header; the key, data and tailer described at the end follow it. */
struct list_struct {
        tdb_off next; /* offset of the next record in the list */
        tdb_len rec_len; /* total byte length of record */
        tdb_len key_len; /* byte length of key */
        tdb_len data_len; /* byte length of data */
        u32 full_hash; /* the full 32 bit hash of the key */
        u32 magic;   /* try to catch errors */
        /* the following union is implied:
                union {
                        char record[rec_len];
                        struct {
                                char key[key_len];
                                char data[data_len];
                        }
                        u32 totalsize; (tailer)
                }
           The tailer occupies the last sizeof(tdb_off) bytes of the
           record and holds sizeof(struct list_struct) + rec_len
           (see update_tailer).
        */
};
 186
/***************************************************************
 Allow a caller to set an "alarm" flag that tdb can check to abort
 a blocking lock on SIGALRM.
***************************************************************/

/* Caller-owned flag consulted by tdb_brlock() when fcntl() is interrupted;
 * NULL means no flag has been registered. */
static sig_atomic_t *palarm_fired;

/* Register (or, with NULL, unregister) the flag.  Only the pointer is
 * stored, so the pointed-to object must remain valid while it is set. */
void tdb_set_lock_alarm(sig_atomic_t *palarm)
{
        palarm_fired = palarm;
}
 198
 199 /* a byte range locking function - return 0 on success
 200    this functions locks/unlocks 1 byte at the specified offset.
 201
 202    On error, errno is also set so that errors are passed back properly
 203    through tdb_open(). */
 204 static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset,
 205                       int rw_type, int lck_type, int probe)
 206 {
 207         struct flock fl;
 208         int ret;
 209
 210         if (tdb->flags & TDB_NOLOCK)
 211                 return 0;
 212         if ((rw_type == F_WRLCK) && (tdb->read_only)) {
 213                 errno = EACCES;
 214                 return -1;
 215         }
 216
 217         fl.l_type = rw_type;
 218         fl.l_whence = SEEK_SET;
 219         fl.l_start = offset;
 220         fl.l_len = 1;
 221         fl.l_pid = 0;
 222
 223         do {
 224                 ret = fcntl(tdb->fd,lck_type,&fl);
 225                 if (ret == -1 && errno == EINTR && palarm_fired && *palarm_fired)
 226                         break;
 227         } while (ret == -1 && errno == EINTR);
 228
 229         if (ret == -1) {
 230                 if (!probe && lck_type != F_SETLK) {
 231                         /* Ensure error code is set for log fun to examine. */
 232                         if (errno == EINTR && palarm_fired && *palarm_fired)
 233                                 tdb->ecode = TDB_ERR_LOCK_TIMEOUT;
 234                         else
 235                                 tdb->ecode = TDB_ERR_LOCK;
 236                         TDB_LOG((tdb, 5,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
 237                                  tdb->fd, offset, rw_type, lck_type));
 238                 }
 239                 /* Was it an alarm timeout ? */
 240                 if (errno == EINTR && palarm_fired && *palarm_fired) {
 241                         TDB_LOG((tdb, 5, "tdb_brlock timed out (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
 242                                  tdb->fd, offset, rw_type, lck_type));
 243                         return TDB_ERRCODE(TDB_ERR_LOCK_TIMEOUT, -1);
 244                 }
 245                 /* Otherwise - generic lock error. errno set by fcntl.
 246                  * EAGAIN is an expected return from non-blocking
 247                  * locks. */
 248                 if (errno != EAGAIN) {
 249                         TDB_LOG((tdb, 5, "tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d: %s\n",
 250                                  tdb->fd, offset, rw_type, lck_type,
 251                                  strerror(errno)));
 252                 }
 253                 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
 254         }
 255         return 0;
 256 }
 257
 258 /* lock a list in the database. list -1 is the alloc list */
 259 static int tdb_lock(TDB_CONTEXT *tdb, int list, int ltype)
 260 {
 261         if (list < -1 || list >= (int)tdb->header.hash_size) {
 262                 TDB_LOG((tdb, 0,"tdb_lock: invalid list %d for ltype=%d\n",
 263                            list, ltype));
 264                 return -1;
 265         }
 266         if (tdb->flags & TDB_NOLOCK)
 267                 return 0;
 268
 269         /* Since fcntl locks don't nest, we do a lock for the first one,
 270            and simply bump the count for future ones */
 271         if (tdb->locked[list+1].count == 0) {
 272                 if (!tdb->read_only && tdb->header.rwlocks) {
 273                         if (tdb_spinlock(tdb, list, ltype)) {
 274                                 TDB_LOG((tdb, 0, "tdb_lock spinlock failed on list ltype=%d\n",
 275                                            list, ltype));
 276                                 return -1;
 277                         }
 278                 } else if (tdb_brlock(tdb,FREELIST_TOP+4*list,ltype,F_SETLKW, 0)) {
 279                         TDB_LOG((tdb, 0,"tdb_lock failed on list %d ltype=%d (%s)\n",
 280                                            list, ltype, strerror(errno)));
 281                         return -1;
 282                 }
 283                 tdb->locked[list+1].ltype = ltype;
 284         }
 285         tdb->locked[list+1].count++;
 286         return 0;
 287 }
 288
 289 /* unlock the database: returns void because it's too late for errors. */
 290         /* changed to return int it may be interesting to know there
 291            has been an error  --simo */
 292 static int tdb_unlock(TDB_CONTEXT *tdb, int list, int ltype)
 293 {
 294         int ret = -1;
 295
 296         if (tdb->flags & TDB_NOLOCK)
 297                 return 0;
 298
 299         /* Sanity checks */
 300         if (list < -1 || list >= (int)tdb->header.hash_size) {
 301                 TDB_LOG((tdb, 0, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
 302                 return ret;
 303         }
 304
 305         if (tdb->locked[list+1].count==0) {
 306                 TDB_LOG((tdb, 0, "tdb_unlock: count is 0\n"));
 307                 return ret;
 308         }
 309
 310         if (tdb->locked[list+1].count == 1) {
 311                 /* Down to last nested lock: unlock underneath */
 312                 if (!tdb->read_only && tdb->header.rwlocks) {
 313                         ret = tdb_spinunlock(tdb, list, ltype);
 314                 } else {
 315                         ret = tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK, F_SETLKW, 0);
 316                 }
 317         } else {
 318                 ret = 0;
 319         }
 320         tdb->locked[list+1].count--;
 321
 322         if (ret)
 323                 TDB_LOG((tdb, 0,"tdb_unlock: An error occurred unlocking!\n"));
 324         return ret;
 325 }
 326
 327 /* This is based on the hash algorithm from gdbm */
 328 static u32 tdb_hash(TDB_DATA *key)
 329 {
 330         u32 value;      /* Used to compute the hash value.  */
 331         u32   i;        /* Used to cycle through random values. */
 332
 333         /* Set the initial value from the key size. */
 334         for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
 335                 value = (value + (key->dptr[i] << (i*5 % 24)));
 336
 337         return (1103515243 * value + 12345);
 338 }
 339
 340 /* check for an out of bounds access - if it is out of bounds then
 341    see if the database has been expanded by someone else and expand
 342    if necessary
 343    note that "len" is the minimum length needed for the db
 344 */
 345 static int tdb_oob(TDB_CONTEXT *tdb, tdb_off len, int probe)
 346 {
 347         struct stat st;
 348         if (len <= tdb->map_size)
 349                 return 0;
 350         if (tdb->flags & TDB_INTERNAL) {
 351                 if (!probe) {
 352                         /* Ensure ecode is set for log fn. */
 353                         tdb->ecode = TDB_ERR_IO;
 354                         TDB_LOG((tdb, 0,"tdb_oob len %d beyond internal malloc size %d\n",
 355                                  (int)len, (int)tdb->map_size));
 356                 }
 357                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 358         }
 359
 360         if (fstat(tdb->fd, &st) == -1)
 361                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 362
 363         if (st.st_size < (size_t)len) {
 364                 if (!probe) {
 365                         /* Ensure ecode is set for log fn. */
 366                         tdb->ecode = TDB_ERR_IO;
 367                         TDB_LOG((tdb, 0,"tdb_oob len %d beyond eof at %d\n",
 368                                  (int)len, (int)st.st_size));
 369                 }
 370                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 371         }
 372
 373         /* Unmap, update size, remap */
 374         if (tdb_munmap(tdb) == -1)
 375                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 376         tdb->map_size = st.st_size;
 377         tdb_mmap(tdb);
 378         return 0;
 379 }
 380
 381 /* write a lump of data at a specified offset */
 382 static int tdb_write(TDB_CONTEXT *tdb, tdb_off off, void *buf, tdb_len len)
 383 {
 384         if (tdb_oob(tdb, off + len, 0) != 0)
 385                 return -1;
 386
 387         if (tdb->map_ptr)
 388                 memcpy(off + (char *)tdb->map_ptr, buf, len);
 389 #ifdef HAVE_PWRITE
 390         else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
 391 #else
 392         else if (lseek(tdb->fd, off, SEEK_SET) != off
 393                  || write(tdb->fd, buf, len) != (ssize_t)len) {
 394 #endif
 395                 /* Ensure ecode is set for log fn. */
 396                 tdb->ecode = TDB_ERR_IO;
 397                 TDB_LOG((tdb, 0,"tdb_write failed at %d len=%d (%s)\n",
 398                            off, len, strerror(errno)));
 399                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 400         }
 401         return 0;
 402 }
 403
 404 /* read a lump of data at a specified offset, maybe convert */
 405 static int tdb_read(TDB_CONTEXT *tdb,tdb_off off,void *buf,tdb_len len,int cv)
 406 {
 407         if (tdb_oob(tdb, off + len, 0) != 0)
 408                 return -1;
 409
 410         if (tdb->map_ptr)
 411                 memcpy(buf, off + (char *)tdb->map_ptr, len);
 412 #ifdef HAVE_PREAD
 413         else if (pread(tdb->fd, buf, len, off) != (ssize_t)len) {
 414 #else
 415         else if (lseek(tdb->fd, off, SEEK_SET) != off
 416                  || read(tdb->fd, buf, len) != (ssize_t)len) {
 417 #endif
 418                 /* Ensure ecode is set for log fn. */
 419                 tdb->ecode = TDB_ERR_IO;
 420                 TDB_LOG((tdb, 0,"tdb_read failed at %d len=%d (%s)\n",
 421                            off, len, strerror(errno)));
 422                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 423         }
 424         if (cv)
 425                 convert(buf, len);
 426         return 0;
 427 }
 428
 429 /* read a lump of data, allocating the space for it */
 430 static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
 431 {
 432         char *buf;
 433
 434         if (!(buf = malloc(len))) {
 435                 /* Ensure ecode is set for log fn. */
 436                 tdb->ecode = TDB_ERR_OOM;
 437                 TDB_LOG((tdb, 0,"tdb_alloc_read malloc failed len=%d (%s)\n",
 438                            len, strerror(errno)));
 439                 return TDB_ERRCODE(TDB_ERR_OOM, buf);
 440         }
 441         if (tdb_read(tdb, offset, buf, len, 0) == -1) {
 442                 SAFE_FREE(buf);
 443                 return NULL;
 444         }
 445         return buf;
 446 }
 447
 448 /* read/write a tdb_off */
 449 static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
 450 {
 451         return tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
 452 }
 453 static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
 454 {
 455         tdb_off off = *d;
 456         return tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
 457 }
 458
 459 /* read/write a record */
 460 static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 461 {
 462         if (tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
 463                 return -1;
 464         if (TDB_BAD_MAGIC(rec)) {
 465                 /* Ensure ecode is set for log fn. */
 466                 tdb->ecode = TDB_ERR_CORRUPT;
 467                 TDB_LOG((tdb, 0,"rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
 468                 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 469         }
 470         return tdb_oob(tdb, rec->next+sizeof(*rec), 0);
 471 }
 472 static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 473 {
 474         struct list_struct r = *rec;
 475         return tdb_write(tdb, offset, CONVERT(r), sizeof(r));
 476 }
 477
 478 /* read a freelist record and check for simple errors */
 479 static int rec_free_read(TDB_CONTEXT *tdb, tdb_off off, struct list_struct *rec)
 480 {
 481         if (tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
 482                 return -1;
 483
 484         if (rec->magic == TDB_MAGIC) {
 485                 /* this happens when a app is showdown while deleting a record - we should
 486                    not completely fail when this happens */
 487                 TDB_LOG((tdb, 0,"rec_free_read non-free magic at offset=%d - fixing\n",
 488                          rec->magic, off));
 489                 rec->magic = TDB_FREE_MAGIC;
 490                 if (tdb_write(tdb, off, rec, sizeof(*rec)) == -1)
 491                         return -1;
 492         }
 493
 494         if (rec->magic != TDB_FREE_MAGIC) {
 495                 /* Ensure ecode is set for log fn. */
 496                 tdb->ecode = TDB_ERR_CORRUPT;
 497                 TDB_LOG((tdb, 0,"rec_free_read bad magic 0x%x at offset=%d\n",
 498                            rec->magic, off));
 499                 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 500         }
 501         if (tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
 502                 return -1;
 503         return 0;
 504 }
 505
 506 /* update a record tailer (must hold allocation lock) */
 507 static int update_tailer(TDB_CONTEXT *tdb, tdb_off offset,
 508                          const struct list_struct *rec)
 509 {
 510         tdb_off totalsize;
 511
 512         /* Offset of tailer from record header */
 513         totalsize = sizeof(*rec) + rec->rec_len;
 514         return ofs_write(tdb, offset + totalsize - sizeof(tdb_off),
 515                          &totalsize);
 516 }
 517
 518 static tdb_off tdb_dump_record(TDB_CONTEXT *tdb, tdb_off offset)
 519 {
 520         struct list_struct rec;
 521         tdb_off tailer_ofs, tailer;
 522
 523         if (tdb_read(tdb, offset, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
 524                 printf("ERROR: failed to read record at %u\n", offset);
 525                 return 0;
 526         }
 527
 528         printf(" rec: offset=%u next=%d rec_len=%d key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
 529                offset, rec.next, rec.rec_len, rec.key_len, rec.data_len, rec.full_hash, rec.magic);
 530
 531         tailer_ofs = offset + sizeof(rec) + rec.rec_len - sizeof(tdb_off);
 532         if (ofs_read(tdb, tailer_ofs, &tailer) == -1) {
 533                 printf("ERROR: failed to read tailer at %u\n", tailer_ofs);
 534                 return rec.next;
 535         }
 536
 537         if (tailer != rec.rec_len + sizeof(rec)) {
 538                 printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
 539                                 (unsigned)tailer, (unsigned)(rec.rec_len + sizeof(rec)));
 540         }
 541         return rec.next;
 542 }
 543
 544 static int tdb_dump_chain(TDB_CONTEXT *tdb, int i)
 545 {
 546         tdb_off rec_ptr, top;
 547
 548         top = TDB_HASH_TOP(i);
 549
 550         if (tdb_lock(tdb, i, F_WRLCK) != 0)
 551                 return -1;
 552
 553         if (ofs_read(tdb, top, &rec_ptr) == -1)
 554                 return tdb_unlock(tdb, i, F_WRLCK);
 555
 556         if (rec_ptr)
 557                 printf("hash=%d\n", i);
 558
 559         while (rec_ptr) {
 560                 rec_ptr = tdb_dump_record(tdb, rec_ptr);
 561         }
 562
 563         return tdb_unlock(tdb, i, F_WRLCK);
 564 }
 565
 566 void tdb_dump_all(TDB_CONTEXT *tdb)
 567 {
 568         int i;
 569         for (i=0;i<tdb->header.hash_size;i++) {
 570                 tdb_dump_chain(tdb, i);
 571         }
 572         printf("freelist:\n");
 573         tdb_dump_chain(tdb, -1);
 574 }
 575
 576 int tdb_printfreelist(TDB_CONTEXT *tdb)
 577 {
 578         int ret;
 579         long total_free = 0;
 580         tdb_off offset, rec_ptr;
 581         struct list_struct rec;
 582
 583         if ((ret = tdb_lock(tdb, -1, F_WRLCK)) != 0)
 584                 return ret;
 585
 586         offset = FREELIST_TOP;
 587
 588         /* read in the freelist top */
 589         if (ofs_read(tdb, offset, &rec_ptr) == -1) {
 590                 tdb_unlock(tdb, -1, F_WRLCK);
 591                 return 0;
 592         }
 593
 594         printf("freelist top=[0x%08x]\n", rec_ptr );
 595         while (rec_ptr) {
 596                 if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
 597                         tdb_unlock(tdb, -1, F_WRLCK);
 598                         return -1;
 599                 }
 600
 601                 if (rec.magic != TDB_FREE_MAGIC) {
 602                         printf("bad magic 0x%08x in free list\n", rec.magic);
 603                         tdb_unlock(tdb, -1, F_WRLCK);
 604                         return -1;
 605                 }
 606
 607                 printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)]\n", rec.next, rec.rec_len, rec.rec_len );
 608                 total_free += rec.rec_len;
 609
 610                 /* move to the next record */
 611                 rec_ptr = rec.next;
 612         }
 613         printf("total rec_len = [0x%08x (%d)]\n", (int)total_free,
 614                (int)total_free);
 615
 616         return tdb_unlock(tdb, -1, F_WRLCK);
 617 }
 618
 619 /* Remove an element from the freelist.  Must have alloc lock. */
 620 static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
 621 {
 622         tdb_off last_ptr, i;
 623
 624         /* read in the freelist top */
 625         last_ptr = FREELIST_TOP;
 626         while (ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
 627                 if (i == off) {
 628                         /* We've found it! */
 629                         return ofs_write(tdb, last_ptr, &next);
 630                 }
 631                 /* Follow chain (next offset is at start of record) */
 632                 last_ptr = i;
 633         }
 634         TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
 635         return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 636 }
 637
 638 /* Add an element into the freelist. Merge adjacent records if
 639    neccessary. */
 640 static int tdb_free(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 641 {
 642         tdb_off right, left;
 643
 644         /* Allocation and tailer lock */
 645         if (tdb_lock(tdb, -1, F_WRLCK) != 0)
 646                 return -1;
 647
 648         /* set an initial tailer, so if we fail we don't leave a bogus record */
 649         if (update_tailer(tdb, offset, rec) != 0) {
 650                 TDB_LOG((tdb, 0, "tdb_free: upfate_tailer failed!\n"));
 651                 goto fail;
 652         }
 653
 654         /* Look right first (I'm an Australian, dammit) */
 655         right = offset + sizeof(*rec) + rec->rec_len;
 656         if (right + sizeof(*rec) <= tdb->map_size) {
 657                 struct list_struct r;
 658
 659                 if (tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
 660                         TDB_LOG((tdb, 0, "tdb_free: right read failed at %u\n", right));
 661                         goto left;
 662                 }
 663
 664                 /* If it's free, expand to include it. */
 665                 if (r.magic == TDB_FREE_MAGIC) {
 666                         if (remove_from_freelist(tdb, right, r.next) == -1) {
 667                                 TDB_LOG((tdb, 0, "tdb_free: right free failed at %u\n", right));
 668                                 goto left;
 669                         }
 670                         rec->rec_len += sizeof(r) + r.rec_len;
 671                 }
 672         }
 673
 674 left:
 675         /* Look left */
 676         left = offset - sizeof(tdb_off);
 677         if (left > TDB_DATA_START(tdb->header.hash_size)) {
 678                 struct list_struct l;
 679                 tdb_off leftsize;
 680
 681                 /* Read in tailer and jump back to header */
 682                 if (ofs_read(tdb, left, &leftsize) == -1) {
 683                         TDB_LOG((tdb, 0, "tdb_free: left offset read failed at %u\n", left));
 684                         goto update;
 685                 }
 686                 left = offset - leftsize;
 687
 688                 /* Now read in record */
 689                 if (tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
 690                         TDB_LOG((tdb, 0, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
 691                         goto update;
 692                 }
 693
 694                 /* If it's free, expand to include it. */
 695                 if (l.magic == TDB_FREE_MAGIC) {
 696                         if (remove_from_freelist(tdb, left, l.next) == -1) {
 697                                 TDB_LOG((tdb, 0, "tdb_free: left free failed at %u\n", left));
 698                                 goto update;
 699                         } else {
 700                                 offset = left;
 701                                 rec->rec_len += leftsize;
 702                         }
 703                 }
 704         }
 705
 706 update:
 707         if (update_tailer(tdb, offset, rec) == -1) {
 708                 TDB_LOG((tdb, 0, "tdb_free: update_tailer failed at %u\n", offset));
 709                 goto fail;
 710         }
 711
 712         /* Now, prepend to free list */
 713         rec->magic = TDB_FREE_MAGIC;
 714
 715         if (ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
 716             rec_write(tdb, offset, rec) == -1 ||
 717             ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
 718                 TDB_LOG((tdb, 0, "tdb_free record write failed at offset=%d\n", offset));
 719                 goto fail;
 720         }
 721
 722         /* And we're done. */
 723         tdb_unlock(tdb, -1, F_WRLCK);
 724         return 0;
 725
 726  fail:
 727         tdb_unlock(tdb, -1, F_WRLCK);
 728         return -1;
 729 }
 730
 731
 732 /* expand a file.  we prefer to use ftruncate, as that is what posix
 733   says to use for mmap expansion */
 734 static int expand_file(TDB_CONTEXT *tdb, tdb_off size, tdb_off addition)
 735 {
 736         char buf[1024];
 737 #if HAVE_FTRUNCATE_EXTEND
 738         if (ftruncate(tdb->fd, size+addition) != 0) {
 739                 TDB_LOG((tdb, 0, "expand_file ftruncate to %d failed (%s)\n",
 740                            size+addition, strerror(errno)));
 741                 return -1;
 742         }
 743 #else
 744         char b = 0;
 745
 746 #ifdef HAVE_PWRITE
 747         if (pwrite(tdb->fd,  &b, 1, (size+addition) - 1) != 1) {
 748 #else
 749         if (lseek(tdb->fd, (size+addition) - 1, SEEK_SET) != (size+addition) - 1 ||
 750             write(tdb->fd, &b, 1) != 1) {
 751 #endif
 752                 TDB_LOG((tdb, 0, "expand_file to %d failed (%s)\n",
 753                            size+addition, strerror(errno)));
 754                 return -1;
 755         }
 756 #endif
 757
 758         /* now fill the file with something. This ensures that the file isn't sparse, which would be
 759            very bad if we ran out of disk. This must be done with write, not via mmap */
 760         memset(buf, 0x42, sizeof(buf));
 761         while (addition) {
 762                 int n = addition>sizeof(buf)?sizeof(buf):addition;
 763 #ifdef HAVE_PWRITE
 764                 int ret = pwrite(tdb->fd, buf, n, size);
 765 #else
 766                 int ret;
 767                 if (lseek(tdb->fd, size, SEEK_SET) != size)
 768                         return -1;
 769                 ret = write(tdb->fd, buf, n);
 770 #endif
 771                 if (ret != n) {
 772                         TDB_LOG((tdb, 0, "expand_file write of %d failed (%s)\n",
 773                                    n, strerror(errno)));
 774                         return -1;
 775                 }
 776                 addition -= n;
 777                 size += n;
 778         }
 779         return 0;
 780 }
 781
 782
 783 /* expand the database at least size bytes by expanding the underlying
 784    file and doing the mmap again if necessary */
 785 static int tdb_expand(TDB_CONTEXT *tdb, tdb_off size)
 786 {
 787         struct list_struct rec;
 788         tdb_off offset;
 789
 790         if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
 791                 TDB_LOG((tdb, 0, "lock failed in tdb_expand\n"));
 792                 return -1;
 793         }
 794
 795         /* must know about any previous expansions by another process */
 796         tdb_oob(tdb, tdb->map_size + 1, 1);
 797
 798         /* always make room for at least 10 more records, and round
 799            the database up to a multiple of TDB_PAGE_SIZE */
 800         size = TDB_ALIGN(tdb->map_size + size*10, TDB_PAGE_SIZE) - tdb->map_size;
 801
 802         if (!(tdb->flags & TDB_INTERNAL))
 803                 tdb_munmap(tdb);
 804
 805         /*
 806          * We must ensure the file is unmapped before doing this
 807          * to ensure consistency with systems like OpenBSD where
 808          * writes and mmaps are not consistent.
 809          */
 810
 811         /* expand the file itself */
 812         if (!(tdb->flags & TDB_INTERNAL)) {
 813                 if (expand_file(tdb, tdb->map_size, size) != 0)
 814                         goto fail;
 815         }
 816
 817         tdb->map_size += size;
 818
 819         if (tdb->flags & TDB_INTERNAL)
 820                 tdb->map_ptr = realloc(tdb->map_ptr, tdb->map_size);
 821         else {
 822                 /*
 823                  * We must ensure the file is remapped before adding the space
 824                  * to ensure consistency with systems like OpenBSD where
 825                  * writes and mmaps are not consistent.
 826                  */
 827
 828                 /* We're ok if the mmap fails as we'll fallback to read/write */
 829                 tdb_mmap(tdb);
 830         }
 831
 832         /* form a new freelist record */
 833         memset(&rec,'\0',sizeof(rec));
 834         rec.rec_len = size - sizeof(rec);
 835
 836         /* link it into the free list */
 837         offset = tdb->map_size - size;
 838         if (tdb_free(tdb, offset, &rec) == -1)
 839                 goto fail;
 840
 841         tdb_unlock(tdb, -1, F_WRLCK);
 842         return 0;
 843  fail:
 844         tdb_unlock(tdb, -1, F_WRLCK);
 845         return -1;
 846 }
 847
 848 /* allocate some space from the free list. The offset returned points
 849    to a unconnected list_struct within the database with room for at
 850    least length bytes of total data
 851
 852    0 is returned if the space could not be allocated
 853  */
 854 static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length,
 855                             struct list_struct *rec)
 856 {
 857         tdb_off rec_ptr, last_ptr, newrec_ptr;
 858         struct list_struct newrec;
 859
 860         memset(&newrec, '\0', sizeof(newrec));
 861
 862         if (tdb_lock(tdb, -1, F_WRLCK) == -1)
 863                 return 0;
 864
 865         /* Extra bytes required for tailer */
 866         length += sizeof(tdb_off);
 867
 868  again:
 869         last_ptr = FREELIST_TOP;
 870
 871         /* read in the freelist top */
 872         if (ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
 873                 goto fail;
 874
 875         /* keep looking until we find a freelist record big enough */
 876         while (rec_ptr) {
 877                 if (rec_free_read(tdb, rec_ptr, rec) == -1)
 878                         goto fail;
 879
 880                 if (rec->rec_len >= length) {
 881                         /* found it - now possibly split it up  */
 882                         if (rec->rec_len > length + MIN_REC_SIZE) {
 883                                 /* Length of left piece */
 884                                 length = TDB_ALIGN(length, TDB_ALIGNMENT);
 885
 886                                 /* Right piece to go on free list */
 887                                 newrec.rec_len = rec->rec_len
 888                                         - (sizeof(*rec) + length);
 889                                 newrec_ptr = rec_ptr + sizeof(*rec) + length;
 890
 891                                 /* And left record is shortened */
 892                                 rec->rec_len = length;
 893                         } else
 894                                 newrec_ptr = 0;
 895
 896                         /* Remove allocated record from the free list */
 897                         if (ofs_write(tdb, last_ptr, &rec->next) == -1)
 898                                 goto fail;
 899
 900                         /* Update header: do this before we drop alloc
 901                            lock, otherwise tdb_free() might try to
 902                            merge with us, thinking we're free.
 903                            (Thanks Jeremy Allison). */
 904                         rec->magic = TDB_MAGIC;
 905                         if (rec_write(tdb, rec_ptr, rec) == -1)
 906                                 goto fail;
 907
 908                         /* Did we create new block? */
 909                         if (newrec_ptr) {
 910                                 /* Update allocated record tailer (we
 911                                    shortened it). */
 912                                 if (update_tailer(tdb, rec_ptr, rec) == -1)
 913                                         goto fail;
 914
 915                                 /* Free new record */
 916                                 if (tdb_free(tdb, newrec_ptr, &newrec) == -1)
 917                                         goto fail;
 918                         }
 919
 920                         /* all done - return the new record offset */
 921                         tdb_unlock(tdb, -1, F_WRLCK);
 922                         return rec_ptr;
 923                 }
 924                 /* move to the next record */
 925                 last_ptr = rec_ptr;
 926                 rec_ptr = rec->next;
 927         }
 928         /* we didn't find enough space. See if we can expand the
 929            database and if we can then try again */
 930         if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
 931                 goto again;
 932  fail:
 933         tdb_unlock(tdb, -1, F_WRLCK);
 934         return 0;
 935 }
 936
 937 /* initialise a new database with a specified hash size */
 938 static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
 939 {
 940         struct tdb_header *newdb;
 941         int size, ret = -1;
 942
 943         /* We make it up in memory, then write it out if not internal */
 944         size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off);
 945         if (!(newdb = calloc(size, 1)))
 946                 return TDB_ERRCODE(TDB_ERR_OOM, -1);
 947
 948         /* Fill in the header */
 949         newdb->version = TDB_VERSION;
 950         newdb->hash_size = hash_size;
 951 #ifdef USE_SPINLOCKS
 952         newdb->rwlocks = size;
 953 #endif
 954         if (tdb->flags & TDB_INTERNAL) {
 955                 tdb->map_size = size;
 956                 tdb->map_ptr = (char *)newdb;
 957                 memcpy(&tdb->header, newdb, sizeof(tdb->header));
 958                 /* Convert the `ondisk' version if asked. */
 959                 CONVERT(*newdb);
 960                 return 0;
 961         }
 962         if (lseek(tdb->fd, 0, SEEK_SET) == -1)
 963                 goto fail;
 964
 965         if (ftruncate(tdb->fd, 0) == -1)
 966                 goto fail;
 967
 968         /* This creates an endian-converted header, as if read from disk */
 969         CONVERT(*newdb);
 970         memcpy(&tdb->header, newdb, sizeof(tdb->header));
 971         /* Don't endian-convert the magic food! */
 972         memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
 973         if (write(tdb->fd, newdb, size) != size)
 974                 ret = -1;
 975         else
 976                 ret = tdb_create_rwlocks(tdb->fd, hash_size);
 977
 978   fail:
 979         SAFE_FREE(newdb);
 980         return ret;
 981 }
 982
 983 /* Returns 0 on fail.  On success, return offset of record, and fills
 984    in rec */
 985 static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash,
 986                         struct list_struct *r)
 987 {
 988         tdb_off rec_ptr;
 989
 990         /* read in the hash top */
 991         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 992                 return 0;
 993
 994         /* keep looking until we find the right record */
 995         while (rec_ptr) {
 996                 if (rec_read(tdb, rec_ptr, r) == -1)
 997                         return 0;
 998
 999                 if (!TDB_DEAD(r) && hash==r->full_hash && key.dsize==r->key_len) {
1000                         char *k;
1001                         /* a very likely hit - read the key */
1002                         k = tdb_alloc_read(tdb, rec_ptr + sizeof(*r),
1003                                            r->key_len);
1004                         if (!k)
1005                                 return 0;
1006
1007                         if (memcmp(key.dptr, k, key.dsize) == 0) {
1008                                 SAFE_FREE(k);
1009                                 return rec_ptr;
1010                         }
1011                         SAFE_FREE(k);
1012                 }
1013                 rec_ptr = r->next;
1014         }
1015         return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
1016 }
1017
1018 /* If they do lockkeys, check that this hash is one they locked */
1019 static int tdb_keylocked(TDB_CONTEXT *tdb, u32 hash)
1020 {
1021         u32 i;
1022         if (!tdb->lockedkeys)
1023                 return 1;
1024         for (i = 0; i < tdb->lockedkeys[0]; i++)
1025                 if (tdb->lockedkeys[i+1] == hash)
1026                         return 1;
1027         return TDB_ERRCODE(TDB_ERR_NOLOCK, 0);
1028 }
1029
1030 /* As tdb_find, but if you succeed, keep the lock */
1031 static tdb_off tdb_find_lock_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, int locktype,
1032                              struct list_struct *rec)
1033 {
1034         u32 rec_ptr;
1035
1036         if (!tdb_keylocked(tdb, hash))
1037                 return 0;
1038         if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
1039                 return 0;
1040         if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
1041                 tdb_unlock(tdb, BUCKET(hash), locktype);
1042         return rec_ptr;
1043 }
1044
1045 enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb)
1046 {
1047         return tdb->ecode;
1048 }
1049
1050 static struct tdb_errname {
1051         enum TDB_ERROR ecode; const char *estring;
1052 } emap[] = { {TDB_SUCCESS, "Success"},
1053              {TDB_ERR_CORRUPT, "Corrupt database"},
1054              {TDB_ERR_IO, "IO Error"},
1055              {TDB_ERR_LOCK, "Locking error"},
1056              {TDB_ERR_OOM, "Out of memory"},
1057              {TDB_ERR_EXISTS, "Record exists"},
1058              {TDB_ERR_NOLOCK, "Lock exists on other keys"},
1059              {TDB_ERR_NOEXIST, "Record does not exist"} };
1060
1061 /* Error string for the last tdb error */
1062 const char *tdb_errorstr(TDB_CONTEXT *tdb)
1063 {
1064         u32 i;
1065         for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
1066                 if (tdb->ecode == emap[i].ecode)
1067                         return emap[i].estring;
1068         return "Invalid error code";
1069 }
1070
1071 /* update an entry in place - this only works if the new data size
1072    is <= the old data size and the key exists.
1073    on failure return -1.
1074 */
1075
1076 static int tdb_update_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA dbuf)
1077 {
1078         struct list_struct rec;
1079         tdb_off rec_ptr;
1080
1081         /* find entry */
1082         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
1083                 return -1;
1084
1085         /* must be long enough key, data and tailer */
1086         if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off)) {
1087                 tdb->ecode = TDB_SUCCESS; /* Not really an error */
1088                 return -1;
1089         }
1090
1091         if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1092                       dbuf.dptr, dbuf.dsize) == -1)
1093                 return -1;
1094
1095         if (dbuf.dsize != rec.data_len) {
1096                 /* update size */
1097                 rec.data_len = dbuf.dsize;
1098                 return rec_write(tdb, rec_ptr, &rec);
1099         }
1100
1101         return 0;
1102 }
1103
1104 /* find an entry in the database given a key */
1105 /* If an entry doesn't exist tdb_err will be set to
1106  * TDB_ERR_NOEXIST. If a key has no data attached
1107  * tdb_err will not be set. Both will return a
1108  * zero pptr and zero dsize.
1109  */
1110
1111 TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
1112 {
1113         tdb_off rec_ptr;
1114         struct list_struct rec;
1115         TDB_DATA ret;
1116         u32 hash;
1117
1118         /* find which hash bucket it is in */
1119         hash = tdb_hash(&key);
1120         if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
1121                 return tdb_null;
1122
1123         if (rec.data_len)
1124                 ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1125                                           rec.data_len);
1126         else
1127                 ret.dptr = NULL;
1128         ret.dsize = rec.data_len;
1129         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1130         return ret;
1131 }
1132
1133 /* check if an entry in the database exists
1134
1135    note that 1 is returned if the key is found and 0 is returned if not found
1136    this doesn't match the conventions in the rest of this module, but is
1137    compatible with gdbm
1138 */
1139 static int tdb_exists_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
1140 {
1141         struct list_struct rec;
1142
1143         if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
1144                 return 0;
1145         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1146         return 1;
1147 }
1148
1149 int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key)
1150 {
1151         u32 hash = tdb_hash(&key);
1152         return tdb_exists_hash(tdb, key, hash);
1153 }
1154
1155 /* record lock stops delete underneath */
1156 static int lock_record(TDB_CONTEXT *tdb, tdb_off off)
1157 {
1158         return off ? tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0) : 0;
1159 }
1160 /*
1161   Write locks override our own fcntl readlocks, so check it here.
1162   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1163   an error to fail to get the lock here.
1164 */
1165
1166 static int write_lock_record(TDB_CONTEXT *tdb, tdb_off off)
1167 {
1168         struct tdb_traverse_lock *i;
1169         for (i = &tdb->travlocks; i; i = i->next)
1170                 if (i->off == off)
1171                         return -1;
1172         return tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1);
1173 }
1174
1175 /*
1176   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1177   an error to fail to get the lock here.
1178 */
1179
1180 static int write_unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1181 {
1182         return tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0);
1183 }
1184 /* fcntl locks don't stack: avoid unlocking someone else's */
1185 static int unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1186 {
1187         struct tdb_traverse_lock *i;
1188         u32 count = 0;
1189
1190         if (off == 0)
1191                 return 0;
1192         for (i = &tdb->travlocks; i; i = i->next)
1193                 if (i->off == off)
1194                         count++;
1195         return (count == 1 ? tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0) : 0);
1196 }
1197
1198 /* actually delete an entry in the database given the offset */
1199 static int do_delete(TDB_CONTEXT *tdb, tdb_off rec_ptr, struct list_struct*rec)
1200 {
1201         tdb_off last_ptr, i;
1202         struct list_struct lastrec;
1203
1204         if (tdb->read_only) return -1;
1205
1206         if (write_lock_record(tdb, rec_ptr) == -1) {
1207                 /* Someone traversing here: mark it as dead */
1208                 rec->magic = TDB_DEAD_MAGIC;
1209                 return rec_write(tdb, rec_ptr, rec);
1210         }
1211         if (write_unlock_record(tdb, rec_ptr) != 0)
1212                 return -1;
1213
1214         /* find previous record in hash chain */
1215         if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
1216                 return -1;
1217         for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
1218                 if (rec_read(tdb, i, &lastrec) == -1)
1219                         return -1;
1220
1221         /* unlink it: next ptr is at start of record. */
1222         if (last_ptr == 0)
1223                 last_ptr = TDB_HASH_TOP(rec->full_hash);
1224         if (ofs_write(tdb, last_ptr, &rec->next) == -1)
1225                 return -1;
1226
1227         /* recover the space */
1228         if (tdb_free(tdb, rec_ptr, rec) == -1)
1229                 return -1;
1230         return 0;
1231 }
1232
1233 /* Uses traverse lock: 0 = finish, -1 = error, other = record offset */
1234 static int tdb_next_lock(TDB_CONTEXT *tdb, struct tdb_traverse_lock *tlock,
1235                          struct list_struct *rec)
1236 {
1237         int want_next = (tlock->off != 0);
1238
1239         /* No traversal allows if you've called tdb_lockkeys() */
1240         if (tdb->lockedkeys)
1241                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1242
1243         /* Lock each chain from the start one. */
1244         for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
1245                 if (tdb_lock(tdb, tlock->hash, F_WRLCK) == -1)
1246                         return -1;
1247
1248                 /* No previous record?  Start at top of chain. */
1249                 if (!tlock->off) {
1250                         if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
1251                                      &tlock->off) == -1)
1252                                 goto fail;
1253                 } else {
1254                         /* Otherwise unlock the previous record. */
1255                         if (unlock_record(tdb, tlock->off) != 0)
1256                                 goto fail;
1257                 }
1258
1259                 if (want_next) {
1260                         /* We have offset of old record: grab next */
1261                         if (rec_read(tdb, tlock->off, rec) == -1)
1262                                 goto fail;
1263                         tlock->off = rec->next;
1264                 }
1265
1266                 /* Iterate through chain */
1267                 while( tlock->off) {
1268                         tdb_off current;
1269                         if (rec_read(tdb, tlock->off, rec) == -1)
1270                                 goto fail;
1271                         if (!TDB_DEAD(rec)) {
1272                                 /* Woohoo: we found one! */
1273                                 if (lock_record(tdb, tlock->off) != 0)
1274                                         goto fail;
1275                                 return tlock->off;
1276                         }
1277                         /* Try to clean dead ones from old traverses */
1278                         current = tlock->off;
1279                         tlock->off = rec->next;
1280                         if (!tdb->read_only &&
1281                             do_delete(tdb, current, rec) != 0)
1282                                 goto fail;
1283                 }
1284                 tdb_unlock(tdb, tlock->hash, F_WRLCK);
1285                 want_next = 0;
1286         }
1287         /* We finished iteration without finding anything */
1288         return TDB_ERRCODE(TDB_SUCCESS, 0);
1289
1290  fail:
1291         tlock->off = 0;
1292         if (tdb_unlock(tdb, tlock->hash, F_WRLCK) != 0)
1293                 TDB_LOG((tdb, 0, "tdb_next_lock: On error unlock failed!\n"));
1294         return -1;
1295 }
1296
1297 /* traverse the entire database - calling fn(tdb, key, data) on each element.
1298    return -1 on error or the record count traversed
1299    if fn is NULL then it is not called
1300    a non-zero return value from fn() indicates that the traversal should stop
1301   */
1302 int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *state)
1303 {
1304         TDB_DATA key, dbuf;
1305         struct list_struct rec;
1306         struct tdb_traverse_lock tl = { NULL, 0, 0 };
1307         int ret, count = 0;
1308
1309         /* This was in the initializaton, above, but the IRIX compiler
1310          * did not like it.  crh
1311          */
1312         tl.next = tdb->travlocks.next;
1313
1314         /* fcntl locks don't stack: beware traverse inside traverse */
1315         tdb->travlocks.next = &tl;
1316
1317         /* tdb_next_lock places locks on the record returned, and its chain */
1318         while ((ret = tdb_next_lock(tdb, &tl, &rec)) > 0) {
1319                 count++;
1320                 /* now read the full record */
1321                 key.dptr = tdb_alloc_read(tdb, tl.off + sizeof(rec),
1322                                           rec.key_len + rec.data_len);
1323                 if (!key.dptr) {
1324                         ret = -1;
1325                         if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0)
1326                                 goto out;
1327                         if (unlock_record(tdb, tl.off) != 0)
1328                                 TDB_LOG((tdb, 0, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
1329                         goto out;
1330                 }
1331                 key.dsize = rec.key_len;
1332                 dbuf.dptr = key.dptr + rec.key_len;
1333                 dbuf.dsize = rec.data_len;
1334
1335                 /* Drop chain lock, call out */
1336                 if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0) {
1337                         ret = -1;
1338                         goto out;
1339                 }
1340                 if (fn && fn(tdb, key, dbuf, state)) {
1341                         /* They want us to terminate traversal */
1342                         ret = count;
1343                         if (unlock_record(tdb, tl.off) != 0) {
1344                                 TDB_LOG((tdb, 0, "tdb_traverse: unlock_record failed!\n"));;
1345                                 ret = -1;
1346                         }
1347                         tdb->travlocks.next = tl.next;
1348                         SAFE_FREE(key.dptr);
1349                         return count;
1350                 }
1351                 SAFE_FREE(key.dptr);
1352         }
1353 out:
1354         tdb->travlocks.next = tl.next;
1355         if (ret < 0)
1356                 return -1;
1357         else
1358                 return count;
1359 }
1360
1361 /* find the first entry in the database and return its key */
1362 TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
1363 {
1364         TDB_DATA key;
1365         struct list_struct rec;
1366
1367         /* release any old lock */
1368         if (unlock_record(tdb, tdb->travlocks.off) != 0)
1369                 return tdb_null;
1370         tdb->travlocks.off = tdb->travlocks.hash = 0;
1371
1372         if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
1373                 return tdb_null;
1374         /* now read the key */
1375         key.dsize = rec.key_len;
1376         key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
1377         if (tdb_unlock(tdb, BUCKET(tdb->travlocks.hash), F_WRLCK) != 0)
1378                 TDB_LOG((tdb, 0, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
1379         return key;
1380 }
1381
1382 /* find the next entry in the database, returning its key */
1383 TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA oldkey)
1384 {
1385         u32 oldhash;
1386         TDB_DATA key = tdb_null;
1387         struct list_struct rec;
1388         char *k = NULL;
1389
1390         /* Is locked key the old key?  If so, traverse will be reliable. */
1391         if (tdb->travlocks.off) {
1392                 if (tdb_lock(tdb,tdb->travlocks.hash,F_WRLCK))
1393                         return tdb_null;
1394                 if (rec_read(tdb, tdb->travlocks.off, &rec) == -1
1395                     || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
1396                                             rec.key_len))
1397                     || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
1398                         /* No, it wasn't: unlock it and start from scratch */
1399                         if (unlock_record(tdb, tdb->travlocks.off) != 0)
1400                                 return tdb_null;
1401                         if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1402                                 return tdb_null;
1403                         tdb->travlocks.off = 0;
1404                 }
1405
1406                 SAFE_FREE(k);
1407         }
1408
1409         if (!tdb->travlocks.off) {
1410                 /* No previous element: do normal find, and lock record */
1411                 tdb->travlocks.off = tdb_find_lock_hash(tdb, oldkey, tdb_hash(&oldkey), F_WRLCK, &rec);
1412                 if (!tdb->travlocks.off)
1413                         return tdb_null;
1414                 tdb->travlocks.hash = BUCKET(rec.full_hash);
1415                 if (lock_record(tdb, tdb->travlocks.off) != 0) {
1416                         TDB_LOG((tdb, 0, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
1417                         return tdb_null;
1418                 }
1419         }
1420         oldhash = tdb->travlocks.hash;
1421
1422         /* Grab next record: locks chain and returned record,
1423            unlocks old record */
1424         if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
1425                 key.dsize = rec.key_len;
1426                 key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
1427                                           key.dsize);
1428                 /* Unlock the chain of this new record */
1429                 if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1430                         TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1431         }
1432         /* Unlock the chain of old record */
1433         if (tdb_unlock(tdb, BUCKET(oldhash), F_WRLCK) != 0)
1434                 TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1435         return key;
1436 }
1437
1438 /* delete an entry in the database given a key */
1439 static int tdb_delete_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
1440 {
1441         tdb_off rec_ptr;
1442         struct list_struct rec;
1443         int ret;
1444
1445         if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec)))
1446                 return -1;
1447         ret = do_delete(tdb, rec_ptr, &rec);
1448         if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
1449                 TDB_LOG((tdb, 0, "tdb_delete: WARNING tdb_unlock failed!\n"));
1450         return ret;
1451 }
1452
1453 int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
1454 {
1455         u32 hash = tdb_hash(&key);
1456         return tdb_delete_hash(tdb, key, hash);
1457 }
1458
1459 /* store an element in the database, replacing any existing element
1460    with the same key
1461
1462    return 0 on success, -1 on failure
1463 */
1464 int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
1465 {
1466         struct list_struct rec;
1467         u32 hash;
1468         tdb_off rec_ptr;
1469         char *p = NULL;
1470         int ret = 0;
1471
1472         /* find which hash bucket it is in */
1473         hash = tdb_hash(&key);
1474         if (!tdb_keylocked(tdb, hash))
1475                 return -1;
1476         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1477                 return -1;
1478
1479         /* check for it existing, on insert. */
1480         if (flag == TDB_INSERT) {
1481                 if (tdb_exists_hash(tdb, key, hash)) {
1482                         tdb->ecode = TDB_ERR_EXISTS;
1483                         goto fail;
1484                 }
1485         } else {
1486                 /* first try in-place update, on modify or replace. */
1487                 if (tdb_update_hash(tdb, key, hash, dbuf) == 0)
1488                         goto out;
1489                 if (flag == TDB_MODIFY && tdb->ecode == TDB_ERR_NOEXIST)
1490                         goto fail;
1491         }
1492         /* reset the error code potentially set by the tdb_update() */
1493         tdb->ecode = TDB_SUCCESS;
1494
1495         /* delete any existing record - if it doesn't exist we don't
1496            care.  Doing this first reduces fragmentation, and avoids
1497            coalescing with `allocated' block before it's updated. */
1498         if (flag != TDB_INSERT)
1499                 tdb_delete_hash(tdb, key, hash);
1500
1501         /* Copy key+value *before* allocating free space in case malloc
1502            fails and we are left with a dead spot in the tdb. */
1503
1504         if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
1505                 tdb->ecode = TDB_ERR_OOM;
1506                 goto fail;
1507         }
1508
1509         memcpy(p, key.dptr, key.dsize);
1510         if (dbuf.dsize)
1511                 memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
1512
1513         /* now we're into insert / modify / replace of a record which
1514          * we know could not be optimised by an in-place store (for
1515          * various reasons).  */
1516         if (!(rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec)))
1517                 goto fail;
1518
1519         /* Read hash top into next ptr */
1520         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1521                 goto fail;
1522
1523         rec.key_len = key.dsize;
1524         rec.data_len = dbuf.dsize;
1525         rec.full_hash = hash;
1526         rec.magic = TDB_MAGIC;
1527
1528         /* write out and point the top of the hash chain at it */
1529         if (rec_write(tdb, rec_ptr, &rec) == -1
1530             || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
1531             || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1532                 /* Need to tdb_unallocate() here */
1533                 goto fail;
1534         }
1535  out:
1536         SAFE_FREE(p);
1537         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1538         return ret;
1539 fail:
1540         ret = -1;
1541         goto out;
1542 }
1543
1544 /* Attempt to append data to an entry in place - this only works if the new data size
1545    is <= the old data size and the key exists.
1546    on failure return -1. Record must be locked before calling.
1547 */
1548 static int tdb_append_inplace(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA new_dbuf)
1549 {
1550         struct list_struct rec;
1551         tdb_off rec_ptr;
1552
1553         /* find entry */
1554         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
1555                 return -1;
1556
1557         /* Append of 0 is always ok. */
1558         if (new_dbuf.dsize == 0)
1559                 return 0;
1560
1561         /* must be long enough for key, old data + new data and tailer */
1562         if (rec.rec_len < key.dsize + rec.data_len + new_dbuf.dsize + sizeof(tdb_off)) {
1563                 /* No room. */
1564                 tdb->ecode = TDB_SUCCESS; /* Not really an error */
1565                 return -1;
1566         }
1567
1568         if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len + rec.data_len,
1569                       new_dbuf.dptr, new_dbuf.dsize) == -1)
1570                 return -1;
1571
1572         /* update size */
1573         rec.data_len += new_dbuf.dsize;
1574         return rec_write(tdb, rec_ptr, &rec);
1575 }
1576
1577 /* Append to an entry. Create if not exist. */
1578
1579 int tdb_append(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf)
1580 {
1581         struct list_struct rec;
1582         u32 hash;
1583         tdb_off rec_ptr;
1584         char *p = NULL;
1585         int ret = 0;
1586         size_t new_data_size = 0;
1587
1588         /* find which hash bucket it is in */
1589         hash = tdb_hash(&key);
1590         if (!tdb_keylocked(tdb, hash))
1591                 return -1;
1592         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1593                 return -1;
1594
1595         /* first try in-place. */
1596         if (tdb_append_inplace(tdb, key, hash, new_dbuf) == 0)
1597                 goto out;
1598
1599         /* reset the error code potentially set by the tdb_append_inplace() */
1600         tdb->ecode = TDB_SUCCESS;
1601
1602         /* find entry */
1603         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
1604                 if (tdb->ecode != TDB_ERR_NOEXIST)
1605                         goto fail;
1606
1607                 /* Not found - create. */
1608
1609                 ret = tdb_store(tdb, key, new_dbuf, TDB_INSERT);
1610                 goto out;
1611         }
1612
1613         new_data_size = rec.data_len + new_dbuf.dsize;
1614
1615         /* Copy key+old_value+value *before* allocating free space in case malloc
1616            fails and we are left with a dead spot in the tdb. */
1617
1618         if (!(p = (char *)malloc(key.dsize + new_data_size))) {
1619                 tdb->ecode = TDB_ERR_OOM;
1620                 goto fail;
1621         }
1622
1623         /* Copy the key in place. */
1624         memcpy(p, key.dptr, key.dsize);
1625
1626         /* Now read the old data into place. */
1627         if (rec.data_len &&
1628                 tdb_read(tdb, rec_ptr + sizeof(rec) + rec.key_len, p + key.dsize, rec.data_len, 0) == -1)
1629                         goto fail;
1630
1631         /* Finally append the new data. */
1632         if (new_dbuf.dsize)
1633                 memcpy(p+key.dsize+rec.data_len, new_dbuf.dptr, new_dbuf.dsize);
1634
1635         /* delete any existing record - if it doesn't exist we don't
1636            care.  Doing this first reduces fragmentation, and avoids
1637            coalescing with `allocated' block before it's updated. */
1638
1639         tdb_delete_hash(tdb, key, hash);
1640
1641         if (!(rec_ptr = tdb_allocate(tdb, key.dsize + new_data_size, &rec)))
1642                 goto fail;
1643
1644         /* Read hash top into next ptr */
1645         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1646                 goto fail;
1647
1648         rec.key_len = key.dsize;
1649         rec.data_len = new_data_size;
1650         rec.full_hash = hash;
1651         rec.magic = TDB_MAGIC;
1652
1653         /* write out and point the top of the hash chain at it */
1654         if (rec_write(tdb, rec_ptr, &rec) == -1
1655             || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+new_data_size)==-1
1656             || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1657                 /* Need to tdb_unallocate() here */
1658                 goto fail;
1659         }
1660
1661  out:
1662         SAFE_FREE(p);
1663         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1664         return ret;
1665
1666 fail:
1667         ret = -1;
1668         goto out;
1669 }
1670
1671 static int tdb_already_open(dev_t device,
1672                             ino_t ino)
1673 {
1674         TDB_CONTEXT *i;
1675
1676         for (i = tdbs; i; i = i->next) {
1677                 if (i->device == device && i->inode == ino) {
1678                         return 1;
1679                 }
1680         }
1681
1682         return 0;
1683 }
1684
1685 /* open the database, creating it if necessary
1686
1687    The open_flags and mode are passed straight to the open call on the
1688    database file. A flags value of O_WRONLY is invalid. The hash size
1689    is advisory, use zero for a default value.
1690
1691    Return is NULL on error, in which case errno is also set.  Don't
1692    try to call tdb_error or tdb_errname, just do strerror(errno).
1693
1694    @param name may be NULL for internal databases. */
1695 TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
1696                       int open_flags, mode_t mode)
1697 {
1698         return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL);
1699 }
1700
1701
1702 TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
1703                          int open_flags, mode_t mode,
1704                          tdb_log_func log_fn)
1705 {
1706         TDB_CONTEXT *tdb;
1707         struct stat st;
1708         int rev = 0, locked = 0;
1709         unsigned char *vp;
1710         u32 vertest;
1711
1712         if (!(tdb = calloc(1, sizeof *tdb))) {
1713                 /* Can't log this */
1714                 errno = ENOMEM;
1715                 goto fail;
1716         }
1717         tdb->fd = -1;
1718         tdb->name = NULL;
1719         tdb->map_ptr = NULL;
1720         tdb->lockedkeys = NULL;
1721         tdb->flags = tdb_flags;
1722         tdb->open_flags = open_flags;
1723         tdb->log_fn = log_fn;
1724
1725         if ((open_flags & O_ACCMODE) == O_WRONLY) {
1726                 TDB_LOG((tdb, 0, "tdb_open_ex: can't open tdb %s write-only\n",
1727                          name));
1728                 errno = EINVAL;
1729                 goto fail;
1730         }
1731
1732         if (hash_size == 0)
1733                 hash_size = DEFAULT_HASH_SIZE;
1734         if ((open_flags & O_ACCMODE) == O_RDONLY) {
1735                 tdb->read_only = 1;
1736                 /* read only databases don't do locking or clear if first */
1737                 tdb->flags |= TDB_NOLOCK;
1738                 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1739         }
1740
1741         /* internal databases don't mmap or lock, and start off cleared */
1742         if (tdb->flags & TDB_INTERNAL) {
1743                 tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
1744                 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1745                 if (tdb_new_database(tdb, hash_size) != 0) {
1746                         TDB_LOG((tdb, 0, "tdb_open_ex: tdb_new_database failed!"));
1747                         goto fail;
1748                 }
1749                 goto internal;
1750         }
1751
1752         if ((tdb->fd = open(name, open_flags, mode)) == -1) {
1753                 TDB_LOG((tdb, 5, "tdb_open_ex: could not open file %s: %s\n",
1754                          name, strerror(errno)));
1755                 goto fail;      /* errno set by open(2) */
1756         }
1757
1758         /* ensure there is only one process initialising at once */
1759         if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0) == -1) {
1760                 TDB_LOG((tdb, 0, "tdb_open_ex: failed to get global lock on %s: %s\n",
1761                          name, strerror(errno)));
1762                 goto fail;      /* errno set by tdb_brlock */
1763         }
1764
1765         /* we need to zero database if we are the only one with it open */
1766         if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
1767                 (locked = (tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0) == 0))) {
1768                 open_flags |= O_CREAT;
1769                 if (ftruncate(tdb->fd, 0) == -1) {
1770                         TDB_LOG((tdb, 0, "tdb_open_ex: "
1771                                  "failed to truncate %s: %s\n",
1772                                  name, strerror(errno)));
1773                         goto fail; /* errno set by ftruncate */
1774                 }
1775         }
1776
1777         if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
1778             || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
1779             || (tdb->header.version != TDB_VERSION
1780                 && !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
1781                 /* its not a valid database - possibly initialise it */
1782                 if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
1783                         errno = EIO; /* ie bad format or something */
1784                         goto fail;
1785                 }
1786                 rev = (tdb->flags & TDB_CONVERT);
1787         }
1788         vp = (unsigned char *)&tdb->header.version;
1789         vertest = (((u32)vp[0]) << 24) | (((u32)vp[1]) << 16) |
1790                   (((u32)vp[2]) << 8) | (u32)vp[3];
1791         tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
1792         if (!rev)
1793                 tdb->flags &= ~TDB_CONVERT;
1794         else {
1795                 tdb->flags |= TDB_CONVERT;
1796                 convert(&tdb->header, sizeof(tdb->header));
1797         }
1798         if (fstat(tdb->fd, &st) == -1)
1799                 goto fail;
1800
1801         /* Is it already in the open list?  If so, fail. */
1802         if (tdb_already_open(st.st_dev, st.st_ino)) {
1803                 TDB_LOG((tdb, 2, "tdb_open_ex: "
1804                          "%s (%d,%d) is already open in this process\n",
1805                          name, st.st_dev, st.st_ino));
1806                 errno = EBUSY;
1807                 goto fail;
1808         }
1809
1810         if (!(tdb->name = (char *)strdup(name))) {
1811                 errno = ENOMEM;
1812                 goto fail;
1813         }
1814
1815         tdb->map_size = st.st_size;
1816         tdb->device = st.st_dev;
1817         tdb->inode = st.st_ino;
1818         tdb->locked = calloc(tdb->header.hash_size+1, sizeof(tdb->locked[0]));
1819         if (!tdb->locked) {
1820                 TDB_LOG((tdb, 2, "tdb_open_ex: "
1821                          "failed to allocate lock structure for %s\n",
1822                          name));
1823                 errno = ENOMEM;
1824                 goto fail;
1825         }
1826         tdb_mmap(tdb);
1827         if (locked) {
1828                 if (!tdb->read_only)
1829                         if (tdb_clear_spinlocks(tdb) != 0) {
1830                                 TDB_LOG((tdb, 0, "tdb_open_ex: "
1831                                 "failed to clear spinlock\n"));
1832                                 goto fail;
1833                         }
1834                 if (tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0) == -1) {
1835                         TDB_LOG((tdb, 0, "tdb_open_ex: "
1836                                  "failed to take ACTIVE_LOCK on %s: %s\n",
1837                                  name, strerror(errno)));
1838                         goto fail;
1839                 }
1840
1841         }
1842
1843         /* We always need to do this if the CLEAR_IF_FIRST flag is set, even if
1844            we didn't get the initial exclusive lock as we need to let all other
1845            users know we're using it. */
1846
1847         if (tdb_flags & TDB_CLEAR_IF_FIRST) {
1848                 /* leave this lock in place to indicate it's in use */
1849                 if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)
1850                         goto fail;
1851         }
1852
1853
1854  internal:
1855         /* Internal (memory-only) databases skip all the code above to
1856          * do with disk files, and resume here by releasing their
1857          * global lock and hooking into the active list. */
1858         if (tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0) == -1)
1859                 goto fail;
1860         tdb->next = tdbs;
1861         tdbs = tdb;
1862         return tdb;
1863
1864  fail:
1865         { int save_errno = errno;
1866
1867         if (!tdb)
1868                 return NULL;
1869
1870         if (tdb->map_ptr) {
1871                 if (tdb->flags & TDB_INTERNAL)
1872                         SAFE_FREE(tdb->map_ptr);
1873                 else
1874                         tdb_munmap(tdb);
1875         }
1876         SAFE_FREE(tdb->name);
1877         if (tdb->fd != -1)
1878                 if (close(tdb->fd) != 0)
1879                         TDB_LOG((tdb, 5, "tdb_open_ex: failed to close tdb->fd on error!\n"));
1880         SAFE_FREE(tdb->locked);
1881         SAFE_FREE(tdb);
1882         errno = save_errno;
1883         return NULL;
1884         }
1885 }
1886
1887 /**
1888  * Close a database.
1889  *
1890  * @returns -1 for error; 0 for success.
1891  **/
1892 int tdb_close(TDB_CONTEXT *tdb)
1893 {
1894         TDB_CONTEXT **i;
1895         int ret = 0;
1896
1897         if (tdb->map_ptr) {
1898                 if (tdb->flags & TDB_INTERNAL)
1899                         SAFE_FREE(tdb->map_ptr);
1900                 else
1901                         tdb_munmap(tdb);
1902         }
1903         SAFE_FREE(tdb->name);
1904         if (tdb->fd != -1)
1905                 ret = close(tdb->fd);
1906         SAFE_FREE(tdb->locked);
1907         SAFE_FREE(tdb->lockedkeys);
1908
1909         /* Remove from contexts list */
1910         for (i = &tdbs; *i; i = &(*i)->next) {
1911                 if (*i == tdb) {
1912                         *i = tdb->next;
1913                         break;
1914                 }
1915         }
1916
1917         memset(tdb, 0, sizeof(*tdb));
1918         SAFE_FREE(tdb);
1919
1920         return ret;
1921 }
1922
1923 /* lock/unlock entire database */
1924 int tdb_lockall(TDB_CONTEXT *tdb)
1925 {
1926         u32 i;
1927
1928         /* There are no locks on read-only dbs */
1929         if (tdb->read_only)
1930                 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
1931         if (tdb->lockedkeys)
1932                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1933         for (i = 0; i < tdb->header.hash_size; i++)
1934                 if (tdb_lock(tdb, i, F_WRLCK))
1935                         break;
1936
1937         /* If error, release locks we have... */
1938         if (i < tdb->header.hash_size) {
1939                 u32 j;
1940
1941                 for ( j = 0; j < i; j++)
1942                         tdb_unlock(tdb, j, F_WRLCK);
1943                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1944         }
1945
1946         return 0;
1947 }
1948 void tdb_unlockall(TDB_CONTEXT *tdb)
1949 {
1950         u32 i;
1951         for (i=0; i < tdb->header.hash_size; i++)
1952                 tdb_unlock(tdb, i, F_WRLCK);
1953 }
1954
1955 int tdb_lockkeys(TDB_CONTEXT *tdb, u32 number, TDB_DATA keys[])
1956 {
1957         u32 i, j, hash;
1958
1959         /* Can't lock more keys if already locked */
1960         if (tdb->lockedkeys)
1961                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1962         if (!(tdb->lockedkeys = malloc(sizeof(u32) * (number+1))))
1963                 return TDB_ERRCODE(TDB_ERR_OOM, -1);
1964         /* First number in array is # keys */
1965         tdb->lockedkeys[0] = number;
1966
1967         /* Insertion sort by bucket */
1968         for (i = 0; i < number; i++) {
1969                 hash = tdb_hash(&keys[i]);
1970                 for (j = 0; j < i && BUCKET(tdb->lockedkeys[j+1]) < BUCKET(hash); j++);
1971                         memmove(&tdb->lockedkeys[j+2], &tdb->lockedkeys[j+1], sizeof(u32) * (i-j));
1972                 tdb->lockedkeys[j+1] = hash;
1973         }
1974         /* Finally, lock in order */
1975         for (i = 0; i < number; i++)
1976                 if (tdb_lock(tdb, i, F_WRLCK))
1977                         break;
1978
1979         /* If error, release locks we have... */
1980         if (i < number) {
1981                 for ( j = 0; j < i; j++)
1982                         tdb_unlock(tdb, j, F_WRLCK);
1983                 SAFE_FREE(tdb->lockedkeys);
1984                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1985         }
1986         return 0;
1987 }
1988
1989 /* Unlock the keys previously locked by tdb_lockkeys() */
1990 void tdb_unlockkeys(TDB_CONTEXT *tdb)
1991 {
1992         u32 i;
1993         if (!tdb->lockedkeys)
1994                 return;
1995         for (i = 0; i < tdb->lockedkeys[0]; i++)
1996                 tdb_unlock(tdb, tdb->lockedkeys[i+1], F_WRLCK);
1997         SAFE_FREE(tdb->lockedkeys);
1998 }
1999
2000 /* lock/unlock one hash chain. This is meant to be used to reduce
2001    contention - it cannot guarantee how many records will be locked */
2002 int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key)
2003 {
2004         return tdb_lock(tdb, BUCKET(tdb_hash(&key)), F_WRLCK);
2005 }
2006
2007 int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key)
2008 {
2009         return tdb_unlock(tdb, BUCKET(tdb_hash(&key)), F_WRLCK);
2010 }
2011
2012 int tdb_chainlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
2013 {
2014         return tdb_lock(tdb, BUCKET(tdb_hash(&key)), F_RDLCK);
2015 }
2016
2017 int tdb_chainunlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
2018 {
2019         return tdb_unlock(tdb, BUCKET(tdb_hash(&key)), F_RDLCK);
2020 }
2021
2022
2023 /* register a loging function */
2024 void tdb_logging_function(TDB_CONTEXT *tdb, void (*fn)(TDB_CONTEXT *, int , const char *, ...))
2025 {
2026         tdb->log_fn = fn;
2027 }
2028
2029
2030 /* reopen a tdb - this can be used after a fork to ensure that we have an independent
2031    seek pointer from our parent and to re-establish locks */
2032 int tdb_reopen(TDB_CONTEXT *tdb)
2033 {
2034         struct stat st;
2035
2036         if (tdb->flags & TDB_INTERNAL)
2037                 return 0; /* Nothing to do. */
2038         if (tdb_munmap(tdb) != 0) {
2039                 TDB_LOG((tdb, 0, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
2040                 goto fail;
2041         }
2042         if (close(tdb->fd) != 0)
2043                 TDB_LOG((tdb, 0, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
2044         tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
2045         if (tdb->fd == -1) {
2046                 TDB_LOG((tdb, 0, "tdb_reopen: open failed (%s)\n", strerror(errno)));
2047                 goto fail;
2048         }
2049         if (fstat(tdb->fd, &st) != 0) {
2050                 TDB_LOG((tdb, 0, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
2051                 goto fail;
2052         }
2053         if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
2054                 TDB_LOG((tdb, 0, "tdb_reopen: file dev/inode has changed!\n"));
2055                 goto fail;
2056         }
2057         tdb_mmap(tdb);
2058         if ((tdb->flags & TDB_CLEAR_IF_FIRST) && (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)) {
2059                 TDB_LOG((tdb, 0, "tdb_reopen: failed to obtain active lock\n"));
2060                 goto fail;
2061         }
2062
2063         return 0;
2064
2065 fail:
2066         tdb_close(tdb);
2067         return -1;
2068 }
2069
2070 /* reopen all tdb's */
2071 int tdb_reopen_all(void)
2072 {
2073         TDB_CONTEXT *tdb;
2074
2075         for (tdb=tdbs; tdb; tdb = tdb->next) {
2076                 /* Ensure no clear-if-first. */
2077                 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
2078                 if (tdb_reopen(tdb) != 0)
2079                         return -1;
2080         }
2081
2082         return 0;
2083 }