source/tdb/tdb.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Andrew Tridgell              1999-2004
   7    Copyright (C) Paul `Rusty' Russell              2000
   8    Copyright (C) Jeremy Allison                    2000-2003
   9
  10      ** NOTE! The following LGPL license applies to the tdb
  11      ** library. This does NOT imply that all of Samba is released
  12      ** under the LGPL
  13
  14    This library is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU Lesser General Public
  16    License as published by the Free Software Foundation; either
  17    version 2 of the License, or (at your option) any later version.
  18
  19    This library is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    Lesser General Public License for more details.
  23
  24    You should have received a copy of the GNU Lesser General Public
  25    License along with this library; if not, write to the Free Software
  26    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  27 */
  28
  29
  30 /* NOTE: If you use tdbs under valgrind, and in particular if you run
  31  * tdbtorture, you may get spurious "uninitialized value" warnings.  I
  32  * think this is because valgrind doesn't understand that the mmap'd
  33  * area may be written to by other processes.  Memory can, from the
  34  * point of view of the grinded process, spontaneously become
  35  * initialized.
  36  *
  37  * I can think of a few solutions.  [mbp 20030311]
  38  *
  39  * 1 - Write suppressions for Valgrind so that it doesn't complain
  40  * about this.  Probably the most reasonable but people need to
  41  * remember to use them.
  42  *
  43  * 2 - Use IO not mmap when running under valgrind.  Not so nice.
  44  *
  45  * 3 - Use the special valgrind macros to mark memory as valid at the
  46  * right time.  Probably too hard -- the process just doesn't know.
  47  */
  48
  49 #ifdef STANDALONE
  50 #if HAVE_CONFIG_H
  51 #include <config.h>
  52 #endif
  53
  54 #include <stdlib.h>
  55 #include <stdio.h>
  56 #include <fcntl.h>
  57 #include <unistd.h>
  58 #include <string.h>
  59 #include <fcntl.h>
  60 #include <errno.h>
  61 #include <sys/mman.h>
  62 #include <sys/stat.h>
  63 #include <signal.h>
  64 #include "tdb.h"
  65 #include "spinlock.h"
  66 #else
  67 #include "includes.h"
  68
  69 #if defined(PARANOID_MALLOC_CHECKER)
  70 #ifdef malloc
  71 #undef malloc
  72 #endif
  73
  74 #ifdef realloc
  75 #undef realloc
  76 #endif
  77
  78 #ifdef calloc
  79 #undef calloc
  80 #endif
  81
  82 #ifdef strdup
  83 #undef strdup
  84 #endif
  85
  86 #ifdef strndup
  87 #undef strndup
  88 #endif
  89
  90 #endif
  91
  92 #endif
  93
/* File identification string and format version.
   NOTE(review): neither is referenced in the visible part of this file;
   presumably they are checked when a database is opened - confirm. */
#define TDB_MAGIC_FOOD "TDB file\n"
#define TDB_VERSION (0x26011967 + 6)
/* Record header magic values: TDB_MAGIC and TDB_DEAD_MAGIC are both
   accepted by rec_read() (see TDB_BAD_MAGIC below); TDB_FREE_MAGIC is
   what rec_free_read() expects on free-list records. */
#define TDB_MAGIC (0x26011999U)
#define TDB_FREE_MAGIC (~TDB_MAGIC)
#define TDB_DEAD_MAGIC (0xFEE1DEAD)
#define TDB_ALIGNMENT 4
#define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
#define DEFAULT_HASH_SIZE 131
/* tdb_expand() rounds the database size up to a multiple of this */
#define TDB_PAGE_SIZE 0x2000
/* file offset of the free-list head pointer: it directly follows the header */
#define FREELIST_TOP (sizeof(struct tdb_header))
/* round x up to a multiple of a; the mask trick only works when a is a
   power of two */
#define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
/* reverse the byte order of a 32 bit value (x is evaluated several times) */
#define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
#define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
#define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
/* file offset of the chain head pointer for a hash value: one tdb_off
   slot per bucket, laid out after the free-list head.  Expands BUCKET(),
   so it needs a local "tdb" in scope. */
#define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))
/* offset just past the hash heads and the spinlock area; tdb_free() uses
   it as the lower bound when looking for a left-hand neighbour */
#define TDB_DATA_START(hash_size) (TDB_HASH_TOP(hash_size-1) + TDB_SPINLOCK_SIZE(hash_size))
 110
 111
/* Call the context's log function, if one is set.
 * NB assumes there is a local variable called "tdb" that is the
 * current context, and takes a doubly-parenthesized printf-style
 * argument list, e.g. TDB_LOG((tdb, 0, "msg %d\n", n)).
 * The expression always evaluates to 0. */
#define TDB_LOG(x) (tdb->log_fn?((tdb->log_fn x),0) : 0)

/* lock offsets
   NOTE(review): not used in the visible part of this file. */
#define GLOBAL_LOCK 0
#define ACTIVE_LOCK 4

/* fallbacks for platforms whose headers do not provide these */
#ifndef MAP_FILE
#define MAP_FILE 0
#endif

#ifndef MAP_FAILED
#define MAP_FAILED ((void *)-1)
#endif

/* free memory if the pointer is valid and zero the pointer */
#ifndef SAFE_FREE
#define SAFE_FREE(x) do { if ((x) != NULL) {free((x)); (x)=NULL;} } while(0)
#endif

/* reduce a hash value to a chain index; needs a local "tdb" in scope */
#define BUCKET(hash) ((hash) % tdb->header.hash_size)
TDB_DATA tdb_null;

/* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
static TDB_CONTEXT *tdbs = NULL;
 139
 140 static int tdb_munmap(TDB_CONTEXT *tdb)
 141 {
 142         if (tdb->flags & TDB_INTERNAL)
 143                 return 0;
 144
 145 #ifdef HAVE_MMAP
 146         if (tdb->map_ptr) {
 147                 int ret = munmap(tdb->map_ptr, tdb->map_size);
 148                 if (ret != 0)
 149                         return ret;
 150         }
 151 #endif
 152         tdb->map_ptr = NULL;
 153         return 0;
 154 }
 155
 156 static void tdb_mmap(TDB_CONTEXT *tdb)
 157 {
 158         if (tdb->flags & TDB_INTERNAL)
 159                 return;
 160
 161 #ifdef HAVE_MMAP
 162         if (!(tdb->flags & TDB_NOMMAP)) {
 163                 tdb->map_ptr = mmap(NULL, tdb->map_size,
 164                                     PROT_READ|(tdb->read_only? 0:PROT_WRITE),
 165                                     MAP_SHARED|MAP_FILE, tdb->fd, 0);
 166
 167                 /*
 168                  * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
 169                  */
 170
 171                 if (tdb->map_ptr == MAP_FAILED) {
 172                         tdb->map_ptr = NULL;
 173                         TDB_LOG((tdb, 2, "tdb_mmap failed for size %d (%s)\n",
 174                                  tdb->map_size, strerror(errno)));
 175                 }
 176         } else {
 177                 tdb->map_ptr = NULL;
 178         }
 179 #else
 180         tdb->map_ptr = NULL;
 181 #endif
 182 }
 183
 184 /* Endian conversion: we only ever deal with 4 byte quantities */
 185 static void *convert(void *buf, u32 size)
 186 {
 187         u32 i, *p = buf;
 188         for (i = 0; i < size / 4; i++)
 189                 p[i] = TDB_BYTEREV(p[i]);
 190         return buf;
 191 }
 192 #define DOCONV() (tdb->flags & TDB_CONVERT)
 193 #define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)
 194
/* the body of the database is made of one list_struct for the free space
   plus a separate data list for each hash value.

   This is the fixed-size header found at the start of every record; it is
   read and written as a run of 32 bit words (see convert()), and is
   followed in the file by rec_len bytes of payload. */
struct list_struct {
        tdb_off next; /* offset of the next record in the list */
        tdb_len rec_len; /* total byte length of record */
        tdb_len key_len; /* byte length of key */
        tdb_len data_len; /* byte length of data */
        u32 full_hash; /* the full 32 bit hash of the key */
        u32 magic;   /* try to catch errors: TDB_MAGIC, TDB_FREE_MAGIC or TDB_DEAD_MAGIC */
        /* the following union is implied:
                union {
                        char record[rec_len];
                        struct {
                                char key[key_len];
                                char data[data_len];
                        }
                        u32 totalsize; (tailer)
                }
           The tailer occupies the last sizeof(tdb_off) bytes of the record
           and holds sizeof(header) + rec_len (see update_tailer()).
        */
};
 215
 216 /***************************************************************
 217  Allow a caller to set a "alarm" flag that tdb can check to abort
 218  a blocking lock on SIGALRM.
 219 ***************************************************************/
 220
 221 static sig_atomic_t *palarm_fired;
 222
 223 void tdb_set_lock_alarm(sig_atomic_t *palarm)
 224 {
 225         palarm_fired = palarm;
 226 }
 227
 228 /* a byte range locking function - return 0 on success
 229    this functions locks/unlocks 1 byte at the specified offset.
 230
 231    On error, errno is also set so that errors are passed back properly
 232    through tdb_open(). */
 233 static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset,
 234                       int rw_type, int lck_type, int probe)
 235 {
 236         struct flock fl;
 237         int ret;
 238
 239         if (tdb->flags & TDB_NOLOCK)
 240                 return 0;
 241         if ((rw_type == F_WRLCK) && (tdb->read_only)) {
 242                 errno = EACCES;
 243                 return -1;
 244         }
 245
 246         fl.l_type = rw_type;
 247         fl.l_whence = SEEK_SET;
 248         fl.l_start = offset;
 249         fl.l_len = 1;
 250         fl.l_pid = 0;
 251
 252         do {
 253                 ret = fcntl(tdb->fd,lck_type,&fl);
 254                 if (ret == -1 && errno == EINTR && palarm_fired && *palarm_fired)
 255                         break;
 256         } while (ret == -1 && errno == EINTR);
 257
 258         if (ret == -1) {
 259                 if (!probe && lck_type != F_SETLK) {
 260                         /* Ensure error code is set for log fun to examine. */
 261                         if (errno == EINTR && palarm_fired && *palarm_fired)
 262                                 tdb->ecode = TDB_ERR_LOCK_TIMEOUT;
 263                         else
 264                                 tdb->ecode = TDB_ERR_LOCK;
 265                         TDB_LOG((tdb, 5,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
 266                                  tdb->fd, offset, rw_type, lck_type));
 267                 }
 268                 /* Was it an alarm timeout ? */
 269                 if (errno == EINTR && palarm_fired && *palarm_fired) {
 270                         TDB_LOG((tdb, 5, "tdb_brlock timed out (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
 271                                  tdb->fd, offset, rw_type, lck_type));
 272                         return TDB_ERRCODE(TDB_ERR_LOCK_TIMEOUT, -1);
 273                 }
 274                 /* Otherwise - generic lock error. errno set by fcntl.
 275                  * EAGAIN is an expected return from non-blocking
 276                  * locks. */
 277                 if (errno != EAGAIN) {
 278                         TDB_LOG((tdb, 5, "tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d: %s\n",
 279                                  tdb->fd, offset, rw_type, lck_type,
 280                                  strerror(errno)));
 281                 }
 282                 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
 283         }
 284         return 0;
 285 }
 286
 287 /* lock a list in the database. list -1 is the alloc list */
 288 static int tdb_lock(TDB_CONTEXT *tdb, int list, int ltype)
 289 {
 290         if (list < -1 || list >= (int)tdb->header.hash_size) {
 291                 TDB_LOG((tdb, 0,"tdb_lock: invalid list %d for ltype=%d\n",
 292                            list, ltype));
 293                 return -1;
 294         }
 295         if (tdb->flags & TDB_NOLOCK)
 296                 return 0;
 297
 298         /* Since fcntl locks don't nest, we do a lock for the first one,
 299            and simply bump the count for future ones */
 300         if (tdb->locked[list+1].count == 0) {
 301                 if (!tdb->read_only && tdb->header.rwlocks) {
 302                         if (tdb_spinlock(tdb, list, ltype)) {
 303                                 TDB_LOG((tdb, 0, "tdb_lock spinlock failed on list %d ltype=%d\n",
 304                                            list, ltype));
 305                                 return -1;
 306                         }
 307                 } else if (tdb_brlock(tdb,FREELIST_TOP+4*list,ltype,F_SETLKW, 0)) {
 308                         TDB_LOG((tdb, 0,"tdb_lock failed on list %d ltype=%d (%s)\n",
 309                                            list, ltype, strerror(errno)));
 310                         return -1;
 311                 }
 312                 tdb->locked[list+1].ltype = ltype;
 313         }
 314         tdb->locked[list+1].count++;
 315         return 0;
 316 }
 317
 318 /* unlock the database: returns void because it's too late for errors. */
 319         /* changed to return int it may be interesting to know there
 320            has been an error  --simo */
 321 static int tdb_unlock(TDB_CONTEXT *tdb, int list, int ltype)
 322 {
 323         int ret = -1;
 324
 325         if (tdb->flags & TDB_NOLOCK)
 326                 return 0;
 327
 328         /* Sanity checks */
 329         if (list < -1 || list >= (int)tdb->header.hash_size) {
 330                 TDB_LOG((tdb, 0, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
 331                 return ret;
 332         }
 333
 334         if (tdb->locked[list+1].count==0) {
 335                 TDB_LOG((tdb, 0, "tdb_unlock: count is 0\n"));
 336                 return ret;
 337         }
 338
 339         if (tdb->locked[list+1].count == 1) {
 340                 /* Down to last nested lock: unlock underneath */
 341                 if (!tdb->read_only && tdb->header.rwlocks) {
 342                         ret = tdb_spinunlock(tdb, list, ltype);
 343                 } else {
 344                         ret = tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK, F_SETLKW, 0);
 345                 }
 346         } else {
 347                 ret = 0;
 348         }
 349         tdb->locked[list+1].count--;
 350
 351         if (ret)
 352                 TDB_LOG((tdb, 0,"tdb_unlock: An error occurred unlocking!\n"));
 353         return ret;
 354 }
 355
 356 /* check for an out of bounds access - if it is out of bounds then
 357    see if the database has been expanded by someone else and expand
 358    if necessary
 359    note that "len" is the minimum length needed for the db
 360 */
 361 static int tdb_oob(TDB_CONTEXT *tdb, tdb_off len, int probe)
 362 {
 363         struct stat st;
 364         if (len <= tdb->map_size)
 365                 return 0;
 366         if (tdb->flags & TDB_INTERNAL) {
 367                 if (!probe) {
 368                         /* Ensure ecode is set for log fn. */
 369                         tdb->ecode = TDB_ERR_IO;
 370                         TDB_LOG((tdb, 0,"tdb_oob len %d beyond internal malloc size %d\n",
 371                                  (int)len, (int)tdb->map_size));
 372                 }
 373                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 374         }
 375
 376         if (fstat(tdb->fd, &st) == -1)
 377                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 378
 379         if (st.st_size < (size_t)len) {
 380                 if (!probe) {
 381                         /* Ensure ecode is set for log fn. */
 382                         tdb->ecode = TDB_ERR_IO;
 383                         TDB_LOG((tdb, 0,"tdb_oob len %d beyond eof at %d\n",
 384                                  (int)len, (int)st.st_size));
 385                 }
 386                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 387         }
 388
 389         /* Unmap, update size, remap */
 390         if (tdb_munmap(tdb) == -1)
 391                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 392         tdb->map_size = st.st_size;
 393         tdb_mmap(tdb);
 394         return 0;
 395 }
 396
 397 /* write a lump of data at a specified offset */
 398 static int tdb_write(TDB_CONTEXT *tdb, tdb_off off, void *buf, tdb_len len)
 399 {
 400         if (tdb_oob(tdb, off + len, 0) != 0)
 401                 return -1;
 402
 403         if (tdb->map_ptr)
 404                 memcpy(off + (char *)tdb->map_ptr, buf, len);
 405 #ifdef HAVE_PWRITE
 406         else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
 407 #else
 408         else if (lseek(tdb->fd, off, SEEK_SET) != off
 409                  || write(tdb->fd, buf, len) != (ssize_t)len) {
 410 #endif
 411                 /* Ensure ecode is set for log fn. */
 412                 tdb->ecode = TDB_ERR_IO;
 413                 TDB_LOG((tdb, 0,"tdb_write failed at %d len=%d (%s)\n",
 414                            off, len, strerror(errno)));
 415                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 416         }
 417         return 0;
 418 }
 419
 420 /* read a lump of data at a specified offset, maybe convert */
 421 static int tdb_read(TDB_CONTEXT *tdb,tdb_off off,void *buf,tdb_len len,int cv)
 422 {
 423         if (tdb_oob(tdb, off + len, 0) != 0)
 424                 return -1;
 425
 426         if (tdb->map_ptr)
 427                 memcpy(buf, off + (char *)tdb->map_ptr, len);
 428 #ifdef HAVE_PREAD
 429         else if (pread(tdb->fd, buf, len, off) != (ssize_t)len) {
 430 #else
 431         else if (lseek(tdb->fd, off, SEEK_SET) != off
 432                  || read(tdb->fd, buf, len) != (ssize_t)len) {
 433 #endif
 434                 /* Ensure ecode is set for log fn. */
 435                 tdb->ecode = TDB_ERR_IO;
 436                 TDB_LOG((tdb, 0,"tdb_read failed at %d len=%d (%s)\n",
 437                            off, len, strerror(errno)));
 438                 return TDB_ERRCODE(TDB_ERR_IO, -1);
 439         }
 440         if (cv)
 441                 convert(buf, len);
 442         return 0;
 443 }
 444
 445 /* read a lump of data, allocating the space for it */
 446 static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
 447 {
 448         char *buf;
 449
 450         if (!(buf = malloc(len))) {
 451                 /* Ensure ecode is set for log fn. */
 452                 tdb->ecode = TDB_ERR_OOM;
 453                 TDB_LOG((tdb, 0,"tdb_alloc_read malloc failed len=%d (%s)\n",
 454                            len, strerror(errno)));
 455                 return TDB_ERRCODE(TDB_ERR_OOM, buf);
 456         }
 457         if (tdb_read(tdb, offset, buf, len, 0) == -1) {
 458                 SAFE_FREE(buf);
 459                 return NULL;
 460         }
 461         return buf;
 462 }
 463
 464 /* read/write a tdb_off */
 465 static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
 466 {
 467         return tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
 468 }
 469 static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
 470 {
 471         tdb_off off = *d;
 472         return tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
 473 }
 474
 475 /* read/write a record */
 476 static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 477 {
 478         if (tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
 479                 return -1;
 480         if (TDB_BAD_MAGIC(rec)) {
 481                 /* Ensure ecode is set for log fn. */
 482                 tdb->ecode = TDB_ERR_CORRUPT;
 483                 TDB_LOG((tdb, 0,"rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
 484                 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 485         }
 486         return tdb_oob(tdb, rec->next+sizeof(*rec), 0);
 487 }
 488 static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
 489 {
 490         struct list_struct r = *rec;
 491         return tdb_write(tdb, offset, CONVERT(r), sizeof(r));
 492 }
 493
 494 /* read a freelist record and check for simple errors */
 495 static int rec_free_read(TDB_CONTEXT *tdb, tdb_off off, struct list_struct *rec)
 496 {
 497         if (tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
 498                 return -1;
 499
 500         if (rec->magic == TDB_MAGIC) {
 501                 /* this happens when a app is showdown while deleting a record - we should
 502                    not completely fail when this happens */
 503                 TDB_LOG((tdb, 0,"rec_free_read non-free magic 0x%x at offset=%d - fixing\n",
 504                          rec->magic, off));
 505                 rec->magic = TDB_FREE_MAGIC;
 506                 if (tdb_write(tdb, off, rec, sizeof(*rec)) == -1)
 507                         return -1;
 508         }
 509
 510         if (rec->magic != TDB_FREE_MAGIC) {
 511                 /* Ensure ecode is set for log fn. */
 512                 tdb->ecode = TDB_ERR_CORRUPT;
 513                 TDB_LOG((tdb, 0,"rec_free_read bad magic 0x%x at offset=%d\n",
 514                            rec->magic, off));
 515                 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 516         }
 517         if (tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
 518                 return -1;
 519         return 0;
 520 }
 521
 522 /* update a record tailer (must hold allocation lock) */
 523 static int update_tailer(TDB_CONTEXT *tdb, tdb_off offset,
 524                          const struct list_struct *rec)
 525 {
 526         tdb_off totalsize;
 527
 528         /* Offset of tailer from record header */
 529         totalsize = sizeof(*rec) + rec->rec_len;
 530         return ofs_write(tdb, offset + totalsize - sizeof(tdb_off),
 531                          &totalsize);
 532 }
 533
 534 static tdb_off tdb_dump_record(TDB_CONTEXT *tdb, tdb_off offset)
 535 {
 536         struct list_struct rec;
 537         tdb_off tailer_ofs, tailer;
 538
 539         if (tdb_read(tdb, offset, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
 540                 printf("ERROR: failed to read record at %u\n", offset);
 541                 return 0;
 542         }
 543
 544         printf(" rec: offset=%u next=%d rec_len=%d key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
 545                offset, rec.next, rec.rec_len, rec.key_len, rec.data_len, rec.full_hash, rec.magic);
 546
 547         tailer_ofs = offset + sizeof(rec) + rec.rec_len - sizeof(tdb_off);
 548         if (ofs_read(tdb, tailer_ofs, &tailer) == -1) {
 549                 printf("ERROR: failed to read tailer at %u\n", tailer_ofs);
 550                 return rec.next;
 551         }
 552
 553         if (tailer != rec.rec_len + sizeof(rec)) {
 554                 printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
 555                                 (unsigned)tailer, (unsigned)(rec.rec_len + sizeof(rec)));
 556         }
 557         return rec.next;
 558 }
 559
 560 static int tdb_dump_chain(TDB_CONTEXT *tdb, int i)
 561 {
 562         tdb_off rec_ptr, top;
 563
 564         top = TDB_HASH_TOP(i);
 565
 566         if (tdb_lock(tdb, i, F_WRLCK) != 0)
 567                 return -1;
 568
 569         if (ofs_read(tdb, top, &rec_ptr) == -1)
 570                 return tdb_unlock(tdb, i, F_WRLCK);
 571
 572         if (rec_ptr)
 573                 printf("hash=%d\n", i);
 574
 575         while (rec_ptr) {
 576                 rec_ptr = tdb_dump_record(tdb, rec_ptr);
 577         }
 578
 579         return tdb_unlock(tdb, i, F_WRLCK);
 580 }
 581
 582 void tdb_dump_all(TDB_CONTEXT *tdb)
 583 {
 584         int i;
 585         for (i=0;i<tdb->header.hash_size;i++) {
 586                 tdb_dump_chain(tdb, i);
 587         }
 588         printf("freelist:\n");
 589         tdb_dump_chain(tdb, -1);
 590 }
 591
 592 int tdb_printfreelist(TDB_CONTEXT *tdb)
 593 {
 594         int ret;
 595         long total_free = 0;
 596         tdb_off offset, rec_ptr;
 597         struct list_struct rec;
 598
 599         if ((ret = tdb_lock(tdb, -1, F_WRLCK)) != 0)
 600                 return ret;
 601
 602         offset = FREELIST_TOP;
 603
 604         /* read in the freelist top */
 605         if (ofs_read(tdb, offset, &rec_ptr) == -1) {
 606                 tdb_unlock(tdb, -1, F_WRLCK);
 607                 return 0;
 608         }
 609
 610         printf("freelist top=[0x%08x]\n", rec_ptr );
 611         while (rec_ptr) {
 612                 if (tdb_read(tdb, rec_ptr, (char *)&rec, sizeof(rec), DOCONV()) == -1) {
 613                         tdb_unlock(tdb, -1, F_WRLCK);
 614                         return -1;
 615                 }
 616
 617                 if (rec.magic != TDB_FREE_MAGIC) {
 618                         printf("bad magic 0x%08x in free list\n", rec.magic);
 619                         tdb_unlock(tdb, -1, F_WRLCK);
 620                         return -1;
 621                 }
 622
 623                 printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)]\n", rec.next, rec.rec_len, rec.rec_len );
 624                 total_free += rec.rec_len;
 625
 626                 /* move to the next record */
 627                 rec_ptr = rec.next;
 628         }
 629         printf("total rec_len = [0x%08x (%d)]\n", (int)total_free,
 630                (int)total_free);
 631
 632         return tdb_unlock(tdb, -1, F_WRLCK);
 633 }
 634
 635 /* Remove an element from the freelist.  Must have alloc lock. */
 636 static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
 637 {
 638         tdb_off last_ptr, i;
 639
 640         /* read in the freelist top */
 641         last_ptr = FREELIST_TOP;
 642         while (ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
 643                 if (i == off) {
 644                         /* We've found it! */
 645                         return ofs_write(tdb, last_ptr, &next);
 646                 }
 647                 /* Follow chain (next offset is at start of record) */
 648                 last_ptr = i;
 649         }
 650         TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
 651         return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
 652 }
 653
/* Add an element into the freelist. Merge adjacent records if
   necessary.

   offset - file offset of the record being freed
   rec    - its header; rec_len (and, after a left merge, the record's
            position) is updated as neighbours are absorbed, and magic
            and next are overwritten before the header is written out

   Takes the alloc lock (list -1) for the whole operation.  Failures
   while examining a neighbour are logged and simply skip that merge;
   only tailer/header write failures make the call fail.

   Returns 0 on success, -1 on failure. */
static int tdb_free(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
{
        tdb_off right, left;

        /* Allocation and tailer lock */
        if (tdb_lock(tdb, -1, F_WRLCK) != 0)
                return -1;

        /* set an initial tailer, so if we fail we don't leave a bogus record */
        if (update_tailer(tdb, offset, rec) != 0) {
                TDB_LOG((tdb, 0, "tdb_free: upfate_tailer failed!\n"));
                goto fail;
        }

        /* Look right first (I'm an Australian, dammit) */
        /* "right" is the offset just past this record; only look there if
           a whole header fits before the end of the database */
        right = offset + sizeof(*rec) + rec->rec_len;
        if (right + sizeof(*rec) <= tdb->map_size) {
                struct list_struct r;

                if (tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
                        TDB_LOG((tdb, 0, "tdb_free: right read failed at %u\n", right));
                        goto left;
                }

                /* If it's free, expand to include it. */
                if (r.magic == TDB_FREE_MAGIC) {
                        if (remove_from_freelist(tdb, right, r.next) == -1) {
                                TDB_LOG((tdb, 0, "tdb_free: right free failed at %u\n", right));
                                goto left;
                        }
                        /* absorb the neighbour's header and payload */
                        rec->rec_len += sizeof(r) + r.rec_len;
                }
        }

left:
        /* Look left */
        /* the tdb_off just before this record is the left neighbour's
           tailer, provided we are past the start of the data area */
        left = offset - sizeof(tdb_off);
        if (left > TDB_DATA_START(tdb->header.hash_size)) {
                struct list_struct l;
                tdb_off leftsize;

                /* Read in tailer and jump back to header */
                if (ofs_read(tdb, left, &leftsize) == -1) {
                        TDB_LOG((tdb, 0, "tdb_free: left offset read failed at %u\n", left));
                        goto update;
                }
                left = offset - leftsize;

                /* Now read in record */
                if (tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
                        TDB_LOG((tdb, 0, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
                        goto update;
                }

                /* If it's free, expand to include it. */
                if (l.magic == TDB_FREE_MAGIC) {
                        if (remove_from_freelist(tdb, left, l.next) == -1) {
                                TDB_LOG((tdb, 0, "tdb_free: left free failed at %u\n", left));
                                goto update;
                        } else {
                                /* the merged record now starts where the
                                   left neighbour started */
                                offset = left;
                                rec->rec_len += leftsize;
                        }
                }
        }

update:
        /* rec_len may have grown: write the tailer for the final size */
        if (update_tailer(tdb, offset, rec) == -1) {
                TDB_LOG((tdb, 0, "tdb_free: update_tailer failed at %u\n", offset));
                goto fail;
        }

        /* Now, prepend to free list */
        rec->magic = TDB_FREE_MAGIC;

        /* new record points at the old head, then becomes the head */
        if (ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
            rec_write(tdb, offset, rec) == -1 ||
            ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
                TDB_LOG((tdb, 0, "tdb_free record write failed at offset=%d\n", offset));
                goto fail;
        }

        /* And we're done. */
        tdb_unlock(tdb, -1, F_WRLCK);
        return 0;

 fail:
        tdb_unlock(tdb, -1, F_WRLCK);
        return -1;
}
 746
 747
/* expand a file.  we prefer to use ftruncate, as that is what posix
  says to use for mmap expansion

  size     - current size of the file
  addition - number of bytes to add

  The file is first extended to size+addition (with ftruncate where
  HAVE_FTRUNCATE_EXTEND is set, otherwise by writing a single zero byte
  at the new last position) and the new region is then filled with 0x42
  bytes in chunks of at most sizeof(buf).

  Returns 0 on success, -1 on any failure. */
static int expand_file(TDB_CONTEXT *tdb, tdb_off size, tdb_off addition)
{
        char buf[1024];
#if HAVE_FTRUNCATE_EXTEND
        if (ftruncate(tdb->fd, size+addition) != 0) {
                TDB_LOG((tdb, 0, "expand_file ftruncate to %d failed (%s)\n",
                           size+addition, strerror(errno)));
                return -1;
        }
#else
        char b = 0;

        /* writing the final byte extends the file to the new length */
#ifdef HAVE_PWRITE
        if (pwrite(tdb->fd,  &b, 1, (size+addition) - 1) != 1) {
#else
        if (lseek(tdb->fd, (size+addition) - 1, SEEK_SET) != (size+addition) - 1 ||
            write(tdb->fd, &b, 1) != 1) {
#endif
                TDB_LOG((tdb, 0, "expand_file to %d failed (%s)\n",
                           size+addition, strerror(errno)));
                return -1;
        }
#endif

        /* now fill the file with something. This ensures that the file isn't sparse, which would be
           very bad if we ran out of disk. This must be done with write, not via mmap */
        memset(buf, 0x42, sizeof(buf));
        while (addition) {
                /* next chunk: whatever is left, capped at the buffer size */
                int n = addition>sizeof(buf)?sizeof(buf):addition;
#ifdef HAVE_PWRITE
                int ret = pwrite(tdb->fd, buf, n, size);
#else
                int ret;
                if (lseek(tdb->fd, size, SEEK_SET) != size)
                        return -1;
                ret = write(tdb->fd, buf, n);
#endif
                /* a short write is treated as a failure */
                if (ret != n) {
                        TDB_LOG((tdb, 0, "expand_file write of %d failed (%s)\n",
                                   n, strerror(errno)));
                        return -1;
                }
                addition -= n;
                size += n;
        }
        return 0;
}
 797
 798
 799 /* expand the database at least size bytes by expanding the underlying
 800    file and doing the mmap again if necessary */
 801 static int tdb_expand(TDB_CONTEXT *tdb, tdb_off size)
 802 {
 803         struct list_struct rec;
 804         tdb_off offset;
 805
 806         if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
 807                 TDB_LOG((tdb, 0, "lock failed in tdb_expand\n"));
 808                 return -1;
 809         }
 810
 811         /* must know about any previous expansions by another process */
 812         tdb_oob(tdb, tdb->map_size + 1, 1);
 813
 814         /* always make room for at least 10 more records, and round
 815            the database up to a multiple of TDB_PAGE_SIZE */
 816         size = TDB_ALIGN(tdb->map_size + size*10, TDB_PAGE_SIZE) - tdb->map_size;
 817
 818         if (!(tdb->flags & TDB_INTERNAL))
 819                 tdb_munmap(tdb);
 820
 821         /*
 822          * We must ensure the file is unmapped before doing this
 823          * to ensure consistency with systems like OpenBSD where
 824          * writes and mmaps are not consistent.
 825          */
 826
 827         /* expand the file itself */
 828         if (!(tdb->flags & TDB_INTERNAL)) {
 829                 if (expand_file(tdb, tdb->map_size, size) != 0)
 830                         goto fail;
 831         }
 832
 833         tdb->map_size += size;
 834
 835         if (tdb->flags & TDB_INTERNAL) {
 836                 char *new_map_ptr = realloc(tdb->map_ptr, tdb->map_size);
 837                 if (!new_map_ptr) {
 838                         tdb->map_size -= size;
 839                         goto fail;
 840                 }
 841                 tdb->map_ptr = new_map_ptr;
 842         } else {
 843                 /*
 844                  * We must ensure the file is remapped before adding the space
 845                  * to ensure consistency with systems like OpenBSD where
 846                  * writes and mmaps are not consistent.
 847                  */
 848
 849                 /* We're ok if the mmap fails as we'll fallback to read/write */
 850                 tdb_mmap(tdb);
 851         }
 852
 853         /* form a new freelist record */
 854         memset(&rec,'\0',sizeof(rec));
 855         rec.rec_len = size - sizeof(rec);
 856
 857         /* link it into the free list */
 858         offset = tdb->map_size - size;
 859         if (tdb_free(tdb, offset, &rec) == -1)
 860                 goto fail;
 861
 862         tdb_unlock(tdb, -1, F_WRLCK);
 863         return 0;
 864  fail:
 865         tdb_unlock(tdb, -1, F_WRLCK);
 866         return -1;
 867 }
 868
 869 /* allocate some space from the free list. The offset returned points
 870    to a unconnected list_struct within the database with room for at
 871    least length bytes of total data
 872
 873    0 is returned if the space could not be allocated
 874  */
 875 static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length,
 876                             struct list_struct *rec)
 877 {
 878         tdb_off rec_ptr, last_ptr, newrec_ptr;
 879         struct list_struct newrec;
 880
 881         memset(&newrec, '\0', sizeof(newrec));
 882
 883         if (tdb_lock(tdb, -1, F_WRLCK) == -1)
 884                 return 0;
 885
 886         /* Extra bytes required for tailer */
 887         length += sizeof(tdb_off);
 888
 889  again:
 890         last_ptr = FREELIST_TOP;
 891
 892         /* read in the freelist top */
 893         if (ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
 894                 goto fail;
 895
 896         /* keep looking until we find a freelist record big enough */
 897         while (rec_ptr) {
 898                 if (rec_free_read(tdb, rec_ptr, rec) == -1)
 899                         goto fail;
 900
 901                 if (rec->rec_len >= length) {
 902                         /* found it - now possibly split it up  */
 903                         if (rec->rec_len > length + MIN_REC_SIZE) {
 904                                 /* Length of left piece */
 905                                 length = TDB_ALIGN(length, TDB_ALIGNMENT);
 906
 907                                 /* Right piece to go on free list */
 908                                 newrec.rec_len = rec->rec_len
 909                                         - (sizeof(*rec) + length);
 910                                 newrec_ptr = rec_ptr + sizeof(*rec) + length;
 911
 912                                 /* And left record is shortened */
 913                                 rec->rec_len = length;
 914                         } else
 915                                 newrec_ptr = 0;
 916
 917                         /* Remove allocated record from the free list */
 918                         if (ofs_write(tdb, last_ptr, &rec->next) == -1)
 919                                 goto fail;
 920
 921                         /* Update header: do this before we drop alloc
 922                            lock, otherwise tdb_free() might try to
 923                            merge with us, thinking we're free.
 924                            (Thanks Jeremy Allison). */
 925                         rec->magic = TDB_MAGIC;
 926                         if (rec_write(tdb, rec_ptr, rec) == -1)
 927                                 goto fail;
 928
 929                         /* Did we create new block? */
 930                         if (newrec_ptr) {
 931                                 /* Update allocated record tailer (we
 932                                    shortened it). */
 933                                 if (update_tailer(tdb, rec_ptr, rec) == -1)
 934                                         goto fail;
 935
 936                                 /* Free new record */
 937                                 if (tdb_free(tdb, newrec_ptr, &newrec) == -1)
 938                                         goto fail;
 939                         }
 940
 941                         /* all done - return the new record offset */
 942                         tdb_unlock(tdb, -1, F_WRLCK);
 943                         return rec_ptr;
 944                 }
 945                 /* move to the next record */
 946                 last_ptr = rec_ptr;
 947                 rec_ptr = rec->next;
 948         }
 949         /* we didn't find enough space. See if we can expand the
 950            database and if we can then try again */
 951         if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
 952                 goto again;
 953  fail:
 954         tdb_unlock(tdb, -1, F_WRLCK);
 955         return 0;
 956 }
 957
 958 /* initialise a new database with a specified hash size */
 959 static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
 960 {
 961         struct tdb_header *newdb;
 962         int size, ret = -1;
 963
 964         /* We make it up in memory, then write it out if not internal */
 965         size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off);
 966         if (!(newdb = calloc(size, 1)))
 967                 return TDB_ERRCODE(TDB_ERR_OOM, -1);
 968
 969         /* Fill in the header */
 970         newdb->version = TDB_VERSION;
 971         newdb->hash_size = hash_size;
 972 #ifdef USE_SPINLOCKS
 973         newdb->rwlocks = size;
 974 #endif
 975         if (tdb->flags & TDB_INTERNAL) {
 976                 tdb->map_size = size;
 977                 tdb->map_ptr = (char *)newdb;
 978                 memcpy(&tdb->header, newdb, sizeof(tdb->header));
 979                 /* Convert the `ondisk' version if asked. */
 980                 CONVERT(*newdb);
 981                 return 0;
 982         }
 983         if (lseek(tdb->fd, 0, SEEK_SET) == -1)
 984                 goto fail;
 985
 986         if (ftruncate(tdb->fd, 0) == -1)
 987                 goto fail;
 988
 989         /* This creates an endian-converted header, as if read from disk */
 990         CONVERT(*newdb);
 991         memcpy(&tdb->header, newdb, sizeof(tdb->header));
 992         /* Don't endian-convert the magic food! */
 993         memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
 994         if (write(tdb->fd, newdb, size) != size)
 995                 ret = -1;
 996         else
 997                 ret = tdb_create_rwlocks(tdb->fd, hash_size);
 998
 999   fail:
1000         SAFE_FREE(newdb);
1001         return ret;
1002 }
1003
1004 /* Returns 0 on fail.  On success, return offset of record, and fills
1005    in rec */
1006 static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash,
1007                         struct list_struct *r)
1008 {
1009         tdb_off rec_ptr;
1010
1011         /* read in the hash top */
1012         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
1013                 return 0;
1014
1015         /* keep looking until we find the right record */
1016         while (rec_ptr) {
1017                 if (rec_read(tdb, rec_ptr, r) == -1)
1018                         return 0;
1019
1020                 if (!TDB_DEAD(r) && hash==r->full_hash && key.dsize==r->key_len) {
1021                         char *k;
1022                         /* a very likely hit - read the key */
1023                         k = tdb_alloc_read(tdb, rec_ptr + sizeof(*r),
1024                                            r->key_len);
1025                         if (!k)
1026                                 return 0;
1027
1028                         if (memcmp(key.dptr, k, key.dsize) == 0) {
1029                                 SAFE_FREE(k);
1030                                 return rec_ptr;
1031                         }
1032                         SAFE_FREE(k);
1033                 }
1034                 rec_ptr = r->next;
1035         }
1036         return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
1037 }
1038
1039 /* As tdb_find, but if you succeed, keep the lock */
1040 static tdb_off tdb_find_lock_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, int locktype,
1041                              struct list_struct *rec)
1042 {
1043         u32 rec_ptr;
1044
1045         if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
1046                 return 0;
1047         if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
1048                 tdb_unlock(tdb, BUCKET(hash), locktype);
1049         return rec_ptr;
1050 }
1051
1052 enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb)
1053 {
1054         return tdb->ecode;
1055 }
1056
1057 static struct tdb_errname {
1058         enum TDB_ERROR ecode; const char *estring;
1059 } emap[] = { {TDB_SUCCESS, "Success"},
1060              {TDB_ERR_CORRUPT, "Corrupt database"},
1061              {TDB_ERR_IO, "IO Error"},
1062              {TDB_ERR_LOCK, "Locking error"},
1063              {TDB_ERR_OOM, "Out of memory"},
1064              {TDB_ERR_EXISTS, "Record exists"},
1065              {TDB_ERR_NOLOCK, "Lock exists on other keys"},
1066              {TDB_ERR_NOEXIST, "Record does not exist"} };
1067
1068 /* Error string for the last tdb error */
1069 const char *tdb_errorstr(TDB_CONTEXT *tdb)
1070 {
1071         u32 i;
1072         for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
1073                 if (tdb->ecode == emap[i].ecode)
1074                         return emap[i].estring;
1075         return "Invalid error code";
1076 }
1077
1078 /* update an entry in place - this only works if the new data size
1079    is <= the old data size and the key exists.
1080    on failure return -1.
1081 */
1082
1083 static int tdb_update_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA dbuf)
1084 {
1085         struct list_struct rec;
1086         tdb_off rec_ptr;
1087
1088         /* find entry */
1089         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
1090                 return -1;
1091
1092         /* must be long enough key, data and tailer */
1093         if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off)) {
1094                 tdb->ecode = TDB_SUCCESS; /* Not really an error */
1095                 return -1;
1096         }
1097
1098         if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1099                       dbuf.dptr, dbuf.dsize) == -1)
1100                 return -1;
1101
1102         if (dbuf.dsize != rec.data_len) {
1103                 /* update size */
1104                 rec.data_len = dbuf.dsize;
1105                 return rec_write(tdb, rec_ptr, &rec);
1106         }
1107
1108         return 0;
1109 }
1110
1111 /* find an entry in the database given a key */
1112 /* If an entry doesn't exist tdb_err will be set to
1113  * TDB_ERR_NOEXIST. If a key has no data attached
1114  * tdb_err will not be set. Both will return a
1115  * zero pptr and zero dsize.
1116  */
1117
1118 TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
1119 {
1120         tdb_off rec_ptr;
1121         struct list_struct rec;
1122         TDB_DATA ret;
1123         u32 hash;
1124
1125         /* find which hash bucket it is in */
1126         hash = tdb->hash_fn(&key);
1127         if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
1128                 return tdb_null;
1129
1130         if (rec.data_len)
1131                 ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1132                                           rec.data_len);
1133         else
1134                 ret.dptr = NULL;
1135         ret.dsize = rec.data_len;
1136         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1137         return ret;
1138 }
1139
1140 /* check if an entry in the database exists
1141
1142    note that 1 is returned if the key is found and 0 is returned if not found
1143    this doesn't match the conventions in the rest of this module, but is
1144    compatible with gdbm
1145 */
1146 static int tdb_exists_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
1147 {
1148         struct list_struct rec;
1149
1150         if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
1151                 return 0;
1152         tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1153         return 1;
1154 }
1155
1156 int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key)
1157 {
1158         u32 hash = tdb->hash_fn(&key);
1159         return tdb_exists_hash(tdb, key, hash);
1160 }
1161
1162 /* record lock stops delete underneath */
1163 static int lock_record(TDB_CONTEXT *tdb, tdb_off off)
1164 {
1165         return off ? tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0) : 0;
1166 }
1167 /*
1168   Write locks override our own fcntl readlocks, so check it here.
1169   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1170   an error to fail to get the lock here.
1171 */
1172
1173 static int write_lock_record(TDB_CONTEXT *tdb, tdb_off off)
1174 {
1175         struct tdb_traverse_lock *i;
1176         for (i = &tdb->travlocks; i; i = i->next)
1177                 if (i->off == off)
1178                         return -1;
1179         return tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1);
1180 }
1181
1182 /*
1183   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1184   an error to fail to get the lock here.
1185 */
1186
1187 static int write_unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1188 {
1189         return tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0);
1190 }
1191 /* fcntl locks don't stack: avoid unlocking someone else's */
1192 static int unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1193 {
1194         struct tdb_traverse_lock *i;
1195         u32 count = 0;
1196
1197         if (off == 0)
1198                 return 0;
1199         for (i = &tdb->travlocks; i; i = i->next)
1200                 if (i->off == off)
1201                         count++;
1202         return (count == 1 ? tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0) : 0);
1203 }
1204
1205 /* actually delete an entry in the database given the offset */
1206 static int do_delete(TDB_CONTEXT *tdb, tdb_off rec_ptr, struct list_struct*rec)
1207 {
1208         tdb_off last_ptr, i;
1209         struct list_struct lastrec;
1210
1211         if (tdb->read_only) return -1;
1212
1213         if (write_lock_record(tdb, rec_ptr) == -1) {
1214                 /* Someone traversing here: mark it as dead */
1215                 rec->magic = TDB_DEAD_MAGIC;
1216                 return rec_write(tdb, rec_ptr, rec);
1217         }
1218         if (write_unlock_record(tdb, rec_ptr) != 0)
1219                 return -1;
1220
1221         /* find previous record in hash chain */
1222         if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
1223                 return -1;
1224         for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
1225                 if (rec_read(tdb, i, &lastrec) == -1)
1226                         return -1;
1227
1228         /* unlink it: next ptr is at start of record. */
1229         if (last_ptr == 0)
1230                 last_ptr = TDB_HASH_TOP(rec->full_hash);
1231         if (ofs_write(tdb, last_ptr, &rec->next) == -1)
1232                 return -1;
1233
1234         /* recover the space */
1235         if (tdb_free(tdb, rec_ptr, rec) == -1)
1236                 return -1;
1237         return 0;
1238 }
1239
1240 /* Uses traverse lock: 0 = finish, -1 = error, other = record offset */
1241 static int tdb_next_lock(TDB_CONTEXT *tdb, struct tdb_traverse_lock *tlock,
1242                          struct list_struct *rec)
1243 {
1244         int want_next = (tlock->off != 0);
1245
1246         /* Lock each chain from the start one. */
1247         for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
1248                 if (tdb_lock(tdb, tlock->hash, F_WRLCK) == -1)
1249                         return -1;
1250
1251                 /* No previous record?  Start at top of chain. */
1252                 if (!tlock->off) {
1253                         if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
1254                                      &tlock->off) == -1)
1255                                 goto fail;
1256                 } else {
1257                         /* Otherwise unlock the previous record. */
1258                         if (unlock_record(tdb, tlock->off) != 0)
1259                                 goto fail;
1260                 }
1261
1262                 if (want_next) {
1263                         /* We have offset of old record: grab next */
1264                         if (rec_read(tdb, tlock->off, rec) == -1)
1265                                 goto fail;
1266                         tlock->off = rec->next;
1267                 }
1268
1269                 /* Iterate through chain */
1270                 while( tlock->off) {
1271                         tdb_off current;
1272                         if (rec_read(tdb, tlock->off, rec) == -1)
1273                                 goto fail;
1274                         if (!TDB_DEAD(rec)) {
1275                                 /* Woohoo: we found one! */
1276                                 if (lock_record(tdb, tlock->off) != 0)
1277                                         goto fail;
1278                                 return tlock->off;
1279                         }
1280
1281                         /* Detect infinite loops. From "Shlomi Yaakobovich" <Shlomi@exanet.com>. */
1282                         if (tlock->off == rec->next) {
1283                                 TDB_LOG((tdb, 0, "tdb_next_lock: loop detected.\n"));
1284                                 goto fail;
1285                         }
1286
1287                         /* Try to clean dead ones from old traverses */
1288                         current = tlock->off;
1289                         tlock->off = rec->next;
1290                         if (!tdb->read_only &&
1291                             do_delete(tdb, current, rec) != 0)
1292                                 goto fail;
1293                 }
1294                 tdb_unlock(tdb, tlock->hash, F_WRLCK);
1295                 want_next = 0;
1296         }
1297         /* We finished iteration without finding anything */
1298         return TDB_ERRCODE(TDB_SUCCESS, 0);
1299
1300  fail:
1301         tlock->off = 0;
1302         if (tdb_unlock(tdb, tlock->hash, F_WRLCK) != 0)
1303                 TDB_LOG((tdb, 0, "tdb_next_lock: On error unlock failed!\n"));
1304         return -1;
1305 }
1306
1307 /* traverse the entire database - calling fn(tdb, key, data) on each element.
1308    return -1 on error or the record count traversed
1309    if fn is NULL then it is not called
1310    a non-zero return value from fn() indicates that the traversal should stop
1311   */
1312 int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *private)
1313 {
1314         TDB_DATA key, dbuf;
1315         struct list_struct rec;
1316         struct tdb_traverse_lock tl = { NULL, 0, 0 };
1317         int ret, count = 0;
1318
1319         /* This was in the initializaton, above, but the IRIX compiler
1320          * did not like it.  crh
1321          */
1322         tl.next = tdb->travlocks.next;
1323
1324         /* fcntl locks don't stack: beware traverse inside traverse */
1325         tdb->travlocks.next = &tl;
1326
1327         /* tdb_next_lock places locks on the record returned, and its chain */
1328         while ((ret = tdb_next_lock(tdb, &tl, &rec)) > 0) {
1329                 count++;
1330                 /* now read the full record */
1331                 key.dptr = tdb_alloc_read(tdb, tl.off + sizeof(rec),
1332                                           rec.key_len + rec.data_len);
1333                 if (!key.dptr) {
1334                         ret = -1;
1335                         if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0)
1336                                 goto out;
1337                         if (unlock_record(tdb, tl.off) != 0)
1338                                 TDB_LOG((tdb, 0, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
1339                         goto out;
1340                 }
1341                 key.dsize = rec.key_len;
1342                 dbuf.dptr = key.dptr + rec.key_len;
1343                 dbuf.dsize = rec.data_len;
1344
1345                 /* Drop chain lock, call out */
1346                 if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0) {
1347                         ret = -1;
1348                         goto out;
1349                 }
1350                 if (fn && fn(tdb, key, dbuf, private)) {
1351                         /* They want us to terminate traversal */
1352                         ret = count;
1353                         if (unlock_record(tdb, tl.off) != 0) {
1354                                 TDB_LOG((tdb, 0, "tdb_traverse: unlock_record failed!\n"));;
1355                                 ret = -1;
1356                         }
1357                         tdb->travlocks.next = tl.next;
1358                         SAFE_FREE(key.dptr);
1359                         return count;
1360                 }
1361                 SAFE_FREE(key.dptr);
1362         }
1363 out:
1364         tdb->travlocks.next = tl.next;
1365         if (ret < 0)
1366                 return -1;
1367         else
1368                 return count;
1369 }
1370
1371 /* find the first entry in the database and return its key */
1372 TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
1373 {
1374         TDB_DATA key;
1375         struct list_struct rec;
1376
1377         /* release any old lock */
1378         if (unlock_record(tdb, tdb->travlocks.off) != 0)
1379                 return tdb_null;
1380         tdb->travlocks.off = tdb->travlocks.hash = 0;
1381
1382         if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
1383                 return tdb_null;
1384         /* now read the key */
1385         key.dsize = rec.key_len;
1386         key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
1387         if (tdb_unlock(tdb, BUCKET(tdb->travlocks.hash), F_WRLCK) != 0)
1388                 TDB_LOG((tdb, 0, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
1389         return key;
1390 }
1391
1392 /* find the next entry in the database, returning its key */
1393 TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA oldkey)
1394 {
1395         u32 oldhash;
1396         TDB_DATA key = tdb_null;
1397         struct list_struct rec;
1398         char *k = NULL;
1399
1400         /* Is locked key the old key?  If so, traverse will be reliable. */
1401         if (tdb->travlocks.off) {
1402                 if (tdb_lock(tdb,tdb->travlocks.hash,F_WRLCK))
1403                         return tdb_null;
1404                 if (rec_read(tdb, tdb->travlocks.off, &rec) == -1
1405                     || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
1406                                             rec.key_len))
1407                     || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
1408                         /* No, it wasn't: unlock it and start from scratch */
1409                         if (unlock_record(tdb, tdb->travlocks.off) != 0)
1410                                 return tdb_null;
1411                         if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1412                                 return tdb_null;
1413                         tdb->travlocks.off = 0;
1414                 }
1415
1416                 SAFE_FREE(k);
1417         }
1418
1419         if (!tdb->travlocks.off) {
1420                 /* No previous element: do normal find, and lock record */
1421                 tdb->travlocks.off = tdb_find_lock_hash(tdb, oldkey, tdb->hash_fn(&oldkey), F_WRLCK, &rec);
1422                 if (!tdb->travlocks.off)
1423                         return tdb_null;
1424                 tdb->travlocks.hash = BUCKET(rec.full_hash);
1425                 if (lock_record(tdb, tdb->travlocks.off) != 0) {
1426                         TDB_LOG((tdb, 0, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
1427                         return tdb_null;
1428                 }
1429         }
1430         oldhash = tdb->travlocks.hash;
1431
1432         /* Grab next record: locks chain and returned record,
1433            unlocks old record */
1434         if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
1435                 key.dsize = rec.key_len;
1436                 key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
1437                                           key.dsize);
1438                 /* Unlock the chain of this new record */
1439                 if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1440                         TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1441         }
1442         /* Unlock the chain of old record */
1443         if (tdb_unlock(tdb, BUCKET(oldhash), F_WRLCK) != 0)
1444                 TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1445         return key;
1446 }
1447
1448 /* delete an entry in the database given a key */
1449 static int tdb_delete_hash(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash)
1450 {
1451         tdb_off rec_ptr;
1452         struct list_struct rec;
1453         int ret;
1454
1455         if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec)))
1456                 return -1;
1457         ret = do_delete(tdb, rec_ptr, &rec);
1458         if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
1459                 TDB_LOG((tdb, 0, "tdb_delete: WARNING tdb_unlock failed!\n"));
1460         return ret;
1461 }
1462
1463 int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
1464 {
1465         u32 hash = tdb->hash_fn(&key);
1466         return tdb_delete_hash(tdb, key, hash);
1467 }
1468
1469 /* store an element in the database, replacing any existing element
1470    with the same key
1471
1472    return 0 on success, -1 on failure
1473 */
1474 int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
1475 {
1476         struct list_struct rec;
1477         u32 hash;
1478         tdb_off rec_ptr;
1479         char *p = NULL;
1480         int ret = 0;
1481
1482         /* find which hash bucket it is in */
1483         hash = tdb->hash_fn(&key);
1484         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1485                 return -1;
1486
1487         /* check for it existing, on insert. */
1488         if (flag == TDB_INSERT) {
1489                 if (tdb_exists_hash(tdb, key, hash)) {
1490                         tdb->ecode = TDB_ERR_EXISTS;
1491                         goto fail;
1492                 }
1493         } else {
1494                 /* first try in-place update, on modify or replace. */
1495                 if (tdb_update_hash(tdb, key, hash, dbuf) == 0)
1496                         goto out;
1497                 if (tdb->ecode == TDB_ERR_NOEXIST &&
1498                     flag == TDB_MODIFY) {
1499                         /* if the record doesn't exist and we are in TDB_MODIFY mode then
1500                          we should fail the store */
1501                         goto fail;
1502         }
1503         }
1504         /* reset the error code potentially set by the tdb_update() */
1505         tdb->ecode = TDB_SUCCESS;
1506
1507         /* delete any existing record - if it doesn't exist we don't
1508            care.  Doing this first reduces fragmentation, and avoids
1509            coalescing with `allocated' block before it's updated. */
1510         if (flag != TDB_INSERT)
1511                 tdb_delete_hash(tdb, key, hash);
1512
1513         /* Copy key+value *before* allocating free space in case malloc
1514            fails and we are left with a dead spot in the tdb. */
1515
1516         if (!(p = (char *)malloc(key.dsize + dbuf.dsize))) {
1517                 tdb->ecode = TDB_ERR_OOM;
1518                 goto fail;
1519         }
1520
1521         memcpy(p, key.dptr, key.dsize);
1522         if (dbuf.dsize)
1523                 memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
1524
1525         /* we have to allocate some space */
1526         if (!(rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec)))
1527                 goto fail;
1528
1529         /* Read hash top into next ptr */
1530         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1531                 goto fail;
1532
1533         rec.key_len = key.dsize;
1534         rec.data_len = dbuf.dsize;
1535         rec.full_hash = hash;
1536         rec.magic = TDB_MAGIC;
1537
1538         /* write out and point the top of the hash chain at it */
1539         if (rec_write(tdb, rec_ptr, &rec) == -1
1540             || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
1541             || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1542                 /* Need to tdb_unallocate() here */
1543                 goto fail;
1544         }
1545  out:
1546         SAFE_FREE(p);
1547         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1548         return ret;
1549 fail:
1550         ret = -1;
1551         goto out;
1552 }
1553
1554 /* Attempt to append data to an entry in place - this only works if the new data size
1555    is <= the old data size and the key exists.
1556    on failure return -1. Record must be locked before calling.
1557 */
1558 static int tdb_append_inplace(TDB_CONTEXT *tdb, TDB_DATA key, u32 hash, TDB_DATA new_dbuf)
1559 {
1560         struct list_struct rec;
1561         tdb_off rec_ptr;
1562
1563         /* find entry */
1564         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
1565                 return -1;
1566
1567         /* Append of 0 is always ok. */
1568         if (new_dbuf.dsize == 0)
1569                 return 0;
1570
1571         /* must be long enough for key, old data + new data and tailer */
1572         if (rec.rec_len < key.dsize + rec.data_len + new_dbuf.dsize + sizeof(tdb_off)) {
1573                 /* No room. */
1574                 tdb->ecode = TDB_SUCCESS; /* Not really an error */
1575                 return -1;
1576         }
1577
1578         if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len + rec.data_len,
1579                       new_dbuf.dptr, new_dbuf.dsize) == -1)
1580                 return -1;
1581
1582         /* update size */
1583         rec.data_len += new_dbuf.dsize;
1584         return rec_write(tdb, rec_ptr, &rec);
1585 }
1586
1587 /* Append to an entry. Create if not exist. */
1588
1589 int tdb_append(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA new_dbuf)
1590 {
1591         struct list_struct rec;
1592         u32 hash;
1593         tdb_off rec_ptr;
1594         char *p = NULL;
1595         int ret = 0;
1596         size_t new_data_size = 0;
1597
1598         /* find which hash bucket it is in */
1599         hash = tdb->hash_fn(&key);
1600         if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1601                 return -1;
1602
1603         /* first try in-place. */
1604         if (tdb_append_inplace(tdb, key, hash, new_dbuf) == 0)
1605                 goto out;
1606
1607         /* reset the error code potentially set by the tdb_append_inplace() */
1608         tdb->ecode = TDB_SUCCESS;
1609
1610         /* find entry */
1611         if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
1612                 if (tdb->ecode != TDB_ERR_NOEXIST)
1613                         goto fail;
1614
1615                 /* Not found - create. */
1616
1617                 ret = tdb_store(tdb, key, new_dbuf, TDB_INSERT);
1618                 goto out;
1619         }
1620
1621         new_data_size = rec.data_len + new_dbuf.dsize;
1622
1623         /* Copy key+old_value+value *before* allocating free space in case malloc
1624            fails and we are left with a dead spot in the tdb. */
1625
1626         if (!(p = (char *)malloc(key.dsize + new_data_size))) {
1627                 tdb->ecode = TDB_ERR_OOM;
1628                 goto fail;
1629         }
1630
1631         /* Copy the key in place. */
1632         memcpy(p, key.dptr, key.dsize);
1633
1634         /* Now read the old data into place. */
1635         if (rec.data_len &&
1636                 tdb_read(tdb, rec_ptr + sizeof(rec) + rec.key_len, p + key.dsize, rec.data_len, 0) == -1)
1637                         goto fail;
1638
1639         /* Finally append the new data. */
1640         if (new_dbuf.dsize)
1641                 memcpy(p+key.dsize+rec.data_len, new_dbuf.dptr, new_dbuf.dsize);
1642
1643         /* delete any existing record - if it doesn't exist we don't
1644            care.  Doing this first reduces fragmentation, and avoids
1645            coalescing with `allocated' block before it's updated. */
1646
1647         tdb_delete_hash(tdb, key, hash);
1648
1649         if (!(rec_ptr = tdb_allocate(tdb, key.dsize + new_data_size, &rec)))
1650                 goto fail;
1651
1652         /* Read hash top into next ptr */
1653         if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1654                 goto fail;
1655
1656         rec.key_len = key.dsize;
1657         rec.data_len = new_data_size;
1658         rec.full_hash = hash;
1659         rec.magic = TDB_MAGIC;
1660
1661         /* write out and point the top of the hash chain at it */
1662         if (rec_write(tdb, rec_ptr, &rec) == -1
1663             || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+new_data_size)==-1
1664             || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1665                 /* Need to tdb_unallocate() here */
1666                 goto fail;
1667         }
1668
1669  out:
1670         SAFE_FREE(p);
1671         tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1672         return ret;
1673
1674 fail:
1675         ret = -1;
1676         goto out;
1677 }
1678
1679 static int tdb_already_open(dev_t device,
1680                             ino_t ino)
1681 {
1682         TDB_CONTEXT *i;
1683
1684         for (i = tdbs; i; i = i->next) {
1685                 if (i->device == device && i->inode == ino) {
1686                         return 1;
1687                 }
1688         }
1689
1690         return 0;
1691 }
1692
1693 /* This is based on the hash algorithm from gdbm */
1694 static u32 default_tdb_hash(TDB_DATA *key)
1695 {
1696         u32 value;      /* Used to compute the hash value.  */
1697         u32   i;        /* Used to cycle through random values. */
1698
1699         /* Set the initial value from the key size. */
1700         for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
1701                 value = (value + (key->dptr[i] << (i*5 % 24)));
1702
1703         return (1103515243 * value + 12345);
1704 }
1705
1706 /* open the database, creating it if necessary
1707
1708    The open_flags and mode are passed straight to the open call on the
1709    database file. A flags value of O_WRONLY is invalid. The hash size
1710    is advisory, use zero for a default value.
1711
1712    Return is NULL on error, in which case errno is also set.  Don't
1713    try to call tdb_error or tdb_errname, just do strerror(errno).
1714
1715    @param name may be NULL for internal databases. */
1716 TDB_CONTEXT *tdb_open(const char *name, int hash_size, int tdb_flags,
1717                       int open_flags, mode_t mode)
1718 {
1719         return tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode, NULL, NULL);
1720 }
1721
1722
1723 TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
1724                          int open_flags, mode_t mode,
1725                          tdb_log_func log_fn,
1726                          tdb_hash_func hash_fn)
1727 {
1728         TDB_CONTEXT *tdb;
1729         struct stat st;
1730         int rev = 0, locked = 0;
1731         unsigned char *vp;
1732         u32 vertest;
1733
1734         if (!(tdb = calloc(1, sizeof *tdb))) {
1735                 /* Can't log this */
1736                 errno = ENOMEM;
1737                 goto fail;
1738         }
1739         tdb->fd = -1;
1740         tdb->name = NULL;
1741         tdb->map_ptr = NULL;
1742         tdb->flags = tdb_flags;
1743         tdb->open_flags = open_flags;
1744         tdb->log_fn = log_fn;
1745         tdb->hash_fn = hash_fn ? hash_fn : default_tdb_hash;
1746
1747         if ((open_flags & O_ACCMODE) == O_WRONLY) {
1748                 TDB_LOG((tdb, 0, "tdb_open_ex: can't open tdb %s write-only\n",
1749                          name));
1750                 errno = EINVAL;
1751                 goto fail;
1752         }
1753
1754         if (hash_size == 0)
1755                 hash_size = DEFAULT_HASH_SIZE;
1756         if ((open_flags & O_ACCMODE) == O_RDONLY) {
1757                 tdb->read_only = 1;
1758                 /* read only databases don't do locking or clear if first */
1759                 tdb->flags |= TDB_NOLOCK;
1760                 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1761         }
1762
1763         /* internal databases don't mmap or lock, and start off cleared */
1764         if (tdb->flags & TDB_INTERNAL) {
1765                 tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
1766                 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1767                 if (tdb_new_database(tdb, hash_size) != 0) {
1768                         TDB_LOG((tdb, 0, "tdb_open_ex: tdb_new_database failed!"));
1769                         goto fail;
1770                 }
1771                 goto internal;
1772         }
1773
1774         if ((tdb->fd = open(name, open_flags, mode)) == -1) {
1775                 TDB_LOG((tdb, 5, "tdb_open_ex: could not open file %s: %s\n",
1776                          name, strerror(errno)));
1777                 goto fail;      /* errno set by open(2) */
1778         }
1779
1780         /* ensure there is only one process initialising at once */
1781         if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0) == -1) {
1782                 TDB_LOG((tdb, 0, "tdb_open_ex: failed to get global lock on %s: %s\n",
1783                          name, strerror(errno)));
1784                 goto fail;      /* errno set by tdb_brlock */
1785         }
1786
1787         /* we need to zero database if we are the only one with it open */
1788         if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
1789                 (locked = (tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0) == 0))) {
1790                 open_flags |= O_CREAT;
1791                 if (ftruncate(tdb->fd, 0) == -1) {
1792                         TDB_LOG((tdb, 0, "tdb_open_ex: "
1793                                  "failed to truncate %s: %s\n",
1794                                  name, strerror(errno)));
1795                         goto fail; /* errno set by ftruncate */
1796                 }
1797         }
1798
1799         if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
1800             || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
1801             || (tdb->header.version != TDB_VERSION
1802                 && !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
1803                 /* its not a valid database - possibly initialise it */
1804                 if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
1805                         errno = EIO; /* ie bad format or something */
1806                         goto fail;
1807                 }
1808                 rev = (tdb->flags & TDB_CONVERT);
1809         }
1810         vp = (unsigned char *)&tdb->header.version;
1811         vertest = (((u32)vp[0]) << 24) | (((u32)vp[1]) << 16) |
1812                   (((u32)vp[2]) << 8) | (u32)vp[3];
1813         tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
1814         if (!rev)
1815                 tdb->flags &= ~TDB_CONVERT;
1816         else {
1817                 tdb->flags |= TDB_CONVERT;
1818                 convert(&tdb->header, sizeof(tdb->header));
1819         }
1820         if (fstat(tdb->fd, &st) == -1)
1821                 goto fail;
1822
1823         /* Is it already in the open list?  If so, fail. */
1824         if (tdb_already_open(st.st_dev, st.st_ino)) {
1825                 TDB_LOG((tdb, 2, "tdb_open_ex: "
1826                          "%s (%d,%d) is already open in this process\n",
1827                          name, (int)st.st_dev, (int)st.st_ino));
1828                 errno = EBUSY;
1829                 goto fail;
1830         }
1831
1832         if (!(tdb->name = (char *)strdup(name))) {
1833                 errno = ENOMEM;
1834                 goto fail;
1835         }
1836
1837         tdb->map_size = st.st_size;
1838         tdb->device = st.st_dev;
1839         tdb->inode = st.st_ino;
1840         tdb->locked = calloc(tdb->header.hash_size+1, sizeof(tdb->locked[0]));
1841         if (!tdb->locked) {
1842                 TDB_LOG((tdb, 2, "tdb_open_ex: "
1843                          "failed to allocate lock structure for %s\n",
1844                          name));
1845                 errno = ENOMEM;
1846                 goto fail;
1847         }
1848         tdb_mmap(tdb);
1849         if (locked) {
1850                 if (!tdb->read_only)
1851                         if (tdb_clear_spinlocks(tdb) != 0) {
1852                                 TDB_LOG((tdb, 0, "tdb_open_ex: "
1853                                 "failed to clear spinlock\n"));
1854                                 goto fail;
1855                         }
1856                 if (tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0) == -1) {
1857                         TDB_LOG((tdb, 0, "tdb_open_ex: "
1858                                  "failed to take ACTIVE_LOCK on %s: %s\n",
1859                                  name, strerror(errno)));
1860                         goto fail;
1861                 }
1862
1863         }
1864
1865         /* We always need to do this if the CLEAR_IF_FIRST flag is set, even if
1866            we didn't get the initial exclusive lock as we need to let all other
1867            users know we're using it. */
1868
1869         if (tdb_flags & TDB_CLEAR_IF_FIRST) {
1870                 /* leave this lock in place to indicate it's in use */
1871                 if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)
1872                         goto fail;
1873         }
1874
1875
1876  internal:
1877         /* Internal (memory-only) databases skip all the code above to
1878          * do with disk files, and resume here by releasing their
1879          * global lock and hooking into the active list. */
1880         if (tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0) == -1)
1881                 goto fail;
1882         tdb->next = tdbs;
1883         tdbs = tdb;
1884         return tdb;
1885
1886  fail:
1887         { int save_errno = errno;
1888
1889         if (!tdb)
1890                 return NULL;
1891
1892         if (tdb->map_ptr) {
1893                 if (tdb->flags & TDB_INTERNAL)
1894                         SAFE_FREE(tdb->map_ptr);
1895                 else
1896                         tdb_munmap(tdb);
1897         }
1898         SAFE_FREE(tdb->name);
1899         if (tdb->fd != -1)
1900                 if (close(tdb->fd) != 0)
1901                         TDB_LOG((tdb, 5, "tdb_open_ex: failed to close tdb->fd on error!\n"));
1902         SAFE_FREE(tdb->locked);
1903         SAFE_FREE(tdb);
1904         errno = save_errno;
1905         return NULL;
1906         }
1907 }
1908
1909 /**
1910  * Close a database.
1911  *
1912  * @returns -1 for error; 0 for success.
1913  **/
1914 int tdb_close(TDB_CONTEXT *tdb)
1915 {
1916         TDB_CONTEXT **i;
1917         int ret = 0;
1918
1919         if (tdb->map_ptr) {
1920                 if (tdb->flags & TDB_INTERNAL)
1921                         SAFE_FREE(tdb->map_ptr);
1922                 else
1923                         tdb_munmap(tdb);
1924         }
1925         SAFE_FREE(tdb->name);
1926         if (tdb->fd != -1)
1927                 ret = close(tdb->fd);
1928         SAFE_FREE(tdb->locked);
1929
1930         /* Remove from contexts list */
1931         for (i = &tdbs; *i; i = &(*i)->next) {
1932                 if (*i == tdb) {
1933                         *i = tdb->next;
1934                         break;
1935                 }
1936         }
1937
1938         memset(tdb, 0, sizeof(*tdb));
1939         SAFE_FREE(tdb);
1940
1941         return ret;
1942 }
1943
1944 /* lock/unlock entire database */
1945 int tdb_lockall(TDB_CONTEXT *tdb)
1946 {
1947         u32 i;
1948
1949         /* There are no locks on read-only dbs */
1950         if (tdb->read_only)
1951                 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
1952         for (i = 0; i < tdb->header.hash_size; i++)
1953                 if (tdb_lock(tdb, i, F_WRLCK))
1954                         break;
1955
1956         /* If error, release locks we have... */
1957         if (i < tdb->header.hash_size) {
1958                 u32 j;
1959
1960                 for ( j = 0; j < i; j++)
1961                         tdb_unlock(tdb, j, F_WRLCK);
1962                 return TDB_ERRCODE(TDB_ERR_NOLOCK, -1);
1963         }
1964
1965         return 0;
1966 }
1967 void tdb_unlockall(TDB_CONTEXT *tdb)
1968 {
1969         u32 i;
1970         for (i=0; i < tdb->header.hash_size; i++)
1971                 tdb_unlock(tdb, i, F_WRLCK);
1972 }
1973
1974 /* lock/unlock one hash chain. This is meant to be used to reduce
1975    contention - it cannot guarantee how many records will be locked */
1976 int tdb_chainlock(TDB_CONTEXT *tdb, TDB_DATA key)
1977 {
1978         return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
1979 }
1980
1981 int tdb_chainunlock(TDB_CONTEXT *tdb, TDB_DATA key)
1982 {
1983         return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
1984 }
1985
1986 int tdb_chainlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
1987 {
1988         return tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
1989 }
1990
1991 int tdb_chainunlock_read(TDB_CONTEXT *tdb, TDB_DATA key)
1992 {
1993         return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
1994 }
1995
1996
1997 /* register a loging function */
1998 void tdb_logging_function(TDB_CONTEXT *tdb, void (*fn)(TDB_CONTEXT *, int , const char *, ...))
1999 {
2000         tdb->log_fn = fn;
2001 }
2002
2003 /* reopen a tdb - this can be used after a fork to ensure that we have an independent
2004    seek pointer from our parent and to re-establish locks */
2005 int tdb_reopen(TDB_CONTEXT *tdb)
2006 {
2007         struct stat st;
2008
2009         if (tdb->flags & TDB_INTERNAL)
2010                 return 0; /* Nothing to do. */
2011         if (tdb_munmap(tdb) != 0) {
2012                 TDB_LOG((tdb, 0, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
2013                 goto fail;
2014         }
2015         if (close(tdb->fd) != 0)
2016                 TDB_LOG((tdb, 0, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
2017         tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
2018         if (tdb->fd == -1) {
2019                 TDB_LOG((tdb, 0, "tdb_reopen: open failed (%s)\n", strerror(errno)));
2020                 goto fail;
2021         }
2022         if (fstat(tdb->fd, &st) != 0) {
2023                 TDB_LOG((tdb, 0, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
2024                 goto fail;
2025         }
2026         if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
2027                 TDB_LOG((tdb, 0, "tdb_reopen: file dev/inode has changed!\n"));
2028                 goto fail;
2029         }
2030         tdb_mmap(tdb);
2031         if ((tdb->flags & TDB_CLEAR_IF_FIRST) && (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)) {
2032                 TDB_LOG((tdb, 0, "tdb_reopen: failed to obtain active lock\n"));
2033                 goto fail;
2034         }
2035
2036         return 0;
2037
2038 fail:
2039         tdb_close(tdb);
2040         return -1;
2041 }
2042
2043 /* reopen all tdb's */
2044 int tdb_reopen_all(void)
2045 {
2046         TDB_CONTEXT *tdb;
2047
2048         for (tdb=tdbs; tdb; tdb = tdb->next) {
2049                 /* Ensure no clear-if-first. */
2050                 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
2051                 if (tdb_reopen(tdb) != 0)
2052                         return -1;
2053         }
2054
2055         return 0;
2056 }