lib/ntdb/lock.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Andrew Tridgell              1999-2005
   7    Copyright (C) Paul `Rusty' Russell              2000
   8    Copyright (C) Jeremy Allison                    2000-2003
   9
  10      ** NOTE! The following LGPL license applies to the ntdb
  11      ** library. This does NOT imply that all of Samba is released
  12      ** under the LGPL
  13
  14    This library is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU Lesser General Public
  16    License as published by the Free Software Foundation; either
  17    version 3 of the License, or (at your option) any later version.
  18
  19    This library is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    Lesser General Public License for more details.
  23
  24    You should have received a copy of the GNU Lesser General Public
  25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  26 */
  27
  28 #include "private.h"
  29 #include <ccan/build_assert/build_assert.h>
  30
  31 /* If we were threaded, we could wait for unlock, but we're not, so fail. */
  32 enum NTDB_ERROR owner_conflict(struct ntdb_context *ntdb, const char *call)
  33 {
  34         return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
  35                           "%s: lock owned by another ntdb in this process.",
  36                           call);
  37 }
  38
  39 /* If we fork, we no longer really own locks. */
  40 bool check_lock_pid(struct ntdb_context *ntdb, const char *call, bool log)
  41 {
  42         /* No locks?  No problem! */
  43         if (ntdb->file->allrecord_lock.count == 0
  44             && ntdb->file->num_lockrecs == 0) {
  45                 return true;
  46         }
  47
  48         /* No fork?  No problem! */
  49         if (ntdb->file->locker == getpid()) {
  50                 return true;
  51         }
  52
  53         if (log) {
  54                 ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
  55                             "%s: fork() detected after lock acquisition!"
  56                             " (%u vs %u)", call,
  57                             (unsigned int)ntdb->file->locker,
  58                             (unsigned int)getpid());
  59         }
  60         return false;
  61 }
  62
  63 int ntdb_fcntl_lock(int fd, int rw, off_t off, off_t len, bool waitflag,
  64                    void *unused)
  65 {
  66         struct flock fl;
  67         int ret;
  68
  69         do {
  70                 fl.l_type = rw;
  71                 fl.l_whence = SEEK_SET;
  72                 fl.l_start = off;
  73                 fl.l_len = len;
  74
  75                 if (waitflag)
  76                         ret = fcntl(fd, F_SETLKW, &fl);
  77                 else
  78                         ret = fcntl(fd, F_SETLK, &fl);
  79         } while (ret != 0 && errno == EINTR);
  80         return ret;
  81 }
  82
  83 int ntdb_fcntl_unlock(int fd, int rw, off_t off, off_t len, void *unused)
  84 {
  85         struct flock fl;
  86         int ret;
  87
  88         do {
  89                 fl.l_type = F_UNLCK;
  90                 fl.l_whence = SEEK_SET;
  91                 fl.l_start = off;
  92                 fl.l_len = len;
  93
  94                 ret = fcntl(fd, F_SETLKW, &fl);
  95         } while (ret != 0 && errno == EINTR);
  96         return ret;
  97 }
  98
  99 static int lock(struct ntdb_context *ntdb,
 100                       int rw, off_t off, off_t len, bool waitflag)
 101 {
 102         int ret;
 103         if (ntdb->file->allrecord_lock.count == 0
 104             && ntdb->file->num_lockrecs == 0) {
 105                 ntdb->file->locker = getpid();
 106         }
 107
 108         ntdb->stats.lock_lowlevel++;
 109         ret = ntdb->lock_fn(ntdb->file->fd, rw, off, len, waitflag,
 110                            ntdb->lock_data);
 111         if (!waitflag) {
 112                 ntdb->stats.lock_nonblock++;
 113                 if (ret != 0)
 114                         ntdb->stats.lock_nonblock_fail++;
 115         }
 116         return ret;
 117 }
 118
/*
 * Low-level unlock: forward the request to the context's unlock_fn
 * callback together with the file descriptor and lock_data, and return
 * whatever the callback returns.
 *
 * The "#if 0" section is a disabled debugging aid.  When enabled it scans
 * /proc/locks for a POSIX advisory lock owned by this pid that starts at
 * "off", and aborts if none is found or if its length or type does not
 * match this unlock request.
 */
static int unlock(struct ntdb_context *ntdb, int rw, off_t off, off_t len)
{
#if 0 /* Check they matched up locks and unlocks correctly. */
	char line[80];
	FILE *locks;
	bool found = false;

	/* NOTE(review): the fopen() result is not checked before use. */
	locks = fopen("/proc/locks", "r");

	while (fgets(line, 80, locks)) {
		char *p;
		int type, start, l;

		/* eg. 1: FLOCK  ADVISORY  WRITE 2440 08:01:2180826 0 EOF */
		p = strchr(line, ':') + 1;
		/* Only POSIX advisory locks are of interest. */
		if (strncmp(p, " POSIX  ADVISORY  ", strlen(" POSIX  ADVISORY  ")))
			continue;
		/* " FLOCK  ADVISORY  " has the same length as the POSIX prefix. */
		p += strlen(" FLOCK  ADVISORY  ");
		if (strncmp(p, "READ  ", strlen("READ  ")) == 0)
			type = F_RDLCK;
		else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
			type = F_WRLCK;
		else
			abort();
		/* Skip the 6-character type field; the pid follows. */
		p += 6;
		if (atoi(p) != getpid())
			continue;
		/* Step over the next two space-separated fields to the start offset. */
		p = strchr(strchr(p, ' ') + 1, ' ') + 1;
		start = atoi(p);
		p = strchr(p, ' ') + 1;
		/* "EOF" as the end field is recorded as length 0. */
		if (strncmp(p, "EOF", 3) == 0)
			l = 0;
		else
			l = atoi(p) - start + 1;

		if (off == start) {
			if (len != l) {
				fprintf(stderr, "Len %u should be %u: %s",
					(int)len, l, line);
				abort();
			}
			if (type != rw) {
				fprintf(stderr, "Type %s wrong: %s",
					rw == F_RDLCK ? "READ" : "WRITE", line);
				abort();
			}
			found = true;
			break;
		}
	}

	if (!found) {
		fprintf(stderr, "Unlock on %u@%u not found!",
			(int)off, (int)len);
		abort();
	}

	fclose(locks);
#endif

	return ntdb->unlock_fn(ntdb->file->fd, rw, off, len, ntdb->lock_data);
}
 181
 182 /* a byte range locking function - return 0 on success
 183    this functions locks len bytes at the specified offset.
 184
 185    note that a len of zero means lock to end of file
 186 */
 187 static enum NTDB_ERROR ntdb_brlock(struct ntdb_context *ntdb,
 188                                  int rw_type, ntdb_off_t offset, ntdb_off_t len,
 189                                  enum ntdb_lock_flags flags)
 190 {
 191         int ret;
 192
 193         if (rw_type == F_WRLCK && (ntdb->flags & NTDB_RDONLY)) {
 194                 return ntdb_logerr(ntdb, NTDB_ERR_RDONLY, NTDB_LOG_USE_ERROR,
 195                                   "Write lock attempted on read-only database");
 196         }
 197
 198         if (ntdb->flags & NTDB_NOLOCK) {
 199                 return NTDB_SUCCESS;
 200         }
 201
 202         /* A 32 bit system cannot open a 64-bit file, but it could have
 203          * expanded since then: check here. */
 204         if ((size_t)(offset + len) != offset + len) {
 205                 return ntdb_logerr(ntdb, NTDB_ERR_IO, NTDB_LOG_ERROR,
 206                                   "ntdb_brlock: lock on giant offset %llu",
 207                                   (long long)(offset + len));
 208         }
 209
 210         ret = lock(ntdb, rw_type, offset, len, flags & NTDB_LOCK_WAIT);
 211         if (ret != 0) {
 212                 /* Generic lock error. errno set by fcntl.
 213                  * EAGAIN is an expected return from non-blocking
 214                  * locks. */
 215                 if (!(flags & NTDB_LOCK_PROBE)
 216                     && (errno != EAGAIN && errno != EINTR)) {
 217                         ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
 218                                    "ntdb_brlock failed (fd=%d) at"
 219                                    " offset %zu rw_type=%d flags=%d len=%zu:"
 220                                    " %s",
 221                                    ntdb->file->fd, (size_t)offset, rw_type,
 222                                    flags, (size_t)len, strerror(errno));
 223                 }
 224                 return NTDB_ERR_LOCK;
 225         }
 226         return NTDB_SUCCESS;
 227 }
 228
 229 static enum NTDB_ERROR ntdb_brunlock(struct ntdb_context *ntdb,
 230                                    int rw_type, ntdb_off_t offset, size_t len)
 231 {
 232         if (ntdb->flags & NTDB_NOLOCK) {
 233                 return NTDB_SUCCESS;
 234         }
 235
 236         if (!check_lock_pid(ntdb, "ntdb_brunlock", false))
 237                 return NTDB_ERR_LOCK;
 238
 239         if (unlock(ntdb, rw_type, offset, len) == -1) {
 240                 return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
 241                                   "ntdb_brunlock failed (fd=%d) at offset %zu"
 242                                   " rw_type=%d len=%zu: %s",
 243                                   ntdb->file->fd, (size_t)offset, rw_type,
 244                                   (size_t)len, strerror(errno));
 245         }
 246         return NTDB_SUCCESS;
 247 }
 248
 249 /*
 250   upgrade a read lock to a write lock. This needs to be handled in a
 251   special way as some OSes (such as solaris) have too conservative
 252   deadlock detection and claim a deadlock when progress can be
 253   made. For those OSes we may loop for a while.
 254 */
 255 enum NTDB_ERROR ntdb_allrecord_upgrade(struct ntdb_context *ntdb, off_t start)
 256 {
 257         int count = 1000;
 258
 259         if (!check_lock_pid(ntdb, "ntdb_transaction_prepare_commit", true))
 260                 return NTDB_ERR_LOCK;
 261
 262         if (ntdb->file->allrecord_lock.count != 1) {
 263                 return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
 264                                   "ntdb_allrecord_upgrade failed:"
 265                                   " count %u too high",
 266                                   ntdb->file->allrecord_lock.count);
 267         }
 268
 269         if (ntdb->file->allrecord_lock.off != 1) {
 270                 return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
 271                                   "ntdb_allrecord_upgrade failed:"
 272                                   " already upgraded?");
 273         }
 274
 275         if (ntdb->file->allrecord_lock.owner != ntdb) {
 276                 return owner_conflict(ntdb, "ntdb_allrecord_upgrade");
 277         }
 278
 279         while (count--) {
 280                 struct timeval tv;
 281                 if (ntdb_brlock(ntdb, F_WRLCK, start, 0,
 282                                NTDB_LOCK_WAIT|NTDB_LOCK_PROBE) == NTDB_SUCCESS) {
 283                         ntdb->file->allrecord_lock.ltype = F_WRLCK;
 284                         ntdb->file->allrecord_lock.off = 0;
 285                         return NTDB_SUCCESS;
 286                 }
 287                 if (errno != EDEADLK) {
 288                         break;
 289                 }
 290                 /* sleep for as short a time as we can - more portable than usleep() */
 291                 tv.tv_sec = 0;
 292                 tv.tv_usec = 1;
 293                 select(0, NULL, NULL, NULL, &tv);
 294         }
 295
 296         if (errno != EAGAIN && errno != EINTR)
 297                 ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
 298                            "ntdb_allrecord_upgrade failed");
 299         return NTDB_ERR_LOCK;
 300 }
 301
 302 static struct ntdb_lock *find_nestlock(struct ntdb_context *ntdb, ntdb_off_t offset,
 303                                       const struct ntdb_context *owner)
 304 {
 305         unsigned int i;
 306
 307         for (i=0; i<ntdb->file->num_lockrecs; i++) {
 308                 if (ntdb->file->lockrecs[i].off == offset) {
 309                         if (owner && ntdb->file->lockrecs[i].owner != owner)
 310                                 return NULL;
 311                         return &ntdb->file->lockrecs[i];
 312                 }
 313         }
 314         return NULL;
 315 }
 316
 317 enum NTDB_ERROR ntdb_lock_and_recover(struct ntdb_context *ntdb)
 318 {
 319         enum NTDB_ERROR ecode;
 320
 321         if (!check_lock_pid(ntdb, "ntdb_transaction_prepare_commit", true))
 322                 return NTDB_ERR_LOCK;
 323
 324         ecode = ntdb_allrecord_lock(ntdb, F_WRLCK, NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK,
 325                                    false);
 326         if (ecode != NTDB_SUCCESS) {
 327                 return ecode;
 328         }
 329
 330         ecode = ntdb_lock_open(ntdb, F_WRLCK, NTDB_LOCK_WAIT|NTDB_LOCK_NOCHECK);
 331         if (ecode != NTDB_SUCCESS) {
 332                 ntdb_allrecord_unlock(ntdb, F_WRLCK);
 333                 return ecode;
 334         }
 335         ecode = ntdb_transaction_recover(ntdb);
 336         ntdb_unlock_open(ntdb, F_WRLCK);
 337         ntdb_allrecord_unlock(ntdb, F_WRLCK);
 338
 339         return ecode;
 340 }
 341
 342 /* lock an offset in the database. */
 343 static enum NTDB_ERROR ntdb_nest_lock(struct ntdb_context *ntdb,
 344                                     ntdb_off_t offset, int ltype,
 345                                     enum ntdb_lock_flags flags)
 346 {
 347         struct ntdb_lock *new_lck;
 348         enum NTDB_ERROR ecode;
 349
 350         assert(offset <= (NTDB_HASH_LOCK_START + (1 << ntdb->hash_bits)
 351                           + ntdb->file->map_size / 8));
 352
 353         if (ntdb->flags & NTDB_NOLOCK)
 354                 return NTDB_SUCCESS;
 355
 356         if (!check_lock_pid(ntdb, "ntdb_nest_lock", true)) {
 357                 return NTDB_ERR_LOCK;
 358         }
 359
 360         ntdb->stats.locks++;
 361
 362         new_lck = find_nestlock(ntdb, offset, NULL);
 363         if (new_lck) {
 364                 if (new_lck->owner != ntdb) {
 365                         return owner_conflict(ntdb, "ntdb_nest_lock");
 366                 }
 367
 368                 if (new_lck->ltype == F_RDLCK && ltype == F_WRLCK) {
 369                         return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
 370                                           "ntdb_nest_lock:"
 371                                           " offset %zu has read lock",
 372                                           (size_t)offset);
 373                 }
 374                 /* Just increment the struct, posix locks don't stack. */
 375                 new_lck->count++;
 376                 return NTDB_SUCCESS;
 377         }
 378
 379 #if 0
 380         if (ntdb->file->num_lockrecs
 381             && offset >= NTDB_HASH_LOCK_START
 382             && offset < NTDB_HASH_LOCK_START + NTDB_HASH_LOCK_RANGE) {
 383                 return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
 384                                   "ntdb_nest_lock: already have a hash lock?");
 385         }
 386 #endif
 387         if (ntdb->file->lockrecs == NULL) {
 388                 new_lck = ntdb->alloc_fn(ntdb->file, sizeof(*ntdb->file->lockrecs),
 389                                      ntdb->alloc_data);
 390         } else {
 391                 new_lck = (struct ntdb_lock *)ntdb->expand_fn(
 392                         ntdb->file->lockrecs,
 393                         sizeof(*ntdb->file->lockrecs)
 394                         * (ntdb->file->num_lockrecs+1),
 395                         ntdb->alloc_data);
 396         }
 397         if (new_lck == NULL) {
 398                 return ntdb_logerr(ntdb, NTDB_ERR_OOM, NTDB_LOG_ERROR,
 399                                   "ntdb_nest_lock:"
 400                                   " unable to allocate %zu lock struct",
 401                                   ntdb->file->num_lockrecs + 1);
 402         }
 403         ntdb->file->lockrecs = new_lck;
 404
 405         /* Since fcntl locks don't nest, we do a lock for the first one,
 406            and simply bump the count for future ones */
 407         ecode = ntdb_brlock(ntdb, ltype, offset, 1, flags);
 408         if (ecode != NTDB_SUCCESS) {
 409                 return ecode;
 410         }
 411
 412         /* First time we grab a lock, perhaps someone died in commit? */
 413         if (!(flags & NTDB_LOCK_NOCHECK)
 414             && ntdb->file->num_lockrecs == 0) {
 415                 ntdb_bool_err berr = ntdb_needs_recovery(ntdb);
 416                 if (berr != false) {
 417                         ntdb_brunlock(ntdb, ltype, offset, 1);
 418
 419                         if (berr < 0)
 420                                 return NTDB_OFF_TO_ERR(berr);
 421                         ecode = ntdb_lock_and_recover(ntdb);
 422                         if (ecode == NTDB_SUCCESS) {
 423                                 ecode = ntdb_brlock(ntdb, ltype, offset, 1,
 424                                                    flags);
 425                         }
 426                         if (ecode != NTDB_SUCCESS) {
 427                                 return ecode;
 428                         }
 429                 }
 430         }
 431
 432         ntdb->file->lockrecs[ntdb->file->num_lockrecs].owner = ntdb;
 433         ntdb->file->lockrecs[ntdb->file->num_lockrecs].off = offset;
 434         ntdb->file->lockrecs[ntdb->file->num_lockrecs].count = 1;
 435         ntdb->file->lockrecs[ntdb->file->num_lockrecs].ltype = ltype;
 436         ntdb->file->num_lockrecs++;
 437
 438         return NTDB_SUCCESS;
 439 }
 440
 441 static enum NTDB_ERROR ntdb_nest_unlock(struct ntdb_context *ntdb,
 442                                       ntdb_off_t off, int ltype)
 443 {
 444         struct ntdb_lock *lck;
 445         enum NTDB_ERROR ecode;
 446
 447         if (ntdb->flags & NTDB_NOLOCK)
 448                 return NTDB_SUCCESS;
 449
 450         lck = find_nestlock(ntdb, off, ntdb);
 451         if ((lck == NULL) || (lck->count == 0)) {
 452                 return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
 453                                   "ntdb_nest_unlock: no lock for %zu",
 454                                   (size_t)off);
 455         }
 456
 457         if (lck->count > 1) {
 458                 lck->count--;
 459                 return NTDB_SUCCESS;
 460         }
 461
 462         /*
 463          * This lock has count==1 left, so we need to unlock it in the
 464          * kernel. We don't bother with decrementing the in-memory array
 465          * element, we're about to overwrite it with the last array element
 466          * anyway.
 467          */
 468         ecode = ntdb_brunlock(ntdb, ltype, off, 1);
 469
 470         /*
 471          * Shrink the array by overwriting the element just unlocked with the
 472          * last array element.
 473          */
 474         *lck = ntdb->file->lockrecs[--ntdb->file->num_lockrecs];
 475
 476         return ecode;
 477 }
 478
 479 /*
 480   get the transaction lock
 481  */
 482 enum NTDB_ERROR ntdb_transaction_lock(struct ntdb_context *ntdb, int ltype)
 483 {
 484         return ntdb_nest_lock(ntdb, NTDB_TRANSACTION_LOCK, ltype, NTDB_LOCK_WAIT);
 485 }
 486
 487 /*
 488   release the transaction lock
 489  */
 490 void ntdb_transaction_unlock(struct ntdb_context *ntdb, int ltype)
 491 {
 492         ntdb_nest_unlock(ntdb, NTDB_TRANSACTION_LOCK, ltype);
 493 }
 494
 495 /* We only need to lock individual bytes, but Linux merges consecutive locks
 496  * so we lock in contiguous ranges. */
 497 static enum NTDB_ERROR ntdb_lock_gradual(struct ntdb_context *ntdb,
 498                                        int ltype, enum ntdb_lock_flags flags,
 499                                        ntdb_off_t off, ntdb_off_t len)
 500 {
 501         enum NTDB_ERROR ecode;
 502         enum ntdb_lock_flags nb_flags = (flags & ~NTDB_LOCK_WAIT);
 503
 504         if (len <= 1) {
 505                 /* 0 would mean to end-of-file... */
 506                 assert(len != 0);
 507                 /* Single hash.  Just do blocking lock. */
 508                 return ntdb_brlock(ntdb, ltype, off, len, flags);
 509         }
 510
 511         /* First we try non-blocking. */
 512         ecode = ntdb_brlock(ntdb, ltype, off, len, nb_flags);
 513         if (ecode != NTDB_ERR_LOCK) {
 514                 return ecode;
 515         }
 516
 517         /* Try locking first half, then second. */
 518         ecode = ntdb_lock_gradual(ntdb, ltype, flags, off, len / 2);
 519         if (ecode != NTDB_SUCCESS)
 520                 return ecode;
 521
 522         ecode = ntdb_lock_gradual(ntdb, ltype, flags,
 523                                  off + len / 2, len - len / 2);
 524         if (ecode != NTDB_SUCCESS) {
 525                 ntdb_brunlock(ntdb, ltype, off, len / 2);
 526         }
 527         return ecode;
 528 }
 529
 530 /* lock/unlock entire database.  It can only be upgradable if you have some
 531  * other way of guaranteeing exclusivity (ie. transaction write lock). */
 532 enum NTDB_ERROR ntdb_allrecord_lock(struct ntdb_context *ntdb, int ltype,
 533                                   enum ntdb_lock_flags flags, bool upgradable)
 534 {
 535         enum NTDB_ERROR ecode;
 536         ntdb_bool_err berr;
 537
 538         if (ntdb->flags & NTDB_NOLOCK) {
 539                 return NTDB_SUCCESS;
 540         }
 541
 542         if (!check_lock_pid(ntdb, "ntdb_allrecord_lock", true)) {
 543                 return NTDB_ERR_LOCK;
 544         }
 545
 546         if (ntdb->file->allrecord_lock.count) {
 547                 if (ntdb->file->allrecord_lock.owner != ntdb) {
 548                         return owner_conflict(ntdb, "ntdb_allrecord_lock");
 549                 }
 550
 551                 if (ltype == F_RDLCK
 552                     || ntdb->file->allrecord_lock.ltype == F_WRLCK) {
 553                         ntdb->file->allrecord_lock.count++;
 554                         return NTDB_SUCCESS;
 555                 }
 556
 557                 /* a global lock of a different type exists */
 558                 return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
 559                                   "ntdb_allrecord_lock: already have %s lock",
 560                                   ntdb->file->allrecord_lock.ltype == F_RDLCK
 561                                   ? "read" : "write");
 562         }
 563
 564         if (ntdb_has_hash_locks(ntdb)) {
 565                 /* can't combine global and chain locks */
 566                 return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
 567                                   "ntdb_allrecord_lock:"
 568                                   " already have chain lock");
 569         }
 570
 571         if (upgradable && ltype != F_RDLCK) {
 572                 /* ntdb error: you can't upgrade a write lock! */
 573                 return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
 574                                   "ntdb_allrecord_lock:"
 575                                   " can't upgrade a write lock");
 576         }
 577
 578         ntdb->stats.locks++;
 579 again:
 580         /* Lock hashes, gradually. */
 581         ecode = ntdb_lock_gradual(ntdb, ltype, flags, NTDB_HASH_LOCK_START,
 582                                   1 << ntdb->hash_bits);
 583         if (ecode != NTDB_SUCCESS)
 584                 return ecode;
 585
 586         /* Lock free tables: there to end of file. */
 587         ecode = ntdb_brlock(ntdb, ltype,
 588                             NTDB_HASH_LOCK_START + (1 << ntdb->hash_bits),
 589                             0, flags);
 590         if (ecode != NTDB_SUCCESS) {
 591                 ntdb_brunlock(ntdb, ltype, NTDB_HASH_LOCK_START,
 592                               1 << ntdb->hash_bits);
 593                 return ecode;
 594         }
 595
 596         ntdb->file->allrecord_lock.owner = ntdb;
 597         ntdb->file->allrecord_lock.count = 1;
 598         /* If it's upgradable, it's actually exclusive so we can treat
 599          * it as a write lock. */
 600         ntdb->file->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
 601         ntdb->file->allrecord_lock.off = upgradable;
 602
 603         /* Now check for needing recovery. */
 604         if (flags & NTDB_LOCK_NOCHECK)
 605                 return NTDB_SUCCESS;
 606
 607         berr = ntdb_needs_recovery(ntdb);
 608         if (likely(berr == false))
 609                 return NTDB_SUCCESS;
 610
 611         ntdb_allrecord_unlock(ntdb, ltype);
 612         if (berr < 0)
 613                 return NTDB_OFF_TO_ERR(berr);
 614         ecode = ntdb_lock_and_recover(ntdb);
 615         if (ecode != NTDB_SUCCESS) {
 616                 return ecode;
 617         }
 618         goto again;
 619 }
 620
 621 enum NTDB_ERROR ntdb_lock_open(struct ntdb_context *ntdb,
 622                              int ltype, enum ntdb_lock_flags flags)
 623 {
 624         return ntdb_nest_lock(ntdb, NTDB_OPEN_LOCK, ltype, flags);
 625 }
 626
 627 void ntdb_unlock_open(struct ntdb_context *ntdb, int ltype)
 628 {
 629         ntdb_nest_unlock(ntdb, NTDB_OPEN_LOCK, ltype);
 630 }
 631
 632 bool ntdb_has_open_lock(struct ntdb_context *ntdb)
 633 {
 634         return !(ntdb->flags & NTDB_NOLOCK)
 635                 && find_nestlock(ntdb, NTDB_OPEN_LOCK, ntdb) != NULL;
 636 }
 637
 638 enum NTDB_ERROR ntdb_lock_expand(struct ntdb_context *ntdb, int ltype)
 639 {
 640         /* Lock doesn't protect data, so don't check (we recurse if we do!) */
 641         return ntdb_nest_lock(ntdb, NTDB_EXPANSION_LOCK, ltype,
 642                              NTDB_LOCK_WAIT | NTDB_LOCK_NOCHECK);
 643 }
 644
 645 void ntdb_unlock_expand(struct ntdb_context *ntdb, int ltype)
 646 {
 647         ntdb_nest_unlock(ntdb, NTDB_EXPANSION_LOCK, ltype);
 648 }
 649
 650 /* unlock entire db */
 651 void ntdb_allrecord_unlock(struct ntdb_context *ntdb, int ltype)
 652 {
 653         if (ntdb->flags & NTDB_NOLOCK)
 654                 return;
 655
 656         if (ntdb->file->allrecord_lock.count == 0) {
 657                 ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
 658                            "ntdb_allrecord_unlock: not locked!");
 659                 return;
 660         }
 661
 662         if (ntdb->file->allrecord_lock.owner != ntdb) {
 663                 ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
 664                            "ntdb_allrecord_unlock: not locked by us!");
 665                 return;
 666         }
 667
 668         /* Upgradable locks are marked as write locks. */
 669         if (ntdb->file->allrecord_lock.ltype != ltype
 670             && (!ntdb->file->allrecord_lock.off || ltype != F_RDLCK)) {
 671                 ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
 672                            "ntdb_allrecord_unlock: have %s lock",
 673                            ntdb->file->allrecord_lock.ltype == F_RDLCK
 674                            ? "read" : "write");
 675                 return;
 676         }
 677
 678         if (ntdb->file->allrecord_lock.count > 1) {
 679                 ntdb->file->allrecord_lock.count--;
 680                 return;
 681         }
 682
 683         ntdb->file->allrecord_lock.count = 0;
 684         ntdb->file->allrecord_lock.ltype = 0;
 685
 686         ntdb_brunlock(ntdb, ltype, NTDB_HASH_LOCK_START, 0);
 687 }
 688
 689 bool ntdb_has_expansion_lock(struct ntdb_context *ntdb)
 690 {
 691         return find_nestlock(ntdb, NTDB_EXPANSION_LOCK, ntdb) != NULL;
 692 }
 693
 694 bool ntdb_has_hash_locks(struct ntdb_context *ntdb)
 695 {
 696         unsigned int i;
 697
 698         for (i=0; i<ntdb->file->num_lockrecs; i++) {
 699                 if (ntdb->file->lockrecs[i].off >= NTDB_HASH_LOCK_START
 700                     && ntdb->file->lockrecs[i].off < (NTDB_HASH_LOCK_START
 701                                                       + (1 << ntdb->hash_bits)))
 702                         return true;
 703         }
 704         return false;
 705 }
 706
 707 static bool ntdb_has_free_lock(struct ntdb_context *ntdb)
 708 {
 709         unsigned int i;
 710
 711         if (ntdb->flags & NTDB_NOLOCK)
 712                 return false;
 713
 714         for (i=0; i<ntdb->file->num_lockrecs; i++) {
 715                 if (ntdb->file->lockrecs[i].off
 716                     > NTDB_HASH_LOCK_START + (1 << ntdb->hash_bits))
 717                         return true;
 718         }
 719         return false;
 720 }
 721
 722 enum NTDB_ERROR ntdb_lock_hash(struct ntdb_context *ntdb,
 723                                unsigned int h,
 724                                int ltype)
 725 {
 726         unsigned l = NTDB_HASH_LOCK_START + h;
 727
 728         assert(h < (1 << ntdb->hash_bits));
 729
 730         /* a allrecord lock allows us to avoid per chain locks */
 731         if (ntdb->file->allrecord_lock.count) {
 732                 if (!check_lock_pid(ntdb, "ntdb_lock_hashes", true))
 733                         return NTDB_ERR_LOCK;
 734
 735                 if (ntdb->file->allrecord_lock.owner != ntdb)
 736                         return owner_conflict(ntdb, "ntdb_lock_hashes");
 737                 if (ltype == ntdb->file->allrecord_lock.ltype
 738                     || ltype == F_RDLCK) {
 739                         return NTDB_SUCCESS;
 740                 }
 741
 742                 return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
 743                                   "ntdb_lock_hashes:"
 744                                   " already have %s allrecordlock",
 745                                   ntdb->file->allrecord_lock.ltype == F_RDLCK
 746                                   ? "read" : "write");
 747         }
 748
 749         if (ntdb_has_free_lock(ntdb)) {
 750                 return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
 751                                   "ntdb_lock_hashes: already have free lock");
 752         }
 753
 754         if (ntdb_has_expansion_lock(ntdb)) {
 755                 return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
 756                                   "ntdb_lock_hashes:"
 757                                   " already have expansion lock");
 758         }
 759
 760         return ntdb_nest_lock(ntdb, l, ltype, NTDB_LOCK_WAIT);
 761 }
 762
 763 enum NTDB_ERROR ntdb_unlock_hash(struct ntdb_context *ntdb,
 764                                  unsigned int h, int ltype)
 765 {
 766         unsigned l = NTDB_HASH_LOCK_START + (h & ((1 << ntdb->hash_bits)-1));
 767
 768         if (ntdb->flags & NTDB_NOLOCK)
 769                 return 0;
 770
 771         /* a allrecord lock allows us to avoid per chain locks */
 772         if (ntdb->file->allrecord_lock.count) {
 773                 if (ntdb->file->allrecord_lock.ltype == F_RDLCK
 774                     && ltype == F_WRLCK) {
 775                         return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
 776                                           "ntdb_unlock_hashes RO allrecord!");
 777                 }
 778                 if (ntdb->file->allrecord_lock.owner != ntdb) {
 779                         return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_USE_ERROR,
 780                                           "ntdb_unlock_hashes:"
 781                                           " not locked by us!");
 782                 }
 783                 return NTDB_SUCCESS;
 784         }
 785
 786         return ntdb_nest_unlock(ntdb, l, ltype);
 787 }
 788
 789 /* Hash locks use NTDB_HASH_LOCK_START + <number of hash entries>..
 790  * Then we begin; bucket offsets are sizeof(ntdb_len_t) apart, so we divide.
 791  * The result is that on 32 bit systems we don't use lock values > 2^31 on
 792  * files that are less than 4GB.
 793  */
 794 static ntdb_off_t free_lock_off(const struct ntdb_context *ntdb,
 795                                 ntdb_off_t b_off)
 796 {
 797         return NTDB_HASH_LOCK_START + (1 << ntdb->hash_bits)
 798                 + b_off / sizeof(ntdb_off_t);
 799 }
 800
 801 enum NTDB_ERROR ntdb_lock_free_bucket(struct ntdb_context *ntdb, ntdb_off_t b_off,
 802                                     enum ntdb_lock_flags waitflag)
 803 {
 804         assert(b_off >= sizeof(struct ntdb_header));
 805
 806         if (ntdb->flags & NTDB_NOLOCK)
 807                 return 0;
 808
 809         /* a allrecord lock allows us to avoid per chain locks */
 810         if (ntdb->file->allrecord_lock.count) {
 811                 if (!check_lock_pid(ntdb, "ntdb_lock_free_bucket", true))
 812                         return NTDB_ERR_LOCK;
 813
 814                 if (ntdb->file->allrecord_lock.owner != ntdb) {
 815                         return owner_conflict(ntdb, "ntdb_lock_free_bucket");
 816                 }
 817
 818                 if (ntdb->file->allrecord_lock.ltype == F_WRLCK)
 819                         return 0;
 820                 return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
 821                                   "ntdb_lock_free_bucket with"
 822                                   " read-only allrecordlock!");
 823         }
 824
 825 #if 0 /* FIXME */
 826         if (ntdb_has_expansion_lock(ntdb)) {
 827                 return ntdb_logerr(ntdb, NTDB_ERR_LOCK, NTDB_LOG_ERROR,
 828                                   "ntdb_lock_free_bucket:"
 829                                   " already have expansion lock");
 830         }
 831 #endif
 832
 833         return ntdb_nest_lock(ntdb, free_lock_off(ntdb, b_off), F_WRLCK,
 834                               waitflag);
 835 }
 836
 837 void ntdb_unlock_free_bucket(struct ntdb_context *ntdb, ntdb_off_t b_off)
 838 {
 839         if (ntdb->file->allrecord_lock.count)
 840                 return;
 841
 842         ntdb_nest_unlock(ntdb, free_lock_off(ntdb, b_off), F_WRLCK);
 843 }
 844
 845 _PUBLIC_ enum NTDB_ERROR ntdb_lockall(struct ntdb_context *ntdb)
 846 {
 847         return ntdb_allrecord_lock(ntdb, F_WRLCK, NTDB_LOCK_WAIT, false);
 848 }
 849
 850 _PUBLIC_ void ntdb_unlockall(struct ntdb_context *ntdb)
 851 {
 852         ntdb_allrecord_unlock(ntdb, F_WRLCK);
 853 }
 854
 855 _PUBLIC_ enum NTDB_ERROR ntdb_lockall_read(struct ntdb_context *ntdb)
 856 {
 857         return ntdb_allrecord_lock(ntdb, F_RDLCK, NTDB_LOCK_WAIT, false);
 858 }
 859
 860 _PUBLIC_ void ntdb_unlockall_read(struct ntdb_context *ntdb)
 861 {
 862         ntdb_allrecord_unlock(ntdb, F_RDLCK);
 863 }
 864
 865 void ntdb_lock_cleanup(struct ntdb_context *ntdb)
 866 {
 867         unsigned int i;
 868
 869         /* We don't want to warn: they're allowed to close ntdb after fork. */
 870         if (!check_lock_pid(ntdb, "ntdb_close", false))
 871                 return;
 872
 873         while (ntdb->file->allrecord_lock.count
 874                && ntdb->file->allrecord_lock.owner == ntdb) {
 875                 ntdb_allrecord_unlock(ntdb, ntdb->file->allrecord_lock.ltype);
 876         }
 877
 878         for (i=0; i<ntdb->file->num_lockrecs; i++) {
 879                 if (ntdb->file->lockrecs[i].owner == ntdb) {
 880                         ntdb_nest_unlock(ntdb,
 881                                         ntdb->file->lockrecs[i].off,
 882                                         ntdb->file->lockrecs[i].ltype);
 883                         i--;
 884                 }
 885         }
 886 }