lib/tdb/common/lock.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Andrew Tridgell              1999-2005
   7    Copyright (C) Paul `Rusty' Russell              2000
   8    Copyright (C) Jeremy Allison                    2000-2003
   9
  10      ** NOTE! The following LGPL license applies to the tdb
  11      ** library. This does NOT imply that all of Samba is released
  12      ** under the LGPL
  13
  14    This library is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU Lesser General Public
  16    License as published by the Free Software Foundation; either
  17    version 3 of the License, or (at your option) any later version.
  18
  19    This library is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    Lesser General Public License for more details.
  23
  24    You should have received a copy of the GNU Lesser General Public
  25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  26 */
  27
  28 #include "tdb_private.h"
  29
  30 _PUBLIC_ void tdb_setalarm_sigptr(struct tdb_context *tdb, volatile sig_atomic_t *ptr)
  31 {
  32         tdb->interrupt_sig_ptr = ptr;
  33 }
  34
  35 static int fcntl_lock(struct tdb_context *tdb,
  36                       int rw, off_t off, off_t len, bool waitflag)
  37 {
  38         struct flock fl;
  39         int cmd;
  40
  41 #ifdef USE_TDB_MUTEX_LOCKING
  42         {
  43                 int ret;
  44                 if (tdb_mutex_lock(tdb, rw, off, len, waitflag, &ret)) {
  45                         return ret;
  46                 }
  47         }
  48 #endif
  49
  50         fl.l_type = rw;
  51         fl.l_whence = SEEK_SET;
  52         fl.l_start = off;
  53         fl.l_len = len;
  54         fl.l_pid = 0;
  55
  56         cmd = waitflag ? F_SETLKW : F_SETLK;
  57
  58         return fcntl(tdb->fd, cmd, &fl);
  59 }
  60
  61 static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
  62 {
  63         struct flock fl;
  64 #if 0 /* Check they matched up locks and unlocks correctly. */
  65         char line[80];
  66         FILE *locks;
  67         bool found = false;
  68
  69         locks = fopen("/proc/locks", "r");
  70
  71         while (fgets(line, 80, locks)) {
  72                 char *p;
  73                 int type, start, l;
  74
  75                 /* eg. 1: FLOCK  ADVISORY  WRITE 2440 08:01:2180826 0 EOF */
  76                 p = strchr(line, ':') + 1;
  77                 if (strncmp(p, " POSIX  ADVISORY  ", strlen(" POSIX  ADVISORY  ")))
  78                         continue;
  79                 p += strlen(" FLOCK  ADVISORY  ");
  80                 if (strncmp(p, "READ  ", strlen("READ  ")) == 0)
  81                         type = F_RDLCK;
  82                 else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
  83                         type = F_WRLCK;
  84                 else
  85                         abort();
  86                 p += 6;
  87                 if (atoi(p) != getpid())
  88                         continue;
  89                 p = strchr(strchr(p, ' ') + 1, ' ') + 1;
  90                 start = atoi(p);
  91                 p = strchr(p, ' ') + 1;
  92                 if (strncmp(p, "EOF", 3) == 0)
  93                         l = 0;
  94                 else
  95                         l = atoi(p) - start + 1;
  96
  97                 if (off == start) {
  98                         if (len != l) {
  99                                 fprintf(stderr, "Len %u should be %u: %s",
 100                                         (int)len, l, line);
 101                                 abort();
 102                         }
 103                         if (type != rw) {
 104                                 fprintf(stderr, "Type %s wrong: %s",
 105                                         rw == F_RDLCK ? "READ" : "WRITE", line);
 106                                 abort();
 107                         }
 108                         found = true;
 109                         break;
 110                 }
 111         }
 112
 113         if (!found) {
 114                 fprintf(stderr, "Unlock on %u@%u not found!\n",
 115                         (int)off, (int)len);
 116                 abort();
 117         }
 118
 119         fclose(locks);
 120 #endif
 121
 122 #ifdef USE_TDB_MUTEX_LOCKING
 123         {
 124                 int ret;
 125                 if (tdb_mutex_unlock(tdb, rw, off, len, &ret)) {
 126                         return ret;
 127                 }
 128         }
 129 #endif
 130
 131         fl.l_type = F_UNLCK;
 132         fl.l_whence = SEEK_SET;
 133         fl.l_start = off;
 134         fl.l_len = len;
 135         fl.l_pid = 0;
 136
 137         return fcntl(tdb->fd, F_SETLKW, &fl);
 138 }
 139
 140 /* list -1 is the alloc list, otherwise a hash chain. */
 141 static tdb_off_t lock_offset(int list)
 142 {
 143         return FREELIST_TOP + 4*list;
 144 }
 145
 146 /* a byte range locking function - return 0 on success
 147    this functions locks/unlocks "len" byte at the specified offset.
 148
 149    On error, errno is also set so that errors are passed back properly
 150    through tdb_open().
 151
 152    note that a len of zero means lock to end of file
 153 */
 154 int tdb_brlock(struct tdb_context *tdb,
 155                int rw_type, tdb_off_t offset, size_t len,
 156                enum tdb_lock_flags flags)
 157 {
 158         int ret;
 159
 160         if (tdb->flags & TDB_NOLOCK) {
 161                 return 0;
 162         }
 163
 164         if (flags & TDB_LOCK_MARK_ONLY) {
 165                 return 0;
 166         }
 167
 168         if ((rw_type == F_WRLCK) && (tdb->read_only || tdb->traverse_read)) {
 169                 tdb->ecode = TDB_ERR_RDONLY;
 170                 return -1;
 171         }
 172
 173         do {
 174                 ret = fcntl_lock(tdb, rw_type, offset, len,
 175                                  flags & TDB_LOCK_WAIT);
 176                 /* Check for a sigalarm break. */
 177                 if (ret == -1 && errno == EINTR &&
 178                                 tdb->interrupt_sig_ptr &&
 179                                 *tdb->interrupt_sig_ptr) {
 180                         break;
 181                 }
 182         } while (ret == -1 && errno == EINTR);
 183
 184         if (ret == -1) {
 185                 tdb->ecode = TDB_ERR_LOCK;
 186                 /* Generic lock error. errno set by fcntl.
 187                  * EAGAIN is an expected return from non-blocking
 188                  * locks. */
 189                 if (!(flags & TDB_LOCK_PROBE) && errno != EAGAIN) {
 190                         TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brlock failed (fd=%d) at offset %u rw_type=%d flags=%d len=%zu\n",
 191                                  tdb->fd, offset, rw_type, flags, len));
 192                 }
 193                 return -1;
 194         }
 195         return 0;
 196 }
 197
 198 int tdb_brunlock(struct tdb_context *tdb,
 199                  int rw_type, tdb_off_t offset, size_t len)
 200 {
 201         int ret;
 202
 203         if (tdb->flags & TDB_NOLOCK) {
 204                 return 0;
 205         }
 206
 207         do {
 208                 ret = fcntl_unlock(tdb, rw_type, offset, len);
 209         } while (ret == -1 && errno == EINTR);
 210
 211         if (ret == -1) {
 212                 TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brunlock failed (fd=%d) at offset %u rw_type=%u len=%zu\n",
 213                          tdb->fd, offset, rw_type, len));
 214         }
 215         return ret;
 216 }
 217
 218 /*
 219  * Do a tdb_brlock in a loop. Some OSes (such as solaris) have too
 220  * conservative deadlock detection and claim a deadlock when progress can be
 221  * made. For those OSes we may loop for a while.
 222  */
 223
 224 static int tdb_brlock_retry(struct tdb_context *tdb,
 225                             int rw_type, tdb_off_t offset, size_t len,
 226                             enum tdb_lock_flags flags)
 227 {
 228         int count = 1000;
 229
 230         while (count--) {
 231                 struct timeval tv;
 232                 int ret;
 233
 234                 ret = tdb_brlock(tdb, rw_type, offset, len, flags);
 235                 if (ret == 0) {
 236                         return 0;
 237                 }
 238                 if (errno != EDEADLK) {
 239                         break;
 240                 }
 241                 /* sleep for as short a time as we can - more portable than usleep() */
 242                 tv.tv_sec = 0;
 243                 tv.tv_usec = 1;
 244                 select(0, NULL, NULL, NULL, &tv);
 245         }
 246         return -1;
 247 }
 248
 249 /*
 250   upgrade a read lock to a write lock.
 251 */
 252 int tdb_allrecord_upgrade(struct tdb_context *tdb)
 253 {
 254         int ret;
 255
 256         if (tdb->allrecord_lock.count != 1) {
 257                 TDB_LOG((tdb, TDB_DEBUG_ERROR,
 258                          "tdb_allrecord_upgrade failed: count %u too high\n",
 259                          tdb->allrecord_lock.count));
 260                 return -1;
 261         }
 262
 263         if (tdb->allrecord_lock.off != 1) {
 264                 TDB_LOG((tdb, TDB_DEBUG_ERROR,
 265                          "tdb_allrecord_upgrade failed: already upgraded?\n"));
 266                 return -1;
 267         }
 268
 269         if (tdb_have_mutexes(tdb)) {
 270                 ret = tdb_mutex_allrecord_upgrade(tdb);
 271                 if (ret == -1) {
 272                         goto fail;
 273                 }
 274                 ret = tdb_brlock_retry(tdb, F_WRLCK, lock_offset(tdb->hash_size),
 275                                        0, TDB_LOCK_WAIT|TDB_LOCK_PROBE);
 276                 if (ret == -1) {
 277                         tdb_mutex_allrecord_downgrade(tdb);
 278                 }
 279         } else {
 280                 ret = tdb_brlock_retry(tdb, F_WRLCK, FREELIST_TOP, 0,
 281                                        TDB_LOCK_WAIT|TDB_LOCK_PROBE);
 282         }
 283
 284         if (ret == 0) {
 285                 tdb->allrecord_lock.ltype = F_WRLCK;
 286                 tdb->allrecord_lock.off = 0;
 287                 return 0;
 288         }
 289 fail:
 290         TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_allrecord_upgrade failed\n"));
 291         return -1;
 292 }
 293
 294 static struct tdb_lock_type *find_nestlock(struct tdb_context *tdb,
 295                                            tdb_off_t offset)
 296 {
 297         unsigned int i;
 298
 299         for (i=0; i<tdb->num_lockrecs; i++) {
 300                 if (tdb->lockrecs[i].off == offset) {
 301                         return &tdb->lockrecs[i];
 302                 }
 303         }
 304         return NULL;
 305 }
 306
 307 /* lock an offset in the database. */
 308 int tdb_nest_lock(struct tdb_context *tdb, uint32_t offset, int ltype,
 309                   enum tdb_lock_flags flags)
 310 {
 311         struct tdb_lock_type *new_lck;
 312
 313         if (offset >= lock_offset(tdb->hash_size)) {
 314                 tdb->ecode = TDB_ERR_LOCK;
 315                 TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_lock: invalid offset %u for ltype=%d\n",
 316                          offset, ltype));
 317                 return -1;
 318         }
 319         if (tdb->flags & TDB_NOLOCK)
 320                 return 0;
 321
 322         new_lck = find_nestlock(tdb, offset);
 323         if (new_lck) {
 324                 /*
 325                  * Just increment the in-memory struct, posix locks
 326                  * don't stack.
 327                  */
 328                 new_lck->count++;
 329                 return 0;
 330         }
 331
 332         if (tdb->num_lockrecs == tdb->lockrecs_array_length) {
 333                 new_lck = (struct tdb_lock_type *)realloc(
 334                         tdb->lockrecs,
 335                         sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1));
 336                 if (new_lck == NULL) {
 337                         errno = ENOMEM;
 338                         return -1;
 339                 }
 340                 tdb->lockrecs_array_length = tdb->num_lockrecs+1;
 341                 tdb->lockrecs = new_lck;
 342         }
 343
 344         /* Since fcntl locks don't nest, we do a lock for the first one,
 345            and simply bump the count for future ones */
 346         if (tdb_brlock(tdb, ltype, offset, 1, flags)) {
 347                 return -1;
 348         }
 349
 350         new_lck = &tdb->lockrecs[tdb->num_lockrecs];
 351
 352         new_lck->off = offset;
 353         new_lck->count = 1;
 354         new_lck->ltype = ltype;
 355         tdb->num_lockrecs++;
 356
 357         return 0;
 358 }
 359
 360 static int tdb_lock_and_recover(struct tdb_context *tdb)
 361 {
 362         int ret;
 363
 364         /* We need to match locking order in transaction commit. */
 365         if (tdb_brlock(tdb, F_WRLCK, FREELIST_TOP, 0, TDB_LOCK_WAIT)) {
 366                 return -1;
 367         }
 368
 369         if (tdb_brlock(tdb, F_WRLCK, OPEN_LOCK, 1, TDB_LOCK_WAIT)) {
 370                 tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0);
 371                 return -1;
 372         }
 373
 374         ret = tdb_transaction_recover(tdb);
 375
 376         tdb_brunlock(tdb, F_WRLCK, OPEN_LOCK, 1);
 377         tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0);
 378
 379         return ret;
 380 }
 381
 382 static bool have_data_locks(const struct tdb_context *tdb)
 383 {
 384         unsigned int i;
 385
 386         for (i = 0; i < tdb->num_lockrecs; i++) {
 387                 if (tdb->lockrecs[i].off >= lock_offset(-1))
 388                         return true;
 389         }
 390         return false;
 391 }
 392
 393 /*
 394  * A allrecord lock allows us to avoid per chain locks. Check if the allrecord
 395  * lock is strong enough.
 396  */
 397 static int tdb_lock_covered_by_allrecord_lock(struct tdb_context *tdb,
 398                                               int ltype)
 399 {
 400         if (ltype == F_RDLCK) {
 401                 /*
 402                  * The allrecord_lock is equal (F_RDLCK) or stronger
 403                  * (F_WRLCK). Pass.
 404                  */
 405                 return 0;
 406         }
 407
 408         if (tdb->allrecord_lock.ltype == F_RDLCK) {
 409                 /*
 410                  * We ask for ltype==F_WRLCK, but the allrecord_lock
 411                  * is too weak. We can't upgrade here, so fail.
 412                  */
 413                 tdb->ecode = TDB_ERR_LOCK;
 414                 return -1;
 415         }
 416
 417         /*
 418          * Asking for F_WRLCK, allrecord is F_WRLCK as well. Pass.
 419          */
 420         return 0;
 421 }
 422
 423 static int tdb_lock_list(struct tdb_context *tdb, int list, int ltype,
 424                          enum tdb_lock_flags waitflag)
 425 {
 426         int ret;
 427         bool check = false;
 428
 429         if (tdb->allrecord_lock.count) {
 430                 return tdb_lock_covered_by_allrecord_lock(tdb, ltype);
 431         }
 432
 433         /*
 434          * Check for recoveries: Someone might have kill -9'ed a process
 435          * during a commit.
 436          */
 437         check = !have_data_locks(tdb);
 438         ret = tdb_nest_lock(tdb, lock_offset(list), ltype, waitflag);
 439
 440         if (ret == 0 && check && tdb_needs_recovery(tdb)) {
 441                 tdb_nest_unlock(tdb, lock_offset(list), ltype, false);
 442
 443                 if (tdb_lock_and_recover(tdb) == -1) {
 444                         return -1;
 445                 }
 446                 return tdb_lock_list(tdb, list, ltype, waitflag);
 447         }
 448         return ret;
 449 }
 450
 451 /* lock a list in the database. list -1 is the alloc list */
 452 int tdb_lock(struct tdb_context *tdb, int list, int ltype)
 453 {
 454         int ret;
 455
 456         ret = tdb_lock_list(tdb, list, ltype, TDB_LOCK_WAIT);
 457         if (ret) {
 458                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lock failed on list %d "
 459                          "ltype=%d (%s)\n",  list, ltype, strerror(errno)));
 460         }
 461         return ret;
 462 }
 463
 464 /* lock a list in the database. list -1 is the alloc list. non-blocking lock */
 465 _PUBLIC_ int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype)
 466 {
 467         return tdb_lock_list(tdb, list, ltype, TDB_LOCK_NOWAIT);
 468 }
 469
 470
 471 int tdb_nest_unlock(struct tdb_context *tdb, uint32_t offset, int ltype,
 472                     bool mark_lock)
 473 {
 474         int ret = -1;
 475         struct tdb_lock_type *lck;
 476
 477         if (tdb->flags & TDB_NOLOCK)
 478                 return 0;
 479
 480         /* Sanity checks */
 481         if (offset >= lock_offset(tdb->hash_size)) {
 482                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: offset %u invalid (%d)\n", offset, tdb->hash_size));
 483                 return ret;
 484         }
 485
 486         lck = find_nestlock(tdb, offset);
 487         if ((lck == NULL) || (lck->count == 0)) {
 488                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: count is 0\n"));
 489                 return -1;
 490         }
 491
 492         if (lck->count > 1) {
 493                 lck->count--;
 494                 return 0;
 495         }
 496
 497         /*
 498          * This lock has count==1 left, so we need to unlock it in the
 499          * kernel. We don't bother with decrementing the in-memory array
 500          * element, we're about to overwrite it with the last array element
 501          * anyway.
 502          */
 503
 504         if (mark_lock) {
 505                 ret = 0;
 506         } else {
 507                 ret = tdb_brunlock(tdb, ltype, offset, 1);
 508         }
 509
 510         /*
 511          * Shrink the array by overwriting the element just unlocked with the
 512          * last array element.
 513          */
 514         *lck = tdb->lockrecs[--tdb->num_lockrecs];
 515
 516         /*
 517          * We don't bother with realloc when the array shrinks, but if we have
 518          * a completely idle tdb we should get rid of the locked array.
 519          */
 520
 521         if (ret)
 522                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: An error occurred unlocking!\n"));
 523         return ret;
 524 }
 525
 526 _PUBLIC_ int tdb_unlock(struct tdb_context *tdb, int list, int ltype)
 527 {
 528         /* a global lock allows us to avoid per chain locks */
 529         if (tdb->allrecord_lock.count) {
 530                 return tdb_lock_covered_by_allrecord_lock(tdb, ltype);
 531         }
 532
 533         return tdb_nest_unlock(tdb, lock_offset(list), ltype, false);
 534 }
 535
 536 /*
 537   get the transaction lock
 538  */
 539 int tdb_transaction_lock(struct tdb_context *tdb, int ltype,
 540                          enum tdb_lock_flags lockflags)
 541 {
 542         return tdb_nest_lock(tdb, TRANSACTION_LOCK, ltype, lockflags);
 543 }
 544
 545 /*
 546   release the transaction lock
 547  */
 548 int tdb_transaction_unlock(struct tdb_context *tdb, int ltype)
 549 {
 550         return tdb_nest_unlock(tdb, TRANSACTION_LOCK, ltype, false);
 551 }
 552
 553 /* Returns 0 if all done, -1 if error, 1 if ok. */
 554 static int tdb_allrecord_check(struct tdb_context *tdb, int ltype,
 555                                enum tdb_lock_flags flags, bool upgradable)
 556 {
 557         /* There are no locks on read-only dbs */
 558         if (tdb->read_only || tdb->traverse_read) {
 559                 tdb->ecode = TDB_ERR_LOCK;
 560                 return -1;
 561         }
 562
 563         if (tdb->allrecord_lock.count && tdb->allrecord_lock.ltype == ltype) {
 564                 tdb->allrecord_lock.count++;
 565                 return 0;
 566         }
 567
 568         if (tdb->allrecord_lock.count) {
 569                 /* a global lock of a different type exists */
 570                 tdb->ecode = TDB_ERR_LOCK;
 571                 return -1;
 572         }
 573
 574         if (tdb_have_extra_locks(tdb)) {
 575                 /* can't combine global and chain locks */
 576                 tdb->ecode = TDB_ERR_LOCK;
 577                 return -1;
 578         }
 579
 580         if (upgradable && ltype != F_RDLCK) {
 581                 /* tdb error: you can't upgrade a write lock! */
 582                 tdb->ecode = TDB_ERR_LOCK;
 583                 return -1;
 584         }
 585         return 1;
 586 }
 587
 588 /* We only need to lock individual bytes, but Linux merges consecutive locks
 589  * so we lock in contiguous ranges. */
 590 static int tdb_chainlock_gradual(struct tdb_context *tdb,
 591                                  int ltype, enum tdb_lock_flags flags,
 592                                  size_t off, size_t len)
 593 {
 594         int ret;
 595         enum tdb_lock_flags nb_flags = (flags & ~TDB_LOCK_WAIT);
 596
 597         if (len <= 4) {
 598                 /* Single record.  Just do blocking lock. */
 599                 return tdb_brlock(tdb, ltype, off, len, flags);
 600         }
 601
 602         /* First we try non-blocking. */
 603         ret = tdb_brlock(tdb, ltype, off, len, nb_flags);
 604         if (ret == 0) {
 605                 return 0;
 606         }
 607
 608         /* Try locking first half, then second. */
 609         ret = tdb_chainlock_gradual(tdb, ltype, flags, off, len / 2);
 610         if (ret == -1)
 611                 return -1;
 612
 613         ret = tdb_chainlock_gradual(tdb, ltype, flags,
 614                                     off + len / 2, len - len / 2);
 615         if (ret == -1) {
 616                 tdb_brunlock(tdb, ltype, off, len / 2);
 617                 return -1;
 618         }
 619         return 0;
 620 }
 621
 622 /* lock/unlock entire database.  It can only be upgradable if you have some
 623  * other way of guaranteeing exclusivity (ie. transaction write lock).
 624  * We do the locking gradually to avoid being starved by smaller locks. */
 625 int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
 626                        enum tdb_lock_flags flags, bool upgradable)
 627 {
 628         int ret;
 629
 630         switch (tdb_allrecord_check(tdb, ltype, flags, upgradable)) {
 631         case -1:
 632                 return -1;
 633         case 0:
 634                 return 0;
 635         }
 636
 637         /* We cover two kinds of locks:
 638          * 1) Normal chain locks.  Taken for almost all operations.
 639          * 2) Individual records locks.  Taken after normal or free
 640          *    chain locks.
 641          *
 642          * It is (1) which cause the starvation problem, so we're only
 643          * gradual for that. */
 644
 645         if (tdb_have_mutexes(tdb)) {
 646                 ret = tdb_mutex_allrecord_lock(tdb, ltype, flags);
 647         } else {
 648                 ret = tdb_chainlock_gradual(tdb, ltype, flags, FREELIST_TOP,
 649                                             tdb->hash_size * 4);
 650         }
 651
 652         if (ret == -1) {
 653                 return -1;
 654         }
 655
 656         /* Grab individual record locks. */
 657         if (tdb_brlock(tdb, ltype, lock_offset(tdb->hash_size), 0,
 658                        flags) == -1) {
 659                 if (tdb_have_mutexes(tdb)) {
 660                         tdb_mutex_allrecord_unlock(tdb);
 661                 } else {
 662                         tdb_brunlock(tdb, ltype, FREELIST_TOP,
 663                                      tdb->hash_size * 4);
 664                 }
 665                 return -1;
 666         }
 667
 668         tdb->allrecord_lock.count = 1;
 669         /* If it's upgradable, it's actually exclusive so we can treat
 670          * it as a write lock. */
 671         tdb->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
 672         tdb->allrecord_lock.off = upgradable;
 673
 674         if (tdb_needs_recovery(tdb)) {
 675                 bool mark = flags & TDB_LOCK_MARK_ONLY;
 676                 tdb_allrecord_unlock(tdb, ltype, mark);
 677                 if (mark) {
 678                         tdb->ecode = TDB_ERR_LOCK;
 679                         TDB_LOG((tdb, TDB_DEBUG_ERROR,
 680                                  "tdb_lockall_mark cannot do recovery\n"));
 681                         return -1;
 682                 }
 683                 if (tdb_lock_and_recover(tdb) == -1) {
 684                         return -1;
 685                 }
 686                 return tdb_allrecord_lock(tdb, ltype, flags, upgradable);
 687         }
 688
 689         return 0;
 690 }
 691
 692
 693
 694 /* unlock entire db */
 695 int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype, bool mark_lock)
 696 {
 697         /* There are no locks on read-only dbs */
 698         if (tdb->read_only || tdb->traverse_read) {
 699                 tdb->ecode = TDB_ERR_LOCK;
 700                 return -1;
 701         }
 702
 703         if (tdb->allrecord_lock.count == 0) {
 704                 tdb->ecode = TDB_ERR_LOCK;
 705                 return -1;
 706         }
 707
 708         /* Upgradable locks are marked as write locks. */
 709         if (tdb->allrecord_lock.ltype != ltype
 710             && (!tdb->allrecord_lock.off || ltype != F_RDLCK)) {
 711                 tdb->ecode = TDB_ERR_LOCK;
 712                 return -1;
 713         }
 714
 715         if (tdb->allrecord_lock.count > 1) {
 716                 tdb->allrecord_lock.count--;
 717                 return 0;
 718         }
 719
 720         if (!mark_lock) {
 721                 int ret;
 722
 723                 if (tdb_have_mutexes(tdb)) {
 724                         ret = tdb_mutex_allrecord_unlock(tdb);
 725                         if (ret == 0) {
 726                                 ret = tdb_brunlock(tdb, ltype,
 727                                                    lock_offset(tdb->hash_size),
 728                                                    0);
 729                         }
 730                 } else {
 731                         ret = tdb_brunlock(tdb, ltype, FREELIST_TOP, 0);
 732                 }
 733
 734                 if (ret != 0) {
 735                         TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlockall failed "
 736                                  "(%s)\n", strerror(errno)));
 737                         return -1;
 738                 }
 739         }
 740
 741         tdb->allrecord_lock.count = 0;
 742         tdb->allrecord_lock.ltype = 0;
 743
 744         return 0;
 745 }
 746
 747 /* lock entire database with write lock */
 748 _PUBLIC_ int tdb_lockall(struct tdb_context *tdb)
 749 {
 750         tdb_trace(tdb, "tdb_lockall");
 751         return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false);
 752 }
 753
 754 /* lock entire database with write lock - mark only */
 755 _PUBLIC_ int tdb_lockall_mark(struct tdb_context *tdb)
 756 {
 757         tdb_trace(tdb, "tdb_lockall_mark");
 758         return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_MARK_ONLY, false);
 759 }
 760
 761 /* unlock entire database with write lock - unmark only */
 762 _PUBLIC_ int tdb_lockall_unmark(struct tdb_context *tdb)
 763 {
 764         tdb_trace(tdb, "tdb_lockall_unmark");
 765         return tdb_allrecord_unlock(tdb, F_WRLCK, true);
 766 }
 767
 768 /* lock entire database with write lock - nonblocking varient */
 769 _PUBLIC_ int tdb_lockall_nonblock(struct tdb_context *tdb)
 770 {
 771         int ret = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_NOWAIT, false);
 772         tdb_trace_ret(tdb, "tdb_lockall_nonblock", ret);
 773         return ret;
 774 }
 775
 776 /* unlock entire database with write lock */
 777 _PUBLIC_ int tdb_unlockall(struct tdb_context *tdb)
 778 {
 779         tdb_trace(tdb, "tdb_unlockall");
 780         return tdb_allrecord_unlock(tdb, F_WRLCK, false);
 781 }
 782
 783 /* lock entire database with read lock */
 784 _PUBLIC_ int tdb_lockall_read(struct tdb_context *tdb)
 785 {
 786         tdb_trace(tdb, "tdb_lockall_read");
 787         return tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
 788 }
 789
 790 /* lock entire database with read lock - nonblock varient */
 791 _PUBLIC_ int tdb_lockall_read_nonblock(struct tdb_context *tdb)
 792 {
 793         int ret = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_NOWAIT, false);
 794         tdb_trace_ret(tdb, "tdb_lockall_read_nonblock", ret);
 795         return ret;
 796 }
 797
 798 /* unlock entire database with read lock */
 799 _PUBLIC_ int tdb_unlockall_read(struct tdb_context *tdb)
 800 {
 801         tdb_trace(tdb, "tdb_unlockall_read");
 802         return tdb_allrecord_unlock(tdb, F_RDLCK, false);
 803 }
 804
 805 /* lock/unlock one hash chain. This is meant to be used to reduce
 806    contention - it cannot guarantee how many records will be locked */
 807 _PUBLIC_ int tdb_chainlock(struct tdb_context *tdb, TDB_DATA key)
 808 {
 809         int ret = tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
 810         tdb_trace_1rec(tdb, "tdb_chainlock", key);
 811         return ret;
 812 }
 813
 814 /* lock/unlock one hash chain, non-blocking. This is meant to be used
 815    to reduce contention - it cannot guarantee how many records will be
 816    locked */
 817 _PUBLIC_ int tdb_chainlock_nonblock(struct tdb_context *tdb, TDB_DATA key)
 818 {
 819         int ret = tdb_lock_nonblock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
 820         tdb_trace_1rec_ret(tdb, "tdb_chainlock_nonblock", key, ret);
 821         return ret;
 822 }
 823
 824 /* mark a chain as locked without actually locking it. Warning! use with great caution! */
 825 _PUBLIC_ int tdb_chainlock_mark(struct tdb_context *tdb, TDB_DATA key)
 826 {
 827         int ret = tdb_nest_lock(tdb, lock_offset(BUCKET(tdb->hash_fn(&key))),
 828                                 F_WRLCK, TDB_LOCK_MARK_ONLY);
 829         tdb_trace_1rec(tdb, "tdb_chainlock_mark", key);
 830         return ret;
 831 }
 832
 833 /* unmark a chain as locked without actually locking it. Warning! use with great caution! */
 834 _PUBLIC_ int tdb_chainlock_unmark(struct tdb_context *tdb, TDB_DATA key)
 835 {
 836         tdb_trace_1rec(tdb, "tdb_chainlock_unmark", key);
 837         return tdb_nest_unlock(tdb, lock_offset(BUCKET(tdb->hash_fn(&key))),
 838                                F_WRLCK, true);
 839 }
 840
 841 _PUBLIC_ int tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key)
 842 {
 843         tdb_trace_1rec(tdb, "tdb_chainunlock", key);
 844         return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
 845 }
 846
 847 _PUBLIC_ int tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key)
 848 {
 849         int ret;
 850         ret = tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
 851         tdb_trace_1rec(tdb, "tdb_chainlock_read", key);
 852         return ret;
 853 }
 854
 855 _PUBLIC_ int tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key)
 856 {
 857         tdb_trace_1rec(tdb, "tdb_chainunlock_read", key);
 858         return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
 859 }
 860
 861 /* record lock stops delete underneath */
 862 int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off)
 863 {
 864         if (tdb->allrecord_lock.count) {
 865                 return 0;
 866         }
 867         return off ? tdb_brlock(tdb, F_RDLCK, off, 1, TDB_LOCK_WAIT) : 0;
 868 }
 869
 870 /*
 871   Write locks override our own fcntl readlocks, so check it here.
 872   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
 873   an error to fail to get the lock here.
 874 */
 875 int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off)
 876 {
 877         struct tdb_traverse_lock *i;
 878         for (i = &tdb->travlocks; i; i = i->next)
 879                 if (i->off == off)
 880                         return -1;
 881         if (tdb->allrecord_lock.count) {
 882                 if (tdb->allrecord_lock.ltype == F_WRLCK) {
 883                         return 0;
 884                 }
 885                 return -1;
 886         }
 887         return tdb_brlock(tdb, F_WRLCK, off, 1, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE);
 888 }
 889
 890 int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off)
 891 {
 892         if (tdb->allrecord_lock.count) {
 893                 return 0;
 894         }
 895         return tdb_brunlock(tdb, F_WRLCK, off, 1);
 896 }
 897
 898 /* fcntl locks don't stack: avoid unlocking someone else's */
 899 int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off)
 900 {
 901         struct tdb_traverse_lock *i;
 902         uint32_t count = 0;
 903
 904         if (tdb->allrecord_lock.count) {
 905                 return 0;
 906         }
 907
 908         if (off == 0)
 909                 return 0;
 910         for (i = &tdb->travlocks; i; i = i->next)
 911                 if (i->off == off)
 912                         count++;
 913         return (count == 1 ? tdb_brunlock(tdb, F_RDLCK, off, 1) : 0);
 914 }
 915
 916 bool tdb_have_extra_locks(struct tdb_context *tdb)
 917 {
 918         unsigned int extra = tdb->num_lockrecs;
 919
 920         /* A transaction holds the lock for all records. */
 921         if (!tdb->transaction && tdb->allrecord_lock.count) {
 922                 return true;
 923         }
 924
 925         /* We always hold the active lock if CLEAR_IF_FIRST. */
 926         if (find_nestlock(tdb, ACTIVE_LOCK)) {
 927                 extra--;
 928         }
 929
 930         /* In a transaction, we expect to hold the transaction lock */
 931         if (tdb->transaction && find_nestlock(tdb, TRANSACTION_LOCK)) {
 932                 extra--;
 933         }
 934
 935         return extra;
 936 }
 937
 938 /* The transaction code uses this to remove all locks. */
 939 void tdb_release_transaction_locks(struct tdb_context *tdb)
 940 {
 941         unsigned int i, active = 0;
 942
 943         if (tdb->allrecord_lock.count != 0) {
 944                 tdb_allrecord_unlock(tdb, tdb->allrecord_lock.ltype, false);
 945                 tdb->allrecord_lock.count = 0;
 946         }
 947
 948         for (i=0;i<tdb->num_lockrecs;i++) {
 949                 struct tdb_lock_type *lck = &tdb->lockrecs[i];
 950
 951                 /* Don't release the active lock!  Copy it to first entry. */
 952                 if (lck->off == ACTIVE_LOCK) {
 953                         tdb->lockrecs[active++] = *lck;
 954                 } else {
 955                         tdb_brunlock(tdb, lck->ltype, lck->off, 1);
 956                 }
 957         }
 958         tdb->num_lockrecs = active;
 959 }
 960
 961 /* Following functions are added specifically to support CTDB. */
 962
 963 /* Don't do actual fcntl locking, just mark tdb locked */
 964 int tdb_transaction_write_lock_mark(struct tdb_context *tdb);
 965 _PUBLIC_ int tdb_transaction_write_lock_mark(struct tdb_context *tdb)
 966 {
 967         return tdb_transaction_lock(tdb, F_WRLCK, TDB_LOCK_MARK_ONLY);
 968 }
 969
 970 /* Don't do actual fcntl unlocking, just mark tdb unlocked */
 971 int tdb_transaction_write_lock_unmark(struct tdb_context *tdb);
 972 _PUBLIC_ int tdb_transaction_write_lock_unmark(struct tdb_context *tdb)
 973 {
 974         return tdb_nest_unlock(tdb, TRANSACTION_LOCK, F_WRLCK, true);
 975 }