lib/tdb/common/lock.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Andrew Tridgell              1999-2005
   7    Copyright (C) Paul `Rusty' Russell              2000
   8    Copyright (C) Jeremy Allison                    2000-2003
   9
  10      ** NOTE! The following LGPL license applies to the tdb
  11      ** library. This does NOT imply that all of Samba is released
  12      ** under the LGPL
  13
  14    This library is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU Lesser General Public
  16    License as published by the Free Software Foundation; either
  17    version 3 of the License, or (at your option) any later version.
  18
  19    This library is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    Lesser General Public License for more details.
  23
  24    You should have received a copy of the GNU Lesser General Public
  25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  26 */
  27
  28 #include "tdb_private.h"
  29
  30 _PUBLIC_ void tdb_setalarm_sigptr(struct tdb_context *tdb, volatile sig_atomic_t *ptr)
  31 {
  32         tdb->interrupt_sig_ptr = ptr;
  33 }
  34
  35 static int fcntl_lock(struct tdb_context *tdb,
  36                       int rw, off_t off, off_t len, bool waitflag)
  37 {
  38         struct flock fl;
  39         int cmd;
  40
  41 #ifdef USE_TDB_MUTEX_LOCKING
  42         {
  43                 int ret;
  44                 if (tdb_mutex_lock(tdb, rw, off, len, waitflag, &ret)) {
  45                         return ret;
  46                 }
  47         }
  48 #endif
  49
  50         fl.l_type = rw;
  51         fl.l_whence = SEEK_SET;
  52         fl.l_start = off;
  53         fl.l_len = len;
  54         fl.l_pid = 0;
  55
  56         cmd = waitflag ? F_SETLKW : F_SETLK;
  57
  58         return fcntl(tdb->fd, cmd, &fl);
  59 }
  60
  61 static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
  62 {
  63         struct flock fl;
  64 #if 0 /* Check they matched up locks and unlocks correctly. */
  65         char line[80];
  66         FILE *locks;
  67         bool found = false;
  68
  69         locks = fopen("/proc/locks", "r");
  70
  71         while (fgets(line, 80, locks)) {
  72                 char *p;
  73                 int type, start, l;
  74
  75                 /* eg. 1: FLOCK  ADVISORY  WRITE 2440 08:01:2180826 0 EOF */
  76                 p = strchr(line, ':') + 1;
  77                 if (strncmp(p, " POSIX  ADVISORY  ", strlen(" POSIX  ADVISORY  ")))
  78                         continue;
  79                 p += strlen(" FLOCK  ADVISORY  ");
  80                 if (strncmp(p, "READ  ", strlen("READ  ")) == 0)
  81                         type = F_RDLCK;
  82                 else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
  83                         type = F_WRLCK;
  84                 else
  85                         abort();
  86                 p += 6;
  87                 if (atoi(p) != getpid())
  88                         continue;
  89                 p = strchr(strchr(p, ' ') + 1, ' ') + 1;
  90                 start = atoi(p);
  91                 p = strchr(p, ' ') + 1;
  92                 if (strncmp(p, "EOF", 3) == 0)
  93                         l = 0;
  94                 else
  95                         l = atoi(p) - start + 1;
  96
  97                 if (off == start) {
  98                         if (len != l) {
  99                                 fprintf(stderr, "Len %u should be %u: %s",
 100                                         (int)len, l, line);
 101                                 abort();
 102                         }
 103                         if (type != rw) {
 104                                 fprintf(stderr, "Type %s wrong: %s",
 105                                         rw == F_RDLCK ? "READ" : "WRITE", line);
 106                                 abort();
 107                         }
 108                         found = true;
 109                         break;
 110                 }
 111         }
 112
 113         if (!found) {
 114                 fprintf(stderr, "Unlock on %u@%u not found!\n",
 115                         (int)off, (int)len);
 116                 abort();
 117         }
 118
 119         fclose(locks);
 120 #endif
 121
 122 #ifdef USE_TDB_MUTEX_LOCKING
 123         {
 124                 int ret;
 125                 if (tdb_mutex_unlock(tdb, rw, off, len, &ret)) {
 126                         return ret;
 127                 }
 128         }
 129 #endif
 130
 131         fl.l_type = F_UNLCK;
 132         fl.l_whence = SEEK_SET;
 133         fl.l_start = off;
 134         fl.l_len = len;
 135         fl.l_pid = 0;
 136
 137         return fcntl(tdb->fd, F_SETLKW, &fl);
 138 }
 139
 140 /*
 141  * Calculate the lock offset for a list
 142  *
 143  * list -1 is the freelist, otherwise a hash chain.
 144  *
 145  * Note that we consistently (but without real reason) lock hash chains at an
 146  * offset that is 4 bytes below the real offset of the corresponding list head
 147  * in the db.
 148  *
 149  * This is the memory layout of the hashchain array:
 150  *
 151  * FREELIST_TOP + 0 = freelist
 152  * FREELIST_TOP + 4 = hashtbale list 0
 153  * FREELIST_TOP + 8 = hashtbale list 1
 154  * ...
 155  *
 156  * Otoh lock_offset computes:
 157  *
 158  * freelist = FREELIST_TOP - 4
 159  * list 0   = FREELIST_TOP + 0
 160  * list 1   = FREELIST_TOP + 4
 161  * ...
 162  *
 163  * Unfortunately we can't change this calculation in order to align the locking
 164  * offset with the memory layout, as that would make the locking incompatible
 165  * between different tdb versions.
 166  */
 167 static tdb_off_t lock_offset(int list)
 168 {
 169         return FREELIST_TOP + 4*list;
 170 }
 171
 172 /* a byte range locking function - return 0 on success
 173    this functions locks/unlocks "len" byte at the specified offset.
 174
 175    On error, errno is also set so that errors are passed back properly
 176    through tdb_open().
 177
 178    note that a len of zero means lock to end of file
 179 */
 180 int tdb_brlock(struct tdb_context *tdb,
 181                int rw_type, tdb_off_t offset, size_t len,
 182                enum tdb_lock_flags flags)
 183 {
 184         int ret;
 185
 186         if (tdb->flags & TDB_NOLOCK) {
 187                 return 0;
 188         }
 189
 190         if (flags & TDB_LOCK_MARK_ONLY) {
 191                 return 0;
 192         }
 193
 194         if ((rw_type == F_WRLCK) && (tdb->read_only || tdb->traverse_read)) {
 195                 tdb->ecode = TDB_ERR_RDONLY;
 196                 return -1;
 197         }
 198
 199         do {
 200                 ret = fcntl_lock(tdb, rw_type, offset, len,
 201                                  flags & TDB_LOCK_WAIT);
 202                 /* Check for a sigalarm break. */
 203                 if (ret == -1 && errno == EINTR &&
 204                                 tdb->interrupt_sig_ptr &&
 205                                 *tdb->interrupt_sig_ptr) {
 206                         break;
 207                 }
 208         } while (ret == -1 && errno == EINTR);
 209
 210         if (ret == -1) {
 211                 tdb->ecode = TDB_ERR_LOCK;
 212                 /* Generic lock error. errno set by fcntl.
 213                  * EAGAIN is an expected return from non-blocking
 214                  * locks. */
 215                 if (!(flags & TDB_LOCK_PROBE) && errno != EAGAIN) {
 216                         TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brlock failed (fd=%d) at offset %u rw_type=%d flags=%d len=%zu\n",
 217                                  tdb->fd, offset, rw_type, flags, len));
 218                 }
 219                 return -1;
 220         }
 221         return 0;
 222 }
 223
 224 int tdb_brunlock(struct tdb_context *tdb,
 225                  int rw_type, tdb_off_t offset, size_t len)
 226 {
 227         int ret;
 228
 229         if (tdb->flags & TDB_NOLOCK) {
 230                 return 0;
 231         }
 232
 233         do {
 234                 ret = fcntl_unlock(tdb, rw_type, offset, len);
 235         } while (ret == -1 && errno == EINTR);
 236
 237         if (ret == -1) {
 238                 TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brunlock failed (fd=%d) at offset %u rw_type=%u len=%zu\n",
 239                          tdb->fd, offset, rw_type, len));
 240         }
 241         return ret;
 242 }
 243
 244 /*
 245  * Do a tdb_brlock in a loop. Some OSes (such as solaris) have too
 246  * conservative deadlock detection and claim a deadlock when progress can be
 247  * made. For those OSes we may loop for a while.
 248  */
 249
 250 static int tdb_brlock_retry(struct tdb_context *tdb,
 251                             int rw_type, tdb_off_t offset, size_t len,
 252                             enum tdb_lock_flags flags)
 253 {
 254         int count = 1000;
 255
 256         while (count--) {
 257                 struct timeval tv;
 258                 int ret;
 259
 260                 ret = tdb_brlock(tdb, rw_type, offset, len, flags);
 261                 if (ret == 0) {
 262                         return 0;
 263                 }
 264                 if (errno != EDEADLK) {
 265                         break;
 266                 }
 267                 /* sleep for as short a time as we can - more portable than usleep() */
 268                 tv.tv_sec = 0;
 269                 tv.tv_usec = 1;
 270                 select(0, NULL, NULL, NULL, &tv);
 271         }
 272         return -1;
 273 }
 274
 275 /*
 276   upgrade a read lock to a write lock.
 277 */
 278 int tdb_allrecord_upgrade(struct tdb_context *tdb)
 279 {
 280         int ret;
 281
 282         if (tdb->allrecord_lock.count != 1) {
 283                 TDB_LOG((tdb, TDB_DEBUG_ERROR,
 284                          "tdb_allrecord_upgrade failed: count %u too high\n",
 285                          tdb->allrecord_lock.count));
 286                 tdb->ecode = TDB_ERR_LOCK;
 287                 return -1;
 288         }
 289
 290         if (tdb->allrecord_lock.off != 1) {
 291                 TDB_LOG((tdb, TDB_DEBUG_ERROR,
 292                          "tdb_allrecord_upgrade failed: already upgraded?\n"));
 293                 tdb->ecode = TDB_ERR_LOCK;
 294                 return -1;
 295         }
 296
 297         if (tdb_have_mutexes(tdb)) {
 298                 ret = tdb_mutex_allrecord_upgrade(tdb);
 299                 if (ret == -1) {
 300                         goto fail;
 301                 }
 302                 ret = tdb_brlock_retry(tdb, F_WRLCK, lock_offset(tdb->hash_size),
 303                                        0, TDB_LOCK_WAIT|TDB_LOCK_PROBE);
 304                 if (ret == -1) {
 305                         tdb_mutex_allrecord_downgrade(tdb);
 306                 }
 307         } else {
 308                 ret = tdb_brlock_retry(tdb, F_WRLCK, FREELIST_TOP, 0,
 309                                        TDB_LOCK_WAIT|TDB_LOCK_PROBE);
 310         }
 311
 312         if (ret == 0) {
 313                 tdb->allrecord_lock.ltype = F_WRLCK;
 314                 tdb->allrecord_lock.off = 0;
 315                 return 0;
 316         }
 317 fail:
 318         TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_allrecord_upgrade failed\n"));
 319         return -1;
 320 }
 321
 322 static struct tdb_lock_type *find_nestlock(struct tdb_context *tdb,
 323                                            tdb_off_t offset)
 324 {
 325         int i;
 326
 327         for (i=0; i<tdb->num_lockrecs; i++) {
 328                 if (tdb->lockrecs[i].off == offset) {
 329                         return &tdb->lockrecs[i];
 330                 }
 331         }
 332         return NULL;
 333 }
 334
 335 /* lock an offset in the database. */
 336 int tdb_nest_lock(struct tdb_context *tdb, uint32_t offset, int ltype,
 337                   enum tdb_lock_flags flags)
 338 {
 339         struct tdb_lock_type *new_lck;
 340
 341         if (offset >= lock_offset(tdb->hash_size)) {
 342                 tdb->ecode = TDB_ERR_LOCK;
 343                 TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_lock: invalid offset %u for ltype=%d\n",
 344                          offset, ltype));
 345                 return -1;
 346         }
 347         if (tdb->flags & TDB_NOLOCK)
 348                 return 0;
 349
 350         new_lck = find_nestlock(tdb, offset);
 351         if (new_lck) {
 352                 if ((new_lck->ltype == F_RDLCK) && (ltype == F_WRLCK)) {
 353                         if (!tdb_have_mutexes(tdb)) {
 354                                 int ret;
 355                                 /*
 356                                  * Upgrade the underlying fcntl
 357                                  * lock. Mutexes don't do readlocks,
 358                                  * so this only applies to fcntl
 359                                  * locking.
 360                                  */
 361                                 ret = tdb_brlock(tdb, ltype, offset, 1, flags);
 362                                 if (ret != 0) {
 363                                         return ret;
 364                                 }
 365                         }
 366                         new_lck->ltype = F_WRLCK;
 367                 }
 368                 /*
 369                  * Just increment the in-memory struct, posix locks
 370                  * don't stack.
 371                  */
 372                 new_lck->count++;
 373                 return 0;
 374         }
 375
 376         if (tdb->num_lockrecs == tdb->lockrecs_array_length) {
 377                 new_lck = (struct tdb_lock_type *)realloc(
 378                         tdb->lockrecs,
 379                         sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1));
 380                 if (new_lck == NULL) {
 381                         errno = ENOMEM;
 382                         return -1;
 383                 }
 384                 tdb->lockrecs_array_length = tdb->num_lockrecs+1;
 385                 tdb->lockrecs = new_lck;
 386         }
 387
 388         /* Since fcntl locks don't nest, we do a lock for the first one,
 389            and simply bump the count for future ones */
 390         if (tdb_brlock(tdb, ltype, offset, 1, flags)) {
 391                 return -1;
 392         }
 393
 394         new_lck = &tdb->lockrecs[tdb->num_lockrecs];
 395
 396         new_lck->off = offset;
 397         new_lck->count = 1;
 398         new_lck->ltype = ltype;
 399         tdb->num_lockrecs++;
 400
 401         return 0;
 402 }
 403
 404 static int tdb_lock_and_recover(struct tdb_context *tdb)
 405 {
 406         int ret;
 407
 408         /* We need to match locking order in transaction commit. */
 409         if (tdb_brlock(tdb, F_WRLCK, FREELIST_TOP, 0, TDB_LOCK_WAIT)) {
 410                 return -1;
 411         }
 412
 413         if (tdb_brlock(tdb, F_WRLCK, OPEN_LOCK, 1, TDB_LOCK_WAIT)) {
 414                 tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0);
 415                 return -1;
 416         }
 417
 418         ret = tdb_transaction_recover(tdb);
 419
 420         tdb_brunlock(tdb, F_WRLCK, OPEN_LOCK, 1);
 421         tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0);
 422
 423         return ret;
 424 }
 425
 426 static bool have_data_locks(const struct tdb_context *tdb)
 427 {
 428         int i;
 429
 430         for (i = 0; i < tdb->num_lockrecs; i++) {
 431                 if (tdb->lockrecs[i].off >= lock_offset(-1))
 432                         return true;
 433         }
 434         return false;
 435 }
 436
 437 /*
 438  * A allrecord lock allows us to avoid per chain locks. Check if the allrecord
 439  * lock is strong enough.
 440  */
 441 static int tdb_lock_covered_by_allrecord_lock(struct tdb_context *tdb,
 442                                               int ltype)
 443 {
 444         if (ltype == F_RDLCK) {
 445                 /*
 446                  * The allrecord_lock is equal (F_RDLCK) or stronger
 447                  * (F_WRLCK). Pass.
 448                  */
 449                 return 0;
 450         }
 451
 452         if (tdb->allrecord_lock.ltype == F_RDLCK) {
 453                 /*
 454                  * We ask for ltype==F_WRLCK, but the allrecord_lock
 455                  * is too weak. We can't upgrade here, so fail.
 456                  */
 457                 tdb->ecode = TDB_ERR_LOCK;
 458                 return -1;
 459         }
 460
 461         /*
 462          * Asking for F_WRLCK, allrecord is F_WRLCK as well. Pass.
 463          */
 464         return 0;
 465 }
 466
 467 static int tdb_lock_list(struct tdb_context *tdb, int list, int ltype,
 468                          enum tdb_lock_flags waitflag)
 469 {
 470         int ret;
 471         bool check = false;
 472
 473         if (tdb->allrecord_lock.count) {
 474                 return tdb_lock_covered_by_allrecord_lock(tdb, ltype);
 475         }
 476
 477         /*
 478          * Check for recoveries: Someone might have kill -9'ed a process
 479          * during a commit.
 480          */
 481         check = !have_data_locks(tdb);
 482         ret = tdb_nest_lock(tdb, lock_offset(list), ltype, waitflag);
 483
 484         if (ret == 0 && check && tdb_needs_recovery(tdb)) {
 485                 tdb_nest_unlock(tdb, lock_offset(list), ltype, false);
 486
 487                 if (tdb_lock_and_recover(tdb) == -1) {
 488                         return -1;
 489                 }
 490                 return tdb_lock_list(tdb, list, ltype, waitflag);
 491         }
 492         return ret;
 493 }
 494
 495 /* lock a list in the database. list -1 is the alloc list */
 496 int tdb_lock(struct tdb_context *tdb, int list, int ltype)
 497 {
 498         int ret;
 499
 500         ret = tdb_lock_list(tdb, list, ltype, TDB_LOCK_WAIT);
 501         if (ret) {
 502                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lock failed on list %d "
 503                          "ltype=%d (%s)\n",  list, ltype, strerror(errno)));
 504         }
 505         return ret;
 506 }
 507
 508 /* lock a list in the database. list -1 is the alloc list. non-blocking lock */
 509 _PUBLIC_ int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype)
 510 {
 511         return tdb_lock_list(tdb, list, ltype, TDB_LOCK_NOWAIT);
 512 }
 513
 514
 515 int tdb_nest_unlock(struct tdb_context *tdb, uint32_t offset, int ltype,
 516                     bool mark_lock)
 517 {
 518         int ret = -1;
 519         struct tdb_lock_type *lck;
 520
 521         if (tdb->flags & TDB_NOLOCK)
 522                 return 0;
 523
 524         /* Sanity checks */
 525         if (offset >= lock_offset(tdb->hash_size)) {
 526                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: offset %u invalid (%d)\n", offset, tdb->hash_size));
 527                 return ret;
 528         }
 529
 530         lck = find_nestlock(tdb, offset);
 531         if ((lck == NULL) || (lck->count == 0)) {
 532                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: count is 0\n"));
 533                 return -1;
 534         }
 535
 536         if (lck->count > 1) {
 537                 lck->count--;
 538                 return 0;
 539         }
 540
 541         /*
 542          * This lock has count==1 left, so we need to unlock it in the
 543          * kernel. We don't bother with decrementing the in-memory array
 544          * element, we're about to overwrite it with the last array element
 545          * anyway.
 546          */
 547
 548         if (mark_lock) {
 549                 ret = 0;
 550         } else {
 551                 ret = tdb_brunlock(tdb, ltype, offset, 1);
 552         }
 553
 554         /*
 555          * Shrink the array by overwriting the element just unlocked with the
 556          * last array element.
 557          */
 558         *lck = tdb->lockrecs[--tdb->num_lockrecs];
 559
 560         /*
 561          * We don't bother with realloc when the array shrinks, but if we have
 562          * a completely idle tdb we should get rid of the locked array.
 563          */
 564
 565         if (ret)
 566                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: An error occurred unlocking!\n"));
 567         return ret;
 568 }
 569
 570 _PUBLIC_ int tdb_unlock(struct tdb_context *tdb, int list, int ltype)
 571 {
 572         /* a global lock allows us to avoid per chain locks */
 573         if (tdb->allrecord_lock.count) {
 574                 return tdb_lock_covered_by_allrecord_lock(tdb, ltype);
 575         }
 576
 577         return tdb_nest_unlock(tdb, lock_offset(list), ltype, false);
 578 }
 579
 580 /*
 581   get the transaction lock
 582  */
 583 int tdb_transaction_lock(struct tdb_context *tdb, int ltype,
 584                          enum tdb_lock_flags lockflags)
 585 {
 586         return tdb_nest_lock(tdb, TRANSACTION_LOCK, ltype, lockflags);
 587 }
 588
 589 /*
 590   release the transaction lock
 591  */
 592 int tdb_transaction_unlock(struct tdb_context *tdb, int ltype)
 593 {
 594         return tdb_nest_unlock(tdb, TRANSACTION_LOCK, ltype, false);
 595 }
 596
 597 /* Returns 0 if all done, -1 if error, 1 if ok. */
 598 static int tdb_allrecord_check(struct tdb_context *tdb, int ltype,
 599                                enum tdb_lock_flags flags, bool upgradable)
 600 {
 601         /* There are no locks on read-only dbs */
 602         if (tdb->read_only || tdb->traverse_read) {
 603                 tdb->ecode = TDB_ERR_LOCK;
 604                 return -1;
 605         }
 606
 607         if (tdb->allrecord_lock.count &&
 608             tdb->allrecord_lock.ltype == (uint32_t)ltype) {
 609                 tdb->allrecord_lock.count++;
 610                 return 0;
 611         }
 612
 613         if (tdb->allrecord_lock.count) {
 614                 /* a global lock of a different type exists */
 615                 tdb->ecode = TDB_ERR_LOCK;
 616                 return -1;
 617         }
 618
 619         if (tdb_have_extra_locks(tdb)) {
 620                 /* can't combine global and chain locks */
 621                 tdb->ecode = TDB_ERR_LOCK;
 622                 return -1;
 623         }
 624
 625         if (upgradable && ltype != F_RDLCK) {
 626                 /* tdb error: you can't upgrade a write lock! */
 627                 tdb->ecode = TDB_ERR_LOCK;
 628                 return -1;
 629         }
 630         return 1;
 631 }
 632
 633 /* We only need to lock individual bytes, but Linux merges consecutive locks
 634  * so we lock in contiguous ranges. */
 635 static int tdb_chainlock_gradual(struct tdb_context *tdb,
 636                                  int ltype, enum tdb_lock_flags flags,
 637                                  size_t off, size_t len)
 638 {
 639         int ret;
 640         enum tdb_lock_flags nb_flags = (flags & ~TDB_LOCK_WAIT);
 641
 642         if (len <= 4) {
 643                 /* Single record.  Just do blocking lock. */
 644                 return tdb_brlock(tdb, ltype, off, len, flags);
 645         }
 646
 647         /* First we try non-blocking. */
 648         ret = tdb_brlock(tdb, ltype, off, len, nb_flags);
 649         if (ret == 0) {
 650                 return 0;
 651         }
 652
 653         /* Try locking first half, then second. */
 654         ret = tdb_chainlock_gradual(tdb, ltype, flags, off, len / 2);
 655         if (ret == -1)
 656                 return -1;
 657
 658         ret = tdb_chainlock_gradual(tdb, ltype, flags,
 659                                     off + len / 2, len - len / 2);
 660         if (ret == -1) {
 661                 tdb_brunlock(tdb, ltype, off, len / 2);
 662                 return -1;
 663         }
 664         return 0;
 665 }
 666
 667 /* lock/unlock entire database.  It can only be upgradable if you have some
 668  * other way of guaranteeing exclusivity (ie. transaction write lock).
 669  * We do the locking gradually to avoid being starved by smaller locks. */
 670 int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
 671                        enum tdb_lock_flags flags, bool upgradable)
 672 {
 673         int ret;
 674
 675         switch (tdb_allrecord_check(tdb, ltype, flags, upgradable)) {
 676         case -1:
 677                 return -1;
 678         case 0:
 679                 return 0;
 680         }
 681
 682         /* We cover two kinds of locks:
 683          * 1) Normal chain locks.  Taken for almost all operations.
 684          * 2) Individual records locks.  Taken after normal or free
 685          *    chain locks.
 686          *
 687          * It is (1) which cause the starvation problem, so we're only
 688          * gradual for that. */
 689
 690         if (tdb_have_mutexes(tdb)) {
 691                 ret = tdb_mutex_allrecord_lock(tdb, ltype, flags);
 692         } else {
 693                 ret = tdb_chainlock_gradual(tdb, ltype, flags, FREELIST_TOP,
 694                                             tdb->hash_size * 4);
 695         }
 696
 697         if (ret == -1) {
 698                 return -1;
 699         }
 700
 701         /* Grab individual record locks. */
 702         if (tdb_brlock(tdb, ltype, lock_offset(tdb->hash_size), 0,
 703                        flags) == -1) {
 704                 if (tdb_have_mutexes(tdb)) {
 705                         tdb_mutex_allrecord_unlock(tdb);
 706                 } else {
 707                         tdb_brunlock(tdb, ltype, FREELIST_TOP,
 708                                      tdb->hash_size * 4);
 709                 }
 710                 return -1;
 711         }
 712
 713         tdb->allrecord_lock.count = 1;
 714         /* If it's upgradable, it's actually exclusive so we can treat
 715          * it as a write lock. */
 716         tdb->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
 717         tdb->allrecord_lock.off = upgradable;
 718
 719         if (tdb_needs_recovery(tdb)) {
 720                 bool mark = flags & TDB_LOCK_MARK_ONLY;
 721                 tdb_allrecord_unlock(tdb, ltype, mark);
 722                 if (mark) {
 723                         tdb->ecode = TDB_ERR_LOCK;
 724                         TDB_LOG((tdb, TDB_DEBUG_ERROR,
 725                                  "tdb_lockall_mark cannot do recovery\n"));
 726                         return -1;
 727                 }
 728                 if (tdb_lock_and_recover(tdb) == -1) {
 729                         return -1;
 730                 }
 731                 return tdb_allrecord_lock(tdb, ltype, flags, upgradable);
 732         }
 733
 734         return 0;
 735 }
 736
 737
 738
 739 /* unlock entire db */
 740 int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype, bool mark_lock)
 741 {
 742         /* There are no locks on read-only dbs */
 743         if (tdb->read_only || tdb->traverse_read) {
 744                 tdb->ecode = TDB_ERR_LOCK;
 745                 return -1;
 746         }
 747
 748         if (tdb->allrecord_lock.count == 0) {
 749                 tdb->ecode = TDB_ERR_LOCK;
 750                 return -1;
 751         }
 752
 753         /* Upgradable locks are marked as write locks. */
 754         if (tdb->allrecord_lock.ltype != (uint32_t)ltype
 755             && (!tdb->allrecord_lock.off || ltype != F_RDLCK)) {
 756                 tdb->ecode = TDB_ERR_LOCK;
 757                 return -1;
 758         }
 759
 760         if (tdb->allrecord_lock.count > 1) {
 761                 tdb->allrecord_lock.count--;
 762                 return 0;
 763         }
 764
 765         if (!mark_lock) {
 766                 int ret;
 767
 768                 if (tdb_have_mutexes(tdb)) {
 769                         ret = tdb_mutex_allrecord_unlock(tdb);
 770                         if (ret == 0) {
 771                                 ret = tdb_brunlock(tdb, ltype,
 772                                                    lock_offset(tdb->hash_size),
 773                                                    0);
 774                         }
 775                 } else {
 776                         ret = tdb_brunlock(tdb, ltype, FREELIST_TOP, 0);
 777                 }
 778
 779                 if (ret != 0) {
 780                         TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlockall failed "
 781                                  "(%s)\n", strerror(errno)));
 782                         return -1;
 783                 }
 784         }
 785
 786         tdb->allrecord_lock.count = 0;
 787         tdb->allrecord_lock.ltype = 0;
 788
 789         return 0;
 790 }
 791
 792 /* lock entire database with write lock */
 793 _PUBLIC_ int tdb_lockall(struct tdb_context *tdb)
 794 {
 795         tdb_trace(tdb, "tdb_lockall");
 796         return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false);
 797 }
 798
 799 /* lock entire database with write lock - mark only */
 800 _PUBLIC_ int tdb_lockall_mark(struct tdb_context *tdb)
 801 {
 802         tdb_trace(tdb, "tdb_lockall_mark");
 803         return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_MARK_ONLY, false);
 804 }
 805
 806 /* unlock entire database with write lock - unmark only */
 807 _PUBLIC_ int tdb_lockall_unmark(struct tdb_context *tdb)
 808 {
 809         tdb_trace(tdb, "tdb_lockall_unmark");
 810         return tdb_allrecord_unlock(tdb, F_WRLCK, true);
 811 }
 812
 813 /* lock entire database with write lock - nonblocking varient */
 814 _PUBLIC_ int tdb_lockall_nonblock(struct tdb_context *tdb)
 815 {
 816         int ret = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_NOWAIT, false);
 817         tdb_trace_ret(tdb, "tdb_lockall_nonblock", ret);
 818         return ret;
 819 }
 820
 821 /* unlock entire database with write lock */
 822 _PUBLIC_ int tdb_unlockall(struct tdb_context *tdb)
 823 {
 824         tdb_trace(tdb, "tdb_unlockall");
 825         return tdb_allrecord_unlock(tdb, F_WRLCK, false);
 826 }
 827
 828 /* lock entire database with read lock */
 829 _PUBLIC_ int tdb_lockall_read(struct tdb_context *tdb)
 830 {
 831         tdb_trace(tdb, "tdb_lockall_read");
 832         return tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
 833 }
 834
 835 /* lock entire database with read lock - nonblock varient */
 836 _PUBLIC_ int tdb_lockall_read_nonblock(struct tdb_context *tdb)
 837 {
 838         int ret = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_NOWAIT, false);
 839         tdb_trace_ret(tdb, "tdb_lockall_read_nonblock", ret);
 840         return ret;
 841 }
 842
 843 /* unlock entire database with read lock */
 844 _PUBLIC_ int tdb_unlockall_read(struct tdb_context *tdb)
 845 {
 846         tdb_trace(tdb, "tdb_unlockall_read");
 847         return tdb_allrecord_unlock(tdb, F_RDLCK, false);
 848 }
 849
 850 /* lock/unlock one hash chain. This is meant to be used to reduce
 851    contention - it cannot guarantee how many records will be locked */
 852 _PUBLIC_ int tdb_chainlock(struct tdb_context *tdb, TDB_DATA key)
 853 {
 854         int ret = tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
 855         tdb_trace_1rec(tdb, "tdb_chainlock", key);
 856         return ret;
 857 }
 858
 859 /* lock/unlock one hash chain, non-blocking. This is meant to be used
 860    to reduce contention - it cannot guarantee how many records will be
 861    locked */
 862 _PUBLIC_ int tdb_chainlock_nonblock(struct tdb_context *tdb, TDB_DATA key)
 863 {
 864         int ret = tdb_lock_nonblock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
 865         tdb_trace_1rec_ret(tdb, "tdb_chainlock_nonblock", key, ret);
 866         return ret;
 867 }
 868
 869 /* mark a chain as locked without actually locking it. Warning! use with great caution! */
 870 _PUBLIC_ int tdb_chainlock_mark(struct tdb_context *tdb, TDB_DATA key)
 871 {
 872         int ret = tdb_nest_lock(tdb, lock_offset(BUCKET(tdb->hash_fn(&key))),
 873                                 F_WRLCK, TDB_LOCK_MARK_ONLY);
 874         tdb_trace_1rec(tdb, "tdb_chainlock_mark", key);
 875         return ret;
 876 }
 877
 878 /* unmark a chain as locked without actually locking it. Warning! use with great caution! */
 879 _PUBLIC_ int tdb_chainlock_unmark(struct tdb_context *tdb, TDB_DATA key)
 880 {
 881         tdb_trace_1rec(tdb, "tdb_chainlock_unmark", key);
 882         return tdb_nest_unlock(tdb, lock_offset(BUCKET(tdb->hash_fn(&key))),
 883                                F_WRLCK, true);
 884 }
 885
 886 _PUBLIC_ int tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key)
 887 {
 888         tdb_trace_1rec(tdb, "tdb_chainunlock", key);
 889         return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
 890 }
 891
 892 _PUBLIC_ int tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key)
 893 {
 894         int ret;
 895         ret = tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
 896         tdb_trace_1rec(tdb, "tdb_chainlock_read", key);
 897         return ret;
 898 }
 899
 900 _PUBLIC_ int tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key)
 901 {
 902         tdb_trace_1rec(tdb, "tdb_chainunlock_read", key);
 903         return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
 904 }
 905
 906 _PUBLIC_ int tdb_chainlock_read_nonblock(struct tdb_context *tdb, TDB_DATA key)
 907 {
 908         int ret = tdb_lock_nonblock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
 909         tdb_trace_1rec_ret(tdb, "tdb_chainlock_read_nonblock", key, ret);
 910         return ret;
 911 }
 912
 913 /* record lock stops delete underneath */
 914 int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off)
 915 {
 916         if (tdb->allrecord_lock.count) {
 917                 return 0;
 918         }
 919         return off ? tdb_brlock(tdb, F_RDLCK, off, 1, TDB_LOCK_WAIT) : 0;
 920 }
 921
 922 /*
 923   Write locks override our own fcntl readlocks, so check it here.
 924   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
 925   an error to fail to get the lock here.
 926 */
 927 int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off)
 928 {
 929         struct tdb_traverse_lock *i;
 930         for (i = &tdb->travlocks; i; i = i->next)
 931                 if (i->off == off)
 932                         return -1;
 933         if (tdb->allrecord_lock.count) {
 934                 if (tdb->allrecord_lock.ltype == F_WRLCK) {
 935                         return 0;
 936                 }
 937                 return -1;
 938         }
 939         return tdb_brlock(tdb, F_WRLCK, off, 1, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE);
 940 }
 941
 942 int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off)
 943 {
 944         if (tdb->allrecord_lock.count) {
 945                 return 0;
 946         }
 947         return tdb_brunlock(tdb, F_WRLCK, off, 1);
 948 }
 949
 950 /* fcntl locks don't stack: avoid unlocking someone else's */
 951 int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off)
 952 {
 953         struct tdb_traverse_lock *i;
 954         uint32_t count = 0;
 955
 956         if (tdb->allrecord_lock.count) {
 957                 return 0;
 958         }
 959
 960         if (off == 0)
 961                 return 0;
 962         for (i = &tdb->travlocks; i; i = i->next)
 963                 if (i->off == off)
 964                         count++;
 965         return (count == 1 ? tdb_brunlock(tdb, F_RDLCK, off, 1) : 0);
 966 }
 967
 968 bool tdb_have_extra_locks(struct tdb_context *tdb)
 969 {
 970         unsigned int extra = tdb->num_lockrecs;
 971
 972         /* A transaction holds the lock for all records. */
 973         if (!tdb->transaction && tdb->allrecord_lock.count) {
 974                 return true;
 975         }
 976
 977         /* We always hold the active lock if CLEAR_IF_FIRST. */
 978         if (find_nestlock(tdb, ACTIVE_LOCK)) {
 979                 extra--;
 980         }
 981
 982         /* In a transaction, we expect to hold the transaction lock */
 983         if (tdb->transaction && find_nestlock(tdb, TRANSACTION_LOCK)) {
 984                 extra--;
 985         }
 986
 987         return extra;
 988 }
 989
 990 /* The transaction code uses this to remove all locks. */
 991 void tdb_release_transaction_locks(struct tdb_context *tdb)
 992 {
 993         int i;
 994         unsigned int active = 0;
 995
 996         if (tdb->allrecord_lock.count != 0) {
 997                 tdb_allrecord_unlock(tdb, tdb->allrecord_lock.ltype, false);
 998                 tdb->allrecord_lock.count = 0;
 999         }
1000
1001         for (i=0;i<tdb->num_lockrecs;i++) {
1002                 struct tdb_lock_type *lck = &tdb->lockrecs[i];
1003
1004                 /* Don't release the active lock!  Copy it to first entry. */
1005                 if (lck->off == ACTIVE_LOCK) {
1006                         tdb->lockrecs[active++] = *lck;
1007                 } else {
1008                         tdb_brunlock(tdb, lck->ltype, lck->off, 1);
1009                 }
1010         }
1011         tdb->num_lockrecs = active;
1012 }
1013
1014 /* Following functions are added specifically to support CTDB. */
1015
1016 /* Don't do actual fcntl locking, just mark tdb locked */
1017 int tdb_transaction_write_lock_mark(struct tdb_context *tdb);
1018 _PUBLIC_ int tdb_transaction_write_lock_mark(struct tdb_context *tdb)
1019 {
1020         return tdb_transaction_lock(tdb, F_WRLCK, TDB_LOCK_MARK_ONLY);
1021 }
1022
1023 /* Don't do actual fcntl unlocking, just mark tdb unlocked */
1024 int tdb_transaction_write_lock_unmark(struct tdb_context *tdb);
1025 _PUBLIC_ int tdb_transaction_write_lock_unmark(struct tdb_context *tdb)
1026 {
1027         return tdb_nest_unlock(tdb, TRANSACTION_LOCK, F_WRLCK, true);
1028 }