lib/tdb/common/lock.c

   1  /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Andrew Tridgell              1999-2005
   7    Copyright (C) Paul `Rusty' Russell              2000
   8    Copyright (C) Jeremy Allison                    2000-2003
   9
  10      ** NOTE! The following LGPL license applies to the tdb
  11      ** library. This does NOT imply that all of Samba is released
  12      ** under the LGPL
  13
  14    This library is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU Lesser General Public
  16    License as published by the Free Software Foundation; either
  17    version 3 of the License, or (at your option) any later version.
  18
  19    This library is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    Lesser General Public License for more details.
  23
  24    You should have received a copy of the GNU Lesser General Public
  25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  26 */
  27
  28 #include "tdb_private.h"
  29
/*
 * Register the flag that lets a blocking lock attempt be abandoned.
 *
 * Stores ptr in tdb->interrupt_sig_ptr.  tdb_brlock() reads the pointed-to
 * value when fcntl() fails with EINTR and stops retrying if it is non-zero.
 */
_PUBLIC_ void tdb_setalarm_sigptr(struct tdb_context *tdb, volatile sig_atomic_t *ptr)
{
        tdb->interrupt_sig_ptr = ptr;
}
  34
  35 static int fcntl_lock(struct tdb_context *tdb,
  36                       int rw, off_t off, off_t len, bool waitflag)
  37 {
  38         struct flock fl;
  39         int cmd;
  40
  41         fl.l_type = rw;
  42         fl.l_whence = SEEK_SET;
  43         fl.l_start = off;
  44         fl.l_len = len;
  45         fl.l_pid = 0;
  46
  47         cmd = waitflag ? F_SETLKW : F_SETLK;
  48
  49         return fcntl(tdb->fd, cmd, &fl);
  50 }
  51
  52 static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
  53 {
  54         struct flock fl;
  55 #if 0 /* Check they matched up locks and unlocks correctly. */
  56         char line[80];
  57         FILE *locks;
  58         bool found = false;
  59
  60         locks = fopen("/proc/locks", "r");
  61
  62         while (fgets(line, 80, locks)) {
  63                 char *p;
  64                 int type, start, l;
  65
  66                 /* eg. 1: FLOCK  ADVISORY  WRITE 2440 08:01:2180826 0 EOF */
  67                 p = strchr(line, ':') + 1;
  68                 if (strncmp(p, " POSIX  ADVISORY  ", strlen(" POSIX  ADVISORY  ")))
  69                         continue;
  70                 p += strlen(" FLOCK  ADVISORY  ");
  71                 if (strncmp(p, "READ  ", strlen("READ  ")) == 0)
  72                         type = F_RDLCK;
  73                 else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
  74                         type = F_WRLCK;
  75                 else
  76                         abort();
  77                 p += 6;
  78                 if (atoi(p) != getpid())
  79                         continue;
  80                 p = strchr(strchr(p, ' ') + 1, ' ') + 1;
  81                 start = atoi(p);
  82                 p = strchr(p, ' ') + 1;
  83                 if (strncmp(p, "EOF", 3) == 0)
  84                         l = 0;
  85                 else
  86                         l = atoi(p) - start + 1;
  87
  88                 if (off == start) {
  89                         if (len != l) {
  90                                 fprintf(stderr, "Len %u should be %u: %s",
  91                                         (int)len, l, line);
  92                                 abort();
  93                         }
  94                         if (type != rw) {
  95                                 fprintf(stderr, "Type %s wrong: %s",
  96                                         rw == F_RDLCK ? "READ" : "WRITE", line);
  97                                 abort();
  98                         }
  99                         found = true;
 100                         break;
 101                 }
 102         }
 103
 104         if (!found) {
 105                 fprintf(stderr, "Unlock on %u@%u not found!\n",
 106                         (int)off, (int)len);
 107                 abort();
 108         }
 109
 110         fclose(locks);
 111 #endif
 112
 113         fl.l_type = F_UNLCK;
 114         fl.l_whence = SEEK_SET;
 115         fl.l_start = off;
 116         fl.l_len = len;
 117         fl.l_pid = 0;
 118
 119         return fcntl(tdb->fd, F_SETLKW, &fl);
 120 }
 121
/*
 * Map a list number to the file offset of its lock byte.
 *
 * list -1 is the alloc list, otherwise a hash chain.  List 0 maps to
 * FREELIST_TOP and consecutive lists are 4 bytes apart.
 */
static tdb_off_t lock_offset(int list)
{
        return FREELIST_TOP + 4*list;
}
 127
 128 /* a byte range locking function - return 0 on success
 129    this functions locks/unlocks "len" byte at the specified offset.
 130
 131    On error, errno is also set so that errors are passed back properly
 132    through tdb_open().
 133
 134    note that a len of zero means lock to end of file
 135 */
 136 int tdb_brlock(struct tdb_context *tdb,
 137                int rw_type, tdb_off_t offset, size_t len,
 138                enum tdb_lock_flags flags)
 139 {
 140         int ret;
 141
 142         if (tdb->flags & TDB_NOLOCK) {
 143                 return 0;
 144         }
 145
 146         if (flags & TDB_LOCK_MARK_ONLY) {
 147                 return 0;
 148         }
 149
 150         if ((rw_type == F_WRLCK) && (tdb->read_only || tdb->traverse_read)) {
 151                 tdb->ecode = TDB_ERR_RDONLY;
 152                 return -1;
 153         }
 154
 155         do {
 156                 ret = fcntl_lock(tdb, rw_type, offset, len,
 157                                  flags & TDB_LOCK_WAIT);
 158                 /* Check for a sigalarm break. */
 159                 if (ret == -1 && errno == EINTR &&
 160                                 tdb->interrupt_sig_ptr &&
 161                                 *tdb->interrupt_sig_ptr) {
 162                         break;
 163                 }
 164         } while (ret == -1 && errno == EINTR);
 165
 166         if (ret == -1) {
 167                 tdb->ecode = TDB_ERR_LOCK;
 168                 /* Generic lock error. errno set by fcntl.
 169                  * EAGAIN is an expected return from non-blocking
 170                  * locks. */
 171                 if (!(flags & TDB_LOCK_PROBE) && errno != EAGAIN) {
 172                         TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brlock failed (fd=%d) at offset %u rw_type=%d flags=%d len=%zu\n",
 173                                  tdb->fd, offset, rw_type, flags, len));
 174                 }
 175                 return -1;
 176         }
 177         return 0;
 178 }
 179
 180 int tdb_brunlock(struct tdb_context *tdb,
 181                  int rw_type, tdb_off_t offset, size_t len)
 182 {
 183         int ret;
 184
 185         if (tdb->flags & TDB_NOLOCK) {
 186                 return 0;
 187         }
 188
 189         do {
 190                 ret = fcntl_unlock(tdb, rw_type, offset, len);
 191         } while (ret == -1 && errno == EINTR);
 192
 193         if (ret == -1) {
 194                 TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brunlock failed (fd=%d) at offset %u rw_type=%u len=%zu\n",
 195                          tdb->fd, offset, rw_type, len));
 196         }
 197         return ret;
 198 }
 199
 200 /*
 201  * Do a tdb_brlock in a loop. Some OSes (such as solaris) have too
 202  * conservative deadlock detection and claim a deadlock when progress can be
 203  * made. For those OSes we may loop for a while.
 204  */
 205
 206 static int tdb_brlock_retry(struct tdb_context *tdb,
 207                             int rw_type, tdb_off_t offset, size_t len,
 208                             enum tdb_lock_flags flags)
 209 {
 210         int count = 1000;
 211
 212         while (count--) {
 213                 struct timeval tv;
 214                 int ret;
 215
 216                 ret = tdb_brlock(tdb, rw_type, offset, len, flags);
 217                 if (ret == 0) {
 218                         return 0;
 219                 }
 220                 if (errno != EDEADLK) {
 221                         break;
 222                 }
 223                 /* sleep for as short a time as we can - more portable than usleep() */
 224                 tv.tv_sec = 0;
 225                 tv.tv_usec = 1;
 226                 select(0, NULL, NULL, NULL, &tv);
 227         }
 228         return -1;
 229 }
 230
 231 /*
 232   upgrade a read lock to a write lock.
 233 */
 234 int tdb_allrecord_upgrade(struct tdb_context *tdb)
 235 {
 236         int ret;
 237
 238         if (tdb->allrecord_lock.count != 1) {
 239                 TDB_LOG((tdb, TDB_DEBUG_ERROR,
 240                          "tdb_allrecord_upgrade failed: count %u too high\n",
 241                          tdb->allrecord_lock.count));
 242                 return -1;
 243         }
 244
 245         if (tdb->allrecord_lock.off != 1) {
 246                 TDB_LOG((tdb, TDB_DEBUG_ERROR,
 247                          "tdb_allrecord_upgrade failed: already upgraded?\n"));
 248                 return -1;
 249         }
 250
 251         ret = tdb_brlock_retry(tdb, F_WRLCK, FREELIST_TOP, 0,
 252                                TDB_LOCK_WAIT|TDB_LOCK_PROBE);
 253         if (ret == 0) {
 254                 tdb->allrecord_lock.ltype = F_WRLCK;
 255                 tdb->allrecord_lock.off = 0;
 256                 return 0;
 257         }
 258         TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_allrecord_upgrade failed\n"));
 259         return -1;
 260 }
 261
 262 static struct tdb_lock_type *find_nestlock(struct tdb_context *tdb,
 263                                            tdb_off_t offset)
 264 {
 265         unsigned int i;
 266
 267         for (i=0; i<tdb->num_lockrecs; i++) {
 268                 if (tdb->lockrecs[i].off == offset) {
 269                         return &tdb->lockrecs[i];
 270                 }
 271         }
 272         return NULL;
 273 }
 274
 275 /* lock an offset in the database. */
 276 int tdb_nest_lock(struct tdb_context *tdb, uint32_t offset, int ltype,
 277                   enum tdb_lock_flags flags)
 278 {
 279         struct tdb_lock_type *new_lck;
 280
 281         if (offset >= lock_offset(tdb->hash_size)) {
 282                 tdb->ecode = TDB_ERR_LOCK;
 283                 TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_lock: invalid offset %u for ltype=%d\n",
 284                          offset, ltype));
 285                 return -1;
 286         }
 287         if (tdb->flags & TDB_NOLOCK)
 288                 return 0;
 289
 290         new_lck = find_nestlock(tdb, offset);
 291         if (new_lck) {
 292                 /*
 293                  * Just increment the in-memory struct, posix locks
 294                  * don't stack.
 295                  */
 296                 new_lck->count++;
 297                 return 0;
 298         }
 299
 300         if (tdb->num_lockrecs == tdb->lockrecs_array_length) {
 301                 new_lck = (struct tdb_lock_type *)realloc(
 302                         tdb->lockrecs,
 303                         sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1));
 304                 if (new_lck == NULL) {
 305                         errno = ENOMEM;
 306                         return -1;
 307                 }
 308                 tdb->lockrecs_array_length = tdb->num_lockrecs+1;
 309                 tdb->lockrecs = new_lck;
 310         }
 311
 312         /* Since fcntl locks don't nest, we do a lock for the first one,
 313            and simply bump the count for future ones */
 314         if (tdb_brlock(tdb, ltype, offset, 1, flags)) {
 315                 return -1;
 316         }
 317
 318         new_lck = &tdb->lockrecs[tdb->num_lockrecs];
 319
 320         new_lck->off = offset;
 321         new_lck->count = 1;
 322         new_lck->ltype = ltype;
 323         tdb->num_lockrecs++;
 324
 325         return 0;
 326 }
 327
 328 static int tdb_lock_and_recover(struct tdb_context *tdb)
 329 {
 330         int ret;
 331
 332         /* We need to match locking order in transaction commit. */
 333         if (tdb_brlock(tdb, F_WRLCK, FREELIST_TOP, 0, TDB_LOCK_WAIT)) {
 334                 return -1;
 335         }
 336
 337         if (tdb_brlock(tdb, F_WRLCK, OPEN_LOCK, 1, TDB_LOCK_WAIT)) {
 338                 tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0);
 339                 return -1;
 340         }
 341
 342         ret = tdb_transaction_recover(tdb);
 343
 344         tdb_brunlock(tdb, F_WRLCK, OPEN_LOCK, 1);
 345         tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0);
 346
 347         return ret;
 348 }
 349
 350 static bool have_data_locks(const struct tdb_context *tdb)
 351 {
 352         unsigned int i;
 353
 354         for (i = 0; i < tdb->num_lockrecs; i++) {
 355                 if (tdb->lockrecs[i].off >= lock_offset(-1))
 356                         return true;
 357         }
 358         return false;
 359 }
 360
 361 /*
 362  * A allrecord lock allows us to avoid per chain locks. Check if the allrecord
 363  * lock is strong enough.
 364  */
 365 static int tdb_lock_covered_by_allrecord_lock(struct tdb_context *tdb,
 366                                               int ltype)
 367 {
 368         if (ltype == F_RDLCK) {
 369                 /*
 370                  * The allrecord_lock is equal (F_RDLCK) or stronger
 371                  * (F_WRLCK). Pass.
 372                  */
 373                 return 0;
 374         }
 375
 376         if (tdb->allrecord_lock.ltype == F_RDLCK) {
 377                 /*
 378                  * We ask for ltype==F_WRLCK, but the allrecord_lock
 379                  * is too weak. We can't upgrade here, so fail.
 380                  */
 381                 tdb->ecode = TDB_ERR_LOCK;
 382                 return -1;
 383         }
 384
 385         /*
 386          * Asking for F_WRLCK, allrecord is F_WRLCK as well. Pass.
 387          */
 388         return 0;
 389 }
 390
 391 static int tdb_lock_list(struct tdb_context *tdb, int list, int ltype,
 392                          enum tdb_lock_flags waitflag)
 393 {
 394         int ret;
 395         bool check = false;
 396
 397         if (tdb->allrecord_lock.count) {
 398                 return tdb_lock_covered_by_allrecord_lock(tdb, ltype);
 399         }
 400
 401         /*
 402          * Check for recoveries: Someone might have kill -9'ed a process
 403          * during a commit.
 404          */
 405         check = !have_data_locks(tdb);
 406         ret = tdb_nest_lock(tdb, lock_offset(list), ltype, waitflag);
 407
 408         if (ret == 0 && check && tdb_needs_recovery(tdb)) {
 409                 tdb_nest_unlock(tdb, lock_offset(list), ltype, false);
 410
 411                 if (tdb_lock_and_recover(tdb) == -1) {
 412                         return -1;
 413                 }
 414                 return tdb_lock_list(tdb, list, ltype, waitflag);
 415         }
 416         return ret;
 417 }
 418
 419 /* lock a list in the database. list -1 is the alloc list */
 420 int tdb_lock(struct tdb_context *tdb, int list, int ltype)
 421 {
 422         int ret;
 423
 424         ret = tdb_lock_list(tdb, list, ltype, TDB_LOCK_WAIT);
 425         if (ret) {
 426                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lock failed on list %d "
 427                          "ltype=%d (%s)\n",  list, ltype, strerror(errno)));
 428         }
 429         return ret;
 430 }
 431
/*
 * Lock a list in the database without blocking; list -1 is the alloc list.
 *
 * Same as tdb_lock() except that TDB_LOCK_NOWAIT is passed to
 * tdb_lock_list() and a failure is not logged here.  Returns
 * tdb_lock_list()'s result.
 */
_PUBLIC_ int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype)
{
        return tdb_lock_list(tdb, list, ltype, TDB_LOCK_NOWAIT);
}
 437
 438
 439 int tdb_nest_unlock(struct tdb_context *tdb, uint32_t offset, int ltype,
 440                     bool mark_lock)
 441 {
 442         int ret = -1;
 443         struct tdb_lock_type *lck;
 444
 445         if (tdb->flags & TDB_NOLOCK)
 446                 return 0;
 447
 448         /* Sanity checks */
 449         if (offset >= lock_offset(tdb->hash_size)) {
 450                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: offset %u invalid (%d)\n", offset, tdb->hash_size));
 451                 return ret;
 452         }
 453
 454         lck = find_nestlock(tdb, offset);
 455         if ((lck == NULL) || (lck->count == 0)) {
 456                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: count is 0\n"));
 457                 return -1;
 458         }
 459
 460         if (lck->count > 1) {
 461                 lck->count--;
 462                 return 0;
 463         }
 464
 465         /*
 466          * This lock has count==1 left, so we need to unlock it in the
 467          * kernel. We don't bother with decrementing the in-memory array
 468          * element, we're about to overwrite it with the last array element
 469          * anyway.
 470          */
 471
 472         if (mark_lock) {
 473                 ret = 0;
 474         } else {
 475                 ret = tdb_brunlock(tdb, ltype, offset, 1);
 476         }
 477
 478         /*
 479          * Shrink the array by overwriting the element just unlocked with the
 480          * last array element.
 481          */
 482         *lck = tdb->lockrecs[--tdb->num_lockrecs];
 483
 484         /*
 485          * We don't bother with realloc when the array shrinks, but if we have
 486          * a completely idle tdb we should get rid of the locked array.
 487          */
 488
 489         if (ret)
 490                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: An error occurred unlocking!\n"));
 491         return ret;
 492 }
 493
 494 _PUBLIC_ int tdb_unlock(struct tdb_context *tdb, int list, int ltype)
 495 {
 496         /* a global lock allows us to avoid per chain locks */
 497         if (tdb->allrecord_lock.count) {
 498                 return tdb_lock_covered_by_allrecord_lock(tdb, ltype);
 499         }
 500
 501         return tdb_nest_unlock(tdb, lock_offset(list), ltype, false);
 502 }
 503
/*
 * Get the transaction lock.
 *
 * Takes a nested lock of type ltype on the TRANSACTION_LOCK offset, passing
 * lockflags through.  Returns tdb_nest_lock()'s result.
 */
int tdb_transaction_lock(struct tdb_context *tdb, int ltype,
                         enum tdb_lock_flags lockflags)
{
        return tdb_nest_lock(tdb, TRANSACTION_LOCK, ltype, lockflags);
}
 512
/*
 * Release the transaction lock.
 *
 * Drops one nesting level on the TRANSACTION_LOCK offset.  mark_lock is
 * false, so the kernel lock is released when the last level goes.
 * Returns tdb_nest_unlock()'s result.
 */
int tdb_transaction_unlock(struct tdb_context *tdb, int ltype)
{
        return tdb_nest_unlock(tdb, TRANSACTION_LOCK, ltype, false);
}
 520
 521 /* Returns 0 if all done, -1 if error, 1 if ok. */
 522 static int tdb_allrecord_check(struct tdb_context *tdb, int ltype,
 523                                enum tdb_lock_flags flags, bool upgradable)
 524 {
 525         /* There are no locks on read-only dbs */
 526         if (tdb->read_only || tdb->traverse_read) {
 527                 tdb->ecode = TDB_ERR_LOCK;
 528                 return -1;
 529         }
 530
 531         if (tdb->allrecord_lock.count && tdb->allrecord_lock.ltype == ltype) {
 532                 tdb->allrecord_lock.count++;
 533                 return 0;
 534         }
 535
 536         if (tdb->allrecord_lock.count) {
 537                 /* a global lock of a different type exists */
 538                 tdb->ecode = TDB_ERR_LOCK;
 539                 return -1;
 540         }
 541
 542         if (tdb_have_extra_locks(tdb)) {
 543                 /* can't combine global and chain locks */
 544                 tdb->ecode = TDB_ERR_LOCK;
 545                 return -1;
 546         }
 547
 548         if (upgradable && ltype != F_RDLCK) {
 549                 /* tdb error: you can't upgrade a write lock! */
 550                 tdb->ecode = TDB_ERR_LOCK;
 551                 return -1;
 552         }
 553         return 1;
 554 }
 555
 556 /* We only need to lock individual bytes, but Linux merges consecutive locks
 557  * so we lock in contiguous ranges. */
 558 static int tdb_chainlock_gradual(struct tdb_context *tdb,
 559                                  int ltype, enum tdb_lock_flags flags,
 560                                  size_t off, size_t len)
 561 {
 562         int ret;
 563         enum tdb_lock_flags nb_flags = (flags & ~TDB_LOCK_WAIT);
 564
 565         if (len <= 4) {
 566                 /* Single record.  Just do blocking lock. */
 567                 return tdb_brlock(tdb, ltype, off, len, flags);
 568         }
 569
 570         /* First we try non-blocking. */
 571         ret = tdb_brlock(tdb, ltype, off, len, nb_flags);
 572         if (ret == 0) {
 573                 return 0;
 574         }
 575
 576         /* Try locking first half, then second. */
 577         ret = tdb_chainlock_gradual(tdb, ltype, flags, off, len / 2);
 578         if (ret == -1)
 579                 return -1;
 580
 581         ret = tdb_chainlock_gradual(tdb, ltype, flags,
 582                                     off + len / 2, len - len / 2);
 583         if (ret == -1) {
 584                 tdb_brunlock(tdb, ltype, off, len / 2);
 585                 return -1;
 586         }
 587         return 0;
 588 }
 589
/* lock/unlock entire database.  It can only be upgradable if you have some
 * other way of guaranteeing exclusivity (ie. transaction write lock).
 * We do the locking gradually to avoid being starved by smaller locks.
 *
 * Returns 0 on success and -1 on failure.  If the tdb turns out to need
 * recovery once the lock is held, the lock is dropped again, recovery is
 * run and the whole operation is retried; with TDB_LOCK_MARK_ONLY that
 * case fails with TDB_ERR_LOCK instead. */
int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
                       enum tdb_lock_flags flags, bool upgradable)
{
        /* -1: refused; 0: already held, count bumped; 1: carry on below. */
        switch (tdb_allrecord_check(tdb, ltype, flags, upgradable)) {
        case -1:
                return -1;
        case 0:
                return 0;
        }

        /* We cover two kinds of locks:
         * 1) Normal chain locks.  Taken for almost all operations.
         * 2) Individual records locks.  Taken after normal or free
         *    chain locks.
         *
         * It is (1) which cause the starvation problem, so we're only
         * gradual for that. */
        if (tdb_chainlock_gradual(tdb, ltype, flags, FREELIST_TOP,
                                  tdb->hash_size * 4) == -1) {
                return -1;
        }

        /* Grab individual record locks. */
        if (tdb_brlock(tdb, ltype, lock_offset(tdb->hash_size), 0,
                       flags) == -1) {
                /* Undo the chain-lock range taken above. */
                tdb_brunlock(tdb, ltype, FREELIST_TOP,
                             tdb->hash_size * 4);
                return -1;
        }

        tdb->allrecord_lock.count = 1;
        /* If it's upgradable, it's actually exclusive so we can treat
         * it as a write lock. */
        tdb->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
        /* The off field doubles as the "upgradable" marker. */
        tdb->allrecord_lock.off = upgradable;

        if (tdb_needs_recovery(tdb)) {
                bool mark = flags & TDB_LOCK_MARK_ONLY;
                /* Drop what we just took before attempting recovery. */
                tdb_allrecord_unlock(tdb, ltype, mark);
                if (mark) {
                        tdb->ecode = TDB_ERR_LOCK;
                        TDB_LOG((tdb, TDB_DEBUG_ERROR,
                                 "tdb_lockall_mark cannot do recovery\n"));
                        return -1;
                }
                if (tdb_lock_and_recover(tdb) == -1) {
                        return -1;
                }
                /* Recovery done: start over from the top. */
                return tdb_allrecord_lock(tdb, ltype, flags, upgradable);
        }

        return 0;
}
 646
 647
 648
 649 /* unlock entire db */
 650 int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype, bool mark_lock)
 651 {
 652         /* There are no locks on read-only dbs */
 653         if (tdb->read_only || tdb->traverse_read) {
 654                 tdb->ecode = TDB_ERR_LOCK;
 655                 return -1;
 656         }
 657
 658         if (tdb->allrecord_lock.count == 0) {
 659                 tdb->ecode = TDB_ERR_LOCK;
 660                 return -1;
 661         }
 662
 663         /* Upgradable locks are marked as write locks. */
 664         if (tdb->allrecord_lock.ltype != ltype
 665             && (!tdb->allrecord_lock.off || ltype != F_RDLCK)) {
 666                 tdb->ecode = TDB_ERR_LOCK;
 667                 return -1;
 668         }
 669
 670         if (tdb->allrecord_lock.count > 1) {
 671                 tdb->allrecord_lock.count--;
 672                 return 0;
 673         }
 674
 675         if (!mark_lock && tdb_brunlock(tdb, ltype, FREELIST_TOP, 0)) {
 676                 TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlockall failed (%s)\n", strerror(errno)));
 677                 return -1;
 678         }
 679
 680         tdb->allrecord_lock.count = 0;
 681         tdb->allrecord_lock.ltype = 0;
 682
 683         return 0;
 684 }
 685
/*
 * Lock entire database with a write lock.
 *
 * Traces the call, then takes a non-upgradable F_WRLCK allrecord lock with
 * TDB_LOCK_WAIT.  Returns tdb_allrecord_lock()'s result.
 */
_PUBLIC_ int tdb_lockall(struct tdb_context *tdb)
{
        tdb_trace(tdb, "tdb_lockall");
        return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false);
}
 692
/*
 * Lock entire database with a write lock - mark only.
 *
 * Passes TDB_LOCK_MARK_ONLY, for which tdb_brlock() returns success without
 * calling fcntl(); the allrecord lock state is still recorded by
 * tdb_allrecord_lock().
 */
_PUBLIC_ int tdb_lockall_mark(struct tdb_context *tdb)
{
        tdb_trace(tdb, "tdb_lockall_mark");
        return tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_MARK_ONLY, false);
}
 699
/*
 * Unlock entire database with a write lock - unmark only.
 *
 * Calls tdb_allrecord_unlock() with mark_lock set, so the recorded lock
 * state is dropped without issuing a kernel unlock.
 */
_PUBLIC_ int tdb_lockall_unmark(struct tdb_context *tdb)
{
        tdb_trace(tdb, "tdb_lockall_unmark");
        return tdb_allrecord_unlock(tdb, F_WRLCK, true);
}
 706
/*
 * Lock entire database with a write lock - nonblocking variant.
 *
 * Uses TDB_LOCK_NOWAIT; the result is traced and then returned.
 */
_PUBLIC_ int tdb_lockall_nonblock(struct tdb_context *tdb)
{
        int ret = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_NOWAIT, false);
        tdb_trace_ret(tdb, "tdb_lockall_nonblock", ret);
        return ret;
}
 714
/*
 * Unlock entire database with a write lock.
 *
 * Traces the call, then releases the F_WRLCK allrecord lock, including the
 * kernel lock (mark_lock is false).
 */
_PUBLIC_ int tdb_unlockall(struct tdb_context *tdb)
{
        tdb_trace(tdb, "tdb_unlockall");
        return tdb_allrecord_unlock(tdb, F_WRLCK, false);
}
 721
/*
 * Lock entire database with a read lock.
 *
 * Traces the call, then takes a non-upgradable F_RDLCK allrecord lock with
 * TDB_LOCK_WAIT.  Returns tdb_allrecord_lock()'s result.
 */
_PUBLIC_ int tdb_lockall_read(struct tdb_context *tdb)
{
        tdb_trace(tdb, "tdb_lockall_read");
        return tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
}
 728
/*
 * Lock entire database with a read lock - nonblocking variant.
 *
 * Uses TDB_LOCK_NOWAIT; the result is traced and then returned.
 */
_PUBLIC_ int tdb_lockall_read_nonblock(struct tdb_context *tdb)
{
        int ret = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_NOWAIT, false);
        tdb_trace_ret(tdb, "tdb_lockall_read_nonblock", ret);
        return ret;
}
 736
/*
 * Unlock entire database with a read lock.
 *
 * Traces the call, then releases the F_RDLCK allrecord lock, including the
 * kernel lock (mark_lock is false).
 */
_PUBLIC_ int tdb_unlockall_read(struct tdb_context *tdb)
{
        tdb_trace(tdb, "tdb_unlockall_read");
        return tdb_allrecord_unlock(tdb, F_RDLCK, false);
}
 743
/*
 * Write-lock the hash chain that "key" belongs to.  This is meant to be
 * used to reduce contention - it cannot guarantee how many records will be
 * locked.
 *
 * The chain is chosen as BUCKET(tdb->hash_fn(&key)).  The call is traced
 * after the lock attempt, whatever its outcome; tdb_lock()'s result is
 * returned.
 */
_PUBLIC_ int tdb_chainlock(struct tdb_context *tdb, TDB_DATA key)
{
        int ret = tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
        tdb_trace_1rec(tdb, "tdb_chainlock", key);
        return ret;
}
 752
/*
 * Write-lock the hash chain that "key" belongs to, without blocking.  This
 * is meant to be used to reduce contention - it cannot guarantee how many
 * records will be locked.
 *
 * Returns tdb_lock_nonblock()'s result, which is also recorded in the
 * trace.
 */
_PUBLIC_ int tdb_chainlock_nonblock(struct tdb_context *tdb, TDB_DATA key)
{
        int ret = tdb_lock_nonblock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
        tdb_trace_1rec_ret(tdb, "tdb_chainlock_nonblock", key, ret);
        return ret;
}
 762
/*
 * Mark a chain as locked without actually locking it.
 * Warning! use with great caution!
 *
 * Goes straight to tdb_nest_lock() with TDB_LOCK_MARK_ONLY, so the nested
 * lock record for the key's chain is created or bumped while tdb_brlock()
 * skips the fcntl() call.
 */
_PUBLIC_ int tdb_chainlock_mark(struct tdb_context *tdb, TDB_DATA key)
{
        int ret = tdb_nest_lock(tdb, lock_offset(BUCKET(tdb->hash_fn(&key))),
                                F_WRLCK, TDB_LOCK_MARK_ONLY);
        tdb_trace_1rec(tdb, "tdb_chainlock_mark", key);
        return ret;
}
 771
/*
 * Unmark a chain as locked without actually unlocking it.
 * Warning! use with great caution!
 *
 * Calls tdb_nest_unlock() with mark_lock set, so only the nested lock
 * record for the key's chain is released; no kernel unlock is issued.
 */
_PUBLIC_ int tdb_chainlock_unmark(struct tdb_context *tdb, TDB_DATA key)
{
        tdb_trace_1rec(tdb, "tdb_chainlock_unmark", key);
        return tdb_nest_unlock(tdb, lock_offset(BUCKET(tdb->hash_fn(&key))),
                               F_WRLCK, true);
}
 779
/*
 * Release the write lock on the hash chain that "key" belongs to.
 * Traces the call, then returns tdb_unlock()'s result for
 * BUCKET(tdb->hash_fn(&key)).
 */
_PUBLIC_ int tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key)
{
        tdb_trace_1rec(tdb, "tdb_chainunlock", key);
        return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_WRLCK);
}
 785
/*
 * Read-lock the hash chain that "key" belongs to.
 * The call is traced after the lock attempt, whatever its outcome;
 * tdb_lock()'s result is returned.
 */
_PUBLIC_ int tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key)
{
        int ret;
        ret = tdb_lock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
        tdb_trace_1rec(tdb, "tdb_chainlock_read", key);
        return ret;
}
 793
/*
 * Release the read lock on the hash chain that "key" belongs to.
 * Traces the call, then returns tdb_unlock()'s result for
 * BUCKET(tdb->hash_fn(&key)).
 */
_PUBLIC_ int tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key)
{
        tdb_trace_1rec(tdb, "tdb_chainunlock_read", key);
        return tdb_unlock(tdb, BUCKET(tdb->hash_fn(&key)), F_RDLCK);
}
 799
 800 /* record lock stops delete underneath */
 801 int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off)
 802 {
 803         if (tdb->allrecord_lock.count) {
 804                 return 0;
 805         }
 806         return off ? tdb_brlock(tdb, F_RDLCK, off, 1, TDB_LOCK_WAIT) : 0;
 807 }
 808
 809 /*
 810   Write locks override our own fcntl readlocks, so check it here.
 811   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
 812   an error to fail to get the lock here.
 813 */
 814 int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off)
 815 {
 816         struct tdb_traverse_lock *i;
 817         for (i = &tdb->travlocks; i; i = i->next)
 818                 if (i->off == off)
 819                         return -1;
 820         if (tdb->allrecord_lock.count) {
 821                 if (tdb->allrecord_lock.ltype == F_WRLCK) {
 822                         return 0;
 823                 }
 824                 return -1;
 825         }
 826         return tdb_brlock(tdb, F_WRLCK, off, 1, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE);
 827 }
 828
/*
 * Release the write lock on the single byte at "off".
 *
 * Under an allrecord lock nothing is released and 0 is returned;
 * tdb_write_lock_record() takes no kernel lock in that case either.
 */
int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off)
{
        if (tdb->allrecord_lock.count) {
                return 0;
        }
        return tdb_brunlock(tdb, F_WRLCK, off, 1);
}
 836
 837 /* fcntl locks don't stack: avoid unlocking someone else's */
 838 int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off)
 839 {
 840         struct tdb_traverse_lock *i;
 841         uint32_t count = 0;
 842
 843         if (tdb->allrecord_lock.count) {
 844                 return 0;
 845         }
 846
 847         if (off == 0)
 848                 return 0;
 849         for (i = &tdb->travlocks; i; i = i->next)
 850                 if (i->off == off)
 851                         count++;
 852         return (count == 1 ? tdb_brunlock(tdb, F_RDLCK, off, 1) : 0);
 853 }
 854
 855 bool tdb_have_extra_locks(struct tdb_context *tdb)
 856 {
 857         unsigned int extra = tdb->num_lockrecs;
 858
 859         /* A transaction holds the lock for all records. */
 860         if (!tdb->transaction && tdb->allrecord_lock.count) {
 861                 return true;
 862         }
 863
 864         /* We always hold the active lock if CLEAR_IF_FIRST. */
 865         if (find_nestlock(tdb, ACTIVE_LOCK)) {
 866                 extra--;
 867         }
 868
 869         /* In a transaction, we expect to hold the transaction lock */
 870         if (tdb->transaction && find_nestlock(tdb, TRANSACTION_LOCK)) {
 871                 extra--;
 872         }
 873
 874         return extra;
 875 }
 876
 877 /* The transaction code uses this to remove all locks. */
 878 void tdb_release_transaction_locks(struct tdb_context *tdb)
 879 {
 880         unsigned int i, active = 0;
 881
 882         if (tdb->allrecord_lock.count != 0) {
 883                 tdb_allrecord_unlock(tdb, tdb->allrecord_lock.ltype, false);
 884                 tdb->allrecord_lock.count = 0;
 885         }
 886
 887         for (i=0;i<tdb->num_lockrecs;i++) {
 888                 struct tdb_lock_type *lck = &tdb->lockrecs[i];
 889
 890                 /* Don't release the active lock!  Copy it to first entry. */
 891                 if (lck->off == ACTIVE_LOCK) {
 892                         tdb->lockrecs[active++] = *lck;
 893                 } else {
 894                         tdb_brunlock(tdb, lck->ltype, lck->off, 1);
 895                 }
 896         }
 897         tdb->num_lockrecs = active;
 898 }
 899
 900 /* Following functions are added specifically to support CTDB. */
 901
/*
 * Don't do actual fcntl locking, just mark tdb locked.
 *
 * Records a write lock on the transaction lock via tdb_transaction_lock()
 * with TDB_LOCK_MARK_ONLY.  The prototype is repeated here, directly ahead
 * of the definition.
 */
int tdb_transaction_write_lock_mark(struct tdb_context *tdb);
_PUBLIC_ int tdb_transaction_write_lock_mark(struct tdb_context *tdb)
{
        return tdb_transaction_lock(tdb, F_WRLCK, TDB_LOCK_MARK_ONLY);
}
 908
/*
 * Don't do actual fcntl unlocking, just mark tdb unlocked.
 *
 * Drops the nested record for TRANSACTION_LOCK through tdb_nest_unlock()
 * with mark_lock set, so no kernel unlock is issued.  The prototype is
 * repeated here, directly ahead of the definition.
 */
int tdb_transaction_write_lock_unmark(struct tdb_context *tdb);
_PUBLIC_ int tdb_transaction_write_lock_unmark(struct tdb_context *tdb)
{
        return tdb_nest_unlock(tdb, TRANSACTION_LOCK, F_WRLCK, true);
}