lib/tdb/common/mutex.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3
   4    trivial database library
   5
   6    Copyright (C) Volker Lendecke 2012,2013
   7    Copyright (C) Stefan Metzmacher 2013,2014
   8    Copyright (C) Michael Adam 2014
   9
  10      ** NOTE! The following LGPL license applies to the tdb
  11      ** library. This does NOT imply that all of Samba is released
  12      ** under the LGPL
  13
  14    This library is free software; you can redistribute it and/or
  15    modify it under the terms of the GNU Lesser General Public
  16    License as published by the Free Software Foundation; either
  17    version 3 of the License, or (at your option) any later version.
  18
  19    This library is distributed in the hope that it will be useful,
  20    but WITHOUT ANY WARRANTY; without even the implied warranty of
  21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  22    Lesser General Public License for more details.
  23
  24    You should have received a copy of the GNU Lesser General Public
  25    License along with this library; if not, see <http://www.gnu.org/licenses/>.
  26 */
  27 #include "tdb_private.h"
  28 #include "system/threads.h"
  29
  30 #ifdef USE_TDB_MUTEX_LOCKING
  31
  32 /*
  33  * If we run with mutexes, we store the "struct tdb_mutexes" at the
  34  * beginning of the file. We store an additional tdb_header right
  35  * beyond the mutex area, page aligned. All the offsets within the tdb
  36  * are relative to the area behind the mutex area. tdb->map_ptr points
  37  * behind the mmap area as well, so the read and write path in the
  38  * mutex case can remain unchanged.
  39  *
  40  * Early in the mutex development the mutexes were placed between the hash
  41  * chain pointers and the real tdb data. This had two drawbacks: First, it
  42  * made pointer calculations more complex. Second, we had to mmap the mutex
  43  * area twice. One was the normal map_ptr in the tdb. This frequently changed
  44  * from within tdb_oob. At least the Linux glibc robust mutex code assumes
  45  * constant pointers in memory, so a constantly changing mmap area destroys
  46  * the mutex list. So we had to mmap the first bytes of the file with a second
  47  * mmap call. With that scheme, very weird errors happened that could be
  48  * easily fixed by doing the mutex mmap in a second file. It seemed that
  49  * mapping the same memory area twice does not end up in accessing the same
  50  * physical page, looking at the mutexes in gdb it seemed that old data showed
  51  * up after some re-mapping. To avoid a separate mutex file, the code now puts
  52  * the real content of the tdb file after the mutex area. This way we do not
  53  * have overlapping mmap areas, the mutex area is mmapped once and not
  54  * changed, the tdb data area's mmap is constantly changed but does not
  55  * overlap.
  56  */
  57
/*
 * Layout of the mutex area that is mmapped from the start of the tdb
 * file (see tdb_mutex_mmap()).
 *
 * The struct is declared with a single hashchains[] element; the real
 * number of mutexes is tdb->hash_size + 1, and tdb_mutex_size() adds
 * the room for the additional hash_size entries.
 */
struct tdb_mutexes {
        /* Copy of the tdb header stored at the start of the mutex area */
        struct tdb_header hdr;

        /* protect allrecord_lock */
        pthread_mutex_t allrecord_mutex;

        /*
         * F_UNLCK: free,
         * F_RDLCK: shared,
         * F_WRLCK: exclusive
         */
        short int allrecord_lock;

        /*
         * Index 0 is the freelist mutex, followed by
         * one mutex per hashchain.
         */
        pthread_mutex_t hashchains[1];
};
  77
  78 bool tdb_have_mutexes(struct tdb_context *tdb)
  79 {
  80         return ((tdb->feature_flags & TDB_FEATURE_FLAG_MUTEX) != 0);
  81 }
  82
  83 size_t tdb_mutex_size(struct tdb_context *tdb)
  84 {
  85         size_t mutex_size;
  86
  87         if (!tdb_have_mutexes(tdb)) {
  88                 return 0;
  89         }
  90
  91         mutex_size = sizeof(struct tdb_mutexes);
  92         mutex_size += tdb->hash_size * sizeof(pthread_mutex_t);
  93
  94         return TDB_ALIGN(mutex_size, tdb->page_size);
  95 }
  96
  97 /*
  98  * Get the index for a chain mutex
  99  */
 100 static bool tdb_mutex_index(struct tdb_context *tdb, off_t off, off_t len,
 101                             unsigned *idx)
 102 {
 103         /*
 104          * Weird but true: We fcntl lock 1 byte at an offset 4 bytes before
 105          * the 4 bytes of the freelist start and the hash chain that is about
 106          * to be locked. See lock_offset() where the freelist is -1 vs the
 107          * "+1" in TDB_HASH_TOP(). Because the mutex array is represented in
 108          * the tdb file itself as data, we need to adjust the offset here.
 109          */
 110         const off_t freelist_lock_ofs = FREELIST_TOP - sizeof(tdb_off_t);
 111
 112         if (!tdb_have_mutexes(tdb)) {
 113                 return false;
 114         }
 115         if (len != 1) {
 116                 /* Possibly the allrecord lock */
 117                 return false;
 118         }
 119         if (off < freelist_lock_ofs) {
 120                 /* One of the special locks */
 121                 return false;
 122         }
 123         if (tdb->hash_size == 0) {
 124                 /* tdb not initialized yet, called from tdb_open_ex() */
 125                 return false;
 126         }
 127         if (off >= TDB_DATA_START(tdb->hash_size)) {
 128                 /* Single record lock from traverses */
 129                 return false;
 130         }
 131
 132         /*
 133          * Now we know it's a freelist or hash chain lock. Those are always 4
 134          * byte aligned. Paranoia check.
 135          */
 136         if ((off % sizeof(tdb_off_t)) != 0) {
 137                 abort();
 138         }
 139
 140         /*
 141          * Re-index the fcntl offset into an offset into the mutex array
 142          */
 143         off -= freelist_lock_ofs; /* rebase to index 0 */
 144         off /= sizeof(tdb_off_t); /* 0 for freelist 1-n for hashchain */
 145
 146         *idx = off;
 147         return true;
 148 }
 149
 150 static bool tdb_have_mutex_chainlocks(struct tdb_context *tdb)
 151 {
 152         size_t i;
 153
 154         for (i=0; i < tdb->num_lockrecs; i++) {
 155                 bool ret;
 156                 unsigned idx;
 157
 158                 ret = tdb_mutex_index(tdb,
 159                                       tdb->lockrecs[i].off,
 160                                       tdb->lockrecs[i].count,
 161                                       &idx);
 162                 if (!ret) {
 163                         continue;
 164                 }
 165
 166                 if (idx == 0) {
 167                         /* this is the freelist mutex */
 168                         continue;
 169                 }
 170
 171                 return true;
 172         }
 173
 174         return false;
 175 }
 176
 177 static int chain_mutex_lock(pthread_mutex_t *m, bool waitflag)
 178 {
 179         int ret;
 180
 181         if (waitflag) {
 182                 ret = pthread_mutex_lock(m);
 183         } else {
 184                 ret = pthread_mutex_trylock(m);
 185         }
 186         if (ret != EOWNERDEAD) {
 187                 return ret;
 188         }
 189
 190         /*
 191          * For chainlocks, we don't do any cleanup (yet?)
 192          */
 193         return pthread_mutex_consistent(m);
 194 }
 195
 196 static int allrecord_mutex_lock(struct tdb_mutexes *m, bool waitflag)
 197 {
 198         int ret;
 199
 200         if (waitflag) {
 201                 ret = pthread_mutex_lock(&m->allrecord_mutex);
 202         } else {
 203                 ret = pthread_mutex_trylock(&m->allrecord_mutex);
 204         }
 205         if (ret != EOWNERDEAD) {
 206                 return ret;
 207         }
 208
 209         /*
 210          * The allrecord lock holder died. We need to reset the allrecord_lock
 211          * to F_UNLCK. This should also be the indication for
 212          * tdb_needs_recovery.
 213          */
 214         m->allrecord_lock = F_UNLCK;
 215
 216         return pthread_mutex_consistent(&m->allrecord_mutex);
 217 }
 218
/*
 * Try to satisfy a lock request with a mutex.
 *
 * tdb:      the tdb context
 * rw:       requested lock type; only compared against F_RDLCK here
 * off, len: the lock range, translated by tdb_mutex_index()
 * waitflag: block (true) or use trylock semantics (false)
 * pret:     receives 0 on success or -1 on failure (errno is set)
 *
 * Returns false if the range is not backed by a mutex (*pret is not
 * touched), true if the request was handled here and *pret is valid.
 */
bool tdb_mutex_lock(struct tdb_context *tdb, int rw, off_t off, off_t len,
                    bool waitflag, int *pret)
{
        struct tdb_mutexes *m = tdb->mutexes;
        pthread_mutex_t *chain;
        int ret;
        unsigned idx;
        bool allrecord_ok;

        if (!tdb_mutex_index(tdb, off, len, &idx)) {
                return false;
        }
        chain = &m->hashchains[idx];

again:
        ret = chain_mutex_lock(chain, waitflag);
        /* trylock reports EBUSY; report contention as EAGAIN instead */
        if (ret == EBUSY) {
                ret = EAGAIN;
        }
        if (ret != 0) {
                errno = ret;
                goto fail;
        }

        if (idx == 0) {
                /*
                 * This is a freelist lock, which is independent to
                 * the allrecord lock. So we're done once we got the
                 * freelist mutex.
                 */
                *pret = 0;
                return true;
        }

        if (tdb_have_mutex_chainlocks(tdb)) {
                /*
                 * We can only check the allrecord lock once. If we do it with
                 * one chain mutex locked, we will deadlock with the allrecord
                 * locker process in the following way: We lock the first hash
                 * chain, we check for the allrecord lock. We keep the hash
                 * chain locked. Then the allrecord locker locks the
                 * allrecord_mutex. It walks the list of chain mutexes,
                 * locking them all in sequence. Meanwhile, we have the chain
                 * mutex locked, so the allrecord locker blocks trying to lock
                 * our chain mutex. Then we come in and try to lock the second
                 * chain lock, which in most cases will be the freelist. We
                 * see that the allrecord lock is locked and put ourselves on
                 * the allrecord_mutex. This will never be signalled though
                 * because the allrecord locker waits for us to give up the
                 * chain lock.
                 */

                *pret = 0;
                return true;
        }

        /*
         * Check if someone has the allrecord lock: queue if so.
         */

        allrecord_ok = false;

        if (m->allrecord_lock == F_UNLCK) {
                /*
                 * allrecord lock not taken
                 */
                allrecord_ok = true;
        }

        if ((m->allrecord_lock == F_RDLCK) && (rw == F_RDLCK)) {
                /*
                 * allrecord shared lock taken, but we only want to read
                 */
                allrecord_ok = true;
        }

        if (allrecord_ok) {
                *pret = 0;
                return true;
        }

        /*
         * The allrecord lock conflicts with our request. Drop the chain
         * mutex, wait on the allrecord mutex until its holder is done,
         * release it again right away and retry from the top.
         */
        ret = pthread_mutex_unlock(chain);
        if (ret != 0) {
                TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
                         "(chain_mutex) failed: %s\n", strerror(ret)));
                errno = ret;
                goto fail;
        }
        ret = allrecord_mutex_lock(m, waitflag);
        if (ret == EBUSY) {
                ret = EAGAIN;
        }
        if (ret != 0) {
                /* plain contention in the non-blocking case is not logged */
                if (waitflag || (ret != EAGAIN)) {
                        TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_%slock"
                                 "(allrecord_mutex) failed: %s\n",
                                 waitflag ? "" : "try_",  strerror(ret)));
                }
                errno = ret;
                goto fail;
        }
        ret = pthread_mutex_unlock(&m->allrecord_mutex);
        if (ret != 0) {
                TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
                         "(allrecord_mutex) failed: %s\n", strerror(ret)));
                errno = ret;
                goto fail;
        }
        goto again;

fail:
        /* handled here, but failed: errno has been set by the caller path */
        *pret = -1;
        return true;
}
 333
 334 bool tdb_mutex_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len,
 335                       int *pret)
 336 {
 337         struct tdb_mutexes *m = tdb->mutexes;
 338         pthread_mutex_t *chain;
 339         int ret;
 340         unsigned idx;
 341
 342         if (!tdb_mutex_index(tdb, off, len, &idx)) {
 343                 return false;
 344         }
 345         chain = &m->hashchains[idx];
 346
 347         ret = pthread_mutex_unlock(chain);
 348         if (ret == 0) {
 349                 *pret = 0;
 350                 return true;
 351         }
 352         errno = ret;
 353         *pret = -1;
 354         return true;
 355 }
 356
 357 int tdb_mutex_allrecord_lock(struct tdb_context *tdb, int ltype,
 358                              enum tdb_lock_flags flags)
 359 {
 360         struct tdb_mutexes *m = tdb->mutexes;
 361         int ret;
 362         uint32_t i;
 363         bool waitflag = (flags & TDB_LOCK_WAIT);
 364         int saved_errno;
 365
 366         if (tdb->flags & TDB_NOLOCK) {
 367                 return 0;
 368         }
 369
 370         if (flags & TDB_LOCK_MARK_ONLY) {
 371                 return 0;
 372         }
 373
 374         ret = allrecord_mutex_lock(m, waitflag);
 375         if (!waitflag && (ret == EBUSY)) {
 376                 errno = EAGAIN;
 377                 tdb->ecode = TDB_ERR_LOCK;
 378                 return -1;
 379         }
 380         if (ret != 0) {
 381                 if (!(flags & TDB_LOCK_PROBE)) {
 382                         TDB_LOG((tdb, TDB_DEBUG_TRACE,
 383                                  "allrecord_mutex_lock() failed: %s\n",
 384                                  strerror(ret)));
 385                 }
 386                 tdb->ecode = TDB_ERR_LOCK;
 387                 return -1;
 388         }
 389
 390         if (m->allrecord_lock != F_UNLCK) {
 391                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n",
 392                          (int)m->allrecord_lock));
 393                 goto fail_unlock_allrecord_mutex;
 394         }
 395         m->allrecord_lock = (ltype == F_RDLCK) ? F_RDLCK : F_WRLCK;
 396
 397         for (i=0; i<tdb->hash_size; i++) {
 398
 399                 /* ignore hashchains[0], the freelist */
 400                 pthread_mutex_t *chain = &m->hashchains[i+1];
 401
 402                 ret = chain_mutex_lock(chain, waitflag);
 403                 if (!waitflag && (ret == EBUSY)) {
 404                         errno = EAGAIN;
 405                         goto fail_unroll_allrecord_lock;
 406                 }
 407                 if (ret != 0) {
 408                         if (!(flags & TDB_LOCK_PROBE)) {
 409                                 TDB_LOG((tdb, TDB_DEBUG_TRACE,
 410                                          "chain_mutex_lock() failed: %s\n",
 411                                          strerror(ret)));
 412                         }
 413                         errno = ret;
 414                         goto fail_unroll_allrecord_lock;
 415                 }
 416
 417                 ret = pthread_mutex_unlock(chain);
 418                 if (ret != 0) {
 419                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
 420                                  "(chainlock) failed: %s\n", strerror(ret)));
 421                         errno = ret;
 422                         goto fail_unroll_allrecord_lock;
 423                 }
 424         }
 425         /*
 426          * We leave this routine with m->allrecord_mutex locked
 427          */
 428         return 0;
 429
 430 fail_unroll_allrecord_lock:
 431         m->allrecord_lock = F_UNLCK;
 432
 433 fail_unlock_allrecord_mutex:
 434         saved_errno = errno;
 435         ret = pthread_mutex_unlock(&m->allrecord_mutex);
 436         if (ret != 0) {
 437                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
 438                          "(allrecord_mutex) failed: %s\n", strerror(ret)));
 439         }
 440         errno = saved_errno;
 441         tdb->ecode = TDB_ERR_LOCK;
 442         return -1;
 443 }
 444
 445 int tdb_mutex_allrecord_upgrade(struct tdb_context *tdb)
 446 {
 447         struct tdb_mutexes *m = tdb->mutexes;
 448         int ret;
 449         uint32_t i;
 450
 451         if (tdb->flags & TDB_NOLOCK) {
 452                 return 0;
 453         }
 454
 455         /*
 456          * Our only caller tdb_allrecord_upgrade()
 457          * garantees that we already own the allrecord lock.
 458          *
 459          * Which means m->allrecord_mutex is still locked by us.
 460          */
 461
 462         if (m->allrecord_lock != F_RDLCK) {
 463                 tdb->ecode = TDB_ERR_LOCK;
 464                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n",
 465                          (int)m->allrecord_lock));
 466                 return -1;
 467         }
 468
 469         m->allrecord_lock = F_WRLCK;
 470
 471         for (i=0; i<tdb->hash_size; i++) {
 472
 473                 /* ignore hashchains[0], the freelist */
 474                 pthread_mutex_t *chain = &m->hashchains[i+1];
 475
 476                 ret = chain_mutex_lock(chain, true);
 477                 if (ret != 0) {
 478                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_lock"
 479                                  "(chainlock) failed: %s\n", strerror(ret)));
 480                         goto fail_unroll_allrecord_lock;
 481                 }
 482
 483                 ret = pthread_mutex_unlock(chain);
 484                 if (ret != 0) {
 485                         TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
 486                                  "(chainlock) failed: %s\n", strerror(ret)));
 487                         goto fail_unroll_allrecord_lock;
 488                 }
 489         }
 490
 491         return 0;
 492
 493 fail_unroll_allrecord_lock:
 494         m->allrecord_lock = F_RDLCK;
 495         tdb->ecode = TDB_ERR_LOCK;
 496         return -1;
 497 }
 498
 499 void tdb_mutex_allrecord_downgrade(struct tdb_context *tdb)
 500 {
 501         struct tdb_mutexes *m = tdb->mutexes;
 502
 503         /*
 504          * Our only caller tdb_allrecord_upgrade() (in the error case)
 505          * garantees that we already own the allrecord lock.
 506          *
 507          * Which means m->allrecord_mutex is still locked by us.
 508          */
 509
 510         if (m->allrecord_lock != F_WRLCK) {
 511                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n",
 512                          (int)m->allrecord_lock));
 513                 return;
 514         }
 515
 516         m->allrecord_lock = F_RDLCK;
 517         return;
 518 }
 519
 520
 521 int tdb_mutex_allrecord_unlock(struct tdb_context *tdb)
 522 {
 523         struct tdb_mutexes *m = tdb->mutexes;
 524         short old;
 525         int ret;
 526
 527         if (tdb->flags & TDB_NOLOCK) {
 528                 return 0;
 529         }
 530
 531         /*
 532          * Our only callers tdb_allrecord_unlock() and
 533          * tdb_allrecord_lock() (in the error path)
 534          * garantee that we already own the allrecord lock.
 535          *
 536          * Which means m->allrecord_mutex is still locked by us.
 537          */
 538
 539         if ((m->allrecord_lock != F_RDLCK) && (m->allrecord_lock != F_WRLCK)) {
 540                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "allrecord_lock == %d\n",
 541                          (int)m->allrecord_lock));
 542                 return -1;
 543         }
 544
 545         old = m->allrecord_lock;
 546         m->allrecord_lock = F_UNLCK;
 547
 548         ret = pthread_mutex_unlock(&m->allrecord_mutex);
 549         if (ret != 0) {
 550                 m->allrecord_lock = old;
 551                 TDB_LOG((tdb, TDB_DEBUG_FATAL, "pthread_mutex_unlock"
 552                          "(allrecord_mutex) failed: %s\n", strerror(ret)));
 553                 return -1;
 554         }
 555         return 0;
 556 }
 557
 558 int tdb_mutex_init(struct tdb_context *tdb)
 559 {
 560         struct tdb_mutexes *m;
 561         pthread_mutexattr_t ma;
 562         int i, ret;
 563
 564         ret = tdb_mutex_mmap(tdb);
 565         if (ret == -1) {
 566                 return -1;
 567         }
 568         m = tdb->mutexes;
 569
 570         ret = pthread_mutexattr_init(&ma);
 571         if (ret != 0) {
 572                 goto fail_munmap;
 573         }
 574         ret = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
 575         if (ret != 0) {
 576                 goto fail;
 577         }
 578         ret = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
 579         if (ret != 0) {
 580                 goto fail;
 581         }
 582         ret = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
 583         if (ret != 0) {
 584                 goto fail;
 585         }
 586
 587         for (i=0; i<tdb->hash_size+1; i++) {
 588                 pthread_mutex_t *chain = &m->hashchains[i];
 589
 590                 ret = pthread_mutex_init(chain, &ma);
 591                 if (ret != 0) {
 592                         goto fail;
 593                 }
 594         }
 595
 596         m->allrecord_lock = F_UNLCK;
 597
 598         ret = pthread_mutex_init(&m->allrecord_mutex, &ma);
 599         if (ret != 0) {
 600                 goto fail;
 601         }
 602         ret = 0;
 603 fail:
 604         pthread_mutexattr_destroy(&ma);
 605 fail_munmap:
 606         tdb_mutex_munmap(tdb);
 607
 608         if (ret == 0) {
 609                 return 0;
 610         }
 611
 612         errno = ret;
 613         return -1;
 614 }
 615
 616 int tdb_mutex_mmap(struct tdb_context *tdb)
 617 {
 618         size_t len;
 619         void *ptr;
 620
 621         len = tdb_mutex_size(tdb);
 622         if (len == 0) {
 623                 return 0;
 624         }
 625
 626         ptr = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_FILE,
 627                    tdb->fd, 0);
 628         if (ptr == MAP_FAILED) {
 629                 return -1;
 630         }
 631         tdb->mutexes = (struct tdb_mutexes *)ptr;
 632
 633         return 0;
 634 }
 635
 636 int tdb_mutex_munmap(struct tdb_context *tdb)
 637 {
 638         size_t len;
 639
 640         len = tdb_mutex_size(tdb);
 641         if (len == 0) {
 642                 return 0;
 643         }
 644
 645         return munmap(tdb->mutexes, len);
 646 }
 647
 648 static bool tdb_mutex_locking_cached;
 649
 650 static bool tdb_mutex_locking_supported(void)
 651 {
 652         pthread_mutexattr_t ma;
 653         pthread_mutex_t m;
 654         int ret;
 655         static bool initialized;
 656
 657         if (initialized) {
 658                 return tdb_mutex_locking_cached;
 659         }
 660
 661         initialized = true;
 662
 663         ret = pthread_mutexattr_init(&ma);
 664         if (ret != 0) {
 665                 return false;
 666         }
 667         ret = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
 668         if (ret != 0) {
 669                 goto cleanup_ma;
 670         }
 671         ret = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
 672         if (ret != 0) {
 673                 goto cleanup_ma;
 674         }
 675         ret = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
 676         if (ret != 0) {
 677                 goto cleanup_ma;
 678         }
 679         ret = pthread_mutex_init(&m, &ma);
 680         if (ret != 0) {
 681                 goto cleanup_ma;
 682         }
 683         ret = pthread_mutex_lock(&m);
 684         if (ret != 0) {
 685                 goto cleanup_m;
 686         }
 687         /*
 688          * This makes sure we have real mutexes
 689          * from a threading library instead of just
 690          * stubs from libc.
 691          */
 692         ret = pthread_mutex_lock(&m);
 693         if (ret != EDEADLK) {
 694                 goto cleanup_lock;
 695         }
 696         ret = pthread_mutex_unlock(&m);
 697         if (ret != 0) {
 698                 goto cleanup_m;
 699         }
 700
 701         tdb_mutex_locking_cached = true;
 702         goto cleanup_m;
 703
 704 cleanup_lock:
 705         pthread_mutex_unlock(&m);
 706 cleanup_m:
 707         pthread_mutex_destroy(&m);
 708 cleanup_ma:
 709         pthread_mutexattr_destroy(&ma);
 710         return tdb_mutex_locking_cached;
 711 }
 712
 713 static void (*tdb_robust_mutext_old_handler)(int) = SIG_ERR;
 714 static pid_t tdb_robust_mutex_pid = -1;
 715
 716 static void (*tdb_robust_mutex_setup_sigchild(void (*handler)(int)))(int)
 717 {
 718 #ifdef HAVE_SIGACTION
 719         struct sigaction act;
 720         struct sigaction oldact;
 721
 722         memset(&act, '\0', sizeof(act));
 723
 724         act.sa_handler = handler;
 725 #ifdef SA_RESTART
 726         act.sa_flags = SA_RESTART;
 727 #endif
 728         sigemptyset(&act.sa_mask);
 729         sigaddset(&act.sa_mask, SIGCHLD);
 730         sigaction(SIGCHLD, &act, &oldact);
 731         return oldact.sa_handler;
 732 #else /* !HAVE_SIGACTION */
 733         return NULL;
 734 #endif
 735 }
 736
 737 static void tdb_robust_mutex_handler(int sig)
 738 {
 739         if (tdb_robust_mutex_pid != -1) {
 740                 pid_t pid;
 741                 int status;
 742
 743                 pid = waitpid(tdb_robust_mutex_pid, &status, WNOHANG);
 744                 if (pid == tdb_robust_mutex_pid) {
 745                         tdb_robust_mutex_pid = -1;
 746                         return;
 747                 }
 748         }
 749
 750         if (tdb_robust_mutext_old_handler == SIG_DFL) {
 751                 return;
 752         }
 753         if (tdb_robust_mutext_old_handler == SIG_IGN) {
 754                 return;
 755         }
 756         if (tdb_robust_mutext_old_handler == SIG_ERR) {
 757                 return;
 758         }
 759
 760         tdb_robust_mutext_old_handler(sig);
 761 }
 762
 763 _PUBLIC_ bool tdb_runtime_check_for_robust_mutexes(void)
 764 {
 765         void *ptr;
 766         pthread_mutex_t *m;
 767         pthread_mutexattr_t ma;
 768         int ret = 1;
 769         int pipe_down[2] = { -1, -1 };
 770         int pipe_up[2] = { -1, -1 };
 771         ssize_t nread;
 772         char c = 0;
 773         bool ok;
 774         int status;
 775         static bool initialized;
 776
 777         if (initialized) {
 778                 return tdb_mutex_locking_cached;
 779         }
 780
 781         initialized = true;
 782
 783         ok = tdb_mutex_locking_supported();
 784         if (!ok) {
 785                 return false;
 786         }
 787
 788         tdb_mutex_locking_cached = false;
 789
 790         ptr = mmap(NULL, sizeof(pthread_mutex_t), PROT_READ|PROT_WRITE,
 791                    MAP_SHARED|MAP_ANON, -1 /* fd */, 0);
 792         if (ptr == MAP_FAILED) {
 793                 return false;
 794         }
 795         m = (pthread_mutex_t *)ptr;
 796
 797         ret = pipe(pipe_down);
 798         if (ret != 0) {
 799                 goto cleanup_mmap;
 800         }
 801         ret = pipe(pipe_up);
 802         if (ret != 0) {
 803                 goto cleanup_pipe;
 804         }
 805
 806         ret = pthread_mutexattr_init(&ma);
 807         if (ret != 0) {
 808                 goto cleanup_pipe;
 809         }
 810         ret = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
 811         if (ret != 0) {
 812                 goto cleanup_ma;
 813         }
 814         ret = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
 815         if (ret != 0) {
 816                 goto cleanup_ma;
 817         }
 818         ret = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
 819         if (ret != 0) {
 820                 goto cleanup_ma;
 821         }
 822         ret = pthread_mutex_init(m, &ma);
 823         if (ret != 0) {
 824                 goto cleanup_ma;
 825         }
 826
 827         tdb_robust_mutext_old_handler = tdb_robust_mutex_setup_sigchild(
 828                                                 tdb_robust_mutex_handler);
 829         if (tdb_robust_mutext_old_handler == NULL) {
 830                 goto cleanup_ma;
 831         }
 832
 833         tdb_robust_mutex_pid = fork();
 834         if (tdb_robust_mutex_pid == 0) {
 835                 size_t nwritten;
 836                 close(pipe_down[1]);
 837                 close(pipe_up[0]);
 838                 ret = pthread_mutex_lock(m);
 839                 nwritten = write(pipe_up[1], &ret, sizeof(ret));
 840                 if (nwritten != sizeof(ret)) {
 841                         _exit(1);
 842                 }
 843                 if (ret != 0) {
 844                         _exit(1);
 845                 }
 846                 nread = read(pipe_down[0], &c, 1);
 847                 if (nread != 1) {
 848                         _exit(1);
 849                 }
 850                 /* leave locked */
 851                 _exit(0);
 852         }
 853         if (tdb_robust_mutex_pid == -1) {
 854                 goto cleanup_sig_child;
 855         }
 856         close(pipe_down[0]);
 857         pipe_down[0] = -1;
 858         close(pipe_up[1]);
 859         pipe_up[1] = -1;
 860
 861         nread = read(pipe_up[0], &ret, sizeof(ret));
 862         if (nread != sizeof(ret)) {
 863                 goto cleanup_child;
 864         }
 865
 866         ret = pthread_mutex_trylock(m);
 867         if (ret != EBUSY) {
 868                 if (ret == 0) {
 869                         pthread_mutex_unlock(m);
 870                 }
 871                 goto cleanup_child;
 872         }
 873
 874         if (write(pipe_down[1], &c, 1) != 1) {
 875                 goto cleanup_child;
 876         }
 877
 878         nread = read(pipe_up[0], &c, 1);
 879         if (nread != 0) {
 880                 goto cleanup_child;
 881         }
 882
 883         while (tdb_robust_mutex_pid > 0) {
 884                 pid_t pid;
 885
 886                 errno = 0;
 887                 pid = waitpid(tdb_robust_mutex_pid, &status, 0);
 888                 if (pid == tdb_robust_mutex_pid) {
 889                         tdb_robust_mutex_pid = -1;
 890                         break;
 891                 }
 892                 if (pid == -1 && errno != EINTR) {
 893                         goto cleanup_child;
 894                 }
 895         }
 896         tdb_robust_mutex_setup_sigchild(tdb_robust_mutext_old_handler);
 897
 898         ret = pthread_mutex_trylock(m);
 899         if (ret != EOWNERDEAD) {
 900                 if (ret == 0) {
 901                         pthread_mutex_unlock(m);
 902                 }
 903                 goto cleanup_m;
 904         }
 905
 906         ret = pthread_mutex_consistent(m);
 907         if (ret != 0) {
 908                 goto cleanup_m;
 909         }
 910
 911         ret = pthread_mutex_trylock(m);
 912         if (ret != EDEADLK) {
 913                 pthread_mutex_unlock(m);
 914                 goto cleanup_m;
 915         }
 916
 917         ret = pthread_mutex_unlock(m);
 918         if (ret != 0) {
 919                 goto cleanup_m;
 920         }
 921
 922         tdb_mutex_locking_cached = true;
 923         goto cleanup_m;
 924
 925 cleanup_child:
 926         while (tdb_robust_mutex_pid > 0) {
 927                 pid_t pid;
 928
 929                 kill(tdb_robust_mutex_pid, SIGKILL);
 930
 931                 errno = 0;
 932                 pid = waitpid(tdb_robust_mutex_pid, &status, 0);
 933                 if (pid == tdb_robust_mutex_pid) {
 934                         tdb_robust_mutex_pid = -1;
 935                         break;
 936                 }
 937                 if (pid == -1 && errno != EINTR) {
 938                         break;
 939                 }
 940         }
 941 cleanup_sig_child:
 942         tdb_robust_mutex_setup_sigchild(tdb_robust_mutext_old_handler);
 943 cleanup_m:
 944         pthread_mutex_destroy(m);
 945 cleanup_ma:
 946         pthread_mutexattr_destroy(&ma);
 947 cleanup_pipe:
 948         if (pipe_down[0] != -1) {
 949                 close(pipe_down[0]);
 950         }
 951         if (pipe_down[1] != -1) {
 952                 close(pipe_down[1]);
 953         }
 954         if (pipe_up[0] != -1) {
 955                 close(pipe_up[0]);
 956         }
 957         if (pipe_up[1] != -1) {
 958                 close(pipe_up[1]);
 959         }
 960 cleanup_mmap:
 961         munmap(ptr, sizeof(pthread_mutex_t));
 962
 963         return tdb_mutex_locking_cached;
 964 }
 965
 966 #else
 967
/*
 * Built without USE_TDB_MUTEX_LOCKING: there is no mutex area, so its
 * size is always 0.
 */
size_t tdb_mutex_size(struct tdb_context *tdb)
{
        return 0;
}
 972
/*
 * Built without USE_TDB_MUTEX_LOCKING: no tdb ever uses mutexes.
 */
bool tdb_have_mutexes(struct tdb_context *tdb)
{
        return false;
}
 977
/*
 * Built without USE_TDB_MUTEX_LOCKING: always fails with
 * tdb->ecode set to TDB_ERR_LOCK.
 */
int tdb_mutex_allrecord_lock(struct tdb_context *tdb, int ltype,
                             enum tdb_lock_flags flags)
{
        tdb->ecode = TDB_ERR_LOCK;
        return -1;
}
 984
/*
 * Built without USE_TDB_MUTEX_LOCKING: always fails with -1.
 */
int tdb_mutex_allrecord_unlock(struct tdb_context *tdb)
{
        return -1;
}
 989
/*
 * Built without USE_TDB_MUTEX_LOCKING: always fails with
 * tdb->ecode set to TDB_ERR_LOCK.
 */
int tdb_mutex_allrecord_upgrade(struct tdb_context *tdb)
{
        tdb->ecode = TDB_ERR_LOCK;
        return -1;
}
 995
/*
 * Built without USE_TDB_MUTEX_LOCKING: nothing to downgrade, no-op.
 */
void tdb_mutex_allrecord_downgrade(struct tdb_context *tdb)
{
        return;
}
1000
/*
 * Built without USE_TDB_MUTEX_LOCKING: always fails with
 * errno set to ENOSYS.
 */
int tdb_mutex_mmap(struct tdb_context *tdb)
{
        errno = ENOSYS;
        return -1;
}
1006
/*
 * Built without USE_TDB_MUTEX_LOCKING: always fails with
 * errno set to ENOSYS.
 */
int tdb_mutex_munmap(struct tdb_context *tdb)
{
        errno = ENOSYS;
        return -1;
}
1012
/*
 * Built without USE_TDB_MUTEX_LOCKING: always fails with
 * errno set to ENOSYS.
 */
int tdb_mutex_init(struct tdb_context *tdb)
{
        errno = ENOSYS;
        return -1;
}
1018
/*
 * Built without USE_TDB_MUTEX_LOCKING: robust mutexes are never
 * reported as available.
 */
_PUBLIC_ bool tdb_runtime_check_for_robust_mutexes(void)
{
        return false;
}
1023
1024 #endif