source3/lib/g_lock.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    global locks based on dbwrap and messaging
   4    Copyright (C) 2009 by Volker Lendecke
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 3 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 */
  19
  20 #include "replace.h"
  21 #include "system/filesys.h"
  22 #include "lib/util/server_id.h"
  23 #include "lib/util/debug.h"
  24 #include "lib/util/talloc_stack.h"
  25 #include "lib/util/samba_util.h"
  26 #include "lib/util_path.h"
  27 #include "dbwrap/dbwrap.h"
  28 #include "dbwrap/dbwrap_open.h"
  29 #include "dbwrap/dbwrap_watch.h"
  30 #include "g_lock.h"
  31 #include "util_tdb.h"
  32 #include "../lib/util/tevent_ntstatus.h"
  33 #include "messages.h"
  34 #include "serverid.h"
  35
   36 struct g_lock_ctx { /* Handle tying one g_lock database to a messaging context */
   37         struct db_context *db; /* database handle returned by db_open_watched() */
   38         struct messaging_context *msg; /* messaging context supplied at init time */
   39         enum dbwrap_lock_order lock_order; /* applied around lock callbacks; DBWRAP_LOCK_ORDER_NONE (the default) skips it */
   40         bool busy; /* true only while a lock callback (cb_fn) is executing */
   41 };
  42
   43 struct g_lock { /* Parsed, in-memory view of one lock record (see g_lock_parse()) */
   44         struct server_id exclusive; /* exclusive holder; pid == 0 means there is none */
   45         size_t num_shared; /* number of entries in the shared array */
   46         uint8_t *shared; /* packed shared holders, SERVER_ID_BUF_LENGTH bytes each */
   47         uint64_t unique_lock_epoch; /* stored with the record; watchers compare it to notice lock changes */
   48         uint64_t unique_data_epoch; /* stored with the record; regenerated by g_lock_lock_cb_wake_watchers() */
   49         size_t datalen; /* length of the user data in bytes */
   50         uint8_t *data; /* user data following the shared array */
   51 };
  52
  53 static bool g_lock_parse(uint8_t *buf, size_t buflen, struct g_lock *lck)
  54 {
  55         struct server_id exclusive;
  56         size_t num_shared, shared_len;
  57         uint64_t unique_lock_epoch;
  58         uint64_t unique_data_epoch;
  59
  60         if (buflen < (SERVER_ID_BUF_LENGTH + /* exclusive */
  61                       sizeof(uint64_t) +     /* unique_lock_epoch */
  62                       sizeof(uint64_t) +     /* unique_data_epoch */
  63                       sizeof(uint32_t))) {   /* num_shared */
  64                 struct g_lock ret = {
  65                         .exclusive.pid = 0,
  66                         .unique_lock_epoch = generate_unique_u64(0),
  67                         .unique_data_epoch = generate_unique_u64(0),
  68                 };
  69                 *lck = ret;
  70                 return true;
  71         }
  72
  73         server_id_get(&exclusive, buf);
  74         buf += SERVER_ID_BUF_LENGTH;
  75         buflen -= SERVER_ID_BUF_LENGTH;
  76
  77         unique_lock_epoch = BVAL(buf, 0);
  78         buf += sizeof(uint64_t);
  79         buflen -= sizeof(uint64_t);
  80
  81         unique_data_epoch = BVAL(buf, 0);
  82         buf += sizeof(uint64_t);
  83         buflen -= sizeof(uint64_t);
  84
  85         num_shared = IVAL(buf, 0);
  86         buf += sizeof(uint32_t);
  87         buflen -= sizeof(uint32_t);
  88
  89         if (num_shared > buflen/SERVER_ID_BUF_LENGTH) {
  90                 DBG_DEBUG("num_shared=%zu, buflen=%zu\n",
  91                           num_shared,
  92                           buflen);
  93                 return false;
  94         }
  95
  96         shared_len = num_shared * SERVER_ID_BUF_LENGTH;
  97
  98         *lck = (struct g_lock) {
  99                 .exclusive = exclusive,
 100                 .num_shared = num_shared,
 101                 .shared = buf,
 102                 .unique_lock_epoch = unique_lock_epoch,
 103                 .unique_data_epoch = unique_data_epoch,
 104                 .datalen = buflen-shared_len,
 105                 .data = buf+shared_len,
 106         };
 107
 108         return true;
 109 }
 110
 111 static void g_lock_get_shared(const struct g_lock *lck,
 112                               size_t i,
 113                               struct server_id *shared)
 114 {
 115         if (i >= lck->num_shared) {
 116                 abort();
 117         }
 118         server_id_get(shared, lck->shared + i*SERVER_ID_BUF_LENGTH);
 119 }
 120
 121 static void g_lock_del_shared(struct g_lock *lck, size_t i)
 122 {
 123         if (i >= lck->num_shared) {
 124                 abort();
 125         }
 126         lck->num_shared -= 1;
 127         if (i < lck->num_shared) {
 128                 memcpy(lck->shared + i*SERVER_ID_BUF_LENGTH,
 129                        lck->shared + lck->num_shared*SERVER_ID_BUF_LENGTH,
 130                        SERVER_ID_BUF_LENGTH);
 131         }
 132 }
 133
 134 static NTSTATUS g_lock_store(
 135         struct db_record *rec,
 136         struct g_lock *lck,
 137         struct server_id *new_shared,
 138         const TDB_DATA *new_dbufs,
 139         size_t num_new_dbufs)
 140 {
 141         uint8_t exclusive[SERVER_ID_BUF_LENGTH];
 142         uint8_t seqnum_buf[sizeof(uint64_t)*2];
 143         uint8_t sizebuf[sizeof(uint32_t)];
 144         uint8_t new_shared_buf[SERVER_ID_BUF_LENGTH];
 145
 146         struct TDB_DATA dbufs[6 + num_new_dbufs];
 147
 148         dbufs[0] = (TDB_DATA) {
 149                 .dptr = exclusive, .dsize = sizeof(exclusive),
 150         };
 151         dbufs[1] = (TDB_DATA) {
 152                 .dptr = seqnum_buf, .dsize = sizeof(seqnum_buf),
 153         };
 154         dbufs[2] = (TDB_DATA) {
 155                 .dptr = sizebuf, .dsize = sizeof(sizebuf),
 156         };
 157         dbufs[3] = (TDB_DATA) {
 158                 .dptr = lck->shared,
 159                 .dsize = lck->num_shared * SERVER_ID_BUF_LENGTH,
 160         };
 161         dbufs[4] = (TDB_DATA) { 0 };
 162         dbufs[5] = (TDB_DATA) {
 163                 .dptr = lck->data, .dsize = lck->datalen,
 164         };
 165
 166         if (num_new_dbufs != 0) {
 167                 memcpy(&dbufs[6],
 168                        new_dbufs,
 169                        num_new_dbufs * sizeof(TDB_DATA));
 170         }
 171
 172         server_id_put(exclusive, lck->exclusive);
 173         SBVAL(seqnum_buf, 0, lck->unique_lock_epoch);
 174         SBVAL(seqnum_buf, 8, lck->unique_data_epoch);
 175
 176         if (new_shared != NULL) {
 177                 if (lck->num_shared >= UINT32_MAX) {
 178                         return NT_STATUS_BUFFER_OVERFLOW;
 179                 }
 180
 181                 server_id_put(new_shared_buf, *new_shared);
 182
 183                 dbufs[4] = (TDB_DATA) {
 184                         .dptr = new_shared_buf,
 185                         .dsize = sizeof(new_shared_buf),
 186                 };
 187
 188                 lck->num_shared += 1;
 189         }
 190
 191         SIVAL(sizebuf, 0, lck->num_shared);
 192
 193         return dbwrap_record_storev(rec, dbufs, ARRAY_SIZE(dbufs), 0);
 194 }
 195
 196 struct g_lock_ctx *g_lock_ctx_init_backend(
 197         TALLOC_CTX *mem_ctx,
 198         struct messaging_context *msg,
 199         struct db_context **backend)
 200 {
 201         struct g_lock_ctx *result;
 202
 203         result = talloc_zero(mem_ctx, struct g_lock_ctx);
 204         if (result == NULL) {
 205                 return NULL;
 206         }
 207         result->msg = msg;
 208         result->lock_order = DBWRAP_LOCK_ORDER_NONE;
 209
 210         result->db = db_open_watched(result, backend, msg);
 211         if (result->db == NULL) {
 212                 DBG_WARNING("db_open_watched failed\n");
 213                 TALLOC_FREE(result);
 214                 return NULL;
 215         }
 216         return result;
 217 }
 218
 219 void g_lock_set_lock_order(struct g_lock_ctx *ctx,
 220                            enum dbwrap_lock_order lock_order)
 221 {
 222         ctx->lock_order = lock_order;
 223 }
 224
 225 struct g_lock_ctx *g_lock_ctx_init(TALLOC_CTX *mem_ctx,
 226                                    struct messaging_context *msg)
 227 {
 228         char *db_path = NULL;
 229         struct db_context *backend = NULL;
 230         struct g_lock_ctx *ctx = NULL;
 231
 232         db_path = lock_path(mem_ctx, "g_lock.tdb");
 233         if (db_path == NULL) {
 234                 return NULL;
 235         }
 236
 237         backend = db_open(
 238                 mem_ctx,
 239                 db_path,
 240                 0,
 241                 TDB_CLEAR_IF_FIRST|TDB_INCOMPATIBLE_HASH|TDB_VOLATILE,
 242                 O_RDWR|O_CREAT,
 243                 0600,
 244                 DBWRAP_LOCK_ORDER_3,
 245                 DBWRAP_FLAG_NONE);
 246         TALLOC_FREE(db_path);
 247         if (backend == NULL) {
 248                 DBG_WARNING("Could not open g_lock.tdb\n");
 249                 return NULL;
 250         }
 251
 252         ctx = g_lock_ctx_init_backend(mem_ctx, msg, &backend);
 253         return ctx;
 254 }
 255
 256 static void g_lock_cleanup_dead(
 257         struct g_lock *lck,
 258         struct server_id *dead_blocker)
 259 {
 260         bool exclusive_died;
 261         struct server_id_buf tmp;
 262
 263         if (dead_blocker == NULL) {
 264                 return;
 265         }
 266
 267         exclusive_died = server_id_equal(dead_blocker, &lck->exclusive);
 268
 269         if (exclusive_died) {
 270                 DBG_DEBUG("Exclusive holder %s died\n",
 271                           server_id_str_buf(lck->exclusive, &tmp));
 272                 lck->exclusive.pid = 0;
 273         }
 274
 275         if (lck->num_shared != 0) {
 276                 bool shared_died;
 277                 struct server_id shared;
 278
 279                 g_lock_get_shared(lck, 0, &shared);
 280                 shared_died = server_id_equal(dead_blocker, &shared);
 281
 282                 if (shared_died) {
 283                         DBG_DEBUG("Shared holder %s died\n",
 284                                   server_id_str_buf(shared, &tmp));
 285                         g_lock_del_shared(lck, 0);
 286                 }
 287         }
 288 }
 289
 290 static ssize_t g_lock_find_shared(
 291         struct g_lock *lck,
 292         const struct server_id *self)
 293 {
 294         size_t i;
 295
 296         for (i=0; i<lck->num_shared; i++) {
 297                 struct server_id shared;
 298                 bool same;
 299
 300                 g_lock_get_shared(lck, i, &shared);
 301
 302                 same = server_id_equal(self, &shared);
 303                 if (same) {
 304                         return i;
 305                 }
 306         }
 307
 308         return -1;
 309 }
 310
 311 static void g_lock_cleanup_shared(struct g_lock *lck)
 312 {
 313         size_t i;
 314         struct server_id check;
 315         bool exists;
 316
 317         if (lck->num_shared == 0) {
 318                 return;
 319         }
 320
 321         /*
 322          * Read locks can stay around forever if the process dies. Do
 323          * a heuristic check for process existence: Check one random
 324          * process for existence. Hopefully this will keep runaway
 325          * read locks under control.
 326          */
 327         i = generate_random() % lck->num_shared;
 328         g_lock_get_shared(lck, i, &check);
 329
 330         exists = serverid_exists(&check);
 331         if (!exists) {
 332                 struct server_id_buf tmp;
 333                 DBG_DEBUG("Shared locker %s died -- removing\n",
 334                           server_id_str_buf(check, &tmp));
 335                 g_lock_del_shared(lck, i);
 336         }
 337 }
 338
  339 struct g_lock_lock_cb_state { /* State shared between the locking code and the g_lock_lock_cb_*() helpers */
  340         struct g_lock_ctx *ctx; /* context the lock operation runs on */
  341         struct db_record *rec; /* database record being operated on */
  342         struct g_lock *lck; /* parsed lock state; data/datalen are replaced by g_lock_lock_cb_writev() */
  343         struct server_id *new_shared; /* shared holder to append on store, or NULL */
  344         g_lock_lock_cb_fn_t cb_fn; /* optional callback, called with this state and cb_private */
  345         void *cb_private; /* opaque pointer passed to cb_fn */
  346         TALLOC_CTX *update_mem_ctx; /* talloc context used for updated_data */
  347         TDB_DATA updated_data; /* merged buffer produced by g_lock_lock_cb_writev() */
  348         bool existed; /* the record value was non-empty when it was read */
  349         bool modified; /* set by g_lock_lock_cb_writev() and g_lock_lock_cb_wake_watchers() */
  350         bool unlock; /* set by g_lock_lock_cb_unlock(): give up the lock after the callback */
  351 };
 352
 353 NTSTATUS g_lock_lock_cb_dump(struct g_lock_lock_cb_state *cb_state,
 354                              void (*fn)(struct server_id exclusive,
 355                                         size_t num_shared,
 356                                         const struct server_id *shared,
 357                                         const uint8_t *data,
 358                                         size_t datalen,
 359                                         void *private_data),
 360                              void *private_data)
 361 {
 362         struct g_lock *lck = cb_state->lck;
 363
 364         /* We allow a cb_fn only for G_LOCK_WRITE for now... */
 365         SMB_ASSERT(lck->num_shared == 0);
 366
 367         fn(lck->exclusive,
 368            0, /* num_shared */
 369            NULL, /* shared */
 370            lck->data,
 371            lck->datalen,
 372            private_data);
 373
 374         return NT_STATUS_OK;
 375 }
 376
 377 NTSTATUS g_lock_lock_cb_writev(struct g_lock_lock_cb_state *cb_state,
 378                                const TDB_DATA *dbufs,
 379                                size_t num_dbufs)
 380 {
 381         NTSTATUS status;
 382
 383         status = dbwrap_merge_dbufs(&cb_state->updated_data,
 384                                     cb_state->update_mem_ctx,
 385                                     dbufs, num_dbufs);
 386         if (!NT_STATUS_IS_OK(status)) {
 387                 return status;
 388         }
 389
 390         cb_state->modified = true;
 391         cb_state->lck->data = cb_state->updated_data.dptr;
 392         cb_state->lck->datalen = cb_state->updated_data.dsize;
 393
 394         return NT_STATUS_OK;
 395 }
 396
 397 void g_lock_lock_cb_unlock(struct g_lock_lock_cb_state *cb_state)
 398 {
 399         cb_state->unlock = true;
 400 }
 401
  402 struct g_lock_lock_cb_watch_data_state { /* Request state of g_lock_lock_cb_watch_data_send/recv */
  403         struct tevent_context *ev; /* event context used for the watch subrequests */
  404         struct g_lock_ctx *ctx; /* context whose database is watched */
  405         TDB_DATA key; /* talloc copy of the watched record's key */
  406         struct server_id blocker; /* blocker passed to the watch; updated by dbwrap_watched_watch_recv() */
  407         bool blockerdead; /* filled in by dbwrap_watched_watch_recv() */
  408         uint64_t unique_lock_epoch; /* last lock epoch seen; a change re-queues our watcher instance */
  409         uint64_t unique_data_epoch; /* data epoch at send time; a change completes the request */
  410         uint64_t watch_instance; /* watcher instance as reported by the last wakeup */
  411         NTSTATUS status; /* outcome of the locked re-check; NT_STATUS_EVENT_PENDING means keep waiting */
  412 };
 413
 414 static void g_lock_lock_cb_watch_data_done(struct tevent_req *subreq);
 415
 416 struct tevent_req *g_lock_lock_cb_watch_data_send(
 417         TALLOC_CTX *mem_ctx,
 418         struct tevent_context *ev,
 419         struct g_lock_lock_cb_state *cb_state,
 420         struct server_id blocker)
 421 {
 422         struct tevent_req *req = NULL;
 423         struct g_lock_lock_cb_watch_data_state *state = NULL;
 424         struct tevent_req *subreq = NULL;
 425         TDB_DATA key = dbwrap_record_get_key(cb_state->rec);
 426
 427         req = tevent_req_create(
 428                 mem_ctx, &state, struct g_lock_lock_cb_watch_data_state);
 429         if (req == NULL) {
 430                 return NULL;
 431         }
 432         state->ev = ev;
 433         state->ctx = cb_state->ctx;
 434         state->blocker = blocker;
 435
 436         state->key = tdb_data_talloc_copy(state, key);
 437         if (tevent_req_nomem(state->key.dptr, req)) {
 438                 return tevent_req_post(req, ev);
 439         }
 440
 441         state->unique_lock_epoch = cb_state->lck->unique_lock_epoch;
 442         state->unique_data_epoch = cb_state->lck->unique_data_epoch;
 443
 444         DBG_DEBUG("state->unique_data_epoch=%"PRIu64"\n", state->unique_data_epoch);
 445
 446         subreq = dbwrap_watched_watch_send(
 447                 state, state->ev, cb_state->rec, 0, state->blocker);
 448         if (tevent_req_nomem(subreq, req)) {
 449                 return tevent_req_post(req, ev);
 450         }
 451         tevent_req_set_callback(subreq, g_lock_lock_cb_watch_data_done, req);
 452
 453         return req;
 454 }
 455
 456 static void g_lock_lock_cb_watch_data_done_fn(
 457         struct db_record *rec,
 458         TDB_DATA value,
 459         void *private_data)
 460 {
 461         struct tevent_req *req = talloc_get_type_abort(
 462                 private_data, struct tevent_req);
 463         struct g_lock_lock_cb_watch_data_state *state = tevent_req_data(
 464                 req, struct g_lock_lock_cb_watch_data_state);
 465         struct tevent_req *subreq = NULL;
 466         struct g_lock lck;
 467         bool ok;
 468
 469         ok = g_lock_parse(value.dptr, value.dsize, &lck);
 470         if (!ok) {
 471                 dbwrap_watched_watch_remove_instance(rec, state->watch_instance);
 472                 state->status = NT_STATUS_INTERNAL_DB_CORRUPTION;
 473                 return;
 474         }
 475
 476         if (lck.unique_data_epoch != state->unique_data_epoch) {
 477                 dbwrap_watched_watch_remove_instance(rec, state->watch_instance);
 478                 DBG_DEBUG("lck.unique_data_epoch=%"PRIu64", "
 479                           "state->unique_data_epoch=%"PRIu64"\n",
 480                           lck.unique_data_epoch,
 481                           state->unique_data_epoch);
 482                 state->status = NT_STATUS_OK;
 483                 return;
 484         }
 485
 486         /*
 487          * The lock epoch changed, so we better
 488          * remove ourself from the waiter list
 489          * (most likely the first position)
 490          * and re-add us at the end of the list.
 491          *
 492          * This gives other lock waiters a change
 493          * to make progress.
 494          *
 495          * Otherwise we'll keep our waiter instance alive,
 496          * keep waiting (most likely at first position).
 497          */
 498         if (lck.unique_lock_epoch != state->unique_lock_epoch) {
 499                 dbwrap_watched_watch_remove_instance(rec, state->watch_instance);
 500                 state->watch_instance = dbwrap_watched_watch_add_instance(rec);
 501                 state->unique_lock_epoch = lck.unique_lock_epoch;
 502         }
 503
 504         subreq = dbwrap_watched_watch_send(
 505                 state, state->ev, rec, state->watch_instance, state->blocker);
 506         if (subreq == NULL) {
 507                 dbwrap_watched_watch_remove_instance(rec, state->watch_instance);
 508                 state->status = NT_STATUS_NO_MEMORY;
 509                 return;
 510         }
 511         tevent_req_set_callback(subreq, g_lock_lock_cb_watch_data_done, req);
 512
 513         state->status = NT_STATUS_EVENT_PENDING;
 514 }
 515
 516 static void g_lock_lock_cb_watch_data_done(struct tevent_req *subreq)
 517 {
 518         struct tevent_req *req = tevent_req_callback_data(
 519                 subreq, struct tevent_req);
 520         struct g_lock_lock_cb_watch_data_state *state = tevent_req_data(
 521                 req, struct g_lock_lock_cb_watch_data_state);
 522         NTSTATUS status;
 523         uint64_t instance = 0;
 524
 525         status = dbwrap_watched_watch_recv(
 526                 subreq, &instance, &state->blockerdead, &state->blocker);
 527         TALLOC_FREE(subreq);
 528         if (tevent_req_nterror(req, status)) {
 529                 DBG_DEBUG("dbwrap_watched_watch_recv returned %s\n",
 530                           nt_errstr(status));
 531                 return;
 532         }
 533
 534         state->watch_instance = instance;
 535
 536         status = dbwrap_do_locked(
 537                 state->ctx->db, state->key, g_lock_lock_cb_watch_data_done_fn, req);
 538         if (tevent_req_nterror(req, status)) {
 539                 DBG_DEBUG("dbwrap_do_locked returned %s\n", nt_errstr(status));
 540                 return;
 541         }
 542         if (NT_STATUS_EQUAL(state->status, NT_STATUS_EVENT_PENDING)) {
 543                 return;
 544         }
 545         if (tevent_req_nterror(req, state->status)) {
 546                 return;
 547         }
 548         tevent_req_done(req);
 549 }
 550
 551 NTSTATUS g_lock_lock_cb_watch_data_recv(
 552         struct tevent_req *req,
 553         bool *blockerdead,
 554         struct server_id *blocker)
 555 {
 556         struct g_lock_lock_cb_watch_data_state *state = tevent_req_data(
 557                 req, struct g_lock_lock_cb_watch_data_state);
 558         NTSTATUS status;
 559
 560         if (tevent_req_is_nterror(req, &status)) {
 561                 return status;
 562         }
 563         if (blockerdead != NULL) {
 564                 *blockerdead = state->blockerdead;
 565         }
 566         if (blocker != NULL) {
 567                 *blocker = state->blocker;
 568         }
 569
 570         return NT_STATUS_OK;
 571 }
 572
 573 void g_lock_lock_cb_wake_watchers(struct g_lock_lock_cb_state *cb_state)
 574 {
 575         struct g_lock *lck = cb_state->lck;
 576
 577         lck->unique_data_epoch = generate_unique_u64(lck->unique_data_epoch);
 578         cb_state->modified = true;
 579 }
 580
 581 static NTSTATUS g_lock_lock_cb_run_and_store(struct g_lock_lock_cb_state *cb_state)
 582 {
 583         struct g_lock *lck = cb_state->lck;
 584         NTSTATUS success_status = NT_STATUS_OK;
 585         NTSTATUS status;
 586
 587         if (cb_state->cb_fn != NULL) {
 588
 589                 SMB_ASSERT(lck->num_shared == 0);
 590                 SMB_ASSERT(cb_state->new_shared == NULL);
 591
 592                 if (cb_state->ctx->lock_order != DBWRAP_LOCK_ORDER_NONE) {
 593                         const char *name = dbwrap_name(cb_state->ctx->db);
 594                         dbwrap_lock_order_lock(name, cb_state->ctx->lock_order);
 595                 }
 596
 597                 cb_state->ctx->busy = true;
 598                 cb_state->cb_fn(cb_state, cb_state->cb_private);
 599                 cb_state->ctx->busy = false;
 600
 601                 if (cb_state->ctx->lock_order != DBWRAP_LOCK_ORDER_NONE) {
 602                         const char *name = dbwrap_name(cb_state->ctx->db);
 603                         dbwrap_lock_order_unlock(name, cb_state->ctx->lock_order);
 604                 }
 605         }
 606
 607         if (cb_state->unlock) {
 608                 /*
 609                  * Unlocked should wake up watchers.
 610                  *
 611                  * We no longer need the lock, so
 612                  * force a wakeup of the next watchers,
 613                  * even if we don't do any update.
 614                  */
 615                 dbwrap_watched_watch_reset_alerting(cb_state->rec);
 616                 dbwrap_watched_watch_force_alerting(cb_state->rec);
 617                 if (!cb_state->modified) {
 618                         /*
 619                          * The record was not changed at
 620                          * all, so we can also avoid
 621                          * storing the lck.unique_lock_epoch
 622                          * change
 623                          */
 624                         return NT_STATUS_WAS_UNLOCKED;
 625                 }
 626                 lck->exclusive = (struct server_id) { .pid = 0 };
 627                 cb_state->new_shared = NULL;
 628
 629                 if (lck->datalen == 0) {
 630                         if (!cb_state->existed) {
 631                                 return NT_STATUS_WAS_UNLOCKED;
 632                         }
 633
 634                         status = dbwrap_record_delete(cb_state->rec);
 635                         if (!NT_STATUS_IS_OK(status)) {
 636                                 DBG_WARNING("dbwrap_record_delete() failed: %s\n",
 637                                     nt_errstr(status));
 638                                 return status;
 639                         }
 640                         return NT_STATUS_WAS_UNLOCKED;
 641                 }
 642
 643                 success_status = NT_STATUS_WAS_UNLOCKED;
 644         }
 645
 646         status = g_lock_store(cb_state->rec,
 647                               cb_state->lck,
 648                               cb_state->new_shared,
 649                               NULL, 0);
 650         if (!NT_STATUS_IS_OK(status)) {
 651                 DBG_WARNING("g_lock_store() failed: %s\n",
 652                             nt_errstr(status));
 653                 return status;
 654         }
 655
 656         return success_status;
 657 }
 658
  659 struct g_lock_lock_state { /* Per-request state of a lock attempt, read by g_lock_trylock() */
  660         struct tevent_context *ev; /* event context of the request */
  661         struct g_lock_ctx *ctx; /* context the lock is taken on */
  662         TDB_DATA key; /* key of the lock record */
  663         enum g_lock_type type; /* requested lock type */
  664         bool retry; /* when false, an exclusive lock already held by ourselves yields NT_STATUS_WAS_LOCKED */
  665         g_lock_lock_cb_fn_t cb_fn; /* optional callback handed on to the lock callback state */
  666         void *cb_private; /* opaque pointer for cb_fn */
  667 };
 668
  669 struct g_lock_lock_fn_state { /* State handed to g_lock_trylock() for one attempt */
  670         struct g_lock_lock_state *req_state; /* state of the lock request this attempt belongs to */
  671         struct server_id *dead_blocker; /* holder known to be dead, passed to g_lock_cleanup_dead(); may be NULL */
  672
  673         struct tevent_req *watch_req; /* NOTE(review): presumably the pending record watch; not used in the code visible here */
  674         uint64_t watch_instance; /* watcher instance on the record; 0 means none has been added yet */
  675         NTSTATUS status; /* NOTE(review): presumably the result of the attempt; confirm against the caller */
  676 };
 677
 678 static int g_lock_lock_state_destructor(struct g_lock_lock_state *s);
 679
 680 static NTSTATUS g_lock_trylock(
 681         struct db_record *rec,
 682         struct g_lock_lock_fn_state *state,
 683         TDB_DATA data,
 684         struct server_id *blocker)
 685 {
 686         struct g_lock_lock_state *req_state = state->req_state;
 687         struct server_id self = messaging_server_id(req_state->ctx->msg);
 688         enum g_lock_type type = req_state->type;
 689         bool retry = req_state->retry;
 690         struct g_lock lck = { .exclusive.pid = 0 };
 691         struct g_lock_lock_cb_state cb_state = {
 692                 .ctx = req_state->ctx,
 693                 .rec = rec,
 694                 .lck = &lck,
 695                 .cb_fn = req_state->cb_fn,
 696                 .cb_private = req_state->cb_private,
 697                 .existed = data.dsize != 0,
 698                 .update_mem_ctx = talloc_tos(),
 699         };
 700         struct server_id_buf tmp;
 701         NTSTATUS status;
 702         bool ok;
 703
 704         ok = g_lock_parse(data.dptr, data.dsize, &lck);
 705         if (!ok) {
 706                 dbwrap_watched_watch_remove_instance(rec, state->watch_instance);
 707                 DBG_DEBUG("g_lock_parse failed\n");
 708                 return NT_STATUS_INTERNAL_DB_CORRUPTION;
 709         }
 710
 711         g_lock_cleanup_dead(&lck, state->dead_blocker);
 712
 713         lck.unique_lock_epoch = generate_unique_u64(lck.unique_lock_epoch);
 714
 715         if (lck.exclusive.pid != 0) {
 716                 bool self_exclusive = server_id_equal(&self, &lck.exclusive);
 717
 718                 if (!self_exclusive) {
 719                         bool exists = serverid_exists(&lck.exclusive);
 720                         if (!exists) {
 721                                 lck.exclusive = (struct server_id) { .pid=0 };
 722                                 goto noexclusive;
 723                         }
 724
 725                         DBG_DEBUG("%s has an exclusive lock\n",
 726                                   server_id_str_buf(lck.exclusive, &tmp));
 727
 728                         if (type == G_LOCK_DOWNGRADE) {
 729                                 struct server_id_buf tmp2;
 730
 731                                 dbwrap_watched_watch_remove_instance(rec,
 732                                                 state->watch_instance);
 733
 734                                 DBG_DEBUG("%s: Trying to downgrade %s\n",
 735                                           server_id_str_buf(self, &tmp),
 736                                           server_id_str_buf(
 737                                                   lck.exclusive, &tmp2));
 738                                 return NT_STATUS_NOT_LOCKED;
 739                         }
 740
 741                         if (type == G_LOCK_UPGRADE) {
 742                                 ssize_t shared_idx;
 743
 744                                 dbwrap_watched_watch_remove_instance(rec,
 745                                                 state->watch_instance);
 746
 747                                 shared_idx = g_lock_find_shared(&lck, &self);
 748
 749                                 if (shared_idx == -1) {
 750                                         DBG_DEBUG("Trying to upgrade %s "
 751                                                   "without "
 752                                                   "existing shared lock\n",
 753                                                   server_id_str_buf(
 754                                                           self, &tmp));
 755                                         return NT_STATUS_NOT_LOCKED;
 756                                 }
 757
 758                                 /*
 759                                  * We're trying to upgrade, and the
 760                                  * exclusive lock is taken by someone
 761                                  * else. This means that someone else
 762                                  * is waiting for us to give up our
 763                                  * shared lock. If we now also wait
 764                                  * for someone to give their shared
 765                                  * lock, we will deadlock.
 766                                  */
 767
 768                                 DBG_DEBUG("Trying to upgrade %s while "
 769                                           "someone else is also "
 770                                           "trying to upgrade\n",
 771                                           server_id_str_buf(self, &tmp));
 772                                 return NT_STATUS_POSSIBLE_DEADLOCK;
 773                         }
 774
 775                         DBG_DEBUG("Waiting for lck.exclusive=%s\n",
 776                                   server_id_str_buf(lck.exclusive, &tmp));
 777
 778                         /*
 779                          * We will return NT_STATUS_LOCK_NOT_GRANTED
 780                          * and need to monitor the record.
 781                          *
 782                          * If we don't have a watcher instance yet,
 783                          * we should add one.
 784                          */
 785                         if (state->watch_instance == 0) {
 786                                 state->watch_instance =
 787                                         dbwrap_watched_watch_add_instance(rec);
 788                         }
 789
 790                         *blocker = lck.exclusive;
 791                         return NT_STATUS_LOCK_NOT_GRANTED;
 792                 }
 793
 794                 if (type == G_LOCK_DOWNGRADE) {
 795                         DBG_DEBUG("Downgrading %s from WRITE to READ\n",
 796                                   server_id_str_buf(self, &tmp));
 797
 798                         lck.exclusive = (struct server_id) { .pid = 0 };
 799                         goto do_shared;
 800                 }
 801
 802                 if (!retry) {
 803                         dbwrap_watched_watch_remove_instance(rec,
 804                                                 state->watch_instance);
 805
 806                         DBG_DEBUG("%s already locked by self\n",
 807                                   server_id_str_buf(self, &tmp));
 808                         return NT_STATUS_WAS_LOCKED;
 809                 }
 810
 811                 g_lock_cleanup_shared(&lck);
 812
 813                 if (lck.num_shared != 0) {
 814                         g_lock_get_shared(&lck, 0, blocker);
 815
 816                         DBG_DEBUG("Continue waiting for shared lock %s\n",
 817                                   server_id_str_buf(*blocker, &tmp));
 818
 819                         /*
 820                          * We will return NT_STATUS_LOCK_NOT_GRANTED
 821                          * and need to monitor the record.
 822                          *
 823                          * If we don't have a watcher instance yet,
 824                          * we should add one.
 825                          */
 826                         if (state->watch_instance == 0) {
 827                                 state->watch_instance =
 828                                         dbwrap_watched_watch_add_instance(rec);
 829                         }
 830
 831                         return NT_STATUS_LOCK_NOT_GRANTED;
 832                 }
 833
 834                 /*
 835                  * Retry after a conflicting lock was released..
 836                  * All pending readers are gone so we got the lock...
 837                  */
 838                 goto got_lock;
 839         }
 840
 841 noexclusive:
 842
 843         if (type == G_LOCK_UPGRADE) {
 844                 ssize_t shared_idx = g_lock_find_shared(&lck, &self);
 845
 846                 if (shared_idx == -1) {
 847                         dbwrap_watched_watch_remove_instance(rec,
 848                                                 state->watch_instance);
 849
 850                         DBG_DEBUG("Trying to upgrade %s without "
 851                                   "existing shared lock\n",
 852                                   server_id_str_buf(self, &tmp));
 853                         return NT_STATUS_NOT_LOCKED;
 854                 }
 855
 856                 g_lock_del_shared(&lck, shared_idx);
 857                 type = G_LOCK_WRITE;
 858         }
 859
 860         if (type == G_LOCK_WRITE) {
 861                 ssize_t shared_idx = g_lock_find_shared(&lck, &self);
 862
 863                 if (shared_idx != -1) {
 864                         dbwrap_watched_watch_remove_instance(rec,
 865                                                 state->watch_instance);
 866                         DBG_DEBUG("Trying to writelock existing shared %s\n",
 867                                   server_id_str_buf(self, &tmp));
 868                         return NT_STATUS_WAS_LOCKED;
 869                 }
 870
 871                 lck.exclusive = self;
 872
 873                 g_lock_cleanup_shared(&lck);
 874
 875                 if (lck.num_shared == 0) {
 876                         /*
 877                          * If we store ourself as exclusive writer,
 878                          * without any pending readers ...
 879                          */
 880                         goto got_lock;
 881                 }
 882
 883                 if (state->watch_instance == 0) {
 884                         /*
 885                          * Here we have lck.num_shared != 0.
 886                          *
 887                          * We will return NT_STATUS_LOCK_NOT_GRANTED
 888                          * below.
 889                          *
 890                          * And don't have a watcher instance yet!
 891                          *
 892                          * We add it here before g_lock_store()
 893                          * in order to trigger just one
 894                          * low level dbwrap_do_locked() call.
 895                          */
 896                         state->watch_instance =
 897                                 dbwrap_watched_watch_add_instance(rec);
 898                 }
 899
 900                 status = g_lock_store(rec, &lck, NULL, NULL, 0);
 901                 if (!NT_STATUS_IS_OK(status)) {
 902                         DBG_DEBUG("g_lock_store() failed: %s\n",
 903                                   nt_errstr(status));
 904                         return status;
 905                 }
 906
 907                 talloc_set_destructor(
 908                         req_state, g_lock_lock_state_destructor);
 909
 910                 g_lock_get_shared(&lck, 0, blocker);
 911
 912                 DBG_DEBUG("Waiting for %zu shared locks, "
 913                           "picking blocker %s\n",
 914                           lck.num_shared,
 915                           server_id_str_buf(*blocker, &tmp));
 916
 917                 return NT_STATUS_LOCK_NOT_GRANTED;
 918         }
 919
 920 do_shared:
 921
 922         g_lock_cleanup_shared(&lck);
 923         cb_state.new_shared = &self;
 924         goto got_lock;
 925
 926 got_lock:
 927         /*
 928          * We got the lock we asked for, so we no
 929          * longer need to monitor the record.
 930          */
 931         dbwrap_watched_watch_remove_instance(rec, state->watch_instance);
 932
 933         status = g_lock_lock_cb_run_and_store(&cb_state);
 934         if (!NT_STATUS_IS_OK(status) &&
 935             !NT_STATUS_EQUAL(status, NT_STATUS_WAS_UNLOCKED))
 936         {
 937                 DBG_WARNING("g_lock_lock_cb_run_and_store() failed: %s\n",
 938                             nt_errstr(status));
 939                 return status;
 940         }
 941
 942         talloc_set_destructor(req_state, NULL);
 943         return status;
 944 }
 945
 946 static void g_lock_lock_fn(
 947         struct db_record *rec,
 948         TDB_DATA value,
 949         void *private_data)
 950 {
 951         struct g_lock_lock_fn_state *state = private_data;
 952         struct server_id blocker = {0};
 953
 954         /*
 955          * We're trying to get a lock and if we are
 956          * successful in doing that, we should not
 957          * wakeup any other waiters, all they would
 958          * find is that we're holding a lock they
 959          * are conflicting with.
 960          */
 961         dbwrap_watched_watch_skip_alerting(rec);
 962
 963         state->status = g_lock_trylock(rec, state, value, &blocker);
 964         if (!NT_STATUS_IS_OK(state->status)) {
 965                 DBG_DEBUG("g_lock_trylock returned %s\n",
 966                           nt_errstr(state->status));
 967         }
 968         if (!NT_STATUS_EQUAL(state->status, NT_STATUS_LOCK_NOT_GRANTED)) {
 969                 return;
 970         }
 971
 972         state->watch_req = dbwrap_watched_watch_send(
 973                 state->req_state, state->req_state->ev, rec, state->watch_instance, blocker);
 974         if (state->watch_req == NULL) {
 975                 state->status = NT_STATUS_NO_MEMORY;
 976         }
 977 }
 978
 979 static int g_lock_lock_state_destructor(struct g_lock_lock_state *s)
 980 {
 981         NTSTATUS status = g_lock_unlock(s->ctx, s->key);
 982         if (!NT_STATUS_IS_OK(status)) {
 983                 DBG_DEBUG("g_lock_unlock failed: %s\n", nt_errstr(status));
 984         }
 985         return 0;
 986 }
 987
 988 static void g_lock_lock_retry(struct tevent_req *subreq);
 989
 990 struct tevent_req *g_lock_lock_send(TALLOC_CTX *mem_ctx,
 991                                     struct tevent_context *ev,
 992                                     struct g_lock_ctx *ctx,
 993                                     TDB_DATA key,
 994                                     enum g_lock_type type,
 995                                     g_lock_lock_cb_fn_t cb_fn,
 996                                     void *cb_private)
 997 {
 998         struct tevent_req *req;
 999         struct g_lock_lock_state *state;
1000         struct g_lock_lock_fn_state fn_state;
1001         NTSTATUS status;
1002         bool ok;
1003
1004         SMB_ASSERT(!ctx->busy);
1005
1006         req = tevent_req_create(mem_ctx, &state, struct g_lock_lock_state);
1007         if (req == NULL) {
1008                 return NULL;
1009         }
1010         state->ev = ev;
1011         state->ctx = ctx;
1012         state->key = key;
1013         state->type = type;
1014         state->cb_fn = cb_fn;
1015         state->cb_private = cb_private;
1016
1017         fn_state = (struct g_lock_lock_fn_state) {
1018                 .req_state = state,
1019         };
1020
1021         /*
1022          * We allow a cb_fn only for G_LOCK_WRITE for now.
1023          *
1024          * It's all we currently need and it makes a few things
1025          * easier to implement.
1026          */
1027         if (unlikely(cb_fn != NULL && type != G_LOCK_WRITE)) {
1028                 tevent_req_nterror(req, NT_STATUS_INVALID_PARAMETER_6);
1029                 return tevent_req_post(req, ev);
1030         }
1031
1032         status = dbwrap_do_locked(ctx->db, key, g_lock_lock_fn, &fn_state);
1033         if (tevent_req_nterror(req, status)) {
1034                 DBG_DEBUG("dbwrap_do_locked failed: %s\n",
1035                           nt_errstr(status));
1036                 return tevent_req_post(req, ev);
1037         }
1038
1039         if (NT_STATUS_IS_OK(fn_state.status)) {
1040                 tevent_req_done(req);
1041                 return tevent_req_post(req, ev);
1042         }
1043         if (!NT_STATUS_EQUAL(fn_state.status, NT_STATUS_LOCK_NOT_GRANTED)) {
1044                 tevent_req_nterror(req, fn_state.status);
1045                 return tevent_req_post(req, ev);
1046         }
1047
1048         if (tevent_req_nomem(fn_state.watch_req, req)) {
1049                 return tevent_req_post(req, ev);
1050         }
1051
1052         ok = tevent_req_set_endtime(
1053                 fn_state.watch_req,
1054                 state->ev,
1055                 timeval_current_ofs(5 + generate_random() % 5, 0));
1056         if (!ok) {
1057                 tevent_req_oom(req);
1058                 return tevent_req_post(req, ev);
1059         }
1060         tevent_req_set_callback(fn_state.watch_req, g_lock_lock_retry, req);
1061
1062         return req;
1063 }
1064
1065 static void g_lock_lock_retry(struct tevent_req *subreq)
1066 {
1067         struct tevent_req *req = tevent_req_callback_data(
1068                 subreq, struct tevent_req);
1069         struct g_lock_lock_state *state = tevent_req_data(
1070                 req, struct g_lock_lock_state);
1071         struct g_lock_lock_fn_state fn_state;
1072         struct server_id blocker = { .pid = 0 };
1073         bool blockerdead = false;
1074         NTSTATUS status;
1075         uint64_t instance = 0;
1076
1077         status = dbwrap_watched_watch_recv(subreq, &instance, &blockerdead, &blocker);
1078         DBG_DEBUG("watch_recv returned %s\n", nt_errstr(status));
1079         TALLOC_FREE(subreq);
1080
1081         if (!NT_STATUS_IS_OK(status) &&
1082             !NT_STATUS_EQUAL(status, NT_STATUS_IO_TIMEOUT)) {
1083                 tevent_req_nterror(req, status);
1084                 return;
1085         }
1086
1087         state->retry = true;
1088
1089         fn_state = (struct g_lock_lock_fn_state) {
1090                 .req_state = state,
1091                 .dead_blocker = blockerdead ? &blocker : NULL,
1092                 .watch_instance = instance,
1093         };
1094
1095         status = dbwrap_do_locked(state->ctx->db, state->key,
1096                                   g_lock_lock_fn, &fn_state);
1097         if (tevent_req_nterror(req, status)) {
1098                 DBG_DEBUG("dbwrap_do_locked failed: %s\n",
1099                           nt_errstr(status));
1100                 return;
1101         }
1102
1103         if (NT_STATUS_IS_OK(fn_state.status)) {
1104                 tevent_req_done(req);
1105                 return;
1106         }
1107         if (!NT_STATUS_EQUAL(fn_state.status, NT_STATUS_LOCK_NOT_GRANTED)) {
1108                 tevent_req_nterror(req, fn_state.status);
1109                 return;
1110         }
1111
1112         if (tevent_req_nomem(fn_state.watch_req, req)) {
1113                 return;
1114         }
1115
1116         if (!tevent_req_set_endtime(
1117                     fn_state.watch_req, state->ev,
1118                     timeval_current_ofs(5 + generate_random() % 5, 0))) {
1119                 return;
1120         }
1121         tevent_req_set_callback(fn_state.watch_req, g_lock_lock_retry, req);
1122 }
1123
1124 NTSTATUS g_lock_lock_recv(struct tevent_req *req)
1125 {
1126         struct g_lock_lock_state *state = tevent_req_data(
1127                 req, struct g_lock_lock_state);
1128         struct g_lock_ctx *ctx = state->ctx;
1129         NTSTATUS status;
1130
1131         if (tevent_req_is_nterror(req, &status)) {
1132                 if (NT_STATUS_EQUAL(status, NT_STATUS_WAS_UNLOCKED)) {
1133                         return NT_STATUS_OK;
1134                 }
1135                 return status;
1136         }
1137
1138         if ((ctx->lock_order != DBWRAP_LOCK_ORDER_NONE) &&
1139             ((state->type == G_LOCK_READ) ||
1140              (state->type == G_LOCK_WRITE))) {
1141                 const char *name = dbwrap_name(ctx->db);
1142                 dbwrap_lock_order_lock(name, ctx->lock_order);
1143         }
1144
1145         return NT_STATUS_OK;
1146 }
1147
1148 struct g_lock_lock_simple_state {
1149         struct g_lock_ctx *ctx;
1150         struct server_id me;
1151         enum g_lock_type type;
1152         NTSTATUS status;
1153         g_lock_lock_cb_fn_t cb_fn;
1154         void *cb_private;
1155 };
1156
1157 static void g_lock_lock_simple_fn(
1158         struct db_record *rec,
1159         TDB_DATA value,
1160         void *private_data)
1161 {
1162         struct g_lock_lock_simple_state *state = private_data;
1163         struct server_id_buf buf;
1164         struct g_lock lck = { .exclusive.pid = 0 };
1165         struct g_lock_lock_cb_state cb_state = {
1166                 .ctx = state->ctx,
1167                 .rec = rec,
1168                 .lck = &lck,
1169                 .cb_fn = state->cb_fn,
1170                 .cb_private = state->cb_private,
1171                 .existed = value.dsize != 0,
1172                 .update_mem_ctx = talloc_tos(),
1173         };
1174         bool ok;
1175
1176         ok = g_lock_parse(value.dptr, value.dsize, &lck);
1177         if (!ok) {
1178                 DBG_DEBUG("g_lock_parse failed\n");
1179                 state->status = NT_STATUS_INTERNAL_DB_CORRUPTION;
1180                 return;
1181         }
1182
1183         if (lck.exclusive.pid != 0) {
1184                 DBG_DEBUG("locked by %s\n",
1185                           server_id_str_buf(lck.exclusive, &buf));
1186                 goto not_granted;
1187         }
1188
1189         if (state->type == G_LOCK_WRITE) {
1190                 if (lck.num_shared != 0) {
1191                         DBG_DEBUG("num_shared=%zu\n", lck.num_shared);
1192                         goto not_granted;
1193                 }
1194                 lck.exclusive = state->me;
1195         } else if (state->type == G_LOCK_READ) {
1196                 g_lock_cleanup_shared(&lck);
1197                 cb_state.new_shared = &state->me;
1198         } else {
1199                 smb_panic(__location__);
1200         }
1201
1202         lck.unique_lock_epoch = generate_unique_u64(lck.unique_lock_epoch);
1203
1204         /*
1205          * We are going to store us as owner,
1206          * so we got what we were waiting for.
1207          *
1208          * So we no longer need to monitor the
1209          * record.
1210          */
1211         dbwrap_watched_watch_skip_alerting(rec);
1212
1213         state->status = g_lock_lock_cb_run_and_store(&cb_state);
1214         if (!NT_STATUS_IS_OK(state->status) &&
1215             !NT_STATUS_EQUAL(state->status, NT_STATUS_WAS_UNLOCKED))
1216         {
1217                 DBG_WARNING("g_lock_lock_cb_run_and_store() failed: %s\n",
1218                             nt_errstr(state->status));
1219                 return;
1220         }
1221
1222         return;
1223
1224 not_granted:
1225         state->status = NT_STATUS_LOCK_NOT_GRANTED;
1226 }
1227
1228 NTSTATUS g_lock_lock(struct g_lock_ctx *ctx, TDB_DATA key,
1229                      enum g_lock_type type, struct timeval timeout,
1230                      g_lock_lock_cb_fn_t cb_fn,
1231                      void *cb_private)
1232 {
1233         TALLOC_CTX *frame;
1234         struct tevent_context *ev;
1235         struct tevent_req *req;
1236         struct timeval end;
1237         NTSTATUS status;
1238
1239         SMB_ASSERT(!ctx->busy);
1240
1241         /*
1242          * We allow a cb_fn only for G_LOCK_WRITE for now.
1243          *
1244          * It's all we currently need and it makes a few things
1245          * easier to implement.
1246          */
1247         if (unlikely(cb_fn != NULL && type != G_LOCK_WRITE)) {
1248                 return NT_STATUS_INVALID_PARAMETER_5;
1249         }
1250
1251         if ((type == G_LOCK_READ) || (type == G_LOCK_WRITE)) {
1252                 /*
1253                  * This is an abstraction violation: Normally we do
1254                  * the sync wrappers around async functions with full
1255                  * nested event contexts. However, this is used in
1256                  * very hot code paths, so avoid the event context
1257                  * creation for the good path where there's no lock
1258                  * contention. My benchmark gave a factor of 2
1259                  * improvement for lock/unlock.
1260                  */
1261                 struct g_lock_lock_simple_state state = {
1262                         .ctx = ctx,
1263                         .me = messaging_server_id(ctx->msg),
1264                         .type = type,
1265                         .cb_fn = cb_fn,
1266                         .cb_private = cb_private,
1267                 };
1268                 status = dbwrap_do_locked(
1269                         ctx->db, key, g_lock_lock_simple_fn, &state);
1270                 if (!NT_STATUS_IS_OK(status)) {
1271                         DBG_DEBUG("dbwrap_do_locked() failed: %s\n",
1272                                   nt_errstr(status));
1273                         return status;
1274                 }
1275
1276                 DBG_DEBUG("status=%s, state.status=%s\n",
1277                           nt_errstr(status),
1278                           nt_errstr(state.status));
1279
1280                 if (NT_STATUS_IS_OK(state.status)) {
1281                         if (ctx->lock_order != DBWRAP_LOCK_ORDER_NONE) {
1282                                 const char *name = dbwrap_name(ctx->db);
1283                                 dbwrap_lock_order_lock(name, ctx->lock_order);
1284                         }
1285                         return NT_STATUS_OK;
1286                 }
1287                 if (NT_STATUS_EQUAL(state.status, NT_STATUS_WAS_UNLOCKED)) {
1288                         /* without dbwrap_lock_order_lock() */
1289                         return NT_STATUS_OK;
1290                 }
1291                 if (!NT_STATUS_EQUAL(
1292                             state.status, NT_STATUS_LOCK_NOT_GRANTED)) {
1293                         return state.status;
1294                 }
1295
1296                 if (timeval_is_zero(&timeout)) {
1297                         return NT_STATUS_LOCK_NOT_GRANTED;
1298                 }
1299
1300                 /*
1301                  * Fall back to the full g_lock_trylock logic,
1302                  * g_lock_lock_simple_fn() called above only covers
1303                  * the uncontended path.
1304                  */
1305         }
1306
1307         frame = talloc_stackframe();
1308         status = NT_STATUS_NO_MEMORY;
1309
1310         ev = samba_tevent_context_init(frame);
1311         if (ev == NULL) {
1312                 goto fail;
1313         }
1314         req = g_lock_lock_send(frame, ev, ctx, key, type, cb_fn, cb_private);
1315         if (req == NULL) {
1316                 goto fail;
1317         }
1318         end = timeval_current_ofs(timeout.tv_sec, timeout.tv_usec);
1319         if (!tevent_req_set_endtime(req, ev, end)) {
1320                 goto fail;
1321         }
1322         if (!tevent_req_poll_ntstatus(req, ev, &status)) {
1323                 goto fail;
1324         }
1325         status = g_lock_lock_recv(req);
1326  fail:
1327         TALLOC_FREE(frame);
1328         return status;
1329 }
1330
1331 struct g_lock_unlock_state {
1332         struct server_id self;
1333         NTSTATUS status;
1334 };
1335
1336 static void g_lock_unlock_fn(
1337         struct db_record *rec,
1338         TDB_DATA value,
1339         void *private_data)
1340 {
1341         struct g_lock_unlock_state *state = private_data;
1342         struct server_id_buf tmp1, tmp2;
1343         struct g_lock lck;
1344         size_t i;
1345         bool ok, exclusive;
1346
1347         ok = g_lock_parse(value.dptr, value.dsize, &lck);
1348         if (!ok) {
1349                 DBG_DEBUG("g_lock_parse() failed\n");
1350                 state->status = NT_STATUS_INTERNAL_DB_CORRUPTION;
1351                 return;
1352         }
1353
1354         exclusive = server_id_equal(&state->self, &lck.exclusive);
1355
1356         for (i=0; i<lck.num_shared; i++) {
1357                 struct server_id shared;
1358                 g_lock_get_shared(&lck, i, &shared);
1359                 if (server_id_equal(&state->self, &shared)) {
1360                         break;
1361                 }
1362         }
1363
1364         if (i < lck.num_shared) {
1365                 if (exclusive) {
1366                         DBG_DEBUG("%s both exclusive and shared (%zu)\n",
1367                                   server_id_str_buf(state->self, &tmp1),
1368                                   i);
1369                         state->status = NT_STATUS_INTERNAL_DB_CORRUPTION;
1370                         return;
1371                 }
1372                 g_lock_del_shared(&lck, i);
1373         } else {
1374                 if (!exclusive) {
1375                         DBG_DEBUG("Lock not found, self=%s, lck.exclusive=%s, "
1376                                   "num_shared=%zu\n",
1377                                   server_id_str_buf(state->self, &tmp1),
1378                                   server_id_str_buf(lck.exclusive, &tmp2),
1379                                   lck.num_shared);
1380                         state->status = NT_STATUS_NOT_FOUND;
1381                         return;
1382                 }
1383                 lck.exclusive = (struct server_id) { .pid = 0 };
1384         }
1385
1386         if ((lck.exclusive.pid == 0) &&
1387             (lck.num_shared == 0) &&
1388             (lck.datalen == 0)) {
1389                 state->status = dbwrap_record_delete(rec);
1390                 return;
1391         }
1392
1393         if (!exclusive && lck.exclusive.pid != 0) {
1394                 /*
1395                  * We only had a read lock and there's
1396                  * someone waiting for an exclusive lock.
1397                  *
1398                  * Don't alert the exclusive lock waiter
1399                  * if there are still other read lock holders.
1400                  */
1401                 g_lock_cleanup_shared(&lck);
1402                 if (lck.num_shared != 0) {
1403                         dbwrap_watched_watch_skip_alerting(rec);
1404                 }
1405         }
1406
1407         lck.unique_lock_epoch = generate_unique_u64(lck.unique_lock_epoch);
1408
1409         state->status = g_lock_store(rec, &lck, NULL, NULL, 0);
1410 }
1411
1412 NTSTATUS g_lock_unlock(struct g_lock_ctx *ctx, TDB_DATA key)
1413 {
1414         struct g_lock_unlock_state state = {
1415                 .self = messaging_server_id(ctx->msg),
1416         };
1417         NTSTATUS status;
1418
1419         SMB_ASSERT(!ctx->busy);
1420
1421         status = dbwrap_do_locked(ctx->db, key, g_lock_unlock_fn, &state);
1422         if (!NT_STATUS_IS_OK(status)) {
1423                 DBG_WARNING("dbwrap_do_locked failed: %s\n",
1424                             nt_errstr(status));
1425                 return status;
1426         }
1427         if (!NT_STATUS_IS_OK(state.status)) {
1428                 DBG_WARNING("g_lock_unlock_fn failed: %s\n",
1429                             nt_errstr(state.status));
1430                 return state.status;
1431         }
1432
1433         if (ctx->lock_order != DBWRAP_LOCK_ORDER_NONE) {
1434                 const char *name = dbwrap_name(ctx->db);
1435                 dbwrap_lock_order_unlock(name, ctx->lock_order);
1436         }
1437
1438         return NT_STATUS_OK;
1439 }
1440
1441 struct g_lock_writev_data_state {
1442         TDB_DATA key;
1443         struct server_id self;
1444         const TDB_DATA *dbufs;
1445         size_t num_dbufs;
1446         NTSTATUS status;
1447 };
1448
1449 static void g_lock_writev_data_fn(
1450         struct db_record *rec,
1451         TDB_DATA value,
1452         void *private_data)
1453 {
1454         struct g_lock_writev_data_state *state = private_data;
1455         struct g_lock lck;
1456         bool exclusive;
1457         bool ok;
1458
1459         /*
1460          * We're holding an exclusive write lock.
1461          *
1462          * Now we're updating the content of the record.
1463          *
1464          * We should not wakeup any other waiters, all they
1465          * would find is that we're still holding a lock they
1466          * are conflicting with.
1467          */
1468         dbwrap_watched_watch_skip_alerting(rec);
1469
1470         ok = g_lock_parse(value.dptr, value.dsize, &lck);
1471         if (!ok) {
1472                 DBG_DEBUG("g_lock_parse for %s failed\n",
1473                           tdb_data_dbg(state->key));
1474                 state->status = NT_STATUS_INTERNAL_DB_CORRUPTION;
1475                 return;
1476         }
1477
1478         exclusive = server_id_equal(&state->self, &lck.exclusive);
1479
1480         /*
1481          * Make sure we're really exclusive. We are marked as
1482          * exclusive when we are waiting for an exclusive lock
1483          */
1484         exclusive &= (lck.num_shared == 0);
1485
1486         if (!exclusive) {
1487                 struct server_id_buf buf1, buf2;
1488                 DBG_DEBUG("Not locked by us: self=%s, lck.exclusive=%s, "
1489                           "lck.num_shared=%zu\n",
1490                           server_id_str_buf(state->self, &buf1),
1491                           server_id_str_buf(lck.exclusive, &buf2),
1492                           lck.num_shared);
1493                 state->status = NT_STATUS_NOT_LOCKED;
1494                 return;
1495         }
1496
1497         lck.unique_data_epoch = generate_unique_u64(lck.unique_data_epoch);
1498         lck.data = NULL;
1499         lck.datalen = 0;
1500         state->status = g_lock_store(
1501                 rec, &lck, NULL, state->dbufs, state->num_dbufs);
1502 }
1503
1504 NTSTATUS g_lock_writev_data(
1505         struct g_lock_ctx *ctx,
1506         TDB_DATA key,
1507         const TDB_DATA *dbufs,
1508         size_t num_dbufs)
1509 {
1510         struct g_lock_writev_data_state state = {
1511                 .key = key,
1512                 .self = messaging_server_id(ctx->msg),
1513                 .dbufs = dbufs,
1514                 .num_dbufs = num_dbufs,
1515         };
1516         NTSTATUS status;
1517
1518         SMB_ASSERT(!ctx->busy);
1519
1520         status = dbwrap_do_locked(
1521                 ctx->db, key, g_lock_writev_data_fn, &state);
1522         if (!NT_STATUS_IS_OK(status)) {
1523                 DBG_WARNING("dbwrap_do_locked failed: %s\n",
1524                             nt_errstr(status));
1525                 return status;
1526         }
1527         if (!NT_STATUS_IS_OK(state.status)) {
1528                 DBG_WARNING("g_lock_writev_data_fn failed: %s\n",
1529                             nt_errstr(state.status));
1530                 return state.status;
1531         }
1532
1533         return NT_STATUS_OK;
1534 }
1535
1536 NTSTATUS g_lock_write_data(struct g_lock_ctx *ctx, TDB_DATA key,
1537                            const uint8_t *buf, size_t buflen)
1538 {
1539         TDB_DATA dbuf = {
1540                 .dptr = discard_const_p(uint8_t, buf),
1541                 .dsize = buflen,
1542         };
1543         return g_lock_writev_data(ctx, key, &dbuf, 1);
1544 }
1545
1546 struct g_lock_locks_state {
1547         int (*fn)(TDB_DATA key, void *private_data);
1548         void *private_data;
1549 };
1550
1551 static int g_lock_locks_fn(struct db_record *rec, void *priv)
1552 {
1553         TDB_DATA key;
1554         struct g_lock_locks_state *state = (struct g_lock_locks_state *)priv;
1555
1556         key = dbwrap_record_get_key(rec);
1557         return state->fn(key, state->private_data);
1558 }
1559
1560 int g_lock_locks(struct g_lock_ctx *ctx,
1561                  int (*fn)(TDB_DATA key, void *private_data),
1562                  void *private_data)
1563 {
1564         struct g_lock_locks_state state;
1565         NTSTATUS status;
1566         int count;
1567
1568         SMB_ASSERT(!ctx->busy);
1569
1570         state.fn = fn;
1571         state.private_data = private_data;
1572
1573         status = dbwrap_traverse_read(ctx->db, g_lock_locks_fn, &state, &count);
1574         if (!NT_STATUS_IS_OK(status)) {
1575                 return -1;
1576         }
1577         return count;
1578 }
1579
1580 struct g_lock_dump_state {
1581         TALLOC_CTX *mem_ctx;
1582         TDB_DATA key;
1583         void (*fn)(struct server_id exclusive,
1584                    size_t num_shared,
1585                    const struct server_id *shared,
1586                    const uint8_t *data,
1587                    size_t datalen,
1588                    void *private_data);
1589         void *private_data;
1590         NTSTATUS status;
1591         enum dbwrap_req_state req_state;
1592 };
1593
1594 static void g_lock_dump_fn(TDB_DATA key, TDB_DATA data,
1595                            void *private_data)
1596 {
1597         struct g_lock_dump_state *state = private_data;
1598         struct g_lock lck = (struct g_lock) { .exclusive.pid = 0 };
1599         struct server_id *shared = NULL;
1600         size_t i;
1601         bool ok;
1602
1603         ok = g_lock_parse(data.dptr, data.dsize, &lck);
1604         if (!ok) {
1605                 DBG_DEBUG("g_lock_parse failed for %s\n",
1606                           tdb_data_dbg(state->key));
1607                 state->status = NT_STATUS_INTERNAL_DB_CORRUPTION;
1608                 return;
1609         }
1610
1611         if (lck.num_shared > 0) {
1612                 shared = talloc_array(
1613                         state->mem_ctx, struct server_id, lck.num_shared);
1614                 if (shared == NULL) {
1615                         DBG_DEBUG("talloc failed\n");
1616                         state->status = NT_STATUS_NO_MEMORY;
1617                         return;
1618                 }
1619         }
1620
1621         for (i=0; i<lck.num_shared; i++) {
1622                 g_lock_get_shared(&lck, i, &shared[i]);
1623         }
1624
1625         state->fn(lck.exclusive,
1626                   lck.num_shared,
1627                   shared,
1628                   lck.data,
1629                   lck.datalen,
1630                   state->private_data);
1631
1632         TALLOC_FREE(shared);
1633
1634         state->status = NT_STATUS_OK;
1635 }
1636
1637 NTSTATUS g_lock_dump(struct g_lock_ctx *ctx, TDB_DATA key,
1638                      void (*fn)(struct server_id exclusive,
1639                                 size_t num_shared,
1640                                 const struct server_id *shared,
1641                                 const uint8_t *data,
1642                                 size_t datalen,
1643                                 void *private_data),
1644                      void *private_data)
1645 {
1646         struct g_lock_dump_state state = {
1647                 .mem_ctx = ctx, .key = key,
1648                 .fn = fn, .private_data = private_data
1649         };
1650         NTSTATUS status;
1651
1652         SMB_ASSERT(!ctx->busy);
1653
1654         status = dbwrap_parse_record(ctx->db, key, g_lock_dump_fn, &state);
1655         if (!NT_STATUS_IS_OK(status)) {
1656                 DBG_DEBUG("dbwrap_parse_record returned %s\n",
1657                           nt_errstr(status));
1658                 return status;
1659         }
1660         if (!NT_STATUS_IS_OK(state.status)) {
1661                 DBG_DEBUG("g_lock_dump_fn returned %s\n",
1662                           nt_errstr(state.status));
1663                 return state.status;
1664         }
1665         return NT_STATUS_OK;
1666 }
1667
1668 static void g_lock_dump_done(struct tevent_req *subreq);
1669
1670 struct tevent_req *g_lock_dump_send(
1671         TALLOC_CTX *mem_ctx,
1672         struct tevent_context *ev,
1673         struct g_lock_ctx *ctx,
1674         TDB_DATA key,
1675         void (*fn)(struct server_id exclusive,
1676                    size_t num_shared,
1677                    const struct server_id *shared,
1678                    const uint8_t *data,
1679                    size_t datalen,
1680                    void *private_data),
1681         void *private_data)
1682 {
1683         struct tevent_req *req = NULL, *subreq = NULL;
1684         struct g_lock_dump_state *state = NULL;
1685
1686         SMB_ASSERT(!ctx->busy);
1687
1688         req = tevent_req_create(mem_ctx, &state, struct g_lock_dump_state);
1689         if (req == NULL) {
1690                 return NULL;
1691         }
1692         state->mem_ctx = state;
1693         state->key = key;
1694         state->fn = fn;
1695         state->private_data = private_data;
1696
1697         SMB_ASSERT(!ctx->busy);
1698
1699         subreq = dbwrap_parse_record_send(
1700                 state,
1701                 ev,
1702                 ctx->db,
1703                 key,
1704                 g_lock_dump_fn,
1705                 state,
1706                 &state->req_state);
1707         if (tevent_req_nomem(subreq, req)) {
1708                 return tevent_req_post(req, ev);
1709         }
1710         tevent_req_set_callback(subreq, g_lock_dump_done, req);
1711         return req;
1712 }
1713
1714 static void g_lock_dump_done(struct tevent_req *subreq)
1715 {
1716         struct tevent_req *req = tevent_req_callback_data(
1717                 subreq, struct tevent_req);
1718         struct g_lock_dump_state *state = tevent_req_data(
1719                 req, struct g_lock_dump_state);
1720         NTSTATUS status;
1721
1722         status = dbwrap_parse_record_recv(subreq);
1723         TALLOC_FREE(subreq);
1724         if (tevent_req_nterror(req, status) ||
1725             tevent_req_nterror(req, state->status)) {
1726                 return;
1727         }
1728         tevent_req_done(req);
1729 }
1730
1731 NTSTATUS g_lock_dump_recv(struct tevent_req *req)
1732 {
1733         return tevent_req_simple_recv_ntstatus(req);
1734 }
1735
1736 int g_lock_seqnum(struct g_lock_ctx *ctx)
1737 {
1738         return dbwrap_get_seqnum(ctx->db);
1739 }
1740
1741 struct g_lock_watch_data_state {
1742         struct tevent_context *ev;
1743         struct g_lock_ctx *ctx;
1744         TDB_DATA key;
1745         struct server_id blocker;
1746         bool blockerdead;
1747         uint64_t unique_lock_epoch;
1748         uint64_t unique_data_epoch;
1749         uint64_t watch_instance;
1750         NTSTATUS status;
1751 };
1752
1753 static void g_lock_watch_data_done(struct tevent_req *subreq);
1754
1755 static void g_lock_watch_data_send_fn(
1756         struct db_record *rec,
1757         TDB_DATA value,
1758         void *private_data)
1759 {
1760         struct tevent_req *req = talloc_get_type_abort(
1761                 private_data, struct tevent_req);
1762         struct g_lock_watch_data_state *state = tevent_req_data(
1763                 req, struct g_lock_watch_data_state);
1764         struct tevent_req *subreq = NULL;
1765         struct g_lock lck;
1766         bool ok;
1767
1768         ok = g_lock_parse(value.dptr, value.dsize, &lck);
1769         if (!ok) {
1770                 state->status = NT_STATUS_INTERNAL_DB_CORRUPTION;
1771                 return;
1772         }
1773         state->unique_lock_epoch = lck.unique_lock_epoch;
1774         state->unique_data_epoch = lck.unique_data_epoch;
1775
1776         DBG_DEBUG("state->unique_data_epoch=%"PRIu64"\n", state->unique_data_epoch);
1777
1778         subreq = dbwrap_watched_watch_send(
1779                 state, state->ev, rec, 0, state->blocker);
1780         if (subreq == NULL) {
1781                 state->status = NT_STATUS_NO_MEMORY;
1782                 return;
1783         }
1784         tevent_req_set_callback(subreq, g_lock_watch_data_done, req);
1785
1786         state->status = NT_STATUS_EVENT_PENDING;
1787 }
1788
1789 struct tevent_req *g_lock_watch_data_send(
1790         TALLOC_CTX *mem_ctx,
1791         struct tevent_context *ev,
1792         struct g_lock_ctx *ctx,
1793         TDB_DATA key,
1794         struct server_id blocker)
1795 {
1796         struct tevent_req *req = NULL;
1797         struct g_lock_watch_data_state *state = NULL;
1798         NTSTATUS status;
1799
1800         SMB_ASSERT(!ctx->busy);
1801
1802         req = tevent_req_create(
1803                 mem_ctx, &state, struct g_lock_watch_data_state);
1804         if (req == NULL) {
1805                 return NULL;
1806         }
1807         state->ev = ev;
1808         state->ctx = ctx;
1809         state->blocker = blocker;
1810
1811         state->key = tdb_data_talloc_copy(state, key);
1812         if (tevent_req_nomem(state->key.dptr, req)) {
1813                 return tevent_req_post(req, ev);
1814         }
1815
1816         status = dbwrap_do_locked(
1817                 ctx->db, key, g_lock_watch_data_send_fn, req);
1818         if (tevent_req_nterror(req, status)) {
1819                 DBG_DEBUG("dbwrap_do_locked returned %s\n", nt_errstr(status));
1820                 return tevent_req_post(req, ev);
1821         }
1822
1823         if (NT_STATUS_EQUAL(state->status, NT_STATUS_EVENT_PENDING)) {
1824                 return req;
1825         }
1826         if (tevent_req_nterror(req, state->status)) {
1827                 return tevent_req_post(req, ev);
1828         }
1829         tevent_req_done(req);
1830         return tevent_req_post(req, ev);
1831 }
1832
1833 static void g_lock_watch_data_done_fn(
1834         struct db_record *rec,
1835         TDB_DATA value,
1836         void *private_data)
1837 {
1838         struct tevent_req *req = talloc_get_type_abort(
1839                 private_data, struct tevent_req);
1840         struct g_lock_watch_data_state *state = tevent_req_data(
1841                 req, struct g_lock_watch_data_state);
1842         struct tevent_req *subreq = NULL;
1843         struct g_lock lck;
1844         bool ok;
1845
1846         ok = g_lock_parse(value.dptr, value.dsize, &lck);
1847         if (!ok) {
1848                 dbwrap_watched_watch_remove_instance(rec, state->watch_instance);
1849                 state->status = NT_STATUS_INTERNAL_DB_CORRUPTION;
1850                 return;
1851         }
1852
1853         if (lck.unique_data_epoch != state->unique_data_epoch) {
1854                 dbwrap_watched_watch_remove_instance(rec, state->watch_instance);
1855                 DBG_DEBUG("lck.unique_data_epoch=%"PRIu64", "
1856                           "state->unique_data_epoch=%"PRIu64"\n",
1857                           lck.unique_data_epoch,
1858                           state->unique_data_epoch);
1859                 state->status = NT_STATUS_OK;
1860                 return;
1861         }
1862
1863         /*
1864          * The lock epoch changed, so we better
1865          * remove ourself from the waiter list
1866          * (most likely the first position)
1867          * and re-add us at the end of the list.
1868          *
1869          * This gives other lock waiters a change
1870          * to make progress.
1871          *
1872          * Otherwise we'll keep our waiter instance alive,
1873          * keep waiting (most likely at first position).
1874          */
1875         if (lck.unique_lock_epoch != state->unique_lock_epoch) {
1876                 dbwrap_watched_watch_remove_instance(rec, state->watch_instance);
1877                 state->watch_instance = dbwrap_watched_watch_add_instance(rec);
1878                 state->unique_lock_epoch = lck.unique_lock_epoch;
1879         }
1880
1881         subreq = dbwrap_watched_watch_send(
1882                 state, state->ev, rec, state->watch_instance, state->blocker);
1883         if (subreq == NULL) {
1884                 dbwrap_watched_watch_remove_instance(rec, state->watch_instance);
1885                 state->status = NT_STATUS_NO_MEMORY;
1886                 return;
1887         }
1888         tevent_req_set_callback(subreq, g_lock_watch_data_done, req);
1889
1890         state->status = NT_STATUS_EVENT_PENDING;
1891 }
1892
1893 static void g_lock_watch_data_done(struct tevent_req *subreq)
1894 {
1895         struct tevent_req *req = tevent_req_callback_data(
1896                 subreq, struct tevent_req);
1897         struct g_lock_watch_data_state *state = tevent_req_data(
1898                 req, struct g_lock_watch_data_state);
1899         NTSTATUS status;
1900         uint64_t instance = 0;
1901
1902         status = dbwrap_watched_watch_recv(
1903                 subreq, &instance, &state->blockerdead, &state->blocker);
1904         TALLOC_FREE(subreq);
1905         if (tevent_req_nterror(req, status)) {
1906                 DBG_DEBUG("dbwrap_watched_watch_recv returned %s\n",
1907                           nt_errstr(status));
1908                 return;
1909         }
1910
1911         state->watch_instance = instance;
1912
1913         status = dbwrap_do_locked(
1914                 state->ctx->db, state->key, g_lock_watch_data_done_fn, req);
1915         if (tevent_req_nterror(req, status)) {
1916                 DBG_DEBUG("dbwrap_do_locked returned %s\n", nt_errstr(status));
1917                 return;
1918         }
1919         if (NT_STATUS_EQUAL(state->status, NT_STATUS_EVENT_PENDING)) {
1920                 return;
1921         }
1922         if (tevent_req_nterror(req, state->status)) {
1923                 return;
1924         }
1925         tevent_req_done(req);
1926 }
1927
1928 NTSTATUS g_lock_watch_data_recv(
1929         struct tevent_req *req,
1930         bool *blockerdead,
1931         struct server_id *blocker)
1932 {
1933         struct g_lock_watch_data_state *state = tevent_req_data(
1934                 req, struct g_lock_watch_data_state);
1935         NTSTATUS status;
1936
1937         if (tevent_req_is_nterror(req, &status)) {
1938                 return status;
1939         }
1940         if (blockerdead != NULL) {
1941                 *blockerdead = state->blockerdead;
1942         }
1943         if (blocker != NULL) {
1944                 *blocker = state->blocker;
1945         }
1946
1947         return NT_STATUS_OK;
1948 }
1949
1950 static void g_lock_wake_watchers_fn(
1951         struct db_record *rec,
1952         TDB_DATA value,
1953         void *private_data)
1954 {
1955         struct g_lock lck = { .exclusive.pid = 0 };
1956         NTSTATUS status;
1957         bool ok;
1958
1959         ok = g_lock_parse(value.dptr, value.dsize, &lck);
1960         if (!ok) {
1961                 DBG_WARNING("g_lock_parse failed\n");
1962                 return;
1963         }
1964
1965         lck.unique_data_epoch = generate_unique_u64(lck.unique_data_epoch);
1966
1967         status = g_lock_store(rec, &lck, NULL, NULL, 0);
1968         if (!NT_STATUS_IS_OK(status)) {
1969                 DBG_WARNING("g_lock_store failed: %s\n", nt_errstr(status));
1970                 return;
1971         }
1972 }
1973
1974 void g_lock_wake_watchers(struct g_lock_ctx *ctx, TDB_DATA key)
1975 {
1976         NTSTATUS status;
1977
1978         SMB_ASSERT(!ctx->busy);
1979
1980         status = dbwrap_do_locked(ctx->db, key, g_lock_wake_watchers_fn, NULL);
1981         if (!NT_STATUS_IS_OK(status)) {
1982                 DBG_DEBUG("dbwrap_do_locked returned %s\n",
1983                           nt_errstr(status));
1984         }
1985 }