/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * dlmmaster.c
 *
 * standalone DLM module
 *
 * Copyright (C) 2004 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/utsname.h>
#include <linux/init.h>
#include <linux/sysctl.h>
#include <linux/random.h>
#include <linux/blkdev.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/spinlock.h>
#include <linux/delay.h>


#include "cluster/heartbeat.h"
#include "cluster/nodemanager.h"
#include "cluster/tcp.h"

#include "dlmcommon.h"
#include "dlmdomain.h"

#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
#include "cluster/masklog.h"
enum dlm_mle_type {
	DLM_MLE_BLOCK,
	DLM_MLE_MASTER,
	DLM_MLE_MIGRATION
};

struct dlm_lock_name
{
	u8 len;
	u8 name[DLM_LOCKID_NAME_MAX];
};

struct dlm_master_list_entry
{
	struct list_head list;
	struct list_head hb_events;
	struct dlm_ctxt *dlm;
	spinlock_t spinlock;
	wait_queue_head_t wq;
	atomic_t woken;
	struct kref mle_refs;
	unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
	unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
	unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
	unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
	u8 master;
	u8 new_master;
	enum dlm_mle_type type;
	struct o2hb_callback_func mle_hb_up;
	struct o2hb_callback_func mle_hb_down;
	union {
		struct dlm_lock_resource *res;
		struct dlm_lock_name name;
	} u;
};
static void dlm_mle_node_down(struct dlm_ctxt *dlm,
			      struct dlm_master_list_entry *mle,
			      struct o2nm_node *node,
			      int idx);
static void dlm_mle_node_up(struct dlm_ctxt *dlm,
			    struct dlm_master_list_entry *mle,
			    struct o2nm_node *node,
			    int idx);

static void dlm_assert_master_worker(struct dlm_work_item *item, void *data);
static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
				unsigned int namelen, void *nodemap,
				u32 flags);
static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
				struct dlm_master_list_entry *mle,
				const char *name,
				unsigned int namelen)
{
	struct dlm_lock_resource *res;

	if (dlm != mle->dlm)
		return 0;

	if (mle->type == DLM_MLE_BLOCK ||
	    mle->type == DLM_MLE_MIGRATION) {
		if (namelen != mle->u.name.len ||
		    memcmp(name, mle->u.name.name, namelen) != 0)
			return 0;
	} else {
		res = mle->u.res;
		if (namelen != res->lockname.len ||
		    memcmp(res->lockname.name, name, namelen) != 0)
			return 0;
	}
	return 1;
}
#if 0
/* Code here is included but defined out as it aids debugging */

void dlm_print_one_mle(struct dlm_master_list_entry *mle)
{
	int i = 0, refs;
	char *type;
	char attached;
	u8 master;
	unsigned int namelen;
	const char *name;
	struct kref *k;

	k = &mle->mle_refs;
	if (mle->type == DLM_MLE_BLOCK)
		type = "BLK";
	else if (mle->type == DLM_MLE_MASTER)
		type = "MAS";
	else
		type = "MIG";
	refs = atomic_read(&k->refcount);
	master = mle->master;
	attached = (list_empty(&mle->hb_events) ? 'N' : 'Y');

	if (mle->type != DLM_MLE_MASTER) {
		namelen = mle->u.name.len;
		name = mle->u.name.name;
	} else {
		namelen = mle->u.res->lockname.len;
		name = mle->u.res->lockname.name;
	}

	mlog(ML_NOTICE, "  #%3d: %3s  %3d  %3u  %3u  %c    (%d)%.*s\n",
	     i, type, refs, master, mle->new_master, attached,
	     namelen, namelen, name);
}

static void dlm_dump_mles(struct dlm_ctxt *dlm)
{
	struct dlm_master_list_entry *mle;
	struct list_head *iter;

	mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
	mlog(ML_NOTICE, "  ####: type refs owner new events? lockname nodemap votemap respmap maybemap\n");
	spin_lock(&dlm->master_lock);
	list_for_each(iter, &dlm->master_list) {
		mle = list_entry(iter, struct dlm_master_list_entry, list);
		dlm_print_one_mle(mle);
	}
	spin_unlock(&dlm->master_lock);
}

int dlm_dump_all_mles(const char __user *data, unsigned int len)
{
	struct list_head *iter;
	struct dlm_ctxt *dlm;

	spin_lock(&dlm_domain_lock);
	list_for_each(iter, &dlm_domains) {
		dlm = list_entry (iter, struct dlm_ctxt, list);
		mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
		dlm_dump_mles(dlm);
	}
	spin_unlock(&dlm_domain_lock);
	return len;
}
EXPORT_SYMBOL_GPL(dlm_dump_all_mles);

#endif  /*  0  */
static kmem_cache_t *dlm_mle_cache = NULL;
static void dlm_mle_release(struct kref *kref);
static void dlm_init_mle(struct dlm_master_list_entry *mle,
			enum dlm_mle_type type,
			struct dlm_ctxt *dlm,
			struct dlm_lock_resource *res,
			const char *name,
			unsigned int namelen);
static void dlm_put_mle(struct dlm_master_list_entry *mle);
static void __dlm_put_mle(struct dlm_master_list_entry *mle);
static int dlm_find_mle(struct dlm_ctxt *dlm,
			struct dlm_master_list_entry **mle,
			char *name, unsigned int namelen);

static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to);

static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
				     struct dlm_lock_resource *res,
				     struct dlm_master_list_entry *mle,
				     int *blocked);
static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
				    struct dlm_lock_resource *res,
				    struct dlm_master_list_entry *mle,
				    int blocked);
static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
				 struct dlm_lock_resource *res,
				 struct dlm_master_list_entry *mle,
				 struct dlm_master_list_entry **oldmle,
				 const char *name, unsigned int namelen,
				 u8 new_master, u8 master);

static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
				    struct dlm_lock_resource *res);
static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
				      struct dlm_lock_resource *res);
static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
				      struct dlm_lock_resource *res,
				      u8 target);
static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
				       struct dlm_lock_resource *res);
int dlm_is_host_down(int errno)
{
	switch (errno) {
		case -EBADF:
		case -ECONNREFUSED:
		case -ENOTCONN:
		case -ECONNRESET:
		case -EPIPE:
		case -EHOSTDOWN:
		case -EHOSTUNREACH:
		case -ETIMEDOUT:
		case -ECONNABORTED:
		case -ENETDOWN:
		case -ENETUNREACH:
		case -ENETRESET:
		case -ESHUTDOWN:
		case -ENOPROTOOPT:
		case -EINVAL:	/* if returned from our tcp code,
				   this means there is no socket */
			return 1;
	}
	return 0;
}
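
/* Illustrative only (not built): a sketch of how callers in this file
 * use dlm_is_host_down() to decide whether a message-send error means
 * the target node died (recoverable via heartbeat/recovery) or a local
 * programming error.  The helper below is hypothetical. */
#if 0
static void example_classify_net_error(int ret, int to)
{
	if (dlm_is_host_down(ret))
		mlog(0, "link to %d went down, let recovery handle it\n", to);
	else
		BUG();	/* local error: bad args, no socket setup, etc. */
}
#endif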
/*
 * MASTER LIST FUNCTIONS
 */


/*
 * regarding master list entries and heartbeat callbacks:
 *
 * in order to avoid sleeping and allocation that occurs in
 * heartbeat, master list entries are simply attached to the
 * dlm's established heartbeat callbacks.  the mle is attached
 * when it is created, and since the dlm->spinlock is held at
 * that time, any heartbeat event will be properly discovered
 * by the mle.  the mle needs to be detached from the
 * dlm->mle_hb_events list as soon as heartbeat events are no
 * longer useful to the mle, and before the mle is freed.
 *
 * as a general rule, heartbeat events are no longer needed by
 * the mle once an "answer" regarding the lock master has been
 * received.
 */
static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm,
					      struct dlm_master_list_entry *mle)
{
	assert_spin_locked(&dlm->spinlock);

	list_add_tail(&mle->hb_events, &dlm->mle_hb_events);
}
static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
					      struct dlm_master_list_entry *mle)
{
	if (!list_empty(&mle->hb_events))
		list_del_init(&mle->hb_events);
}
static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
					    struct dlm_master_list_entry *mle)
{
	spin_lock(&dlm->spinlock);
	__dlm_mle_detach_hb_events(dlm, mle);
	spin_unlock(&dlm->spinlock);
}
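
/* Illustrative only (not built): the mle/heartbeat lifecycle described
 * in the comment above, under the locking rules the helpers assert. */
#if 0
static void example_mle_hb_lifecycle(struct dlm_ctxt *dlm,
				     struct dlm_master_list_entry *mle)
{
	spin_lock(&dlm->spinlock);
	__dlm_mle_attach_hb_events(dlm, mle);	/* at mle creation */
	spin_unlock(&dlm->spinlock);

	/* ... mastery runs; node up/down events update mle->node_map ... */

	/* once an answer about the master is received, detach before
	 * the final put of the mle */
	dlm_mle_detach_hb_events(dlm, mle);
}
#endif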
/* remove from list and free */
static void __dlm_put_mle(struct dlm_master_list_entry *mle)
{
	struct dlm_ctxt *dlm;
	dlm = mle->dlm;

	assert_spin_locked(&dlm->spinlock);
	assert_spin_locked(&dlm->master_lock);
	BUG_ON(!atomic_read(&mle->mle_refs.refcount));

	kref_put(&mle->mle_refs, dlm_mle_release);
}
/* must not have any spinlocks coming in */
static void dlm_put_mle(struct dlm_master_list_entry *mle)
{
	struct dlm_ctxt *dlm;
	dlm = mle->dlm;

	spin_lock(&dlm->spinlock);
	spin_lock(&dlm->master_lock);
	__dlm_put_mle(mle);
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);
}
static inline void dlm_get_mle(struct dlm_master_list_entry *mle)
{
	kref_get(&mle->mle_refs);
}
static void dlm_init_mle(struct dlm_master_list_entry *mle,
			enum dlm_mle_type type,
			struct dlm_ctxt *dlm,
			struct dlm_lock_resource *res,
			const char *name,
			unsigned int namelen)
{
	assert_spin_locked(&dlm->spinlock);

	mle->dlm = dlm;
	mle->type = type;
	INIT_LIST_HEAD(&mle->list);
	INIT_LIST_HEAD(&mle->hb_events);
	memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
	spin_lock_init(&mle->spinlock);
	init_waitqueue_head(&mle->wq);
	atomic_set(&mle->woken, 0);
	kref_init(&mle->mle_refs);
	memset(mle->response_map, 0, sizeof(mle->response_map));
	mle->master = O2NM_MAX_NODES;
	mle->new_master = O2NM_MAX_NODES;

	if (mle->type == DLM_MLE_MASTER) {
		BUG_ON(!res);
		mle->u.res = res;
	} else if (mle->type == DLM_MLE_BLOCK) {
		BUG_ON(!name);
		memcpy(mle->u.name.name, name, namelen);
		mle->u.name.len = namelen;
	} else /* DLM_MLE_MIGRATION */ {
		BUG_ON(!name);
		memcpy(mle->u.name.name, name, namelen);
		mle->u.name.len = namelen;
	}

	/* copy off the node_map and register hb callbacks on our copy */
	memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
	memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
	clear_bit(dlm->node_num, mle->vote_map);
	clear_bit(dlm->node_num, mle->node_map);

	/* attach the mle to the domain node up/down events */
	__dlm_mle_attach_hb_events(dlm, mle);
}
/* returns 1 if found, 0 if not */
static int dlm_find_mle(struct dlm_ctxt *dlm,
			struct dlm_master_list_entry **mle,
			char *name, unsigned int namelen)
{
	struct dlm_master_list_entry *tmpmle;
	struct list_head *iter;

	assert_spin_locked(&dlm->master_lock);

	list_for_each(iter, &dlm->master_list) {
		tmpmle = list_entry(iter, struct dlm_master_list_entry, list);
		if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
			continue;
		dlm_get_mle(tmpmle);
		*mle = tmpmle;
		return 1;
	}
	return 0;
}
void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
{
	struct dlm_master_list_entry *mle;
	struct list_head *iter;

	assert_spin_locked(&dlm->spinlock);

	list_for_each(iter, &dlm->mle_hb_events) {
		mle = list_entry(iter, struct dlm_master_list_entry,
				 hb_events);
		if (node_up)
			dlm_mle_node_up(dlm, mle, NULL, idx);
		else
			dlm_mle_node_down(dlm, mle, NULL, idx);
	}
}
static void dlm_mle_node_down(struct dlm_ctxt *dlm,
			      struct dlm_master_list_entry *mle,
			      struct o2nm_node *node, int idx)
{
	spin_lock(&mle->spinlock);

	if (!test_bit(idx, mle->node_map))
		mlog(0, "node %u already removed from nodemap!\n", idx);
	else
		clear_bit(idx, mle->node_map);

	spin_unlock(&mle->spinlock);
}
static void dlm_mle_node_up(struct dlm_ctxt *dlm,
			    struct dlm_master_list_entry *mle,
			    struct o2nm_node *node, int idx)
{
	spin_lock(&mle->spinlock);

	if (test_bit(idx, mle->node_map))
		mlog(0, "node %u already in node map!\n", idx);
	else
		set_bit(idx, mle->node_map);

	spin_unlock(&mle->spinlock);
}
int dlm_init_mle_cache(void)
{
	dlm_mle_cache = kmem_cache_create("dlm_mle_cache",
					  sizeof(struct dlm_master_list_entry),
					  0, SLAB_HWCACHE_ALIGN,
					  NULL, NULL);
	if (dlm_mle_cache == NULL)
		return -ENOMEM;
	return 0;
}
void dlm_destroy_mle_cache(void)
{
	if (dlm_mle_cache)
		kmem_cache_destroy(dlm_mle_cache);
}
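
/* Illustrative only (not built): the slab cache is created once at
 * module init and torn down at exit; a sketch of the expected pairing.
 * The init/exit function names are hypothetical. */
#if 0
static int __init example_dlm_module_init(void)
{
	int status = dlm_init_mle_cache();
	if (status)
		return status;	/* -ENOMEM if the slab could not be made */
	/* ... register handlers, etc ... */
	return 0;
}

static void __exit example_dlm_module_exit(void)
{
	dlm_destroy_mle_cache();
}
#endif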
static void dlm_mle_release(struct kref *kref)
{
	struct dlm_master_list_entry *mle;
	struct dlm_ctxt *dlm;

	mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
	dlm = mle->dlm;

	if (mle->type != DLM_MLE_MASTER) {
		mlog(0, "calling mle_release for %.*s, type %d\n",
		     mle->u.name.len, mle->u.name.name, mle->type);
	} else {
		mlog(0, "calling mle_release for %.*s, type %d\n",
		     mle->u.res->lockname.len,
		     mle->u.res->lockname.name, mle->type);
	}
	assert_spin_locked(&dlm->spinlock);
	assert_spin_locked(&dlm->master_lock);

	/* remove from list if not already */
	if (!list_empty(&mle->list))
		list_del_init(&mle->list);

	/* detach the mle from the domain node up/down events */
	__dlm_mle_detach_hb_events(dlm, mle);

	/* NOTE: kfree under spinlock here.
	 * if this is bad, we can move this to a freelist. */
	kmem_cache_free(dlm_mle_cache, mle);
}
/*
 * LOCK RESOURCE FUNCTIONS
 */
static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
				  struct dlm_lock_resource *res,
				  u8 owner)
{
	assert_spin_locked(&res->spinlock);

	mlog_entry("%.*s, %u\n", res->lockname.len, res->lockname.name, owner);

	if (owner == dlm->node_num)
		atomic_inc(&dlm->local_resources);
	else if (owner == DLM_LOCK_RES_OWNER_UNKNOWN)
		atomic_inc(&dlm->unknown_resources);
	else
		atomic_inc(&dlm->remote_resources);

	res->owner = owner;
}
void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
			      struct dlm_lock_resource *res, u8 owner)
{
	assert_spin_locked(&res->spinlock);

	if (owner == res->owner)
		return;

	if (res->owner == dlm->node_num)
		atomic_dec(&dlm->local_resources);
	else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN)
		atomic_dec(&dlm->unknown_resources);
	else
		atomic_dec(&dlm->remote_resources);

	dlm_set_lockres_owner(dlm, res, owner);
}
static void dlm_lockres_release(struct kref *kref)
{
	struct dlm_lock_resource *res;

	res = container_of(kref, struct dlm_lock_resource, refs);

	/* This should not happen -- all lockres' have a name
	 * associated with them at init time. */
	BUG_ON(!res->lockname.name);

	mlog(0, "destroying lockres %.*s\n", res->lockname.len,
	     res->lockname.name);

	/* By the time we're ready to blow this guy away, we shouldn't
	 * be on any lists. */
	BUG_ON(!hlist_unhashed(&res->hash_node));
	BUG_ON(!list_empty(&res->granted));
	BUG_ON(!list_empty(&res->converting));
	BUG_ON(!list_empty(&res->blocked));
	BUG_ON(!list_empty(&res->dirty));
	BUG_ON(!list_empty(&res->recovering));
	BUG_ON(!list_empty(&res->purge));

	kfree(res->lockname.name);

	kfree(res);
}
void dlm_lockres_put(struct dlm_lock_resource *res)
{
	kref_put(&res->refs, dlm_lockres_release);
}
static void dlm_init_lockres(struct dlm_ctxt *dlm,
			     struct dlm_lock_resource *res,
			     const char *name, unsigned int namelen)
{
	char *qname;

	/* If we memset here, we lose our reference to the kmalloc'd
	 * res->lockname.name, so be sure to init every field
	 * correctly! */

	qname = (char *) res->lockname.name;
	memcpy(qname, name, namelen);

	res->lockname.len = namelen;
	res->lockname.hash = dlm_lockid_hash(name, namelen);

	init_waitqueue_head(&res->wq);
	spin_lock_init(&res->spinlock);
	INIT_HLIST_NODE(&res->hash_node);
	INIT_LIST_HEAD(&res->granted);
	INIT_LIST_HEAD(&res->converting);
	INIT_LIST_HEAD(&res->blocked);
	INIT_LIST_HEAD(&res->dirty);
	INIT_LIST_HEAD(&res->recovering);
	INIT_LIST_HEAD(&res->purge);
	atomic_set(&res->asts_reserved, 0);
	res->migration_pending = 0;

	kref_init(&res->refs);

	/* just for consistency */
	spin_lock(&res->spinlock);
	dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
	spin_unlock(&res->spinlock);

	res->state = DLM_LOCK_RES_IN_PROGRESS;

	memset(res->lvb, 0, DLM_LVB_LEN);
}
struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
					  const char *name,
					  unsigned int namelen)
{
	struct dlm_lock_resource *res;

	res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
	if (!res)
		return NULL;

	res->lockname.name = kmalloc(namelen, GFP_KERNEL);
	if (!res->lockname.name) {
		kfree(res);
		return NULL;
	}

	dlm_init_lockres(dlm, res, name, namelen);
	return res;
}
/*
 * lookup a lock resource by name.
 * may already exist in the hashtable.
 * lockid is null terminated
 *
 * if not, allocate enough for the lockres and for
 * the temporary structure used in doing the mastering.
 *
 * also, do a lookup in the dlm->master_list to see
 * if another node has begun mastering the same lock.
 * if so, there should be a block entry in there
 * for this name, and we should *not* attempt to master
 * the lock here.   need to wait around for that node
 * to assert_master (or die).
 *
 */
struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
						 const char *lockid,
						 int flags)
{
	struct dlm_lock_resource *tmpres=NULL, *res=NULL;
	struct dlm_master_list_entry *mle = NULL;
	struct dlm_master_list_entry *alloc_mle = NULL;
	int blocked = 0;
	int ret, nodenum;
	struct dlm_node_iter iter;
	unsigned int namelen, hash;
	int tries = 0;
	int bit, wait_on_recovery = 0;

	BUG_ON(!lockid);

	namelen = strlen(lockid);
	hash = dlm_lockid_hash(lockid, namelen);

	mlog(0, "get lockres %s (len %d)\n", lockid, namelen);

lookup:
	spin_lock(&dlm->spinlock);
	tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash);
	if (tmpres) {
		spin_unlock(&dlm->spinlock);
		mlog(0, "found in hash!\n");
		if (res)
			dlm_lockres_put(res);
		res = tmpres;
		goto leave;
	}

	if (!res) {
		spin_unlock(&dlm->spinlock);
		mlog(0, "allocating a new resource\n");
		/* nothing found and we need to allocate one. */
		alloc_mle = (struct dlm_master_list_entry *)
			kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
		if (!alloc_mle)
			goto leave;
		res = dlm_new_lockres(dlm, lockid, namelen);
		if (!res)
			goto leave;
		goto lookup;
	}

	mlog(0, "no lockres found, allocated our own: %p\n", res);

	if (flags & LKM_LOCAL) {
		/* caller knows it's safe to assume it's not mastered elsewhere
		 * DONE!  return right away */
		spin_lock(&res->spinlock);
		dlm_change_lockres_owner(dlm, res, dlm->node_num);
		__dlm_insert_lockres(dlm, res);
		spin_unlock(&res->spinlock);
		spin_unlock(&dlm->spinlock);
		/* lockres still marked IN_PROGRESS */
		goto wake_waiters;
	}

	/* check master list to see if another node has started mastering it */
	spin_lock(&dlm->master_lock);

	/* if we found a block, wait for lock to be mastered by another node */
	blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen);
	if (blocked) {
		if (mle->type == DLM_MLE_MASTER) {
			mlog(ML_ERROR, "master entry for nonexistent lock!\n");
			BUG();
		} else if (mle->type == DLM_MLE_MIGRATION) {
			/* migration is in progress! */
			/* the good news is that we now know the
			 * "current" master (mle->master). */

			spin_unlock(&dlm->master_lock);
			assert_spin_locked(&dlm->spinlock);

			/* set the lockres owner and hash it */
			spin_lock(&res->spinlock);
			dlm_set_lockres_owner(dlm, res, mle->master);
			__dlm_insert_lockres(dlm, res);
			spin_unlock(&res->spinlock);
			spin_unlock(&dlm->spinlock);

			/* master is known, detach */
			dlm_mle_detach_hb_events(dlm, mle);
			dlm_put_mle(mle);
			mle = NULL;
			goto wake_waiters;
		}
	} else {
		/* go ahead and try to master lock on this node */
		mle = alloc_mle;
		/* make sure this does not get freed below */
		alloc_mle = NULL;
		dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
		set_bit(dlm->node_num, mle->maybe_map);
		list_add(&mle->list, &dlm->master_list);

		/* still holding the dlm spinlock, check the recovery map
		 * to see if there are any nodes that still need to be
		 * considered.  these will not appear in the mle nodemap
		 * but they might own this lockres.  wait on them. */
		bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
		if (bit < O2NM_MAX_NODES) {
			mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
			     "recover before lock mastery can begin\n",
			     dlm->name, namelen, (char *)lockid, bit);
			wait_on_recovery = 1;
		}
	}

	/* at this point there is either a DLM_MLE_BLOCK or a
	 * DLM_MLE_MASTER on the master list, so it's safe to add the
	 * lockres to the hashtable.  anyone who finds the lock will
	 * still have to wait on the IN_PROGRESS. */

	/* finally add the lockres to its hash bucket */
	__dlm_insert_lockres(dlm, res);
	/* get an extra ref on the mle in case this is a BLOCK
	 * if so, the creator of the BLOCK may try to put the last
	 * ref at this time in the assert master handler, so we
	 * need an extra one to keep from a bad ptr deref. */
	dlm_get_mle(mle);
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);

	while (wait_on_recovery) {
		/* any cluster changes that occurred after dropping the
		 * dlm spinlock would be detectable be a change on the mle,
		 * so we only need to clear out the recovery map once. */
		if (dlm_is_recovery_lock(lockid, namelen)) {
			mlog(ML_NOTICE, "%s: recovery map is not empty, but "
			     "must master $RECOVERY lock now\n", dlm->name);
			if (!dlm_pre_master_reco_lockres(dlm, res))
				wait_on_recovery = 0;
			else {
				mlog(0, "%s: waiting 500ms for heartbeat state "
				     "change\n", dlm->name);
				msleep(500);
			}
			continue;
		}

		dlm_kick_recovery_thread(dlm);
		msleep(100);
		dlm_wait_for_recovery(dlm);

		spin_lock(&dlm->spinlock);
		bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
		if (bit < O2NM_MAX_NODES) {
			mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to "
			     "recover before lock mastery can begin\n",
			     dlm->name, namelen, (char *)lockid, bit);
			wait_on_recovery = 1;
		} else
			wait_on_recovery = 0;
		spin_unlock(&dlm->spinlock);
	}

	/* must wait for lock to be mastered elsewhere */
	if (blocked)
		goto wait;

redo_request:
	ret = -EINVAL;
	dlm_node_iter_init(mle->vote_map, &iter);
	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
		ret = dlm_do_master_request(mle, nodenum);
		if (ret < 0)
			mlog_errno(ret);
		if (mle->master != O2NM_MAX_NODES) {
			/* found a master ! */
			if (mle->master <= nodenum)
				break;
			/* if our master request has not reached the master
			 * yet, keep going until it does.  this is how the
			 * master will know that asserts are needed back to
			 * the lower nodes. */
			mlog(0, "%s:%.*s: requests only up to %u but master "
			     "is %u, keep going\n", dlm->name, namelen,
			     lockid, nodenum, mle->master);
		}
	}

wait:
	/* keep going until the response map includes all nodes */
	ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
	if (ret < 0) {
		mlog(0, "%s:%.*s: node map changed, redo the "
		     "master request now, blocked=%d\n",
		     dlm->name, res->lockname.len,
		     res->lockname.name, blocked);
		if (++tries > 20) {
			mlog(ML_ERROR, "%s:%.*s: spinning on "
			     "dlm_wait_for_lock_mastery, blocked=%d\n",
			     dlm->name, res->lockname.len,
			     res->lockname.name, blocked);
			dlm_print_one_lock_resource(res);
			/* dlm_print_one_mle(mle); */
			tries = 0;
		}
		goto redo_request;
	}

	mlog(0, "lockres mastered by %u\n", res->owner);
	/* make sure we never continue without this */
	BUG_ON(res->owner == O2NM_MAX_NODES);

	/* master is known, detach if not already detached */
	dlm_mle_detach_hb_events(dlm, mle);
	dlm_put_mle(mle);
	/* put the extra ref */
	dlm_put_mle(mle);

wake_waiters:
	spin_lock(&res->spinlock);
	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
	spin_unlock(&res->spinlock);
	wake_up(&res->wq);

leave:
	/* need to free the unused mle */
	if (alloc_mle)
		kmem_cache_free(dlm_mle_cache, alloc_mle);

	return res;
}
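
/* Illustrative only (not built): a typical lookup from the lock paths.
 * The lockid string below is a hypothetical placeholder; LKM_LOCAL
 * short-circuits mastery when the caller knows the lock cannot be
 * mastered elsewhere. */
#if 0
static void example_get_lockres(struct dlm_ctxt *dlm)
{
	struct dlm_lock_resource *res;

	res = dlm_get_lock_resource(dlm, "example_lockid", 0);
	if (!res)
		return;		/* allocation failed */
	/* res->owner is now valid and IN_PROGRESS has been cleared */
	dlm_lockres_put(res);
}
#endif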
#define DLM_MASTERY_TIMEOUT_MS   5000

static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
				     struct dlm_lock_resource *res,
				     struct dlm_master_list_entry *mle,
				     int *blocked)
{
	u8 m;
	int ret, bit;
	int map_changed, voting_done;
	int assert, sleep;

recheck:
	ret = 0;
	assert = 0;

	/* check if another node has already become the owner */
	spin_lock(&res->spinlock);
	if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
		mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name,
		     res->lockname.len, res->lockname.name, res->owner);
		spin_unlock(&res->spinlock);
		/* this will cause the master to re-assert across
		 * the whole cluster, freeing up mles */
		ret = dlm_do_master_request(mle, res->owner);
		if (ret < 0) {
			/* give recovery a chance to run */
			mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
			msleep(500);
			goto recheck;
		}
		ret = 0;
		goto leave;
	}
	spin_unlock(&res->spinlock);

	spin_lock(&mle->spinlock);
	m = mle->master;
	map_changed = (memcmp(mle->vote_map, mle->node_map,
			      sizeof(mle->vote_map)) != 0);
	voting_done = (memcmp(mle->vote_map, mle->response_map,
			      sizeof(mle->vote_map)) == 0);

	/* restart if we hit any errors */
	if (map_changed) {
		int b;
		mlog(0, "%s: %.*s: node map changed, restarting\n",
		     dlm->name, res->lockname.len, res->lockname.name);
		ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
		b = (mle->type == DLM_MLE_BLOCK);
		if ((*blocked && !b) || (!*blocked && b)) {
			mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
			     dlm->name, res->lockname.len, res->lockname.name,
			     *blocked, b);
			*blocked = b;
		}
		spin_unlock(&mle->spinlock);
		if (ret < 0) {
			mlog_errno(ret);
			goto leave;
		}
		mlog(0, "%s:%.*s: restart lock mastery succeeded, "
		     "rechecking now\n", dlm->name, res->lockname.len,
		     res->lockname.name);
		goto recheck;
	}

	if (m != O2NM_MAX_NODES) {
		/* another node has done an assert!
		 * all done! */
		sleep = 0;
	} else {
		sleep = 1;
		/* have all nodes responded? */
		if (voting_done && !*blocked) {
			bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
			if (dlm->node_num <= bit) {
				/* my node number is lowest.
				 * now tell other nodes that I am
				 * mastering this. */
				mle->master = dlm->node_num;
				assert = 1;
				sleep = 0;
			}
			/* if voting is done, but we have not received
			 * an assert master yet, we must sleep */
		}
	}

	spin_unlock(&mle->spinlock);

	/* sleep if we haven't finished voting yet */
	if (sleep) {
		unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);

		/*
		if (atomic_read(&mle->mle_refs.refcount) < 2)
			mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle,
			atomic_read(&mle->mle_refs.refcount),
			res->lockname.len, res->lockname.name);
		*/
		atomic_set(&mle->woken, 0);
		(void)wait_event_timeout(mle->wq,
					 (atomic_read(&mle->woken) == 1),
					 timeo);
		if (res->owner == O2NM_MAX_NODES) {
			mlog(0, "waiting again\n");
			goto recheck;
		}
		mlog(0, "done waiting, master is %u\n", res->owner);
		ret = 0;
		goto leave;
	}

	ret = 0;   /* done */
	if (assert) {
		m = dlm->node_num;
		mlog(0, "about to master %.*s here, this=%u\n",
		     res->lockname.len, res->lockname.name, m);
		ret = dlm_do_assert_master(dlm, res->lockname.name,
					   res->lockname.len, mle->vote_map, 0);
		if (ret) {
			/* This is a failure in the network path,
			 * not in the response to the assert_master
			 * (any nonzero response is a BUG on this node).
			 * Most likely a socket just got disconnected
			 * due to node death. */
			mlog_errno(ret);
		}
		/* no longer need to restart lock mastery.
		 * all living nodes have been contacted. */
		ret = 0;
	}

	/* set the lockres owner */
	spin_lock(&res->spinlock);
	dlm_change_lockres_owner(dlm, res, m);
	spin_unlock(&res->spinlock);

leave:
	return ret;
}
struct dlm_bitmap_diff_iter
{
	int curnode;
	unsigned long *orig_bm;
	unsigned long *cur_bm;
	unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
};

enum dlm_node_state_change
{
	NODE_DOWN = -1,
	NODE_NO_CHANGE = 0,
	NODE_UP
};

static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter,
				      unsigned long *orig_bm,
				      unsigned long *cur_bm)
{
	unsigned long p1, p2;
	int i;

	iter->curnode = -1;
	iter->orig_bm = orig_bm;
	iter->cur_bm = cur_bm;

	for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) {
		p1 = *(iter->orig_bm + i);
		p2 = *(iter->cur_bm + i);
		iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1);
	}
}
static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter,
				     enum dlm_node_state_change *state)
{
	int bit;

	if (iter->curnode >= O2NM_MAX_NODES)
		return -ENOENT;

	bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES,
			    iter->curnode+1);
	if (bit >= O2NM_MAX_NODES) {
		iter->curnode = O2NM_MAX_NODES;
		return -ENOENT;
	}

	/* if it was there in the original then this node died */
	if (test_bit(bit, iter->orig_bm))
		*state = NODE_DOWN;
	else
		*state = NODE_UP;

	iter->curnode = bit;
	return bit;
}
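
/* Illustrative only (not built): a worked example of the diff bitmap.
 * With orig = {1,3} and cur = {1,4}, diff = (p1 & ~p2) | (p2 & ~p1)
 * = {3,4}; iterating yields node 3 as NODE_DOWN (set only in orig)
 * and node 4 as NODE_UP (set only in cur). */
#if 0
static void example_bitmap_diff(void)
{
	unsigned long orig[BITS_TO_LONGS(O2NM_MAX_NODES)] = {0};
	unsigned long cur[BITS_TO_LONGS(O2NM_MAX_NODES)] = {0};
	struct dlm_bitmap_diff_iter bdi;
	enum dlm_node_state_change sc;
	int node;

	set_bit(1, orig);
	set_bit(3, orig);
	set_bit(1, cur);
	set_bit(4, cur);

	dlm_bitmap_diff_iter_init(&bdi, orig, cur);
	while ((node = dlm_bitmap_diff_iter_next(&bdi, &sc)) >= 0)
		mlog(0, "node %d went %s\n", node,
		     sc == NODE_DOWN ? "down" : "up");
}
#endif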
static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
				    struct dlm_lock_resource *res,
				    struct dlm_master_list_entry *mle,
				    int blocked)
{
	struct dlm_bitmap_diff_iter bdi;
	enum dlm_node_state_change sc;
	int node;
	int ret = 0;

	mlog(0, "something happened such that the "
	     "master process may need to be restarted!\n");

	assert_spin_locked(&mle->spinlock);

	dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map);
	node = dlm_bitmap_diff_iter_next(&bdi, &sc);
	while (node >= 0) {
		if (sc == NODE_UP) {
			/* a node came up.  clear any old vote from
			 * the response map and set it in the vote map
			 * then restart the mastery. */
			mlog(ML_NOTICE, "node %d up while restarting\n", node);

			/* redo the master request, but only for the new node */
			mlog(0, "sending request to new node\n");
			clear_bit(node, mle->response_map);
			set_bit(node, mle->vote_map);
		} else {
			mlog(ML_ERROR, "node down! %d\n", node);

			/* if the node wasn't involved in mastery skip it,
			 * but clear it out from the maps so that it will
			 * not affect mastery of this lockres */
			clear_bit(node, mle->response_map);
			clear_bit(node, mle->vote_map);
			if (!test_bit(node, mle->maybe_map))
				goto next;

			/* if we're already blocked on lock mastery, and the
			 * dead node wasn't the expected master, or there is
			 * another node in the maybe_map, keep waiting */
			if (blocked) {
				int lowest = find_next_bit(mle->maybe_map,
							   O2NM_MAX_NODES, 0);

				/* act like it was never there */
				clear_bit(node, mle->maybe_map);

				if (node != lowest)
					goto next;

				mlog(ML_ERROR, "expected master %u died while "
				     "this node was blocked waiting on it!\n",
				     node);
				lowest = find_next_bit(mle->maybe_map,
						       O2NM_MAX_NODES,
						       lowest+1);
				if (lowest < O2NM_MAX_NODES) {
					mlog(0, "still blocked. waiting "
					     "on %u now\n", lowest);
					goto next;
				}

				/* mle is an MLE_BLOCK, but there is now
				 * nothing left to block on.  we need to return
				 * all the way back out and try again with
				 * an MLE_MASTER. dlm_do_local_recovery_cleanup
				 * has already run, so the mle refcount is ok */
				mlog(0, "no longer blocking. we can "
				     "try to master this here\n");
				mle->type = DLM_MLE_MASTER;
				memset(mle->maybe_map, 0,
				       sizeof(mle->maybe_map));
				memset(mle->response_map, 0,
				       sizeof(mle->maybe_map));
				memcpy(mle->vote_map, mle->node_map,
				       sizeof(mle->node_map));
				mle->u.res = res;
				set_bit(dlm->node_num, mle->maybe_map);

				ret = -EAGAIN;
				goto next;
			}

			clear_bit(node, mle->maybe_map);
			if (node > dlm->node_num)
				goto next;

			mlog(0, "dead node in map!\n");
			/* yuck. go back and re-contact all nodes
			 * in the vote_map, removing this node. */
			memset(mle->response_map, 0,
			       sizeof(mle->response_map));
		}
		ret = -EAGAIN;
next:
		node = dlm_bitmap_diff_iter_next(&bdi, &sc);
	}
	return ret;
}
/*
 * DLM_MASTER_REQUEST_MSG
 *
 * returns: 0 on success,
 *          -errno on a network error
 *
 * on error, the caller should assume the target node is "dead"
 *
 */

static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to)
{
	struct dlm_ctxt *dlm = mle->dlm;
	struct dlm_master_request request;
	int ret, response=0, resend;

	memset(&request, 0, sizeof(request));
	request.node_idx = dlm->node_num;

	BUG_ON(mle->type == DLM_MLE_MIGRATION);

	if (mle->type != DLM_MLE_MASTER) {
		request.namelen = mle->u.name.len;
		memcpy(request.name, mle->u.name.name, request.namelen);
	} else {
		request.namelen = mle->u.res->lockname.len;
		memcpy(request.name, mle->u.res->lockname.name,
		       request.namelen);
	}

again:
	ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
				 sizeof(request), to, &response);
	if (ret < 0)  {
		if (ret == -ESRCH) {
			/* should never happen */
			mlog(ML_ERROR, "TCP stack not ready!\n");
			BUG();
		} else if (ret == -EINVAL) {
			mlog(ML_ERROR, "bad args passed to o2net!\n");
			BUG();
		} else if (ret == -ENOMEM) {
			mlog(ML_ERROR, "out of memory while trying to send "
			     "network message!  retrying\n");
			/* this is totally crude */
			msleep(50);
			goto again;
		} else if (!dlm_is_host_down(ret)) {
			/* not a network error. bad. */
			mlog_errno(ret);
			mlog(ML_ERROR, "unhandled error!");
			BUG();
		}
		/* all other errors should be network errors,
		 * and likely indicate node death */
		mlog(ML_ERROR, "link to %d went down!\n", to);
		goto out;
	}

	ret = 0;
	resend = 0;
	spin_lock(&mle->spinlock);
	switch (response) {
		case DLM_MASTER_RESP_YES:
			set_bit(to, mle->response_map);
			mlog(0, "node %u is the master, response=YES\n", to);
			mle->master = to;
			break;
		case DLM_MASTER_RESP_NO:
			mlog(0, "node %u not master, response=NO\n", to);
			set_bit(to, mle->response_map);
			break;
		case DLM_MASTER_RESP_MAYBE:
			mlog(0, "node %u not master, response=MAYBE\n", to);
			set_bit(to, mle->response_map);
			set_bit(to, mle->maybe_map);
			break;
		case DLM_MASTER_RESP_ERROR:
			mlog(0, "node %u hit an error, resending\n", to);
			resend = 1;
			response = 0;
			break;
		default:
			mlog(ML_ERROR, "bad response! %u\n", response);
			BUG();
	}
	spin_unlock(&mle->spinlock);
	if (resend) {
		/* this is also totally crude */
		msleep(50);
		goto again;
	}

out:
	return ret;
}
/*
 * locks that can be taken here:
 * dlm->spinlock
 * res->spinlock
 * mle->spinlock
 * dlm->master_list
 *
 * if possible, TRIM THIS DOWN!!!
 */
int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
{
	u8 response = DLM_MASTER_RESP_MAYBE;
	struct dlm_ctxt *dlm = data;
	struct dlm_lock_resource *res = NULL;
	struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
	struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
	char *name;
	unsigned int namelen, hash;
	int found, ret;
	int set_maybe;
	int dispatch_assert = 0;

	if (!dlm_grab(dlm))
		return DLM_MASTER_RESP_NO;

	if (!dlm_domain_fully_joined(dlm)) {
		response = DLM_MASTER_RESP_NO;
		goto send_response;
	}

	name = request->name;
	namelen = request->namelen;
	hash = dlm_lockid_hash(name, namelen);

	if (namelen > DLM_LOCKID_NAME_MAX) {
		response = DLM_IVBUFLEN;
		goto send_response;
	}

way_up_top:
	spin_lock(&dlm->spinlock);
	res = __dlm_lookup_lockres(dlm, name, namelen, hash);
	if (res) {
		spin_unlock(&dlm->spinlock);

		/* take care of the easy cases up front */
		spin_lock(&res->spinlock);
		if (res->state & DLM_LOCK_RES_RECOVERING) {
			spin_unlock(&res->spinlock);
			mlog(0, "returning DLM_MASTER_RESP_ERROR since res is "
			     "being recovered\n");
			response = DLM_MASTER_RESP_ERROR;
			if (mle)
				kmem_cache_free(dlm_mle_cache, mle);
			goto send_response;
		}

		if (res->owner == dlm->node_num) {
			spin_unlock(&res->spinlock);
			// mlog(0, "this node is the master\n");
			response = DLM_MASTER_RESP_YES;
			if (mle)
				kmem_cache_free(dlm_mle_cache, mle);

			/* this node is the owner.
			 * there is some extra work that needs to
			 * happen now.  the requesting node has
			 * caused all nodes up to this one to
			 * create mles.  this node now needs to
			 * go back and clean those up. */
			dispatch_assert = 1;
			goto send_response;
		} else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
			spin_unlock(&res->spinlock);
			// mlog(0, "node %u is the master\n", res->owner);
			response = DLM_MASTER_RESP_NO;
			if (mle)
				kmem_cache_free(dlm_mle_cache, mle);
			goto send_response;
		}

		/* ok, there is no owner.  either this node is
		 * being blocked, or it is actively trying to
		 * master this lock. */
		if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
			mlog(ML_ERROR, "lock with no owner should be "
			     "in-progress!\n");
			BUG();
		}

		// mlog(0, "lockres is in progress...\n");
		spin_lock(&dlm->master_lock);
		found = dlm_find_mle(dlm, &tmpmle, name, namelen);
		if (!found) {
			mlog(ML_ERROR, "no mle found for this lock!\n");
			BUG();
		}
		set_maybe = 1;
		spin_lock(&tmpmle->spinlock);
		if (tmpmle->type == DLM_MLE_BLOCK) {
			// mlog(0, "this node is waiting for "
			// "lockres to be mastered\n");
			response = DLM_MASTER_RESP_NO;
		} else if (tmpmle->type == DLM_MLE_MIGRATION) {
			mlog(0, "node %u is master, but trying to migrate to "
			     "node %u.\n", tmpmle->master, tmpmle->new_master);
			if (tmpmle->master == dlm->node_num) {
				response = DLM_MASTER_RESP_YES;
				mlog(ML_ERROR, "no owner on lockres, but this "
				     "node is trying to migrate it to %u?!\n",
				     tmpmle->new_master);
				BUG();
			} else {
				/* the real master can respond on its own */
				response = DLM_MASTER_RESP_NO;
			}
		} else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) {
			set_maybe = 0;
			if (tmpmle->master == dlm->node_num) {
				response = DLM_MASTER_RESP_YES;
				/* this node will be the owner.
				 * go back and clean the mles on any
				 * other nodes */
				dispatch_assert = 1;
			} else
				response = DLM_MASTER_RESP_NO;
		} else {
			// mlog(0, "this node is attempting to "
			// "master lockres\n");
			response = DLM_MASTER_RESP_MAYBE;
		}
		if (set_maybe)
			set_bit(request->node_idx, tmpmle->maybe_map);
		spin_unlock(&tmpmle->spinlock);

		spin_unlock(&dlm->master_lock);
		spin_unlock(&res->spinlock);

		/* keep the mle attached to heartbeat events */
		dlm_put_mle(tmpmle);
		if (mle)
			kmem_cache_free(dlm_mle_cache, mle);
		goto send_response;
	}

	/*
	 * lockres doesn't exist on this node
	 * if there is an MLE_BLOCK, return NO
	 * if there is an MLE_MASTER, return MAYBE
	 * otherwise, add an MLE_BLOCK, return NO
	 */
	spin_lock(&dlm->master_lock);
	found = dlm_find_mle(dlm, &tmpmle, name, namelen);
	if (!found) {
		/* this lockid has never been seen on this node yet */
		// mlog(0, "no mle found\n");
		if (!mle) {
			spin_unlock(&dlm->master_lock);
			spin_unlock(&dlm->spinlock);

			mle = (struct dlm_master_list_entry *)
				kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
			if (!mle) {
				response = DLM_MASTER_RESP_ERROR;
				mlog_errno(-ENOMEM);
				goto send_response;
			}
			spin_lock(&dlm->spinlock);
			dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL,
				     name, namelen);
			spin_unlock(&dlm->spinlock);
			goto way_up_top;
		}

		// mlog(0, "this is second time thru, already allocated, "
		// "add the block.\n");
		set_bit(request->node_idx, mle->maybe_map);
		list_add(&mle->list, &dlm->master_list);
		response = DLM_MASTER_RESP_NO;
	} else {
		// mlog(0, "mle was found\n");
		set_maybe = 1;
		spin_lock(&tmpmle->spinlock);
		if (tmpmle->master == dlm->node_num) {
			mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n");
			BUG();
		}
		if (tmpmle->type == DLM_MLE_BLOCK)
			response = DLM_MASTER_RESP_NO;
		else if (tmpmle->type == DLM_MLE_MIGRATION) {
			mlog(0, "migration mle was found (%u->%u)\n",
			     tmpmle->master, tmpmle->new_master);
			/* real master can respond on its own */
			response = DLM_MASTER_RESP_NO;
		} else
			response = DLM_MASTER_RESP_MAYBE;
		if (set_maybe)
			set_bit(request->node_idx, tmpmle->maybe_map);
		spin_unlock(&tmpmle->spinlock);
	}
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);

	if (found) {
		/* keep the mle attached to heartbeat events */
		dlm_put_mle(tmpmle);
	}
send_response:

	if (dispatch_assert) {
		if (response != DLM_MASTER_RESP_YES)
			mlog(ML_ERROR, "invalid response %d\n", response);
		if (!res) {
			mlog(ML_ERROR, "bad lockres while trying to assert!\n");
			BUG();
		}
		mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
		     dlm->node_num, res->lockname.len, res->lockname.name);
		ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
						 DLM_ASSERT_MASTER_MLE_CLEANUP);
		if (ret < 0) {
			mlog(ML_ERROR, "failed to dispatch assert master work\n");
			response = DLM_MASTER_RESP_ERROR;
		}
	}

	dlm_put(dlm);
	return response;
}
/*
 * DLM_ASSERT_MASTER_MSG
 */


/*
 * NOTE: this can be used for debugging
 * can periodically run all locks owned by this node
 * and re-assert across the cluster...
 */
static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname,
				unsigned int namelen, void *nodemap,
				u32 flags)
{
	struct dlm_assert_master assert;
	int to, tmpret;
	struct dlm_node_iter iter;
	int ret = 0;
	int reassert;

	BUG_ON(namelen > O2NM_MAX_NAME_LEN);
again:
	reassert = 0;

	/* note that if this nodemap is empty, it returns 0 */
	dlm_node_iter_init(nodemap, &iter);
	while ((to = dlm_node_iter_next(&iter)) >= 0) {
		int r = 0;
		mlog(0, "sending assert master to %d (%.*s)\n", to,
		     namelen, lockname);
		memset(&assert, 0, sizeof(assert));
		assert.node_idx = dlm->node_num;
		assert.namelen = namelen;
		memcpy(assert.name, lockname, namelen);
		assert.flags = cpu_to_be32(flags);

		tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
					    &assert, sizeof(assert), to, &r);
		if (tmpret < 0) {
			mlog(ML_ERROR, "assert_master returned %d!\n", tmpret);
			if (!dlm_is_host_down(tmpret)) {
				mlog(ML_ERROR, "unhandled error!\n");
				BUG();
			}
			/* a node died.  finish out the rest of the nodes. */
			mlog(ML_ERROR, "link to %d went down!\n", to);
			/* any nonzero status return will do */
			ret = tmpret;
		} else if (r < 0) {
			/* ok, something horribly messed.  kill thyself. */
			mlog(ML_ERROR,"during assert master of %.*s to %u, "
			     "got %d.\n", namelen, lockname, to, r);
			dlm_dump_lock_resources(dlm);
			BUG();
		} else if (r == EAGAIN) {
			mlog(0, "%.*s: node %u create mles on other "
			     "nodes and requests a re-assert\n",
			     namelen, lockname, to);
			reassert = 1;
		}
	}

	if (reassert)
		goto again;

	return ret;
}
/*
 * locks that can be taken here:
 * dlm->spinlock
 * res->spinlock
 * mle->spinlock
 * dlm->master_list
 *
 * if possible, TRIM THIS DOWN!!!
 */
int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
{
	struct dlm_ctxt *dlm = data;
	struct dlm_master_list_entry *mle = NULL;
	struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
	struct dlm_lock_resource *res = NULL;
	char *name;
	unsigned int namelen, hash;
	u32 flags;
	int master_request = 0;
	int ret = 0;

	if (!dlm_grab(dlm))
		return 0;

	name = assert->name;
	namelen = assert->namelen;
	hash = dlm_lockid_hash(name, namelen);
	flags = be32_to_cpu(assert->flags);

	if (namelen > DLM_LOCKID_NAME_MAX) {
		mlog(ML_ERROR, "Invalid name length!");
		goto done;
	}

	spin_lock(&dlm->spinlock);

	if (flags)
		mlog(0, "assert_master with flags: %u\n", flags);

	/* find the MLE */
	spin_lock(&dlm->master_lock);
	if (!dlm_find_mle(dlm, &mle, name, namelen)) {
		/* not an error, could be master just re-asserting */
		mlog(0, "just got an assert_master from %u, but no "
		     "MLE for it! (%.*s)\n", assert->node_idx,
		     namelen, name);
	} else {
		int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0);
		if (bit >= O2NM_MAX_NODES) {
			/* not necessarily an error, though less likely.
			 * could be master just re-asserting. */
			mlog(ML_ERROR, "no bits set in the maybe_map, but %u "
			     "is asserting! (%.*s)\n", assert->node_idx,
			     namelen, name);
		} else if (bit != assert->node_idx) {
			if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
				mlog(0, "master %u was found, %u should "
				     "back off\n", assert->node_idx, bit);
			} else {
				/* with the fix for bug 569, a higher node
				 * number winning the mastery will respond
				 * YES to mastery requests, but this node
				 * had no way of knowing.  let it pass. */
				mlog(ML_ERROR, "%u is the lowest node, "
				     "%u is asserting. (%.*s)  %u must "
				     "have begun after %u won.\n", bit,
				     assert->node_idx, namelen, name, bit,
				     assert->node_idx);
			}
		}
	}
	spin_unlock(&dlm->master_lock);

	/* ok everything checks out with the MLE
	 * now check to see if there is a lockres */
	res = __dlm_lookup_lockres(dlm, name, namelen, hash);
	if (res) {
		spin_lock(&res->spinlock);
		if (res->state & DLM_LOCK_RES_RECOVERING)  {
			mlog(ML_ERROR, "%u asserting but %.*s is "
			     "RECOVERING!\n", assert->node_idx, namelen, name);
			goto kill;
		}
		if (!mle) {
			if (res->owner != assert->node_idx) {
				mlog(ML_ERROR, "assert_master from "
				     "%u, but current owner is "
				     "%u! (%.*s)\n",
				     assert->node_idx, res->owner,
				     namelen, name);
				goto kill;
			}
		} else if (mle->type != DLM_MLE_MIGRATION) {
			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
				/* owner is just re-asserting */
				if (res->owner == assert->node_idx) {
					mlog(0, "owner %u re-asserting on "
					     "lock %.*s\n", assert->node_idx,
					     namelen, name);
					goto ok;
				}
				mlog(ML_ERROR, "got assert_master from "
				     "node %u, but %u is the owner! "
				     "(%.*s)\n", assert->node_idx,
				     res->owner, namelen, name);
				goto kill;
			}
			if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
				mlog(ML_ERROR, "got assert from %u, but lock "
				     "with no owner should be "
				     "in-progress! (%.*s)\n",
				     assert->node_idx,
				     namelen, name);
				goto kill;
			}
		} else /* mle->type == DLM_MLE_MIGRATION */ {
			/* should only be getting an assert from new master */
			if (assert->node_idx != mle->new_master) {
				mlog(ML_ERROR, "got assert from %u, but "
				     "new master is %u, and old master "
				     "was %u (%.*s)\n",
				     assert->node_idx, mle->new_master,
				     mle->master, namelen, name);
				goto kill;
			}

		}
ok:
		spin_unlock(&res->spinlock);
	}
	spin_unlock(&dlm->spinlock);

	// mlog(0, "woo!  got an assert_master from node %u!\n",
	// 	     assert->node_idx);
	if (mle) {
		int extra_ref = 0;
		int nn = -1;

		spin_lock(&mle->spinlock);
		if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
			extra_ref = 1;
		else {
			/* MASTER mle: if any bits set in the response map
			 * then the calling node needs to re-assert to clear
			 * up nodes that this node contacted */
			while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
						    nn+1)) < O2NM_MAX_NODES) {
				if (nn != dlm->node_num && nn != assert->node_idx)
					master_request = 1;
			}
		}
		mle->master = assert->node_idx;
		atomic_set(&mle->woken, 1);
		wake_up(&mle->wq);
		spin_unlock(&mle->spinlock);

		if (mle->type == DLM_MLE_MIGRATION && res) {
			mlog(0, "finishing off migration of lockres %.*s, "
			     "from %u to %u\n",
			     res->lockname.len, res->lockname.name,
			     dlm->node_num, mle->new_master);
			spin_lock(&res->spinlock);
			res->state &= ~DLM_LOCK_RES_MIGRATING;
			dlm_change_lockres_owner(dlm, res, mle->new_master);
			BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
			spin_unlock(&res->spinlock);
		}
		/* master is known, detach if not already detached */
		dlm_mle_detach_hb_events(dlm, mle);
		dlm_put_mle(mle);

		if (extra_ref) {
			/* the assert master message now balances the extra
			 * ref given by the master / migration request message.
			 * if this is the last put, it will be removed
			 * from the list. */
			dlm_put_mle(mle);
		}
	}

done:
	ret = 0;
	if (res)
		dlm_lockres_put(res);
	dlm_put(dlm);
	if (master_request) {
		mlog(0, "need to tell master to reassert\n");
		ret = EAGAIN;  // positive. negative would shoot down the node.
	}
	return ret;

kill:
	/* kill the caller! */
	spin_unlock(&res->spinlock);
	spin_unlock(&dlm->spinlock);
	dlm_lockres_put(res);
	mlog(ML_ERROR, "Bad message received from another node.  Dumping state "
	     "and killing the other node now!  This node is OK and can continue.\n");
	dlm_dump_lock_resources(dlm);
	dlm_put(dlm);
	return -EINVAL;
}
int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
			       struct dlm_lock_resource *res,
			       int ignore_higher, u8 request_from, u32 flags)
{
	struct dlm_work_item *item;
	item = kcalloc(1, sizeof(*item), GFP_KERNEL);
	if (!item)
		return -ENOMEM;


	/* queue up work for dlm_assert_master_worker */
	dlm_grab(dlm);  /* get an extra ref for the work item */
	dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL);
	item->u.am.lockres = res; /* already have a ref */
	/* can optionally ignore node numbers higher than this node */
	item->u.am.ignore_higher = ignore_higher;
	item->u.am.request_from = request_from;
	item->u.am.flags = flags;

	if (ignore_higher)
		mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len,
		     res->lockname.name);

	spin_lock(&dlm->work_lock);
	list_add_tail(&item->list, &dlm->work_list);
	spin_unlock(&dlm->work_lock);

	schedule_work(&dlm->dispatched_work);
	return 0;
}
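
/* Illustrative only (not built): the dispatch/worker split.  A message
 * handler cannot block sending assert_master messages itself, so it
 * queues a dlm_work_item that the dlm workqueue later hands to
 * dlm_assert_master_worker().  The wrapper below is hypothetical. */
#if 0
static void example_dispatch(struct dlm_ctxt *dlm,
			     struct dlm_lock_resource *res, u8 requester)
{
	/* clean up stale mles on other nodes, as done from
	 * dlm_master_request_handler() */
	if (dlm_dispatch_assert_master(dlm, res, 0, requester,
				       DLM_ASSERT_MASTER_MLE_CLEANUP) < 0)
		mlog(ML_ERROR, "failed to dispatch assert master work\n");
}
#endif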
static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
{
	struct dlm_ctxt *dlm = data;
	int ret = 0;
	struct dlm_lock_resource *res;
	unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
	int ignore_higher;
	int bit;
	u8 request_from;
	u32 flags;

	res = item->u.am.lockres;
	ignore_higher = item->u.am.ignore_higher;
	request_from = item->u.am.request_from;
	flags = item->u.am.flags;

	spin_lock(&dlm->spinlock);
	memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
	spin_unlock(&dlm->spinlock);

	clear_bit(dlm->node_num, nodemap);
	if (ignore_higher) {
		/* if is this just to clear up mles for nodes below
		 * this node, do not send the message to the original
		 * caller or any node number higher than this */
		clear_bit(request_from, nodemap);
		bit = dlm->node_num;
		while (1) {
			bit = find_next_bit(nodemap, O2NM_MAX_NODES,
					    bit+1);
			if (bit >= O2NM_MAX_NODES)
				break;
			clear_bit(bit, nodemap);
		}
	}

	/* this call now finishes out the nodemap
	 * even if one or more nodes die */
	mlog(0, "worker about to master %.*s here, this=%u\n",
	     res->lockname.len, res->lockname.name, dlm->node_num);
	ret = dlm_do_assert_master(dlm, res->lockname.name,
				   res->lockname.len,
				   nodemap, flags);
	if (ret < 0) {
		/* no need to restart, we are done */
		mlog_errno(ret);
	}

	dlm_lockres_put(res);

	mlog(0, "finished with dlm_assert_master_worker\n");
}
/* SPECIAL CASE for the $RECOVERY lock used by the recovery thread.
 * We cannot wait for node recovery to complete to begin mastering this
 * lockres because this lockres is used to kick off recovery! ;-)
 * So, do a pre-check on all living nodes to see if any of those nodes
 * think that $RECOVERY is currently mastered by a dead node.  If so,
 * we wait a short time to allow that node to get notified by its own
 * heartbeat stack, then check again.  All $RECOVERY lock resources
 * mastered by dead nodes are purged when the heartbeat callback is
 * fired, so we can know for sure that it is safe to continue once
 * the node returns a live node or no node.  */
static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
				       struct dlm_lock_resource *res)
{
	struct dlm_node_iter iter;
	int nodenum;
	int ret = 0;
	u8 master = DLM_LOCK_RES_OWNER_UNKNOWN;

	spin_lock(&dlm->spinlock);
	dlm_node_iter_init(dlm->domain_map, &iter);
	spin_unlock(&dlm->spinlock);

	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
		/* do not send to self */
		if (nodenum == dlm->node_num)
			continue;
		ret = dlm_do_master_requery(dlm, res, nodenum, &master);
		if (ret < 0) {
			mlog_errno(ret);
			if (!dlm_is_host_down(ret))
				BUG();
			/* host is down, so answer for that node would be
			 * DLM_LOCK_RES_OWNER_UNKNOWN.  continue. */
		}

		if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
			/* check to see if this master is in the recovery map */
			spin_lock(&dlm->spinlock);
			if (test_bit(master, dlm->recovery_map)) {
				mlog(ML_NOTICE, "%s: node %u has not seen "
				     "node %u go down yet, and thinks the "
				     "dead node is mastering the recovery "
				     "lock.  must wait.\n", dlm->name,
				     nodenum, master);
				ret = -EAGAIN;
			}
			spin_unlock(&dlm->spinlock);
			mlog(0, "%s: reco lock master is %u\n", dlm->name,
			     master);
			break;
		}
	}
	return ret;
}
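
/* Illustrative only (not built): dlm_get_lock_resource() loops on this
 * pre-check while recovering nodes remain; a sketch of the retry shape.
 * The helper name is hypothetical. */
#if 0
static void example_reco_precheck(struct dlm_ctxt *dlm,
				  struct dlm_lock_resource *res)
{
	while (dlm_pre_master_reco_lockres(dlm, res))
		msleep(500);	/* give heartbeat a chance to notify */
}
#endif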
/*
 * DLM_MIGRATE_LOCKRES
 */


int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
			u8 target)
{
	struct dlm_master_list_entry *mle = NULL;
	struct dlm_master_list_entry *oldmle = NULL;
	struct dlm_migratable_lockres *mres = NULL;
	int ret = -EINVAL;
	const char *name;
	unsigned int namelen;
	int mle_added = 0;
	struct list_head *queue, *iter;
	int i;
	struct dlm_lock *lock;
	int empty = 1;

	if (!dlm_grab(dlm))
		return -EINVAL;

	name = res->lockname.name;
	namelen = res->lockname.len;

	mlog(0, "migrating %.*s to %u\n", namelen, name, target);

	/*
	 * ensure this lockres is a proper candidate for migration
	 */
	spin_lock(&res->spinlock);
	if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
		mlog(0, "cannot migrate lockres with unknown owner!\n");
		spin_unlock(&res->spinlock);
		goto leave;
	}
	if (res->owner != dlm->node_num) {
		mlog(0, "cannot migrate lockres this node doesn't own!\n");
		spin_unlock(&res->spinlock);
		goto leave;
	}
	mlog(0, "checking queues...\n");
	queue = &res->granted;
	for (i=0; i<3; i++) {
		list_for_each(iter, queue) {
			lock = list_entry (iter, struct dlm_lock, list);
			empty = 0;
			if (lock->ml.node == dlm->node_num) {
				mlog(0, "found a lock owned by this node "
				     "still on the %s queue!  will not "
				     "migrate this lockres\n",
				     i==0 ? "granted" :
				     (i==1 ? "converting" : "blocked"));
				spin_unlock(&res->spinlock);
				ret = -ENOTEMPTY;
				goto leave;
			}
		}
		queue++;
	}
	mlog(0, "all locks on this lockres are nonlocal.  continuing\n");
	spin_unlock(&res->spinlock);

	/* no work to do */
	if (empty) {
		mlog(0, "no locks were found on this lockres! done!\n");
		ret = 0;
		goto leave;
	}

	/*
	 * preallocate up front
	 * if this fails, abort
	 */

	ret = -ENOMEM;
	mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL);
	if (!mres) {
		mlog_errno(ret);
		goto leave;
	}

	mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
								GFP_KERNEL);
	if (!mle) {
		mlog_errno(ret);
		goto leave;
	}
	ret = 0;

	/*
	 * find a node to migrate the lockres to
	 */

	mlog(0, "picking a migration node\n");
	spin_lock(&dlm->spinlock);
	/* pick a new node */
	if (!test_bit(target, dlm->domain_map) ||
	    target >= O2NM_MAX_NODES) {
		target = dlm_pick_migration_target(dlm, res);
	}
	mlog(0, "node %u chosen for migration\n", target);

	if (target >= O2NM_MAX_NODES ||
	    !test_bit(target, dlm->domain_map)) {
		/* target chosen is not alive */
		ret = -EINVAL;
	}

	if (ret) {
		spin_unlock(&dlm->spinlock);
		goto fail;
	}

	mlog(0, "continuing with target = %u\n", target);

	/*
	 * clear any existing master requests and
	 * add the migration mle to the list
	 */
	spin_lock(&dlm->master_lock);
	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
				    namelen, target, dlm->node_num);
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);

	if (ret == -EEXIST) {
		mlog(0, "another process is already migrating it\n");
		goto fail;
	}
	mle_added = 1;

	/*
	 * set the MIGRATING flag and flush asts
	 * if we fail after this we need to re-dirty the lockres
	 */
	if (dlm_mark_lockres_migrating(dlm, res, target) < 0) {
		mlog(ML_ERROR, "tried to migrate %.*s to %u, but "
		     "the target went down.\n", res->lockname.len,
		     res->lockname.name, target);
		spin_lock(&res->spinlock);
		res->state &= ~DLM_LOCK_RES_MIGRATING;
		spin_unlock(&res->spinlock);
		ret = -EINVAL;
	}

fail:
	if (oldmle) {
		/* master is known, detach if not already detached */
		dlm_mle_detach_hb_events(dlm, oldmle);
		dlm_put_mle(oldmle);
	}

	if (ret < 0) {
		if (mle_added) {
			dlm_mle_detach_hb_events(dlm, mle);
			dlm_put_mle(mle);
		} else if (mle) {
			kmem_cache_free(dlm_mle_cache, mle);
		}
		goto leave;
	}

	/*
	 * at this point, we have a migration target, an mle
	 * in the master list, and the MIGRATING flag set on
	 * the lockres
	 */


	/* get an extra reference on the mle.
	 * otherwise the assert_master from the new
	 * master will destroy this.
	 * also, make sure that all callers of dlm_get_mle
	 * take both dlm->spinlock and dlm->master_lock */
	spin_lock(&dlm->spinlock);
	spin_lock(&dlm->master_lock);
	dlm_get_mle(mle);
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);

	/* notify new node and send all lock state */
	/* call send_one_lockres with migration flag.
	 * this serves as notice to the target node that a
	 * migration is starting. */
	ret = dlm_send_one_lockres(dlm, res, mres, target,
				   DLM_MRES_MIGRATION);

	if (ret < 0) {
		mlog(0, "migration to node %u failed with %d\n",
		     target, ret);
		/* migration failed, detach and clean up mle */
		dlm_mle_detach_hb_events(dlm, mle);
		dlm_put_mle(mle);
		dlm_put_mle(mle);
		goto leave;
	}

	/* at this point, the target sends a message to all nodes,
	 * (using dlm_do_migrate_request).  this node is skipped since
	 * we had to put an mle in the list to begin the process.  this
	 * node now waits for target to do an assert master.  this node
	 * will be the last one notified, ensuring that the migration
	 * is complete everywhere.  if the target dies while this is
	 * going on, some nodes could potentially see the target as the
	 * master, so it is important that my recovery finds the migration
	 * mle and sets the master to UNKNOWN. */


	/* wait for new node to assert master */
	while (1) {
		ret = wait_event_interruptible_timeout(mle->wq,
					(atomic_read(&mle->woken) == 1),
					msecs_to_jiffies(5000));

		if (ret >= 0) {
			if (atomic_read(&mle->woken) == 1 ||
			    res->owner == target)
				break;

			mlog(0, "timed out during migration\n");
			/* avoid hang during shutdown when migrating lockres
			 * to a node which also goes down */
			if (dlm_is_node_dead(dlm, target)) {
				mlog(0, "%s:%.*s: expected migration target %u "
				     "is no longer up.  restarting.\n",
				     dlm->name, res->lockname.len,
				     res->lockname.name, target);
				ret = -ERESTARTSYS;
			}
		}
		if (ret == -ERESTARTSYS) {
			/* migration failed, detach and clean up mle */
			dlm_mle_detach_hb_events(dlm, mle);
			dlm_put_mle(mle);
			dlm_put_mle(mle);
			goto leave;
		}
		/* TODO: if node died: stop, clean up, return error */
	}

	/* all done, set the owner, clear the flag */
	spin_lock(&res->spinlock);
	dlm_set_lockres_owner(dlm, res, target);
	res->state &= ~DLM_LOCK_RES_MIGRATING;
	dlm_remove_nonlocal_locks(dlm, res);
	spin_unlock(&res->spinlock);
	wake_up(&res->wq);

	/* master is known, detach if not already detached */
	dlm_mle_detach_hb_events(dlm, mle);
	dlm_put_mle(mle);
	ret = 0;

	dlm_lockres_calc_usage(dlm, res);

leave:
	/* re-dirty the lockres if we failed */
	if (ret < 0)
		dlm_kick_thread(dlm, res);

	if (mres)
		free_page((unsigned long)mres);

	dlm_put(dlm);

	mlog(0, "returning %d\n", ret);
	return ret;
}
EXPORT_SYMBOL_GPL(dlm_migrate_lockres);
int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
	int ret;
	spin_lock(&dlm->ast_lock);
	spin_lock(&lock->spinlock);
	ret = (list_empty(&lock->bast_list) && !lock->bast_pending);
	spin_unlock(&lock->spinlock);
	spin_unlock(&dlm->ast_lock);
	return ret;
}
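
/* Illustrative only (not built): callers elsewhere in the dlm (e.g. the
 * unlock/convert paths) wait on the ast waitqueue until every queued
 * bast for a lock has been delivered.  A minimal sketch: */
#if 0
static void example_wait_basts(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
	wait_event(dlm->ast_wq, dlm_lock_basts_flushed(dlm, lock));
}
#endif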
static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
				     struct dlm_lock_resource *res,
				     u8 mig_target)
{
	int can_proceed;
	spin_lock(&res->spinlock);
	can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
	spin_unlock(&res->spinlock);

	/* target has died, so make the caller break out of the
	 * wait_event, but caller must recheck the domain_map */
	spin_lock(&dlm->spinlock);
	if (!test_bit(mig_target, dlm->domain_map))
		can_proceed = 1;
	spin_unlock(&dlm->spinlock);
	return can_proceed;
}
int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
{
	int ret;
	spin_lock(&res->spinlock);
	ret = !!(res->state & DLM_LOCK_RES_DIRTY);
	spin_unlock(&res->spinlock);
	return ret;
}
static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
				      struct dlm_lock_resource *res,
				      u8 target)
{
	int ret = 0;

	mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n",
	     res->lockname.len, res->lockname.name, dlm->node_num, target);

	/* need to set MIGRATING flag on lockres.  this is done by
	 * ensuring that all asts have been flushed for this lockres. */
	spin_lock(&res->spinlock);
	BUG_ON(res->migration_pending);
	res->migration_pending = 1;
	/* strategy is to reserve an extra ast then release
	 * it below, letting the release do all of the work */
	__dlm_lockres_reserve_ast(res);
	spin_unlock(&res->spinlock);

	/* now flush all the pending asts.. hang out for a bit */
	dlm_kick_thread(dlm, res);
	wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
	dlm_lockres_release_ast(dlm, res);

	mlog(0, "about to wait on migration_wq, dirty=%s\n",
	     res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
	/* if the extra ref we just put was the final one, this
	 * will pass thru immediately.  otherwise, we need to wait
	 * for the last ast to finish. */
again:
	ret = wait_event_interruptible_timeout(dlm->migration_wq,
		   dlm_migration_can_proceed(dlm, res, target),
		   msecs_to_jiffies(1000));
	if (ret < 0) {
		mlog(0, "woken again: migrating? %s, dead? %s\n",
		     res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
		     test_bit(target, dlm->domain_map) ? "no":"yes");
	} else {
		mlog(0, "all is well: migrating? %s, dead? %s\n",
		     res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
		     test_bit(target, dlm->domain_map) ? "no":"yes");
	}
	if (!dlm_migration_can_proceed(dlm, res, target)) {
		mlog(0, "trying again...\n");
		goto again;
	}

	/* did the target go down or die? */
	spin_lock(&dlm->spinlock);
	if (!test_bit(target, dlm->domain_map)) {
		mlog(ML_ERROR, "aha. migration target %u just went down\n",
		     target);
		ret = -EHOSTDOWN;
	}
	spin_unlock(&dlm->spinlock);

	/*
	 * at this point:
	 *
	 *   o the DLM_LOCK_RES_MIGRATING flag is set
	 *   o there are no pending asts on this lockres
	 *   o all processes trying to reserve an ast on this
	 *     lockres must wait for the MIGRATING flag to clear
	 */
	return ret;
}
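
/*
 * Added summary (for clarity, not original text): the extra reservation
 * taken in dlm_mark_lockres_migrating() guarantees that asts_reserved
 * cannot hit zero while the dlm thread is still flushing.  once the
 * flush finishes, the matching dlm_lockres_release_ast() may be the one
 * that drops asts_reserved to zero; at that moment release_ast converts
 * migration_pending into DLM_LOCK_RES_MIGRATING under res->spinlock and
 * wakes dlm->migration_wq, which is exactly the condition that
 * dlm_migration_can_proceed() polls for above.
 */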
/* last step in the migration process.
 * original master calls this to free all of the dlm_lock
 * structures that used to be for other nodes. */
static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
				      struct dlm_lock_resource *res)
{
	struct list_head *iter, *iter2;
	struct list_head *queue = &res->granted;
	int i;
	struct dlm_lock *lock;

	assert_spin_locked(&res->spinlock);

	BUG_ON(res->owner == dlm->node_num);

	for (i = 0; i < 3; i++) {
		list_for_each_safe(iter, iter2, queue) {
			lock = list_entry(iter, struct dlm_lock, list);
			if (lock->ml.node != dlm->node_num) {
				mlog(0, "putting lock for node %u\n",
				     lock->ml.node);
				/* be extra careful */
				BUG_ON(!list_empty(&lock->ast_list));
				BUG_ON(!list_empty(&lock->bast_list));
				BUG_ON(lock->ast_pending);
				BUG_ON(lock->bast_pending);
				list_del_init(&lock->list);
				dlm_lock_put(lock);
			}
		}
		queue++;
	}
}
/* for now this is not too intelligent.  we will
 * need stats to make this do the right thing.
 * this just finds the first lock on one of the
 * queues and uses that node as the target. */
static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
				    struct dlm_lock_resource *res)
{
	int i;
	struct list_head *queue = &res->granted;
	struct list_head *iter;
	struct dlm_lock *lock;
	int nodenum;

	assert_spin_locked(&dlm->spinlock);

	spin_lock(&res->spinlock);
	for (i = 0; i < 3; i++) {
		list_for_each(iter, queue) {
			/* up to the caller to make sure this node
			 * is alive */
			lock = list_entry(iter, struct dlm_lock, list);
			if (lock->ml.node != dlm->node_num) {
				spin_unlock(&res->spinlock);
				return lock->ml.node;
			}
		}
		queue++;
	}
	spin_unlock(&res->spinlock);
	mlog(0, "have not found a suitable target yet! checking domain map\n");

	/* ok now we're getting desperate.  pick anyone alive. */
	nodenum = -1;
	while (1) {
		nodenum = find_next_bit(dlm->domain_map,
					O2NM_MAX_NODES, nodenum + 1);
		mlog(0, "found %d in domain map\n", nodenum);
		if (nodenum >= O2NM_MAX_NODES)
			break;
		if (nodenum != dlm->node_num) {
			mlog(0, "picking %d\n", nodenum);
			return nodenum;
		}
	}

	mlog(0, "giving up.  no master to migrate to\n");
	return DLM_LOCK_RES_OWNER_UNKNOWN;
}
/* this is called by the new master once all lockres
 * data has been received */
static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
				  struct dlm_lock_resource *res,
				  u8 master, u8 new_master,
				  struct dlm_node_iter *iter)
{
	struct dlm_migrate_request migrate;
	int ret, status = 0;
	int nodenum;

	memset(&migrate, 0, sizeof(migrate));
	migrate.namelen = res->lockname.len;
	memcpy(migrate.name, res->lockname.name, migrate.namelen);
	migrate.new_master = new_master;
	migrate.master = master;

	ret = 0;

	/* send message to all nodes, except the master and myself */
	while ((nodenum = dlm_node_iter_next(iter)) >= 0) {
		if (nodenum == master ||
		    nodenum == new_master)
			continue;

		ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
					 &migrate, sizeof(migrate), nodenum,
					 &status);
		if (ret < 0)
			mlog_errno(ret);
		else if (status < 0) {
			mlog(0, "migrate request (node %u) returned %d!\n",
			     nodenum, status);
			ret = status;
		}
	}

	if (ret < 0)
		mlog_errno(ret);

	mlog(0, "returning ret=%d\n", ret);
	return ret;
}
/* if there is an existing mle for this lockres, we now know who the master is.
 * (the one who sent us *this* message) we can clear it up right away.
 * since the process that put the mle on the list still has a reference to it,
 * we can unhash it now, set the master and wake the process.  as a result,
 * we will have no mle in the list to start with.  now we can add an mle for
 * the migration and this should be the only one found for those scanning the
 * list. */
int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
{
	struct dlm_ctxt *dlm = data;
	struct dlm_lock_resource *res = NULL;
	struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
	struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
	const char *name;
	unsigned int namelen, hash;
	int ret = 0;

	if (!dlm_grab(dlm))
		return -EINVAL;

	name = migrate->name;
	namelen = migrate->namelen;
	hash = dlm_lockid_hash(name, namelen);

	/* preallocate.. if this fails, abort */
	mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
								GFP_KERNEL);
	if (!mle) {
		ret = -ENOMEM;
		goto leave;
	}

	/* check for pre-existing lock */
	spin_lock(&dlm->spinlock);
	res = __dlm_lookup_lockres(dlm, name, namelen, hash);
	spin_lock(&dlm->master_lock);

	if (res) {
		spin_lock(&res->spinlock);
		if (res->state & DLM_LOCK_RES_RECOVERING) {
			/* if all is working ok, this can only mean that we got
			 * a migrate request from a node that we now see as
			 * dead.  what can we do here?  drop it to the floor? */
			spin_unlock(&res->spinlock);
			mlog(ML_ERROR, "Got a migrate request, but the "
			     "lockres is marked as recovering!");
			kmem_cache_free(dlm_mle_cache, mle);
			ret = -EINVAL; /* need a better solution */
			goto unlock;
		}
		res->state |= DLM_LOCK_RES_MIGRATING;
		spin_unlock(&res->spinlock);
	}

	/* ignore status.  only nonzero status would BUG. */
	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
				    name, namelen,
				    migrate->new_master,
				    migrate->master);

unlock:
	spin_unlock(&dlm->master_lock);
	spin_unlock(&dlm->spinlock);

	if (oldmle) {
		/* master is known, detach if not already detached */
		dlm_mle_detach_hb_events(dlm, oldmle);
		dlm_put_mle(oldmle);
	}

	if (res)
		dlm_lockres_put(res);
leave:
	dlm_put(dlm);
	return ret;
}
/* must be holding dlm->spinlock and dlm->master_lock
 * when adding a migration mle, we can clear any other mles
 * in the master list because we know with certainty that
 * the master is "master".  so we remove any old mle from
 * the list after setting its master field, and then add
 * the new migration mle.  this way we can hold with the rule
 * of having only one mle for a given lock name at all times. */
static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
				 struct dlm_lock_resource *res,
				 struct dlm_master_list_entry *mle,
				 struct dlm_master_list_entry **oldmle,
				 const char *name, unsigned int namelen,
				 u8 new_master, u8 master)
{
	int found;
	int ret = 0;

	*oldmle = NULL;

	assert_spin_locked(&dlm->spinlock);
	assert_spin_locked(&dlm->master_lock);

	/* caller is responsible for any ref taken here on oldmle */
	found = dlm_find_mle(dlm, oldmle, (char *)name, namelen);
	if (found) {
		struct dlm_master_list_entry *tmp = *oldmle;
		spin_lock(&tmp->spinlock);
		if (tmp->type == DLM_MLE_MIGRATION) {
			if (master == dlm->node_num) {
				/* ah another process raced me to it */
				mlog(0, "tried to migrate %.*s, but some "
				     "process beat me to it\n",
				     namelen, name);
				ret = -EEXIST;
			} else {
				/* bad.  2 NODES are trying to migrate! */
				mlog(ML_ERROR, "migration error mle: "
				     "master=%u new_master=%u // request: "
				     "master=%u new_master=%u // "
				     "lockres=%.*s\n",
				     tmp->master, tmp->new_master,
				     master, new_master,
				     namelen, name);
				BUG();
			}
		} else {
			/* this is essentially what assert_master does */
			tmp->master = master;
			atomic_set(&tmp->woken, 1);
			wake_up(&tmp->wq);
			/* remove it from the list so that only one
			 * mle will be found */
			list_del_init(&tmp->list);
		}
		spin_unlock(&tmp->spinlock);
	}

	/* now add a migration mle to the tail of the list */
	dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen);
	mle->new_master = new_master;
	mle->master = master;
	/* do this for consistency with other mle types */
	set_bit(new_master, mle->maybe_map);
	list_add(&mle->list, &dlm->master_list);

	return ret;
}
void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
{
	struct list_head *iter, *iter2;
	struct dlm_master_list_entry *mle;
	struct dlm_lock_resource *res;
	unsigned int hash;

	mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
top:
	assert_spin_locked(&dlm->spinlock);

	/* clean the master list */
	spin_lock(&dlm->master_lock);
	list_for_each_safe(iter, iter2, &dlm->master_list) {
		mle = list_entry(iter, struct dlm_master_list_entry, list);

		BUG_ON(mle->type != DLM_MLE_BLOCK &&
		       mle->type != DLM_MLE_MASTER &&
		       mle->type != DLM_MLE_MIGRATION);

		/* MASTER mles are initiated locally.  the waiting
		 * process will notice the node map change
		 * shortly.  let that happen as normal. */
		if (mle->type == DLM_MLE_MASTER)
			continue;

		/* BLOCK mles are initiated by other nodes.
		 * need to clean up if the dead node would have
		 * been the master. */
		if (mle->type == DLM_MLE_BLOCK) {
			int bit;

			spin_lock(&mle->spinlock);
			bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
			if (bit != dead_node) {
				mlog(0, "mle found, but dead node %u would "
				     "not have been master\n", dead_node);
				spin_unlock(&mle->spinlock);
			} else {
				/* must drop the refcount by one since the
				 * assert_master will never arrive.  this
				 * may result in the mle being unlinked and
				 * freed, but there may still be a process
				 * waiting in the dlmlock path which is fine. */
				mlog(ML_ERROR, "node %u was expected master\n",
				     dead_node);
				atomic_set(&mle->woken, 1);
				spin_unlock(&mle->spinlock);
				wake_up(&mle->wq);
				/* do not need events any longer, so detach
				 * from heartbeat */
				__dlm_mle_detach_hb_events(dlm, mle);
				__dlm_put_mle(mle);
			}
			continue;
		}

		/* everything else is a MIGRATION mle */

		/* the rule for MIGRATION mles is that the master
		 * becomes UNKNOWN if *either* the original or
		 * the new master dies.  all UNKNOWN lockreses
		 * are sent to whichever node becomes the recovery
		 * master.  the new master is responsible for
		 * determining if there is still a master for
		 * this lockres, or if he needs to take over
		 * mastery.  either way, this node should expect
		 * another message to resolve this. */
		if (mle->master != dead_node &&
		    mle->new_master != dead_node)
			continue;

		/* if we have reached this point, this mle needs to
		 * be removed from the list and freed. */

		/* remove from the list early.  NOTE: unlinking
		 * list_head while in list_for_each_safe */
		spin_lock(&mle->spinlock);
		list_del_init(&mle->list);
		atomic_set(&mle->woken, 1);
		spin_unlock(&mle->spinlock);
		wake_up(&mle->wq);

		mlog(0, "node %u died during migration from "
		     "%u to %u!\n", dead_node,
		     mle->master, mle->new_master);
		/* if there is a lockres associated with this
		 * mle, find it and set its owner to UNKNOWN */
		hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len);
		res = __dlm_lookup_lockres(dlm, mle->u.name.name,
					   mle->u.name.len, hash);
		if (res) {
			/* unfortunately if we hit this rare case, our
			 * lock ordering is messed.  we need to drop
			 * the master lock so that we can take the
			 * lockres lock, meaning that we will have to
			 * restart from the head of list. */
			spin_unlock(&dlm->master_lock);

			/* move lockres onto recovery list */
			spin_lock(&res->spinlock);
			dlm_set_lockres_owner(dlm, res,
					      DLM_LOCK_RES_OWNER_UNKNOWN);
			dlm_move_lockres_to_recovery_list(dlm, res);
			spin_unlock(&res->spinlock);
			dlm_lockres_put(res);

			/* about to get rid of mle, detach from heartbeat */
			__dlm_mle_detach_hb_events(dlm, mle);

			/* dump the mle */
			spin_lock(&dlm->master_lock);
			__dlm_put_mle(mle);
			spin_unlock(&dlm->master_lock);

			/* restart */
			goto top;
		}

		/* this may be the last reference */
		__dlm_put_mle(mle);
	}
	spin_unlock(&dlm->master_lock);
}
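
/*
 * Added summary (for clarity, not original text): the cleanup rules
 * above are per mle type.  MASTER mles are left alone because the local
 * waiter will see the node map change itself.  BLOCK mles are torn down
 * only when the dead node was the expected master, since the awaited
 * assert_master can then never arrive.  MIGRATION mles are removed when
 * either endpoint of the migration died, and any attached lockres is
 * handed to recovery with its owner set to UNKNOWN.
 */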
int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
			 u8 old_master)
{
	struct dlm_node_iter iter;
	int ret = 0;

	spin_lock(&dlm->spinlock);
	dlm_node_iter_init(dlm->domain_map, &iter);
	clear_bit(old_master, iter.node_map);
	clear_bit(dlm->node_num, iter.node_map);
	spin_unlock(&dlm->spinlock);

	mlog(0, "now time to do a migrate request to other nodes\n");
	ret = dlm_do_migrate_request(dlm, res, old_master,
				     dlm->node_num, &iter);
	if (ret < 0) {
		mlog_errno(ret);
		goto leave;
	}

	mlog(0, "doing assert master of %.*s to all except the original node\n",
	     res->lockname.len, res->lockname.name);
	/* this call now finishes out the nodemap
	 * even if one or more nodes die */
	ret = dlm_do_assert_master(dlm, res->lockname.name,
				   res->lockname.len, iter.node_map,
				   DLM_ASSERT_MASTER_FINISH_MIGRATION);
	if (ret < 0) {
		/* no longer need to retry.  all living nodes contacted. */
		mlog_errno(ret);
		ret = 0;
	}

	memset(iter.node_map, 0, sizeof(iter.node_map));
	set_bit(old_master, iter.node_map);
	mlog(0, "doing assert master of %.*s back to %u\n",
	     res->lockname.len, res->lockname.name, old_master);
	ret = dlm_do_assert_master(dlm, res->lockname.name,
				   res->lockname.len, iter.node_map,
				   DLM_ASSERT_MASTER_FINISH_MIGRATION);
	if (ret < 0) {
		mlog(0, "assert master to original master failed "
		     "with %d.\n", ret);
		/* the only nonzero status here would be because of
		 * a dead original node.  we're done. */
		ret = 0;
	}

	/* all done, set the owner, clear the flag */
	spin_lock(&res->spinlock);
	dlm_set_lockres_owner(dlm, res, dlm->node_num);
	res->state &= ~DLM_LOCK_RES_MIGRATING;
	spin_unlock(&res->spinlock);
	/* re-dirty it on the new master */
	dlm_kick_thread(dlm, res);
	wake_up(&res->wq);
leave:
	return ret;
}
/*
 * LOCKRES AST REFCOUNT
 * this is integral to migration
 */

/* for future intent to call an ast, reserve one ahead of time.
 * this should be called only after waiting on the lockres
 * with dlm_wait_on_lockres, and while still holding the
 * spinlock after the call. */
void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res)
{
	assert_spin_locked(&res->spinlock);
	if (res->state & DLM_LOCK_RES_MIGRATING) {
		__dlm_print_one_lock_resource(res);
	}
	BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);

	atomic_inc(&res->asts_reserved);
}
/*
 * used to drop the reserved ast, either because it went unused,
 * or because the ast/bast was actually called.
 *
 * also, if there is a pending migration on this lockres,
 * and this was the last pending ast on the lockres,
 * atomically set the MIGRATING flag before we drop the lock.
 * this is how we ensure that migration can proceed with no
 * asts in progress.  note that it is ok if the state of the
 * queues is such that a lock should be granted in the future
 * or that a bast should be fired, because the new master will
 * shuffle the lists on this lockres as soon as it is migrated.
 */
void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
			     struct dlm_lock_resource *res)
{
	if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock))
		return;

	if (!res->migration_pending) {
		spin_unlock(&res->spinlock);
		return;
	}

	BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
	res->migration_pending = 0;
	res->state |= DLM_LOCK_RES_MIGRATING;
	spin_unlock(&res->spinlock);
	wake_up(&res->wq);
	wake_up(&dlm->migration_wq);
}
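
/*
 * Illustrative sketch, not part of the original file (compiled out with
 * #if 0): the intended reserve/release pairing for a hypothetical caller
 * that expects to fire an ast later.  The function name is invented for
 * the example.
 */
#if 0
static void example_ast_window(struct dlm_ctxt *dlm,
			       struct dlm_lock_resource *res)
{
	spin_lock(&res->spinlock);
	__dlm_lockres_reserve_ast(res);	/* lockres cannot enter MIGRATING */
	spin_unlock(&res->spinlock);

	/* ... queue and deliver the ast or bast ... */

	dlm_lockres_release_ast(dlm, res);	/* may set MIGRATING and wake
						 * the migration waiter */
}
#endif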