1 /******************************************************************************
2 *******************************************************************************
4 ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5 ** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
7 ** This copyrighted material is made available to anyone wishing to use,
8 ** modify, copy, or redistribute it subject to the terms and conditions
9 ** of the GNU General Public License v.2.
11 *******************************************************************************
12 ******************************************************************************/
14 #include "dlm_internal.h"
15 #include "lockspace.h"
28 * Recovery waiting routines: these functions wait for a particular reply from
29 * a remote node, or for the remote node to report a certain status. They need
30 * to abort if the lockspace is stopped indicating a node has failed (perhaps
31 * the one being waited for).
35 * Wait until given function returns non-zero or lockspace is stopped
36 * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes). When another
37 * function thinks it could have completed the waited-on task, it should wake
38 * up ls_wait_general to get an immediate response rather than waiting for the
39 * timer to detect the result. A timer wakes us up periodically while waiting
40 * to see if we should abort due to a node failure. This should only be called
41 * by the dlm_recoverd thread.
44 static void dlm_wait_timer_fn(unsigned long data
)
46 struct dlm_ls
*ls
= (struct dlm_ls
*) data
;
47 mod_timer(&ls
->ls_timer
, jiffies
+ (dlm_config
.recover_timer
* HZ
));
48 wake_up(&ls
->ls_wait_general
);
/*
 * dlm_wait_function - sleep until testfn(ls) is non-zero or recovery stops.
 *
 * Initialises ls->ls_timer with dlm_wait_timer_fn as its callback and the
 * lockspace pointer as its data, arms it for dlm_config.recover_timer * HZ
 * jiffies from now, then sleeps on ls->ls_wait_general until either
 * testfn(ls) or dlm_recovery_stopped(ls) is true.  The timer is removed with
 * del_timer_sync() once the wait ends.  If recovery was stopped,
 * "dlm_wait_function aborted" is logged.
 *
 * NOTE(review): this listing is missing lines (embedded numbers 53 and
 * 66-68); the declaration of the result variable, the value set on abort and
 * the return statement are not visible here -- confirm against the full
 * file.
 */
51 int dlm_wait_function(struct dlm_ls
*ls
, int (*testfn
) (struct dlm_ls
*ls
))
55 init_timer(&ls
->ls_timer
);
56 ls
->ls_timer
.function
= dlm_wait_timer_fn
;
57 ls
->ls_timer
.data
= (long) ls
;
58 ls
->ls_timer
.expires
= jiffies
+ (dlm_config
.recover_timer
* HZ
);
59 add_timer(&ls
->ls_timer
);
61 wait_event(ls
->ls_wait_general
, testfn(ls
) || dlm_recovery_stopped(ls
));
62 del_timer_sync(&ls
->ls_timer
);
64 if (dlm_recovery_stopped(ls
)) {
65 log_debug(ls
, "dlm_wait_function aborted");
72 * An efficient way for all nodes to wait for all others to have a certain
73 * status. The node with the lowest nodeid polls all the others for their
74 * status (wait_status_all) and all the others poll the node with the low id
75 * for its accumulated result (wait_status_low). When all nodes have set
76 * status flag X, then status flag X_ALL will be set on the low nodeid.
79 uint32_t dlm_recover_status(struct dlm_ls
*ls
)
82 spin_lock(&ls
->ls_recover_lock
);
83 status
= ls
->ls_recover_status
;
84 spin_unlock(&ls
->ls_recover_lock
);
88 void dlm_set_recover_status(struct dlm_ls
*ls
, uint32_t status
)
90 spin_lock(&ls
->ls_recover_lock
);
91 ls
->ls_recover_status
|= status
;
92 spin_unlock(&ls
->ls_recover_lock
);
/*
 * wait_status_all - query every member of ls->ls_nodes for wait_status.
 *
 * The visible fragments walk ls->ls_nodes and, for each member, check
 * dlm_recovery_stopped(ls), call dlm_rcom_status(ls, memb->nodeid) and test
 * rc->rc_result against wait_status, where rc points at ls->ls_recover_buf
 * viewed as a struct dlm_rcom.
 *
 * NOTE(review): many lines are missing from this listing (embedded numbers
 * 99-100, 102-103, 105-108, 110-112, 114 onwards); the declaration of
 * `error`, any retry or delay logic, error handling and the return value are
 * not visible -- confirm against the full file.
 */
95 static int wait_status_all(struct dlm_ls
*ls
, uint32_t wait_status
)
97 struct dlm_rcom
*rc
= (struct dlm_rcom
*) ls
->ls_recover_buf
;
98 struct dlm_member
*memb
;
101 list_for_each_entry(memb
, &ls
->ls_nodes
, list
) {
104 if (dlm_recovery_stopped(ls
)) {
109 error
= dlm_rcom_status(ls
, memb
->nodeid
);
113 if (rc
->rc_result
& wait_status
)
/*
 * wait_status_low - query the low nodeid for wait_status.
 *
 * The visible fragments check dlm_recovery_stopped(ls), call
 * dlm_rcom_status(ls, ls->ls_low_nodeid) and test rc->rc_result against
 * wait_status, where rc points at ls->ls_recover_buf viewed as a
 * struct dlm_rcom.  Locals `error` and `delay` both start at 0.
 *
 * NOTE(review): lines are missing from this listing (embedded numbers
 * 128-129, 131-134, 136-138, 140 onwards); the enclosing loop, the use of
 * `delay`, error handling and the return value are not visible -- confirm
 * against the full file.
 */
124 static int wait_status_low(struct dlm_ls
*ls
, uint32_t wait_status
)
126 struct dlm_rcom
*rc
= (struct dlm_rcom
*) ls
->ls_recover_buf
;
127 int error
= 0, delay
= 0, nodeid
= ls
->ls_low_nodeid
;
130 if (dlm_recovery_stopped(ls
)) {
135 error
= dlm_rcom_status(ls
, nodeid
);
139 if (rc
->rc_result
& wait_status
)
/*
 * wait_status - wait for a recovery status across the lockspace.
 *
 * status_all is `status << 1`.  When ls->ls_low_nodeid equals
 * dlm_our_nodeid(), the visible fragments call wait_status_all(ls, status)
 * and dlm_set_recover_status(ls, status_all); the remaining visible call is
 * wait_status_low(ls, status_all).
 *
 * NOTE(review): lines are missing from this listing (embedded numbers
 * 152-153, 156, 158, 160-161); the declaration of `error`, whether setting
 * status_all is conditional on success, the else branch structure and the
 * return statement are not visible -- confirm against the full file.
 */
149 static int wait_status(struct dlm_ls
*ls
, uint32_t status
)
151 uint32_t status_all
= status
<< 1;
154 if (ls
->ls_low_nodeid
== dlm_our_nodeid()) {
155 error
= wait_status_all(ls
, status
);
157 dlm_set_recover_status(ls
, status_all
);
159 error
= wait_status_low(ls
, status_all
);
164 int dlm_recover_members_wait(struct dlm_ls
*ls
)
166 return wait_status(ls
, DLM_RS_NODES
);
169 int dlm_recover_directory_wait(struct dlm_ls
*ls
)
171 return wait_status(ls
, DLM_RS_DIR
);
174 int dlm_recover_locks_wait(struct dlm_ls
*ls
)
176 return wait_status(ls
, DLM_RS_LOCKS
);
179 int dlm_recover_done_wait(struct dlm_ls
*ls
)
181 return wait_status(ls
, DLM_RS_DONE
);
185 * The recover_list contains all the rsb's for which we've requested the new
186 * master nodeid. As replies are returned from the resource directories the
187 * rsb's are removed from the list. When the list is empty we're done.
189 * The recover_list is later similarly used for all rsb's for which we've sent
190 * new lkb's and need to receive new corresponding lkid's.
192 * We use the address of the rsb struct as a simple local identifier for the
193 * rsb so we can match an rcom reply with the rsb it was sent for.
196 static int recover_list_empty(struct dlm_ls
*ls
)
200 spin_lock(&ls
->ls_recover_list_lock
);
201 empty
= list_empty(&ls
->ls_recover_list
);
202 spin_unlock(&ls
->ls_recover_list_lock
);
/*
 * recover_list_add - put an rsb on its lockspace's recover list.
 *
 * Under ls_recover_list_lock: if r->res_recover_list is not already linked
 * (list_empty), it is appended to ls->ls_recover_list and
 * ls_recover_list_count is incremented.
 *
 * NOTE(review): the embedded numbering jumps from 214 to 217, so at least
 * one statement inside the if-block may be missing from this listing --
 * confirm against the full file.
 */
207 static void recover_list_add(struct dlm_rsb
*r
)
209 struct dlm_ls
*ls
= r
->res_ls
;
211 spin_lock(&ls
->ls_recover_list_lock
);
212 if (list_empty(&r
->res_recover_list
)) {
213 list_add_tail(&r
->res_recover_list
, &ls
->ls_recover_list
);
214 ls
->ls_recover_list_count
++;
217 spin_unlock(&ls
->ls_recover_list_lock
);
/*
 * recover_list_del - take an rsb off its lockspace's recover list.
 *
 * Under ls_recover_list_lock: unlinks r->res_recover_list with
 * list_del_init() and decrements ls_recover_list_count.
 *
 * NOTE(review): the listing stops at embedded line 227; anything done after
 * the unlock (embedded lines 228-230) is not visible -- confirm against the
 * full file.
 */
220 static void recover_list_del(struct dlm_rsb
*r
)
222 struct dlm_ls
*ls
= r
->res_ls
;
224 spin_lock(&ls
->ls_recover_list_lock
);
225 list_del_init(&r
->res_recover_list
);
226 ls
->ls_recover_list_count
--;
227 spin_unlock(&ls
->ls_recover_list_lock
);
/*
 * recover_list_find - look up an rsb on the recover list by id.
 *
 * Walks ls->ls_recover_list under ls_recover_list_lock, comparing id with
 * the address of each rsb cast to unsigned long.
 *
 * NOTE(review): lines are missing from this listing (embedded numbers
 * 240-243 and 245); what happens on a match, the not-found result and the
 * return statement are not visible -- confirm against the full file.
 */
232 static struct dlm_rsb
*recover_list_find(struct dlm_ls
*ls
, uint64_t id
)
234 struct dlm_rsb
*r
= NULL
;
236 spin_lock(&ls
->ls_recover_list_lock
);
238 list_for_each_entry(r
, &ls
->ls_recover_list
, res_recover_list
) {
239 if (id
== (unsigned long) r
)
244 spin_unlock(&ls
->ls_recover_list_lock
);
/*
 * recover_list_clear - empty the lockspace's recover list.
 *
 * Under ls_recover_list_lock, each rsb on ls->ls_recover_list is unlinked
 * with list_del_init(), its res_recover_locks_count is zeroed and
 * ls_recover_list_count is decremented.  If ls_recover_list_count is then
 * non-zero, "warning: recover_list_count %d" is logged and the count is
 * reset to 0.
 *
 * NOTE(review): embedded line 256 (inside the loop) is missing from this
 * listing, so a per-rsb statement may not be shown -- confirm against the
 * full file.
 */
248 static void recover_list_clear(struct dlm_ls
*ls
)
250 struct dlm_rsb
*r
, *s
;
252 spin_lock(&ls
->ls_recover_list_lock
);
253 list_for_each_entry_safe(r
, s
, &ls
->ls_recover_list
, res_recover_list
) {
254 list_del_init(&r
->res_recover_list
);
255 r
->res_recover_locks_count
= 0;
257 ls
->ls_recover_list_count
--;
260 if (ls
->ls_recover_list_count
!= 0) {
261 log_error(ls
, "warning: recover_list_count %d",
262 ls
->ls_recover_list_count
);
263 ls
->ls_recover_list_count
= 0;
265 spin_unlock(&ls
->ls_recover_list_lock
);
269 /* Master recovery: find new master node for rsb's that were
270 mastered on nodes that have been removed.
274 dlm_send_rcom_lookup -> receive_rcom_lookup
276 receive_rcom_lookup_reply <-
277 dlm_recover_master_reply
284 * Set the lock master for all LKBs in a lock queue
285 * If we are the new master of the rsb, we may have received new
286 * MSTCPY locks from other nodes already which we need to ignore
287 * when setting the new nodeid.
290 static void set_lock_master(struct list_head
*queue
, int nodeid
)
294 list_for_each_entry(lkb
, queue
, lkb_statequeue
)
295 if (!(lkb
->lkb_flags
& DLM_IFL_MSTCPY
))
296 lkb
->lkb_nodeid
= nodeid
;
299 static void set_master_lkbs(struct dlm_rsb
*r
)
301 set_lock_master(&r
->res_grantqueue
, r
->res_nodeid
);
302 set_lock_master(&r
->res_convertqueue
, r
->res_nodeid
);
303 set_lock_master(&r
->res_waitqueue
, r
->res_nodeid
);
307 * Propagate the new master nodeid to locks
308 * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
309 * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which
/*
 * set_new_master - record a new master nodeid on an rsb.
 *
 * The visible fragments store nodeid in r->res_nodeid and set both the
 * RSB_NEW_MASTER and RSB_NEW_MASTER2 flags on the rsb.
 *
 * NOTE(review): embedded lines 315, 317 and 320 are missing from this
 * listing, so locking around the update or other calls may not be shown --
 * confirm against the full file.
 */
313 static void set_new_master(struct dlm_rsb
*r
, int nodeid
)
316 r
->res_nodeid
= nodeid
;
318 rsb_set_flag(r
, RSB_NEW_MASTER
);
319 rsb_set_flag(r
, RSB_NEW_MASTER2
);
324 * We do async lookups on rsb's that need new masters. The rsb's
325 * waiting for a lookup reply are kept on the recover_list.
/*
 * recover_master - find the new master for one rsb.
 *
 * dir_nodeid comes from dlm_dir_nodeid(r).  When it equals our own nodeid
 * the lookup is done locally with dlm_dir_lookup() on the rsb's name and
 * length; the visible fragments log "recover dir lookup error %d", compare
 * ret_nodeid with our_nodeid and call set_new_master(r, ret_nodeid).  The
 * other visible call is dlm_send_rcom_lookup(r, dir_nodeid).
 *
 * NOTE(review): lines are missing from this listing (embedded numbers 332,
 * 334, 338, 340, 342, 344-345, 347 onwards); the conditions guarding the
 * log call and the nodeid comparison, the else branch structure, any
 * recover-list handling and the return value are not visible -- confirm
 * against the full file.
 */
328 static int recover_master(struct dlm_rsb
*r
)
330 struct dlm_ls
*ls
= r
->res_ls
;
331 int error
, dir_nodeid
, ret_nodeid
, our_nodeid
= dlm_our_nodeid();
333 dir_nodeid
= dlm_dir_nodeid(r
);
335 if (dir_nodeid
== our_nodeid
) {
336 error
= dlm_dir_lookup(ls
, our_nodeid
, r
->res_name
,
337 r
->res_length
, &ret_nodeid
);
339 log_error(ls
, "recover dir lookup error %d", error
);
341 if (ret_nodeid
== our_nodeid
)
343 set_new_master(r
, ret_nodeid
);
346 error
= dlm_send_rcom_lookup(r
, dir_nodeid
);
353 * When not using a directory, most resource names will hash to a new static
354 * master nodeid and the resource will need to be remastered.
/*
 * recover_master_static - remaster one rsb when no directory is used.
 *
 * master is dlm_dir_nodeid(r).  The visible fragments test whether master
 * is our own nodeid and, when r->res_nodeid differs from master, call
 * dlm_purge_mstcpy_locks(r) followed by set_new_master(r, master).
 *
 * NOTE(review): lines are missing from this listing (embedded numbers 360,
 * 362-363, 365, 368 onwards); what is done when master is our own nodeid
 * and the value returned are not visible -- confirm against the full file.
 */
357 static int recover_master_static(struct dlm_rsb
*r
)
359 int master
= dlm_dir_nodeid(r
);
361 if (master
== dlm_our_nodeid())
364 if (r
->res_nodeid
!= master
) {
366 dlm_purge_mstcpy_locks(r
);
367 set_new_master(r
, master
);
374 * Go through local root resources and for each rsb which has a master which
375 * has departed, get the new master nodeid from the directory. The dir will
376 * assign mastery to the first node to look up the new master. That means
377 * we'll discover in this lookup if we're the new master of any rsb's.
379 * We fire off all the dir lookup requests individually and asynchronously to
380 * the correct dir node.
/*
 * dlm_recover_masters - start master recovery for the root rsbs.
 *
 * Logs "dlm_recover_masters", then walks ls->ls_root_list with ls_root_sem
 * held for reading.  Inside the loop the visible fragments drop the
 * semaphore when dlm_recovery_stopped(ls) is true, add
 * recover_master_static(r) to count when dlm_no_directory(ls) is true, and
 * otherwise test `!is_master(r) && dlm_is_removed(ls, r->res_nodeid)`.
 * After the loop the semaphore is released, "dlm_recover_masters %d
 * resources" is logged, dlm_wait_function(ls, &recover_list_empty) is
 * called and recover_list_clear(ls) appears.
 *
 * NOTE(review): lines are missing from this listing (embedded numbers
 * 384-385, 394-397, 401-406, 412-413, 415 onwards); the body of the
 * removed-master branch, the early-exit path, the condition under which
 * recover_list_clear() runs and the return value are not visible -- confirm
 * against the full file.
 */
383 int dlm_recover_masters(struct dlm_ls
*ls
)
386 int error
= 0, count
= 0;
388 log_debug(ls
, "dlm_recover_masters");
390 down_read(&ls
->ls_root_sem
);
391 list_for_each_entry(r
, &ls
->ls_root_list
, res_root_list
) {
392 if (dlm_recovery_stopped(ls
)) {
393 up_read(&ls
->ls_root_sem
);
398 if (dlm_no_directory(ls
))
399 count
+= recover_master_static(r
);
400 else if (!is_master(r
) && dlm_is_removed(ls
, r
->res_nodeid
)) {
407 up_read(&ls
->ls_root_sem
);
409 log_debug(ls
, "dlm_recover_masters %d resources", count
);
411 error
= dlm_wait_function(ls
, &recover_list_empty
);
414 recover_list_clear(ls
);
/*
 * dlm_recover_master_reply - handle a reply to a master lookup.
 *
 * Looks up the rsb with recover_list_find(ls, rc->rc_id); the visible
 * fragments log "dlm_recover_master_reply no id %llx" with rc->rc_id, take
 * nodeid from rc->rc_result, compare it with dlm_our_nodeid(), call
 * set_new_master(r, nodeid), and wake ls_wait_general when
 * recover_list_empty(ls) is true.
 *
 * NOTE(review): lines are missing from this listing (embedded numbers
 * 419-422, 424, 427-429, 432-433, 435-436, 439 onwards); the local
 * declarations, the condition guarding the log call, what is done when
 * nodeid is our own, any recover-list removal and the return value are not
 * visible -- confirm against the full file.
 */
418 int dlm_recover_master_reply(struct dlm_ls
*ls
, struct dlm_rcom
*rc
)
423 r
= recover_list_find(ls
, rc
->rc_id
);
425 log_error(ls
, "dlm_recover_master_reply no id %llx",
426 (unsigned long long)rc
->rc_id
);
430 nodeid
= rc
->rc_result
;
431 if (nodeid
== dlm_our_nodeid())
434 set_new_master(r
, nodeid
);
437 if (recover_list_empty(ls
))
438 wake_up(&ls
->ls_wait_general
);
444 /* Lock recovery: rebuild the process-copy locks we hold on a
445 remastered rsb on the new rsb master.
450 dlm_send_rcom_lock -> receive_rcom_lock
451 dlm_recover_master_copy
452 receive_rcom_lock_reply <-
453 dlm_recover_process_copy
458 * keep a count of the number of lkb's we send to the new master; when we get
459 * an equal number of replies then recovery for the rsb is done
/*
 * recover_locks_queue - send the lkbs on one queue of an rsb.
 *
 * For each lkb on head (linked through lkb_statequeue) the visible
 * fragments call dlm_send_rcom_lock(r, lkb) and increment
 * r->res_recover_locks_count.
 *
 * NOTE(review): lines are missing from this listing (embedded numbers
 * 463-466, 469-470, 472 onwards); the local declarations, the handling of a
 * send error and the return value are not visible -- confirm against the
 * full file.
 */
462 static int recover_locks_queue(struct dlm_rsb
*r
, struct list_head
*head
)
467 list_for_each_entry(lkb
, head
, lkb_statequeue
) {
468 error
= dlm_send_rcom_lock(r
, lkb
);
471 r
->res_recover_locks_count
++;
/*
 * recover_locks - send all locks held on one rsb.
 *
 * Asserts that r->res_recover_locks_count is zero on entry (dumping the rsb
 * otherwise), then runs recover_locks_queue() on the grant, convert and
 * wait queues in that order.  The visible fragments then test
 * r->res_recover_locks_count and clear RSB_NEW_MASTER.
 *
 * NOTE(review): lines are missing from this listing (embedded numbers
 * 478-482, 484, 486-487, 489-490, 492-494, 496-497, 499 onwards); error
 * checks between the queue calls, what is done when the count is non-zero,
 * whether clearing the flag is conditional, and the return value are not
 * visible -- confirm against the full file.
 */
477 static int recover_locks(struct dlm_rsb
*r
)
483 DLM_ASSERT(!r
->res_recover_locks_count
, dlm_dump_rsb(r
););
485 error
= recover_locks_queue(r
, &r
->res_grantqueue
);
488 error
= recover_locks_queue(r
, &r
->res_convertqueue
);
491 error
= recover_locks_queue(r
, &r
->res_waitqueue
);
495 if (r
->res_recover_locks_count
)
498 rsb_clear_flag(r
, RSB_NEW_MASTER
);
/*
 * dlm_recover_locks - run lock recovery over the root rsbs.
 *
 * Logs "dlm_recover_locks", then walks ls->ls_root_list with ls_root_sem
 * held for reading.  The visible fragments inside the loop clear
 * RSB_NEW_MASTER, test whether RSB_NEW_MASTER is unset, drop the semaphore
 * when dlm_recovery_stopped(ls) is true, call recover_locks(r) (with
 * another semaphore release after it) and add r->res_recover_locks_count to
 * count.  After the loop the semaphore is released,
 * "dlm_recover_locks %d locks" is logged,
 * dlm_wait_function(ls, &recover_list_empty) is called,
 * recover_list_clear(ls) appears, and DLM_RS_LOCKS is set with
 * dlm_set_recover_status().
 *
 * NOTE(review): lines are missing from this listing (embedded numbers
 * 505-506, 513, 515-517, 519-520, 522, 524-526, 528, 530-532, 534, 540-541,
 * 543, 545 onwards); the condition under which RSB_NEW_MASTER is cleared,
 * the error paths, the conditions guarding recover_list_clear() and the
 * status update, and the return value are not visible -- confirm against
 * the full file.
 */
504 int dlm_recover_locks(struct dlm_ls
*ls
)
507 int error
, count
= 0;
509 log_debug(ls
, "dlm_recover_locks");
511 down_read(&ls
->ls_root_sem
);
512 list_for_each_entry(r
, &ls
->ls_root_list
, res_root_list
) {
514 rsb_clear_flag(r
, RSB_NEW_MASTER
);
518 if (!rsb_flag(r
, RSB_NEW_MASTER
))
521 if (dlm_recovery_stopped(ls
)) {
523 up_read(&ls
->ls_root_sem
);
527 error
= recover_locks(r
);
529 up_read(&ls
->ls_root_sem
);
533 count
+= r
->res_recover_locks_count
;
535 up_read(&ls
->ls_root_sem
);
537 log_debug(ls
, "dlm_recover_locks %d locks", count
);
539 error
= dlm_wait_function(ls
, &recover_list_empty
);
542 recover_list_clear(ls
);
544 dlm_set_recover_status(ls
, DLM_RS_LOCKS
);
/*
 * dlm_recovered_lock - account for one recovered lock on an rsb.
 *
 * Asserts that RSB_NEW_MASTER is set on the rsb (dumping it otherwise),
 * decrements r->res_recover_locks_count and, when the count reaches zero,
 * clears RSB_NEW_MASTER.  ls_wait_general is woken when
 * recover_list_empty(r->res_ls) is true.
 *
 * NOTE(review): embedded lines 555-557 are missing from this listing, so
 * a further statement in the count-reached-zero branch may not be shown --
 * confirm against the full file.
 */
548 void dlm_recovered_lock(struct dlm_rsb
*r
)
550 DLM_ASSERT(rsb_flag(r
, RSB_NEW_MASTER
), dlm_dump_rsb(r
););
552 r
->res_recover_locks_count
--;
553 if (!r
->res_recover_locks_count
) {
554 rsb_clear_flag(r
, RSB_NEW_MASTER
);
558 if (recover_list_empty(r
->res_ls
))
559 wake_up(&r
->res_ls
->ls_wait_general
);
563 * The lvb needs to be recovered on all master rsb's. This includes setting
564 * the VALNOTVALID flag if necessary, and determining the correct lvb contents
565 * based on the lvb's of the locks held on the rsb.
567 * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it
568 * was already set prior to recovery, it's not cleared, regardless of locks.
570 * The LVB contents are only considered for changing when this is a new master
571 * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
572 * mode > CR. If no lkb's exist with mode above CR, the lvb contents are taken
573 * from the lkb with the largest lvb sequence number.
/*
 * recover_lvb - recover the lock value block of one rsb.
 *
 * The grant queue and then the convert queue are scanned.  In each scan the
 * visible fragments test whether the lkb lacks DLM_LKF_VALBLK, whether
 * lkb_grmode is above DLM_LOCK_CR, and whether lkb_lvbseq is at or ahead of
 * high_seq using a signed difference, updating high_seq in that case.
 * Afterwards the visible fragments test lock_lvb_exists, set
 * RSB_VALNOTVALID when big_lock_exists is zero, test RSB_NEW_MASTER2,
 * allocate r->res_lvbptr with allocate_lvb() when it is NULL, and then fill
 * res_lvbseq/res_lvbptr from `lkb` when big_lock_exists is set, from
 * high_lkb when that is non-NULL, or zero the buffer with memset().  All
 * copies use lvblen bytes, taken from r->res_ls->ls_lvblen.
 *
 * NOTE(review): many lines are missing from this listing (for example
 * embedded numbers 586-589, 591-594, 596, 598-600, 603-606, 608-611, 613,
 * 615-618, 620-621, 624, 627-628, 631-634, 641-642, 644 onwards); where
 * lock_lvb_exists, big_lock_exists and high_lkb are assigned, the early
 * exits, the allocation-failure handling and the branch holding the
 * memset() are not visible -- confirm against the full file.
 */
576 static void recover_lvb(struct dlm_rsb
*r
)
578 struct dlm_lkb
*lkb
, *high_lkb
= NULL
;
579 uint32_t high_seq
= 0;
580 int lock_lvb_exists
= 0;
581 int big_lock_exists
= 0;
582 int lvblen
= r
->res_ls
->ls_lvblen
;
584 list_for_each_entry(lkb
, &r
->res_grantqueue
, lkb_statequeue
) {
585 if (!(lkb
->lkb_exflags
& DLM_LKF_VALBLK
))
590 if (lkb
->lkb_grmode
> DLM_LOCK_CR
) {
595 if (((int)lkb
->lkb_lvbseq
- (int)high_seq
) >= 0) {
597 high_seq
= lkb
->lkb_lvbseq
;
601 list_for_each_entry(lkb
, &r
->res_convertqueue
, lkb_statequeue
) {
602 if (!(lkb
->lkb_exflags
& DLM_LKF_VALBLK
))
607 if (lkb
->lkb_grmode
> DLM_LOCK_CR
) {
612 if (((int)lkb
->lkb_lvbseq
- (int)high_seq
) >= 0) {
614 high_seq
= lkb
->lkb_lvbseq
;
619 if (!lock_lvb_exists
)
622 if (!big_lock_exists
)
623 rsb_set_flag(r
, RSB_VALNOTVALID
);
625 /* don't mess with the lvb unless we're the new master */
626 if (!rsb_flag(r
, RSB_NEW_MASTER2
))
629 if (!r
->res_lvbptr
) {
630 r
->res_lvbptr
= allocate_lvb(r
->res_ls
);
635 if (big_lock_exists
) {
636 r
->res_lvbseq
= lkb
->lkb_lvbseq
;
637 memcpy(r
->res_lvbptr
, lkb
->lkb_lvbptr
, lvblen
);
638 } else if (high_lkb
) {
639 r
->res_lvbseq
= high_lkb
->lkb_lvbseq
;
640 memcpy(r
->res_lvbptr
, high_lkb
->lkb_lvbptr
, lvblen
);
643 memset(r
->res_lvbptr
, 0, lvblen
);
649 /* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks
650 converting PR->CW or CW->PR need to have their lkb_grmode set. */
/*
 * recover_conversion - fix up lkb_grmode on an rsb's converting locks.
 *
 * The grant queue is scanned for an lkb whose lkb_grmode is DLM_LOCK_PR or
 * DLM_LOCK_CW, and that mode is saved in grmode.  The convert queue is then
 * scanned; the visible fragments test whether lkb_grmode differs from
 * DLM_LOCK_IV and assign lkb_grmode from either lkb_rqmode or the saved
 * grmode.
 *
 * NOTE(review): lines are missing from this listing (embedded numbers
 * 653-656, 661-664, 667-668, 670, 672 onwards); the local declarations and
 * initial value of grmode, whether the first scan stops at the first match,
 * and the condition choosing between lkb_rqmode and grmode are not visible
 * -- confirm against the full file.
 */
652 static void recover_conversion(struct dlm_rsb
*r
)
657 list_for_each_entry(lkb
, &r
->res_grantqueue
, lkb_statequeue
) {
658 if (lkb
->lkb_grmode
== DLM_LOCK_PR
||
659 lkb
->lkb_grmode
== DLM_LOCK_CW
) {
660 grmode
= lkb
->lkb_grmode
;
665 list_for_each_entry(lkb
, &r
->res_convertqueue
, lkb_statequeue
) {
666 if (lkb
->lkb_grmode
!= DLM_LOCK_IV
)
669 lkb
->lkb_grmode
= lkb
->lkb_rqmode
;
671 lkb
->lkb_grmode
= grmode
;
675 /* We've become the new master for this rsb and waiting/converting locks may
676 need to be granted in dlm_grant_after_purge() due to locks that may have
677 existed from a removed node. */
679 static void set_locks_purged(struct dlm_rsb
*r
)
681 if (!list_empty(&r
->res_waitqueue
) || !list_empty(&r
->res_convertqueue
))
682 rsb_set_flag(r
, RSB_LOCKS_PURGED
);
/*
 * dlm_recover_rsbs - per-rsb recovery pass over the root list.
 *
 * Logs "dlm_recover_rsbs", then walks ls->ls_root_list with ls_root_sem
 * held for reading.  For each rsb the visible fragments call
 * recover_conversion(r) when RSB_RECOVER_CONVERT is set, test
 * RSB_NEW_MASTER2, and clear both RSB_RECOVER_CONVERT and RSB_NEW_MASTER2.
 * After the loop the semaphore is released and "dlm_recover_rsbs %d rsbs"
 * is logged with count.
 *
 * NOTE(review): lines are missing from this listing (embedded numbers
 * 686-689, 691, 694-695, 699-702, 705-706); the local declarations, any
 * filter or locking at the top of the loop body, what is done when
 * RSB_NEW_MASTER2 is set and where count is updated are not visible --
 * confirm against the full file.
 */
685 void dlm_recover_rsbs(struct dlm_ls
*ls
)
690 log_debug(ls
, "dlm_recover_rsbs");
692 down_read(&ls
->ls_root_sem
);
693 list_for_each_entry(r
, &ls
->ls_root_list
, res_root_list
) {
696 if (rsb_flag(r
, RSB_RECOVER_CONVERT
))
697 recover_conversion(r
);
698 if (rsb_flag(r
, RSB_NEW_MASTER2
))
703 rsb_clear_flag(r
, RSB_RECOVER_CONVERT
);
704 rsb_clear_flag(r
, RSB_NEW_MASTER2
);
707 up_read(&ls
->ls_root_sem
);
709 log_debug(ls
, "dlm_recover_rsbs %d rsbs", count
);
712 /* Create a single list of all root rsb's to be used during recovery */
/*
 * dlm_create_root_list - build ls_root_list from the rsb hash table.
 *
 * Takes ls_root_sem for writing.  If ls_root_list is not empty,
 * "root list not empty" is logged.  Otherwise-visible fragments iterate
 * over the ls_rsbtbl_size buckets of ls_rsbtbl; with each bucket's lock
 * read-held, every rsb on the bucket's list (linked through res_hashchain)
 * is added to ls_root_list via res_root_list.  The semaphore is released at
 * the end.
 *
 * NOTE(review): lines are missing from this listing (embedded numbers
 * 715-718, 722-725, 730-731, 733-734, 736 onwards); the local declarations,
 * the error path taken when the list is not empty, any extra per-rsb
 * statement in the inner loop and the return value are not visible --
 * confirm against the full file.
 */
714 int dlm_create_root_list(struct dlm_ls
*ls
)
719 down_write(&ls
->ls_root_sem
);
720 if (!list_empty(&ls
->ls_root_list
)) {
721 log_error(ls
, "root list not empty");
726 for (i
= 0; i
< ls
->ls_rsbtbl_size
; i
++) {
727 read_lock(&ls
->ls_rsbtbl
[i
].lock
);
728 list_for_each_entry(r
, &ls
->ls_rsbtbl
[i
].list
, res_hashchain
) {
729 list_add(&r
->res_root_list
, &ls
->ls_root_list
);
732 read_unlock(&ls
->ls_rsbtbl
[i
].lock
);
735 up_write(&ls
->ls_root_sem
);
/*
 * dlm_release_root_list - empty ls_root_list.
 *
 * With ls_root_sem held for writing, every rsb on ls_root_list is unlinked
 * with list_del_init() on its res_root_list, using the _safe iterator so
 * entries can be removed during the walk.
 *
 * NOTE(review): embedded lines 746-747 are missing from this listing, so a
 * further per-rsb statement inside the loop may not be shown -- confirm
 * against the full file.
 */
739 void dlm_release_root_list(struct dlm_ls
*ls
)
741 struct dlm_rsb
*r
, *safe
;
743 down_write(&ls
->ls_root_sem
);
744 list_for_each_entry_safe(r
, safe
, &ls
->ls_root_list
, res_root_list
) {
745 list_del_init(&r
->res_root_list
);
748 up_write(&ls
->ls_root_sem
);
751 void dlm_clear_toss_list(struct dlm_ls
*ls
)
753 struct dlm_rsb
*r
, *safe
;
756 for (i
= 0; i
< ls
->ls_rsbtbl_size
; i
++) {
757 write_lock(&ls
->ls_rsbtbl
[i
].lock
);
758 list_for_each_entry_safe(r
, safe
, &ls
->ls_rsbtbl
[i
].toss
,
760 list_del(&r
->res_hashchain
);
763 write_unlock(&ls
->ls_rsbtbl
[i
].lock
);