1 /******************************************************************************
2 *******************************************************************************
4 ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
5 ** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
7 ** This copyrighted material is made available to anyone wishing to use,
8 ** modify, copy, or redistribute it subject to the terms and conditions
9 ** of the GNU General Public License v.2.
11 *******************************************************************************
12 ******************************************************************************/
14 #include "dlm_internal.h"
15 #include "lockspace.h"
28 * Recovery waiting routines: these functions wait for a particular reply from
29 * a remote node, or for the remote node to report a certain status. They need
30 * to abort if the lockspace is stopped indicating a node has failed (perhaps
31 * the one being waited for).
35 * Wait until the given function returns non-zero or the lockspace is stopped
36 * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes). When another
37 * function thinks it could have completed the waited-on task, it should wake
38 * up ls_wait_general to get an immediate response rather than waiting for the
39 * timer to detect the result. A timer wakes us up periodically while waiting
40 * to see if we should abort due to a node failure. This should only be called
41 * by the dlm_recoverd thread.
/*
 * Timer handler installed on ls_timer by dlm_wait_function().
 * `data` is cast back to the lockspace pointer.  The handler re-arms the
 * timer (dlm_config.ci_recover_timer * HZ jiffies from now) and wakes
 * whatever is sleeping on ls_wait_general.
 */
44 static void dlm_wait_timer_fn(unsigned long data
)
46 struct dlm_ls
*ls
= (struct dlm_ls
*) data
;
/* re-arm so the wakeup repeats for as long as the timer stays installed */
47 mod_timer(&ls
->ls_timer
, jiffies
+ (dlm_config
.ci_recover_timer
* HZ
));
48 wake_up(&ls
->ls_wait_general
);
/*
 * Sleep on ls_wait_general until testfn(ls) is non-zero or
 * dlm_recovery_stopped(ls) is true.
 *
 * ls_timer is set up with dlm_wait_timer_fn as its handler before the wait
 * and removed with del_timer_sync() afterwards.  If recovery was stopped,
 * "dlm_wait_function aborted" is logged.
 *
 * NOTE(review): the return statement is not visible in this listing;
 * callers store the result in `error`, so presumably non-zero means the
 * wait was aborted -- confirm.
 */
51 int dlm_wait_function(struct dlm_ls
*ls
, int (*testfn
) (struct dlm_ls
*ls
))
55 init_timer(&ls
->ls_timer
);
56 ls
->ls_timer
.function
= dlm_wait_timer_fn
;
/* the lockspace pointer travels through the timer's data word */
57 ls
->ls_timer
.data
= (long) ls
;
58 ls
->ls_timer
.expires
= jiffies
+ (dlm_config
.ci_recover_timer
* HZ
);
59 add_timer(&ls
->ls_timer
);
61 wait_event(ls
->ls_wait_general
, testfn(ls
) || dlm_recovery_stopped(ls
));
62 del_timer_sync(&ls
->ls_timer
);
64 if (dlm_recovery_stopped(ls
)) {
65 log_debug(ls
, "dlm_wait_function aborted");
72 * An efficient way for all nodes to wait for all others to have a certain
73 * status. The node with the lowest nodeid polls all the others for their
74 * status (wait_status_all) and all the others poll the node with the low id
75 * for its accumulated result (wait_status_low). When all nodes have set
76 * status flag X, then status flag X_ALL will be set on the low nodeid.
/*
 * Read ls->ls_recover_status while holding ls_recover_lock.
 * NOTE(review): the declaration of `status` and the return statement are
 * not visible here; presumably the sampled value is returned -- confirm.
 */
79 uint32_t dlm_recover_status(struct dlm_ls
*ls
)
82 spin_lock(&ls
->ls_recover_lock
);
83 status
= ls
->ls_recover_status
;
84 spin_unlock(&ls
->ls_recover_lock
);
/*
 * OR the given bits into ls->ls_recover_status.
 * No lock is taken in the visible lines; both visible callers
 * (dlm_set_recover_status and dlm_recover_members_wait) call this with
 * ls_recover_lock held.
 */
88 static void _set_recover_status(struct dlm_ls
*ls
, uint32_t status
)
90 ls
->ls_recover_status
|= status
;
/*
 * Locked wrapper: takes ls_recover_lock around _set_recover_status(), which
 * ORs `status` into ls->ls_recover_status.
 */
93 void dlm_set_recover_status(struct dlm_ls
*ls
, uint32_t status
)
95 spin_lock(&ls
->ls_recover_lock
);
96 _set_recover_status(ls
, status
);
97 spin_unlock(&ls
->ls_recover_lock
);
/*
 * Poll the members on ls->ls_nodes for a recovery status bit.
 *
 * For each member the visible lines check dlm_recovery_stopped(), request
 * the member's status with dlm_rcom_status() (flags argument 0), call
 * dlm_slot_save() with the reply buffer, and test rc->rc_result against
 * wait_status.
 *
 * NOTE(review): the third parameter (callers pass 0 or 1), the retry/delay
 * logic and the return statement are not visible in this listing.
 */
100 static int wait_status_all(struct dlm_ls
*ls
, uint32_t wait_status
,
/* rc aliases the lockspace's recovery buffer, ls_recover_buf */
103 struct dlm_rcom
*rc
= ls
->ls_recover_buf
;
104 struct dlm_member
*memb
;
105 int error
= 0, delay
;
107 list_for_each_entry(memb
, &ls
->ls_nodes
, list
) {
110 if (dlm_recovery_stopped(ls
)) {
115 error
= dlm_rcom_status(ls
, memb
->nodeid
, 0);
120 dlm_slot_save(ls
, rc
, memb
);
122 if (rc
->rc_result
& wait_status
)
/*
 * Poll the node recorded in ls->ls_low_nodeid for a recovery status bit.
 *
 * The visible lines check dlm_recovery_stopped(), send a status request
 * to that node with dlm_rcom_status() (passing status_flags through), and
 * test rc->rc_result against wait_status.
 *
 * NOTE(review): the surrounding loop, delay handling and return statement
 * are not visible in this listing.
 */
133 static int wait_status_low(struct dlm_ls
*ls
, uint32_t wait_status
,
134 uint32_t status_flags
)
136 struct dlm_rcom
*rc
= ls
->ls_recover_buf
;
137 int error
= 0, delay
= 0, nodeid
= ls
->ls_low_nodeid
;
140 if (dlm_recovery_stopped(ls
)) {
145 error
= dlm_rcom_status(ls
, nodeid
, status_flags
);
149 if (rc
->rc_result
& wait_status
)
/*
 * Wait for `status` to be reached across the lockspace.
 *
 * status_all is computed as status << 1.  When ls_low_nodeid is our own
 * nodeid, wait_status_all() is called for `status` and status_all is then
 * set locally.  The remaining visible call is wait_status_low() for
 * status_all -- presumably the else branch; confirm.
 *
 * NOTE(review): error checks and the return statement are not visible.
 */
159 static int wait_status(struct dlm_ls
*ls
, uint32_t status
)
161 uint32_t status_all
= status
<< 1;
164 if (ls
->ls_low_nodeid
== dlm_our_nodeid()) {
165 error
= wait_status_all(ls
, status
, 0);
167 dlm_set_recover_status(ls
, status_all
);
169 error
= wait_status_low(ls
, status_all
, 0);
/*
 * Wait for the DLM_RS_NODES status, with slot handling.
 *
 * Visible steps:
 *  - reset memb->generation to 0 for every member on ls_nodes;
 *  - if we are the low nodeid: wait_status_all(ls, DLM_RS_NODES, 1), then
 *    dlm_slots_assign(); under ls_recover_lock set DLM_RS_NODES_ALL and
 *    store num_slots, slots_size, slots and gen in the lockspace;
 *  - a plain dlm_set_recover_status(ls, DLM_RS_NODES_ALL) also appears
 *    (presumably the path taken when slot assignment is not used -- confirm);
 *  - otherwise wait_status_low(ls, DLM_RS_NODES_ALL, DLM_RSF_NEED_SLOTS)
 *    followed by dlm_slots_copy_in().
 *
 * NOTE(review): branch conditions, error checks and the return statement
 * are not visible in this listing.
 */
174 int dlm_recover_members_wait(struct dlm_ls
*ls
)
176 struct dlm_member
*memb
;
177 struct dlm_slot
*slots
;
178 int num_slots
, slots_size
;
182 list_for_each_entry(memb
, &ls
->ls_nodes
, list
) {
184 memb
->generation
= 0;
187 if (ls
->ls_low_nodeid
== dlm_our_nodeid()) {
188 error
= wait_status_all(ls
, DLM_RS_NODES
, 1);
192 /* slots array is sparse, slots_size may be > num_slots */
194 rv
= dlm_slots_assign(ls
, &num_slots
, &slots_size
, &slots
, &gen
);
/* status bit and slot data are stored in one ls_recover_lock section */
196 spin_lock(&ls
->ls_recover_lock
);
197 _set_recover_status(ls
, DLM_RS_NODES_ALL
);
198 ls
->ls_num_slots
= num_slots
;
199 ls
->ls_slots_size
= slots_size
;
200 ls
->ls_slots
= slots
;
201 ls
->ls_generation
= gen
;
202 spin_unlock(&ls
->ls_recover_lock
);
204 dlm_set_recover_status(ls
, DLM_RS_NODES_ALL
);
207 error
= wait_status_low(ls
, DLM_RS_NODES_ALL
, DLM_RSF_NEED_SLOTS
);
211 dlm_slots_copy_in(ls
);
/* Wait for the DLM_RS_DIR status; returns the result of wait_status(). */
217 int dlm_recover_directory_wait(struct dlm_ls
*ls
)
219 return wait_status(ls
, DLM_RS_DIR
);
/* Wait for the DLM_RS_LOCKS status; returns the result of wait_status(). */
222 int dlm_recover_locks_wait(struct dlm_ls
*ls
)
224 return wait_status(ls
, DLM_RS_LOCKS
);
/* Wait for the DLM_RS_DONE status; returns the result of wait_status(). */
227 int dlm_recover_done_wait(struct dlm_ls
*ls
)
229 return wait_status(ls
, DLM_RS_DONE
);
233 * The recover_list contains all the rsb's for which we've requested the new
234 * master nodeid. As replies are returned from the resource directories the
235 * rsb's are removed from the list. When the list is empty we're done.
237 * The recover_list is later similarly used for all rsb's for which we've sent
238 * new lkb's and need to receive new corresponding lkid's.
240 * We use the address of the rsb struct as a simple local identifier for the
241 * rsb so we can match an rcom reply with the rsb it was sent for.
/*
 * Sample list_empty(&ls->ls_recover_list) under ls_recover_list_lock.
 * Used as the test function passed to dlm_wait_function() by
 * dlm_recover_masters() and dlm_recover_locks().
 * NOTE(review): declaration of `empty` and the return are not visible;
 * presumably returns `empty` -- confirm.
 */
244 static int recover_list_empty(struct dlm_ls
*ls
)
248 spin_lock(&ls
->ls_recover_list_lock
);
249 empty
= list_empty(&ls
->ls_recover_list
);
250 spin_unlock(&ls
->ls_recover_list_lock
);
/*
 * Put an rsb on its lockspace's recover list.
 * Under ls_recover_list_lock: only when r is not already linked (its
 * res_recover_list node is empty) append it to ls_recover_list and
 * increment ls_recover_list_count.
 */
255 static void recover_list_add(struct dlm_rsb
*r
)
257 struct dlm_ls
*ls
= r
->res_ls
;
259 spin_lock(&ls
->ls_recover_list_lock
);
/* guard against adding the same rsb twice */
260 if (list_empty(&r
->res_recover_list
)) {
261 list_add_tail(&r
->res_recover_list
, &ls
->ls_recover_list
);
262 ls
->ls_recover_list_count
++;
265 spin_unlock(&ls
->ls_recover_list_lock
);
/*
 * Take an rsb off its lockspace's recover list.
 * Under ls_recover_list_lock: unlink r with list_del_init() and decrement
 * ls_recover_list_count.
 */
268 static void recover_list_del(struct dlm_rsb
*r
)
270 struct dlm_ls
*ls
= r
->res_ls
;
272 spin_lock(&ls
->ls_recover_list_lock
);
273 list_del_init(&r
->res_recover_list
);
274 ls
->ls_recover_list_count
--;
275 spin_unlock(&ls
->ls_recover_list_lock
);
/*
 * Look up an rsb on ls_recover_list by identifier.
 * Walks the list under ls_recover_list_lock, comparing `id` with each
 * rsb's own address cast to unsigned long.
 * NOTE(review): what happens on a match / no match and the return
 * statement are not visible in this listing.
 */
280 static struct dlm_rsb
*recover_list_find(struct dlm_ls
*ls
, uint64_t id
)
282 struct dlm_rsb
*r
= NULL
;
284 spin_lock(&ls
->ls_recover_list_lock
);
286 list_for_each_entry(r
, &ls
->ls_recover_list
, res_recover_list
) {
/* the rsb's address doubles as its identifier */
287 if (id
== (unsigned long) r
)
292 spin_unlock(&ls
->ls_recover_list_lock
);
/*
 * Empty ls_recover_list.
 * Under ls_recover_list_lock every rsb is unlinked, its
 * res_recover_locks_count is zeroed and ls_recover_list_count is
 * decremented.  If the count is not 0 afterwards, it is logged with
 * log_error() and reset to 0.
 */
296 static void recover_list_clear(struct dlm_ls
*ls
)
298 struct dlm_rsb
*r
, *s
;
300 spin_lock(&ls
->ls_recover_list_lock
);
/* the _safe iterator is used because entries are unlinked while walking */
301 list_for_each_entry_safe(r
, s
, &ls
->ls_recover_list
, res_recover_list
) {
302 list_del_init(&r
->res_recover_list
);
303 r
->res_recover_locks_count
= 0;
305 ls
->ls_recover_list_count
--;
/* counter and list disagree: report it and reset the counter */
308 if (ls
->ls_recover_list_count
!= 0) {
309 log_error(ls
, "warning: recover_list_count %d",
310 ls
->ls_recover_list_count
);
311 ls
->ls_recover_list_count
= 0;
313 spin_unlock(&ls
->ls_recover_list_lock
);
317 /* Master recovery: find new master node for rsb's that were
318 mastered on nodes that have been removed.
322 dlm_send_rcom_lookup -> receive_rcom_lookup
324 receive_rcom_lookup_reply <-
325 dlm_recover_master_reply
332 * Set the lock master for all LKBs in a lock queue
333 * If we are the new master of the rsb, we may have received new
334 * MSTCPY locks from other nodes already which we need to ignore
335 * when setting the new nodeid.
/*
 * For every lkb on `queue` (linked through lkb_statequeue) that does not
 * have DLM_IFL_MSTCPY set in lkb_flags, set lkb_nodeid to `nodeid`.
 */
338 static void set_lock_master(struct list_head
*queue
, int nodeid
)
342 list_for_each_entry(lkb
, queue
, lkb_statequeue
)
/* lkbs flagged MSTCPY are left untouched */
343 if (!(lkb
->lkb_flags
& DLM_IFL_MSTCPY
))
344 lkb
->lkb_nodeid
= nodeid
;
/*
 * Apply set_lock_master() with r->res_nodeid to the rsb's grant, convert
 * and wait queues.
 */
347 static void set_master_lkbs(struct dlm_rsb
*r
)
349 set_lock_master(&r
->res_grantqueue
, r
->res_nodeid
);
350 set_lock_master(&r
->res_convertqueue
, r
->res_nodeid
);
351 set_lock_master(&r
->res_waitqueue
, r
->res_nodeid
);
355 * Propagate the new master nodeid to locks.
356 * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
357 * The NEW_MASTER2 flag tells recover_lvb() and set_locks_purged() which rsb's to consider.
/*
 * Record `nodeid` as the rsb's master (res_nodeid) and set both the
 * RSB_NEW_MASTER and RSB_NEW_MASTER2 flags on it.
 */
361 static void set_new_master(struct dlm_rsb
*r
, int nodeid
)
364 r
->res_nodeid
= nodeid
;
366 rsb_set_flag(r
, RSB_NEW_MASTER
);
367 rsb_set_flag(r
, RSB_NEW_MASTER2
);
372 * We do async lookups on rsb's that need new masters. The rsb's
373 * waiting for a lookup reply are kept on the recover_list.
/*
 * Find the new master for one rsb.
 *
 * dir_nodeid is obtained from dlm_dir_nodeid(r).  When we are the
 * directory node, dlm_dir_lookup() is done locally (a failure is
 * logged as "recover dir lookup error"), the returned nodeid is compared
 * with our own and set_new_master() is called with it.  The remaining
 * visible call, dlm_send_rcom_lookup(r, dir_nodeid), is presumably the
 * branch for a remote directory node -- confirm.
 *
 * NOTE(review): branch bodies, recover-list handling and the return
 * statement are not visible in this listing.
 */
376 static int recover_master(struct dlm_rsb
*r
)
378 struct dlm_ls
*ls
= r
->res_ls
;
379 int error
, dir_nodeid
, ret_nodeid
, our_nodeid
= dlm_our_nodeid();
381 dir_nodeid
= dlm_dir_nodeid(r
);
383 if (dir_nodeid
== our_nodeid
) {
384 error
= dlm_dir_lookup(ls
, our_nodeid
, r
->res_name
,
385 r
->res_length
, &ret_nodeid
);
387 log_error(ls
, "recover dir lookup error %d", error
);
389 if (ret_nodeid
== our_nodeid
)
391 set_new_master(r
, ret_nodeid
);
394 error
= dlm_send_rcom_lookup(r
, dir_nodeid
);
401 * When not using a directory, most resource names will hash to a new static
402 * master nodeid and the resource will need to be remastered.
/*
 * Remaster an rsb when no directory is in use.
 *
 * `master` comes from dlm_dir_nodeid(r) and is compared with our own
 * nodeid.  If r->res_nodeid differs from `master`, the rsb's MSTCPY locks
 * are purged with dlm_purge_mstcpy_locks() and set_new_master() is called.
 *
 * NOTE(review): the statement under the our-nodeid test and the return
 * value are not visible; dlm_recover_masters() adds the result to its
 * resource count.
 */
405 static int recover_master_static(struct dlm_rsb
*r
)
407 int master
= dlm_dir_nodeid(r
);
409 if (master
== dlm_our_nodeid())
412 if (r
->res_nodeid
!= master
) {
414 dlm_purge_mstcpy_locks(r
);
415 set_new_master(r
, master
);
422 * Go through local root resources and for each rsb which has a master which
423 * has departed, get the new master nodeid from the directory. The dir will
424 * assign mastery to the first node to look up the new master. That means
425 * we'll discover in this lookup if we're the new master of any rsb's.
427 * We fire off all the dir lookup requests individually and asynchronously to
428 * the correct dir node.
/*
 * Recover masters for the rsbs on ls_root_list.
 *
 * The list is walked with ls_root_sem held for reading.  If
 * dlm_recovery_stopped() becomes true, the semaphore is released inside the
 * loop.  With no directory, recover_master_static() is applied and its
 * result added to `count`; otherwise an rsb qualifies when we are not its
 * master and its res_nodeid was removed or RSB_NEW_MASTER is set (the
 * action taken is not visible here).
 *
 * After the walk the count is logged, dlm_wait_function() waits for
 * recover_list_empty(), and recover_list_clear() is called
 * (NOTE(review): the condition guarding that call and the return
 * statement are not visible).
 */
431 int dlm_recover_masters(struct dlm_ls
*ls
)
434 int error
= 0, count
= 0;
436 log_debug(ls
, "dlm_recover_masters");
438 down_read(&ls
->ls_root_sem
);
439 list_for_each_entry(r
, &ls
->ls_root_list
, res_root_list
) {
/* abort path: the semaphore is dropped before leaving the loop */
440 if (dlm_recovery_stopped(ls
)) {
441 up_read(&ls
->ls_root_sem
);
446 if (dlm_no_directory(ls
))
447 count
+= recover_master_static(r
);
448 else if (!is_master(r
) &&
449 (dlm_is_removed(ls
, r
->res_nodeid
) ||
450 rsb_flag(r
, RSB_NEW_MASTER
))) {
457 up_read(&ls
->ls_root_sem
);
459 log_debug(ls
, "dlm_recover_masters %d resources", count
);
461 error
= dlm_wait_function(ls
, &recover_list_empty
);
464 recover_list_clear(ls
);
/*
 * Handle a reply to a master lookup.
 *
 * The rsb is located with recover_list_find() using rc->rc_id; the
 * "no id" error is presumably logged when nothing is found (the test is
 * not visible -- confirm).  The new master nodeid is taken from
 * rc->rc_result, compared with our own nodeid, and passed to
 * set_new_master().  If the recover list is empty at the end,
 * ls_wait_general is woken.
 *
 * NOTE(review): removal of the rsb from the recover list and the return
 * statement are not visible in this listing.
 */
468 int dlm_recover_master_reply(struct dlm_ls
*ls
, struct dlm_rcom
*rc
)
473 r
= recover_list_find(ls
, rc
->rc_id
);
475 log_error(ls
, "dlm_recover_master_reply no id %llx",
476 (unsigned long long)rc
->rc_id
);
480 nodeid
= rc
->rc_result
;
481 if (nodeid
== dlm_our_nodeid())
484 set_new_master(r
, nodeid
);
/* an empty list means the sleeper in dlm_wait_function() can proceed */
487 if (recover_list_empty(ls
))
488 wake_up(&ls
->ls_wait_general
);
494 /* Lock recovery: rebuild the process-copy locks we hold on a
495 remastered rsb on the new rsb master.
500 dlm_send_rcom_lock -> receive_rcom_lock
501 dlm_recover_master_copy
502 receive_rcom_lock_reply <-
503 dlm_recover_process_copy
508 * keep a count of the number of lkb's we send to the new master; when we get
509 * an equal number of replies then recovery for the rsb is done
/*
 * Send every lkb on `head` with dlm_send_rcom_lock() and count each one in
 * r->res_recover_locks_count.
 * NOTE(review): the error check between the send and the increment, and
 * the return statement, are not visible in this listing.
 */
512 static int recover_locks_queue(struct dlm_rsb
*r
, struct list_head
*head
)
517 list_for_each_entry(lkb
, head
, lkb_statequeue
) {
518 error
= dlm_send_rcom_lock(r
, lkb
);
521 r
->res_recover_locks_count
++;
/*
 * Send all locks held on one rsb.
 *
 * Asserts that res_recover_locks_count is 0 on entry, then runs
 * recover_locks_queue() over the grant, convert and wait queues in that
 * order.  Afterwards the count is tested and RSB_NEW_MASTER is cleared
 * (presumably only when nothing was sent -- the branch structure is not
 * visible; confirm).
 *
 * NOTE(review): error checks between the three calls and the return
 * statement are not visible in this listing.
 */
527 static int recover_locks(struct dlm_rsb
*r
)
533 DLM_ASSERT(!r
->res_recover_locks_count
, dlm_dump_rsb(r
););
535 error
= recover_locks_queue(r
, &r
->res_grantqueue
);
538 error
= recover_locks_queue(r
, &r
->res_convertqueue
);
541 error
= recover_locks_queue(r
, &r
->res_waitqueue
);
545 if (r
->res_recover_locks_count
)
548 rsb_clear_flag(r
, RSB_NEW_MASTER
);
/*
 * Run lock recovery over the rsbs on ls_root_list.
 *
 * The list is walked with ls_root_sem held for reading.  Visible per-rsb
 * steps: an RSB_NEW_MASTER clear (its condition is not visible), a skip
 * test for rsbs without RSB_NEW_MASTER, an abort check via
 * dlm_recovery_stopped(), then recover_locks(r) with
 * r->res_recover_locks_count added to `count`.  The semaphore is released
 * on each visible exit path.
 *
 * After the walk the lock count is logged, dlm_wait_function() waits for
 * recover_list_empty(), and recover_list_clear() is called
 * (NOTE(review): the condition guarding that call and the return
 * statement are not visible).
 */
554 int dlm_recover_locks(struct dlm_ls
*ls
)
557 int error
, count
= 0;
559 log_debug(ls
, "dlm_recover_locks");
561 down_read(&ls
->ls_root_sem
);
562 list_for_each_entry(r
, &ls
->ls_root_list
, res_root_list
) {
564 rsb_clear_flag(r
, RSB_NEW_MASTER
);
568 if (!rsb_flag(r
, RSB_NEW_MASTER
))
571 if (dlm_recovery_stopped(ls
)) {
573 up_read(&ls
->ls_root_sem
);
577 error
= recover_locks(r
);
579 up_read(&ls
->ls_root_sem
);
583 count
+= r
->res_recover_locks_count
;
585 up_read(&ls
->ls_root_sem
);
587 log_debug(ls
, "dlm_recover_locks %d locks", count
);
589 error
= dlm_wait_function(ls
, &recover_list_empty
);
592 recover_list_clear(ls
);
/*
 * Account for one recovered lock on an rsb.
 *
 * Asserts RSB_NEW_MASTER is set, decrements res_recover_locks_count, and
 * when it reaches 0 clears RSB_NEW_MASTER.  If the lockspace's recover list
 * is empty, ls_wait_general is woken.
 * NOTE(review): other statements inside the count-reached-zero branch are
 * not visible in this listing.
 */
596 void dlm_recovered_lock(struct dlm_rsb
*r
)
598 DLM_ASSERT(rsb_flag(r
, RSB_NEW_MASTER
), dlm_dump_rsb(r
););
600 r
->res_recover_locks_count
--;
601 if (!r
->res_recover_locks_count
) {
602 rsb_clear_flag(r
, RSB_NEW_MASTER
);
606 if (recover_list_empty(r
->res_ls
))
607 wake_up(&r
->res_ls
->ls_wait_general
);
611 * The lvb needs to be recovered on all master rsb's. This includes setting
612 * the VALNOTVALID flag if necessary, and determining the correct lvb contents
613 * based on the lvb's of the locks held on the rsb.
615 * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it
616 * was already set prior to recovery, it's not cleared, regardless of locks.
618 * The LVB contents are only considered for changing when this is a new master
619 * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
620 * mode > CR. If no lkb's exist with mode above CR, the lvb contents are taken
621 * from the lkb with the largest lvb sequence number.
/*
 * Recover the lock value block of one rsb.
 *
 * The grant queue and then the convert queue are scanned.  Locks without
 * DLM_LKF_VALBLK in lkb_exflags are tested first (presumably skipped);
 * a lock with lkb_grmode > DLM_LOCK_CR is the "big lock" case; otherwise
 * the highest lkb_lvbseq seen so far is tracked in high_seq using a signed
 * difference.
 *
 * After the scan: RSB_VALNOTVALID is set when no big lock exists; nothing
 * further is intended unless RSB_NEW_MASTER2 is set; res_lvbptr is
 * allocated with dlm_allocate_lvb() if missing; the rsb's lvb and
 * res_lvbseq are then copied from `lkb` (big lock case) or from high_lkb,
 * or the lvb is zeroed with memset().
 *
 * NOTE(review): the jumps out of the scan loops, the assignments to
 * high_lkb / lock_lvb_exists / big_lock_exists, and the allocation
 * failure check are not visible in this listing.  The big-lock copy reads
 * `lkb` after the loops, so it relies on the loop having been left early
 * at that lock -- confirm against the full source.
 */
624 static void recover_lvb(struct dlm_rsb
*r
)
626 struct dlm_lkb
*lkb
, *high_lkb
= NULL
;
627 uint32_t high_seq
= 0;
628 int lock_lvb_exists
= 0;
629 int big_lock_exists
= 0;
630 int lvblen
= r
->res_ls
->ls_lvblen
;
632 list_for_each_entry(lkb
, &r
->res_grantqueue
, lkb_statequeue
) {
633 if (!(lkb
->lkb_exflags
& DLM_LKF_VALBLK
))
638 if (lkb
->lkb_grmode
> DLM_LOCK_CR
) {
/* signed difference: the comparison tolerates sequence wrap-around */
643 if (((int)lkb
->lkb_lvbseq
- (int)high_seq
) >= 0) {
645 high_seq
= lkb
->lkb_lvbseq
;
649 list_for_each_entry(lkb
, &r
->res_convertqueue
, lkb_statequeue
) {
650 if (!(lkb
->lkb_exflags
& DLM_LKF_VALBLK
))
655 if (lkb
->lkb_grmode
> DLM_LOCK_CR
) {
660 if (((int)lkb
->lkb_lvbseq
- (int)high_seq
) >= 0) {
662 high_seq
= lkb
->lkb_lvbseq
;
667 if (!lock_lvb_exists
)
670 if (!big_lock_exists
)
671 rsb_set_flag(r
, RSB_VALNOTVALID
);
673 /* don't mess with the lvb unless we're the new master */
674 if (!rsb_flag(r
, RSB_NEW_MASTER2
))
677 if (!r
->res_lvbptr
) {
678 r
->res_lvbptr
= dlm_allocate_lvb(r
->res_ls
);
683 if (big_lock_exists
) {
684 r
->res_lvbseq
= lkb
->lkb_lvbseq
;
685 memcpy(r
->res_lvbptr
, lkb
->lkb_lvbptr
, lvblen
);
686 } else if (high_lkb
) {
687 r
->res_lvbseq
= high_lkb
->lkb_lvbseq
;
688 memcpy(r
->res_lvbptr
, high_lkb
->lkb_lvbptr
, lvblen
);
691 memset(r
->res_lvbptr
, 0, lvblen
);
697 /* All master rsb's flagged RECOVER_CONVERT need to be looked at. The locks
698 converting PR->CW or CW->PR need to have their lkb_grmode set. */
/*
 * Fix up lkb_grmode for converting locks on one rsb.
 *
 * The grant queue is scanned for a lock granted in DLM_LOCK_PR or
 * DLM_LOCK_CW and its mode is remembered in `grmode`.  The convert queue
 * is then walked; locks whose lkb_grmode is not DLM_LOCK_IV are tested
 * first (presumably skipped), and the rest get lkb_grmode set either to
 * their own lkb_rqmode or to the remembered `grmode`.
 *
 * NOTE(review): the initial value of `grmode` and the condition choosing
 * between the two assignments are not visible in this listing.
 */
700 static void recover_conversion(struct dlm_rsb
*r
)
705 list_for_each_entry(lkb
, &r
->res_grantqueue
, lkb_statequeue
) {
706 if (lkb
->lkb_grmode
== DLM_LOCK_PR
||
707 lkb
->lkb_grmode
== DLM_LOCK_CW
) {
708 grmode
= lkb
->lkb_grmode
;
713 list_for_each_entry(lkb
, &r
->res_convertqueue
, lkb_statequeue
) {
714 if (lkb
->lkb_grmode
!= DLM_LOCK_IV
)
717 lkb
->lkb_grmode
= lkb
->lkb_rqmode
;
719 lkb
->lkb_grmode
= grmode
;
723 /* We've become the new master for this rsb and waiting/converting locks may
724 need to be granted in dlm_grant_after_purge() due to locks that may have
725 existed from a removed node. */
/*
 * Set RSB_LOCKS_PURGED on the rsb when its wait queue or its convert queue
 * is non-empty.
 */
727 static void set_locks_purged(struct dlm_rsb
*r
)
729 if (!list_empty(&r
->res_waitqueue
) || !list_empty(&r
->res_convertqueue
))
730 rsb_set_flag(r
, RSB_LOCKS_PURGED
);
/*
 * Final per-rsb recovery pass over ls_root_list, done with ls_root_sem
 * held for reading.
 *
 * Visible per-rsb steps: recover_conversion() when RSB_RECOVER_CONVERT is
 * set, a test of RSB_NEW_MASTER2 (its body is not visible), then both
 * RSB_RECOVER_CONVERT and RSB_NEW_MASTER2 are cleared.  The number of rsbs
 * handled is logged at the end.
 *
 * NOTE(review): the filter deciding which rsbs are processed and the
 * increment of `count` are not visible in this listing.
 */
733 void dlm_recover_rsbs(struct dlm_ls
*ls
)
738 log_debug(ls
, "dlm_recover_rsbs");
740 down_read(&ls
->ls_root_sem
);
741 list_for_each_entry(r
, &ls
->ls_root_list
, res_root_list
) {
744 if (rsb_flag(r
, RSB_RECOVER_CONVERT
))
745 recover_conversion(r
);
746 if (rsb_flag(r
, RSB_NEW_MASTER2
))
751 rsb_clear_flag(r
, RSB_RECOVER_CONVERT
);
752 rsb_clear_flag(r
, RSB_NEW_MASTER2
);
755 up_read(&ls
->ls_root_sem
);
757 log_debug(ls
, "dlm_recover_rsbs %d rsbs", count
);
760 /* Create a single list of all root rsb's to be used during recovery */
/*
 * Build ls_root_list from the rsb hash table, with ls_root_sem held for
 * writing.
 *
 * A non-empty root list on entry is reported with log_error().  For each
 * bucket of ls_rsbtbl, under the bucket's spinlock, every rsb in the
 * `keep` tree is added to ls_root_list.  When no directory is in use the
 * bucket is unlocked at that point; otherwise the rsbs in the `toss` tree
 * are added as well before the bucket is unlocked.
 *
 * NOTE(review): the error path after the "not empty" message, any
 * reference taken on the added rsbs, and the return statement are not
 * visible in this listing.
 */
762 int dlm_create_root_list(struct dlm_ls
*ls
)
768 down_write(&ls
->ls_root_sem
);
769 if (!list_empty(&ls
->ls_root_list
)) {
770 log_error(ls
, "root list not empty");
775 for (i
= 0; i
< ls
->ls_rsbtbl_size
; i
++) {
776 spin_lock(&ls
->ls_rsbtbl
[i
].lock
);
777 for (n
= rb_first(&ls
->ls_rsbtbl
[i
].keep
); n
; n
= rb_next(n
)) {
778 r
= rb_entry(n
, struct dlm_rsb
, res_hashnode
);
779 list_add(&r
->res_root_list
, &ls
->ls_root_list
);
783 /* If we're using a directory, add tossed rsbs to the root
784 list; they'll have entries created in the new directory,
785 but no other recovery steps should do anything with them. */
787 if (dlm_no_directory(ls
)) {
788 spin_unlock(&ls
->ls_rsbtbl
[i
].lock
);
792 for (n
= rb_first(&ls
->ls_rsbtbl
[i
].toss
); n
; n
= rb_next(n
)) {
793 r
= rb_entry(n
, struct dlm_rsb
, res_hashnode
);
794 list_add(&r
->res_root_list
, &ls
->ls_root_list
);
797 spin_unlock(&ls
->ls_rsbtbl
[i
].lock
);
800 up_write(&ls
->ls_root_sem
);
/*
 * Empty ls_root_list: with ls_root_sem held for writing, unlink every rsb
 * from the list with list_del_init().
 * NOTE(review): any further per-rsb statement inside the loop is not
 * visible in this listing.
 */
804 void dlm_release_root_list(struct dlm_ls
*ls
)
806 struct dlm_rsb
*r
, *safe
;
808 down_write(&ls
->ls_root_sem
);
/* the _safe iterator is used because entries are unlinked while walking */
809 list_for_each_entry_safe(r
, safe
, &ls
->ls_root_list
, res_root_list
) {
810 list_del_init(&r
->res_root_list
);
813 up_write(&ls
->ls_root_sem
);
816 /* If not using a directory, clear the entire toss list, there's no benefit to
817 caching the master value since it's fixed. If we are using a dir, keep the
818 rsb's we're the master of. Recovery will add them to the root list and from
819 there they'll be entered in the rebuilt directory. */
/*
 * Prune the `toss` tree of every ls_rsbtbl bucket.
 *
 * Under each bucket's spinlock the toss tree is walked; an rsb is erased
 * from the tree when no directory is in use or when we are not its master.
 *
 * NOTE(review): the assignment of `next` (the loop advances with
 * n = next) and whatever is done with an erased rsb are not visible in
 * this listing; the function may also continue past the last visible line.
 */
821 void dlm_clear_toss_list(struct dlm_ls
*ls
)
823 struct rb_node
*n
, *next
;
827 for (i
= 0; i
< ls
->ls_rsbtbl_size
; i
++) {
828 spin_lock(&ls
->ls_rsbtbl
[i
].lock
);
829 for (n
= rb_first(&ls
->ls_rsbtbl
[i
].toss
); n
; n
= next
) {
831 rsb
= rb_entry(n
, struct dlm_rsb
, res_hashnode
);
832 if (dlm_no_directory(ls
) || !is_master(rsb
)) {
833 rb_erase(n
, &ls
->ls_rsbtbl
[i
].toss
);
837 spin_unlock(&ls
->ls_rsbtbl
[i
].lock
);