4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
26 #include <sys/param.h>
27 #include <sys/systm.h>
32 #include <sys/t_lock.h>
38 #include <sys/cmn_err.h>
39 #include <sys/sysmacros.h>
40 #include <sys/types.h>
41 #include <sys/mkdev.h>
45 #include <sys/lvm/md_mirror.h>
46 #include <sys/modctl.h>
48 #include <sys/sunddi.h>
49 #include <sys/debug.h>
50 #include <sys/callb.h>
52 #include <sys/sysevent/eventdefs.h>
53 #include <sys/sysevent/svm.h>
54 #include <sys/lvm/mdmn_commd.h>
57 extern kmutex_t md_status_mx
;
58 extern kmutex_t md_mx
;
60 extern unit_t md_nunits
;
61 extern set_t md_nsets
;
62 extern md_set_t md_set
[];
63 extern major_t md_major
;
65 extern md_ops_t mirror_md_ops
;
66 extern kmem_cache_t
*mirror_child_cache
; /* mirror child memory pool */
67 extern mdq_anchor_t md_mto_daemon
;
68 extern daemon_request_t mirror_timeout
;
69 extern md_resync_t md_cpr_resync
;
71 extern int md_mtioctl_cnt
;
73 extern kmem_cache_t
*mirror_parent_cache
;
75 extern int mirror_debug_flag
;
/*
 * NOTE(review): this span defines the module's resync tunables. The text is a
 * line-mangled extraction (original source line numbers are fused into the
 * content and comment delimiters are missing); code kept byte-identical,
 * only this note added.
 */
79 * Tunable resync thread timeout. This is used as the time interval for updating
80 * the resync progress to the mddb. This allows restartable resyncs to be
81 * continued across a system reboot.
82 * Default is to update the resync progress every 5 minutes.
84 int md_mirror_resync_update_intvl
= MD_DEF_MIRROR_RESYNC_INTVL
;
87 * Settable mirror resync buffer size. Specified in 512 byte
88 * blocks. This is set to MD_DEF_RESYNC_BUF_SIZE by default.
90 int md_resync_bufsz
= MD_DEF_RESYNC_BUF_SIZE
;
93 * Tunables for dirty region processing when
94 * closing down a mirror.
96 * Dirty region processing during close of a
97 * mirror is basically monitoring the state
98 * of the resync region bitmaps and the number
99 * of outstanding i/o's per submirror to
100 * determine that there are no more dirty
103 * The approach taken is a retry logic over
104 * md_mirror_rr_cleans iterations to monitor
107 * There are two methods of polling the progress
108 * on dirty bitmap processing: busy-waits and
111 * Busy-waits are used at the beginning to
112 * determine the final state as quick as
113 * possible; md_mirror_rr_polls defines the
114 * number of busy-waits.
116 * In case the number of busy-waits got exhausted
117 * with dirty regions left over, the retry logic
118 * switches over to non-busy-waits, thus giving
119 * relief to an obviously heavily loaded system.
120 * The timeout value is defined by the tunable
121 * md_mirror_rr_sleep_timo in seconds.
123 * The number of non-busy-waits is given by:
124 * md_mirror_rr_cleans - md_mirror_rr_polls.
126 * The values were found by testing on a
127 * 'typical' system and may require tuning
128 * to meet specific customer's requirements.
131 int md_mirror_rr_cleans
= 13;
132 int md_mirror_rr_polls
= 3;
133 int md_mirror_rr_sleep_timo
= 1;
136 * The value is not #defined because it will be computed
139 int md_max_xfer_bufsz
= 2048;
/*
 * NOTE(review): mirror_generate_rr_bitmap() — builds a compressed
 * "to-be-cleared" region bitmap (md_mn_msg_rr_clean_t) from the mirror's
 * dirty-region state. Visible behavior: scans from un_rr_clean_start_bit,
 * allocates *msgp with kmem_zalloc(), fills nodeid/mnum/start/size, sets
 * bits via setbit() for regions that can go clean, accumulates active i/o
 * counts into *activep, frees *msgp when nothing was cleared, advances
 * un_rr_clean_start_bit and returns cleared_dirty. This text is a
 * line-mangled extraction with interior lines missing (e.g. the
 * retry_dirty_scan label target and several brace/argument lines) —
 * do not treat it as compilable; kept byte-identical, comments only added.
 */
142 * mirror_generate_rr_bitmap:
143 * -------------------
144 * Generate a compressed bitmap md_mn_msg_rr_clean_t for the given clean
145 * bitmap associated with mirror 'un'
148 * un - mirror unit to get bitmap data from
149 * *msgp - location to return newly allocated md_mn_msg_rr_clean_t
150 * *activep- location to return # of active i/os
153 * 1 => dirty bits cleared from un_dirty_bm and DRL flush required
154 * *msgp contains bitmap of to-be-cleared bits
155 * 0 => no bits cleared
159 mirror_generate_rr_bitmap(mm_unit_t
*un
, md_mn_msg_rr_clean_t
**msgp
,
162 unsigned int i
, next_bit
, data_bytes
, start_bit
;
163 int cleared_dirty
= 0;
165 /* Skip any initial 0s. */
167 if ((start_bit
= un
->un_rr_clean_start_bit
) >= un
->un_rrd_num
)
168 un
->un_rr_clean_start_bit
= start_bit
= 0;
171 * Handle case where NO bits are set in PERNODE_DIRTY but the
172 * un_dirty_bm[] map does have entries set (after a 1st resync)
174 for (; start_bit
< un
->un_rrd_num
&&
175 !IS_PERNODE_DIRTY(md_mn_mynode_id
, start_bit
, un
) &&
176 (un
->un_pernode_dirty_sum
[start_bit
] != (uchar_t
)0); start_bit
++)
179 if (start_bit
>= un
->un_rrd_num
) {
180 if (un
->un_rr_clean_start_bit
== 0) {
183 un
->un_rr_clean_start_bit
= 0;
184 goto retry_dirty_scan
;
/* NOTE(review): the retry_dirty_scan label itself is not visible in this
 * extraction — presumably it precedes the scan above; confirm in original. */
188 /* how much to fit into this message */
189 data_bytes
= MIN(howmany(un
->un_rrd_num
- start_bit
, NBBY
),
190 MDMN_MSG_RR_CLEAN_DATA_MAX_BYTES
);
192 (*msgp
) = kmem_zalloc(MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes
),
195 (*msgp
)->rr_nodeid
= md_mn_mynode_id
;
196 (*msgp
)->rr_mnum
= MD_SID(un
);
197 MDMN_MSG_RR_CLEAN_START_SIZE_SET(*msgp
, start_bit
, data_bytes
);
199 next_bit
= MIN(start_bit
+ data_bytes
* NBBY
, un
->un_rrd_num
);
201 for (i
= start_bit
; i
< next_bit
; i
++) {
202 if (un
->c
.un_status
& MD_UN_KEEP_DIRTY
&& IS_KEEPDIRTY(i
, un
)) {
205 if (!IS_REGION_DIRTY(i
, un
)) {
208 if (un
->un_outstanding_writes
[i
] != 0) {
214 * Handle the case where a resync has completed and we still
215 * have the un_dirty_bm[] entries marked as dirty (these are
216 * the most recent DRL re-read from the replica). They need
217 * to be cleared from our un_dirty_bm[] but they will not have
218 * corresponding un_pernode_dirty[] entries set unless (and
219 * until) further write()s have been issued to the area.
220 * This handles the case where only the un_dirty_bm[] entry is
221 * set. Without this we'd not clear this region until a local
222 * write is issued to the affected area.
224 if (IS_PERNODE_DIRTY(md_mn_mynode_id
, i
, un
) ||
225 (un
->un_pernode_dirty_sum
[i
] == (uchar_t
)0)) {
226 if (!IS_GOING_CLEAN(i
, un
)) {
227 SET_GOING_CLEAN(i
, un
);
232 * Now we've got a flagged pernode_dirty, _or_ a clean
233 * bitmap entry to process. Update the bitmap to flush
234 * the REGION_DIRTY / GOING_CLEAN bits when we send the
235 * cross-cluster message.
238 setbit(MDMN_MSG_RR_CLEAN_DATA(*msgp
), i
- start_bit
);
241 * Not marked as active in the pernode bitmap, so skip
242 * any update to this. We just increment the 0 count
243 * and adjust the active count by any outstanding
244 * un_pernode_dirty_sum[] entries. This means we don't
245 * leave the mirror permanently dirty.
247 (*activep
) += (int)un
->un_pernode_dirty_sum
[i
];
250 if (!cleared_dirty
) {
251 kmem_free(*msgp
, MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes
));
254 un
->un_rr_clean_start_bit
= next_bit
;
255 return (cleared_dirty
);
/*
 * NOTE(review): process_resync_regions_non_owner() — non-owner node path of
 * resync-region cleanup. Visible behavior: serializes on un_rrp_inflight_mx,
 * builds a clean bitmap via mirror_generate_rr_bitmap(), sends an
 * MD_MN_MSG_RR_CLEAN message (directed to un_mirror_owner) between
 * CALLB_CPR_SAFE_BEGIN/END, and on successful send clears the corresponding
 * pernode-dirty / region-dirty bits before freeing kres and rmsg.
 * Line-mangled extraction with interior lines (returns, closing braces,
 * some rw_enter arguments) missing; kept byte-identical, comments only.
 */
259 * There are three paths into here:
261 * md_daemon -> check_resync_regions -> prr
262 * mirror_internal_close -> mirror_process_unit_resync -> prr
263 * mirror_set_capability -> mirror_process_unit_resync -> prr
265 * The first one is a kernel daemon, the other two result from system calls.
266 * Thus, only the first case needs to deal with kernel CPR activity. This
267 * is indicated by the cprinfop being non-NULL for kernel daemon calls, and
268 * NULL for system call paths.
271 process_resync_regions_non_owner(mm_unit_t
*un
, callb_cpr_t
*cprinfop
)
274 int cleared_dirty
= 0;
275 /* Number of reasons why we can not proceed shutting down the mirror. */
277 set_t setno
= MD_UN2SET(un
);
278 md_mn_msg_rr_clean_t
*rmsg
;
279 md_mn_kresult_t
*kres
;
281 minor_t mnum
= MD_SID(un
);
282 mdi_unit_t
*ui
= MDI_UNIT(mnum
);
283 md_mn_nodeid_t owner_node
;
286 * We drop the readerlock here to assist lock ordering with
287 * update_resync. Once we have the un_rrp_inflight_mx, we
290 md_unit_readerexit(ui
);
293 * Resync region processing must be single threaded. We can't use
294 * un_resync_mx for this purpose since this mutex gets released
295 * when blocking on un_resync_cv.
297 mutex_enter(&un
->un_rrp_inflight_mx
);
299 (void) md_unit_readerlock(ui
);
301 mutex_enter(&un
->un_resync_mx
);
303 rw_enter(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1], RW_READER
);
304 cleared_dirty
= mirror_generate_rr_bitmap(un
, &rmsg
, &active
);
305 rw_exit(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1]);
308 owner_node
= un
->un_mirror_owner
;
309 mutex_exit(&un
->un_resync_mx
);
312 * Transmit the 'to-be-cleared' bitmap to all cluster nodes.
313 * Receipt of the message will cause the mirror owner to
314 * update the on-disk DRL.
317 kres
= kmem_alloc(sizeof (md_mn_kresult_t
), KM_SLEEP
);
319 /* release readerlock before sending message */
320 md_unit_readerexit(ui
);
323 mutex_enter(&un
->un_prr_cpr_mx
);
324 CALLB_CPR_SAFE_BEGIN(cprinfop
);
327 rval
= mdmn_ksend_message(setno
, MD_MN_MSG_RR_CLEAN
,
328 MD_MSGF_NO_LOG
|MD_MSGF_BLK_SIGNAL
|MD_MSGF_KSEND_NORETRY
|
329 MD_MSGF_DIRECTED
, un
->un_mirror_owner
,
330 (char *)rmsg
, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg
), kres
);
333 CALLB_CPR_SAFE_END(cprinfop
, &un
->un_prr_cpr_mx
);
334 mutex_exit(&un
->un_prr_cpr_mx
);
337 /* reacquire readerlock after message */
338 (void) md_unit_readerlock(ui
);
340 if ((!MDMN_KSEND_MSG_OK(rval
, kres
)) &&
341 (kres
->kmmr_comm_state
!= MDMNE_NOT_JOINED
)) {
342 /* if commd is gone, no point in printing a message */
343 if (md_mn_is_commd_present())
344 mdmn_ksend_show_error(rval
, kres
, "RR_CLEAN");
345 kmem_free(kres
, sizeof (md_mn_kresult_t
));
346 kmem_free(rmsg
, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg
));
347 mutex_exit(&un
->un_rrp_inflight_mx
);
/* NOTE(review): the return statement that presumably follows this early-exit
 * path is missing from this extraction — confirm against the original. */
350 kmem_free(kres
, sizeof (md_mn_kresult_t
));
353 * If ownership changed while we were sending, we probably
354 * sent the message to the wrong node. Leave fixing that for
357 if (un
->un_mirror_owner
!= owner_node
) {
358 mutex_exit(&un
->un_rrp_inflight_mx
);
363 * Now that we've sent the message, clear them from the
364 * pernode_dirty arrays. These are ONLY cleared on a
365 * successful send, and failure has no impact.
368 start
= MDMN_MSG_RR_CLEAN_START_BIT(rmsg
);
369 end
= start
+ MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg
) * NBBY
;
370 mutex_enter(&un
->un_resync_mx
);
371 rw_enter(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1],
373 for (i
= start
; i
< end
; i
++) {
374 if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg
),
376 if (IS_PERNODE_DIRTY(md_mn_mynode_id
, i
, un
)) {
377 un
->un_pernode_dirty_sum
[i
]--;
378 CLR_PERNODE_DIRTY(md_mn_mynode_id
, i
,
381 if (IS_REGION_DIRTY(i
, un
)) {
383 CLR_REGION_DIRTY(i
, un
);
384 CLR_GOING_CLEAN(i
, un
);
388 rw_exit(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1]);
390 kmem_free(rmsg
, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg
));
392 mutex_exit(&un
->un_resync_mx
);
394 mutex_exit(&un
->un_rrp_inflight_mx
);
/*
 * NOTE(review): process_resync_regions_owner() — owner-node path of
 * resync-region cleanup. Visible behavior: serializes on un_rrp_inflight_mx,
 * waits out MM_RF_STALL_CLEAN on un_resync_cv, generates the clean bitmap,
 * clears pernode/region dirty bits, scans all regions (honoring
 * MD_UN_KEEP_DIRTY / outstanding writes), then commits the DRL record via
 * mddb_commitrec_wrapper() with MM_RF_GATECLOSED set, and wakes waiters.
 * Line-mangled extraction with interior lines missing (returns, braces,
 * rw_enter lock-mode argument); kept byte-identical, comments only.
 */
400 process_resync_regions_owner(mm_unit_t
*un
)
403 int cleared_dirty
= 0;
404 /* Number of reasons why we can not proceed shutting down the mirror. */
406 set_t setno
= MD_UN2SET(un
);
407 int mnset
= MD_MNSET_SETNO(setno
);
408 md_mn_msg_rr_clean_t
*rmsg
;
409 minor_t mnum
= MD_SID(un
);
410 mdi_unit_t
*ui
= MDI_UNIT(mnum
);
413 * We drop the readerlock here to assist lock ordering with
414 * update_resync. Once we have the un_rrp_inflight_mx, we
417 md_unit_readerexit(ui
);
420 * Resync region processing must be single threaded. We can't use
421 * un_resync_mx for this purpose since this mutex gets released
422 * when blocking on un_resync_cv.
424 mutex_enter(&un
->un_rrp_inflight_mx
);
426 (void) md_unit_readerlock(ui
);
428 mutex_enter(&un
->un_resync_mx
);
429 un
->un_waiting_to_clear
++;
430 while (un
->un_resync_flg
& MM_RF_STALL_CLEAN
)
431 cv_wait(&un
->un_resync_cv
, &un
->un_resync_mx
);
432 un
->un_waiting_to_clear
--;
435 rw_enter(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1],
437 cleared_dirty
= mirror_generate_rr_bitmap(un
, &rmsg
, &active
);
441 * Clear the bits from the pernode_dirty arrays.
442 * If that results in any being cleared from the
443 * un_dirty_bm, commit it.
446 start
= MDMN_MSG_RR_CLEAN_START_BIT(rmsg
);
447 end
= start
+ MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg
) * NBBY
;
448 for (i
= start
; i
< end
; i
++) {
449 if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg
),
451 if (IS_PERNODE_DIRTY(md_mn_mynode_id
, i
,
453 un
->un_pernode_dirty_sum
[i
]--;
455 md_mn_mynode_id
, i
, un
);
457 if (un
->un_pernode_dirty_sum
[i
] == 0) {
459 CLR_REGION_DIRTY(i
, un
);
460 CLR_GOING_CLEAN(i
, un
);
464 kmem_free(rmsg
, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg
));
466 rw_exit(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1]);
/* NOTE(review): second pass over all regions — transitions regions through
 * GOING_CLEAN before finally clearing REGION_DIRTY on a later pass. */
468 for (i
= 0; i
< un
->un_rrd_num
; i
++) {
469 if (un
->c
.un_status
& MD_UN_KEEP_DIRTY
)
470 if (IS_KEEPDIRTY(i
, un
))
473 if (!IS_REGION_DIRTY(i
, un
))
475 if (un
->un_outstanding_writes
[i
] != 0) {
480 if (!IS_GOING_CLEAN(i
, un
)) {
481 SET_GOING_CLEAN(i
, un
);
485 CLR_REGION_DIRTY(i
, un
);
486 CLR_GOING_CLEAN(i
, un
);
492 un
->un_resync_flg
|= MM_RF_GATECLOSED
;
493 mutex_exit(&un
->un_resync_mx
);
494 mddb_commitrec_wrapper(un
->un_rr_dirty_recid
);
495 mutex_enter(&un
->un_resync_mx
);
496 un
->un_resync_flg
&= ~MM_RF_GATECLOSED
;
498 if (un
->un_waiting_to_mark
!= 0 ||
499 un
->un_waiting_to_clear
!= 0) {
501 cv_broadcast(&un
->un_resync_cv
);
504 mutex_exit(&un
->un_resync_mx
);
506 mutex_exit(&un
->un_rrp_inflight_mx
);
/*
 * NOTE(review): process_resync_regions() — dispatcher. For a multi-node set
 * with no owner (or no commd while not owner) it bails out; for a multi-node
 * non-owner it delegates to process_resync_regions_non_owner(); otherwise to
 * process_resync_regions_owner(). Line-mangled extraction (the early-return
 * body of the first branch is missing); kept byte-identical, comments only.
 */
512 process_resync_regions(mm_unit_t
*un
, callb_cpr_t
*cprinfop
)
514 int mnset
= MD_MNSET_SETNO(MD_UN2SET(un
));
516 * For a mirror we can only update the on-disk resync-record if we
517 * currently own the mirror. If we are called and there is no owner we
518 * bail out before scanning the outstanding_writes[] array.
519 * NOTE: we only need to check here (before scanning the array) as we
520 * are called with the readerlock held. This means that a change
521 * of ownership away from us will block until this resync check
524 if (mnset
&& (MD_MN_NO_MIRROR_OWNER(un
) ||
525 (!MD_MN_MIRROR_OWNER(un
) && !md_mn_is_commd_present_lite()))) {
527 } else if (mnset
&& !MD_MN_MIRROR_OWNER(un
)) {
528 return (process_resync_regions_non_owner(un
, cprinfop
));
530 return (process_resync_regions_owner(un
));
/*
 * NOTE(review): mirror_process_unit_resync() — retry loop around
 * process_resync_regions(NULL) bounded by md_mirror_rr_cleans; after
 * md_mirror_rr_polls busy iterations it switches to sleeping
 * md_mirror_rr_sleep_timo * md_hz ticks per iteration via delay().
 * Line-mangled extraction (the cmn_err call site around the "Could not
 * clean" string and the cleans counter increment are partially missing);
 * kept byte-identical, comments only.
 */
535 * Function that is callable from other modules to provide
536 * ability to cleanup dirty region bitmap on demand. Used
537 * on last close of a unit to avoid massive device resyncs
538 * when coming back after rolling large amounts of data to
539 * a mirror (e.g. at umount with logging).
543 mirror_process_unit_resync(mm_unit_t
*un
)
547 while (process_resync_regions(un
, NULL
)) {
550 if (cleans
>= md_mirror_rr_cleans
) {
552 "Could not clean resync regions\n");
555 if (cleans
> md_mirror_rr_polls
) {
557 * We did not make it with md_mirror_rr_polls
558 * iterations. Give the system relief and
559 * switch over to non-busy-wait.
561 delay(md_mirror_rr_sleep_timo
* md_hz
);
/*
 * NOTE(review): check_resync_regions() — daemon callback. Walks
 * mirror_md_ops.md_head under md_link_rw as reader, skips stale sets,
 * registers a CPR callback per unit (CALLB_CPR_INIT/EXIT around
 * un_prr_cpr_mx), skips ABR / offline-submirror / single-submirror units,
 * calls process_resync_regions(), then clears timeout->dr_pending under
 * mirror_timeout.dr_mx. Line-mangled extraction with continue/brace lines
 * missing; kept byte-identical, comments only.
 */
567 check_resync_regions(daemon_request_t
*timeout
)
574 rw_enter(&mirror_md_ops
.md_link_rw
.lock
, RW_READER
);
575 for (next
= mirror_md_ops
.md_head
; next
!= NULL
; next
= next
->ln_next
) {
577 if (md_get_setstatus(next
->ln_setno
) & MD_SET_STALE
)
580 un
= MD_UNIT(next
->ln_id
);
583 * Register this resync thread with the CPR mechanism. This
584 * allows us to detect when the system is suspended and so
585 * keep track of the RPC failure condition.
587 CALLB_CPR_INIT(&cprinfo
, &un
->un_prr_cpr_mx
, callb_md_mrs_cpr
,
588 "check_resync_regions");
590 ui
= MDI_UNIT(next
->ln_id
);
591 (void) md_unit_readerlock(ui
);
594 * Do not clean up resync regions if it is an ABR
595 * mirror, or if a submirror is offline (we will use the resync
596 * region to resync when back online) or if there is only one
599 if ((ui
->ui_tstate
& MD_ABR_CAP
) ||
600 (un
->c
.un_status
& MD_UN_OFFLINE_SM
) || (un
->un_nsm
< 2)) {
601 md_unit_readerexit(ui
);
602 /* Remove this thread from the CPR callback table. */
603 mutex_enter(&un
->un_prr_cpr_mx
);
604 CALLB_CPR_EXIT(&cprinfo
);
608 (void) process_resync_regions(un
, &cprinfo
);
610 md_unit_readerexit(ui
);
612 /* Remove this thread from the CPR callback table. */
613 mutex_enter(&un
->un_prr_cpr_mx
);
614 CALLB_CPR_EXIT(&cprinfo
);
617 rw_exit(&mirror_md_ops
.md_link_rw
.lock
);
620 mutex_enter(&mirror_timeout
.dr_mx
);
621 timeout
->dr_pending
= 0;
622 mutex_exit(&mirror_timeout
.dr_mx
);
/*
 * NOTE(review): md_mirror_timeout() — periodic timeout handler. Under
 * mirror_timeout.dr_mx: queues check_resync_regions on md_mto_daemon when
 * no request is pending, then re-arms itself via timeout() while any mirror
 * units exist (mirror_md_ops.md_head != NULL), else zeroes dr_timeout_id.
 * Line-mangled extraction (else keyword/braces missing between the two
 * timeout-id assignments); kept byte-identical, comments only.
 */
626 md_mirror_timeout(void *throwaway
)
629 mutex_enter(&mirror_timeout
.dr_mx
);
630 if (!mirror_timeout
.dr_pending
) {
631 mirror_timeout
.dr_pending
= 1;
632 daemon_request(&md_mto_daemon
, check_resync_regions
,
633 (daemon_queue_t
*)&mirror_timeout
, REQ_OLD
);
636 if (mirror_md_ops
.md_head
!= NULL
)
637 mirror_timeout
.dr_timeout_id
= timeout(md_mirror_timeout
,
638 throwaway
, (int)MD_MDELAY
*hz
);
640 mirror_timeout
.dr_timeout_id
= 0;
642 mutex_exit(&mirror_timeout
.dr_mx
);
/*
 * NOTE(review): resync_start_timeout() — arms the md_mirror_timeout()
 * callback (once, if dr_timeout_id == 0) for a non-stale set, under
 * mirror_timeout.dr_mx. Line-mangled extraction (early-return body for the
 * stale-set check missing); kept byte-identical, comments only.
 */
646 resync_start_timeout(set_t setno
)
648 if (md_get_setstatus(setno
) & MD_SET_STALE
)
651 mutex_enter(&mirror_timeout
.dr_mx
);
652 if (mirror_timeout
.dr_timeout_id
== 0)
653 mirror_timeout
.dr_timeout_id
= timeout(md_mirror_timeout
,
654 (void *)NULL
, (int)MD_MDELAY
*hz
);
655 mutex_exit(&mirror_timeout
.dr_mx
);
/*
 * NOTE(review): offlined_to_attached() — for a non-stale set, converts each
 * submirror's state: SMS_OFFLINE -> SMS_ATTACHED and SMS_OFFLINE_RESYNC ->
 * SMS_ATTACHED_RESYNC via mirror_set_sm_state(), then clears
 * MD_UN_OFFLINE_SM and marks the unit record MD_PRV_PENDCOM.
 * Line-mangled extraction (early return and brace lines missing);
 * kept byte-identical, comments only.
 */
659 offlined_to_attached(mm_unit_t
*un
)
664 if (md_get_setstatus(MD_UN2SET(un
)) & MD_SET_STALE
)
667 for (i
= 0; i
< NMIRROR
; i
++) {
668 if (SMS_BY_INDEX_IS(un
, i
, SMS_OFFLINE
)) {
669 mirror_set_sm_state(&un
->un_sm
[i
],
670 &un
->un_smic
[i
], SMS_ATTACHED
, 1);
673 if (SMS_BY_INDEX_IS(un
, i
, SMS_OFFLINE_RESYNC
)) {
674 mirror_set_sm_state(&un
->un_sm
[i
],
675 &un
->un_smic
[i
], SMS_ATTACHED_RESYNC
, 1);
681 un
->c
.un_status
&= ~MD_UN_OFFLINE_SM
;
682 mddb_setrecprivate(un
->c
.un_record_id
, MD_PRV_PENDCOM
);
/*
 * NOTE(review): get_unit_resync() — attaches the existing on-disk resync
 * (DRL) record to the unit. If un_rr_dirty_recid is 0, or the record is
 * missing/empty (MDDB_NORECORD / MDDB_NODATA), falls back to
 * offlined_to_attached(); otherwise marks the record MD_PRV_GOTIT and
 * points un_dirty_bm at the record's or_rr bitmap. Line-mangled extraction
 * (return statements after the fallback calls are missing);
 * kept byte-identical, comments only.
 */
687 get_unit_resync(mm_unit_t
*un
)
689 mddb_recstatus_t status
;
690 struct optim_resync
*orp
;
692 if (un
->un_rr_dirty_recid
== 0) {
693 offlined_to_attached(un
);
697 status
= mddb_getrecstatus(un
->un_rr_dirty_recid
);
698 if ((status
== MDDB_NORECORD
) || (status
== MDDB_NODATA
)) {
699 un
->un_rr_dirty_recid
= 0;
700 offlined_to_attached(un
);
704 mddb_setrecprivate(un
->un_rr_dirty_recid
, MD_PRV_GOTIT
);
705 orp
= (struct optim_resync
*)mddb_getrecaddr(un
->un_rr_dirty_recid
);
706 un
->un_dirty_bm
= orp
->or_rr
;
/*
 * NOTE(review): create_unit_resync() — creates a fresh optimized-resync
 * (DRL) record. Computes region block size / count from total blocks
 * (MD_MIN_RR_SIZE / MD_DEF_NUM_RR), allocates the mddb record
 * (MD_CRO_OPTIMIZE|MD_CRO_32BIT), initializes or_magic/or_blksize/or_num,
 * marks the whole bitmap dirty (0xFF), and commits (immediately when not
 * snarfing; deferred via MD_PRV_PENDCOM when snarfing). The snarfing branch
 * visible here marks the set stale on record-creation failure — the failure
 * check itself is missing from this extraction; confirm in original.
 * Kept byte-identical, comments only.
 */
710 create_unit_resync(mm_unit_t
*un
, int snarfing
)
714 int blksize
; /* rr size in blocks */
717 size_t size
; /* bitmap size */
722 tb
= un
->c
.un_total_blocks
;
724 if (((tb
+ MD_MIN_RR_SIZE
)/ MD_MIN_RR_SIZE
) > MD_DEF_NUM_RR
) {
725 blksize
= (int)(tb
/ MD_DEF_NUM_RR
);
726 num_rr
= (int)((tb
+ (blksize
)) / (blksize
));
728 blksize
= MD_MIN_RR_SIZE
;
729 num_rr
= (int)((tb
+ MD_MIN_RR_SIZE
) / MD_MIN_RR_SIZE
);
732 size
= howmany(num_rr
, NBBY
) + sizeof (*orp
) - sizeof (orp
->or_rr
);
734 setno
= MD_UN2SET(un
);
736 typ1
= (mddb_type_t
)md_getshared_key(setno
,
737 mirror_md_ops
.md_driver
.md_drivername
);
739 recid
= mddb_createrec(size
, typ1
, RESYNC_REC
,
740 MD_CRO_OPTIMIZE
|MD_CRO_32BIT
, setno
);
742 if (snarfing
&& !(md_get_setstatus(setno
) & MD_SET_STALE
)) {
743 md_set_setstatus(setno
, MD_SET_STALE
);
744 cmn_err(CE_WARN
, "md: state database is stale");
749 un
->un_rr_dirty_recid
= recid
;
750 orp
= (optim_resync_t
*)mddb_getrecaddr(recid
);
751 orp
->or_magic
= OR_MAGIC
;
752 orp
->or_blksize
= blksize
;
753 orp
->or_num
= num_rr
;
755 un
->un_rrd_blksize
= blksize
;
756 un
->un_rrd_num
= num_rr
;
757 un
->un_dirty_bm
= orp
->or_rr
;
760 for (i
= 0; i
< howmany(num_rr
, NBBY
); i
++)
761 orp
->or_rr
[i
] = 0xFF;
764 mddb_commitrec_wrapper(recid
);
765 mirror_commit(un
, NO_SUBMIRRORS
, 0);
768 mddb_setrecprivate(recid
, MD_PRV_PENDCOM
);
769 mddb_setrecprivate(un
->c
.un_record_id
, MD_PRV_PENDCOM
);
/*
 * NOTE(review): unit_setup_resync() — initializes all per-unit resync state:
 * mutexes/CV, zeroed counters, the going-clean/going-dirty/outstanding-writes
 * /resync bitmaps (kmem_zalloc sized by un_rrd_num), the per-node dirty
 * bitmap + sum for multi-node sets, a deferred-RR_CLEAN taskq, and — when
 * snarfing a previously-written mirror with more than one syncable
 * submirror — copies un_dirty_bm into un_resync_bm and flags the unit and
 * submirrors as resync targets (MD_UN_OPT_NOT_DONE | MD_UN_WAR,
 * MD_SM_RESYNC_TARGET). Line-mangled extraction; calls to
 * get_unit_resync()/create_unit_resync() selection and several returns are
 * missing — confirm against the original. Kept byte-identical, comments only.
 */
774 unit_setup_resync(mm_unit_t
*un
, int snarfing
)
779 mdi_unit_t
*ui
= MDI_UNIT(MD_SID(un
));
780 int nonABR
= 1; /* only set if ABR marked in ui_tstate */
782 un
->un_dirty_bm
= NULL
;
783 un
->un_rs_buffer
= NULL
;
785 mutex_init(&un
->un_rrp_inflight_mx
, "rrp mx", MUTEX_DEFAULT
, NULL
);
787 mutex_init(&un
->un_resync_mx
, NULL
, MUTEX_DEFAULT
, NULL
);
788 cv_init(&un
->un_resync_cv
, NULL
, CV_DEFAULT
, NULL
);
789 un
->un_resync_flg
= 0;
790 un
->un_waiting_to_mark
= 0;
791 un
->un_waiting_to_commit
= 0;
792 un
->un_waiting_to_clear
= 0;
794 un
->un_goingclean_bm
= NULL
;
795 un
->un_goingdirty_bm
= NULL
;
796 un
->un_outstanding_writes
= NULL
;
797 un
->un_resync_bm
= NULL
;
802 if (un
->un_rr_dirty_recid
== 0) {
804 * If a MN diskset and snarfing and this node is not the
805 * master, do not delete any records on snarf of the
806 * mirror records (create_unit_resync deletes records).
808 * Master node should have already handled this case.
810 if (MD_MNSET_SETNO(MD_UN2SET(un
)) && snarfing
&&
811 md_set
[MD_UN2SET(un
)].s_am_i_master
== 0) {
813 cmn_err(CE_NOTE
, "unit_setup_resync: no rr for %s on"
814 " nodeid %d\n", md_shortname(MD_SID(un
)),
815 md_set
[MD_UN2SET(un
)].s_nodeid
);
819 if ((err
= create_unit_resync(un
, snarfing
)) != 0)
823 un
->un_goingclean_bm
= (uchar_t
*)kmem_zalloc((uint_t
)(howmany(
824 un
->un_rrd_num
, NBBY
)), KM_SLEEP
);
825 un
->un_goingdirty_bm
= (uchar_t
*)kmem_zalloc((uint_t
)(howmany(
826 un
->un_rrd_num
, NBBY
)), KM_SLEEP
);
827 un
->un_outstanding_writes
= (short *)kmem_zalloc(
828 (uint_t
)un
->un_rrd_num
* sizeof (short), KM_SLEEP
);
829 un
->un_resync_bm
= (uchar_t
*)kmem_zalloc((uint_t
)(howmany(
830 un
->un_rrd_num
, NBBY
)), KM_SLEEP
);
833 * Allocate pernode bitmap for this node. All other nodes' maps will
834 * be created 'on-the-fly' in the ioctl message handler
836 if (MD_MNSET_SETNO(MD_UN2SET(un
))) {
837 un
->un_pernode_dirty_sum
=
838 (uchar_t
*)kmem_zalloc(un
->un_rrd_num
, KM_SLEEP
);
839 if (md_mn_mynode_id
> 0) {
840 un
->un_pernode_dirty_bm
[md_mn_mynode_id
-1] = (uchar_t
*)
841 kmem_zalloc((uint_t
)(howmany(un
->un_rrd_num
, NBBY
)),
846 * Allocate taskq to process deferred (due to locking) RR_CLEAN
849 un
->un_drl_task
= (ddi_taskq_t
*)md_create_taskq(MD_UN2SET(un
),
853 if (md_get_setstatus(MD_UN2SET(un
)) & MD_SET_STALE
)
857 * Only mark mirror which has an associated DRL as requiring a resync.
858 * For ABR mirrors we need not set the resync record bitmap up.
860 if (ui
&& (ui
->ui_tstate
& MD_ABR_CAP
))
863 for (i
= 0, syncable
= 0; i
< NMIRROR
; i
++) {
865 if ((SUBMIRROR_IS_READABLE(un
, i
) ||
866 SMS_BY_INDEX_IS(un
, i
,
867 (SMS_OFFLINE
| SMS_OFFLINE_RESYNC
))))
872 if (snarfing
&& un
->un_pass_num
&& (syncable
> 1)) {
873 bcopy((caddr_t
)un
->un_dirty_bm
, (caddr_t
)un
->un_resync_bm
,
874 howmany(un
->un_rrd_num
, NBBY
));
876 un
->c
.un_status
|= (MD_UN_OPT_NOT_DONE
| MD_UN_WAR
);
877 un
->c
.un_status
&= ~MD_UN_OFFLINE_SM
;
878 for (i
= 0; i
< NMIRROR
; i
++) {
879 if ((SUBMIRROR_IS_READABLE(un
, i
)) ||
880 SMS_BY_INDEX_IS(un
, i
, SMS_OFFLINE_RESYNC
))
881 un
->un_sm
[i
].sm_flags
|= MD_SM_RESYNC_TARGET
;
883 if (SMS_BY_INDEX_IS(un
, i
, SMS_OFFLINE
)) {
884 un
->un_sm
[i
].sm_flags
|= MD_SM_RESYNC_TARGET
;
885 mirror_set_sm_state(&un
->un_sm
[i
],
886 &un
->un_smic
[i
], SMS_OFFLINE_RESYNC
, 1);
887 mddb_setrecprivate(un
->c
.un_record_id
,
/*
 * NOTE(review): resync_kill_pending() — visible body only; the function
 * signature (and its return-value plumbing) is missing from this
 * extraction. Visible behavior: drops the unit writer/reader lock per
 * mx_type, waits on un_rs_thread_cv while MD_RI_BLOCK/MD_RI_BLOCK_OWNER
 * are set (breaking out on MD_RI_KILL/MD_RI_SHUTDOWN), sets
 * MD_UN_RESYNC_CANCEL for a kill, then reacquires the dropped lock.
 * Kept byte-identical, comments only.
 */
896 * resync_kill_pending:
897 * -------------------
898 * Determine if the resync thread has been requested to terminate.
899 * Block if MD_RI_BLOCK or MD_RI_BLOCK_OWNER is set in un->un_rs_thread_flags.
900 * MD_RI_BLOCK is only set as a result of a user-initiated ioctl via metasync.
901 * MD_RI_BLOCK_OWNER is set by the ownership change of a multi-node mirror.
905 * 1 Kill requested (set MD_UN_RESYNC_CANCEL in un->c.un_status)
907 * Note: this routine may block
908 * the writerlock for <ui> will be dropped and reacquired if <mx_type>
909 * is set to MD_WRITER_HELD.
910 * the readerlock for <ui> will be dropped and reacquired if <mx_type>
911 * is set to MD_READER_HELD.
921 /* Ensure that we don't block with any mutex held */
922 if (mx_type
== MD_WRITER_HELD
) {
923 md_unit_writerexit(ui
);
924 } else if (mx_type
== MD_READER_HELD
) {
925 md_unit_readerexit(ui
);
927 mutex_enter(&un
->un_rs_thread_mx
);
928 while (un
->un_rs_thread_flags
& (MD_RI_BLOCK
|MD_RI_BLOCK_OWNER
)) {
929 cv_wait(&un
->un_rs_thread_cv
, &un
->un_rs_thread_mx
);
930 if (un
->un_rs_thread_flags
& (MD_RI_KILL
|MD_RI_SHUTDOWN
))
933 /* Determine if we've been asked to abort or shutdown gracefully */
934 if (un
->un_rs_thread_flags
& MD_RI_KILL
) {
935 un
->c
.un_status
|= MD_UN_RESYNC_CANCEL
;
937 } else if (un
->un_rs_thread_flags
& MD_RI_SHUTDOWN
) {
940 mutex_exit(&un
->un_rs_thread_mx
);
942 /* Reacquire mutex if dropped on entry */
943 if (mx_type
== MD_WRITER_HELD
) {
944 (void) md_unit_writerlock(ui
);
945 } else if (mx_type
== MD_READER_HELD
) {
946 (void) md_unit_readerlock(ui
);
/*
 * NOTE(review): resync_read_buffer() — allocates a mirror child structure
 * from mirror_child_cache, sets up its buf (B_READ, dbtob(cnt) bytes into
 * un_rs_buffer on the md device), drops the unit readerlock, issues
 * md_mirror_strategy() with write-after-read (MD_STR_WAR) flags, reacquires
 * the readerlock, checks B_ERROR, and frees the child structure.
 * Line-mangled extraction (buf initialization lines, biowait-style
 * completion, and error-return lines are missing — confirm in original).
 * Kept byte-identical, comments only.
 */
952 * resync_read_buffer:
954 * Issue the resync source read for the specified start block and size.
955 * This will cause the mirror strategy routine to issue a write-after-read
956 * once this request completes successfully.
957 * If 'flag_err' is set we expect to see a write error flagged in the b_error
958 * field of the buffer created for this i/o request. If clear we do not expect
959 * to see the error flagged for write failures.
960 * Read failures will always set the B_ERROR bit which will stop the resync
964 resync_read_buffer(mm_unit_t
*un
, diskaddr_t blk
, size_t cnt
, int flag_err
)
970 sp
= kmem_cache_alloc(mirror_child_cache
, MD_ALLOCFLAGS
);
971 mirror_child_init(sp
);
974 bp
->b_edev
= makedevice(md_major
, MD_SID(un
));
975 bp
->b_flags
= B_READ
;
977 bp
->b_bcount
= dbtob(cnt
);
978 bp
->b_un
.b_addr
= un
->un_rs_buffer
;
979 md_unit_readerexit(MDI_UNIT(MD_SID(un
)));
981 (void) md_mirror_strategy(bp
, MD_STR_NOTTOP
| MD_STR_MAPPED
|
982 MD_STR_WAR
| (flag_err
? MD_STR_FLAG_ERR
: 0), NULL
);
986 (void) md_unit_readerlock(MDI_UNIT(MD_SID(un
)));
987 if (bp
->b_flags
& B_ERROR
) {
990 kmem_cache_free(mirror_child_cache
, sp
);
/*
 * NOTE(review): send_mn_resync_done_message() — broadcasts
 * MD_MN_MSG_RESYNC_PHASE_DONE to all nodes at the end of a resync phase.
 * Suppresses the send when the resync thread is being killed/shut down or
 * MD_UN_RESYNC_CANCEL is set, unless RESYNC_ERR forces it. Drops the unit
 * writerlock around the CPR-safe mdmn_ksend_message() call, retries once
 * after waiting for commd (MDMNE_RPC_FAIL), and panics if the retry fails.
 * Line-mangled extraction (parameter list, returns, and the commd wait-loop
 * body are missing — confirm in original). Kept byte-identical, comments
 * only.
 */
995 * send_mn_resync_done_message
997 * At the end of a resync, send a message to all nodes to indicate that
998 * the resync is complete. The argument, flags, has the following values
1000 * RESYNC_ERR - if an error occurred that terminated the resync
1001 * CLEAR_OPT_NOT_DONE - Just need to clear the OPT_NOT_DONE flag
1003 * unit writerlock set on entry
1004 * Only send the message if the thread is not marked as shutting down:
1005 * [un_rs_thread_flags & MD_RI_SHUTDOWN] or being killed:
1006 * [un->c.un_status & MD_UN_RESYNC_CANCEL]
1007 * or if there has been an error that terminated the resync:
1008 * flags & RESYNC_ERR
1012 send_mn_resync_done_message(
1017 md_mn_msg_resync_t
*rmsg
= un
->un_rs_msg
;
1019 mdi_unit_t
*ui
= MDI_UNIT(MD_SID(un
));
1020 md_mn_kresult_t
*kres
;
1025 rmsg
= (md_mn_msg_resync_t
*)un
->un_rs_msg
;
1028 * Only send the message if this resync thread is still active. This
1029 * handles the case where ownership changes to different nodes during
1030 * a resync can cause multiple spurious resync_done messages to occur
1031 * when the resync completes. This happens because only one node is
1032 * the resync owner but other nodes will have their resync_unit thread
1033 * blocked in 'resync_kill_pending'
1035 mutex_enter(&un
->un_rs_thread_mx
);
1036 dont_send
= (un
->un_rs_thread_flags
& (MD_RI_KILL
|MD_RI_SHUTDOWN
)) ? 1
1038 mutex_exit(&un
->un_rs_thread_mx
);
1039 dont_send
|= (un
->c
.un_status
& MD_UN_RESYNC_CANCEL
) ? 1 : 0;
1042 * Always send a message if we've encountered an error that terminated
1045 if (flags
& RESYNC_ERR
)
1050 if (mirror_debug_flag
) {
1051 printf("Don't send resync done message, mnum = %x,"
1052 " type = %x, flags = %d\n", MD_SID(un
),
1053 un
->un_rs_type
, flags
);
1060 if (mirror_debug_flag
) {
1061 printf("send resync done message, mnum = %x, type = %x\n",
1062 MD_SID(un
), un
->un_rs_type
);
1066 rmsg
->msg_resync_mnum
= MD_SID(un
);
1067 rmsg
->msg_resync_type
= un
->un_rs_type
;
1068 rmsg
->msg_originator
= md_mn_mynode_id
;
1069 rmsg
->msg_resync_flags
= 0;
1070 if (flags
& RESYNC_ERR
)
1071 rmsg
->msg_resync_flags
|= MD_MN_RS_ERR
;
1072 if (flags
& CLEAR_OPT_NOT_DONE
)
1073 rmsg
->msg_resync_flags
|= MD_MN_RS_CLEAR_OPT_NOT_DONE
;
1075 setno
= MD_MIN2SET(MD_SID(un
));
1076 md_unit_writerexit(ui
);
1077 kres
= kmem_alloc(sizeof (md_mn_kresult_t
), KM_SLEEP
);
1080 mutex_enter(&un
->un_rs_cpr_mx
);
1081 CALLB_CPR_SAFE_BEGIN(&un
->un_rs_cprinfo
);
1083 rval
= mdmn_ksend_message(setno
, MD_MN_MSG_RESYNC_PHASE_DONE
,
1084 MD_MSGF_NO_LOG
, 0, (char *)rmsg
, sizeof (md_mn_msg_resync_t
), kres
);
1086 CALLB_CPR_SAFE_END(&un
->un_rs_cprinfo
, &un
->un_rs_cpr_mx
);
1087 mutex_exit(&un
->un_rs_cpr_mx
);
1089 /* if the node hasn't yet joined, it's Ok. */
1090 if ((!MDMN_KSEND_MSG_OK(rval
, kres
)) &&
1091 (kres
->kmmr_comm_state
!= MDMNE_NOT_JOINED
)) {
1092 mdmn_ksend_show_error(rval
, kres
, "RESYNC_PHASE_DONE");
1093 /* If we're shutting down already, pause things here. */
1094 if (kres
->kmmr_comm_state
== MDMNE_RPC_FAIL
) {
1095 while (!md_mn_is_commd_present()) {
1099 * commd is now available again. Retry the message once.
1100 * If this fails we panic as the system is in an
1103 if (nretries
++ == 0)
1106 cmn_err(CE_PANIC
, "ksend_message failure: RESYNC_PHASE_DONE");
1108 kmem_free(kres
, sizeof (md_mn_kresult_t
));
1109 (void) md_unit_writerlock(ui
);
/*
 * NOTE(review): send_mn_resync_next_message() — broadcasts
 * MD_MN_MSG_RESYNC_NEXT describing the next region to resync (start, size,
 * done/2_do positions) plus a snapshot of all submirror states/flags.
 * Drops the unit readerlock around the CPR-safe mdmn_ksend_message() call,
 * retries once after waiting for commd (MDMNE_RPC_FAIL) and panics if the
 * retry fails; then records the region in un_rs_prev_overlap (allocating
 * the parent structure under writerlock on first use) as the previous
 * overlap range [currentblk, currentblk + rsize - 1]. Line-mangled
 * extraction (parameter list tail, goto/retry plumbing, and brace lines are
 * missing — confirm in original). Kept byte-identical, comments only.
 */
1113 * send_mn_resync_next_message
1115 * Sent a message to all nodes indicating the next region to be resynced.
1116 * The message contains the region to be resynced and the current position in
1117 * the resync as denoted by un_rs_resync_done and un_rs_resync_2_do.
1118 * On entry the unit readerlock is held.
1121 send_mn_resync_next_message(
1123 diskaddr_t currentblk
,
1128 md_mn_msg_resync_t
*rmsg
= un
->un_rs_msg
;
1130 md_mn_kresult_t
*kres
;
1131 mdi_unit_t
*ui
= MDI_UNIT(MD_SID(un
));
1138 ASSERT(rmsg
!= NULL
);
1140 if (mirror_debug_flag
) {
1141 printf("send resync next message, mnum = %x, start=%lld, "
1142 "size=%ld, type=%x, done=%lld, 2_do=%lld\n",
1143 MD_SID(un
), currentblk
, rsize
, un
->un_rs_type
,
1144 un
->un_rs_resync_done
, un
->un_rs_resync_2_do
);
1147 rmsg
->msg_resync_mnum
= MD_SID(un
);
1148 rmsg
->msg_resync_type
= un
->un_rs_type
;
1149 rmsg
->msg_resync_start
= currentblk
;
1150 rmsg
->msg_resync_rsize
= rsize
;
1151 rmsg
->msg_resync_done
= un
->un_rs_resync_done
;
1152 rmsg
->msg_resync_2_do
= un
->un_rs_resync_2_do
;
1153 rmsg
->msg_originator
= md_mn_mynode_id
;
1154 if (flags
& MD_FIRST_RESYNC_NEXT
)
1155 rmsg
->msg_resync_flags
= MD_MN_RS_FIRST_RESYNC_NEXT
;
1158 * Copy current submirror state and flags into message. This provides
1159 * a means of keeping all nodes that are currently active in the cluster
1160 * synchronised with regards to their submirror state settings. If we
1161 * did not pass this information here, the only time every node gets
1162 * submirror state updated is at the end of a resync phase. This can be
1163 * a significant amount of time for large metadevices.
1165 for (smi
= 0; smi
< NMIRROR
; smi
++) {
1166 sm
= &un
->un_sm
[smi
];
1167 rmsg
->msg_sm_state
[smi
] = sm
->sm_state
;
1168 rmsg
->msg_sm_flags
[smi
] = sm
->sm_flags
;
1170 setno
= MD_MIN2SET(MD_SID(un
));
1171 md_unit_readerexit(ui
);
1172 kres
= kmem_alloc(sizeof (md_mn_kresult_t
), KM_SLEEP
);
1175 mutex_enter(&un
->un_rs_cpr_mx
);
1176 CALLB_CPR_SAFE_BEGIN(&un
->un_rs_cprinfo
);
1178 rval
= mdmn_ksend_message(setno
, MD_MN_MSG_RESYNC_NEXT
, MD_MSGF_NO_LOG
,
1179 0, (char *)rmsg
, sizeof (md_mn_msg_resync_t
), kres
);
1181 CALLB_CPR_SAFE_END(&un
->un_rs_cprinfo
, &un
->un_rs_cpr_mx
);
1182 mutex_exit(&un
->un_rs_cpr_mx
);
1184 if (!MDMN_KSEND_MSG_OK(rval
, kres
)) {
1185 mdmn_ksend_show_error(rval
, kres
, "RESYNC_NEXT");
1186 /* If we're shutting down already, pause things here. */
1187 if (kres
->kmmr_comm_state
== MDMNE_RPC_FAIL
) {
1188 while (!md_mn_is_commd_present()) {
1192 * commd is now available again. Retry the message once.
1193 * If this fails we panic as the system is in an
1196 if (nretries
++ == 0)
1199 cmn_err(CE_PANIC
, "ksend_message failure: RESYNC_NEXT");
1201 kmem_free(kres
, sizeof (md_mn_kresult_t
));
1202 (void) md_unit_readerlock(ui
);
1203 ps
= un
->un_rs_prev_overlap
;
1205 /* Allocate previous overlap reference if needed */
1207 ps
= kmem_cache_alloc(mirror_parent_cache
, MD_ALLOCFLAGS
);
1210 ps
->ps_firstblk
= 0;
1213 md_unit_readerexit(ui
);
1214 (void) md_unit_writerlock(ui
);
1215 un
->un_rs_prev_overlap
= ps
;
1216 md_unit_writerexit(ui
);
1217 (void) md_unit_readerlock(ui
);
1220 ps
->ps_firstblk
= currentblk
;
1221 ps
->ps_lastblk
= currentblk
+ rsize
- 1;
1225 resync_read_blk_range(
1227 diskaddr_t currentblk
,
1228 diskaddr_t stopbefore
,
1233 size_t copysize
; /* limited by max xfer buf size */
1234 size_t rsize
; /* size of resync block (for MN) */
1237 diskaddr_t rs_startblk
;
1239 int flags1
= flags
& MD_FIRST_RESYNC_NEXT
;
1241 rs_type
= un
->un_rs_type
;
1242 rs_startblk
= currentblk
;
1243 if (stopbefore
> un
->c
.un_total_blocks
)
1244 stopbefore
= un
->c
.un_total_blocks
;
1245 if (currentblk
< un
->un_resync_startbl
)
1246 currentblk
= un
->un_resync_startbl
;
1248 copysize
= un
->un_rs_copysize
;
1249 rsize
= MD_DEF_RESYNC_BLK_SZ
;
1251 setno
= MD_MIN2SET(MD_SID(un
));
1252 while (currentblk
< stopbefore
) {
1254 * Split the block up into units of MD_DEF_RESYNC_BLK_SZ and
1255 * if a MN device and sendflag is set, send a RESYNC_MESSAGE
1258 if ((currentblk
+ MD_DEF_RESYNC_BLK_SZ
) > stopbefore
)
1259 rsize
= stopbefore
- currentblk
;
1260 if (MD_MNSET_SETNO(setno
) && (flags
& MD_SEND_MESS_XMIT
)) {
1261 un
->un_resync_startbl
= currentblk
;
1262 rs_startblk
= currentblk
;
1263 send_mn_resync_next_message(un
, currentblk
, rsize
,
1267 /* check to see if we've been asked to terminate */
1268 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)), type
))
1269 return ((un
->c
.un_status
& MD_UN_RESYNC_CANCEL
)
1272 * Check to see if another node has completed this
1273 * block, if so either the type or the resync region
1274 * will have changed. If the resync type has changed,
1276 * If the resync region has changed, reset currentblk
1277 * to the start of the current resync region and
1280 if (un
->un_rs_type
!= rs_type
)
1282 if (un
->un_rs_prev_overlap
->ps_firstblk
>
1285 un
->un_rs_prev_overlap
->ps_firstblk
;
1289 newstop
= currentblk
+ rsize
;
1290 while (currentblk
< newstop
) {
1291 if ((currentblk
+ copysize
) > stopbefore
)
1292 copysize
= (size_t)(stopbefore
- currentblk
);
1293 if (resync_read_buffer(un
, currentblk
, copysize
,
1294 (flags
& MD_RESYNC_FLAG_ERR
)))
1297 /* resync_read_buffer releases/grabs a new lock */
1298 un
= (mm_unit_t
*)MD_UNIT(MD_SID(un
));
1299 currentblk
+= copysize
;
1301 /* check to see if we've been asked to terminate */
1302 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)), type
))
1303 return ((un
->c
.un_status
& MD_UN_RESYNC_CANCEL
)
1305 if (MD_MNSET_SETNO(setno
)) {
1307 * Check to see if another node has completed
1308 * this block, see above
1310 if (un
->un_rs_type
!= rs_type
)
1312 if (un
->un_rs_prev_overlap
->ps_firstblk
>
1315 un
->un_rs_prev_overlap
->ps_firstblk
;
1323 optimized_resync(mm_unit_t
*un
)
1329 uchar_t
*dirtyregions
;
1330 diskaddr_t first
, stopbefore
;
1336 uint_t old_rs_type
= un
->un_rs_type
;
1338 uint_t flags1
= MD_FIRST_RESYNC_NEXT
|MD_RESYNC_FLAG_ERR
;
1342 ui
= MDI_UNIT(mnum
);
1343 setno
= MD_UN2SET(un
);
1345 if (!(un
->c
.un_status
& MD_UN_OPT_NOT_DONE
)) {
1347 * We aren't marked as needing a resync so for multi-node
1348 * sets we flag the completion so that all nodes see the same
1349 * metadevice state. This is a problem when a new node joins
1350 * an existing set as it has to perform a 'metasync -r' and
1351 * we have to step through all of the resync phases. If we
1352 * don't do this the nodes that were already in the set will
1353 * have the metadevices marked as 'Okay' but the joining node
1354 * will have 'Needs Maintenance' which is unclearable.
1356 if (MD_MNSET_SETNO(setno
)) {
1357 send_mn_resync_done_message(un
, CLEAR_OPT_NOT_DONE
);
1363 * No need for optimized resync if ABR set, clear rs_type and flags
1366 if (ui
->ui_tstate
& MD_ABR_CAP
) {
1367 un
->un_rs_type
= MD_RS_NONE
;
1368 un
->c
.un_status
&= ~(MD_UN_OPT_NOT_DONE
| MD_UN_WAR
);
1372 un
->un_rs_dropped_lock
= 1;
1373 un
->c
.un_status
|= MD_UN_WAR
;
1374 resync_regions
= un
->un_rrd_num
;
1375 dirtyregions
= un
->un_resync_bm
;
1376 md_unit_writerexit(ui
);
1378 /* For MN sets, resync NOTIFY is done when processing resync messages */
1379 if (!MD_MNSET_SETNO(setno
)) {
1380 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_START
,
1381 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
1383 un
= (mm_unit_t
*)md_unit_readerlock(ui
);
1385 /* check to see if we've been asked to terminate */
1386 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)), MD_READER_HELD
)) {
1387 if (un
->c
.un_status
& MD_UN_RESYNC_CANCEL
)
1388 broke_out
= RESYNC_ERR
;
1391 * Check that we are still performing an optimized
1392 * resync. If not, another node must have completed it
1393 * so we have no more work to do.
1395 if (un
->un_rs_type
!= old_rs_type
) {
1396 md_unit_readerexit(ui
);
1397 (void) md_unit_writerlock(ui
);
1401 * If rs_resync_done is non-zero, we must be completing an optimized
1402 * resync that has already been partially done on another node.
1403 * Therefore clear the bits in resync_bm for the resync regions
1404 * already done. If resync_startbl is zero, calculate 2_do.
1406 if (un
->un_rs_resync_done
> 0) {
1407 BLK_TO_RR(start_rr
, un
->un_resync_startbl
, un
);
1408 for (rr
= 0; rr
< start_rr
&& rr
< resync_regions
; rr
++)
1409 CLR_KEEPDIRTY(rr
, un
);
1411 un
->un_rs_resync_2_do
= 0;
1412 for (rr
= 0; rr
< resync_regions
; rr
++)
1413 if (isset(dirtyregions
, rr
))
1414 un
->un_rs_resync_2_do
++;
1417 for (rr
= 0; (rr
< resync_regions
) && (broke_out
!= RESYNC_ERR
); rr
++) {
1418 if (isset(dirtyregions
, rr
)) {
1419 RR_TO_BLK(first
, rr
, un
);
1420 RR_TO_BLK(stopbefore
, rr
+1, un
);
1421 old_rs_type
= un
->un_rs_type
;
1422 old_rs_done
= un
->un_rs_resync_done
;
1423 err
= resync_read_blk_range(un
, first
, stopbefore
,
1424 MD_READER_HELD
, MD_SEND_MESS_XMIT
| flags1
);
1425 flags1
= MD_RESYNC_FLAG_ERR
;
1427 /* resync_read_blk_range releases/grabs a new lock */
1428 un
= (mm_unit_t
*)MD_UNIT(mnum
);
1431 broke_out
= RESYNC_ERR
;
1436 * Check that we are still performing an optimized
1437 * resync. If not, another node must have completed it
1438 * so we have no more work to do.
1440 if (un
->un_rs_type
!= old_rs_type
) {
1441 md_unit_readerexit(ui
);
1442 (void) md_unit_writerlock(ui
);
1447 * If resync_done has increased, we must have
1448 * blocked in resync_read_blk_range while another node
1449 * continued with the resync. Therefore clear resync_bm
1450 * for the blocks that have been resynced on another
1451 * node and update rr to the next RR to be done.
1453 if (old_rs_done
< un
->un_rs_resync_done
) {
1455 BLK_TO_RR(start_rr
, un
->un_resync_startbl
- 1,
1457 for (i
= rr
; i
< start_rr
; i
++)
1458 CLR_KEEPDIRTY(i
, un
);
1461 un
->un_rs_resync_done
++;
1463 for (smi
= 0, cnt
= 0; smi
< NMIRROR
; smi
++)
1464 if (SUBMIRROR_IS_WRITEABLE(un
, smi
) &&
1465 !(SMS_BY_INDEX_IS(un
, smi
, SMS_ALL_ERRED
)))
1468 broke_out
= RESYNC_ERR
;
1471 CLR_KEEPDIRTY(rr
, un
);
1472 /* Check to see if we've completed the resync cleanly */
1473 if (un
->un_rs_thread_flags
& MD_RI_SHUTDOWN
)
1477 * Check that we haven't exceeded un_rs_resync_2_do. If
1478 * we have we've completed the resync.
1480 if (un
->un_rs_resync_done
> un
->un_rs_resync_2_do
)
1484 md_unit_readerexit(ui
);
1485 un
= (mm_unit_t
*)md_unit_writerlock(ui
);
1488 * If MN set send message to all nodes to indicate resync
1489 * phase is complete. The processing of the message will update the
1492 if (MD_MNSET_SETNO(setno
)) {
1493 send_mn_resync_done_message(un
, broke_out
);
1497 un
->c
.un_status
&= ~MD_UN_WAR
;
1499 un
->c
.un_status
&= ~MD_UN_KEEP_DIRTY
;
1501 setno
= MD_UN2SET(un
);
1502 for (smi
= 0; smi
< NMIRROR
; smi
++) {
1503 un
->un_sm
[smi
].sm_flags
&= ~MD_SM_RESYNC_TARGET
;
1504 if (SMS_BY_INDEX_IS(un
, smi
, SMS_OFFLINE_RESYNC
)) {
1505 state
= (broke_out
? SMS_OFFLINE
: SMS_RUNNING
);
1506 mirror_set_sm_state(&un
->un_sm
[smi
],
1507 &un
->un_smic
[smi
], state
, broke_out
);
1508 mirror_commit(un
, NO_SUBMIRRORS
, 0);
1510 if (SMS_BY_INDEX_IS(un
, smi
, SMS_OFFLINE
))
1511 un
->c
.un_status
|= MD_UN_OFFLINE_SM
;
1515 /* For MN sets, resync NOTIFY is done when processing resync messages */
1516 if (!MD_MNSET_SETNO(setno
)) {
1518 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_FAILED
,
1519 SVM_TAG_METADEVICE
, MD_UN2SET(un
), MD_SID(un
));
1521 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_DONE
,
1522 SVM_TAG_METADEVICE
, MD_UN2SET(un
), MD_SID(un
));
1528 * recalc_resync_done
1530 * This function deals with a change in value of un_rs_resync_2_do in a
1531 * component resync. This may change if we are restarting a component
1532 * resync on a single node having rebooted with a different value of
1533 * md_resync_bufsz or if we are running in a multi-node with nodes having
1534 * different values of md_resync_bufsz.
1535 * If there is a change in un_rs_resync_2_do, we need to recalculate
1536 * the value of un_rs_resync_done given the new value for resync_2_do.
1537 * We have to calculate a new value for resync_done to be either
1538 * if un_resync_startbl is set, (un_resync_startbl - initblock)/(blksize + skip)
1539 * or if it is not set, we need to calculate it from un_rs_resync_done,
1540 * (un_rs_resync_done/un_rs_resync_2_do) * resync_2_do
1541 * In addition we need to deal with the overflow case by using a factor to
1546 recalc_resync_done(mm_unit_t
*un
, size_t resync_2_do
, diskaddr_t initblock
,
1547 u_longlong_t blk_size
, u_longlong_t skip
)
1553 * If resync_2_do has not yet been calculated, no need to modify
1556 if (un
->un_rs_resync_2_do
== 0) {
1559 if (un
->un_rs_resync_2_do
== resync_2_do
)
1560 return; /* No change, so nothing to do */
1562 * If un_rs_startbl is set, another node must have already started
1563 * this resync and hence we can calculate resync_done from
1566 if (un
->un_resync_startbl
) {
1567 un
->un_rs_resync_done
= (un
->un_resync_startbl
- initblock
) /
1572 * un_resync_startbl is not set so we must calculate it from
1573 * un_rs_resync_done.
1574 * If the larger of the two values of resync_2_do is greater than 32
1575 * bits, calculate a factor to divide by to ensure that we don't
1576 * overflow 64 bits when calculating the new value for resync_done
1578 x
= (un
->un_rs_resync_2_do
> resync_2_do
) ? un
->un_rs_resync_2_do
:
1580 while (x
> INT32_MAX
) {
1582 factor
= factor
<< 1;
1584 un
->un_rs_resync_done
= ((un
->un_rs_resync_done
/factor
) *
1585 (resync_2_do
/factor
)) /
1586 ((un
->un_rs_resync_2_do
+ (factor
* factor
) - 1)/
1591 check_comp_4_resync(mm_unit_t
*un
, int smi
, int ci
)
1596 mm_submirror_ic_t
*smic
;
1600 u_longlong_t blk_size
;
1601 diskaddr_t initblock
;
1603 diskaddr_t frag
= 0;
1604 md_m_shared_t
*shared
;
1609 uint_t old_rs_type
= un
->un_rs_type
;
1610 diskaddr_t old_rs_done
;
1611 uint_t flags1
= MD_FIRST_RESYNC_NEXT
;
1612 diskaddr_t resync_2_do
;
1615 ui
= MDI_UNIT(mnum
);
1616 sm
= &un
->un_sm
[smi
];
1617 smic
= &un
->un_smic
[smi
];
1618 setno
= MD_UN2SET(un
);
1620 shared
= (md_m_shared_t
*)(*(smic
->sm_shared_by_indx
))
1621 (sm
->sm_dev
, sm
, ci
);
1623 if (shared
->ms_state
!= CS_RESYNC
) {
1624 SET_RS_TYPE_NONE(un
->un_rs_type
);
1628 if (shared
->ms_flags
& MDM_S_RS_TRIED
) {
1629 SET_RS_TYPE_NONE(un
->un_rs_type
);
1633 (void) (*(smic
->sm_get_bcss
))
1634 (sm
->sm_dev
, sm
, ci
, &initblock
, &count
, &skip
, &size
);
1636 if ((count
== 1) && (skip
== 0)) {
1637 count
= (size_t)(size
/ un
->un_rs_copysize
);
1638 if ((frag
= (size
- (count
* un
->un_rs_copysize
))) != 0)
1640 size
= (u_longlong_t
)un
->un_rs_copysize
;
1642 blk_size
= size
; /* Save block size for this resync */
1645 resync_2_do
= count
;
1647 * If part way through a resync, un_rs_resync_done/un_rs_resync_2_do
1648 * gives the proportion of the resync that has already been done.
1649 * If un_rs_copysize has changed since this previous partial resync,
1650 * either because this node has been rebooted with a different value
1651 * for md_resync_bufsz or because another node with a different value
1652 * for md_resync_bufsz performed the previous resync, we need to
1653 * recalculate un_rs_resync_done as a proportion of our value of
1656 recalc_resync_done(un
, resync_2_do
, initblock
, blk_size
, skip
);
1659 * For MN mirrors we need to send a message to all nodes indicating
1660 * the next region to be resynced. For a component resync, the size of
1661 * the contiguous region that is processed by resync_read_blk_range()
1662 * may be small if there is the interleave size.
1663 * Therefore, rather than sending the message within
1664 * resync_read_blk_range(), we will send a message every
1665 * MD_DEF_RESYNC_BLK_SZ blocks. Calculate the frequency in terms of
1666 * the number of blocks. Then, if we are restarting a resync, round
1667 * un_rs_resync_done down to the previous resync region boundary. This
1668 * ensures that we send a RESYNC_NEXT message before resyncing any
1671 if (MD_MNSET_SETNO(setno
)) {
1672 blks
= ((MD_DEF_RESYNC_BLK_SZ
+ blk_size
+ skip
- 1)/
1674 un
->un_rs_resync_done
= (un
->un_rs_resync_done
/blks
) * blks
;
1677 * un_rs_resync_done is the number of ('size' + 'skip') increments
1678 * already resynced from the base 'block'
1679 * un_rs_resync_2_do is the number of iterations in
1680 * this component resync.
1682 ASSERT(count
>= un
->un_rs_resync_done
);
1683 un
->un_rs_resync_2_do
= (diskaddr_t
)count
;
1685 un
->c
.un_status
|= MD_UN_WAR
;
1686 sm
->sm_flags
|= MD_SM_RESYNC_TARGET
;
1687 md_unit_writerexit(ui
);
1689 /* For MN sets, resync NOTIFY is done when processing resync messages */
1690 if (!MD_MNSET_SETNO(setno
)) {
1691 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_START
,
1692 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
1694 un
= (mm_unit_t
*)md_unit_readerlock(ui
);
1696 /* check to see if we've been asked to terminate */
1697 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)), MD_READER_HELD
)) {
1698 if (un
->c
.un_status
& MD_UN_RESYNC_CANCEL
)
1699 broke_out
= RESYNC_ERR
;
1702 * Check that we are still performing the same component
1703 * resync. If not, another node must have completed it
1704 * so we have no more work to do.
1706 if (un
->un_rs_type
!= old_rs_type
) {
1707 md_unit_readerexit(ui
);
1708 (void) md_unit_writerlock(ui
);
1712 * Adjust resync_done, resync_2_do, start of resync area and count to
1713 * skip already resync'd data. We need to recalculate resync_done as
1714 * we have dropped the unit lock above and may have lost ownership to
1715 * another node, with a different resync buffer size and it may have
1716 * sent us new values of resync_done and resync_2_do based on its
1717 * resync buffer size
1719 recalc_resync_done(un
, resync_2_do
, initblock
, blk_size
, skip
);
1720 un
->un_rs_resync_2_do
= resync_2_do
;
1721 count
-= un
->un_rs_resync_done
;
1722 block
= initblock
+ ((blk_size
+ skip
) * (int)un
->un_rs_resync_done
);
1724 un
->un_rs_dropped_lock
= 1;
1725 while ((count
> 0) && (broke_out
!= RESYNC_ERR
)) {
1726 old_rs_done
= un
->un_rs_resync_done
;
1728 * For MN mirrors send a message to the other nodes. This
1729 * message includes the size of the region that must be blocked
1732 if (MD_MNSET_SETNO(setno
)) {
1733 if ((un
->un_rs_resync_done
%blks
== 0)) {
1734 un
->un_resync_startbl
= block
;
1735 send_mn_resync_next_message(un
, block
,
1736 (blk_size
+skip
)*blks
, flags1
);
1739 * check to see if we've been asked to
1742 if (resync_kill_pending(un
,
1743 MDI_UNIT(MD_SID(un
)), MD_READER_HELD
)) {
1744 if (un
->c
.un_status
&
1745 MD_UN_RESYNC_CANCEL
) {
1746 broke_out
= RESYNC_ERR
;
1752 * Check that we are still performing the same
1753 * component resync. If not, another node must
1754 * have completed it so we have no more work to
1755 * do. Also reset count to remaining resync as
1756 * we may have lost ownership in in
1757 * send_mn_resync_next_message while another
1758 * node continued with the resync and
1759 * incremented resync_done.
1761 if (un
->un_rs_type
!= old_rs_type
) {
1762 md_unit_readerexit(ui
);
1763 (void) md_unit_writerlock(ui
);
1767 * recalculate resync_done, resync_2_do
1768 * We need to recalculate resync_done as
1769 * we have dropped the unit lock in
1770 * send_mn_resync_next_message above and may
1771 * have lost ownership to another node, with a
1772 * different resync buffer size and it may have
1773 * sent us new values of resync_done and
1774 * resync_2_do based on its resync buffer size
1776 recalc_resync_done(un
, resync_2_do
, initblock
,
1778 un
->un_rs_resync_2_do
= resync_2_do
;
1779 count
= un
->un_rs_resync_2_do
-
1780 un
->un_rs_resync_done
;
1782 * Adjust start of resync area to skip already
1785 block
= initblock
+ ((blk_size
+ skip
) *
1786 (int)un
->un_rs_resync_done
);
1787 old_rs_done
= un
->un_rs_resync_done
;
1790 err
= resync_read_blk_range(un
, block
, block
+ size
,
1791 MD_READER_HELD
, MD_RESYNC_FLAG_ERR
);
1793 /* resync_read_blk_range releases/grabs a new lock */
1794 un
= (mm_unit_t
*)MD_UNIT(mnum
);
1797 broke_out
= RESYNC_ERR
;
1801 * If we are no longer resyncing this component, return as
1802 * another node has progressed the resync.
1804 if (un
->un_rs_type
!= old_rs_type
) {
1805 md_unit_readerexit(ui
);
1806 (void) md_unit_writerlock(ui
);
1811 * recalculate resync_done, resync_2_do. We need to recalculate
1812 * resync_done as we have dropped the unit lock in
1813 * resync_read_blk_range above and may have lost ownership to
1814 * another node, with a different resync buffer size and it may
1815 * have sent us new values of resync_done and resync_2_do based
1816 * on its resync buffer size
1818 recalc_resync_done(un
, resync_2_do
, initblock
, blk_size
, skip
);
1819 un
->un_rs_resync_2_do
= resync_2_do
;
1822 * Reset count to remaining resync as we may have blocked in
1823 * resync_read_blk_range while another node continued
1824 * with the resync and incremented resync_done. Also adjust
1825 * start of resync area to skip already resync'd data.
1827 count
= un
->un_rs_resync_2_do
- un
->un_rs_resync_done
;
1828 block
= initblock
+((blk_size
+ skip
) *
1829 (int)un
->un_rs_resync_done
);
1832 * If we are picking up from another node, we retry the last
1833 * block otherwise step on to the next block
1835 if (old_rs_done
== un
->un_rs_resync_done
) {
1836 block
+= blk_size
+ skip
;
1837 un
->un_rs_resync_done
++;
1841 if ((count
== 1) && frag
)
1843 if (shared
->ms_state
== CS_ERRED
) {
1845 broke_out
= RESYNC_ERR
;
1849 /* Check to see if we've completed the resync cleanly */
1850 if (un
->un_rs_thread_flags
& MD_RI_SHUTDOWN
)
1854 md_unit_readerexit(ui
);
1855 un
= (mm_unit_t
*)md_unit_writerlock(ui
);
1858 * If MN set send message to all nodes to indicate resync
1859 * phase is complete. The processing of the message will update the
1862 if (MD_MNSET_SETNO(setno
)) {
1863 send_mn_resync_done_message(un
, broke_out
);
1865 un
->c
.un_status
&= ~MD_UN_WAR
;
1866 sm
->sm_flags
&= ~MD_SM_RESYNC_TARGET
;
1869 shared
->ms_flags
|= MDM_S_RS_TRIED
;
1872 * As we don't transmit the changes,
1873 * no need to drop the lock.
1875 set_sm_comp_state(un
, smi
, ci
, CS_OKAY
, 0,
1876 MD_STATE_NO_XMIT
, (IOLOCK
*)NULL
);
1879 /* For MN sets, resync NOTIFY is done when processing resync messages */
1880 if (!MD_MNSET_SETNO(setno
)) {
1882 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_FAILED
,
1883 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
1885 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_DONE
,
1886 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
1888 SET_RS_TYPE_NONE(un
->un_rs_type
);
1893 submirror_resync(mm_unit_t
*un
)
1898 mm_submirror_ic_t
*smic
;
1907 int flags1
= MD_FIRST_RESYNC_NEXT
;
1911 ui
= MDI_UNIT(mnum
);
1912 setno
= MD_UN2SET(un
);
1915 * If the submirror_index is non-zero, we are continuing a resync
1916 * so restart resync from last submirror marked as being resynced.
1918 if (RS_SMI(un
->un_rs_type
) != 0) {
1919 smi
= RS_SMI(un
->un_rs_type
);
1920 sm
= &un
->un_sm
[smi
];
1921 smic
= &un
->un_smic
[smi
];
1922 if (!SMS_IS(sm
, SMS_ATTACHED_RESYNC
)) {
1923 for (smi
= 0; smi
< NMIRROR
; smi
++) {
1924 sm
= &un
->un_sm
[smi
];
1925 smic
= &un
->un_smic
[smi
];
1926 if (SMS_IS(sm
, SMS_ATTACHED_RESYNC
))
1931 for (smi
= 0; smi
< NMIRROR
; smi
++) {
1932 sm
= &un
->un_sm
[smi
];
1933 smic
= &un
->un_smic
[smi
];
1934 if (SMS_IS(sm
, SMS_ATTACHED_RESYNC
))
1938 if (smi
== NMIRROR
) {
1939 SET_RS_TYPE_NONE(un
->un_rs_type
);
1944 * If we've only got one component we can fail on a resync write
1945 * if an error is encountered. This stops an unnecessary read of the
1946 * whole mirror on a target write error.
1948 compcnt
= (*(smic
->sm_get_component_count
))(sm
->sm_dev
, sm
);
1950 flags1
|= MD_RESYNC_FLAG_ERR
;
1952 un
->c
.un_status
|= MD_UN_WAR
;
1953 sm
->sm_flags
|= MD_SM_RESYNC_TARGET
;
1954 SET_RS_SMI(un
->un_rs_type
, smi
);
1955 md_unit_writerexit(ui
);
1957 /* For MN sets, resync NOTIFY is done when processing resync messages */
1958 if (!MD_MNSET_SETNO(setno
)) {
1959 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_START
,
1960 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
1962 un
= (mm_unit_t
*)md_unit_readerlock(ui
);
1964 un
->un_rs_dropped_lock
= 1;
1966 /* check to see if we've been asked to terminate */
1967 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)), MD_READER_HELD
)) {
1968 if (un
->c
.un_status
& MD_UN_RESYNC_CANCEL
)
1969 broke_out
= RESYNC_ERR
;
1972 * Check that we are still performing the same submirror
1973 * resync. If not, another node must have completed it
1974 * so we have no more work to do.
1976 if (RS_TYPE(un
->un_rs_type
) != MD_RS_SUBMIRROR
) {
1977 md_unit_readerexit(ui
);
1978 (void) md_unit_writerlock(ui
);
1982 /* if > 1TB mirror, increase percent done granularity */
1983 if (un
->c
.un_total_blocks
> MD_MAX_BLKS_FOR_SMALL_DEVS
)
1984 chunk
= un
->c
.un_total_blocks
/ 1000;
1986 chunk
= un
->c
.un_total_blocks
/ 100;
1988 chunk
= un
->c
.un_total_blocks
;
1990 * If a MN set, round the chunk size up to a multiple of
1991 * MD_DEF_RESYNC_BLK_SZ
1993 if (MD_MNSET_SETNO(setno
)) {
1994 chunk
= ((chunk
+ MD_DEF_RESYNC_BLK_SZ
)/MD_DEF_RESYNC_BLK_SZ
)
1995 * MD_DEF_RESYNC_BLK_SZ
;
1996 if (chunk
> un
->c
.un_total_blocks
)
1997 chunk
= un
->c
.un_total_blocks
;
2000 * Handle restartable resyncs that continue from where the previous
2001 * resync left off. The new resync range is from un_rs_resync_done ..
2005 if (un
->un_rs_resync_done
== 0) {
2006 un
->un_rs_resync_2_do
= un
->c
.un_total_blocks
;
2008 curblk
= un
->un_rs_resync_done
;
2010 while ((curblk
!= un
->c
.un_total_blocks
) && (broke_out
!= RESYNC_ERR
)) {
2013 rs_done
= un
->un_rs_resync_done
;
2014 err
= resync_read_blk_range(un
, curblk
, curblk
+ chunk
,
2015 MD_READER_HELD
, MD_SEND_MESS_XMIT
| flags1
);
2016 flags1
= (compcnt
== 1 ? MD_RESYNC_FLAG_ERR
: 0);
2018 /* resync_read_blk_range releases/grabs a new lock */
2019 un
= (mm_unit_t
*)MD_UNIT(mnum
);
2022 broke_out
= RESYNC_ERR
;
2027 * If we are no longer executing a submirror resync, return
2028 * as another node has completed the submirror resync.
2030 if (RS_TYPE(un
->un_rs_type
) != MD_RS_SUBMIRROR
) {
2031 md_unit_readerexit(ui
);
2032 (void) md_unit_writerlock(ui
);
2036 * If resync_done has changed, we must have blocked
2037 * in resync_read_blk_range while another node
2038 * continued with the resync so restart from resync_done.
2040 if (rs_done
!= un
->un_rs_resync_done
) {
2041 curblk
= un
->un_rs_resync_done
;
2044 un
->un_rs_resync_done
= curblk
;
2047 if ((curblk
+ chunk
) > un
->c
.un_total_blocks
)
2048 chunk
= un
->c
.un_total_blocks
- curblk
;
2049 for (i
= 0, cnt
= 0; i
< NMIRROR
; i
++)
2050 if (SUBMIRROR_IS_WRITEABLE(un
, i
) &&
2051 !SMS_BY_INDEX_IS(un
, i
, SMS_ALL_ERRED
) &&
2052 (un
->un_sm
[i
].sm_flags
& MD_SM_RESYNC_TARGET
))
2055 broke_out
= RESYNC_ERR
;
2059 /* Check to see if we've completed the resync cleanly */
2060 if (un
->un_rs_thread_flags
& MD_RI_SHUTDOWN
)
2063 md_unit_readerexit(ui
);
2064 un
= (mm_unit_t
*)md_unit_writerlock(ui
);
2067 * If MN set send message to all nodes to indicate resync
2068 * phase is complete. The processing of the message will update the
2071 if (MD_MNSET_SETNO(setno
)) {
2072 send_mn_resync_done_message(un
, broke_out
);
2074 sm
->sm_flags
&= ~MD_SM_RESYNC_TARGET
;
2076 mirror_set_sm_state(sm
, smic
, SMS_ATTACHED
, 1);
2078 mirror_set_sm_state(sm
, smic
, SMS_RUNNING
, 0);
2080 un
->c
.un_status
&= ~MD_UN_WAR
;
2081 mirror_commit(un
, SMI2BIT(smi
), 0);
2084 /* For MN sets, resync NOTIFY is done when processing resync messages */
2085 if (!MD_MNSET_SETNO(setno
)) {
2087 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_FAILED
,
2088 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
2090 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_DONE
,
2091 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
2097 component_resync(mm_unit_t
*un
)
2100 mm_submirror_ic_t
*smic
;
2106 * Handle the case where we are picking up a partially complete
2107 * component resync. In this case un_rs_type contains the submirror
2108 * and component index of where we should restart the resync.
2110 while (un
->un_rs_type
!= MD_RS_COMPONENT
) {
2111 i
= RS_SMI(un
->un_rs_type
);
2112 ci
= RS_CI(un
->un_rs_type
);
2113 check_comp_4_resync(un
, i
, ci
);
2114 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)),
2118 * If we have no current resync, contine to scan submirror and
2119 * components. If the resync has moved on to another component,
2120 * restart it and if the resync is no longer a component
2123 if (RS_TYPE(un
->un_rs_type
) == MD_RS_NONE
)
2125 if (RS_TYPE(un
->un_rs_type
) != MD_RS_COMPONENT
)
2128 /* Now continue scanning _all_ submirrors and components */
2129 for (i
= 0; i
< NMIRROR
; i
++) {
2131 smic
= &un
->un_smic
[i
];
2132 if (!SMS_IS(sm
, SMS_RUNNING
| SMS_LIMPING
))
2134 compcnt
= (*(smic
->sm_get_component_count
))(sm
->sm_dev
, sm
);
2135 for (ci
= 0; ci
< compcnt
; ci
++) {
2136 SET_RS_SMI(un
->un_rs_type
, i
);
2137 SET_RS_CI(un
->un_rs_type
, ci
);
2138 SET_RS_TYPE(un
->un_rs_type
, MD_RS_COMPONENT
);
2139 check_comp_4_resync(un
, i
, ci
);
2140 /* Bail out if we've been asked to abort/shutdown */
2141 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)),
2145 * Now check if another node has continued with the
2146 * resync, if we are no longer in component resync,
2147 * exit, otherwise update to the current component - 1
2148 * so that the next call of check_comp_4 resync() will
2149 * resync the current component.
2151 if ((RS_TYPE(un
->un_rs_type
) != MD_RS_NONE
) &&
2152 (RS_TYPE(un
->un_rs_type
) != MD_RS_COMPONENT
))
2155 if (RS_SMI(un
->un_rs_type
) != i
) {
2156 i
= RS_SMI(un
->un_rs_type
);
2157 ci
= RS_CI(un
->un_rs_type
) - 1;
2158 } else if (RS_CI(un
->un_rs_type
) != ci
)
2159 ci
= RS_CI(un
->un_rs_type
) - 1;
2166 reset_comp_flags(mm_unit_t
*un
)
2169 mm_submirror_ic_t
*smic
;
2170 md_m_shared_t
*shared
;
2175 for (i
= 0; i
< NMIRROR
; i
++) {
2177 smic
= &un
->un_smic
[i
];
2178 if (!SMS_IS(sm
, SMS_INUSE
))
2180 compcnt
= (*(smic
->sm_get_component_count
))(sm
->sm_dev
, sm
);
2181 for (ci
= 0; ci
< compcnt
; ci
++) {
2182 shared
= (md_m_shared_t
*)(*(smic
->sm_shared_by_indx
))
2183 (sm
->sm_dev
, sm
, ci
);
2184 shared
->ms_flags
&= ~MDM_S_RS_TRIED
;
2190 * resync_progress_thread:
2191 * ----------------------
2192 * Thread started on first resync of a unit which simply blocks until woken up
2193 * by a cv_signal, and then updates the mddb for the mirror unit record. This
2194 * saves the resync progress information (un_rs_resync_done, un_rs_resync_2_do)
2195 * so that an aborted resync can be continued after an intervening reboot.
2198 resync_progress_thread(minor_t mnum
)
2200 mm_unit_t
*un
= MD_UNIT(mnum
);
2201 mdi_unit_t
*ui
= MDI_UNIT(mnum
);
2202 set_t setno
= MD_MIN2SET(mnum
);
2204 while (un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
) {
2205 mutex_enter(&un
->un_rs_progress_mx
);
2206 cv_wait(&un
->un_rs_progress_cv
, &un
->un_rs_progress_mx
);
2207 mutex_exit(&un
->un_rs_progress_mx
);
2208 if (un
->un_rs_progress_flags
& MD_RI_KILL
)
2212 * Commit mirror unit if we're the Master node in a multi-node
2215 if (MD_MNSET_SETNO(setno
) && md_set
[setno
].s_am_i_master
) {
2216 (void) md_unit_readerlock(ui
);
2217 mirror_commit(un
, NO_SUBMIRRORS
, 0);
2218 md_unit_readerexit(ui
);
2227 * Timeout handler for updating the progress of the resync thread.
2228 * Simply wake up the resync progress daemon which will then mirror_commit() the
2229 * unit structure to the mddb. This snapshots the current progress of the resync
2232 resync_progress(void *arg
)
2234 mm_unit_t
*un
= (mm_unit_t
*)arg
;
2235 mdi_unit_t
*ui
= MDI_UNIT(MD_SID(un
));
2238 mutex_enter(&un
->un_rs_progress_mx
);
2239 cv_signal(&un
->un_rs_progress_cv
);
2240 mutex_exit(&un
->un_rs_progress_mx
);
2242 /* schedule the next timeout if the resync is still marked active */
2243 (void) md_unit_readerlock(ui
);
2244 active
= un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
? 1 : 0;
2245 md_unit_readerexit(ui
);
2247 un
->un_rs_resync_to_id
= timeout(resync_progress
, un
,
2248 (clock_t)(drv_usectohz(60000000) *
2249 md_mirror_resync_update_intvl
));
2256 * Resync thread which drives all forms of resync (optimized, component,
2257 * submirror). Must handle thread suspension and kill to allow multi-node
2258 * resync to run without undue ownership changes.
2260 * For a MN set, the reync mechanism is as follows:
2262 * When a resync is started, either via metattach, metaonline, metareplace,
2263 * metasync or by a hotspare kicking in, a message is sent to all nodes, which
2264 * calls mirror_resync_thread. If there is currently no mirror owner, the
2265 * master node sends a CHOOSE_OWNER message to the handler on the master. This
2266 * chooses a mirror owner and sends a CHANGE_OWNER message requesting the
2267 * selected node to become the owner.
2268 * If this node is not the owner it sets itself to block in resync_kill_pending
2269 * and if there is no owner all nodes will block until the chosen owner is
2270 * selected, in which case it will unblock itself. So, on entry to this
2271 * function only one node will continue past resync_kill_pending().
2272 * Once the resync thread is started, it basically cycles through the optimized,
2273 * component and submirrors resyncs until there is no more work to do.
2275 * For an ABR mirror, once a mirror owner is chosen it will complete the resync
2276 * unless the nodes dies in which case a new owner will be chosen and it will
2277 * have to complete the resync from the point at which the previous owner died.
2278 * To do this we broadcast a RESYNC_NEXT message before each region to be
2279 * resynced and this message contains the address and length of the region
2280 * being resynced and the current progress through the resync. The size of
2281 * this region is MD_DEF_RESYNC_BLK_SZ blocks. It is larger than the resync
2282 * block size to limit the amount of inter node traffic. The RESYNC_NEXT
2283 * message also indicates to all other nodes that all writes to this block
2284 * must be blocked until the next RESYNC_NEXT message is received. This ensures
2285 * that no node can write to a block that is being resynced. For all MN
2286 * mirrors we also block the whole resync region on the resync owner node so
2287 * that all writes to the resync region are blocked on all nodes. There is a
2288 * difference here between a MN set and a regular set in that for a MN set
2289 * we protect the mirror from writes to the current resync block by blocking
2290 * a larger region. For a regular set we just block writes to the current
2293 * For a non-ABR mirror the same RESYNC_NEXT message is sent with an
2294 * additional purpose. In this case, there is only one mirror owner at a time
2295 * and rather than continually switching ownership between the chosen mirror
2296 * owner and the node that is writing to the mirror, we move the resync to the
2297 * mirror owner. When we switch ownership, we block the old owner and unblock
2298 * the resync thread on the new owner. To enable the new owner to continue the
2299 * resync, all nodes need to have the latest resync status. Then, following each
2300 * resync write, we check to see if the resync state has changed and if it
2301 * has this must be because we have lost ownership to another node(s) for a
2302 * period and then have become owner again later in the resync process. If we
2303 * are still dealing with the same resync, we just adjust addresses and counts
2304 * and then continue. If the resync has moved on to a different type, for
2305 * example from an optimized to a submirror resync, we move on to process the
2306 * resync described by rs_type and continue from the position described by
2307 * resync_done and resync_startbl.
2309 * Note that for non-ABR mirrors it is possible for a write to be made on a
2310 * non resync-owner node without a change of ownership. This is the case when
2311 * the mirror has a soft part created on it and a write in ABR mode is made
2312 * to that soft part. Therefore we still need to block writes to the resync
2313 * region on all nodes.
2315 * Sending the latest resync state to all nodes also enables them to continue
2316 * a resync in the event that the mirror owner dies. If a mirror owner for
2317 * a non-ABR mirror has died, there will be dirty resync regions. Therefore,
2318 * regardless of whether another type of resync was in progress, we must first
2319 * do an optimized resync to clean up the dirty regions before continuing
2320 * with the interrupted resync.
2322 * The resync status is held in the unit structure
2324 * un_rs_resync_done The number of contiguous resync blocks done so far
2325 * un_rs_resync_2_do The total number of contiguous resync blocks
2326 * un_rs_type The resync type (inc submirror and component numbers)
2328 * un_resync_startbl The address of the current resync block being processed
2330 * In the event that the whole cluster fails we need to just use
2331 * un_rs_resync_done to restart the resync and to ensure that this is
2332 * periodically written to disk, we have a thread which writes the record
2333 * to disk every 5 minutes. As the granularity of un_rs_resync_done is
2334 * usually coarse ( for an optimized resync 1001 is the max value) there is
2335 * little point in writing this more frequently.
2338 resync_unit(minor_t mnum
)
2342 md_error_t mde
= mdnullerror
;
2344 int resync_finish
= 0;
2345 set_t setno
= MD_MIN2SET(mnum
);
2346 uint_t old_rs_type
= MD_RS_NONE
;
2347 uint_t old_rs_done
= 0, old_rs_2_do
= 0;
2348 uint_t old_rs_startbl
= 0;
2349 int block_resync
= 1;
2350 char cpr_name
[23]; /* Unique CPR name */
2357 if (mirror_debug_flag
)
2358 printf("Resync started (mnum = %x)\n", mnum
);
2361 * increment the mirror resync count
2363 mutex_enter(&md_cpr_resync
.md_resync_mutex
);
2364 md_cpr_resync
.md_mirror_resync
++;
2365 mutex_exit(&md_cpr_resync
.md_resync_mutex
);
2367 ui
= MDI_UNIT(mnum
);
2370 rs_copysize
= un
->un_rs_copysize
;
2371 if (rs_copysize
== 0) {
2373 * Don't allow buffer size to fall outside the
2374 * range 0 < bufsize <= md_max_xfer_bufsz.
2376 if (md_resync_bufsz
<= 0)
2377 md_resync_bufsz
= MD_DEF_RESYNC_BUF_SIZE
;
2378 rs_copysize
= MIN(md_resync_bufsz
, md_max_xfer_bufsz
);
2380 rs_buffer
= kmem_zalloc(dbtob(rs_copysize
), KM_SLEEP
);
2381 un
= md_unit_writerlock(ui
);
2382 un
->un_rs_copysize
= rs_copysize
;
2383 un
->un_rs_buffer
= rs_buffer
;
2385 if (MD_MNSET_SETNO(setno
)) {
2387 * Register this resync thread with the CPR mechanism. This
2388 * allows us to detect when the system is suspended and so
2389 * keep track of the RPC failure condition.
2391 (void) snprintf(cpr_name
, sizeof (cpr_name
),
2392 "mirror_resync%x", mnum
);
2393 CALLB_CPR_INIT(&un
->un_rs_cprinfo
, &un
->un_rs_cpr_mx
,
2394 callb_md_mrs_cpr
, cpr_name
);
2396 if (ui
->ui_tstate
& MD_RESYNC_NOT_DONE
) {
2398 * If this is the first resync following the initial
2399 * snarf (MD_RESYNC_NOT_DONE still set) and we've
2400 * been started outside a reconfig step (e.g. by being
2401 * added to an existing set) we need to query the
2402 * existing submirror state for this mirror.
2403 * The set_status flags will have MD_MN_SET_MIR_STATE_RC
2404 * set if we've been through a step4 reconfig, so only
2405 * query the master if this isn't (yet) set. In this
2406 * case we must continue the resync thread as there is
2407 * not guaranteed to be a currently running resync on
2408 * any of the other nodes. Worst case is that we will
2409 * initiate an ownership change to this node and then
2410 * find that there is no resync to perform. However, we
2411 * will then have correct status across the cluster.
2413 if (!md_set
[setno
].s_am_i_master
) {
2414 if (!(md_get_setstatus(setno
) &
2415 MD_SET_MN_MIR_STATE_RC
)) {
2416 mirror_get_status(un
, NULL
);
2419 if (mirror_debug_flag
) {
2422 for (i
= 0; i
< NMIRROR
; i
++) {
2434 ui
->ui_tstate
&= ~MD_RESYNC_NOT_DONE
;
2437 * For MN set, if we have an owner, then start the resync on it.
2438 * If there is no owner the master must send a message to
2439 * choose the owner. This message will contain the current
2440 * resync count and it will only be sent to the master, where
2441 * the resync count will be used to choose the next node to
2442 * perform a resync, by cycling through the nodes in the set.
2443 * The message handler will then send a CHANGE_OWNER message to
2444 * all nodes, and on receipt of that message, the chosen owner
2445 * will issue a SET_OWNER ioctl to become the owner. This ioctl
2446 * will be requested to spawn a thread to issue the
2447 * REQUEST_OWNER message to become the owner which avoids the
2448 * need for concurrent ioctl requests.
2449 * After sending the message, we will block waiting for one
2450 * of the nodes to become the owner and start the resync
2452 if (MD_MN_NO_MIRROR_OWNER(un
)) {
2454 * There is no owner, block and then the master will
2455 * choose the owner. Only perform this if 'block_resync'
2459 mutex_enter(&un
->un_rs_thread_mx
);
2460 un
->un_rs_thread_flags
|= MD_RI_BLOCK_OWNER
;
2461 mutex_exit(&un
->un_rs_thread_mx
);
2463 if (md_set
[setno
].s_am_i_master
) {
2464 md_unit_writerexit(ui
);
2465 (void) mirror_choose_owner(un
, NULL
);
2466 (void) md_unit_writerlock(ui
);
2469 /* There is an owner, block if we are not it */
2470 if (!MD_MN_MIRROR_OWNER(un
)) {
2471 mutex_enter(&un
->un_rs_thread_mx
);
2472 un
->un_rs_thread_flags
|= MD_RI_BLOCK_OWNER
;
2473 mutex_exit(&un
->un_rs_thread_mx
);
2478 * Start a timeout chain to update the resync progress to the mddb.
2479 * This will run every md_mirror_resync_update_intvl minutes and allows
2480 * a resync to be continued over a reboot.
2482 ASSERT(un
->un_rs_resync_to_id
== 0);
2483 un
->un_rs_resync_to_id
= timeout(resync_progress
, un
,
2484 (clock_t)(drv_usectohz(60000000) * md_mirror_resync_update_intvl
));
2487 * Handle resync restart from the last logged position. The contents
2488 * of un_rs_resync_2_do and un_rs_resync_done are dependent on the
2489 * type of resync that was in progress.
2491 if (MD_MNSET_SETNO(setno
)) {
2492 switch ((uint_t
)RS_TYPE(un
->un_rs_type
)) {
2494 case MD_RS_OPTIMIZED
:
2495 case MD_RS_COMPONENT
:
2496 case MD_RS_SUBMIRROR
:
2500 un
->un_rs_type
= MD_RS_NONE
;
2502 /* Allocate a resync message, if required */
2503 if (un
->un_rs_msg
== NULL
) {
2504 un
->un_rs_msg
= (md_mn_msg_resync_t
*)kmem_zalloc(
2505 sizeof (md_mn_msg_resync_t
), KM_SLEEP
);
2510 /* Check to see if we've been requested to block/kill */
2511 if (resync_kill_pending(un
, ui
, MD_WRITER_HELD
)) {
2516 un
->un_rs_dropped_lock
= 0;
2518 * Always perform an optimized resync first as this will bring
2519 * the mirror into an available state in the shortest time.
2520 * If we are resuming an interrupted resync, other than an
2521 * optimized resync, we save the type and amount done so that
2522 * we can resume the appropriate resync after the optimized
2523 * resync has completed.
2525 if ((RS_TYPE(un
->un_rs_type
) != MD_RS_NONE
) &&
2526 (RS_TYPE(un
->un_rs_type
) != MD_RS_OPTIMIZED
)) {
2527 old_rs_type
= un
->un_rs_type
;
2528 old_rs_done
= un
->un_rs_resync_done
;
2529 old_rs_2_do
= un
->un_rs_resync_2_do
;
2530 old_rs_startbl
= un
->un_resync_startbl
;
2532 SET_RS_TYPE(un
->un_rs_type
, MD_RS_OPTIMIZED
);
2534 * If we are continuing a resync that is not an
2535 * OPTIMIZED one, then we start from the beginning when
2536 * doing this optimized resync
2538 if (RS_TYPE(old_rs_type
) != MD_RS_OPTIMIZED
) {
2539 un
->un_rs_resync_done
= 0;
2540 un
->un_rs_resync_2_do
= 0;
2541 un
->un_resync_startbl
= 0;
2543 optimized_resync(un
);
2544 /* Check to see if we've been requested to block/kill */
2545 if (resync_kill_pending(un
, ui
, MD_WRITER_HELD
)) {
2548 un
= (mm_unit_t
*)MD_UNIT(mnum
);
2550 * If another node has moved the resync on, we must
2551 * restart the correct resync
2554 (RS_TYPE(un
->un_rs_type
) != MD_RS_NONE
)) {
2555 old_rs_type
= un
->un_rs_type
;
2556 old_rs_done
= un
->un_rs_resync_done
;
2557 old_rs_2_do
= un
->un_rs_resync_2_do
;
2558 old_rs_startbl
= un
->un_resync_startbl
;
2562 * Restore previous resync progress or move onto a
2565 if (RS_TYPE(old_rs_type
) != MD_RS_NONE
) {
2566 un
->un_rs_type
= old_rs_type
;
2567 un
->un_rs_resync_done
= old_rs_done
;
2568 un
->un_rs_resync_2_do
= old_rs_2_do
;
2569 un
->un_resync_startbl
= old_rs_startbl
;
2571 un
->un_rs_type
= MD_RS_COMPONENT
;
2572 un
->un_rs_resync_done
= 0;
2573 un
->un_rs_resync_2_do
= 0;
2574 un
->un_resync_startbl
= 0;
2577 if (RS_TYPE(un
->un_rs_type
) == MD_RS_COMPONENT
) {
2578 component_resync(un
);
2579 /* Check to see if we've been requested to block/kill */
2580 if (resync_kill_pending(un
, ui
, MD_WRITER_HELD
)) {
2583 un
= (mm_unit_t
*)MD_UNIT(mnum
);
2585 * If we have moved on from a component resync, another
2586 * node must have completed it and started a submirror
2587 * resync, so leave the resync state alone. For non
2588 * multi-node sets we move onto the submirror resync.
2591 if (RS_TYPE(un
->un_rs_type
) == MD_RS_NONE
) {
2592 un
->un_rs_type
= MD_RS_SUBMIRROR
;
2593 un
->un_rs_resync_done
=
2594 un
->un_rs_resync_2_do
= 0;
2595 un
->un_resync_startbl
= 0;
2598 un
->un_rs_type
= MD_RS_SUBMIRROR
;
2599 un
->un_rs_resync_done
= 0;
2600 un
->un_rs_resync_2_do
= 0;
2601 un
->un_resync_startbl
= 0;
2604 if (RS_TYPE(un
->un_rs_type
) == MD_RS_SUBMIRROR
) {
2605 submirror_resync(un
);
2606 /* Check to see if we've been requested to block/kill */
2607 if (resync_kill_pending(un
, ui
, MD_WRITER_HELD
)) {
2610 un
= (mm_unit_t
*)MD_UNIT(mnum
);
2612 * If we have moved on from a submirror resync, another
2613 * node must have completed it and started a different
2614 * resync, so leave the resync state alone
2617 if (RS_TYPE(un
->un_rs_type
) == MD_RS_NONE
) {
2618 un
->un_rs_resync_done
=
2619 un
->un_rs_resync_2_do
= 0;
2620 un
->un_resync_startbl
= 0;
2623 /* If non-MN mirror, reinitialize state */
2624 un
->un_rs_type
= MD_RS_NONE
;
2625 un
->un_rs_resync_done
= 0;
2626 un
->un_rs_resync_2_do
= 0;
2627 un
->un_resync_startbl
= 0;
2630 } while (un
->un_rs_dropped_lock
);
2631 mutex_enter(&un
->un_rs_thread_mx
);
2632 un
->un_rs_thread_flags
|= MD_RI_SHUTDOWN
;
2633 mutex_exit(&un
->un_rs_thread_mx
);
2638 if (mirror_debug_flag
)
2639 printf("Resync stopped (mnum = %x), resync_finish = %d\n",
2640 mnum
, resync_finish
);
2642 kmem_free(un
->un_rs_buffer
, dbtob(un
->un_rs_copysize
));
2644 mutex_enter(&un
->un_rs_progress_mx
);
2645 un
->un_rs_progress_flags
|= MD_RI_KILL
;
2646 cv_signal(&un
->un_rs_progress_cv
);
2647 mutex_exit(&un
->un_rs_progress_mx
);
2650 * For MN Set, send a RESYNC_FINISH if this node completed the resync.
2651 * There is no need to grow unit here, it will be done in the
2652 * handler for the RESYNC_FINISH message together with resetting
2653 * MD_UN_RESYNC_ACTIVE.
2656 if (resync_finish
) {
2658 * Normal resync completion. Issue a RESYNC_FINISH
2659 * message if we're part of a multi-node set.
2661 md_mn_kresult_t
*kres
;
2662 md_mn_msg_resync_t
*rmsg
;
2665 rmsg
= (md_mn_msg_resync_t
*)un
->un_rs_msg
;
2666 md_unit_writerexit(ui
);
2668 rmsg
->msg_resync_mnum
= mnum
;
2669 rmsg
->msg_resync_type
= 0;
2670 rmsg
->msg_resync_done
= 0;
2671 rmsg
->msg_resync_2_do
= 0;
2672 rmsg
->msg_originator
= md_mn_mynode_id
;
2674 kres
= kmem_alloc(sizeof (md_mn_kresult_t
), KM_SLEEP
);
2677 mutex_enter(&un
->un_rs_cpr_mx
);
2678 CALLB_CPR_SAFE_BEGIN(&un
->un_rs_cprinfo
);
2680 rval
= mdmn_ksend_message(setno
,
2681 MD_MN_MSG_RESYNC_FINISH
, MD_MSGF_NO_LOG
, 0,
2682 (char *)rmsg
, sizeof (md_mn_msg_resync_t
), kres
);
2684 CALLB_CPR_SAFE_END(&un
->un_rs_cprinfo
,
2686 mutex_exit(&un
->un_rs_cpr_mx
);
2688 if (!MDMN_KSEND_MSG_OK(rval
, kres
)) {
2689 mdmn_ksend_show_error(rval
, kres
,
2691 /* If we're shutting down, pause things here. */
2692 if (kres
->kmmr_comm_state
== MDMNE_RPC_FAIL
) {
2693 while (!md_mn_is_commd_present()) {
2697 * commd is now available again. Retry
2698 * the message once. If this fails we
2699 * panic as the system is in an
2702 if (nretries
++ == 0)
2706 "ksend_message failure: RESYNC_FINISH");
2708 kmem_free(kres
, sizeof (md_mn_kresult_t
));
2709 (void) md_unit_writerlock(ui
);
2712 * If the resync has been cancelled, clear flags, reset owner
2713 * for ABR mirror and release the resync region parent
2716 if (un
->c
.un_status
& MD_UN_RESYNC_CANCEL
) {
2719 if (ui
->ui_tstate
& MD_ABR_CAP
) {
2720 /* Resync finished, if ABR set owner to NULL */
2721 mutex_enter(&un
->un_owner_mx
);
2722 un
->un_mirror_owner
= 0;
2723 mutex_exit(&un
->un_owner_mx
);
2726 un
->c
.un_status
&= ~(MD_UN_RESYNC_CANCEL
|
2727 MD_UN_RESYNC_ACTIVE
);
2728 ps
= un
->un_rs_prev_overlap
;
2730 /* Remove previous overlap resync region */
2731 if (ps
->ps_flags
& MD_MPS_ON_OVERLAP
)
2732 mirror_overlap_tree_remove(ps
);
2734 * Release the overlap range reference
2736 un
->un_rs_prev_overlap
= NULL
;
2737 kmem_cache_free(mirror_parent_cache
,
2743 * Release resync message buffer. This will be reallocated on
2744 * the next invocation of the resync_unit thread.
2746 if (un
->un_rs_msg
) {
2747 kmem_free(un
->un_rs_msg
, sizeof (md_mn_msg_resync_t
));
2748 un
->un_rs_msg
= NULL
;
2751 /* For non-MN sets deal with any pending grows */
2752 un
->c
.un_status
&= ~MD_UN_RESYNC_ACTIVE
;
2753 if (un
->c
.un_status
& MD_UN_GROW_PENDING
) {
2754 if ((mirror_grow_unit(un
, &mde
) != 0) ||
2755 (! mdismderror(&mde
, MDE_GROW_DELAYED
))) {
2756 un
->c
.un_status
&= ~MD_UN_GROW_PENDING
;
2761 reset_comp_flags(un
);
2762 un
->un_resync_completed
= 0;
2763 mirror_commit(un
, NO_SUBMIRRORS
, 0);
2764 md_unit_writerexit(ui
);
2767 * Stop the resync progress thread.
2769 if (un
->un_rs_resync_to_id
!= 0) {
2770 (void) untimeout(un
->un_rs_resync_to_id
);
2771 un
->un_rs_resync_to_id
= 0;
2775 * Calling mirror_internal_close() makes further reference to un / ui
2776 * dangerous. If we are the only consumer of the mirror it is possible
2777 * for a metaclear to be processed after completion of the m_i_c()
2778 * routine. As we need to handle the case where another resync has been
2779 * scheduled for the mirror, we raise the open count on the device
2780 * which protects against the close / metaclear / lock => panic scenario
2782 (void) md_unit_incopen(MD_SID(un
), FREAD
|FWRITE
, OTYP_LYR
);
2783 (void) mirror_internal_close(MD_SID(un
), OTYP_LYR
, 0, (IOLOCK
*)NULL
);
2786 * decrement the mirror resync count
2788 mutex_enter(&md_cpr_resync
.md_resync_mutex
);
2789 md_cpr_resync
.md_mirror_resync
--;
2790 mutex_exit(&md_cpr_resync
.md_resync_mutex
);
2793 * Remove the thread reference as we're about to exit. This allows a
2794 * subsequent mirror_resync_unit() to start a new thread.
2795 * If RESYNC_ACTIVE is set, mirror_resync_unit() must have been
2796 * called to start a new resync, so reopen the mirror and go back to
2799 (void) md_unit_writerlock(ui
);
2800 mutex_enter(&un
->un_rs_thread_mx
);
2801 un
->un_rs_thread_flags
&= ~(MD_RI_KILL
|MD_RI_SHUTDOWN
);
2802 mutex_exit(&un
->un_rs_thread_mx
);
2803 if (un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
) {
2804 md_unit_writerexit(ui
);
2805 if (mirror_internal_open(MD_SID(un
), (FREAD
|FWRITE
),
2806 OTYP_LYR
, 0, (IOLOCK
*)NULL
) == 0) {
2807 /* Release the reference grabbed above */
2808 (void) mirror_internal_close(MD_SID(un
), OTYP_LYR
, 0,
2810 goto resync_restart
;
2812 (void) md_unit_writerlock(ui
);
2814 "Could not open metadevice (%x) for resync\n",
2817 un
->un_rs_thread
= NULL
;
2818 md_unit_writerexit(ui
);
2821 * Check for hotspares once we've cleared the resync thread reference.
2822 * If there are any errored units a poke_hotspares() will result in
2823 * a call to mirror_resync_unit() which we need to allow to start.
2825 (void) poke_hotspares();
2828 * Remove this thread from the CPR callback table.
2831 mutex_enter(&un
->un_rs_cpr_mx
);
2832 CALLB_CPR_EXIT(&un
->un_rs_cprinfo
);
2836 * Remove the extra reference to the unit we generated above. After
2837 * this call it is *unsafe* to reference either ui or un as they may
2838 * no longer be allocated.
2840 (void) mirror_internal_close(MD_SID(un
), OTYP_LYR
, 0, (IOLOCK
*)NULL
);
2846 * mirror_resync_unit:
2847 * ------------------
2848 * Start a resync for the given mirror metadevice. Save the resync thread ID in
2849 * un->un_rs_thread for later manipulation.
2859 md_resync_ioctl_t
*ri
,
2866 set_t setno
= MD_MIN2SET(mnum
);
2868 ui
= MDI_UNIT(mnum
);
2870 if (md_get_setstatus(setno
) & MD_SET_STALE
)
2871 return (mdmddberror(ep
, MDE_DB_STALE
, mnum
, setno
));
2873 if (mirror_internal_open(mnum
, (FREAD
|FWRITE
), OTYP_LYR
, 0, lockp
)) {
2874 return (mdmderror(ep
, MDE_MIRROR_OPEN_FAILURE
, mnum
));
2877 un
= (mm_unit_t
*)md_ioctl_writerlock(lockp
, ui
);
2879 un
= (mm_unit_t
*)md_unit_writerlock(ui
);
2883 * Check to see if we're attempting to start a resync while one is
2886 if (un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
||
2887 un
->un_rs_thread
!= NULL
) {
2889 * Ensure RESYNC_ACTIVE set, it may not be if the resync thread
2890 * is in the process of terminating, setting the flag will
2891 * cause the resync thread to return to the beginning
2893 un
->c
.un_status
|= MD_UN_RESYNC_ACTIVE
;
2895 md_ioctl_writerexit(lockp
);
2897 md_unit_writerexit(ui
);
2899 (void) mirror_internal_close(mnum
, OTYP_LYR
, 0, lockp
);
2902 un
->c
.un_status
|= MD_UN_RESYNC_ACTIVE
;
2903 un
->c
.un_status
&= ~MD_UN_RESYNC_CANCEL
;
2904 if ((ri
) && (ri
->ri_copysize
> 0) &&
2905 (ri
->ri_copysize
<= md_max_xfer_bufsz
))
2906 un
->un_rs_copysize
= ri
->ri_copysize
;
2908 un
->un_rs_copysize
= 0;
2910 /* Start the resync progress thread off */
2911 un
->un_rs_progress_flags
= 0;
2912 (void) thread_create(NULL
, 0, resync_progress_thread
,
2913 (caddr_t
)(uintptr_t)mnum
, 0, &p0
, TS_RUN
, minclsyspri
);
2916 * We have to store the thread ID in the unit structure so do not
2917 * drop writerlock until the thread is active. This means resync_unit
2918 * may spin on its first md_unit_readerlock(), but deadlock won't occur.
2920 mutex_enter(&un
->un_rs_thread_mx
);
2921 un
->un_rs_thread_flags
&= ~(MD_RI_KILL
|MD_RI_SHUTDOWN
);
2922 mutex_exit(&un
->un_rs_thread_mx
);
2923 un
->un_rs_thread
= thread_create(NULL
, 0, resync_unit
,
2924 (caddr_t
)(uintptr_t)mnum
, 0, &p0
, TS_RUN
, 60);
2925 if (un
->un_rs_thread
== (kthread_id_t
)NULL
) {
2926 un
->c
.un_status
&= ~MD_UN_RESYNC_ACTIVE
;
2928 md_ioctl_writerexit(lockp
);
2930 md_unit_writerexit(ui
);
2932 (void) mirror_internal_close(mnum
, OTYP_LYR
, 0, lockp
);
2933 return (mdmderror(ep
, MDE_MIRROR_THREAD_FAILURE
, mnum
));
2936 md_ioctl_writerexit(lockp
);
2938 md_unit_writerexit(ui
);
2946 * mirror_ioctl_resync:
2947 * -------------------
2948 * Called as a result of an MD_IOCSETSYNC ioctl. Either start, block, unblock
2949 * or kill the resync thread associated with the specified unit.
2950 * Can return with locks held since mdioctl will free any locks
2951 * that are marked in lock->l_flags.
2958 mirror_ioctl_resync(
2959 md_resync_ioctl_t
*ri
,
2963 minor_t mnum
= ri
->ri_mnum
;
2967 mm_submirror_ic_t
*smic
;
2970 set_t setno
= MD_MIN2SET(mnum
);
2972 mdclrerror(&ri
->mde
);
2974 if ((setno
>= md_nsets
) ||
2975 (MD_MIN2UNIT(mnum
) >= md_nunits
)) {
2976 return (mdmderror(&ri
->mde
, MDE_INVAL_UNIT
, mnum
));
2979 /* RD_LOCK flag grabs the md_ioctl_readerlock */
2980 un
= mirror_getun(mnum
, &ri
->mde
, RD_LOCK
, lock
);
2983 return (mdmderror(&ri
->mde
, MDE_UNIT_NOT_SETUP
, mnum
));
2985 if (un
->c
.un_type
!= MD_METAMIRROR
) {
2986 return (mdmderror(&ri
->mde
, MDE_NOT_MM
, mnum
));
2988 if (un
->un_nsm
< 2) {
2993 * Determine the action to take based on the ri_flags field:
2994 * MD_RI_BLOCK: Block current resync thread
2995 * MD_RI_UNBLOCK: Unblock resync thread
2996 * MD_RI_KILL: Abort resync thread
2997 * MD_RI_RESYNC_FORCE_MNSTART: Directly start resync thread
2998 * without using rpc.mdcommd messages.
2999 * any other: Start resync thread
3001 switch (ri
->ri_flags
& (MD_RI_BLOCK
|MD_RI_UNBLOCK
|MD_RI_KILL
)) {
3004 /* Halt resync thread by setting flag in un_rs_flags */
3005 if (!(un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
)) {
3008 mutex_enter(&un
->un_rs_thread_mx
);
3009 un
->un_rs_thread_flags
|= MD_RI_BLOCK
;
3010 mutex_exit(&un
->un_rs_thread_mx
);
3015 * Restart resync thread by clearing flag in un_rs_flags and
3016 * cv_signal'ing the blocked thread.
3018 if (!(un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
)) {
3021 mutex_enter(&un
->un_rs_thread_mx
);
3022 un
->un_rs_thread_flags
&= ~MD_RI_BLOCK
;
3023 cv_signal(&un
->un_rs_thread_cv
);
3024 mutex_exit(&un
->un_rs_thread_mx
);
3028 /* Abort resync thread. */
3029 if (!(un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
)) {
3032 mutex_enter(&un
->un_rs_thread_mx
);
3033 tid
= un
->un_rs_thread
? (un
->un_rs_thread
)->t_did
: 0;
3034 un
->un_rs_thread_flags
&= ~(MD_RI_BLOCK
|MD_RI_BLOCK_OWNER
);
3035 un
->un_rs_thread_flags
|= MD_RI_KILL
;
3036 cv_signal(&un
->un_rs_thread_cv
);
3037 mutex_exit(&un
->un_rs_thread_mx
);
3039 if (!(ri
->ri_flags
& MD_RI_NO_WAIT
)) {
3040 md_ioctl_readerexit(lock
);
3042 un
->un_rs_thread_flags
&= ~MD_RI_KILL
;
3043 un
->un_rs_thread
= NULL
;
3044 cmn_err(CE_WARN
, "md: %s: Resync cancelled\n",
3045 md_shortname(MD_SID(un
)));
3051 md_ioctl_readerexit(lock
);
3054 for (smi
= 0; smi
< NMIRROR
; smi
++) {
3055 sm
= &un
->un_sm
[smi
];
3056 smic
= &un
->un_smic
[smi
];
3057 if (!SMS_IS(sm
, SMS_ATTACHED
))
3059 mirror_set_sm_state(sm
, smic
, SMS_ATTACHED_RESYNC
, 1);
3060 bits
|= SMI2BIT(smi
);
3063 mirror_commit(un
, bits
, 0);
3066 * If we are resyncing a mirror in a MN set and the rpc.mdcommd
3067 * can be used, we do not start the resync at this point.
3068 * Instead, the metasync command that issued the ioctl
3069 * will send a RESYNC_STARTING message to start the resync thread. The
3070 * reason we do it this way is to ensure that the metasync ioctl is
3071 * executed on all nodes before the resync thread is started.
3073 * If a MN set and the MD_RI_RESYNC_FORCE_MNSTART flag is set, then
3074 * don't use rpc.mdcommd, but just start the resync thread. This
3075 * flag is set on a node when it is being added to a diskset
3076 * so that the resync threads are started on the newly added node.
3078 if ((!(MD_MNSET_SETNO(setno
))) ||
3079 (ri
->ri_flags
& MD_RI_RESYNC_FORCE_MNSTART
)) {
3080 return (mirror_resync_unit(mnum
, ri
, &ri
->mde
, lock
));
3087 mirror_mark_resync_region_non_owner(struct mm_unit
*un
,
3088 diskaddr_t startblk
, diskaddr_t endblk
, md_mn_nodeid_t source_node
)
3094 md_mn_msg_rr_dirty_t
*rr
;
3095 md_mn_kresult_t
*kres
;
3096 set_t setno
= MD_UN2SET(un
);
3098 md_mn_nodeid_t node_idx
= source_node
- 1;
3099 mdi_unit_t
*ui
= MDI_UNIT(MD_SID(un
));
3100 md_mn_nodeid_t owner_node
;
3101 minor_t mnum
= MD_SID(un
);
3107 * Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
3108 * not, allocate it and then fill the [start..end] entries.
3109 * Update un_pernode_dirty_sum if we've gone 0->1.
3110 * Update un_dirty_bm if the corresponding entries are clear.
3112 rw_enter(&un
->un_pernode_dirty_mx
[node_idx
], RW_WRITER
);
3113 if (un
->un_pernode_dirty_bm
[node_idx
] == NULL
) {
3114 un
->un_pernode_dirty_bm
[node_idx
] =
3115 (uchar_t
*)kmem_zalloc(
3116 (uint_t
)howmany(un
->un_rrd_num
, NBBY
), KM_SLEEP
);
3118 rw_exit(&un
->un_pernode_dirty_mx
[node_idx
]);
3120 BLK_TO_RR(end_rr
, endblk
, un
);
3121 BLK_TO_RR(start_rr
, startblk
, un
);
3125 mutex_enter(&un
->un_resync_mx
);
3126 rw_enter(&un
->un_pernode_dirty_mx
[node_idx
], RW_READER
);
3127 for (current_rr
= start_rr
; current_rr
<= end_rr
; current_rr
++) {
3128 un
->un_outstanding_writes
[current_rr
]++;
3129 if (!IS_PERNODE_DIRTY(source_node
, current_rr
, un
)) {
3130 un
->un_pernode_dirty_sum
[current_rr
]++;
3131 SET_PERNODE_DIRTY(source_node
, current_rr
, un
);
3133 CLR_GOING_CLEAN(current_rr
, un
);
3134 if (!IS_REGION_DIRTY(current_rr
, un
)) {
3136 SET_REGION_DIRTY(current_rr
, un
);
3137 SET_GOING_DIRTY(current_rr
, un
);
3138 } else if (IS_GOING_DIRTY(current_rr
, un
))
3141 rw_exit(&un
->un_pernode_dirty_mx
[node_idx
]);
3142 mutex_exit(&un
->un_resync_mx
);
3149 * If we have dirty regions to commit, send a
3150 * message to the owning node so that the
3151 * in-core bitmap gets updated appropriately.
3152 * TODO: make this a kmem_cache pool to improve
3153 * alloc/free performance ???
3155 kres
= (md_mn_kresult_t
*)kmem_alloc(sizeof (md_mn_kresult_t
),
3157 rr
= (md_mn_msg_rr_dirty_t
*)kmem_alloc(sizeof (md_mn_msg_rr_dirty_t
),
3161 owner_node
= un
->un_mirror_owner
;
3164 rr
->rr_nodeid
= md_mn_mynode_id
;
3165 rr
->rr_range
= (ushort_t
)start_rr
<< 16;
3166 rr
->rr_range
|= (ushort_t
)end_rr
& 0xFFFF;
3168 /* release readerlock before sending message */
3169 md_unit_readerexit(ui
);
3171 rval
= mdmn_ksend_message(setno
, MD_MN_MSG_RR_DIRTY
,
3172 MD_MSGF_NO_LOG
|MD_MSGF_BLK_SIGNAL
|MD_MSGF_DIRECTED
,
3173 un
->un_mirror_owner
, (char *)rr
,
3174 sizeof (md_mn_msg_rr_dirty_t
), kres
);
3176 /* reacquire readerlock on message completion */
3177 (void) md_unit_readerlock(ui
);
3179 /* if the message send failed, note it, and pass an error back up */
3180 if (!MDMN_KSEND_MSG_OK(rval
, kres
)) {
3181 /* if commd is gone, no point in printing a message */
3182 if (md_mn_is_commd_present())
3183 mdmn_ksend_show_error(rval
, kres
, "RR_DIRTY");
3184 kmem_free(kres
, sizeof (md_mn_kresult_t
));
3185 kmem_free(rr
, sizeof (md_mn_msg_rr_dirty_t
));
3190 * if the owner changed while we were sending the message, and it's
3191 * not us, the new mirror owner won't yet have done the right thing
3192 * with our data. Let him know. If we became the owner, we'll
3193 * deal with that differently below. Note that receiving a message
3194 * about another node twice won't hurt anything.
3196 if (un
->un_mirror_owner
!= owner_node
&& !MD_MN_MIRROR_OWNER(un
))
3199 kmem_free(kres
, sizeof (md_mn_kresult_t
));
3200 kmem_free(rr
, sizeof (md_mn_msg_rr_dirty_t
));
3202 mutex_enter(&un
->un_resync_mx
);
3205 * If we became the owner while we were sending the message,
3206 * we have dirty bits in the un_pernode_bm that aren't yet reflected
3207 * in the un_dirty_bm, as it was re-read from disk, and our bits
3208 * are also not reflected in the on-disk DRL. Fix that now.
3210 if (MD_MN_MIRROR_OWNER(un
)) {
3211 rw_enter(&un
->un_pernode_dirty_mx
[node_idx
], RW_WRITER
);
3212 mirror_copy_rr(howmany(un
->un_rrd_num
, NBBY
),
3213 un
->un_pernode_dirty_bm
[node_idx
], un
->un_dirty_bm
);
3214 rw_exit(&un
->un_pernode_dirty_mx
[node_idx
]);
3216 un
->un_resync_flg
|= MM_RF_COMMITING
| MM_RF_GATECLOSED
;
3218 mutex_exit(&un
->un_resync_mx
);
3219 mddb_commitrec_wrapper(un
->un_rr_dirty_recid
);
3220 mutex_enter(&un
->un_resync_mx
);
3222 un
->un_resync_flg
&= ~(MM_RF_COMMITING
| MM_RF_GATECLOSED
);
3223 cv_broadcast(&un
->un_resync_cv
);
3226 for (current_rr
= start_rr
; current_rr
<= end_rr
; current_rr
++)
3227 CLR_GOING_DIRTY(current_rr
, un
);
3229 mutex_exit(&un
->un_resync_mx
);
3235 mirror_mark_resync_region_owner(struct mm_unit
*un
,
3236 diskaddr_t startblk
, diskaddr_t endblk
, md_mn_nodeid_t source_node
)
3242 int mnset
= MD_MNSET_SETNO(MD_UN2SET(un
));
3243 md_mn_nodeid_t node_idx
= source_node
- 1;
3249 * Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
3250 * not, allocate it and then fill the [start..end] entries.
3251 * Update un_pernode_dirty_sum if we've gone 0->1.
3252 * Update un_dirty_bm if the corresponding entries are clear.
3255 rw_enter(&un
->un_pernode_dirty_mx
[node_idx
], RW_WRITER
);
3256 if (un
->un_pernode_dirty_bm
[node_idx
] == NULL
) {
3257 un
->un_pernode_dirty_bm
[node_idx
] =
3258 (uchar_t
*)kmem_zalloc(
3259 (uint_t
)howmany(un
->un_rrd_num
, NBBY
), KM_SLEEP
);
3261 rw_exit(&un
->un_pernode_dirty_mx
[node_idx
]);
3264 mutex_enter(&un
->un_resync_mx
);
3267 rw_enter(&un
->un_pernode_dirty_mx
[node_idx
], RW_READER
);
3270 BLK_TO_RR(end_rr
, endblk
, un
);
3271 BLK_TO_RR(start_rr
, startblk
, un
);
3272 for (current_rr
= start_rr
; current_rr
<= end_rr
; current_rr
++) {
3273 if (!mnset
|| source_node
== md_mn_mynode_id
)
3274 un
->un_outstanding_writes
[current_rr
]++;
3276 if (!IS_PERNODE_DIRTY(source_node
, current_rr
, un
))
3277 un
->un_pernode_dirty_sum
[current_rr
]++;
3278 SET_PERNODE_DIRTY(source_node
, current_rr
, un
);
3280 CLR_GOING_CLEAN(current_rr
, un
);
3281 if (!IS_REGION_DIRTY(current_rr
, un
))
3283 if (IS_GOING_DIRTY(current_rr
, un
))
3288 rw_exit(&un
->un_pernode_dirty_mx
[node_idx
]);
3291 mutex_exit(&un
->un_resync_mx
);
3294 un
->un_waiting_to_mark
++;
3295 while (un
->un_resync_flg
& MM_RF_GATECLOSED
) {
3298 cv_wait(&un
->un_resync_cv
, &un
->un_resync_mx
);
3300 un
->un_waiting_to_mark
--;
3303 for (current_rr
= start_rr
; current_rr
<= end_rr
; current_rr
++) {
3304 if (!IS_REGION_DIRTY(current_rr
, un
)) {
3305 SET_REGION_DIRTY(current_rr
, un
);
3306 SET_GOING_DIRTY(current_rr
, un
);
3309 if (IS_GOING_DIRTY(current_rr
, un
))
3314 if (un
->un_waiting_to_mark
== 0 || un
->un_waiting_to_clear
!= 0)
3315 cv_broadcast(&un
->un_resync_cv
);
3316 mutex_exit(&un
->un_resync_mx
);
3320 un
->un_resync_flg
|= MM_RF_COMMIT_NEEDED
;
3321 un
->un_waiting_to_commit
++;
3322 while (un
->un_waiting_to_mark
!= 0 &&
3323 !(un
->un_resync_flg
& MM_RF_GATECLOSED
)) {
3326 cv_wait(&un
->un_resync_cv
, &un
->un_resync_mx
);
3329 if (un
->un_resync_flg
& MM_RF_COMMIT_NEEDED
) {
3330 un
->un_resync_flg
|= MM_RF_COMMITING
| MM_RF_GATECLOSED
;
3331 un
->un_resync_flg
&= ~MM_RF_COMMIT_NEEDED
;
3333 mutex_exit(&un
->un_resync_mx
);
3334 mddb_commitrec_wrapper(un
->un_rr_dirty_recid
);
3335 mutex_enter(&un
->un_resync_mx
);
3337 un
->un_resync_flg
&= ~MM_RF_COMMITING
;
3338 cv_broadcast(&un
->un_resync_cv
);
3340 while (un
->un_resync_flg
& MM_RF_COMMITING
) {
3343 cv_wait(&un
->un_resync_cv
, &un
->un_resync_mx
);
3346 for (current_rr
= start_rr
; current_rr
<= end_rr
; current_rr
++)
3347 CLR_GOING_DIRTY(current_rr
, un
);
3349 if (--un
->un_waiting_to_commit
== 0) {
3350 un
->un_resync_flg
&= ~MM_RF_GATECLOSED
;
3351 cv_broadcast(&un
->un_resync_cv
);
3353 mutex_exit(&un
->un_resync_mx
);
3359 mirror_mark_resync_region(struct mm_unit
*un
,
3360 diskaddr_t startblk
, diskaddr_t endblk
, md_mn_nodeid_t source_node
)
3362 int mnset
= MD_MNSET_SETNO(MD_UN2SET(un
));
3364 if (mnset
&& !MD_MN_MIRROR_OWNER(un
)) {
3365 return (mirror_mark_resync_region_non_owner(un
, startblk
,
3366 endblk
, source_node
));
3368 return (mirror_mark_resync_region_owner(un
, startblk
, endblk
,
3374 mirror_resize_resync_regions(mm_unit_t
*un
, diskaddr_t new_tb
)
3377 optim_resync_t
*orp
;
3379 uint_t old_nregions
, new_nregions
;
3380 int old_bm_size
, new_bm_size
;
3382 mddb_recid_t recid
, old_recid
;
3383 uchar_t
*old_dirty_bm
;
3386 set_t setno
= MD_UN2SET(un
);
3389 old_nregions
= un
->un_rrd_num
;
3390 new_nregions
= (uint_t
)((new_tb
/un
->un_rrd_blksize
) + 1);
3392 while (new_nregions
> MD_MAX_NUM_RR
) {
3397 new_bm_size
= howmany(new_nregions
, NBBY
);
3398 old_bm_size
= howmany(old_nregions
, NBBY
);
3400 size
= new_bm_size
+ sizeof (*orp
) - sizeof (orp
->or_rr
);
3402 typ1
= (mddb_type_t
)md_getshared_key(setno
,
3403 mirror_md_ops
.md_driver
.md_drivername
);
3404 recid
= mddb_createrec(size
, typ1
, RESYNC_REC
,
3405 MD_CRO_OPTIMIZE
|MD_CRO_32BIT
, setno
);
3409 orp
= (struct optim_resync
*)mddb_getrecaddr(recid
);
3410 ASSERT(orp
!= NULL
);
3412 orp
->or_magic
= OR_MAGIC
; /* Magic # */
3413 orp
->or_blksize
= un
->un_rrd_blksize
; /* Same block size */
3414 orp
->or_num
= new_nregions
; /* New number of regions */
3416 old_dirty_bm
= un
->un_dirty_bm
;
3417 un
->un_dirty_bm
= orp
->or_rr
;
3419 kmem_free((caddr_t
)un
->un_goingdirty_bm
, old_bm_size
);
3420 un
->un_goingdirty_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3422 kmem_free((caddr_t
)un
->un_goingclean_bm
, old_bm_size
);
3423 un
->un_goingclean_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3425 kmem_free((caddr_t
)un
->un_resync_bm
, old_bm_size
);
3426 un
->un_resync_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3428 owp
= un
->un_outstanding_writes
;
3429 un
->un_outstanding_writes
= (short *)kmem_zalloc(
3430 new_nregions
* sizeof (short), KM_SLEEP
);
3432 old_pns
= un
->un_pernode_dirty_sum
;
3434 un
->un_pernode_dirty_sum
= (uchar_t
*)kmem_zalloc(new_nregions
,
3438 * Now translate the old records into the new
3441 for (i
= 0; i
< old_nregions
; i
++) {
3443 * only bring forward the
3444 * outstanding write counters and the dirty bits and also
3445 * the pernode_summary counts
3447 if (!isset(old_dirty_bm
, i
))
3450 setbit(un
->un_dirty_bm
, (i
/ rr_mult
));
3451 un
->un_outstanding_writes
[(i
/ rr_mult
)] += owp
[i
];
3453 un
->un_pernode_dirty_sum
[(i
/ rr_mult
)] += old_pns
[i
];
3455 kmem_free((caddr_t
)owp
, old_nregions
* sizeof (short));
3457 kmem_free((caddr_t
)old_pns
, old_nregions
);
3460 * Copy all non-zero un_pernode_dirty_bm[] arrays to new versions
3462 for (j
= 0; j
< MD_MNMAXSIDES
; j
++) {
3463 rw_enter(&un
->un_pernode_dirty_mx
[j
], RW_WRITER
);
3464 old_dirty_bm
= un
->un_pernode_dirty_bm
[j
];
3466 un
->un_pernode_dirty_bm
[j
] = (uchar_t
*)kmem_zalloc(
3467 new_bm_size
, KM_SLEEP
);
3468 for (i
= 0; i
< old_nregions
; i
++) {
3469 if (!isset(old_dirty_bm
, i
))
3472 setbit(un
->un_pernode_dirty_bm
[j
],
3475 kmem_free((caddr_t
)old_dirty_bm
, old_bm_size
);
3477 rw_exit(&un
->un_pernode_dirty_mx
[j
]);
3480 /* Save the old record id */
3481 old_recid
= un
->un_rr_dirty_recid
;
3483 /* Update the mirror unit struct */
3484 un
->un_rr_dirty_recid
= recid
;
3485 un
->un_rrd_num
= new_nregions
;
3486 un
->un_rrd_blksize
= un
->un_rrd_blksize
* rr_mult
;
3488 orp
->or_blksize
= un
->un_rrd_blksize
;
3491 * NOTE: The reason there are distinct calls to mddb_commitrec_wrapper
3492 * instead of using mddb_commitrecs_wrapper, is that you cannot
3493 * atomically commit optimized records.
3495 mddb_commitrec_wrapper(recid
);
3496 mddb_commitrec_wrapper(un
->c
.un_record_id
);
3497 mddb_deleterec_wrapper(old_recid
);
3501 /* lockp can be NULL for !MN diksets */
3503 mirror_add_resync_regions(mm_unit_t
*un
, diskaddr_t new_tb
)
3507 optim_resync_t
*orp
;
3508 uint_t old_nregions
, new_nregions
;
3509 int old_bm_size
, new_bm_size
;
3511 mddb_recid_t recid
, old_recid
;
3513 set_t setno
= MD_UN2SET(un
);
3516 old_nregions
= un
->un_rrd_num
;
3517 new_nregions
= (uint_t
)((new_tb
/un
->un_rrd_blksize
) + 1);
3519 new_bm_size
= howmany(new_nregions
, NBBY
);
3520 old_bm_size
= howmany(old_nregions
, NBBY
);
3522 size
= new_bm_size
+ sizeof (*orp
) - sizeof (orp
->or_rr
);
3524 typ1
= (mddb_type_t
)md_getshared_key(setno
,
3525 mirror_md_ops
.md_driver
.md_drivername
);
3527 recid
= mddb_createrec(size
, typ1
, RESYNC_REC
,
3528 MD_CRO_OPTIMIZE
|MD_CRO_32BIT
, setno
);
3532 orp
= (struct optim_resync
*)mddb_getrecaddr(recid
);
3533 ASSERT(orp
!= NULL
);
3535 orp
->or_magic
= OR_MAGIC
; /* Magic # */
3536 orp
->or_blksize
= un
->un_rrd_blksize
; /* Same block size */
3537 orp
->or_num
= new_nregions
; /* New number of regions */
3539 /* Copy the old bm over the new bm */
3540 bcopy((caddr_t
)un
->un_dirty_bm
, (caddr_t
)orp
->or_rr
, old_bm_size
);
3543 * Create new bigger incore arrays, copy, and free old ones:
3547 * un_outstanding_writes
3548 * un_pernode_dirty_sum
3549 * un_pernode_dirty_bm[]
3551 old
= un
->un_goingdirty_bm
;
3552 un
->un_goingdirty_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3553 bcopy((caddr_t
)old
, (caddr_t
)un
->un_goingdirty_bm
, old_bm_size
);
3554 kmem_free((caddr_t
)old
, old_bm_size
);
3556 old
= un
->un_goingclean_bm
;
3557 un
->un_goingclean_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3558 bcopy((caddr_t
)old
, (caddr_t
)un
->un_goingclean_bm
, old_bm_size
);
3559 kmem_free((caddr_t
)old
, old_bm_size
);
3561 old
= un
->un_resync_bm
;
3562 un
->un_resync_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3563 bcopy((caddr_t
)old
, (caddr_t
)un
->un_resync_bm
, old_bm_size
);
3564 kmem_free((caddr_t
)old
, old_bm_size
);
3566 owp
= un
->un_outstanding_writes
;
3567 un
->un_outstanding_writes
= (short *)kmem_zalloc(
3568 (uint_t
)new_nregions
* sizeof (short), KM_SLEEP
);
3569 bcopy((caddr_t
)owp
, (caddr_t
)un
->un_outstanding_writes
,
3570 old_nregions
* sizeof (short));
3571 kmem_free((caddr_t
)owp
, (old_nregions
* sizeof (short)));
3573 old
= un
->un_pernode_dirty_sum
;
3575 un
->un_pernode_dirty_sum
= (uchar_t
*)kmem_zalloc(
3576 new_nregions
, KM_SLEEP
);
3577 bcopy((caddr_t
)old
, (caddr_t
)un
->un_pernode_dirty_sum
,
3579 kmem_free((caddr_t
)old
, old_nregions
);
3582 for (i
= 0; i
< MD_MNMAXSIDES
; i
++) {
3583 rw_enter(&un
->un_pernode_dirty_mx
[i
], RW_WRITER
);
3584 old
= un
->un_pernode_dirty_bm
[i
];
3586 un
->un_pernode_dirty_bm
[i
] = (uchar_t
*)kmem_zalloc(
3587 new_bm_size
, KM_SLEEP
);
3588 bcopy((caddr_t
)old
, (caddr_t
)un
->un_pernode_dirty_bm
[i
],
3590 kmem_free((caddr_t
)old
, old_bm_size
);
3592 rw_exit(&un
->un_pernode_dirty_mx
[i
]);
3595 /* Save the old record id */
3596 old_recid
= un
->un_rr_dirty_recid
;
3598 /* Update the mirror unit struct */
3599 un
->un_rr_dirty_recid
= recid
;
3600 un
->un_rrd_num
= new_nregions
;
3601 un
->un_dirty_bm
= orp
->or_rr
;
3604 * NOTE: The reason there are distinct calls to mddb_commitrec_wrapper
3605 * instead of using mddb_commitrecs_wrapper, is that you cannot
3606 * atomically commit optimized records.
3608 mddb_commitrec_wrapper(recid
);
3609 mddb_commitrec_wrapper(un
->c
.un_record_id
);
3610 mddb_deleterec_wrapper(old_recid
);
3617 * Combine the dirty record bitmap with the in-core resync bitmap. This allows
3618 * us to carry a resync over an ownership change.
3621 mirror_copy_rr(int sz
, uchar_t
*src
, uchar_t
*dest
)
3625 for (i
= 0; i
< sz
; i
++)
3630 * mirror_set_dirty_rr:
3631 * -------------------
3632 * Set the pernode_dirty_bm[node] entries and un_dirty_bm[] if appropriate.
3633 * For the owning node (DRL/mirror owner) update the on-disk RR if needed.
3634 * Called on every clean->dirty transition for the originating writer node.
3635 * Note: only the non-owning nodes will initiate this message and it is only
3636 * the owning node that has to process it.
3639 mirror_set_dirty_rr(md_mn_rr_dirty_params_t
*iocp
)
3642 minor_t mnum
= iocp
->rr_mnum
;
3644 int start
= (int)iocp
->rr_start
;
3645 int end
= (int)iocp
->rr_end
;
3646 set_t setno
= MD_MIN2SET(mnum
);
3647 md_mn_nodeid_t orignode
= iocp
->rr_nodeid
; /* 1-based */
3648 diskaddr_t startblk
, endblk
;
3650 mdclrerror(&iocp
->mde
);
3652 if ((setno
>= md_nsets
) ||
3653 (MD_MIN2UNIT(mnum
) >= md_nunits
)) {
3654 return (mdmderror(&iocp
->mde
, MDE_INVAL_UNIT
, mnum
));
3657 /* Must have _NO_ ioctl lock set if we update the RR on-disk */
3658 un
= mirror_getun(mnum
, &iocp
->mde
, NO_LOCK
, NULL
);
3661 return (mdmderror(&iocp
->mde
, MDE_UNIT_NOT_SETUP
, mnum
));
3663 if (un
->c
.un_type
!= MD_METAMIRROR
) {
3664 return (mdmderror(&iocp
->mde
, MDE_NOT_MM
, mnum
));
3666 if (orignode
< 1 || orignode
>= MD_MNMAXSIDES
) {
3667 return (mdmderror(&iocp
->mde
, MDE_INVAL_UNIT
, mnum
));
3669 if (un
->un_nsm
< 2) {
3674 * Only process this message if we're the owner of the mirror.
3676 if (!MD_MN_MIRROR_OWNER(un
)) {
3680 RR_TO_BLK(startblk
, start
, un
);
3681 RR_TO_BLK(endblk
, end
, un
);
3682 return (mirror_mark_resync_region_owner(un
, startblk
, endblk
,
3687 * mirror_clean_rr_bits:
3688 * --------------------
3689 * Clear the pernode_dirty_bm[node] entries which are passed in the bitmap
3690 * Once _all_ references are removed (pernode_dirty_count[x] == 0) this region
3691 * is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all
3692 * nodes. Callable from ioctl / interrupt / whatever context.
3693 * un_resync_mx is held on entry.
3696 mirror_clean_rr_bits(
3697 md_mn_rr_clean_params_t
*iocp
)
3699 minor_t mnum
= iocp
->rr_mnum
;
3701 uint_t cleared_bits
;
3702 md_mn_nodeid_t node
= iocp
->rr_nodeid
- 1;
3703 md_mn_nodeid_t orignode
= iocp
->rr_nodeid
;
3706 un
= mirror_getun(mnum
, &iocp
->mde
, NO_LOCK
, NULL
);
3709 start
= MDMN_RR_CLEAN_PARAMS_START_BIT(iocp
);
3710 end
= start
+ MDMN_RR_CLEAN_PARAMS_DATA_BYTES(iocp
) * NBBY
;
3711 rw_enter(&un
->un_pernode_dirty_mx
[node
], RW_READER
);
3712 for (i
= start
; i
< end
; i
++) {
3713 if (isset(MDMN_RR_CLEAN_PARAMS_DATA(iocp
), i
- start
)) {
3714 if (IS_PERNODE_DIRTY(orignode
, i
, un
)) {
3715 un
->un_pernode_dirty_sum
[i
]--;
3716 CLR_PERNODE_DIRTY(orignode
, i
, un
);
3718 if (un
->un_pernode_dirty_sum
[i
] == 0) {
3720 CLR_REGION_DIRTY(i
, un
);
3721 CLR_GOING_CLEAN(i
, un
);
3725 rw_exit(&un
->un_pernode_dirty_mx
[node
]);
3728 * We can only be called iff we are the mirror owner, however
3729 * as this is a (potentially) decoupled routine the ownership
3730 * may have moved from us by the time we get to execute the
3731 * bit clearing. Hence we still need to check for being the
3732 * owner before flushing the DRL to the replica.
3734 if (MD_MN_MIRROR_OWNER(un
)) {
3735 mutex_exit(&un
->un_resync_mx
);
3736 mddb_commitrec_wrapper(un
->un_rr_dirty_recid
);
3737 mutex_enter(&un
->un_resync_mx
);
3745 * Service routine for clearing the DRL bits on a deferred MD_MN_RR_CLEAN call
3746 * We need to obtain exclusive access to the un_resync_cv and then clear the
3748 * On completion, we must also free the passed in argument as it is allocated
3749 * at the end of the ioctl handler and won't be freed on completion.
3752 mirror_drl_task(void *arg
)
3754 md_mn_rr_clean_params_t
*iocp
= (md_mn_rr_clean_params_t
*)arg
;
3755 minor_t mnum
= iocp
->rr_mnum
;
3758 un
= mirror_getun(mnum
, &iocp
->mde
, NO_LOCK
, NULL
);
3760 mutex_enter(&un
->un_rrp_inflight_mx
);
3761 mutex_enter(&un
->un_resync_mx
);
3762 un
->un_waiting_to_clear
++;
3763 while (un
->un_resync_flg
& MM_RF_STALL_CLEAN
)
3764 cv_wait(&un
->un_resync_cv
, &un
->un_resync_mx
);
3765 un
->un_waiting_to_clear
--;
3767 un
->un_resync_flg
|= MM_RF_GATECLOSED
;
3768 mirror_clean_rr_bits(iocp
);
3769 un
->un_resync_flg
&= ~MM_RF_GATECLOSED
;
3770 if (un
->un_waiting_to_mark
!= 0 || un
->un_waiting_to_clear
!= 0) {
3771 cv_broadcast(&un
->un_resync_cv
);
3773 mutex_exit(&un
->un_resync_mx
);
3774 mutex_exit(&un
->un_rrp_inflight_mx
);
3776 kmem_free((caddr_t
)iocp
, MDMN_RR_CLEAN_PARAMS_SIZE(iocp
));
3780 * mirror_set_clean_rr:
3781 * -------------------
3782 * Clear the pernode_dirty_bm[node] entries which are passed in the bitmap
3783 * Once _all_ references are removed (pernode_dirty_count[x] == 0) this region
3784 * is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all
3787 * Only the mirror-owner need process this message as it is the only RR updater.
3788 * Non-owner nodes issue this request, but as we have no point-to-point message
3789 * support we will receive the message on all nodes.
3792 mirror_set_clean_rr(md_mn_rr_clean_params_t
*iocp
)
3795 minor_t mnum
= iocp
->rr_mnum
;
3797 set_t setno
= MD_MIN2SET(mnum
);
3798 md_mn_nodeid_t node
= iocp
->rr_nodeid
- 1;
3800 md_mn_rr_clean_params_t
*newiocp
;
3803 mdclrerror(&iocp
->mde
);
3805 if ((setno
>= md_nsets
) ||
3806 (MD_MIN2UNIT(mnum
) >= md_nunits
)) {
3807 return (mdmderror(&iocp
->mde
, MDE_INVAL_UNIT
, mnum
));
3810 /* Must have _NO_ ioctl lock set if we update the RR on-disk */
3811 un
= mirror_getun(mnum
, &iocp
->mde
, NO_LOCK
, NULL
);
3814 return (mdmderror(&iocp
->mde
, MDE_UNIT_NOT_SETUP
, mnum
));
3816 if (un
->c
.un_type
!= MD_METAMIRROR
) {
3817 return (mdmderror(&iocp
->mde
, MDE_NOT_MM
, mnum
));
3819 if (un
->un_nsm
< 2) {
3824 * Check to see if we're the mirror owner. If not, there's nothing
3827 if (!MD_MN_MIRROR_OWNER(un
)) {
3832 * Process the to-be-cleaned bitmap. We need to update the pernode_dirty
3833 * bits and pernode_dirty_sum[n], and if, and only if, the sum goes 0
3834 * we can then mark the un_dirty_bm entry as GOINGCLEAN. Alternatively
3835 * we can just defer this cleaning until the next process_resync_regions
3838 rw_enter(&un
->un_pernode_dirty_mx
[node
], RW_WRITER
);
3839 if (un
->un_pernode_dirty_bm
[node
] == NULL
) {
3840 un
->un_pernode_dirty_bm
[node
] = (uchar_t
*)kmem_zalloc(
3841 howmany(un
->un_rrd_num
, NBBY
), KM_SLEEP
);
3843 rw_exit(&un
->un_pernode_dirty_mx
[node
]);
3846 * See if we can simply clear the un_dirty_bm[] entries. If we're not
3847 * the issuing node _and_ we aren't in the process of marking/clearing
3848 * the RR bitmaps, we can simply update the bits as needed.
3849 * If we're the owning node and _not_ the issuing node, we should also
3850 * sync the RR if we clear any bits in it.
3852 mutex_enter(&un
->un_resync_mx
);
3853 can_clear
= (un
->un_resync_flg
& MM_RF_STALL_CLEAN
) ? 0 : 1;
3855 un
->un_resync_flg
|= MM_RF_GATECLOSED
;
3856 mirror_clean_rr_bits(iocp
);
3857 un
->un_resync_flg
&= ~MM_RF_GATECLOSED
;
3858 if (un
->un_waiting_to_mark
!= 0 ||
3859 un
->un_waiting_to_clear
!= 0) {
3860 cv_broadcast(&un
->un_resync_cv
);
3863 mutex_exit(&un
->un_resync_mx
);
3866 * If we couldn't clear the bits, due to DRL update from m_m_r_r / p_r_r
3867 * we must schedule a blocking call to update the DRL on this node.
3868 * As we're invoked from an ioctl we are going to have the original data
3869 * disappear (kmem_free) once we return. So, copy the data into a new
3870 * structure and let the taskq routine release it on completion.
3873 size_t sz
= MDMN_RR_CLEAN_PARAMS_SIZE(iocp
);
3875 newiocp
= (md_mn_rr_clean_params_t
*)kmem_alloc(sz
, KM_SLEEP
);
3877 bcopy(iocp
, newiocp
, sz
);
3879 if (ddi_taskq_dispatch(un
->un_drl_task
, mirror_drl_task
,
3880 newiocp
, DDI_NOSLEEP
) != DDI_SUCCESS
) {
3881 kmem_free(newiocp
, sz
);
3882 rval
= ENOMEM
; /* probably starvation */