/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
27 #include <sys/param.h>
28 #include <sys/systm.h>
33 #include <sys/t_lock.h>
39 #include <sys/cmn_err.h>
40 #include <sys/sysmacros.h>
41 #include <sys/types.h>
42 #include <sys/mkdev.h>
46 #include <sys/lvm/md_mirror.h>
47 #include <sys/modctl.h>
49 #include <sys/sunddi.h>
50 #include <sys/debug.h>
51 #include <sys/callb.h>
53 #include <sys/sysevent/eventdefs.h>
54 #include <sys/sysevent/svm.h>
55 #include <sys/lvm/mdmn_commd.h>
58 extern kmutex_t md_status_mx
;
59 extern kmutex_t md_mx
;
61 extern unit_t md_nunits
;
62 extern set_t md_nsets
;
63 extern md_set_t md_set
[];
64 extern major_t md_major
;
66 extern md_ops_t mirror_md_ops
;
67 extern kmem_cache_t
*mirror_child_cache
; /* mirror child memory pool */
68 extern mdq_anchor_t md_mto_daemon
;
69 extern daemon_request_t mirror_timeout
;
70 extern md_resync_t md_cpr_resync
;
72 extern int md_mtioctl_cnt
;
74 extern kmem_cache_t
*mirror_parent_cache
;
76 extern int mirror_debug_flag
;
80 * Tunable resync thread timeout. This is used as the time interval for updating
81 * the resync progress to the mddb. This allows restartable resyncs to be
82 * continued across a system reboot.
83 * Default is to update the resync progress every 5 minutes.
85 int md_mirror_resync_update_intvl
= MD_DEF_MIRROR_RESYNC_INTVL
;
88 * Settable mirror resync buffer size. Specified in 512 byte
89 * blocks. This is set to MD_DEF_RESYNC_BUF_SIZE by default.
91 int md_resync_bufsz
= MD_DEF_RESYNC_BUF_SIZE
;
94 * Tunables for dirty region processing when
95 * closing down a mirror.
97 * Dirty region processing during close of a
98 * mirror is basically monitoring the state
99 * of the resync region bitmaps and the number
100 * of outstanding i/o's per submirror to
101 * determine that there are no more dirty
104 * The approach taken is a retry logic over
105 * md_mirror_rr_cleans iterations to monitor
108 * There are two methods of polling the progress
109 * on dirty bitmap processing: busy-waits and
112 * Busy-waits are used at the beginning to
113 * determine the final state as quick as
114 * possible; md_mirror_rr_polls defines the
115 * number of busy-waits.
117 * In case the number of busy-waits got exhausted
118 * with dirty regions left over, the retry logic
119 * switches over to non-busy-waits, thus giving
120 * relief to an obviously heavily loaded system.
121 * The timeout value is defined by the tunable
122 * md_mirror_rr_sleep_timo in seconds.
124 * The number of non-busy-waits is given by:
125 * md_mirror_rr_cleans - md_mirror_rr_polls.
127 * The values were found by testing on a
128 * 'typical' system and may require tuning
129 * to meet specific customer's requirements.
132 int md_mirror_rr_cleans
= 13;
133 int md_mirror_rr_polls
= 3;
134 int md_mirror_rr_sleep_timo
= 1;
137 * The value is not #defined because it will be computed
140 int md_max_xfer_bufsz
= 2048;
143 * mirror_generate_rr_bitmap:
144 * -------------------
145 * Generate a compressed bitmap md_mn_msg_rr_clean_t for the given clean
146 * bitmap associated with mirror 'un'
149 * un - mirror unit to get bitmap data from
150 * *msgp - location to return newly allocated md_mn_msg_rr_clean_t
151 * *activep- location to return # of active i/os
154 * 1 => dirty bits cleared from un_dirty_bm and DRL flush required
155 * *msgp contains bitmap of to-be-cleared bits
156 * 0 => no bits cleared
160 mirror_generate_rr_bitmap(mm_unit_t
*un
, md_mn_msg_rr_clean_t
**msgp
,
163 unsigned int i
, next_bit
, data_bytes
, start_bit
;
164 int cleared_dirty
= 0;
166 /* Skip any initial 0s. */
168 if ((start_bit
= un
->un_rr_clean_start_bit
) >= un
->un_rrd_num
)
169 un
->un_rr_clean_start_bit
= start_bit
= 0;
172 * Handle case where NO bits are set in PERNODE_DIRTY but the
173 * un_dirty_bm[] map does have entries set (after a 1st resync)
175 for (; start_bit
< un
->un_rrd_num
&&
176 !IS_PERNODE_DIRTY(md_mn_mynode_id
, start_bit
, un
) &&
177 (un
->un_pernode_dirty_sum
[start_bit
] != (uchar_t
)0); start_bit
++)
180 if (start_bit
>= un
->un_rrd_num
) {
181 if (un
->un_rr_clean_start_bit
== 0) {
184 un
->un_rr_clean_start_bit
= 0;
185 goto retry_dirty_scan
;
189 /* how much to fit into this message */
190 data_bytes
= MIN(howmany(un
->un_rrd_num
- start_bit
, NBBY
),
191 MDMN_MSG_RR_CLEAN_DATA_MAX_BYTES
);
193 (*msgp
) = kmem_zalloc(MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes
),
196 (*msgp
)->rr_nodeid
= md_mn_mynode_id
;
197 (*msgp
)->rr_mnum
= MD_SID(un
);
198 MDMN_MSG_RR_CLEAN_START_SIZE_SET(*msgp
, start_bit
, data_bytes
);
200 next_bit
= MIN(start_bit
+ data_bytes
* NBBY
, un
->un_rrd_num
);
202 for (i
= start_bit
; i
< next_bit
; i
++) {
203 if (un
->c
.un_status
& MD_UN_KEEP_DIRTY
&& IS_KEEPDIRTY(i
, un
)) {
206 if (!IS_REGION_DIRTY(i
, un
)) {
209 if (un
->un_outstanding_writes
[i
] != 0) {
215 * Handle the case where a resync has completed and we still
216 * have the un_dirty_bm[] entries marked as dirty (these are
217 * the most recent DRL re-read from the replica). They need
218 * to be cleared from our un_dirty_bm[] but they will not have
219 * corresponding un_pernode_dirty[] entries set unless (and
220 * until) further write()s have been issued to the area.
221 * This handles the case where only the un_dirty_bm[] entry is
222 * set. Without this we'd not clear this region until a local
223 * write is issued to the affected area.
225 if (IS_PERNODE_DIRTY(md_mn_mynode_id
, i
, un
) ||
226 (un
->un_pernode_dirty_sum
[i
] == (uchar_t
)0)) {
227 if (!IS_GOING_CLEAN(i
, un
)) {
228 SET_GOING_CLEAN(i
, un
);
233 * Now we've got a flagged pernode_dirty, _or_ a clean
234 * bitmap entry to process. Update the bitmap to flush
235 * the REGION_DIRTY / GOING_CLEAN bits when we send the
236 * cross-cluster message.
239 setbit(MDMN_MSG_RR_CLEAN_DATA(*msgp
), i
- start_bit
);
242 * Not marked as active in the pernode bitmap, so skip
243 * any update to this. We just increment the 0 count
244 * and adjust the active count by any outstanding
245 * un_pernode_dirty_sum[] entries. This means we don't
246 * leave the mirror permanently dirty.
248 (*activep
) += (int)un
->un_pernode_dirty_sum
[i
];
251 if (!cleared_dirty
) {
252 kmem_free(*msgp
, MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes
));
255 un
->un_rr_clean_start_bit
= next_bit
;
256 return (cleared_dirty
);
260 * There are three paths into here:
262 * md_daemon -> check_resync_regions -> prr
263 * mirror_internal_close -> mirror_process_unit_resync -> prr
264 * mirror_set_capability -> mirror_process_unit_resync -> prr
266 * The first one is a kernel daemon, the other two result from system calls.
267 * Thus, only the first case needs to deal with kernel CPR activity. This
268 * is indicated by the cprinfop being non-NULL for kernel daemon calls, and
269 * NULL for system call paths.
272 process_resync_regions_non_owner(mm_unit_t
*un
, callb_cpr_t
*cprinfop
)
275 int cleared_dirty
= 0;
276 /* Number of reasons why we can not proceed shutting down the mirror. */
278 set_t setno
= MD_UN2SET(un
);
279 md_mn_msg_rr_clean_t
*rmsg
;
280 md_mn_kresult_t
*kres
;
282 minor_t mnum
= MD_SID(un
);
283 mdi_unit_t
*ui
= MDI_UNIT(mnum
);
284 md_mn_nodeid_t owner_node
;
287 * We drop the readerlock here to assist lock ordering with
288 * update_resync. Once we have the un_rrp_inflight_mx, we
291 md_unit_readerexit(ui
);
294 * Resync region processing must be single threaded. We can't use
295 * un_resync_mx for this purpose since this mutex gets released
296 * when blocking on un_resync_cv.
298 mutex_enter(&un
->un_rrp_inflight_mx
);
300 (void) md_unit_readerlock(ui
);
302 mutex_enter(&un
->un_resync_mx
);
304 rw_enter(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1], RW_READER
);
305 cleared_dirty
= mirror_generate_rr_bitmap(un
, &rmsg
, &active
);
306 rw_exit(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1]);
309 owner_node
= un
->un_mirror_owner
;
310 mutex_exit(&un
->un_resync_mx
);
313 * Transmit the 'to-be-cleared' bitmap to all cluster nodes.
314 * Receipt of the message will cause the mirror owner to
315 * update the on-disk DRL.
318 kres
= kmem_alloc(sizeof (md_mn_kresult_t
), KM_SLEEP
);
320 /* release readerlock before sending message */
321 md_unit_readerexit(ui
);
324 mutex_enter(&un
->un_prr_cpr_mx
);
325 CALLB_CPR_SAFE_BEGIN(cprinfop
);
328 rval
= mdmn_ksend_message(setno
, MD_MN_MSG_RR_CLEAN
,
329 MD_MSGF_NO_LOG
|MD_MSGF_BLK_SIGNAL
|MD_MSGF_KSEND_NORETRY
|
330 MD_MSGF_DIRECTED
, un
->un_mirror_owner
,
331 (char *)rmsg
, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg
), kres
);
334 CALLB_CPR_SAFE_END(cprinfop
, &un
->un_prr_cpr_mx
);
335 mutex_exit(&un
->un_prr_cpr_mx
);
338 /* reacquire readerlock after message */
339 (void) md_unit_readerlock(ui
);
341 if ((!MDMN_KSEND_MSG_OK(rval
, kres
)) &&
342 (kres
->kmmr_comm_state
!= MDMNE_NOT_JOINED
)) {
343 /* if commd is gone, no point in printing a message */
344 if (md_mn_is_commd_present())
345 mdmn_ksend_show_error(rval
, kres
, "RR_CLEAN");
346 kmem_free(kres
, sizeof (md_mn_kresult_t
));
347 kmem_free(rmsg
, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg
));
348 mutex_exit(&un
->un_rrp_inflight_mx
);
351 kmem_free(kres
, sizeof (md_mn_kresult_t
));
354 * If ownership changed while we were sending, we probably
355 * sent the message to the wrong node. Leave fixing that for
358 if (un
->un_mirror_owner
!= owner_node
) {
359 mutex_exit(&un
->un_rrp_inflight_mx
);
364 * Now that we've sent the message, clear them from the
365 * pernode_dirty arrays. These are ONLY cleared on a
366 * successful send, and failure has no impact.
369 start
= MDMN_MSG_RR_CLEAN_START_BIT(rmsg
);
370 end
= start
+ MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg
) * NBBY
;
371 mutex_enter(&un
->un_resync_mx
);
372 rw_enter(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1],
374 for (i
= start
; i
< end
; i
++) {
375 if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg
),
377 if (IS_PERNODE_DIRTY(md_mn_mynode_id
, i
, un
)) {
378 un
->un_pernode_dirty_sum
[i
]--;
379 CLR_PERNODE_DIRTY(md_mn_mynode_id
, i
,
382 if (IS_REGION_DIRTY(i
, un
)) {
384 CLR_REGION_DIRTY(i
, un
);
385 CLR_GOING_CLEAN(i
, un
);
389 rw_exit(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1]);
391 kmem_free(rmsg
, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg
));
393 mutex_exit(&un
->un_resync_mx
);
395 mutex_exit(&un
->un_rrp_inflight_mx
);
401 process_resync_regions_owner(mm_unit_t
*un
)
404 int cleared_dirty
= 0;
405 /* Number of reasons why we can not proceed shutting down the mirror. */
407 set_t setno
= MD_UN2SET(un
);
408 int mnset
= MD_MNSET_SETNO(setno
);
409 md_mn_msg_rr_clean_t
*rmsg
;
410 minor_t mnum
= MD_SID(un
);
411 mdi_unit_t
*ui
= MDI_UNIT(mnum
);
414 * We drop the readerlock here to assist lock ordering with
415 * update_resync. Once we have the un_rrp_inflight_mx, we
418 md_unit_readerexit(ui
);
421 * Resync region processing must be single threaded. We can't use
422 * un_resync_mx for this purpose since this mutex gets released
423 * when blocking on un_resync_cv.
425 mutex_enter(&un
->un_rrp_inflight_mx
);
427 (void) md_unit_readerlock(ui
);
429 mutex_enter(&un
->un_resync_mx
);
430 un
->un_waiting_to_clear
++;
431 while (un
->un_resync_flg
& MM_RF_STALL_CLEAN
)
432 cv_wait(&un
->un_resync_cv
, &un
->un_resync_mx
);
433 un
->un_waiting_to_clear
--;
436 rw_enter(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1],
438 cleared_dirty
= mirror_generate_rr_bitmap(un
, &rmsg
, &active
);
442 * Clear the bits from the pernode_dirty arrays.
443 * If that results in any being cleared from the
444 * un_dirty_bm, commit it.
447 start
= MDMN_MSG_RR_CLEAN_START_BIT(rmsg
);
448 end
= start
+ MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg
) * NBBY
;
449 for (i
= start
; i
< end
; i
++) {
450 if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg
),
452 if (IS_PERNODE_DIRTY(md_mn_mynode_id
, i
,
454 un
->un_pernode_dirty_sum
[i
]--;
456 md_mn_mynode_id
, i
, un
);
458 if (un
->un_pernode_dirty_sum
[i
] == 0) {
460 CLR_REGION_DIRTY(i
, un
);
461 CLR_GOING_CLEAN(i
, un
);
465 kmem_free(rmsg
, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg
));
467 rw_exit(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1]);
469 for (i
= 0; i
< un
->un_rrd_num
; i
++) {
470 if (un
->c
.un_status
& MD_UN_KEEP_DIRTY
)
471 if (IS_KEEPDIRTY(i
, un
))
474 if (!IS_REGION_DIRTY(i
, un
))
476 if (un
->un_outstanding_writes
[i
] != 0) {
481 if (!IS_GOING_CLEAN(i
, un
)) {
482 SET_GOING_CLEAN(i
, un
);
486 CLR_REGION_DIRTY(i
, un
);
487 CLR_GOING_CLEAN(i
, un
);
493 un
->un_resync_flg
|= MM_RF_GATECLOSED
;
494 mutex_exit(&un
->un_resync_mx
);
495 mddb_commitrec_wrapper(un
->un_rr_dirty_recid
);
496 mutex_enter(&un
->un_resync_mx
);
497 un
->un_resync_flg
&= ~MM_RF_GATECLOSED
;
499 if (un
->un_waiting_to_mark
!= 0 ||
500 un
->un_waiting_to_clear
!= 0) {
502 cv_broadcast(&un
->un_resync_cv
);
505 mutex_exit(&un
->un_resync_mx
);
507 mutex_exit(&un
->un_rrp_inflight_mx
);
513 process_resync_regions(mm_unit_t
*un
, callb_cpr_t
*cprinfop
)
515 int mnset
= MD_MNSET_SETNO(MD_UN2SET(un
));
517 * For a mirror we can only update the on-disk resync-record if we
518 * currently own the mirror. If we are called and there is no owner we
519 * bail out before scanning the outstanding_writes[] array.
520 * NOTE: we only need to check here (before scanning the array) as we
521 * are called with the readerlock held. This means that a change
522 * of ownership away from us will block until this resync check
525 if (mnset
&& (MD_MN_NO_MIRROR_OWNER(un
) ||
526 (!MD_MN_MIRROR_OWNER(un
) && !md_mn_is_commd_present_lite()))) {
528 } else if (mnset
&& !MD_MN_MIRROR_OWNER(un
)) {
529 return (process_resync_regions_non_owner(un
, cprinfop
));
531 return (process_resync_regions_owner(un
));
536 * Function that is callable from other modules to provide
537 * ability to cleanup dirty region bitmap on demand. Used
538 * on last close of a unit to avoid massive device resyncs
539 * when coming back after rolling large amounts of data to
540 * a mirror (e.g. at umount with logging).
544 mirror_process_unit_resync(mm_unit_t
*un
)
548 while (process_resync_regions(un
, NULL
)) {
551 if (cleans
>= md_mirror_rr_cleans
) {
553 "Could not clean resync regions\n");
556 if (cleans
> md_mirror_rr_polls
) {
558 * We did not make it with md_mirror_rr_polls
559 * iterations. Give the system relief and
560 * switch over to non-busy-wait.
562 delay(md_mirror_rr_sleep_timo
* md_hz
);
568 check_resync_regions(daemon_request_t
*timeout
)
575 rw_enter(&mirror_md_ops
.md_link_rw
.lock
, RW_READER
);
576 for (next
= mirror_md_ops
.md_head
; next
!= NULL
; next
= next
->ln_next
) {
578 if (md_get_setstatus(next
->ln_setno
) & MD_SET_STALE
)
581 un
= MD_UNIT(next
->ln_id
);
584 * Register this resync thread with the CPR mechanism. This
585 * allows us to detect when the system is suspended and so
586 * keep track of the RPC failure condition.
588 CALLB_CPR_INIT(&cprinfo
, &un
->un_prr_cpr_mx
, callb_md_mrs_cpr
,
589 "check_resync_regions");
591 ui
= MDI_UNIT(next
->ln_id
);
592 (void) md_unit_readerlock(ui
);
595 * Do not clean up resync regions if it is an ABR
596 * mirror, or if a submirror is offline (we will use the resync
597 * region to resync when back online) or if there is only one
600 if ((ui
->ui_tstate
& MD_ABR_CAP
) ||
601 (un
->c
.un_status
& MD_UN_OFFLINE_SM
) || (un
->un_nsm
< 2)) {
602 md_unit_readerexit(ui
);
606 (void) process_resync_regions(un
, &cprinfo
);
608 md_unit_readerexit(ui
);
610 /* Remove this thread from the CPR callback table. */
611 mutex_enter(&un
->un_prr_cpr_mx
);
612 CALLB_CPR_EXIT(&cprinfo
);
615 rw_exit(&mirror_md_ops
.md_link_rw
.lock
);
618 mutex_enter(&mirror_timeout
.dr_mx
);
619 timeout
->dr_pending
= 0;
620 mutex_exit(&mirror_timeout
.dr_mx
);
624 md_mirror_timeout(void *throwaway
)
627 mutex_enter(&mirror_timeout
.dr_mx
);
628 if (!mirror_timeout
.dr_pending
) {
629 mirror_timeout
.dr_pending
= 1;
630 daemon_request(&md_mto_daemon
, check_resync_regions
,
631 (daemon_queue_t
*)&mirror_timeout
, REQ_OLD
);
634 if (mirror_md_ops
.md_head
!= NULL
)
635 mirror_timeout
.dr_timeout_id
= timeout(md_mirror_timeout
,
636 throwaway
, (int)MD_MDELAY
*hz
);
638 mirror_timeout
.dr_timeout_id
= 0;
640 mutex_exit(&mirror_timeout
.dr_mx
);
644 resync_start_timeout(set_t setno
)
646 if (md_get_setstatus(setno
) & MD_SET_STALE
)
649 mutex_enter(&mirror_timeout
.dr_mx
);
650 if (mirror_timeout
.dr_timeout_id
== 0)
651 mirror_timeout
.dr_timeout_id
= timeout(md_mirror_timeout
,
652 (void *)NULL
, (int)MD_MDELAY
*hz
);
653 mutex_exit(&mirror_timeout
.dr_mx
);
657 offlined_to_attached(mm_unit_t
*un
)
662 if (md_get_setstatus(MD_UN2SET(un
)) & MD_SET_STALE
)
665 for (i
= 0; i
< NMIRROR
; i
++) {
666 if (SMS_BY_INDEX_IS(un
, i
, SMS_OFFLINE
)) {
667 mirror_set_sm_state(&un
->un_sm
[i
],
668 &un
->un_smic
[i
], SMS_ATTACHED
, 1);
671 if (SMS_BY_INDEX_IS(un
, i
, SMS_OFFLINE_RESYNC
)) {
672 mirror_set_sm_state(&un
->un_sm
[i
],
673 &un
->un_smic
[i
], SMS_ATTACHED_RESYNC
, 1);
679 un
->c
.un_status
&= ~MD_UN_OFFLINE_SM
;
680 mddb_setrecprivate(un
->c
.un_record_id
, MD_PRV_PENDCOM
);
685 get_unit_resync(mm_unit_t
*un
)
687 mddb_recstatus_t status
;
688 struct optim_resync
*orp
;
690 if (un
->un_rr_dirty_recid
== 0) {
691 offlined_to_attached(un
);
695 status
= mddb_getrecstatus(un
->un_rr_dirty_recid
);
696 if ((status
== MDDB_NORECORD
) || (status
== MDDB_NODATA
)) {
697 un
->un_rr_dirty_recid
= 0;
698 offlined_to_attached(un
);
702 mddb_setrecprivate(un
->un_rr_dirty_recid
, MD_PRV_GOTIT
);
703 orp
= (struct optim_resync
*)mddb_getrecaddr(un
->un_rr_dirty_recid
);
704 un
->un_dirty_bm
= orp
->or_rr
;
708 create_unit_resync(mm_unit_t
*un
, int snarfing
)
712 int blksize
; /* rr size in blocks */
715 size_t size
; /* bitmap size */
720 tb
= un
->c
.un_total_blocks
;
722 if (((tb
+ MD_MIN_RR_SIZE
)/ MD_MIN_RR_SIZE
) > MD_DEF_NUM_RR
) {
723 blksize
= (int)(tb
/ MD_DEF_NUM_RR
);
724 num_rr
= (int)((tb
+ (blksize
)) / (blksize
));
726 blksize
= MD_MIN_RR_SIZE
;
727 num_rr
= (int)((tb
+ MD_MIN_RR_SIZE
) / MD_MIN_RR_SIZE
);
730 size
= howmany(num_rr
, NBBY
) + sizeof (*orp
) - sizeof (orp
->or_rr
);
732 setno
= MD_UN2SET(un
);
734 typ1
= (mddb_type_t
)md_getshared_key(setno
,
735 mirror_md_ops
.md_driver
.md_drivername
);
737 recid
= mddb_createrec(size
, typ1
, RESYNC_REC
,
738 MD_CRO_OPTIMIZE
|MD_CRO_32BIT
, setno
);
740 if (snarfing
&& !(md_get_setstatus(setno
) & MD_SET_STALE
)) {
741 md_set_setstatus(setno
, MD_SET_STALE
);
742 cmn_err(CE_WARN
, "md: state database is stale");
747 un
->un_rr_dirty_recid
= recid
;
748 orp
= (optim_resync_t
*)mddb_getrecaddr(recid
);
749 orp
->or_magic
= OR_MAGIC
;
750 orp
->or_blksize
= blksize
;
751 orp
->or_num
= num_rr
;
753 un
->un_rrd_blksize
= blksize
;
754 un
->un_rrd_num
= num_rr
;
755 un
->un_dirty_bm
= orp
->or_rr
;
758 for (i
= 0; i
< howmany(num_rr
, NBBY
); i
++)
759 orp
->or_rr
[i
] = 0xFF;
762 mddb_commitrec_wrapper(recid
);
763 mirror_commit(un
, NO_SUBMIRRORS
, 0);
766 mddb_setrecprivate(recid
, MD_PRV_PENDCOM
);
767 mddb_setrecprivate(un
->c
.un_record_id
, MD_PRV_PENDCOM
);
772 unit_setup_resync(mm_unit_t
*un
, int snarfing
)
777 mdi_unit_t
*ui
= MDI_UNIT(MD_SID(un
));
778 int nonABR
= 1; /* only set if ABR marked in ui_tstate */
780 un
->un_dirty_bm
= NULL
;
781 un
->un_rs_buffer
= NULL
;
783 mutex_init(&un
->un_rrp_inflight_mx
, "rrp mx", MUTEX_DEFAULT
, NULL
);
785 mutex_init(&un
->un_resync_mx
, NULL
, MUTEX_DEFAULT
, NULL
);
786 cv_init(&un
->un_resync_cv
, NULL
, CV_DEFAULT
, NULL
);
787 un
->un_resync_flg
= 0;
788 un
->un_waiting_to_mark
= 0;
789 un
->un_waiting_to_commit
= 0;
790 un
->un_waiting_to_clear
= 0;
792 un
->un_goingclean_bm
= NULL
;
793 un
->un_goingdirty_bm
= NULL
;
794 un
->un_outstanding_writes
= NULL
;
795 un
->un_resync_bm
= NULL
;
800 if (un
->un_rr_dirty_recid
== 0) {
802 * If a MN diskset and snarfing and this node is not the
803 * master, do not delete any records on snarf of the
804 * mirror records (create_unit_resync deletes records).
806 * Master node should have already handled this case.
808 if (MD_MNSET_SETNO(MD_UN2SET(un
)) && snarfing
&&
809 md_set
[MD_UN2SET(un
)].s_am_i_master
== 0) {
811 cmn_err(CE_NOTE
, "unit_setup_resync: no rr for %s on"
812 " nodeid %d\n", md_shortname(MD_SID(un
)),
813 md_set
[MD_UN2SET(un
)].s_nodeid
);
817 if ((err
= create_unit_resync(un
, snarfing
)) != 0)
821 un
->un_goingclean_bm
= (uchar_t
*)kmem_zalloc((uint_t
)(howmany(
822 un
->un_rrd_num
, NBBY
)), KM_SLEEP
);
823 un
->un_goingdirty_bm
= (uchar_t
*)kmem_zalloc((uint_t
)(howmany(
824 un
->un_rrd_num
, NBBY
)), KM_SLEEP
);
825 un
->un_outstanding_writes
= (short *)kmem_zalloc(
826 (uint_t
)un
->un_rrd_num
* sizeof (short), KM_SLEEP
);
827 un
->un_resync_bm
= (uchar_t
*)kmem_zalloc((uint_t
)(howmany(
828 un
->un_rrd_num
, NBBY
)), KM_SLEEP
);
831 * Allocate pernode bitmap for this node. All other nodes' maps will
832 * be created 'on-the-fly' in the ioctl message handler
834 if (MD_MNSET_SETNO(MD_UN2SET(un
))) {
835 un
->un_pernode_dirty_sum
=
836 (uchar_t
*)kmem_zalloc(un
->un_rrd_num
, KM_SLEEP
);
837 if (md_mn_mynode_id
> 0) {
838 un
->un_pernode_dirty_bm
[md_mn_mynode_id
-1] = (uchar_t
*)
839 kmem_zalloc((uint_t
)(howmany(un
->un_rrd_num
, NBBY
)),
844 * Allocate taskq to process deferred (due to locking) RR_CLEAN
847 un
->un_drl_task
= (ddi_taskq_t
*)md_create_taskq(MD_UN2SET(un
),
851 if (md_get_setstatus(MD_UN2SET(un
)) & MD_SET_STALE
)
855 * Only mark mirror which has an associated DRL as requiring a resync.
856 * For ABR mirrors we need not set the resync record bitmap up.
858 if (ui
&& (ui
->ui_tstate
& MD_ABR_CAP
))
861 for (i
= 0, syncable
= 0; i
< NMIRROR
; i
++) {
863 if ((SUBMIRROR_IS_READABLE(un
, i
) ||
864 SMS_BY_INDEX_IS(un
, i
,
865 (SMS_OFFLINE
| SMS_OFFLINE_RESYNC
))))
870 if (snarfing
&& un
->un_pass_num
&& (syncable
> 1)) {
871 bcopy((caddr_t
)un
->un_dirty_bm
, (caddr_t
)un
->un_resync_bm
,
872 howmany(un
->un_rrd_num
, NBBY
));
874 un
->c
.un_status
|= (MD_UN_OPT_NOT_DONE
| MD_UN_WAR
);
875 un
->c
.un_status
&= ~MD_UN_OFFLINE_SM
;
876 for (i
= 0; i
< NMIRROR
; i
++) {
877 if ((SUBMIRROR_IS_READABLE(un
, i
)) ||
878 SMS_BY_INDEX_IS(un
, i
, SMS_OFFLINE_RESYNC
))
879 un
->un_sm
[i
].sm_flags
|= MD_SM_RESYNC_TARGET
;
881 if (SMS_BY_INDEX_IS(un
, i
, SMS_OFFLINE
)) {
882 un
->un_sm
[i
].sm_flags
|= MD_SM_RESYNC_TARGET
;
883 mirror_set_sm_state(&un
->un_sm
[i
],
884 &un
->un_smic
[i
], SMS_OFFLINE_RESYNC
, 1);
885 mddb_setrecprivate(un
->c
.un_record_id
,
894 * resync_kill_pending:
895 * -------------------
896 * Determine if the resync thread has been requested to terminate.
897 * Block if MD_RI_BLOCK or MD_RI_BLOCK_OWNER is set in un->un_rs_thread_flags.
898 * MD_RI_BLOCK is only set as a result of a user-initiated ioctl via metasync.
899 * MD_RI_BLOCK_OWNER is set by the ownership change of a multi-node mirror.
903 * 1 Kill requested (set MD_UN_RESYNC_CANCEL in un->c.un_status)
905 * Note: this routine may block
906 * the writerlock for <ui> will be dropped and reacquired if <mx_type>
907 * is set to MD_WRITER_HELD.
908 * the readerlock for <ui> will be dropped and reacquired if <mx_type>
909 * is set to MD_READER_HELD.
919 /* Ensure that we don't block with any mutex held */
920 if (mx_type
== MD_WRITER_HELD
) {
921 md_unit_writerexit(ui
);
922 } else if (mx_type
== MD_READER_HELD
) {
923 md_unit_readerexit(ui
);
925 mutex_enter(&un
->un_rs_thread_mx
);
926 while (un
->un_rs_thread_flags
& (MD_RI_BLOCK
|MD_RI_BLOCK_OWNER
)) {
927 cv_wait(&un
->un_rs_thread_cv
, &un
->un_rs_thread_mx
);
928 if (un
->un_rs_thread_flags
& (MD_RI_KILL
|MD_RI_SHUTDOWN
))
931 /* Determine if we've been asked to abort or shutdown gracefully */
932 if (un
->un_rs_thread_flags
& MD_RI_KILL
) {
933 un
->c
.un_status
|= MD_UN_RESYNC_CANCEL
;
935 } else if (un
->un_rs_thread_flags
& MD_RI_SHUTDOWN
) {
938 mutex_exit(&un
->un_rs_thread_mx
);
940 /* Reacquire mutex if dropped on entry */
941 if (mx_type
== MD_WRITER_HELD
) {
942 (void) md_unit_writerlock(ui
);
943 } else if (mx_type
== MD_READER_HELD
) {
944 (void) md_unit_readerlock(ui
);
950 * resync_read_buffer:
952 * Issue the resync source read for the specified start block and size.
953 * This will cause the mirror strategy routine to issue a write-after-read
954 * once this request completes successfully.
955 * If 'flag_err' is set we expect to see a write error flagged in the b_error
956 * field of the buffer created for this i/o request. If clear we do not expect
957 * to see the error flagged for write failures.
958 * Read failures will always set the B_ERROR bit which will stop the resync
962 resync_read_buffer(mm_unit_t
*un
, diskaddr_t blk
, size_t cnt
, int flag_err
)
968 sp
= kmem_cache_alloc(mirror_child_cache
, MD_ALLOCFLAGS
);
969 mirror_child_init(sp
);
972 bp
->b_edev
= makedevice(md_major
, MD_SID(un
));
973 bp
->b_flags
= B_READ
;
975 bp
->b_bcount
= dbtob(cnt
);
976 bp
->b_un
.b_addr
= un
->un_rs_buffer
;
977 md_unit_readerexit(MDI_UNIT(MD_SID(un
)));
979 (void) md_mirror_strategy(bp
, MD_STR_NOTTOP
| MD_STR_MAPPED
|
980 MD_STR_WAR
| (flag_err
? MD_STR_FLAG_ERR
: 0), NULL
);
984 (void) md_unit_readerlock(MDI_UNIT(MD_SID(un
)));
985 if (bp
->b_flags
& B_ERROR
) {
988 kmem_cache_free(mirror_child_cache
, sp
);
993 * send_mn_resync_done_message
995 * At the end of a resync, send a message to all nodes to indicate that
996 * the resync is complete. The argument, flags, has the following values
998 * RESYNC_ERR - if an error occurred that terminated the resync
999 * CLEAR_OPT_NOT_DONE - Just need to clear the OPT_NOT_DONE flag
1001 * unit writerlock set on entry
1002 * Only send the message if the thread is not marked as shutting down:
1003 * [un_rs_thread_flags & MD_RI_SHUTDOWN] or being killed:
1004 * [un->c.un_status & MD_UN_RESYNC_CANCEL]
1005 * or if there has been an error that terminated the resync:
1006 * flags & RESYNC_ERR
1010 send_mn_resync_done_message(
1015 md_mn_msg_resync_t
*rmsg
= un
->un_rs_msg
;
1017 mdi_unit_t
*ui
= MDI_UNIT(MD_SID(un
));
1018 md_mn_kresult_t
*kres
;
1022 rmsg
= (md_mn_msg_resync_t
*)un
->un_rs_msg
;
1025 * Only send the message if this resync thread is still active. This
1026 * handles the case where ownership changes to different nodes during
1027 * a resync can cause multiple spurious resync_done messages to occur
1028 * when the resync completes. This happens because only one node is
1029 * the resync owner but other nodes will have their resync_unit thread
1030 * blocked in 'resync_kill_pending'
1032 mutex_enter(&un
->un_rs_thread_mx
);
1033 dont_send
= (un
->un_rs_thread_flags
& (MD_RI_KILL
|MD_RI_SHUTDOWN
)) ? 1
1035 mutex_exit(&un
->un_rs_thread_mx
);
1036 dont_send
|= (un
->c
.un_status
& MD_UN_RESYNC_CANCEL
) ? 1 : 0;
1039 * Always send a message if we've encountered an error that terminated
1042 if (flags
& RESYNC_ERR
)
1047 if (mirror_debug_flag
) {
1048 printf("Don't send resync done message, mnum = %x,"
1049 " type = %x, flags = %d\n", MD_SID(un
),
1050 un
->un_rs_type
, flags
);
1057 if (mirror_debug_flag
) {
1058 printf("send resync done message, mnum = %x, type = %x\n",
1059 MD_SID(un
), un
->un_rs_type
);
1063 rmsg
->msg_resync_mnum
= MD_SID(un
);
1064 rmsg
->msg_resync_type
= un
->un_rs_type
;
1065 rmsg
->msg_originator
= md_mn_mynode_id
;
1066 rmsg
->msg_resync_flags
= 0;
1067 if (flags
& RESYNC_ERR
)
1068 rmsg
->msg_resync_flags
|= MD_MN_RS_ERR
;
1069 if (flags
& CLEAR_OPT_NOT_DONE
)
1070 rmsg
->msg_resync_flags
|= MD_MN_RS_CLEAR_OPT_NOT_DONE
;
1072 setno
= MD_MIN2SET(MD_SID(un
));
1073 md_unit_writerexit(ui
);
1074 kres
= kmem_alloc(sizeof (md_mn_kresult_t
), KM_SLEEP
);
1076 mutex_enter(&un
->un_rs_cpr_mx
);
1077 CALLB_CPR_SAFE_BEGIN(&un
->un_rs_cprinfo
);
1079 rval
= mdmn_ksend_message(setno
, MD_MN_MSG_RESYNC_PHASE_DONE
,
1080 MD_MSGF_NO_LOG
, 0, (char *)rmsg
, sizeof (md_mn_msg_resync_t
), kres
);
1082 CALLB_CPR_SAFE_END(&un
->un_rs_cprinfo
, &un
->un_rs_cpr_mx
);
1083 mutex_exit(&un
->un_rs_cpr_mx
);
1085 /* if the node hasn't yet joined, it's Ok. */
1086 if ((!MDMN_KSEND_MSG_OK(rval
, kres
)) &&
1087 (kres
->kmmr_comm_state
!= MDMNE_NOT_JOINED
)) {
1088 mdmn_ksend_show_error(rval
, kres
, "RESYNC_PHASE_DONE");
1089 /* If we're shutting down already, pause things here. */
1090 if (kres
->kmmr_comm_state
== MDMNE_RPC_FAIL
) {
1091 while (!md_mn_is_commd_present()) {
1095 cmn_err(CE_PANIC
, "ksend_message failure: RESYNC_PHASE_DONE");
1097 kmem_free(kres
, sizeof (md_mn_kresult_t
));
1098 (void) md_unit_writerlock(ui
);
1102 * send_mn_resync_next_message
1104 * Sent a message to all nodes indicating the next region to be resynced.
1105 * The message contains the region to be resynced and the current position in
1106 * the resync as denoted by un_rs_resync_done and un_rs_resync_2_do.
1107 * On entry the unit readerlock is held.
1110 send_mn_resync_next_message(
1112 diskaddr_t currentblk
,
1117 md_mn_msg_resync_t
*rmsg
= un
->un_rs_msg
;
1119 md_mn_kresult_t
*kres
;
1120 mdi_unit_t
*ui
= MDI_UNIT(MD_SID(un
));
1126 ASSERT(rmsg
!= NULL
);
1128 if (mirror_debug_flag
) {
1129 printf("send resync next message, mnum = %x, start=%lld, "
1130 "size=%ld, type=%x, done=%lld, 2_do=%lld\n",
1131 MD_SID(un
), currentblk
, rsize
, un
->un_rs_type
,
1132 un
->un_rs_resync_done
, un
->un_rs_resync_2_do
);
1135 rmsg
->msg_resync_mnum
= MD_SID(un
);
1136 rmsg
->msg_resync_type
= un
->un_rs_type
;
1137 rmsg
->msg_resync_start
= currentblk
;
1138 rmsg
->msg_resync_rsize
= rsize
;
1139 rmsg
->msg_resync_done
= un
->un_rs_resync_done
;
1140 rmsg
->msg_resync_2_do
= un
->un_rs_resync_2_do
;
1141 rmsg
->msg_originator
= md_mn_mynode_id
;
1142 if (flags
& MD_FIRST_RESYNC_NEXT
)
1143 rmsg
->msg_resync_flags
= MD_MN_RS_FIRST_RESYNC_NEXT
;
1146 * Copy current submirror state and flags into message. This provides
1147 * a means of keeping all nodes that are currently active in the cluster
1148 * synchronised with regards to their submirror state settings. If we
1149 * did not pass this information here, the only time every node gets
1150 * submirror state updated is at the end of a resync phase. This can be
1151 * a significant amount of time for large metadevices.
1153 for (smi
= 0; smi
< NMIRROR
; smi
++) {
1154 sm
= &un
->un_sm
[smi
];
1155 rmsg
->msg_sm_state
[smi
] = sm
->sm_state
;
1156 rmsg
->msg_sm_flags
[smi
] = sm
->sm_flags
;
1158 setno
= MD_MIN2SET(MD_SID(un
));
1159 md_unit_readerexit(ui
);
1160 kres
= kmem_alloc(sizeof (md_mn_kresult_t
), KM_SLEEP
);
1162 mutex_enter(&un
->un_rs_cpr_mx
);
1163 CALLB_CPR_SAFE_BEGIN(&un
->un_rs_cprinfo
);
1165 rval
= mdmn_ksend_message(setno
, MD_MN_MSG_RESYNC_NEXT
, MD_MSGF_NO_LOG
,
1166 0, (char *)rmsg
, sizeof (md_mn_msg_resync_t
), kres
);
1168 CALLB_CPR_SAFE_END(&un
->un_rs_cprinfo
, &un
->un_rs_cpr_mx
);
1169 mutex_exit(&un
->un_rs_cpr_mx
);
1171 if (!MDMN_KSEND_MSG_OK(rval
, kres
)) {
1172 mdmn_ksend_show_error(rval
, kres
, "RESYNC_NEXT");
1173 /* If we're shutting down already, pause things here. */
1174 if (kres
->kmmr_comm_state
== MDMNE_RPC_FAIL
) {
1175 while (!md_mn_is_commd_present()) {
1179 cmn_err(CE_PANIC
, "ksend_message failure: RESYNC_NEXT");
1181 kmem_free(kres
, sizeof (md_mn_kresult_t
));
1182 (void) md_unit_readerlock(ui
);
1183 ps
= un
->un_rs_prev_overlap
;
1185 /* Allocate previous overlap reference if needed */
1187 ps
= kmem_cache_alloc(mirror_parent_cache
, MD_ALLOCFLAGS
);
1190 ps
->ps_firstblk
= 0;
1193 md_unit_readerexit(ui
);
1194 (void) md_unit_writerlock(ui
);
1195 un
->un_rs_prev_overlap
= ps
;
1196 md_unit_writerexit(ui
);
1197 (void) md_unit_readerlock(ui
);
1200 ps
->ps_firstblk
= currentblk
;
1201 ps
->ps_lastblk
= currentblk
+ rsize
- 1;
1205 resync_read_blk_range(
1207 diskaddr_t currentblk
,
1208 diskaddr_t stopbefore
,
1213 size_t copysize
; /* limited by max xfer buf size */
1214 size_t rsize
; /* size of resync block (for MN) */
1217 diskaddr_t rs_startblk
;
1219 int flags1
= flags
& MD_FIRST_RESYNC_NEXT
;
1221 rs_type
= un
->un_rs_type
;
1222 rs_startblk
= currentblk
;
1223 if (stopbefore
> un
->c
.un_total_blocks
)
1224 stopbefore
= un
->c
.un_total_blocks
;
1225 if (currentblk
< un
->un_resync_startbl
)
1226 currentblk
= un
->un_resync_startbl
;
1228 copysize
= un
->un_rs_copysize
;
1229 rsize
= MD_DEF_RESYNC_BLK_SZ
;
1231 setno
= MD_MIN2SET(MD_SID(un
));
1232 while (currentblk
< stopbefore
) {
1234 * Split the block up into units of MD_DEF_RESYNC_BLK_SZ and
1235 * if a MN device and sendflag is set, send a RESYNC_MESSAGE
1238 if ((currentblk
+ MD_DEF_RESYNC_BLK_SZ
) > stopbefore
)
1239 rsize
= stopbefore
- currentblk
;
1240 if (MD_MNSET_SETNO(setno
) && (flags
& MD_SEND_MESS_XMIT
)) {
1241 un
->un_resync_startbl
= currentblk
;
1242 rs_startblk
= currentblk
;
1243 send_mn_resync_next_message(un
, currentblk
, rsize
,
1247 /* check to see if we've been asked to terminate */
1248 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)), type
))
1249 return ((un
->c
.un_status
& MD_UN_RESYNC_CANCEL
)
1252 * Check to see if another node has completed this
1253 * block, if so either the type or the resync region
1254 * will have changed. If the resync type has changed,
1256 * If the resync region has changed, reset currentblk
1257 * to the start of the current resync region and
1260 if (un
->un_rs_type
!= rs_type
)
1262 if (un
->un_rs_prev_overlap
->ps_firstblk
>
1265 un
->un_rs_prev_overlap
->ps_firstblk
;
1269 newstop
= currentblk
+ rsize
;
1270 while (currentblk
< newstop
) {
1271 if ((currentblk
+ copysize
) > stopbefore
)
1272 copysize
= (size_t)(stopbefore
- currentblk
);
1273 if (resync_read_buffer(un
, currentblk
, copysize
,
1274 (flags
& MD_RESYNC_FLAG_ERR
)))
1277 /* resync_read_buffer releases/grabs a new lock */
1278 un
= (mm_unit_t
*)MD_UNIT(MD_SID(un
));
1279 currentblk
+= copysize
;
1281 /* check to see if we've been asked to terminate */
1282 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)), type
))
1283 return ((un
->c
.un_status
& MD_UN_RESYNC_CANCEL
)
1285 if (MD_MNSET_SETNO(setno
)) {
1287 * Check to see if another node has completed
1288 * this block, see above
1290 if (un
->un_rs_type
!= rs_type
)
1292 if (un
->un_rs_prev_overlap
->ps_firstblk
>
1295 un
->un_rs_prev_overlap
->ps_firstblk
;
1303 optimized_resync(mm_unit_t
*un
)
1309 uchar_t
*dirtyregions
;
1310 diskaddr_t first
, stopbefore
;
1316 uint_t old_rs_type
= un
->un_rs_type
;
1318 uint_t flags1
= MD_FIRST_RESYNC_NEXT
|MD_RESYNC_FLAG_ERR
;
1322 ui
= MDI_UNIT(mnum
);
1323 setno
= MD_UN2SET(un
);
1325 if (!(un
->c
.un_status
& MD_UN_OPT_NOT_DONE
)) {
1327 * We aren't marked as needing a resync so for multi-node
1328 * sets we flag the completion so that all nodes see the same
1329 * metadevice state. This is a problem when a new node joins
1330 * an existing set as it has to perform a 'metasync -r' and
1331 * we have to step through all of the resync phases. If we
1332 * don't do this the nodes that were already in the set will
1333 * have the metadevices marked as 'Okay' but the joining node
1334 * will have 'Needs Maintenance' which is unclearable.
1336 if (MD_MNSET_SETNO(setno
)) {
1337 send_mn_resync_done_message(un
, CLEAR_OPT_NOT_DONE
);
1343 * No need for optimized resync if ABR set, clear rs_type and flags
1346 if (ui
->ui_tstate
& MD_ABR_CAP
) {
1347 un
->un_rs_type
= MD_RS_NONE
;
1348 un
->c
.un_status
&= ~(MD_UN_OPT_NOT_DONE
| MD_UN_WAR
);
1352 un
->un_rs_dropped_lock
= 1;
1353 un
->c
.un_status
|= MD_UN_WAR
;
1354 resync_regions
= un
->un_rrd_num
;
1355 dirtyregions
= un
->un_resync_bm
;
1356 md_unit_writerexit(ui
);
1358 /* For MN sets, resync NOTIFY is done when processing resync messages */
1359 if (!MD_MNSET_SETNO(setno
)) {
1360 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_START
,
1361 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
1363 un
= (mm_unit_t
*)md_unit_readerlock(ui
);
1365 /* check to see if we've been asked to terminate */
1366 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)), MD_READER_HELD
)) {
1367 if (un
->c
.un_status
& MD_UN_RESYNC_CANCEL
)
1368 broke_out
= RESYNC_ERR
;
1371 * Check that we are still performing an optimized
1372 * resync. If not, another node must have completed it
1373 * so we have no more work to do.
1375 if (un
->un_rs_type
!= old_rs_type
) {
1376 md_unit_readerexit(ui
);
1377 (void) md_unit_writerlock(ui
);
1381 * If rs_resync_done is non-zero, we must be completing an optimized
1382 * resync that has already been partially done on another node.
1383 * Therefore clear the bits in resync_bm for the resync regions
1384 * already done. If resync_startbl is zero, calculate 2_do.
1386 if (un
->un_rs_resync_done
> 0) {
1387 BLK_TO_RR(start_rr
, un
->un_resync_startbl
, un
);
1388 for (rr
= 0; rr
< start_rr
&& rr
< resync_regions
; rr
++)
1389 CLR_KEEPDIRTY(rr
, un
);
1391 un
->un_rs_resync_2_do
= 0;
1392 for (rr
= 0; rr
< resync_regions
; rr
++)
1393 if (isset(dirtyregions
, rr
))
1394 un
->un_rs_resync_2_do
++;
1397 for (rr
= 0; (rr
< resync_regions
) && (broke_out
!= RESYNC_ERR
); rr
++) {
1398 if (isset(dirtyregions
, rr
)) {
1399 RR_TO_BLK(first
, rr
, un
);
1400 RR_TO_BLK(stopbefore
, rr
+1, un
);
1401 old_rs_type
= un
->un_rs_type
;
1402 old_rs_done
= un
->un_rs_resync_done
;
1403 err
= resync_read_blk_range(un
, first
, stopbefore
,
1404 MD_READER_HELD
, MD_SEND_MESS_XMIT
| flags1
);
1405 flags1
= MD_RESYNC_FLAG_ERR
;
1407 /* resync_read_blk_range releases/grabs a new lock */
1408 un
= (mm_unit_t
*)MD_UNIT(mnum
);
1411 broke_out
= RESYNC_ERR
;
1416 * Check that we are still performing an optimized
1417 * resync. If not, another node must have completed it
1418 * so we have no more work to do.
1420 if (un
->un_rs_type
!= old_rs_type
) {
1421 md_unit_readerexit(ui
);
1422 (void) md_unit_writerlock(ui
);
1427 * If resync_done has increased, we must have
1428 * blocked in resync_read_blk_range while another node
1429 * continued with the resync. Therefore clear resync_bm
1430 * for the blocks that have been resynced on another
1431 * node and update rr to the next RR to be done.
1433 if (old_rs_done
< un
->un_rs_resync_done
) {
1435 BLK_TO_RR(start_rr
, un
->un_resync_startbl
- 1,
1437 for (i
= rr
; i
< start_rr
; i
++)
1438 CLR_KEEPDIRTY(i
, un
);
1441 un
->un_rs_resync_done
++;
1443 for (smi
= 0, cnt
= 0; smi
< NMIRROR
; smi
++)
1444 if (SUBMIRROR_IS_WRITEABLE(un
, smi
) &&
1445 !(SMS_BY_INDEX_IS(un
, smi
, SMS_ALL_ERRED
)))
1448 broke_out
= RESYNC_ERR
;
1451 CLR_KEEPDIRTY(rr
, un
);
1452 /* Check to see if we've completed the resync cleanly */
1453 if (un
->un_rs_thread_flags
& MD_RI_SHUTDOWN
)
1457 * Check that we haven't exceeded un_rs_resync_2_do. If
1458 * we have we've completed the resync.
1460 if (un
->un_rs_resync_done
> un
->un_rs_resync_2_do
)
1464 md_unit_readerexit(ui
);
1465 un
= (mm_unit_t
*)md_unit_writerlock(ui
);
1468 * If MN set send message to all nodes to indicate resync
1469 * phase is complete. The processing of the message will update the
1472 if (MD_MNSET_SETNO(setno
)) {
1473 send_mn_resync_done_message(un
, broke_out
);
1477 un
->c
.un_status
&= ~MD_UN_WAR
;
1479 un
->c
.un_status
&= ~MD_UN_KEEP_DIRTY
;
1481 setno
= MD_UN2SET(un
);
1482 for (smi
= 0; smi
< NMIRROR
; smi
++) {
1483 un
->un_sm
[smi
].sm_flags
&= ~MD_SM_RESYNC_TARGET
;
1484 if (SMS_BY_INDEX_IS(un
, smi
, SMS_OFFLINE_RESYNC
)) {
1485 state
= (broke_out
? SMS_OFFLINE
: SMS_RUNNING
);
1486 mirror_set_sm_state(&un
->un_sm
[smi
],
1487 &un
->un_smic
[smi
], state
, broke_out
);
1488 mirror_commit(un
, NO_SUBMIRRORS
, 0);
1490 if (SMS_BY_INDEX_IS(un
, smi
, SMS_OFFLINE
))
1491 un
->c
.un_status
|= MD_UN_OFFLINE_SM
;
1495 /* For MN sets, resync NOTIFY is done when processing resync messages */
1496 if (!MD_MNSET_SETNO(setno
)) {
1498 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_FAILED
,
1499 SVM_TAG_METADEVICE
, MD_UN2SET(un
), MD_SID(un
));
1501 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_DONE
,
1502 SVM_TAG_METADEVICE
, MD_UN2SET(un
), MD_SID(un
));
1508 * recalc_resync_done
1510 * This function deals with a change in value of un_rs_resync_2_do in a
1511 * component resync. This may change if we are restarting a component
1512 * resync on a single node having rebooted with a different value of
1513 * md_resync_bufsz or if we are running in a multi-node with nodes having
1514 * different values of md_resync_bufsz.
1515 * If there is a change in un_rs_resync_2_do, we need to recalculate
1516 * the value of un_rs_resync_done given the new value for resync_2_do.
1517 * We have to calculate a new value for resync_done to be either
1518 * if un_resync_startbl is set, (un_resync_startbl - initblock)/(blksize + skip)
1519 * or if it is not set, we need to calculate it from un_rs_resync_done,
1520 * (un_rs_resync_done/un_rs_resync_2_do) * resync_2_do
1521 * In addition we need to deal with the overflow case by using a factor to
1526 recalc_resync_done(mm_unit_t
*un
, size_t resync_2_do
, diskaddr_t initblock
,
1527 u_longlong_t blk_size
, u_longlong_t skip
)
1533 * If resync_2_do has not yet been calculated, no need to modify
1536 if (un
->un_rs_resync_2_do
== 0) {
1539 if (un
->un_rs_resync_2_do
== resync_2_do
)
1540 return; /* No change, so nothing to do */
1542 * If un_rs_startbl is set, another node must have already started
1543 * this resync and hence we can calculate resync_done from
1546 if (un
->un_resync_startbl
) {
1547 un
->un_rs_resync_done
= (un
->un_resync_startbl
- initblock
) /
1552 * un_resync_startbl is not set so we must calculate it from
1553 * un_rs_resync_done.
1554 * If the larger of the two values of resync_2_do is greater than 32
1555 * bits, calculate a factor to divide by to ensure that we don't
1556 * overflow 64 bits when calculating the new value for resync_done
1558 x
= (un
->un_rs_resync_2_do
> resync_2_do
) ? un
->un_rs_resync_2_do
:
1560 while (x
> INT32_MAX
) {
1562 factor
= factor
<< 1;
1564 un
->un_rs_resync_done
= ((un
->un_rs_resync_done
/factor
) *
1565 (resync_2_do
/factor
)) /
1566 ((un
->un_rs_resync_2_do
+ (factor
* factor
) - 1)/
1571 check_comp_4_resync(mm_unit_t
*un
, int smi
, int ci
)
1576 mm_submirror_ic_t
*smic
;
1580 u_longlong_t blk_size
;
1581 diskaddr_t initblock
;
1583 diskaddr_t frag
= 0;
1584 md_m_shared_t
*shared
;
1589 uint_t old_rs_type
= un
->un_rs_type
;
1590 diskaddr_t old_rs_done
;
1591 uint_t flags1
= MD_FIRST_RESYNC_NEXT
;
1592 diskaddr_t resync_2_do
;
1595 ui
= MDI_UNIT(mnum
);
1596 sm
= &un
->un_sm
[smi
];
1597 smic
= &un
->un_smic
[smi
];
1598 setno
= MD_UN2SET(un
);
1600 shared
= (md_m_shared_t
*)(*(smic
->sm_shared_by_indx
))
1601 (sm
->sm_dev
, sm
, ci
);
1603 if (shared
->ms_state
!= CS_RESYNC
) {
1604 SET_RS_TYPE_NONE(un
->un_rs_type
);
1608 if (shared
->ms_flags
& MDM_S_RS_TRIED
) {
1609 SET_RS_TYPE_NONE(un
->un_rs_type
);
1613 (void) (*(smic
->sm_get_bcss
))
1614 (sm
->sm_dev
, sm
, ci
, &initblock
, &count
, &skip
, &size
);
1616 if ((count
== 1) && (skip
== 0)) {
1617 count
= (size_t)(size
/ un
->un_rs_copysize
);
1618 if ((frag
= (size
- (count
* un
->un_rs_copysize
))) != 0)
1620 size
= (u_longlong_t
)un
->un_rs_copysize
;
1622 blk_size
= size
; /* Save block size for this resync */
1625 resync_2_do
= count
;
1627 * If part way through a resync, un_rs_resync_done/un_rs_resync_2_do
1628 * gives the proportion of the resync that has already been done.
1629 * If un_rs_copysize has changed since this previous partial resync,
1630 * either because this node has been rebooted with a different value
1631 * for md_resync_bufsz or because another node with a different value
1632 * for md_resync_bufsz performed the previous resync, we need to
1633 * recalculate un_rs_resync_done as a proportion of our value of
1636 recalc_resync_done(un
, resync_2_do
, initblock
, blk_size
, skip
);
1639 * For MN mirrors we need to send a message to all nodes indicating
1640 * the next region to be resynced. For a component resync, the size of
1641 * the contiguous region that is processed by resync_read_blk_range()
1642 * may be small if there is the interleave size.
1643 * Therefore, rather than sending the message within
1644 * resync_read_blk_range(), we will send a message every
1645 * MD_DEF_RESYNC_BLK_SZ blocks. Calculate the frequency in terms of
1646 * the number of blocks. Then, if we are restarting a resync, round
1647 * un_rs_resync_done down to the previous resync region boundary. This
1648 * ensures that we send a RESYNC_NEXT message before resyncing any
1651 if (MD_MNSET_SETNO(setno
)) {
1652 blks
= ((MD_DEF_RESYNC_BLK_SZ
+ blk_size
+ skip
- 1)/
1654 un
->un_rs_resync_done
= (un
->un_rs_resync_done
/blks
) * blks
;
1657 * un_rs_resync_done is the number of ('size' + 'skip') increments
1658 * already resynced from the base 'block'
1659 * un_rs_resync_2_do is the number of iterations in
1660 * this component resync.
1662 ASSERT(count
>= un
->un_rs_resync_done
);
1663 un
->un_rs_resync_2_do
= (diskaddr_t
)count
;
1665 un
->c
.un_status
|= MD_UN_WAR
;
1666 sm
->sm_flags
|= MD_SM_RESYNC_TARGET
;
1667 md_unit_writerexit(ui
);
1669 /* For MN sets, resync NOTIFY is done when processing resync messages */
1670 if (!MD_MNSET_SETNO(setno
)) {
1671 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_START
,
1672 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
1674 un
= (mm_unit_t
*)md_unit_readerlock(ui
);
1676 /* check to see if we've been asked to terminate */
1677 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)), MD_READER_HELD
)) {
1678 if (un
->c
.un_status
& MD_UN_RESYNC_CANCEL
)
1679 broke_out
= RESYNC_ERR
;
1682 * Check that we are still performing the same component
1683 * resync. If not, another node must have completed it
1684 * so we have no more work to do.
1686 if (un
->un_rs_type
!= old_rs_type
) {
1687 md_unit_readerexit(ui
);
1688 (void) md_unit_writerlock(ui
);
1692 * Adjust resync_done, resync_2_do, start of resync area and count to
1693 * skip already resync'd data. We need to recalculate resync_done as
1694 * we have dropped the unit lock above and may have lost ownership to
1695 * another node, with a different resync buffer size and it may have
1696 * sent us new values of resync_done and resync_2_do based on its
1697 * resync buffer size
1699 recalc_resync_done(un
, resync_2_do
, initblock
, blk_size
, skip
);
1700 un
->un_rs_resync_2_do
= resync_2_do
;
1701 count
-= un
->un_rs_resync_done
;
1702 block
= initblock
+ ((blk_size
+ skip
) * (int)un
->un_rs_resync_done
);
1704 un
->un_rs_dropped_lock
= 1;
1705 while ((count
> 0) && (broke_out
!= RESYNC_ERR
)) {
1706 old_rs_done
= un
->un_rs_resync_done
;
1708 * For MN mirrors send a message to the other nodes. This
1709 * message includes the size of the region that must be blocked
1712 if (MD_MNSET_SETNO(setno
)) {
1713 if ((un
->un_rs_resync_done
%blks
== 0)) {
1714 un
->un_resync_startbl
= block
;
1715 send_mn_resync_next_message(un
, block
,
1716 (blk_size
+skip
)*blks
, flags1
);
1719 * check to see if we've been asked to
1722 if (resync_kill_pending(un
,
1723 MDI_UNIT(MD_SID(un
)), MD_READER_HELD
)) {
1724 if (un
->c
.un_status
&
1725 MD_UN_RESYNC_CANCEL
) {
1726 broke_out
= RESYNC_ERR
;
1732 * Check that we are still performing the same
1733 * component resync. If not, another node must
1734 * have completed it so we have no more work to
1735 * do. Also reset count to remaining resync as
1736 * we may have lost ownership in in
1737 * send_mn_resync_next_message while another
1738 * node continued with the resync and
1739 * incremented resync_done.
1741 if (un
->un_rs_type
!= old_rs_type
) {
1742 md_unit_readerexit(ui
);
1743 (void) md_unit_writerlock(ui
);
1747 * recalculate resync_done, resync_2_do
1748 * We need to recalculate resync_done as
1749 * we have dropped the unit lock in
1750 * send_mn_resync_next_message above and may
1751 * have lost ownership to another node, with a
1752 * different resync buffer size and it may have
1753 * sent us new values of resync_done and
1754 * resync_2_do based on its resync buffer size
1756 recalc_resync_done(un
, resync_2_do
, initblock
,
1758 un
->un_rs_resync_2_do
= resync_2_do
;
1759 count
= un
->un_rs_resync_2_do
-
1760 un
->un_rs_resync_done
;
1762 * Adjust start of resync area to skip already
1765 block
= initblock
+ ((blk_size
+ skip
) *
1766 (int)un
->un_rs_resync_done
);
1767 old_rs_done
= un
->un_rs_resync_done
;
1770 err
= resync_read_blk_range(un
, block
, block
+ size
,
1771 MD_READER_HELD
, MD_RESYNC_FLAG_ERR
);
1773 /* resync_read_blk_range releases/grabs a new lock */
1774 un
= (mm_unit_t
*)MD_UNIT(mnum
);
1777 broke_out
= RESYNC_ERR
;
1781 * If we are no longer resyncing this component, return as
1782 * another node has progressed the resync.
1784 if (un
->un_rs_type
!= old_rs_type
) {
1785 md_unit_readerexit(ui
);
1786 (void) md_unit_writerlock(ui
);
1791 * recalculate resync_done, resync_2_do. We need to recalculate
1792 * resync_done as we have dropped the unit lock in
1793 * resync_read_blk_range above and may have lost ownership to
1794 * another node, with a different resync buffer size and it may
1795 * have sent us new values of resync_done and resync_2_do based
1796 * on its resync buffer size
1798 recalc_resync_done(un
, resync_2_do
, initblock
, blk_size
, skip
);
1799 un
->un_rs_resync_2_do
= resync_2_do
;
1802 * Reset count to remaining resync as we may have blocked in
1803 * resync_read_blk_range while another node continued
1804 * with the resync and incremented resync_done. Also adjust
1805 * start of resync area to skip already resync'd data.
1807 count
= un
->un_rs_resync_2_do
- un
->un_rs_resync_done
;
1808 block
= initblock
+((blk_size
+ skip
) *
1809 (int)un
->un_rs_resync_done
);
1812 * If we are picking up from another node, we retry the last
1813 * block otherwise step on to the next block
1815 if (old_rs_done
== un
->un_rs_resync_done
) {
1816 block
+= blk_size
+ skip
;
1817 un
->un_rs_resync_done
++;
1821 if ((count
== 1) && frag
)
1823 if (shared
->ms_state
== CS_ERRED
) {
1825 broke_out
= RESYNC_ERR
;
1829 /* Check to see if we've completed the resync cleanly */
1830 if (un
->un_rs_thread_flags
& MD_RI_SHUTDOWN
)
1834 md_unit_readerexit(ui
);
1835 un
= (mm_unit_t
*)md_unit_writerlock(ui
);
1838 * If MN set send message to all nodes to indicate resync
1839 * phase is complete. The processing of the message will update the
1842 if (MD_MNSET_SETNO(setno
)) {
1843 send_mn_resync_done_message(un
, broke_out
);
1845 un
->c
.un_status
&= ~MD_UN_WAR
;
1846 sm
->sm_flags
&= ~MD_SM_RESYNC_TARGET
;
1849 shared
->ms_flags
|= MDM_S_RS_TRIED
;
1852 * As we don't transmit the changes,
1853 * no need to drop the lock.
1855 set_sm_comp_state(un
, smi
, ci
, CS_OKAY
, 0,
1856 MD_STATE_NO_XMIT
, (IOLOCK
*)NULL
);
1859 /* For MN sets, resync NOTIFY is done when processing resync messages */
1860 if (!MD_MNSET_SETNO(setno
)) {
1862 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_FAILED
,
1863 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
1865 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_DONE
,
1866 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
1868 SET_RS_TYPE_NONE(un
->un_rs_type
);
1873 submirror_resync(mm_unit_t
*un
)
1878 mm_submirror_ic_t
*smic
;
1887 int flags1
= MD_FIRST_RESYNC_NEXT
;
1891 ui
= MDI_UNIT(mnum
);
1892 setno
= MD_UN2SET(un
);
1895 * If the submirror_index is non-zero, we are continuing a resync
1896 * so restart resync from last submirror marked as being resynced.
1898 if (RS_SMI(un
->un_rs_type
) != 0) {
1899 smi
= RS_SMI(un
->un_rs_type
);
1900 sm
= &un
->un_sm
[smi
];
1901 smic
= &un
->un_smic
[smi
];
1902 if (!SMS_IS(sm
, SMS_ATTACHED_RESYNC
)) {
1903 for (smi
= 0; smi
< NMIRROR
; smi
++) {
1904 sm
= &un
->un_sm
[smi
];
1905 smic
= &un
->un_smic
[smi
];
1906 if (SMS_IS(sm
, SMS_ATTACHED_RESYNC
))
1911 for (smi
= 0; smi
< NMIRROR
; smi
++) {
1912 sm
= &un
->un_sm
[smi
];
1913 smic
= &un
->un_smic
[smi
];
1914 if (SMS_IS(sm
, SMS_ATTACHED_RESYNC
))
1918 if (smi
== NMIRROR
) {
1919 SET_RS_TYPE_NONE(un
->un_rs_type
);
1924 * If we've only got one component we can fail on a resync write
1925 * if an error is encountered. This stops an unnecessary read of the
1926 * whole mirror on a target write error.
1928 compcnt
= (*(smic
->sm_get_component_count
))(sm
->sm_dev
, sm
);
1930 flags1
|= MD_RESYNC_FLAG_ERR
;
1932 un
->c
.un_status
|= MD_UN_WAR
;
1933 sm
->sm_flags
|= MD_SM_RESYNC_TARGET
;
1934 SET_RS_SMI(un
->un_rs_type
, smi
);
1935 md_unit_writerexit(ui
);
1937 /* For MN sets, resync NOTIFY is done when processing resync messages */
1938 if (!MD_MNSET_SETNO(setno
)) {
1939 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_START
,
1940 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
1942 un
= (mm_unit_t
*)md_unit_readerlock(ui
);
1944 un
->un_rs_dropped_lock
= 1;
1946 /* check to see if we've been asked to terminate */
1947 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)), MD_READER_HELD
)) {
1948 if (un
->c
.un_status
& MD_UN_RESYNC_CANCEL
)
1949 broke_out
= RESYNC_ERR
;
1952 * Check that we are still performing the same submirror
1953 * resync. If not, another node must have completed it
1954 * so we have no more work to do.
1956 if (RS_TYPE(un
->un_rs_type
) != MD_RS_SUBMIRROR
) {
1957 md_unit_readerexit(ui
);
1958 (void) md_unit_writerlock(ui
);
1962 /* if > 1TB mirror, increase percent done granularity */
1963 if (un
->c
.un_total_blocks
> MD_MAX_BLKS_FOR_SMALL_DEVS
)
1964 chunk
= un
->c
.un_total_blocks
/ 1000;
1966 chunk
= un
->c
.un_total_blocks
/ 100;
1968 chunk
= un
->c
.un_total_blocks
;
1970 * If a MN set, round the chunk size up to a multiple of
1971 * MD_DEF_RESYNC_BLK_SZ
1973 if (MD_MNSET_SETNO(setno
)) {
1974 chunk
= ((chunk
+ MD_DEF_RESYNC_BLK_SZ
)/MD_DEF_RESYNC_BLK_SZ
)
1975 * MD_DEF_RESYNC_BLK_SZ
;
1976 if (chunk
> un
->c
.un_total_blocks
)
1977 chunk
= un
->c
.un_total_blocks
;
1980 * Handle restartable resyncs that continue from where the previous
1981 * resync left off. The new resync range is from un_rs_resync_done ..
1985 if (un
->un_rs_resync_done
== 0) {
1986 un
->un_rs_resync_2_do
= un
->c
.un_total_blocks
;
1988 curblk
= un
->un_rs_resync_done
;
1990 while ((curblk
!= un
->c
.un_total_blocks
) && (broke_out
!= RESYNC_ERR
)) {
1993 rs_done
= un
->un_rs_resync_done
;
1994 err
= resync_read_blk_range(un
, curblk
, curblk
+ chunk
,
1995 MD_READER_HELD
, MD_SEND_MESS_XMIT
| flags1
);
1996 flags1
= (compcnt
== 1 ? MD_RESYNC_FLAG_ERR
: 0);
1998 /* resync_read_blk_range releases/grabs a new lock */
1999 un
= (mm_unit_t
*)MD_UNIT(mnum
);
2002 broke_out
= RESYNC_ERR
;
2007 * If we are no longer executing a submirror resync, return
2008 * as another node has completed the submirror resync.
2010 if (RS_TYPE(un
->un_rs_type
) != MD_RS_SUBMIRROR
) {
2011 md_unit_readerexit(ui
);
2012 (void) md_unit_writerlock(ui
);
2016 * If resync_done has changed, we must have blocked
2017 * in resync_read_blk_range while another node
2018 * continued with the resync so restart from resync_done.
2020 if (rs_done
!= un
->un_rs_resync_done
) {
2021 curblk
= un
->un_rs_resync_done
;
2024 un
->un_rs_resync_done
= curblk
;
2027 if ((curblk
+ chunk
) > un
->c
.un_total_blocks
)
2028 chunk
= un
->c
.un_total_blocks
- curblk
;
2029 for (i
= 0, cnt
= 0; i
< NMIRROR
; i
++)
2030 if (SUBMIRROR_IS_WRITEABLE(un
, i
) &&
2031 !SMS_BY_INDEX_IS(un
, i
, SMS_ALL_ERRED
) &&
2032 (un
->un_sm
[i
].sm_flags
& MD_SM_RESYNC_TARGET
))
2035 broke_out
= RESYNC_ERR
;
2039 /* Check to see if we've completed the resync cleanly */
2040 if (un
->un_rs_thread_flags
& MD_RI_SHUTDOWN
)
2043 md_unit_readerexit(ui
);
2044 un
= (mm_unit_t
*)md_unit_writerlock(ui
);
2047 * If MN set send message to all nodes to indicate resync
2048 * phase is complete. The processing of the message will update the
2051 if (MD_MNSET_SETNO(setno
)) {
2052 send_mn_resync_done_message(un
, broke_out
);
2054 sm
->sm_flags
&= ~MD_SM_RESYNC_TARGET
;
2056 mirror_set_sm_state(sm
, smic
, SMS_ATTACHED
, 1);
2058 mirror_set_sm_state(sm
, smic
, SMS_RUNNING
, 0);
2060 un
->c
.un_status
&= ~MD_UN_WAR
;
2061 mirror_commit(un
, SMI2BIT(smi
), 0);
2064 /* For MN sets, resync NOTIFY is done when processing resync messages */
2065 if (!MD_MNSET_SETNO(setno
)) {
2067 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_FAILED
,
2068 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
2070 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_DONE
,
2071 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
2077 component_resync(mm_unit_t
*un
)
2080 mm_submirror_ic_t
*smic
;
2086 * Handle the case where we are picking up a partially complete
2087 * component resync. In this case un_rs_type contains the submirror
2088 * and component index of where we should restart the resync.
2090 while (un
->un_rs_type
!= MD_RS_COMPONENT
) {
2091 i
= RS_SMI(un
->un_rs_type
);
2092 ci
= RS_CI(un
->un_rs_type
);
2093 check_comp_4_resync(un
, i
, ci
);
2094 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)),
2098 * If we have no current resync, contine to scan submirror and
2099 * components. If the resync has moved on to another component,
2100 * restart it and if the resync is no longer a component
2103 if (RS_TYPE(un
->un_rs_type
) == MD_RS_NONE
)
2105 if (RS_TYPE(un
->un_rs_type
) != MD_RS_COMPONENT
)
2108 /* Now continue scanning _all_ submirrors and components */
2109 for (i
= 0; i
< NMIRROR
; i
++) {
2111 smic
= &un
->un_smic
[i
];
2112 if (!SMS_IS(sm
, SMS_RUNNING
| SMS_LIMPING
))
2114 compcnt
= (*(smic
->sm_get_component_count
))(sm
->sm_dev
, sm
);
2115 for (ci
= 0; ci
< compcnt
; ci
++) {
2116 SET_RS_SMI(un
->un_rs_type
, i
);
2117 SET_RS_CI(un
->un_rs_type
, ci
);
2118 SET_RS_TYPE(un
->un_rs_type
, MD_RS_COMPONENT
);
2119 check_comp_4_resync(un
, i
, ci
);
2120 /* Bail out if we've been asked to abort/shutdown */
2121 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)),
2125 * Now check if another node has continued with the
2126 * resync, if we are no longer in component resync,
2127 * exit, otherwise update to the current component - 1
2128 * so that the next call of check_comp_4 resync() will
2129 * resync the current component.
2131 if ((RS_TYPE(un
->un_rs_type
) != MD_RS_NONE
) &&
2132 (RS_TYPE(un
->un_rs_type
) != MD_RS_COMPONENT
))
2135 if (RS_SMI(un
->un_rs_type
) != i
) {
2136 i
= RS_SMI(un
->un_rs_type
);
2137 ci
= RS_CI(un
->un_rs_type
) - 1;
2138 } else if (RS_CI(un
->un_rs_type
) != ci
)
2139 ci
= RS_CI(un
->un_rs_type
) - 1;
2146 reset_comp_flags(mm_unit_t
*un
)
2149 mm_submirror_ic_t
*smic
;
2150 md_m_shared_t
*shared
;
2155 for (i
= 0; i
< NMIRROR
; i
++) {
2157 smic
= &un
->un_smic
[i
];
2158 if (!SMS_IS(sm
, SMS_INUSE
))
2160 compcnt
= (*(smic
->sm_get_component_count
))(sm
->sm_dev
, sm
);
2161 for (ci
= 0; ci
< compcnt
; ci
++) {
2162 shared
= (md_m_shared_t
*)(*(smic
->sm_shared_by_indx
))
2163 (sm
->sm_dev
, sm
, ci
);
2164 shared
->ms_flags
&= ~MDM_S_RS_TRIED
;
2170 * resync_progress_thread:
2171 * ----------------------
2172 * Thread started on first resync of a unit which simply blocks until woken up
2173 * by a cv_signal, and then updates the mddb for the mirror unit record. This
2174 * saves the resync progress information (un_rs_resync_done, un_rs_resync_2_do)
2175 * so that an aborted resync can be continued after an intervening reboot.
2178 resync_progress_thread(minor_t mnum
)
2180 mm_unit_t
*un
= MD_UNIT(mnum
);
2181 mdi_unit_t
*ui
= MDI_UNIT(mnum
);
2182 set_t setno
= MD_MIN2SET(mnum
);
2184 while (un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
) {
2185 mutex_enter(&un
->un_rs_progress_mx
);
2186 cv_wait(&un
->un_rs_progress_cv
, &un
->un_rs_progress_mx
);
2187 mutex_exit(&un
->un_rs_progress_mx
);
2188 if (un
->un_rs_progress_flags
& MD_RI_KILL
)
2192 * Commit mirror unit if we're the Master node in a multi-node
2195 if (MD_MNSET_SETNO(setno
) && md_set
[setno
].s_am_i_master
) {
2196 (void) md_unit_readerlock(ui
);
2197 mirror_commit(un
, NO_SUBMIRRORS
, 0);
2198 md_unit_readerexit(ui
);
2207 * Timeout handler for updating the progress of the resync thread.
2208 * Simply wake up the resync progress daemon which will then mirror_commit() the
2209 * unit structure to the mddb. This snapshots the current progress of the resync
2212 resync_progress(void *arg
)
2214 mm_unit_t
*un
= (mm_unit_t
*)arg
;
2215 mdi_unit_t
*ui
= MDI_UNIT(MD_SID(un
));
2218 mutex_enter(&un
->un_rs_progress_mx
);
2219 cv_signal(&un
->un_rs_progress_cv
);
2220 mutex_exit(&un
->un_rs_progress_mx
);
2222 /* schedule the next timeout if the resync is still marked active */
2223 (void) md_unit_readerlock(ui
);
2224 active
= un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
? 1 : 0;
2225 md_unit_readerexit(ui
);
2227 un
->un_rs_resync_to_id
= timeout(resync_progress
, un
,
2228 (clock_t)(drv_usectohz(60000000) *
2229 md_mirror_resync_update_intvl
));
2236 * Resync thread which drives all forms of resync (optimized, component,
2237 * submirror). Must handle thread suspension and kill to allow multi-node
2238 * resync to run without undue ownership changes.
2240 * For a MN set, the reync mechanism is as follows:
2242 * When a resync is started, either via metattach, metaonline, metareplace,
2243 * metasync or by a hotspare kicking in, a message is sent to all nodes, which
2244 * calls mirror_resync_thread. If there is currently no mirror owner, the
2245 * master node sends a CHOOSE_OWNER message to the handler on the master. This
2246 * chooses a mirror owner and sends a CHANGE_OWNER message requesting the
2247 * selected node to become the owner.
2248 * If this node is not the owner it sets itself to block in resync_kill_pending
2249 * and if there is no owner all nodes will block until the chosen owner is
2250 * selected, in which case it will unblock itself. So, on entry to this
2251 * function only one node will continue past resync_kill_pending().
2252 * Once the resync thread is started, it basically cycles through the optimized,
2253 * component and submirrors resyncs until there is no more work to do.
2255 * For an ABR mirror, once a mirror owner is chosen it will complete the resync
2256 * unless the nodes dies in which case a new owner will be chosen and it will
2257 * have to complete the resync from the point at which the previous owner died.
2258 * To do this we broadcast a RESYNC_NEXT message before each region to be
2259 * resynced and this message contains the address and length of the region
2260 * being resynced and the current progress through the resync. The size of
2261 * this region is MD_DEF_RESYNC_BLK_SZ blocks. It is larger than the resync
2262 * block size to limit the amount of inter node traffic. The RESYNC_NEXT
2263 * message also indicates to all other nodes that all writes to this block
2264 * must be blocked until the next RESYNC_NEXT message is received. This ensures
2265 * that no node can write to a block that is being resynced. For all MN
2266 * mirrors we also block the whole resync region on the resync owner node so
2267 * that all writes to the resync region are blocked on all nodes. There is a
2268 * difference here between a MN set and a regular set in that for a MN set
2269 * we protect the mirror from writes to the current resync block by blocking
2270 * a larger region. For a regular set we just block writes to the current
2273 * For a non-ABR mirror the same RESYNC_NEXT message is sent with an
2274 * additional purpose. In this case, there is only one mirror owner at a time
2275 * and rather than continually switching ownership between the chosen mirror
2276 * owner and the node that is writing to the mirror, we move the resync to the
2277 * mirror owner. When we swich ownership, we block the old owner and unblock
2278 * the resync thread on the new owner. To enable the new owner to continue the
2279 * resync, all nodes need to have the latest resync status, Then, following each
2280 * resync write, we check to see if the resync state has changed and if it
2281 * has this must be because we have lost ownership to another node(s) for a
2282 * period and then have become owner again later in the resync process. If we
2283 * are still dealing with the same resync, we just adjust addresses and counts
2284 * and then continue. If the resync has moved on to a different type, for
2285 * example from an optimized to a submirror resync, we move on to process the
2286 * resync described by rs_type and continue from the position described by
2287 * resync_done and resync_startbl.
2289 * Note that for non-ABR mirrors it is possible for a write to be made on a
2290 * non resync-owner node without a change of ownership. This is the case when
2291 * the mirror has a soft part created on it and a write in ABR mode is made
2292 * to that soft part. Therefore we still need to block writes to the resync
2293 * region on all nodes.
2295 * Sending the latest resync state to all nodes also enables them to continue
2296 * a resync in the event that the mirror owner dies. If a mirror owner for
2297 * a non-ABR mirror has died, there will be dirty resync regions. Therefore,
2298 * regardless of whether another type of resync was in progress, we must first
2299 * do an optimized resync to clean up the dirty regions before continuing
2300 * with the interrupted resync.
2302 * The resync status is held in the unit structure
2304 * un_rs_resync_done The number of contiguous resync blocks done so far
2305 * un_rs_resync_2_do The total number of contiguous resync blocks
2306 * un_rs_type The resync type (inc submirror and component numbers)
2308 * un_resync_startbl The address of the current resync block being processed
2310 * In the event that the whole cluster fails we need to just use
2311 * un_rs_resync_done to restart the resync and to ensure that this is
2312 * periodically written to disk, we have a thread which writes the record
2313 * to disk every 5 minutes. As the granularity of un_rs_resync_done is
2314 * usually coarse ( for an optimized resync 1001 is the max value) there is
2315 * little point in writing this more frequently.
2318 resync_unit(minor_t mnum
)
2322 md_error_t mde
= mdnullerror
;
2324 int resync_finish
= 0;
2325 set_t setno
= MD_MIN2SET(mnum
);
2326 uint_t old_rs_type
= MD_RS_NONE
;
2327 uint_t old_rs_done
= 0, old_rs_2_do
= 0;
2328 uint_t old_rs_startbl
= 0;
2329 int block_resync
= 1;
2330 char cpr_name
[23]; /* Unique CPR name */
2336 if (mirror_debug_flag
)
2337 printf("Resync started (mnum = %x)\n", mnum
);
2340 * increment the mirror resync count
2342 mutex_enter(&md_cpr_resync
.md_resync_mutex
);
2343 md_cpr_resync
.md_mirror_resync
++;
2344 mutex_exit(&md_cpr_resync
.md_resync_mutex
);
2346 ui
= MDI_UNIT(mnum
);
2349 rs_copysize
= un
->un_rs_copysize
;
2350 if (rs_copysize
== 0) {
2352 * Don't allow buffer size to fall outside the
2353 * range 0 < bufsize <= md_max_xfer_bufsz.
2355 if (md_resync_bufsz
<= 0)
2356 md_resync_bufsz
= MD_DEF_RESYNC_BUF_SIZE
;
2357 rs_copysize
= MIN(md_resync_bufsz
, md_max_xfer_bufsz
);
2359 rs_buffer
= kmem_zalloc(dbtob(rs_copysize
), KM_SLEEP
);
2360 un
= md_unit_writerlock(ui
);
2361 un
->un_rs_copysize
= rs_copysize
;
2362 un
->un_rs_buffer
= rs_buffer
;
2364 if (MD_MNSET_SETNO(setno
)) {
2366 * Register this resync thread with the CPR mechanism. This
2367 * allows us to detect when the system is suspended and so
2368 * keep track of the RPC failure condition.
2370 (void) snprintf(cpr_name
, sizeof (cpr_name
),
2371 "mirror_resync%x", mnum
);
2372 CALLB_CPR_INIT(&un
->un_rs_cprinfo
, &un
->un_rs_cpr_mx
,
2373 callb_md_mrs_cpr
, cpr_name
);
2375 if (ui
->ui_tstate
& MD_RESYNC_NOT_DONE
) {
2377 * If this is the first resync following the initial
2378 * snarf (MD_RESYNC_NOT_DONE still set) and we've
2379 * been started outside a reconfig step (e.g. by being
2380 * added to an existing set) we need to query the
2381 * existing submirror state for this mirror.
2382 * The set_status flags will have MD_MN_SET_MIR_STATE_RC
2383 * set if we've been through a step4 reconfig, so only
2384 * query the master if this isn't (yet) set. In this
2385 * case we must continue the resync thread as there is
2386 * not guaranteed to be a currently running resync on
2387 * any of the other nodes. Worst case is that we will
2388 * initiate an ownership change to this node and then
2389 * find that there is no resync to perform. However, we
2390 * will then have correct status across the cluster.
2392 if (!md_set
[setno
].s_am_i_master
) {
2393 if (!(md_get_setstatus(setno
) &
2394 MD_SET_MN_MIR_STATE_RC
)) {
2395 mirror_get_status(un
, NULL
);
2398 if (mirror_debug_flag
) {
2401 for (i
= 0; i
< NMIRROR
; i
++) {
2413 ui
->ui_tstate
&= ~MD_RESYNC_NOT_DONE
;
2416 * For MN set, if we have an owner, then start the resync on it.
2417 * If there is no owner the master must send a message to
2418 * choose the owner. This message will contain the current
2419 * resync count and it will only be sent to the master, where
2420 * the resync count will be used to choose the next node to
2421 * perform a resync, by cycling through the nodes in the set.
2422 * The message handler will then send a CHANGE_OWNER message to
2423 * all nodes, and on receipt of that message, the chosen owner
2424 * will issue a SET_OWNER ioctl to become the owner. This ioctl
2425 * will be requested to spawn a thread to issue the
2426 * REQUEST_OWNER message to become the owner which avoids the
2427 * need for concurrent ioctl requests.
2428 * After sending the message, we will block waiting for one
2429 * of the nodes to become the owner and start the resync
2431 if (MD_MN_NO_MIRROR_OWNER(un
)) {
2433 * There is no owner, block and then the master will
2434 * choose the owner. Only perform this if 'block_resync'
2438 mutex_enter(&un
->un_rs_thread_mx
);
2439 un
->un_rs_thread_flags
|= MD_RI_BLOCK_OWNER
;
2440 mutex_exit(&un
->un_rs_thread_mx
);
2442 if (md_set
[setno
].s_am_i_master
) {
2443 md_unit_writerexit(ui
);
2444 (void) mirror_choose_owner(un
, NULL
);
2445 (void) md_unit_writerlock(ui
);
2448 /* There is an owner, block if we are not it */
2449 if (!MD_MN_MIRROR_OWNER(un
)) {
2450 mutex_enter(&un
->un_rs_thread_mx
);
2451 un
->un_rs_thread_flags
|= MD_RI_BLOCK_OWNER
;
2452 mutex_exit(&un
->un_rs_thread_mx
);
2457 * Start a timeout chain to update the resync progress to the mddb.
2458 * This will run every md_mirror_resync_update_intvl minutes and allows
2459 * a resync to be continued over a reboot.
2461 ASSERT(un
->un_rs_resync_to_id
== 0);
2462 un
->un_rs_resync_to_id
= timeout(resync_progress
, un
,
2463 (clock_t)(drv_usectohz(60000000) * md_mirror_resync_update_intvl
));
2466 * Handle resync restart from the last logged position. The contents
2467 * of un_rs_resync_2_do and un_rs_resync_done are dependent on the
2468 * type of resync that was in progress.
2470 if (MD_MNSET_SETNO(setno
)) {
2471 switch ((uint_t
)RS_TYPE(un
->un_rs_type
)) {
2473 case MD_RS_OPTIMIZED
:
2474 case MD_RS_COMPONENT
:
2475 case MD_RS_SUBMIRROR
:
2479 un
->un_rs_type
= MD_RS_NONE
;
2481 /* Allocate a resync message, if required */
2482 if (un
->un_rs_msg
== NULL
) {
2483 un
->un_rs_msg
= (md_mn_msg_resync_t
*)kmem_zalloc(
2484 sizeof (md_mn_msg_resync_t
), KM_SLEEP
);
2489 /* Check to see if we've been requested to block/kill */
2490 if (resync_kill_pending(un
, ui
, MD_WRITER_HELD
)) {
2495 un
->un_rs_dropped_lock
= 0;
2497 * Always perform an optimized resync first as this will bring
2498 * the mirror into an available state in the shortest time.
2499 * If we are resuming an interrupted resync, other than an
2500 * optimized resync, we save the type and amount done so that
2501 * we can resume the appropriate resync after the optimized
2502 * resync has completed.
2504 if ((RS_TYPE(un
->un_rs_type
) != MD_RS_NONE
) &&
2505 (RS_TYPE(un
->un_rs_type
) != MD_RS_OPTIMIZED
)) {
2506 old_rs_type
= un
->un_rs_type
;
2507 old_rs_done
= un
->un_rs_resync_done
;
2508 old_rs_2_do
= un
->un_rs_resync_2_do
;
2509 old_rs_startbl
= un
->un_resync_startbl
;
2511 SET_RS_TYPE(un
->un_rs_type
, MD_RS_OPTIMIZED
);
2513 * If we are continuing a resync that is not an
2514 * OPTIMIZED one, then we start from the beginning when
2515 * doing this optimized resync
2517 if (RS_TYPE(old_rs_type
) != MD_RS_OPTIMIZED
) {
2518 un
->un_rs_resync_done
= 0;
2519 un
->un_rs_resync_2_do
= 0;
2520 un
->un_resync_startbl
= 0;
2522 optimized_resync(un
);
2523 /* Check to see if we've been requested to block/kill */
2524 if (resync_kill_pending(un
, ui
, MD_WRITER_HELD
)) {
2527 un
= (mm_unit_t
*)MD_UNIT(mnum
);
2529 * If another node has moved the resync on, we must
2530 * restart the correct resync
2533 (RS_TYPE(un
->un_rs_type
) != MD_RS_NONE
)) {
2534 old_rs_type
= un
->un_rs_type
;
2535 old_rs_done
= un
->un_rs_resync_done
;
2536 old_rs_2_do
= un
->un_rs_resync_2_do
;
2537 old_rs_startbl
= un
->un_resync_startbl
;
2541 * Restore previous resync progress or move onto a
2544 if (RS_TYPE(old_rs_type
) != MD_RS_NONE
) {
2545 un
->un_rs_type
= old_rs_type
;
2546 un
->un_rs_resync_done
= old_rs_done
;
2547 un
->un_rs_resync_2_do
= old_rs_2_do
;
2548 un
->un_resync_startbl
= old_rs_startbl
;
2550 un
->un_rs_type
= MD_RS_COMPONENT
;
2551 un
->un_rs_resync_done
= 0;
2552 un
->un_rs_resync_2_do
= 0;
2553 un
->un_resync_startbl
= 0;
2556 if (RS_TYPE(un
->un_rs_type
) == MD_RS_COMPONENT
) {
2557 component_resync(un
);
2558 /* Check to see if we've been requested to block/kill */
2559 if (resync_kill_pending(un
, ui
, MD_WRITER_HELD
)) {
2562 un
= (mm_unit_t
*)MD_UNIT(mnum
);
2564 * If we have moved on from a component resync, another
2565 * node must have completed it and started a submirror
2566 * resync, so leave the resync state alone. For non
2567 * multi-node sets we move onto the submirror resync.
2570 if (RS_TYPE(un
->un_rs_type
) == MD_RS_NONE
) {
2571 un
->un_rs_type
= MD_RS_SUBMIRROR
;
2572 un
->un_rs_resync_done
=
2573 un
->un_rs_resync_2_do
= 0;
2574 un
->un_resync_startbl
= 0;
2577 un
->un_rs_type
= MD_RS_SUBMIRROR
;
2578 un
->un_rs_resync_done
= 0;
2579 un
->un_rs_resync_2_do
= 0;
2580 un
->un_resync_startbl
= 0;
2583 if (RS_TYPE(un
->un_rs_type
) == MD_RS_SUBMIRROR
) {
2584 submirror_resync(un
);
2585 /* Check to see if we've been requested to block/kill */
2586 if (resync_kill_pending(un
, ui
, MD_WRITER_HELD
)) {
2589 un
= (mm_unit_t
*)MD_UNIT(mnum
);
2591 * If we have moved on from a submirror resync, another
2592 * node must have completed it and started a different
2593 * resync, so leave the resync state alone
2596 if (RS_TYPE(un
->un_rs_type
) == MD_RS_NONE
) {
2597 un
->un_rs_resync_done
=
2598 un
->un_rs_resync_2_do
= 0;
2599 un
->un_resync_startbl
= 0;
2602 /* If non-MN mirror, reinitialize state */
2603 un
->un_rs_type
= MD_RS_NONE
;
2604 un
->un_rs_resync_done
= 0;
2605 un
->un_rs_resync_2_do
= 0;
2606 un
->un_resync_startbl
= 0;
2609 } while (un
->un_rs_dropped_lock
);
2610 mutex_enter(&un
->un_rs_thread_mx
);
2611 un
->un_rs_thread_flags
|= MD_RI_SHUTDOWN
;
2612 mutex_exit(&un
->un_rs_thread_mx
);
2617 if (mirror_debug_flag
)
2618 printf("Resync stopped (mnum = %x), resync_finish = %d\n",
2619 mnum
, resync_finish
);
2621 kmem_free(un
->un_rs_buffer
, dbtob(un
->un_rs_copysize
));
2623 mutex_enter(&un
->un_rs_progress_mx
);
2624 un
->un_rs_progress_flags
|= MD_RI_KILL
;
2625 cv_signal(&un
->un_rs_progress_cv
);
2626 mutex_exit(&un
->un_rs_progress_mx
);
2629 * For MN Set, send a RESYNC_FINISH if this node completed the resync.
2630 * There is no need to grow unit here, it will be done in the
2631 * handler for the RESYNC_FINISH message together with resetting
2632 * MD_UN_RESYNC_ACTIVE.
2635 if (resync_finish
) {
2637 * Normal resync completion. Issue a RESYNC_FINISH
2638 * message if we're part of a multi-node set.
2640 md_mn_kresult_t
*kres
;
2641 md_mn_msg_resync_t
*rmsg
;
2644 rmsg
= (md_mn_msg_resync_t
*)un
->un_rs_msg
;
2645 md_unit_writerexit(ui
);
2647 rmsg
->msg_resync_mnum
= mnum
;
2648 rmsg
->msg_resync_type
= 0;
2649 rmsg
->msg_resync_done
= 0;
2650 rmsg
->msg_resync_2_do
= 0;
2651 rmsg
->msg_originator
= md_mn_mynode_id
;
2653 kres
= kmem_alloc(sizeof (md_mn_kresult_t
), KM_SLEEP
);
2655 mutex_enter(&un
->un_rs_cpr_mx
);
2656 CALLB_CPR_SAFE_BEGIN(&un
->un_rs_cprinfo
);
2658 rval
= mdmn_ksend_message(setno
,
2659 MD_MN_MSG_RESYNC_FINISH
, MD_MSGF_NO_LOG
, 0,
2660 (char *)rmsg
, sizeof (md_mn_msg_resync_t
), kres
);
2662 CALLB_CPR_SAFE_END(&un
->un_rs_cprinfo
,
2664 mutex_exit(&un
->un_rs_cpr_mx
);
2666 if (!MDMN_KSEND_MSG_OK(rval
, kres
)) {
2667 mdmn_ksend_show_error(rval
, kres
,
2669 /* If we're shutting down, pause things here. */
2670 if (kres
->kmmr_comm_state
== MDMNE_RPC_FAIL
) {
2671 while (!md_mn_is_commd_present()) {
2676 "ksend_message failure: RESYNC_FINISH");
2678 kmem_free(kres
, sizeof (md_mn_kresult_t
));
2679 (void) md_unit_writerlock(ui
);
2682 * If the resync has been cancelled, clear flags, reset owner
2683 * for ABR mirror and release the resync region parent
2686 if (un
->c
.un_status
& MD_UN_RESYNC_CANCEL
) {
2689 if (ui
->ui_tstate
& MD_ABR_CAP
) {
2690 /* Resync finished, if ABR set owner to NULL */
2691 mutex_enter(&un
->un_owner_mx
);
2692 un
->un_mirror_owner
= 0;
2693 mutex_exit(&un
->un_owner_mx
);
2696 un
->c
.un_status
&= ~(MD_UN_RESYNC_CANCEL
|
2697 MD_UN_RESYNC_ACTIVE
);
2698 ps
= un
->un_rs_prev_overlap
;
2700 /* Remove previous overlap resync region */
2701 if (ps
->ps_flags
& MD_MPS_ON_OVERLAP
)
2702 mirror_overlap_tree_remove(ps
);
2704 * Release the overlap range reference
2706 un
->un_rs_prev_overlap
= NULL
;
2707 kmem_cache_free(mirror_parent_cache
,
2713 * Release resync message buffer. This will be reallocated on
2714 * the next invocation of the resync_unit thread.
2716 if (un
->un_rs_msg
) {
2717 kmem_free(un
->un_rs_msg
, sizeof (md_mn_msg_resync_t
));
2718 un
->un_rs_msg
= NULL
;
2721 /* For non-MN sets deal with any pending grows */
2722 un
->c
.un_status
&= ~MD_UN_RESYNC_ACTIVE
;
2723 if (un
->c
.un_status
& MD_UN_GROW_PENDING
) {
2724 if ((mirror_grow_unit(un
, &mde
) != 0) ||
2725 (! mdismderror(&mde
, MDE_GROW_DELAYED
))) {
2726 un
->c
.un_status
&= ~MD_UN_GROW_PENDING
;
2731 reset_comp_flags(un
);
2732 un
->un_resync_completed
= 0;
2733 mirror_commit(un
, NO_SUBMIRRORS
, 0);
2734 md_unit_writerexit(ui
);
2737 * Stop the resync progress thread.
2739 if (un
->un_rs_resync_to_id
!= 0) {
2740 (void) untimeout(un
->un_rs_resync_to_id
);
2741 un
->un_rs_resync_to_id
= 0;
2745 * Calling mirror_internal_close() makes further reference to un / ui
2746 * dangerous. If we are the only consumer of the mirror it is possible
2747 * for a metaclear to be processed after completion of the m_i_c()
2748 * routine. As we need to handle the case where another resync has been
2749 * scheduled for the mirror, we raise the open count on the device
2750 * which protects against the close / metaclear / lock => panic scenario
2752 (void) md_unit_incopen(MD_SID(un
), FREAD
|FWRITE
, OTYP_LYR
);
2753 (void) mirror_internal_close(MD_SID(un
), OTYP_LYR
, 0, (IOLOCK
*)NULL
);
2756 * deccrement the mirror resync count
2758 mutex_enter(&md_cpr_resync
.md_resync_mutex
);
2759 md_cpr_resync
.md_mirror_resync
--;
2760 mutex_exit(&md_cpr_resync
.md_resync_mutex
);
2763 * Remove the thread reference as we're about to exit. This allows a
2764 * subsequent mirror_resync_unit() to start a new thread.
2765 * If RESYNC_ACTIVE is set, mirror_resync_unit() must have been
2766 * called to start a new resync, so reopen the mirror and go back to
2769 (void) md_unit_writerlock(ui
);
2770 mutex_enter(&un
->un_rs_thread_mx
);
2771 un
->un_rs_thread_flags
&= ~(MD_RI_KILL
|MD_RI_SHUTDOWN
);
2772 mutex_exit(&un
->un_rs_thread_mx
);
2773 if (un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
) {
2774 md_unit_writerexit(ui
);
2775 if (mirror_internal_open(MD_SID(un
), (FREAD
|FWRITE
),
2776 OTYP_LYR
, 0, (IOLOCK
*)NULL
) == 0) {
2777 /* Release the reference grabbed above */
2778 (void) mirror_internal_close(MD_SID(un
), OTYP_LYR
, 0,
2780 goto resync_restart
;
2782 (void) md_unit_writerlock(ui
);
2784 "Could not open metadevice (%x) for resync\n",
2787 un
->un_rs_thread
= NULL
;
2788 md_unit_writerexit(ui
);
2791 * Check for hotspares once we've cleared the resync thread reference.
2792 * If there are any errored units a poke_hotspares() will result in
2793 * a call to mirror_resync_unit() which we need to allow to start.
2795 (void) poke_hotspares();
2798 * Remove this thread from the CPR callback table.
2801 mutex_enter(&un
->un_rs_cpr_mx
);
2802 CALLB_CPR_EXIT(&un
->un_rs_cprinfo
);
2806 * Remove the extra reference to the unit we generated above. After
2807 * this call it is *unsafe* to reference either ui or un as they may
2808 * no longer be allocated.
2810 (void) mirror_internal_close(MD_SID(un
), OTYP_LYR
, 0, (IOLOCK
*)NULL
);
2816 * mirror_resync_unit:
2817 * ------------------
2818 * Start a resync for the given mirror metadevice. Save the resync thread ID in
2819 * un->un_rs_thread for later manipulation.
2829 md_resync_ioctl_t
*ri
,
2836 set_t setno
= MD_MIN2SET(mnum
);
2838 ui
= MDI_UNIT(mnum
);
2840 if (md_get_setstatus(setno
) & MD_SET_STALE
)
2841 return (mdmddberror(ep
, MDE_DB_STALE
, mnum
, setno
));
2843 if (mirror_internal_open(mnum
, (FREAD
|FWRITE
), OTYP_LYR
, 0, lockp
)) {
2844 return (mdmderror(ep
, MDE_MIRROR_OPEN_FAILURE
, mnum
));
2847 un
= (mm_unit_t
*)md_ioctl_writerlock(lockp
, ui
);
2849 un
= (mm_unit_t
*)md_unit_writerlock(ui
);
2853 * Check to see if we're attempting to start a resync while one is
2856 if (un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
||
2857 un
->un_rs_thread
!= NULL
) {
2859 * Ensure RESYNC_ACTIVE set, it may not be if the resync thread
2860 * is in the process of terminating, setting the flag will
2861 * cause the resync thread to return to the beginning
2863 un
->c
.un_status
|= MD_UN_RESYNC_ACTIVE
;
2865 md_ioctl_writerexit(lockp
);
2867 md_unit_writerexit(ui
);
2869 (void) mirror_internal_close(mnum
, OTYP_LYR
, 0, lockp
);
2872 un
->c
.un_status
|= MD_UN_RESYNC_ACTIVE
;
2873 un
->c
.un_status
&= ~MD_UN_RESYNC_CANCEL
;
2874 if ((ri
) && (ri
->ri_copysize
> 0) &&
2875 (ri
->ri_copysize
<= md_max_xfer_bufsz
))
2876 un
->un_rs_copysize
= ri
->ri_copysize
;
2878 un
->un_rs_copysize
= 0;
2880 /* Start the resync progress thread off */
2881 un
->un_rs_progress_flags
= 0;
2882 (void) thread_create(NULL
, 0, resync_progress_thread
,
2883 (caddr_t
)(uintptr_t)mnum
, 0, &p0
, TS_RUN
, minclsyspri
);
2886 * We have to store the thread ID in the unit structure so do not
2887 * drop writerlock until the thread is active. This means resync_unit
2888 * may spin on its first md_unit_readerlock(), but deadlock won't occur.
2890 mutex_enter(&un
->un_rs_thread_mx
);
2891 un
->un_rs_thread_flags
&= ~(MD_RI_KILL
|MD_RI_SHUTDOWN
);
2892 mutex_exit(&un
->un_rs_thread_mx
);
2893 un
->un_rs_thread
= thread_create(NULL
, 0, resync_unit
,
2894 (caddr_t
)(uintptr_t)mnum
, 0, &p0
, TS_RUN
, 60);
2895 if (un
->un_rs_thread
== (kthread_id_t
)NULL
) {
2896 un
->c
.un_status
&= ~MD_UN_RESYNC_ACTIVE
;
2898 md_ioctl_writerexit(lockp
);
2900 md_unit_writerexit(ui
);
2902 (void) mirror_internal_close(mnum
, OTYP_LYR
, 0, lockp
);
2903 return (mdmderror(ep
, MDE_MIRROR_THREAD_FAILURE
, mnum
));
2906 md_ioctl_writerexit(lockp
);
2908 md_unit_writerexit(ui
);
2916 * mirror_ioctl_resync:
2917 * -------------------
2918 * Called as a result of an MD_IOCSETSYNC ioctl. Either start, block, unblock
2919 * or kill the resync thread associated with the specified unit.
2920 * Can return with locks held since mdioctl will free any locks
2921 * that are marked in lock->l_flags.
2928 mirror_ioctl_resync(
2929 md_resync_ioctl_t
*ri
,
2933 minor_t mnum
= ri
->ri_mnum
;
2937 mm_submirror_ic_t
*smic
;
2940 set_t setno
= MD_MIN2SET(mnum
);
2942 mdclrerror(&ri
->mde
);
2944 if ((setno
>= md_nsets
) ||
2945 (MD_MIN2UNIT(mnum
) >= md_nunits
)) {
2946 return (mdmderror(&ri
->mde
, MDE_INVAL_UNIT
, mnum
));
2949 /* RD_LOCK flag grabs the md_ioctl_readerlock */
2950 un
= mirror_getun(mnum
, &ri
->mde
, RD_LOCK
, lock
);
2953 return (mdmderror(&ri
->mde
, MDE_UNIT_NOT_SETUP
, mnum
));
2955 if (un
->c
.un_type
!= MD_METAMIRROR
) {
2956 return (mdmderror(&ri
->mde
, MDE_NOT_MM
, mnum
));
2958 if (un
->un_nsm
< 2) {
2963 * Determine the action to take based on the ri_flags field:
2964 * MD_RI_BLOCK: Block current resync thread
2965 * MD_RI_UNBLOCK: Unblock resync thread
2966 * MD_RI_KILL: Abort resync thread
2967 * MD_RI_RESYNC_FORCE_MNSTART: Directly start resync thread
2968 * without using rpc.mdcommd messages.
2969 * any other: Start resync thread
2971 switch (ri
->ri_flags
& (MD_RI_BLOCK
|MD_RI_UNBLOCK
|MD_RI_KILL
)) {
2974 /* Halt resync thread by setting flag in un_rs_flags */
2975 if (!(un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
)) {
2978 mutex_enter(&un
->un_rs_thread_mx
);
2979 un
->un_rs_thread_flags
|= MD_RI_BLOCK
;
2980 mutex_exit(&un
->un_rs_thread_mx
);
2985 * Restart resync thread by clearing flag in un_rs_flags and
2986 * cv_signal'ing the blocked thread.
2988 if (!(un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
)) {
2991 mutex_enter(&un
->un_rs_thread_mx
);
2992 un
->un_rs_thread_flags
&= ~MD_RI_BLOCK
;
2993 cv_signal(&un
->un_rs_thread_cv
);
2994 mutex_exit(&un
->un_rs_thread_mx
);
2998 /* Abort resync thread. */
2999 if (!(un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
)) {
3002 mutex_enter(&un
->un_rs_thread_mx
);
3003 tid
= un
->un_rs_thread
? (un
->un_rs_thread
)->t_did
: 0;
3004 un
->un_rs_thread_flags
&= ~(MD_RI_BLOCK
|MD_RI_BLOCK_OWNER
);
3005 un
->un_rs_thread_flags
|= MD_RI_KILL
;
3006 cv_signal(&un
->un_rs_thread_cv
);
3007 mutex_exit(&un
->un_rs_thread_mx
);
3009 if (!(ri
->ri_flags
& MD_RI_NO_WAIT
)) {
3010 md_ioctl_readerexit(lock
);
3012 un
->un_rs_thread_flags
&= ~MD_RI_KILL
;
3013 un
->un_rs_thread
= NULL
;
3014 cmn_err(CE_WARN
, "md: %s: Resync cancelled\n",
3015 md_shortname(MD_SID(un
)));
3021 md_ioctl_readerexit(lock
);
3024 for (smi
= 0; smi
< NMIRROR
; smi
++) {
3025 sm
= &un
->un_sm
[smi
];
3026 smic
= &un
->un_smic
[smi
];
3027 if (!SMS_IS(sm
, SMS_ATTACHED
))
3029 mirror_set_sm_state(sm
, smic
, SMS_ATTACHED_RESYNC
, 1);
3030 bits
|= SMI2BIT(smi
);
3033 mirror_commit(un
, bits
, 0);
3036 * If we are resyncing a mirror in a MN set and the rpc.mdcommd
3037 * can be used, we do not start the resync at this point.
3038 * Instead, the metasync command that issued the ioctl
3039 * will send a RESYNC_STARTING message to start the resync thread. The
3040 * reason we do it this way is to ensure that the metasync ioctl is
3041 * executed on all nodes before the resync thread is started.
3043 * If a MN set and the MD_RI_RESYNC_FORCE_MNSTART flag is set, then
3044 * don't use rpc.mdcommd, but just start the resync thread. This
3045 * flag is set on a node when it is being added to a diskset
3046 * so that the resync threads are started on the newly added node.
3048 if ((!(MD_MNSET_SETNO(setno
))) ||
3049 (ri
->ri_flags
& MD_RI_RESYNC_FORCE_MNSTART
)) {
3050 return (mirror_resync_unit(mnum
, ri
, &ri
->mde
, lock
));
3057 mirror_mark_resync_region_non_owner(struct mm_unit
*un
,
3058 diskaddr_t startblk
, diskaddr_t endblk
, md_mn_nodeid_t source_node
)
3064 md_mn_msg_rr_dirty_t
*rr
;
3065 md_mn_kresult_t
*kres
;
3066 set_t setno
= MD_UN2SET(un
);
3068 md_mn_nodeid_t node_idx
= source_node
- 1;
3069 mdi_unit_t
*ui
= MDI_UNIT(MD_SID(un
));
3070 md_mn_nodeid_t owner_node
;
3071 minor_t mnum
= MD_SID(un
);
3077 * Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
3078 * not, allocate it and then fill the [start..end] entries.
3079 * Update un_pernode_dirty_sum if we've gone 0->1.
3080 * Update un_dirty_bm if the corresponding entries are clear.
3082 rw_enter(&un
->un_pernode_dirty_mx
[node_idx
], RW_WRITER
);
3083 if (un
->un_pernode_dirty_bm
[node_idx
] == NULL
) {
3084 un
->un_pernode_dirty_bm
[node_idx
] =
3085 (uchar_t
*)kmem_zalloc(
3086 (uint_t
)howmany(un
->un_rrd_num
, NBBY
), KM_SLEEP
);
3088 rw_exit(&un
->un_pernode_dirty_mx
[node_idx
]);
3090 BLK_TO_RR(end_rr
, endblk
, un
);
3091 BLK_TO_RR(start_rr
, startblk
, un
);
3095 mutex_enter(&un
->un_resync_mx
);
3096 rw_enter(&un
->un_pernode_dirty_mx
[node_idx
], RW_READER
);
3097 for (current_rr
= start_rr
; current_rr
<= end_rr
; current_rr
++) {
3098 un
->un_outstanding_writes
[current_rr
]++;
3099 if (!IS_PERNODE_DIRTY(source_node
, current_rr
, un
)) {
3100 un
->un_pernode_dirty_sum
[current_rr
]++;
3101 SET_PERNODE_DIRTY(source_node
, current_rr
, un
);
3103 CLR_GOING_CLEAN(current_rr
, un
);
3104 if (!IS_REGION_DIRTY(current_rr
, un
)) {
3106 SET_REGION_DIRTY(current_rr
, un
);
3107 SET_GOING_DIRTY(current_rr
, un
);
3108 } else if (IS_GOING_DIRTY(current_rr
, un
))
3111 rw_exit(&un
->un_pernode_dirty_mx
[node_idx
]);
3112 mutex_exit(&un
->un_resync_mx
);
3119 * If we have dirty regions to commit, send a
3120 * message to the owning node so that the
3121 * in-core bitmap gets updated appropriately.
3122 * TODO: make this a kmem_cache pool to improve
3123 * alloc/free performance ???
3125 kres
= (md_mn_kresult_t
*)kmem_zalloc(sizeof (md_mn_kresult_t
),
3127 rr
= (md_mn_msg_rr_dirty_t
*)kmem_alloc(sizeof (md_mn_msg_rr_dirty_t
),
3131 owner_node
= un
->un_mirror_owner
;
3134 rr
->rr_nodeid
= md_mn_mynode_id
;
3135 rr
->rr_range
= (ushort_t
)start_rr
<< 16;
3136 rr
->rr_range
|= (ushort_t
)end_rr
& 0xFFFF;
3138 /* release readerlock before sending message */
3139 md_unit_readerexit(ui
);
3141 rval
= mdmn_ksend_message(setno
, MD_MN_MSG_RR_DIRTY
,
3142 MD_MSGF_NO_LOG
|MD_MSGF_BLK_SIGNAL
|MD_MSGF_DIRECTED
,
3143 un
->un_mirror_owner
, (char *)rr
,
3144 sizeof (md_mn_msg_rr_dirty_t
), kres
);
3146 /* reaquire readerlock on message completion */
3147 (void) md_unit_readerlock(ui
);
3149 /* if the message send failed, note it, and pass an error back up */
3150 if (!MDMN_KSEND_MSG_OK(rval
, kres
)) {
3151 /* if commd is gone, no point in printing a message */
3152 if (md_mn_is_commd_present())
3153 mdmn_ksend_show_error(rval
, kres
, "RR_DIRTY");
3154 kmem_free(kres
, sizeof (md_mn_kresult_t
));
3155 kmem_free(rr
, sizeof (md_mn_msg_rr_dirty_t
));
3160 * if the owner changed while we were sending the message, and it's
3161 * not us, the new mirror owner won't yet have done the right thing
3162 * with our data. Let him know. If we became the owner, we'll
3163 * deal with that differently below. Note that receiving a message
3164 * about another node twice won't hurt anything.
3166 if (un
->un_mirror_owner
!= owner_node
&& !MD_MN_MIRROR_OWNER(un
))
3169 kmem_free(kres
, sizeof (md_mn_kresult_t
));
3170 kmem_free(rr
, sizeof (md_mn_msg_rr_dirty_t
));
3172 mutex_enter(&un
->un_resync_mx
);
3175 * If we became the owner changed while we were sending the message,
3176 * we have dirty bits in the un_pernode_bm that aren't yet reflected
3177 * in the un_dirty_bm, as it was re-read from disk, and our bits
3178 * are also not reflected in the on-disk DRL. Fix that now.
3180 if (MD_MN_MIRROR_OWNER(un
)) {
3181 rw_enter(&un
->un_pernode_dirty_mx
[node_idx
], RW_WRITER
);
3182 mirror_copy_rr(howmany(un
->un_rrd_num
, NBBY
),
3183 un
->un_pernode_dirty_bm
[node_idx
], un
->un_dirty_bm
);
3184 rw_exit(&un
->un_pernode_dirty_mx
[node_idx
]);
3186 un
->un_resync_flg
|= MM_RF_COMMITING
| MM_RF_GATECLOSED
;
3188 mutex_exit(&un
->un_resync_mx
);
3189 mddb_commitrec_wrapper(un
->un_rr_dirty_recid
);
3190 mutex_enter(&un
->un_resync_mx
);
3192 un
->un_resync_flg
&= ~(MM_RF_COMMITING
| MM_RF_GATECLOSED
);
3193 cv_broadcast(&un
->un_resync_cv
);
3196 for (current_rr
= start_rr
; current_rr
<= end_rr
; current_rr
++)
3197 CLR_GOING_DIRTY(current_rr
, un
);
3199 mutex_exit(&un
->un_resync_mx
);
3205 mirror_mark_resync_region_owner(struct mm_unit
*un
,
3206 diskaddr_t startblk
, diskaddr_t endblk
, md_mn_nodeid_t source_node
)
3212 int mnset
= MD_MNSET_SETNO(MD_UN2SET(un
));
3213 md_mn_nodeid_t node_idx
= source_node
- 1;
3219 * Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
3220 * not, allocate it and then fill the [start..end] entries.
3221 * Update un_pernode_dirty_sum if we've gone 0->1.
3222 * Update un_dirty_bm if the corresponding entries are clear.
3225 rw_enter(&un
->un_pernode_dirty_mx
[node_idx
], RW_WRITER
);
3226 if (un
->un_pernode_dirty_bm
[node_idx
] == NULL
) {
3227 un
->un_pernode_dirty_bm
[node_idx
] =
3228 (uchar_t
*)kmem_zalloc(
3229 (uint_t
)howmany(un
->un_rrd_num
, NBBY
), KM_SLEEP
);
3231 rw_exit(&un
->un_pernode_dirty_mx
[node_idx
]);
3234 mutex_enter(&un
->un_resync_mx
);
3237 rw_enter(&un
->un_pernode_dirty_mx
[node_idx
], RW_READER
);
3240 BLK_TO_RR(end_rr
, endblk
, un
);
3241 BLK_TO_RR(start_rr
, startblk
, un
);
3242 for (current_rr
= start_rr
; current_rr
<= end_rr
; current_rr
++) {
3243 if (!mnset
|| source_node
== md_mn_mynode_id
)
3244 un
->un_outstanding_writes
[current_rr
]++;
3246 if (!IS_PERNODE_DIRTY(source_node
, current_rr
, un
))
3247 un
->un_pernode_dirty_sum
[current_rr
]++;
3248 SET_PERNODE_DIRTY(source_node
, current_rr
, un
);
3250 CLR_GOING_CLEAN(current_rr
, un
);
3251 if (!IS_REGION_DIRTY(current_rr
, un
))
3253 if (IS_GOING_DIRTY(current_rr
, un
))
3258 rw_exit(&un
->un_pernode_dirty_mx
[node_idx
]);
3261 mutex_exit(&un
->un_resync_mx
);
3264 un
->un_waiting_to_mark
++;
3265 while (un
->un_resync_flg
& MM_RF_GATECLOSED
) {
3268 cv_wait(&un
->un_resync_cv
, &un
->un_resync_mx
);
3270 un
->un_waiting_to_mark
--;
3273 for (current_rr
= start_rr
; current_rr
<= end_rr
; current_rr
++) {
3274 if (!IS_REGION_DIRTY(current_rr
, un
)) {
3275 SET_REGION_DIRTY(current_rr
, un
);
3276 SET_GOING_DIRTY(current_rr
, un
);
3279 if (IS_GOING_DIRTY(current_rr
, un
))
3284 if (un
->un_waiting_to_mark
== 0 || un
->un_waiting_to_clear
!= 0)
3285 cv_broadcast(&un
->un_resync_cv
);
3286 mutex_exit(&un
->un_resync_mx
);
3290 un
->un_resync_flg
|= MM_RF_COMMIT_NEEDED
;
3291 un
->un_waiting_to_commit
++;
3292 while (un
->un_waiting_to_mark
!= 0 &&
3293 !(un
->un_resync_flg
& MM_RF_GATECLOSED
)) {
3296 cv_wait(&un
->un_resync_cv
, &un
->un_resync_mx
);
3299 if (un
->un_resync_flg
& MM_RF_COMMIT_NEEDED
) {
3300 un
->un_resync_flg
|= MM_RF_COMMITING
| MM_RF_GATECLOSED
;
3301 un
->un_resync_flg
&= ~MM_RF_COMMIT_NEEDED
;
3303 mutex_exit(&un
->un_resync_mx
);
3304 mddb_commitrec_wrapper(un
->un_rr_dirty_recid
);
3305 mutex_enter(&un
->un_resync_mx
);
3307 un
->un_resync_flg
&= ~MM_RF_COMMITING
;
3308 cv_broadcast(&un
->un_resync_cv
);
3310 while (un
->un_resync_flg
& MM_RF_COMMITING
) {
3313 cv_wait(&un
->un_resync_cv
, &un
->un_resync_mx
);
3316 for (current_rr
= start_rr
; current_rr
<= end_rr
; current_rr
++)
3317 CLR_GOING_DIRTY(current_rr
, un
);
3319 if (--un
->un_waiting_to_commit
== 0) {
3320 un
->un_resync_flg
&= ~MM_RF_GATECLOSED
;
3321 cv_broadcast(&un
->un_resync_cv
);
3323 mutex_exit(&un
->un_resync_mx
);
3329 mirror_mark_resync_region(struct mm_unit
*un
,
3330 diskaddr_t startblk
, diskaddr_t endblk
, md_mn_nodeid_t source_node
)
3332 int mnset
= MD_MNSET_SETNO(MD_UN2SET(un
));
3334 if (mnset
&& !MD_MN_MIRROR_OWNER(un
)) {
3335 return (mirror_mark_resync_region_non_owner(un
, startblk
,
3336 endblk
, source_node
));
3338 return (mirror_mark_resync_region_owner(un
, startblk
, endblk
,
3344 mirror_resize_resync_regions(mm_unit_t
*un
, diskaddr_t new_tb
)
3347 optim_resync_t
*orp
;
3349 uint_t old_nregions
, new_nregions
;
3350 int old_bm_size
, new_bm_size
;
3352 mddb_recid_t recid
, old_recid
;
3353 uchar_t
*old_dirty_bm
;
3356 set_t setno
= MD_UN2SET(un
);
3359 old_nregions
= un
->un_rrd_num
;
3360 new_nregions
= (uint_t
)((new_tb
/un
->un_rrd_blksize
) + 1);
3362 while (new_nregions
> MD_MAX_NUM_RR
) {
3367 new_bm_size
= howmany(new_nregions
, NBBY
);
3368 old_bm_size
= howmany(old_nregions
, NBBY
);
3370 size
= new_bm_size
+ sizeof (*orp
) - sizeof (orp
->or_rr
);
3372 typ1
= (mddb_type_t
)md_getshared_key(setno
,
3373 mirror_md_ops
.md_driver
.md_drivername
);
3374 recid
= mddb_createrec(size
, typ1
, RESYNC_REC
,
3375 MD_CRO_OPTIMIZE
|MD_CRO_32BIT
, setno
);
3379 orp
= (struct optim_resync
*)mddb_getrecaddr(recid
);
3380 ASSERT(orp
!= NULL
);
3382 orp
->or_magic
= OR_MAGIC
; /* Magic # */
3383 orp
->or_blksize
= un
->un_rrd_blksize
; /* Same block size */
3384 orp
->or_num
= new_nregions
; /* New number of regions */
3386 old_dirty_bm
= un
->un_dirty_bm
;
3387 un
->un_dirty_bm
= orp
->or_rr
;
3389 kmem_free((caddr_t
)un
->un_goingdirty_bm
, old_bm_size
);
3390 un
->un_goingdirty_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3392 kmem_free((caddr_t
)un
->un_goingclean_bm
, old_bm_size
);
3393 un
->un_goingclean_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3395 kmem_free((caddr_t
)un
->un_resync_bm
, old_bm_size
);
3396 un
->un_resync_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3398 owp
= un
->un_outstanding_writes
;
3399 un
->un_outstanding_writes
= (short *)kmem_zalloc(
3400 new_nregions
* sizeof (short), KM_SLEEP
);
3402 old_pns
= un
->un_pernode_dirty_sum
;
3404 un
->un_pernode_dirty_sum
= (uchar_t
*)kmem_zalloc(new_nregions
,
3408 * Now translate the old records into the new
3411 for (i
= 0; i
< old_nregions
; i
++) {
3413 * only bring forward the
3414 * outstanding write counters and the dirty bits and also
3415 * the pernode_summary counts
3417 if (!isset(old_dirty_bm
, i
))
3420 setbit(un
->un_dirty_bm
, (i
/ rr_mult
));
3421 un
->un_outstanding_writes
[(i
/ rr_mult
)] += owp
[i
];
3423 un
->un_pernode_dirty_sum
[(i
/ rr_mult
)] += old_pns
[i
];
3425 kmem_free((caddr_t
)owp
, old_nregions
* sizeof (short));
3427 kmem_free((caddr_t
)old_pns
, old_nregions
);
3430 * Copy all non-zero un_pernode_dirty_bm[] arrays to new versions
3432 for (j
= 0; j
< MD_MNMAXSIDES
; j
++) {
3433 rw_enter(&un
->un_pernode_dirty_mx
[j
], RW_WRITER
);
3434 old_dirty_bm
= un
->un_pernode_dirty_bm
[j
];
3436 un
->un_pernode_dirty_bm
[j
] = (uchar_t
*)kmem_zalloc(
3437 new_bm_size
, KM_SLEEP
);
3438 for (i
= 0; i
< old_nregions
; i
++) {
3439 if (!isset(old_dirty_bm
, i
))
3442 setbit(un
->un_pernode_dirty_bm
[j
],
3445 kmem_free((caddr_t
)old_dirty_bm
, old_bm_size
);
3447 rw_exit(&un
->un_pernode_dirty_mx
[j
]);
3450 /* Save the old record id */
3451 old_recid
= un
->un_rr_dirty_recid
;
3453 /* Update the mirror unit struct */
3454 un
->un_rr_dirty_recid
= recid
;
3455 un
->un_rrd_num
= new_nregions
;
3456 un
->un_rrd_blksize
= un
->un_rrd_blksize
* rr_mult
;
3458 orp
->or_blksize
= un
->un_rrd_blksize
;
3461 * NOTE: The reason there are distinct calls to mddb_commitrec_wrapper
3462 * instead of using mddb_commitrecs_wrapper, is that you cannot
3463 * atomically commit optimized records.
3465 mddb_commitrec_wrapper(recid
);
3466 mddb_commitrec_wrapper(un
->c
.un_record_id
);
3467 mddb_deleterec_wrapper(old_recid
);
/* lockp can be NULL for !MN disksets */
3473 mirror_add_resync_regions(mm_unit_t
*un
, diskaddr_t new_tb
)
3477 optim_resync_t
*orp
;
3478 uint_t old_nregions
, new_nregions
;
3479 int old_bm_size
, new_bm_size
;
3481 mddb_recid_t recid
, old_recid
;
3483 set_t setno
= MD_UN2SET(un
);
3486 old_nregions
= un
->un_rrd_num
;
3487 new_nregions
= (uint_t
)((new_tb
/un
->un_rrd_blksize
) + 1);
3489 new_bm_size
= howmany(new_nregions
, NBBY
);
3490 old_bm_size
= howmany(old_nregions
, NBBY
);
3492 size
= new_bm_size
+ sizeof (*orp
) - sizeof (orp
->or_rr
);
3494 typ1
= (mddb_type_t
)md_getshared_key(setno
,
3495 mirror_md_ops
.md_driver
.md_drivername
);
3497 recid
= mddb_createrec(size
, typ1
, RESYNC_REC
,
3498 MD_CRO_OPTIMIZE
|MD_CRO_32BIT
, setno
);
3502 orp
= (struct optim_resync
*)mddb_getrecaddr(recid
);
3503 ASSERT(orp
!= NULL
);
3505 orp
->or_magic
= OR_MAGIC
; /* Magic # */
3506 orp
->or_blksize
= un
->un_rrd_blksize
; /* Same block size */
3507 orp
->or_num
= new_nregions
; /* New number of regions */
3509 /* Copy the old bm over the new bm */
3510 bcopy((caddr_t
)un
->un_dirty_bm
, (caddr_t
)orp
->or_rr
, old_bm_size
);
3513 * Create new bigger incore arrays, copy, and free old ones:
3517 * un_outstanding_writes
3518 * un_pernode_dirty_sum
3519 * un_pernode_dirty_bm[]
3521 old
= un
->un_goingdirty_bm
;
3522 un
->un_goingdirty_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3523 bcopy((caddr_t
)old
, (caddr_t
)un
->un_goingdirty_bm
, old_bm_size
);
3524 kmem_free((caddr_t
)old
, old_bm_size
);
3526 old
= un
->un_goingclean_bm
;
3527 un
->un_goingclean_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3528 bcopy((caddr_t
)old
, (caddr_t
)un
->un_goingclean_bm
, old_bm_size
);
3529 kmem_free((caddr_t
)old
, old_bm_size
);
3531 old
= un
->un_resync_bm
;
3532 un
->un_resync_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3533 bcopy((caddr_t
)old
, (caddr_t
)un
->un_resync_bm
, old_bm_size
);
3534 kmem_free((caddr_t
)old
, old_bm_size
);
3536 owp
= un
->un_outstanding_writes
;
3537 un
->un_outstanding_writes
= (short *)kmem_zalloc(
3538 (uint_t
)new_nregions
* sizeof (short), KM_SLEEP
);
3539 bcopy((caddr_t
)owp
, (caddr_t
)un
->un_outstanding_writes
,
3540 old_nregions
* sizeof (short));
3541 kmem_free((caddr_t
)owp
, (old_nregions
* sizeof (short)));
3543 old
= un
->un_pernode_dirty_sum
;
3545 un
->un_pernode_dirty_sum
= (uchar_t
*)kmem_zalloc(
3546 new_nregions
, KM_SLEEP
);
3547 bcopy((caddr_t
)old
, (caddr_t
)un
->un_pernode_dirty_sum
,
3549 kmem_free((caddr_t
)old
, old_nregions
);
3552 for (i
= 0; i
< MD_MNMAXSIDES
; i
++) {
3553 rw_enter(&un
->un_pernode_dirty_mx
[i
], RW_WRITER
);
3554 old
= un
->un_pernode_dirty_bm
[i
];
3556 un
->un_pernode_dirty_bm
[i
] = (uchar_t
*)kmem_zalloc(
3557 new_bm_size
, KM_SLEEP
);
3558 bcopy((caddr_t
)old
, (caddr_t
)un
->un_pernode_dirty_bm
[i
],
3560 kmem_free((caddr_t
)old
, old_bm_size
);
3562 rw_exit(&un
->un_pernode_dirty_mx
[i
]);
3565 /* Save the old record id */
3566 old_recid
= un
->un_rr_dirty_recid
;
3568 /* Update the mirror unit struct */
3569 un
->un_rr_dirty_recid
= recid
;
3570 un
->un_rrd_num
= new_nregions
;
3571 un
->un_dirty_bm
= orp
->or_rr
;
3574 * NOTE: The reason there are distinct calls to mddb_commitrec_wrapper
3575 * instead of using mddb_commitrecs_wrapper, is that you cannot
3576 * atomically commit optimized records.
3578 mddb_commitrec_wrapper(recid
);
3579 mddb_commitrec_wrapper(un
->c
.un_record_id
);
3580 mddb_deleterec_wrapper(old_recid
);
3587 * Combine the dirty record bitmap with the in-core resync bitmap. This allows
3588 * us to carry a resync over an ownership change.
3591 mirror_copy_rr(int sz
, uchar_t
*src
, uchar_t
*dest
)
3595 for (i
= 0; i
< sz
; i
++)
3600 * mirror_set_dirty_rr:
3601 * -------------------
3602 * Set the pernode_dirty_bm[node] entries and un_dirty_bm[] if appropriate.
3603 * For the owning node (DRL/mirror owner) update the on-disk RR if needed.
3604 * Called on every clean->dirty transition for the originating writer node.
3605 * Note: only the non-owning nodes will initiate this message and it is only
3606 * the owning node that has to process it.
3609 mirror_set_dirty_rr(md_mn_rr_dirty_params_t
*iocp
)
3612 minor_t mnum
= iocp
->rr_mnum
;
3614 int start
= (int)iocp
->rr_start
;
3615 int end
= (int)iocp
->rr_end
;
3616 set_t setno
= MD_MIN2SET(mnum
);
3617 md_mn_nodeid_t orignode
= iocp
->rr_nodeid
; /* 1-based */
3618 diskaddr_t startblk
, endblk
;
3620 mdclrerror(&iocp
->mde
);
3622 if ((setno
>= md_nsets
) ||
3623 (MD_MIN2UNIT(mnum
) >= md_nunits
)) {
3624 return (mdmderror(&iocp
->mde
, MDE_INVAL_UNIT
, mnum
));
3627 /* Must have _NO_ ioctl lock set if we update the RR on-disk */
3628 un
= mirror_getun(mnum
, &iocp
->mde
, NO_LOCK
, NULL
);
3631 return (mdmderror(&iocp
->mde
, MDE_UNIT_NOT_SETUP
, mnum
));
3633 if (un
->c
.un_type
!= MD_METAMIRROR
) {
3634 return (mdmderror(&iocp
->mde
, MDE_NOT_MM
, mnum
));
3636 if (orignode
< 1 || orignode
>= MD_MNMAXSIDES
) {
3637 return (mdmderror(&iocp
->mde
, MDE_INVAL_UNIT
, mnum
));
3639 if (un
->un_nsm
< 2) {
3644 * Only process this message if we're the owner of the mirror.
3646 if (!MD_MN_MIRROR_OWNER(un
)) {
3650 RR_TO_BLK(startblk
, start
, un
);
3651 RR_TO_BLK(endblk
, end
, un
);
3652 return (mirror_mark_resync_region_owner(un
, startblk
, endblk
,
3657 * mirror_clean_rr_bits:
3658 * --------------------
3659 * Clear the pernode_dirty_bm[node] entries which are passed in the bitmap
3660 * Once _all_ references are removed (pernode_dirty_count[x] == 0) this region
3661 * is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all
3662 * nodes. Callable from ioctl / interrupt / whatever context.
3663 * un_resync_mx is held on entry.
3666 mirror_clean_rr_bits(
3667 md_mn_rr_clean_params_t
*iocp
)
3669 minor_t mnum
= iocp
->rr_mnum
;
3671 uint_t cleared_bits
;
3672 md_mn_nodeid_t node
= iocp
->rr_nodeid
- 1;
3673 md_mn_nodeid_t orignode
= iocp
->rr_nodeid
;
3676 un
= mirror_getun(mnum
, &iocp
->mde
, NO_LOCK
, NULL
);
3679 start
= MDMN_RR_CLEAN_PARAMS_START_BIT(iocp
);
3680 end
= start
+ MDMN_RR_CLEAN_PARAMS_DATA_BYTES(iocp
) * NBBY
;
3681 rw_enter(&un
->un_pernode_dirty_mx
[node
], RW_READER
);
3682 for (i
= start
; i
< end
; i
++) {
3683 if (isset(MDMN_RR_CLEAN_PARAMS_DATA(iocp
), i
- start
)) {
3684 if (IS_PERNODE_DIRTY(orignode
, i
, un
)) {
3685 un
->un_pernode_dirty_sum
[i
]--;
3686 CLR_PERNODE_DIRTY(orignode
, i
, un
);
3688 if (un
->un_pernode_dirty_sum
[i
] == 0) {
3690 CLR_REGION_DIRTY(i
, un
);
3691 CLR_GOING_CLEAN(i
, un
);
3695 rw_exit(&un
->un_pernode_dirty_mx
[node
]);
3698 * We can only be called iff we are the mirror owner, however
3699 * as this is a (potentially) decoupled routine the ownership
3700 * may have moved from us by the time we get to execute the
3701 * bit clearing. Hence we still need to check for being the
3702 * owner before flushing the DRL to the replica.
3704 if (MD_MN_MIRROR_OWNER(un
)) {
3705 mutex_exit(&un
->un_resync_mx
);
3706 mddb_commitrec_wrapper(un
->un_rr_dirty_recid
);
3707 mutex_enter(&un
->un_resync_mx
);
3715 * Service routine for clearing the DRL bits on a deferred MD_MN_RR_CLEAN call
3716 * We need to obtain exclusive access to the un_resync_cv and then clear the
3718 * On completion, we must also free the passed in argument as it is allocated
3719 * at the end of the ioctl handler and won't be freed on completion.
3722 mirror_drl_task(void *arg
)
3724 md_mn_rr_clean_params_t
*iocp
= (md_mn_rr_clean_params_t
*)arg
;
3725 minor_t mnum
= iocp
->rr_mnum
;
3728 un
= mirror_getun(mnum
, &iocp
->mde
, NO_LOCK
, NULL
);
3730 mutex_enter(&un
->un_rrp_inflight_mx
);
3731 mutex_enter(&un
->un_resync_mx
);
3732 un
->un_waiting_to_clear
++;
3733 while (un
->un_resync_flg
& MM_RF_STALL_CLEAN
)
3734 cv_wait(&un
->un_resync_cv
, &un
->un_resync_mx
);
3735 un
->un_waiting_to_clear
--;
3737 un
->un_resync_flg
|= MM_RF_GATECLOSED
;
3738 mirror_clean_rr_bits(iocp
);
3739 un
->un_resync_flg
&= ~MM_RF_GATECLOSED
;
3740 if (un
->un_waiting_to_mark
!= 0 || un
->un_waiting_to_clear
!= 0) {
3741 cv_broadcast(&un
->un_resync_cv
);
3743 mutex_exit(&un
->un_resync_mx
);
3744 mutex_exit(&un
->un_rrp_inflight_mx
);
3746 kmem_free((caddr_t
)iocp
, MDMN_RR_CLEAN_PARAMS_SIZE(iocp
));
3750 * mirror_set_clean_rr:
3751 * -------------------
3752 * Clear the pernode_dirty_bm[node] entries which are passed in the bitmap
3753 * Once _all_ references are removed (pernode_dirty_count[x] == 0) this region
3754 * is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all
3757 * Only the mirror-owner need process this message as it is the only RR updater.
3758 * Non-owner nodes issue this request, but as we have no point-to-point message
3759 * support we will receive the message on all nodes.
3762 mirror_set_clean_rr(md_mn_rr_clean_params_t
*iocp
)
3765 minor_t mnum
= iocp
->rr_mnum
;
3767 set_t setno
= MD_MIN2SET(mnum
);
3768 md_mn_nodeid_t node
= iocp
->rr_nodeid
- 1;
3770 md_mn_rr_clean_params_t
*newiocp
;
3773 mdclrerror(&iocp
->mde
);
3775 if ((setno
>= md_nsets
) ||
3776 (MD_MIN2UNIT(mnum
) >= md_nunits
)) {
3777 return (mdmderror(&iocp
->mde
, MDE_INVAL_UNIT
, mnum
));
3780 /* Must have _NO_ ioctl lock set if we update the RR on-disk */
3781 un
= mirror_getun(mnum
, &iocp
->mde
, NO_LOCK
, NULL
);
3784 return (mdmderror(&iocp
->mde
, MDE_UNIT_NOT_SETUP
, mnum
));
3786 if (un
->c
.un_type
!= MD_METAMIRROR
) {
3787 return (mdmderror(&iocp
->mde
, MDE_NOT_MM
, mnum
));
3789 if (un
->un_nsm
< 2) {
3794 * Check to see if we're the mirror owner. If not, there's nothing
3797 if (!MD_MN_MIRROR_OWNER(un
)) {
3802 * Process the to-be-cleaned bitmap. We need to update the pernode_dirty
3803 * bits and pernode_dirty_sum[n], and if, and only if, the sum goes 0
3804 * we can then mark the un_dirty_bm entry as GOINGCLEAN. Alternatively
3805 * we can just defer this cleaning until the next process_resync_regions
3808 rw_enter(&un
->un_pernode_dirty_mx
[node
], RW_WRITER
);
3809 if (un
->un_pernode_dirty_bm
[node
] == NULL
) {
3810 un
->un_pernode_dirty_bm
[node
] = (uchar_t
*)kmem_zalloc(
3811 un
->un_rrd_num
, KM_SLEEP
);
3813 rw_exit(&un
->un_pernode_dirty_mx
[node
]);
3816 * See if we can simply clear the un_dirty_bm[] entries. If we're not
3817 * the issuing node _and_ we aren't in the process of marking/clearing
3818 * the RR bitmaps, we can simply update the bits as needed.
3819 * If we're the owning node and _not_ the issuing node, we should also
3820 * sync the RR if we clear any bits in it.
3822 mutex_enter(&un
->un_resync_mx
);
3823 can_clear
= (un
->un_resync_flg
& MM_RF_STALL_CLEAN
) ? 0 : 1;
3825 un
->un_resync_flg
|= MM_RF_GATECLOSED
;
3826 mirror_clean_rr_bits(iocp
);
3827 un
->un_resync_flg
&= ~MM_RF_GATECLOSED
;
3828 if (un
->un_waiting_to_mark
!= 0 ||
3829 un
->un_waiting_to_clear
!= 0) {
3830 cv_broadcast(&un
->un_resync_cv
);
3833 mutex_exit(&un
->un_resync_mx
);
3836 * If we couldn't clear the bits, due to DRL update from m_m_r_r / p_r_r
3837 * we must schedule a blocking call to update the DRL on this node.
3838 * As we're invoked from an ioctl we are going to have the original data
3839 * disappear (kmem_free) once we return. So, copy the data into a new
3840 * structure and let the taskq routine release it on completion.
3843 size_t sz
= MDMN_RR_CLEAN_PARAMS_SIZE(iocp
);
3845 newiocp
= (md_mn_rr_clean_params_t
*)kmem_alloc(sz
, KM_SLEEP
);
3847 bcopy(iocp
, newiocp
, sz
);
3849 if (ddi_taskq_dispatch(un
->un_drl_task
, mirror_drl_task
,
3850 newiocp
, DDI_NOSLEEP
) != DDI_SUCCESS
) {
3851 kmem_free(newiocp
, sz
);
3852 rval
= ENOMEM
; /* probably starvation */