usr/src/uts/common/io/lvm/mirror/mirror_resync.c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
20 */
22 /*
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/conf.h>
30 #include <sys/file.h>
31 #include <sys/user.h>
32 #include <sys/uio.h>
33 #include <sys/t_lock.h>
34 #include <sys/buf.h>
35 #include <sys/dkio.h>
36 #include <sys/vtoc.h>
37 #include <sys/kmem.h>
38 #include <vm/page.h>
39 #include <sys/cmn_err.h>
40 #include <sys/sysmacros.h>
41 #include <sys/types.h>
42 #include <sys/mkdev.h>
43 #include <sys/stat.h>
44 #include <sys/open.h>
45 #include <sys/disp.h>
46 #include <sys/lvm/md_mirror.h>
47 #include <sys/modctl.h>
48 #include <sys/ddi.h>
49 #include <sys/sunddi.h>
50 #include <sys/debug.h>
51 #include <sys/callb.h>
53 #include <sys/sysevent/eventdefs.h>
54 #include <sys/sysevent/svm.h>
55 #include <sys/lvm/mdmn_commd.h>
57 extern int md_status;
58 extern kmutex_t md_status_mx;
59 extern kmutex_t md_mx;
61 extern unit_t md_nunits;
62 extern set_t md_nsets;
63 extern md_set_t md_set[];
64 extern major_t md_major;
66 extern md_ops_t mirror_md_ops;
67 extern kmem_cache_t *mirror_child_cache; /* mirror child memory pool */
68 extern mdq_anchor_t md_mto_daemon;
69 extern daemon_request_t mirror_timeout;
70 extern md_resync_t md_cpr_resync;
71 extern clock_t md_hz;
72 extern int md_mtioctl_cnt;
74 extern kmem_cache_t *mirror_parent_cache;
75 #ifdef DEBUG
76 extern int mirror_debug_flag;
77 #endif
79 /*
80 * Tunable resync thread timeout. This is used as the time interval for updating
81 * the resync progress to the mddb. This allows restartable resyncs to be
82 * continued across a system reboot.
83 * Default is to update the resync progress every 5 minutes.
84 */
85 int md_mirror_resync_update_intvl = MD_DEF_MIRROR_RESYNC_INTVL;
87 /*
88 * Settable mirror resync buffer size. Specified in 512 byte
89 * blocks. This is set to MD_DEF_RESYNC_BUF_SIZE by default.
90 */
91 int md_resync_bufsz = MD_DEF_RESYNC_BUF_SIZE;
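/*
 * For illustration only: tunables such as these are normally overridden
 * from /etc/system rather than edited here. Assuming the mirror module is
 * named md_mirror and the usual module:variable syntax applies, an
 * override might look like:
 *
 *	set md_mirror:md_resync_bufsz = 2048
 *	set md_mirror:md_mirror_resync_update_intvl = 10
 */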
93 /*
94 * Tunables for dirty region processing when
95 * closing down a mirror.
97 * Dirty region processing during close of a
98 * mirror is basically monitoring the state
99 * of the resync region bitmaps and the number
100 * of outstanding i/o's per submirror to
101 * determine that there are no more dirty
102 * regions left over.
104 * The approach taken is a retry logic over
105 * md_mirror_rr_cleans iterations to monitor
106 * the progress.
108 * There are two methods of polling the progress
109 * on dirty bitmap processing: busy-waits and
110 * non-busy-waits.
112 * Busy-waits are used at the beginning to
113 * determine the final state as quickly as
114 * possible; md_mirror_rr_polls defines the
115 * number of busy-waits.
117 * In case the number of busy-waits got exhausted
118 * with dirty regions left over, the retry logic
119 * switches over to non-busy-waits, thus giving
120 * relief to an obviously heavily loaded system.
121 * The timeout value is defined by the tunable
122 * md_mirror_rr_sleep_timo in seconds.
124 * The number of non-busy-waits is given by:
125 * md_mirror_rr_cleans - md_mirror_rr_polls.
127 * The values were found by testing on a
128 * 'typical' system and may require tuning
129 * to meet a specific customer's requirements.
130 */
132 int md_mirror_rr_cleans = 13;
133 int md_mirror_rr_polls = 3;
134 int md_mirror_rr_sleep_timo = 1;
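/*
 * Worked example with the defaults above: mirror_process_unit_resync()
 * makes up to md_mirror_rr_cleans (13) passes over the dirty-region
 * state. The first md_mirror_rr_polls (3) passes are busy-waits; the
 * remaining md_mirror_rr_cleans - md_mirror_rr_polls (10) passes each
 * sleep md_mirror_rr_sleep_timo (1) second, giving a heavily loaded
 * system roughly 10 further seconds before the close-down gives up.
 */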
136 /*
137 * The value is not #defined because it will be computed
138 * in the future.
139 */
140 int md_max_xfer_bufsz = 2048;
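/*
 * Assuming the same 512 byte block units as md_resync_bufsz above, the
 * default of 2048 would correspond to a 1 Mbyte maximum transfer.
 */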
142 /*
143 * mirror_generate_rr_bitmap:
144 * -------------------
145 * Generate a compressed bitmap md_mn_msg_rr_clean_t for the given clean
146 * bitmap associated with mirror 'un'
148 * Input:
149 * un - mirror unit to get bitmap data from
150 * *msgp - location to return newly allocated md_mn_msg_rr_clean_t
151 * *activep- location to return # of active i/os
153 * Returns:
154 * 1 => dirty bits cleared from un_dirty_bm and DRL flush required
155 * *msgp contains bitmap of to-be-cleared bits
156 * 0 => no bits cleared
157 *msgp == NULL
158 */
159 static int
160 mirror_generate_rr_bitmap(mm_unit_t *un, md_mn_msg_rr_clean_t **msgp,
161 int *activep)
162 {
163 unsigned int i, next_bit, data_bytes, start_bit;
164 int cleared_dirty = 0;
166 /* Skip any initial 0s. */
167 retry_dirty_scan:
168 if ((start_bit = un->un_rr_clean_start_bit) >= un->un_rrd_num)
169 un->un_rr_clean_start_bit = start_bit = 0;
172 * Handle case where NO bits are set in PERNODE_DIRTY but the
173 * un_dirty_bm[] map does have entries set (after a 1st resync)
175 for (; start_bit < un->un_rrd_num &&
176 !IS_PERNODE_DIRTY(md_mn_mynode_id, start_bit, un) &&
177 (un->un_pernode_dirty_sum[start_bit] != (uchar_t)0); start_bit++)
180 if (start_bit >= un->un_rrd_num) {
181 if (un->un_rr_clean_start_bit == 0) {
182 return (0);
183 } else {
184 un->un_rr_clean_start_bit = 0;
185 goto retry_dirty_scan;
189 /* how much to fit into this message */
190 data_bytes = MIN(howmany(un->un_rrd_num - start_bit, NBBY),
191 MDMN_MSG_RR_CLEAN_DATA_MAX_BYTES);
193 (*msgp) = kmem_zalloc(MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes),
194 KM_SLEEP);
196 (*msgp)->rr_nodeid = md_mn_mynode_id;
197 (*msgp)->rr_mnum = MD_SID(un);
198 MDMN_MSG_RR_CLEAN_START_SIZE_SET(*msgp, start_bit, data_bytes);
200 next_bit = MIN(start_bit + data_bytes * NBBY, un->un_rrd_num);
202 for (i = start_bit; i < next_bit; i++) {
203 if (un->c.un_status & MD_UN_KEEP_DIRTY && IS_KEEPDIRTY(i, un)) {
204 continue;
206 if (!IS_REGION_DIRTY(i, un)) {
207 continue;
209 if (un->un_outstanding_writes[i] != 0) {
210 (*activep)++;
211 continue;
215 * Handle the case where a resync has completed and we still
216 * have the un_dirty_bm[] entries marked as dirty (these are
217 * the most recent DRL re-read from the replica). They need
218 * to be cleared from our un_dirty_bm[] but they will not have
219 * corresponding un_pernode_dirty[] entries set unless (and
220 * until) further write()s have been issued to the area.
221 * This handles the case where only the un_dirty_bm[] entry is
222 * set. Without this we'd not clear this region until a local
223 * write is issued to the affected area.
225 if (IS_PERNODE_DIRTY(md_mn_mynode_id, i, un) ||
226 (un->un_pernode_dirty_sum[i] == (uchar_t)0)) {
227 if (!IS_GOING_CLEAN(i, un)) {
228 SET_GOING_CLEAN(i, un);
229 (*activep)++;
230 continue;
233 * Now we've got a flagged pernode_dirty, _or_ a clean
234 * bitmap entry to process. Update the bitmap to flush
235 * the REGION_DIRTY / GOING_CLEAN bits when we send the
236 * cross-cluster message.
238 cleared_dirty++;
239 setbit(MDMN_MSG_RR_CLEAN_DATA(*msgp), i - start_bit);
240 } else {
242 * Not marked as active in the pernode bitmap, so skip
243 * any update to this. We just increment the 0 count
244 * and adjust the active count by any outstanding
245 * un_pernode_dirty_sum[] entries. This means we don't
246 * leave the mirror permanently dirty.
248 (*activep) += (int)un->un_pernode_dirty_sum[i];
251 if (!cleared_dirty) {
252 kmem_free(*msgp, MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes));
253 *msgp = NULL;
255 un->un_rr_clean_start_bit = next_bit;
256 return (cleared_dirty);
257 }
259 /*
260 * There are three paths into here:
262 * md_daemon -> check_resync_regions -> prr
263 * mirror_internal_close -> mirror_process_unit_resync -> prr
264 * mirror_set_capability -> mirror_process_unit_resync -> prr
266 * The first one is a kernel daemon, the other two result from system calls.
267 * Thus, only the first case needs to deal with kernel CPR activity. This
268 * is indicated by the cprinfop being non-NULL for kernel daemon calls, and
269 NULL for system call paths.
270 */
271 static int
272 process_resync_regions_non_owner(mm_unit_t *un, callb_cpr_t *cprinfop)
273 {
274 int i, start, end;
275 int cleared_dirty = 0;
276 /* Number of reasons why we can not proceed shutting down the mirror. */
277 int active = 0;
278 set_t setno = MD_UN2SET(un);
279 md_mn_msg_rr_clean_t *rmsg;
280 md_mn_kresult_t *kres;
281 int rval;
282 minor_t mnum = MD_SID(un);
283 mdi_unit_t *ui = MDI_UNIT(mnum);
284 md_mn_nodeid_t owner_node;
287 * We drop the readerlock here to assist lock ordering with
288 * update_resync. Once we have the un_rrp_inflight_mx, we
289 * can re-acquire it.
291 md_unit_readerexit(ui);
294 * Resync region processing must be single threaded. We can't use
295 * un_resync_mx for this purpose since this mutex gets released
296 * when blocking on un_resync_cv.
298 mutex_enter(&un->un_rrp_inflight_mx);
300 (void) md_unit_readerlock(ui);
302 mutex_enter(&un->un_resync_mx);
304 rw_enter(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1], RW_READER);
305 cleared_dirty = mirror_generate_rr_bitmap(un, &rmsg, &active);
306 rw_exit(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1]);
308 if (cleared_dirty) {
309 owner_node = un->un_mirror_owner;
310 mutex_exit(&un->un_resync_mx);
313 * Transmit the 'to-be-cleared' bitmap to all cluster nodes.
314 * Receipt of the message will cause the mirror owner to
315 * update the on-disk DRL.
318 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
320 /* release readerlock before sending message */
321 md_unit_readerexit(ui);
323 if (cprinfop) {
324 mutex_enter(&un->un_prr_cpr_mx);
325 CALLB_CPR_SAFE_BEGIN(cprinfop);
328 rval = mdmn_ksend_message(setno, MD_MN_MSG_RR_CLEAN,
329 MD_MSGF_NO_LOG|MD_MSGF_BLK_SIGNAL|MD_MSGF_KSEND_NORETRY|
330 MD_MSGF_DIRECTED, un->un_mirror_owner,
331 (char *)rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg), kres);
333 if (cprinfop) {
334 CALLB_CPR_SAFE_END(cprinfop, &un->un_prr_cpr_mx);
335 mutex_exit(&un->un_prr_cpr_mx);
338 /* reacquire readerlock after message */
339 (void) md_unit_readerlock(ui);
341 if ((!MDMN_KSEND_MSG_OK(rval, kres)) &&
342 (kres->kmmr_comm_state != MDMNE_NOT_JOINED)) {
343 /* if commd is gone, no point in printing a message */
344 if (md_mn_is_commd_present())
345 mdmn_ksend_show_error(rval, kres, "RR_CLEAN");
346 kmem_free(kres, sizeof (md_mn_kresult_t));
347 kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg));
348 mutex_exit(&un->un_rrp_inflight_mx);
349 return (active);
351 kmem_free(kres, sizeof (md_mn_kresult_t));
354 * If ownership changed while we were sending, we probably
355 * sent the message to the wrong node. Leave fixing that for
356 * the next cycle.
358 if (un->un_mirror_owner != owner_node) {
359 mutex_exit(&un->un_rrp_inflight_mx);
360 return (active);
364 * Now that we've sent the message, clear them from the
365 * pernode_dirty arrays. These are ONLY cleared on a
366 * successful send, and failure has no impact.
368 cleared_dirty = 0;
369 start = MDMN_MSG_RR_CLEAN_START_BIT(rmsg);
370 end = start + MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg) * NBBY;
371 mutex_enter(&un->un_resync_mx);
372 rw_enter(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1],
373 RW_READER);
374 for (i = start; i < end; i++) {
375 if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg),
376 i - start)) {
377 if (IS_PERNODE_DIRTY(md_mn_mynode_id, i, un)) {
378 un->un_pernode_dirty_sum[i]--;
379 CLR_PERNODE_DIRTY(md_mn_mynode_id, i,
380 un);
382 if (IS_REGION_DIRTY(i, un)) {
383 cleared_dirty++;
384 CLR_REGION_DIRTY(i, un);
385 CLR_GOING_CLEAN(i, un);
389 rw_exit(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1]);
391 kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg));
393 mutex_exit(&un->un_resync_mx);
395 mutex_exit(&un->un_rrp_inflight_mx);
397 return (active);
400 static int
401 process_resync_regions_owner(mm_unit_t *un)
403 int i, start, end;
404 int cleared_dirty = 0;
405 /* Number of reasons why we can not proceed shutting down the mirror. */
406 int active = 0;
407 set_t setno = MD_UN2SET(un);
408 int mnset = MD_MNSET_SETNO(setno);
409 md_mn_msg_rr_clean_t *rmsg;
410 minor_t mnum = MD_SID(un);
411 mdi_unit_t *ui = MDI_UNIT(mnum);
414 * We drop the readerlock here to assist lock ordering with
415 * update_resync. Once we have the un_rrp_inflight_mx, we
416 * can re-acquire it.
418 md_unit_readerexit(ui);
421 * Resync region processing must be single threaded. We can't use
422 * un_resync_mx for this purpose since this mutex gets released
423 * when blocking on un_resync_cv.
425 mutex_enter(&un->un_rrp_inflight_mx);
427 (void) md_unit_readerlock(ui);
429 mutex_enter(&un->un_resync_mx);
430 un->un_waiting_to_clear++;
431 while (un->un_resync_flg & MM_RF_STALL_CLEAN)
432 cv_wait(&un->un_resync_cv, &un->un_resync_mx);
433 un->un_waiting_to_clear--;
435 if (mnset) {
436 rw_enter(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1],
437 RW_READER);
438 cleared_dirty = mirror_generate_rr_bitmap(un, &rmsg, &active);
440 if (cleared_dirty) {
442 * Clear the bits from the pernode_dirty arrays.
443 * If that results in any being cleared from the
444 * un_dirty_bm, commit it.
446 cleared_dirty = 0;
447 start = MDMN_MSG_RR_CLEAN_START_BIT(rmsg);
448 end = start + MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg) * NBBY;
449 for (i = start; i < end; i++) {
450 if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg),
451 i - start)) {
452 if (IS_PERNODE_DIRTY(md_mn_mynode_id, i,
453 un)) {
454 un->un_pernode_dirty_sum[i]--;
455 CLR_PERNODE_DIRTY(
456 md_mn_mynode_id, i, un);
458 if (un->un_pernode_dirty_sum[i] == 0) {
459 cleared_dirty++;
460 CLR_REGION_DIRTY(i, un);
461 CLR_GOING_CLEAN(i, un);
465 kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg));
467 rw_exit(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1]);
468 } else {
469 for (i = 0; i < un->un_rrd_num; i++) {
470 if (un->c.un_status & MD_UN_KEEP_DIRTY)
471 if (IS_KEEPDIRTY(i, un))
472 continue;
474 if (!IS_REGION_DIRTY(i, un))
475 continue;
476 if (un->un_outstanding_writes[i] != 0) {
477 active++;
478 continue;
481 if (!IS_GOING_CLEAN(i, un)) {
482 SET_GOING_CLEAN(i, un);
483 active++;
484 continue;
486 CLR_REGION_DIRTY(i, un);
487 CLR_GOING_CLEAN(i, un);
488 cleared_dirty++;
492 if (cleared_dirty) {
493 un->un_resync_flg |= MM_RF_GATECLOSED;
494 mutex_exit(&un->un_resync_mx);
495 mddb_commitrec_wrapper(un->un_rr_dirty_recid);
496 mutex_enter(&un->un_resync_mx);
497 un->un_resync_flg &= ~MM_RF_GATECLOSED;
499 if (un->un_waiting_to_mark != 0 ||
500 un->un_waiting_to_clear != 0) {
501 active++;
502 cv_broadcast(&un->un_resync_cv);
505 mutex_exit(&un->un_resync_mx);
507 mutex_exit(&un->un_rrp_inflight_mx);
509 return (active);
512 static int
513 process_resync_regions(mm_unit_t *un, callb_cpr_t *cprinfop)
515 int mnset = MD_MNSET_SETNO(MD_UN2SET(un));
517 * For a mirror we can only update the on-disk resync-record if we
518 * currently own the mirror. If we are called and there is no owner we
519 * bail out before scanning the outstanding_writes[] array.
520 * NOTE: we only need to check here (before scanning the array) as we
521 * are called with the readerlock held. This means that a change
522 * of ownership away from us will block until this resync check
523 * has completed.
525 if (mnset && (MD_MN_NO_MIRROR_OWNER(un) ||
526 (!MD_MN_MIRROR_OWNER(un) && !md_mn_is_commd_present_lite()))) {
527 return (0);
528 } else if (mnset && !MD_MN_MIRROR_OWNER(un)) {
529 return (process_resync_regions_non_owner(un, cprinfop));
530 } else {
531 return (process_resync_regions_owner(un));
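/*
 * In outline, the dispatch above behaves as follows: for a multi-node set
 * with no mirror owner (or a non-owner with no commd to relay the request)
 * nothing can be flushed, so 0 is returned; a non-owner with commd
 * available ships its clean bitmap to the owner via
 * process_resync_regions_non_owner(); the owner (and the non-MN case)
 * clears the bits locally and commits the DRL record via
 * process_resync_regions_owner().
 */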
536 * Function that is callable from other modules to provide
537 * ability to cleanup dirty region bitmap on demand. Used
538 * on last close of a unit to avoid massive device resyncs
539 * when coming back after rolling large amounts of data to
540 * a mirror (e.g. at umount with logging).
543 void
544 mirror_process_unit_resync(mm_unit_t *un)
546 int cleans = 0;
548 while (process_resync_regions(un, NULL)) {
550 cleans++;
551 if (cleans >= md_mirror_rr_cleans) {
552 cmn_err(CE_NOTE,
553 "Could not clean resync regions\n");
554 break;
556 if (cleans > md_mirror_rr_polls) {
558 * We did not make it with md_mirror_rr_polls
559 * iterations. Give the system relief and
560 * switch over to non-busy-wait.
562 delay(md_mirror_rr_sleep_timo * md_hz);
567 static void
568 check_resync_regions(daemon_request_t *timeout)
570 mdi_unit_t *ui;
571 mm_unit_t *un;
572 md_link_t *next;
573 callb_cpr_t cprinfo;
575 rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
576 for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
578 if (md_get_setstatus(next->ln_setno) & MD_SET_STALE)
579 continue;
581 un = MD_UNIT(next->ln_id);
584 * Register this resync thread with the CPR mechanism. This
585 * allows us to detect when the system is suspended and so
586 * keep track of the RPC failure condition.
588 CALLB_CPR_INIT(&cprinfo, &un->un_prr_cpr_mx, callb_md_mrs_cpr,
589 "check_resync_regions");
591 ui = MDI_UNIT(next->ln_id);
592 (void) md_unit_readerlock(ui);
595 * Do not clean up resync regions if it is an ABR
596 * mirror, or if a submirror is offline (we will use the resync
597 * region to resync when back online) or if there is only one
598 * submirror.
600 if ((ui->ui_tstate & MD_ABR_CAP) ||
601 (un->c.un_status & MD_UN_OFFLINE_SM) || (un->un_nsm < 2)) {
602 md_unit_readerexit(ui);
603 continue;
606 (void) process_resync_regions(un, &cprinfo);
608 md_unit_readerexit(ui);
610 /* Remove this thread from the CPR callback table. */
611 mutex_enter(&un->un_prr_cpr_mx);
612 CALLB_CPR_EXIT(&cprinfo);
615 rw_exit(&mirror_md_ops.md_link_rw.lock);
617 /* We are done */
618 mutex_enter(&mirror_timeout.dr_mx);
619 timeout->dr_pending = 0;
620 mutex_exit(&mirror_timeout.dr_mx);
623 static void
624 md_mirror_timeout(void *throwaway)
627 mutex_enter(&mirror_timeout.dr_mx);
628 if (!mirror_timeout.dr_pending) {
629 mirror_timeout.dr_pending = 1;
630 daemon_request(&md_mto_daemon, check_resync_regions,
631 (daemon_queue_t *)&mirror_timeout, REQ_OLD);
634 if (mirror_md_ops.md_head != NULL)
635 mirror_timeout.dr_timeout_id = timeout(md_mirror_timeout,
636 throwaway, (int)MD_MDELAY*hz);
637 else
638 mirror_timeout.dr_timeout_id = 0;
640 mutex_exit(&mirror_timeout.dr_mx);
643 void
644 resync_start_timeout(set_t setno)
646 if (md_get_setstatus(setno) & MD_SET_STALE)
647 return;
649 mutex_enter(&mirror_timeout.dr_mx);
650 if (mirror_timeout.dr_timeout_id == 0)
651 mirror_timeout.dr_timeout_id = timeout(md_mirror_timeout,
652 (void *)NULL, (int)MD_MDELAY*hz);
653 mutex_exit(&mirror_timeout.dr_mx);
656 static void
657 offlined_to_attached(mm_unit_t *un)
659 int i;
660 int changed = 0;
662 if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
663 return;
665 for (i = 0; i < NMIRROR; i++) {
666 if (SMS_BY_INDEX_IS(un, i, SMS_OFFLINE)) {
667 mirror_set_sm_state(&un->un_sm[i],
668 &un->un_smic[i], SMS_ATTACHED, 1);
669 changed++;
671 if (SMS_BY_INDEX_IS(un, i, SMS_OFFLINE_RESYNC)) {
672 mirror_set_sm_state(&un->un_sm[i],
673 &un->un_smic[i], SMS_ATTACHED_RESYNC, 1);
674 changed++;
678 if (changed != 0) {
679 un->c.un_status &= ~MD_UN_OFFLINE_SM;
680 mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCOM);
684 static void
685 get_unit_resync(mm_unit_t *un)
687 mddb_recstatus_t status;
688 struct optim_resync *orp;
690 if (un->un_rr_dirty_recid == 0) {
691 offlined_to_attached(un);
692 return;
695 status = mddb_getrecstatus(un->un_rr_dirty_recid);
696 if ((status == MDDB_NORECORD) || (status == MDDB_NODATA)) {
697 un->un_rr_dirty_recid = 0;
698 offlined_to_attached(un);
699 return;
702 mddb_setrecprivate(un->un_rr_dirty_recid, MD_PRV_GOTIT);
703 orp = (struct optim_resync *)mddb_getrecaddr(un->un_rr_dirty_recid);
704 un->un_dirty_bm = orp->or_rr;
707 static int
708 create_unit_resync(mm_unit_t *un, int snarfing)
710 diskaddr_t tb;
711 int i;
712 int blksize; /* rr size in blocks */
713 int num_rr;
714 mddb_recid_t recid;
715 size_t size; /* bitmap size */
716 optim_resync_t *orp;
717 mddb_type_t typ1;
718 set_t setno;
720 tb = un->c.un_total_blocks;
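/*
 * Resync region sizing: if the mirror is large enough that more than
 * MD_DEF_NUM_RR regions of MD_MIN_RR_SIZE blocks would be needed, the
 * region count is capped near MD_DEF_NUM_RR and the region size grows to
 * tb / MD_DEF_NUM_RR; otherwise the regions stay at MD_MIN_RR_SIZE blocks
 * each.
 */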
722 if (((tb + MD_MIN_RR_SIZE)/ MD_MIN_RR_SIZE) > MD_DEF_NUM_RR) {
723 blksize = (int)(tb / MD_DEF_NUM_RR);
724 num_rr = (int)((tb + (blksize)) / (blksize));
725 } else {
726 blksize = MD_MIN_RR_SIZE;
727 num_rr = (int)((tb + MD_MIN_RR_SIZE) / MD_MIN_RR_SIZE);
730 size = howmany(num_rr, NBBY) + sizeof (*orp) - sizeof (orp->or_rr);
732 setno = MD_UN2SET(un);
734 typ1 = (mddb_type_t)md_getshared_key(setno,
735 mirror_md_ops.md_driver.md_drivername);
737 recid = mddb_createrec(size, typ1, RESYNC_REC,
738 MD_CRO_OPTIMIZE|MD_CRO_32BIT, setno);
739 if (recid < 0) {
740 if (snarfing && !(md_get_setstatus(setno) & MD_SET_STALE)) {
741 md_set_setstatus(setno, MD_SET_STALE);
742 cmn_err(CE_WARN, "md: state database is stale");
744 return (-1);
747 un->un_rr_dirty_recid = recid;
748 orp = (optim_resync_t *)mddb_getrecaddr(recid);
749 orp->or_magic = OR_MAGIC;
750 orp->or_blksize = blksize;
751 orp->or_num = num_rr;
753 un->un_rrd_blksize = blksize;
754 un->un_rrd_num = num_rr;
755 un->un_dirty_bm = orp->or_rr;
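/*
 * When snarfing (the resync record had to be re-created while loading the
 * set), every bit in the dirty bitmap is set so that the whole mirror is
 * treated as needing a resync.
 */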
757 if (snarfing)
758 for (i = 0; i < howmany(num_rr, NBBY); i++)
759 orp->or_rr[i] = 0xFF;
761 if (!snarfing) {
762 mddb_commitrec_wrapper(recid);
763 mirror_commit(un, NO_SUBMIRRORS, 0);
764 return (0);
766 mddb_setrecprivate(recid, MD_PRV_PENDCOM);
767 mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCOM);
768 return (0);
772 unit_setup_resync(mm_unit_t *un, int snarfing)
774 int err;
775 int syncable;
776 int i;
777 mdi_unit_t *ui = MDI_UNIT(MD_SID(un));
778 int nonABR = 1; /* only set if ABR marked in ui_tstate */
780 un->un_dirty_bm = NULL;
781 un->un_rs_buffer = NULL;
783 mutex_init(&un->un_rrp_inflight_mx, "rrp mx", MUTEX_DEFAULT, NULL);
785 mutex_init(&un->un_resync_mx, NULL, MUTEX_DEFAULT, NULL);
786 cv_init(&un->un_resync_cv, NULL, CV_DEFAULT, NULL);
787 un->un_resync_flg = 0;
788 un->un_waiting_to_mark = 0;
789 un->un_waiting_to_commit = 0;
790 un->un_waiting_to_clear = 0;
792 un->un_goingclean_bm = NULL;
793 un->un_goingdirty_bm = NULL;
794 un->un_outstanding_writes = NULL;
795 un->un_resync_bm = NULL;
797 if (snarfing)
798 get_unit_resync(un);
800 if (un->un_rr_dirty_recid == 0) {
802 * If a MN diskset and snarfing and this node is not the
803 * master, do not delete any records on snarf of the
804 * mirror records (create_unit_resync deletes records).
806 * Master node should have already handled this case.
808 if (MD_MNSET_SETNO(MD_UN2SET(un)) && snarfing &&
809 md_set[MD_UN2SET(un)].s_am_i_master == 0) {
810 #ifdef DEBUG
811 cmn_err(CE_NOTE, "unit_setup_resync: no rr for %s on"
812 " nodeid %d\n", md_shortname(MD_SID(un)),
813 md_set[MD_UN2SET(un)].s_nodeid);
814 #endif
815 return (-1);
817 if ((err = create_unit_resync(un, snarfing)) != 0)
818 return (err);
821 un->un_goingclean_bm = (uchar_t *)kmem_zalloc((uint_t)(howmany(
822 un->un_rrd_num, NBBY)), KM_SLEEP);
823 un->un_goingdirty_bm = (uchar_t *)kmem_zalloc((uint_t)(howmany(
824 un->un_rrd_num, NBBY)), KM_SLEEP);
825 un->un_outstanding_writes = (short *)kmem_zalloc(
826 (uint_t)un->un_rrd_num * sizeof (short), KM_SLEEP);
827 un->un_resync_bm = (uchar_t *)kmem_zalloc((uint_t)(howmany(
828 un->un_rrd_num, NBBY)), KM_SLEEP);
831 * Allocate pernode bitmap for this node. All other nodes' maps will
832 * be created 'on-the-fly' in the ioctl message handler
834 if (MD_MNSET_SETNO(MD_UN2SET(un))) {
835 un->un_pernode_dirty_sum =
836 (uchar_t *)kmem_zalloc(un->un_rrd_num, KM_SLEEP);
837 if (md_mn_mynode_id > 0) {
838 un->un_pernode_dirty_bm[md_mn_mynode_id-1] = (uchar_t *)
839 kmem_zalloc((uint_t)(howmany(un->un_rrd_num, NBBY)),
840 KM_SLEEP);
844 * Allocate taskq to process deferred (due to locking) RR_CLEAN
845 * requests.
847 un->un_drl_task = (ddi_taskq_t *)md_create_taskq(MD_UN2SET(un),
848 MD_SID(un));
851 if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
852 return (0);
855 * Only mark mirror which has an associated DRL as requiring a resync.
856 * For ABR mirrors we need not set the resync record bitmap up.
858 if (ui && (ui->ui_tstate & MD_ABR_CAP))
859 nonABR = 0;
861 for (i = 0, syncable = 0; i < NMIRROR; i++) {
862 if (nonABR) {
863 if ((SUBMIRROR_IS_READABLE(un, i) ||
864 SMS_BY_INDEX_IS(un, i,
865 (SMS_OFFLINE | SMS_OFFLINE_RESYNC))))
866 syncable++;
870 if (snarfing && un->un_pass_num && (syncable > 1)) {
871 bcopy((caddr_t)un->un_dirty_bm, (caddr_t)un->un_resync_bm,
872 howmany(un->un_rrd_num, NBBY));
874 un->c.un_status |= (MD_UN_OPT_NOT_DONE | MD_UN_WAR);
875 un->c.un_status &= ~MD_UN_OFFLINE_SM;
876 for (i = 0; i < NMIRROR; i++) {
877 if ((SUBMIRROR_IS_READABLE(un, i)) ||
878 SMS_BY_INDEX_IS(un, i, SMS_OFFLINE_RESYNC))
879 un->un_sm[i].sm_flags |= MD_SM_RESYNC_TARGET;
881 if (SMS_BY_INDEX_IS(un, i, SMS_OFFLINE)) {
882 un->un_sm[i].sm_flags |= MD_SM_RESYNC_TARGET;
883 mirror_set_sm_state(&un->un_sm[i],
884 &un->un_smic[i], SMS_OFFLINE_RESYNC, 1);
885 mddb_setrecprivate(un->c.un_record_id,
886 MD_PRV_PENDCOM);
890 return (0);
894 * resync_kill_pending:
895 * -------------------
896 * Determine if the resync thread has been requested to terminate.
897 * Block if MD_RI_BLOCK or MD_RI_BLOCK_OWNER is set in un->un_rs_thread_flags.
898 * MD_RI_BLOCK is only set as a result of a user-initiated ioctl via metasync.
899 * MD_RI_BLOCK_OWNER is set by the ownership change of a multi-node mirror.
901 * Returns:
902 * 0 Kill not pending
903 * 1 Kill requested (set MD_UN_RESYNC_CANCEL in un->c.un_status)
905 * Note: this routine may block
906 * the writerlock for <ui> will be dropped and reacquired if <mx_type>
907 * is set to MD_WRITER_HELD.
908 * the readerlock for <ui> will be dropped and reacquired if <mx_type>
909 * is set to MD_READER_HELD.
911 static int
912 resync_kill_pending(
913 mm_unit_t *un,
914 mdi_unit_t *ui,
915 uint_t mx_type)
917 int retval = 0;
919 /* Ensure that we don't block with any mutex held */
920 if (mx_type == MD_WRITER_HELD) {
921 md_unit_writerexit(ui);
922 } else if (mx_type == MD_READER_HELD) {
923 md_unit_readerexit(ui);
925 mutex_enter(&un->un_rs_thread_mx);
926 while (un->un_rs_thread_flags & (MD_RI_BLOCK|MD_RI_BLOCK_OWNER)) {
927 cv_wait(&un->un_rs_thread_cv, &un->un_rs_thread_mx);
928 if (un->un_rs_thread_flags & (MD_RI_KILL|MD_RI_SHUTDOWN))
929 break;
931 /* Determine if we've been asked to abort or shutdown gracefully */
932 if (un->un_rs_thread_flags & MD_RI_KILL) {
933 un->c.un_status |= MD_UN_RESYNC_CANCEL;
934 retval = 1;
935 } else if (un->un_rs_thread_flags & MD_RI_SHUTDOWN) {
936 retval = 1;
938 mutex_exit(&un->un_rs_thread_mx);
940 /* Reacquire mutex if dropped on entry */
941 if (mx_type == MD_WRITER_HELD) {
942 (void) md_unit_writerlock(ui);
943 } else if (mx_type == MD_READER_HELD) {
944 (void) md_unit_readerlock(ui);
946 return (retval);
950 * resync_read_buffer:
951 * ------------------
952 * Issue the resync source read for the specified start block and size.
953 * This will cause the mirror strategy routine to issue a write-after-read
954 * once this request completes successfully.
955 * If 'flag_err' is set we expect to see a write error flagged in the b_error
956 * field of the buffer created for this i/o request. If clear we do not expect
957 * to see the error flagged for write failures.
958 * Read failures will always set the B_ERROR bit which will stop the resync
959 * immediately.
961 static int
962 resync_read_buffer(mm_unit_t *un, diskaddr_t blk, size_t cnt, int flag_err)
964 md_mcs_t *sp;
965 buf_t *bp;
966 int ret = 0;
968 sp = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
969 mirror_child_init(sp);
971 bp = &sp->cs_buf;
972 bp->b_edev = makedevice(md_major, MD_SID(un));
973 bp->b_flags = B_READ;
974 bp->b_lblkno = blk;
975 bp->b_bcount = dbtob(cnt);
976 bp->b_un.b_addr = un->un_rs_buffer;
977 md_unit_readerexit(MDI_UNIT(MD_SID(un)));
979 (void) md_mirror_strategy(bp, MD_STR_NOTTOP | MD_STR_MAPPED |
980 MD_STR_WAR | (flag_err ? MD_STR_FLAG_ERR : 0), NULL);
982 (void) biowait(bp);
984 (void) md_unit_readerlock(MDI_UNIT(MD_SID(un)));
985 if (bp->b_flags & B_ERROR) {
986 ret = 1;
988 kmem_cache_free(mirror_child_cache, sp);
989 return (ret);
993 * send_mn_resync_done_message
995 * At the end of a resync, send a message to all nodes to indicate that
996 * the resync is complete. The argument, flags, has the following values
998 * RESYNC_ERR - if an error occurred that terminated the resync
999 * CLEAR_OPT_NOT_DONE - Just need to clear the OPT_NOT_DONE flag
1001 * unit writerlock set on entry
1002 * Only send the message if the thread is not marked as shutting down:
1003 * [un_rs_thread_flags & MD_RI_SHUTDOWN] or being killed:
1004 * [un->c.un_status & MD_UN_RESYNC_CANCEL]
1005 * or if there has been an error that terminated the resync:
1006 * flags & RESYNC_ERR
1009 static void
1010 send_mn_resync_done_message(
1011 mm_unit_t *un,
1012 int flags
1015 md_mn_msg_resync_t *rmsg = un->un_rs_msg;
1016 set_t setno;
1017 mdi_unit_t *ui = MDI_UNIT(MD_SID(un));
1018 md_mn_kresult_t *kres;
1019 int dont_send = 0;
1020 int rval;
1022 rmsg = (md_mn_msg_resync_t *)un->un_rs_msg;
1025 * Only send the message if this resync thread is still active. This
1026 * handles the case where ownership changes to different nodes during
1027 * a resync, which can cause multiple spurious resync_done messages to
1028 * occur when the resync completes. This happens because only one node is
1029 * the resync owner but other nodes will have their resync_unit thread
1030 * blocked in 'resync_kill_pending'
1032 mutex_enter(&un->un_rs_thread_mx);
1033 dont_send = (un->un_rs_thread_flags & (MD_RI_KILL|MD_RI_SHUTDOWN)) ? 1
1034 : 0;
1035 mutex_exit(&un->un_rs_thread_mx);
1036 dont_send |= (un->c.un_status & MD_UN_RESYNC_CANCEL) ? 1 : 0;
1039 * Always send a message if we've encountered an error that terminated
1040 * the resync.
1042 if (flags & RESYNC_ERR)
1043 dont_send = 0;
1045 if (dont_send) {
1046 #ifdef DEBUG
1047 if (mirror_debug_flag) {
1048 printf("Don't send resync done message, mnum = %x,"
1049 " type = %x, flags = %d\n", MD_SID(un),
1050 un->un_rs_type, flags);
1052 #endif /* DEBUG */
1053 return;
1056 #ifdef DEBUG
1057 if (mirror_debug_flag) {
1058 printf("send resync done message, mnum = %x, type = %x\n",
1059 MD_SID(un), un->un_rs_type);
1061 #endif
1063 rmsg->msg_resync_mnum = MD_SID(un);
1064 rmsg->msg_resync_type = un->un_rs_type;
1065 rmsg->msg_originator = md_mn_mynode_id;
1066 rmsg->msg_resync_flags = 0;
1067 if (flags & RESYNC_ERR)
1068 rmsg->msg_resync_flags |= MD_MN_RS_ERR;
1069 if (flags & CLEAR_OPT_NOT_DONE)
1070 rmsg->msg_resync_flags |= MD_MN_RS_CLEAR_OPT_NOT_DONE;
1072 setno = MD_MIN2SET(MD_SID(un));
1073 md_unit_writerexit(ui);
1074 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
1076 mutex_enter(&un->un_rs_cpr_mx);
1077 CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo);
1079 rval = mdmn_ksend_message(setno, MD_MN_MSG_RESYNC_PHASE_DONE,
1080 MD_MSGF_NO_LOG, 0, (char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
1082 CALLB_CPR_SAFE_END(&un->un_rs_cprinfo, &un->un_rs_cpr_mx);
1083 mutex_exit(&un->un_rs_cpr_mx);
1085 /* if the node hasn't yet joined, it's Ok. */
1086 if ((!MDMN_KSEND_MSG_OK(rval, kres)) &&
1087 (kres->kmmr_comm_state != MDMNE_NOT_JOINED)) {
1088 mdmn_ksend_show_error(rval, kres, "RESYNC_PHASE_DONE");
1089 /* If we're shutting down already, pause things here. */
1090 if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
1091 while (!md_mn_is_commd_present()) {
1092 delay(md_hz);
1095 cmn_err(CE_PANIC, "ksend_message failure: RESYNC_PHASE_DONE");
1097 kmem_free(kres, sizeof (md_mn_kresult_t));
1098 (void) md_unit_writerlock(ui);
1102 * send_mn_resync_next_message
1104 * Send a message to all nodes indicating the next region to be resynced.
1105 * The message contains the region to be resynced and the current position in
1106 * the resync as denoted by un_rs_resync_done and un_rs_resync_2_do.
1107 * On entry the unit readerlock is held.
1109 static void
1110 send_mn_resync_next_message(
1111 mm_unit_t *un,
1112 diskaddr_t currentblk,
1113 size_t rsize,
1114 int flags
1117 md_mn_msg_resync_t *rmsg = un->un_rs_msg;
1118 set_t setno;
1119 md_mn_kresult_t *kres;
1120 mdi_unit_t *ui = MDI_UNIT(MD_SID(un));
1121 int rval;
1122 md_mps_t *ps;
1123 mm_submirror_t *sm;
1124 int smi;
1126 ASSERT(rmsg != NULL);
1127 #ifdef DEBUG
1128 if (mirror_debug_flag) {
1129 printf("send resync next message, mnum = %x, start=%lld, "
1130 "size=%ld, type=%x, done=%lld, 2_do=%lld\n",
1131 MD_SID(un), currentblk, rsize, un->un_rs_type,
1132 un->un_rs_resync_done, un->un_rs_resync_2_do);
1134 #endif
1135 rmsg->msg_resync_mnum = MD_SID(un);
1136 rmsg->msg_resync_type = un->un_rs_type;
1137 rmsg->msg_resync_start = currentblk;
1138 rmsg->msg_resync_rsize = rsize;
1139 rmsg->msg_resync_done = un->un_rs_resync_done;
1140 rmsg->msg_resync_2_do = un->un_rs_resync_2_do;
1141 rmsg->msg_originator = md_mn_mynode_id;
1142 if (flags & MD_FIRST_RESYNC_NEXT)
1143 rmsg->msg_resync_flags = MD_MN_RS_FIRST_RESYNC_NEXT;
1146 * Copy current submirror state and flags into message. This provides
1147 * a means of keeping all nodes that are currently active in the cluster
1148 * synchronised with regards to their submirror state settings. If we
1149 * did not pass this information here, the only time every node gets
1150 * submirror state updated is at the end of a resync phase. This can be
1151 * a significant amount of time for large metadevices.
1153 for (smi = 0; smi < NMIRROR; smi++) {
1154 sm = &un->un_sm[smi];
1155 rmsg->msg_sm_state[smi] = sm->sm_state;
1156 rmsg->msg_sm_flags[smi] = sm->sm_flags;
1158 setno = MD_MIN2SET(MD_SID(un));
1159 md_unit_readerexit(ui);
1160 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
1162 mutex_enter(&un->un_rs_cpr_mx);
1163 CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo);
1165 rval = mdmn_ksend_message(setno, MD_MN_MSG_RESYNC_NEXT, MD_MSGF_NO_LOG,
1166 0, (char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
1168 CALLB_CPR_SAFE_END(&un->un_rs_cprinfo, &un->un_rs_cpr_mx);
1169 mutex_exit(&un->un_rs_cpr_mx);
1171 if (!MDMN_KSEND_MSG_OK(rval, kres)) {
1172 mdmn_ksend_show_error(rval, kres, "RESYNC_NEXT");
1173 /* If we're shutting down already, pause things here. */
1174 if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
1175 while (!md_mn_is_commd_present()) {
1176 delay(md_hz);
1179 cmn_err(CE_PANIC, "ksend_message failure: RESYNC_NEXT");
1181 kmem_free(kres, sizeof (md_mn_kresult_t));
1182 (void) md_unit_readerlock(ui);
1183 ps = un->un_rs_prev_overlap;
1185 /* Allocate previous overlap reference if needed */
1186 if (ps == NULL) {
1187 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
1188 ps->ps_un = un;
1189 ps->ps_ui = ui;
1190 ps->ps_firstblk = 0;
1191 ps->ps_lastblk = 0;
1192 ps->ps_flags = 0;
1193 md_unit_readerexit(ui);
1194 (void) md_unit_writerlock(ui);
1195 un->un_rs_prev_overlap = ps;
1196 md_unit_writerexit(ui);
1197 (void) md_unit_readerlock(ui);
1200 ps->ps_firstblk = currentblk;
1201 ps->ps_lastblk = currentblk + rsize - 1;
1204 static int
1205 resync_read_blk_range(
1206 mm_unit_t *un,
1207 diskaddr_t currentblk,
1208 diskaddr_t stopbefore,
1209 uint_t type,
1210 int flags
1213 size_t copysize; /* limited by max xfer buf size */
1214 size_t rsize; /* size of resync block (for MN) */
1215 set_t setno;
1216 diskaddr_t newstop;
1217 diskaddr_t rs_startblk;
1218 uint_t rs_type;
1219 int flags1 = flags & MD_FIRST_RESYNC_NEXT;
1221 rs_type = un->un_rs_type;
1222 rs_startblk = currentblk;
1223 if (stopbefore > un->c.un_total_blocks)
1224 stopbefore = un->c.un_total_blocks;
1225 if (currentblk < un->un_resync_startbl)
1226 currentblk = un->un_resync_startbl;
1228 copysize = un->un_rs_copysize;
1229 rsize = MD_DEF_RESYNC_BLK_SZ;
1231 setno = MD_MIN2SET(MD_SID(un));
1232 while (currentblk < stopbefore) {
1234 * Split the block up into units of MD_DEF_RESYNC_BLK_SZ and
1235 * if a MN device and sendflag is set, send a RESYNC_MESSAGE
1236 * to all nodes.
1238 if ((currentblk + MD_DEF_RESYNC_BLK_SZ) > stopbefore)
1239 rsize = stopbefore - currentblk;
1240 if (MD_MNSET_SETNO(setno) && (flags & MD_SEND_MESS_XMIT)) {
1241 un->un_resync_startbl = currentblk;
1242 rs_startblk = currentblk;
1243 send_mn_resync_next_message(un, currentblk, rsize,
1244 flags1);
1245 if (flags1)
1246 flags1 = 0;
1247 /* check to see if we've been asked to terminate */
1248 if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)), type))
1249 return ((un->c.un_status & MD_UN_RESYNC_CANCEL)
1250 ? 1:0);
1252 * Check to see if another node has completed this
1253 * block, if so either the type or the resync region
1254 * will have changed. If the resync type has changed,
1255 * just exit.
1256 * If the resync region has changed, reset currentblk
1257 * to the start of the current resync region and
1258 * continue.
1260 if (un->un_rs_type != rs_type)
1261 return (0);
1262 if (un->un_rs_prev_overlap->ps_firstblk >
1263 rs_startblk) {
1264 currentblk =
1265 un->un_rs_prev_overlap->ps_firstblk;
1266 continue;
1269 newstop = currentblk + rsize;
1270 while (currentblk < newstop) {
1271 if ((currentblk + copysize) > stopbefore)
1272 copysize = (size_t)(stopbefore - currentblk);
1273 if (resync_read_buffer(un, currentblk, copysize,
1274 (flags & MD_RESYNC_FLAG_ERR)))
1275 return (1);
1277 /* resync_read_buffer releases/grabs a new lock */
1278 un = (mm_unit_t *)MD_UNIT(MD_SID(un));
1279 currentblk += copysize;
1281 /* check to see if we've been asked to terminate */
1282 if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)), type))
1283 return ((un->c.un_status & MD_UN_RESYNC_CANCEL)
1284 ? 1:0);
1285 if (MD_MNSET_SETNO(setno)) {
1287 * Check to see if another node has completed
1288 * this block, see above
1290 if (un->un_rs_type != rs_type)
1291 return (0);
1292 if (un->un_rs_prev_overlap->ps_firstblk >
1293 rs_startblk)
1294 currentblk =
1295 un->un_rs_prev_overlap->ps_firstblk;
1299 return (0);
1302 static void
1303 optimized_resync(mm_unit_t *un)
1305 mdi_unit_t *ui;
1306 minor_t mnum;
1307 int rr, smi;
1308 int resync_regions;
1309 uchar_t *dirtyregions;
1310 diskaddr_t first, stopbefore;
1311 int err;
1312 int cnt;
1313 sm_state_t state;
1314 int broke_out = 0;
1315 set_t setno;
1316 uint_t old_rs_type = un->un_rs_type;
1317 uint_t old_rs_done;
1318 uint_t flags1 = MD_FIRST_RESYNC_NEXT|MD_RESYNC_FLAG_ERR;
1319 size_t start_rr;
1321 mnum = MD_SID(un);
1322 ui = MDI_UNIT(mnum);
1323 setno = MD_UN2SET(un);
1325 if (!(un->c.un_status & MD_UN_OPT_NOT_DONE)) {
1327 * We aren't marked as needing a resync so for multi-node
1328 * sets we flag the completion so that all nodes see the same
1329 * metadevice state. This is a problem when a new node joins
1330 * an existing set as it has to perform a 'metasync -r' and
1331 * we have to step through all of the resync phases. If we
1332 * don't do this the nodes that were already in the set will
1333 * have the metadevices marked as 'Okay' but the joining node
1334 * will have 'Needs Maintenance' which is unclearable.
1336 if (MD_MNSET_SETNO(setno)) {
1337 send_mn_resync_done_message(un, CLEAR_OPT_NOT_DONE);
1339 return;
1343 * No need for optimized resync if ABR set, clear rs_type and flags
1344 * and exit
1346 if (ui->ui_tstate & MD_ABR_CAP) {
1347 un->un_rs_type = MD_RS_NONE;
1348 un->c.un_status &= ~(MD_UN_OPT_NOT_DONE | MD_UN_WAR);
1349 return;
1352 un->un_rs_dropped_lock = 1;
1353 un->c.un_status |= MD_UN_WAR;
1354 resync_regions = un->un_rrd_num;
1355 dirtyregions = un->un_resync_bm;
1356 md_unit_writerexit(ui);
1358 /* For MN sets, resync NOTIFY is done when processing resync messages */
1359 if (!MD_MNSET_SETNO(setno)) {
1360 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
1361 SVM_TAG_METADEVICE, setno, MD_SID(un));
1363 un = (mm_unit_t *)md_unit_readerlock(ui);
1365 /* check to see if we've been asked to terminate */
1366 if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)), MD_READER_HELD)) {
1367 if (un->c.un_status & MD_UN_RESYNC_CANCEL)
1368 broke_out = RESYNC_ERR;
1371 * Check that we are still performing an optimized
1372 * resync. If not, another node must have completed it
1373 * so we have no more work to do.
1375 if (un->un_rs_type != old_rs_type) {
1376 md_unit_readerexit(ui);
1377 (void) md_unit_writerlock(ui);
1378 return;
1381 * If rs_resync_done is non-zero, we must be completing an optimized
1382 * resync that has already been partially done on another node.
1383 * Therefore clear the bits in resync_bm for the resync regions
1384 * already done. If resync_startbl is zero, calculate 2_do.
1386 if (un->un_rs_resync_done > 0) {
1387 BLK_TO_RR(start_rr, un->un_resync_startbl, un);
1388 for (rr = 0; rr < start_rr && rr < resync_regions; rr++)
1389 CLR_KEEPDIRTY(rr, un);
1390 } else {
1391 un->un_rs_resync_2_do = 0;
1392 for (rr = 0; rr < resync_regions; rr++)
1393 if (isset(dirtyregions, rr))
1394 un->un_rs_resync_2_do++;
1397 for (rr = 0; (rr < resync_regions) && (broke_out != RESYNC_ERR); rr++) {
1398 if (isset(dirtyregions, rr)) {
1399 RR_TO_BLK(first, rr, un);
1400 RR_TO_BLK(stopbefore, rr+1, un);
1401 old_rs_type = un->un_rs_type;
1402 old_rs_done = un->un_rs_resync_done;
1403 err = resync_read_blk_range(un, first, stopbefore,
1404 MD_READER_HELD, MD_SEND_MESS_XMIT | flags1);
1405 flags1 = MD_RESYNC_FLAG_ERR;
1407 /* resync_read_blk_range releases/grabs a new lock */
1408 un = (mm_unit_t *)MD_UNIT(mnum);
1410 if (err) {
1411 broke_out = RESYNC_ERR;
1412 break;
1416 * Check that we are still performing an optimized
1417 * resync. If not, another node must have completed it
1418 * so we have no more work to do.
1420 if (un->un_rs_type != old_rs_type) {
1421 md_unit_readerexit(ui);
1422 (void) md_unit_writerlock(ui);
1423 return;
1427 * If resync_done has increased, we must have
1428 * blocked in resync_read_blk_range while another node
1429 * continued with the resync. Therefore clear resync_bm
1430 * for the blocks that have been resynced on another
1431 * node and update rr to the next RR to be done.
1433 if (old_rs_done < un->un_rs_resync_done) {
1434 int i;
1435 BLK_TO_RR(start_rr, un->un_resync_startbl - 1,
1436 un);
1437 for (i = rr; i < start_rr; i++)
1438 CLR_KEEPDIRTY(i, un);
1439 rr = start_rr;
1440 } else
1441 un->un_rs_resync_done++;
1443 for (smi = 0, cnt = 0; smi < NMIRROR; smi++)
1444 if (SUBMIRROR_IS_WRITEABLE(un, smi) &&
1445 !(SMS_BY_INDEX_IS(un, smi, SMS_ALL_ERRED)))
1446 cnt++;
1447 if (cnt < 2) {
1448 broke_out = RESYNC_ERR;
1449 break;
1451 CLR_KEEPDIRTY(rr, un);
1452 /* Check to see if we've completed the resync cleanly */
1453 if (un->un_rs_thread_flags & MD_RI_SHUTDOWN)
1454 break;
1457 * Check that we haven't exceeded un_rs_resync_2_do. If
1458 * we have we've completed the resync.
1460 if (un->un_rs_resync_done > un->un_rs_resync_2_do)
1461 break;
1464 md_unit_readerexit(ui);
1465 un = (mm_unit_t *)md_unit_writerlock(ui);
1468 * If MN set send message to all nodes to indicate resync
1469 * phase is complete. The processing of the message will update the
1470 * mirror state
1472 if (MD_MNSET_SETNO(setno)) {
1473 send_mn_resync_done_message(un, broke_out);
1474 } else {
1476 if (!broke_out)
1477 un->c.un_status &= ~MD_UN_WAR;
1479 un->c.un_status &= ~MD_UN_KEEP_DIRTY;
1481 setno = MD_UN2SET(un);
1482 for (smi = 0; smi < NMIRROR; smi++) {
1483 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
1484 if (SMS_BY_INDEX_IS(un, smi, SMS_OFFLINE_RESYNC)) {
1485 state = (broke_out ? SMS_OFFLINE : SMS_RUNNING);
1486 mirror_set_sm_state(&un->un_sm[smi],
1487 &un->un_smic[smi], state, broke_out);
1488 mirror_commit(un, NO_SUBMIRRORS, 0);
1490 if (SMS_BY_INDEX_IS(un, smi, SMS_OFFLINE))
1491 un->c.un_status |= MD_UN_OFFLINE_SM;
1495 /* For MN sets, resync NOTIFY is done when processing resync messages */
1496 if (!MD_MNSET_SETNO(setno)) {
1497 if (broke_out) {
1498 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
1499 SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
1500 } else {
1501 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
1502 SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
1508 * recalc_resync_done
1510 * This function deals with a change in value of un_rs_resync_2_do in a
1511 * component resync. This may change if we are restarting a component
1512 * resync on a single node having rebooted with a different value of
1513 * md_resync_bufsz or if we are running in a multi-node with nodes having
1514 * different values of md_resync_bufsz.
1515 * If there is a change in un_rs_resync_2_do, we need to recalculate
1516 * the value of un_rs_resync_done given the new value for resync_2_do.
1517 * We have to calculate a new value for resync_done to be either
1518 * if un_resync_startbl is set, (un_resync_startbl - initblock)/(blksize + skip)
1519 * or if it is not set, we need to calculate it from un_rs_resync_done,
1520 * (un_rs_resync_done/un_rs_resync_2_do) * resync_2_do
1521 * In addition we need to deal with the overflow case by using a factor to
1522 * prevent overflow
1525 static void
1526 recalc_resync_done(mm_unit_t *un, size_t resync_2_do, diskaddr_t initblock,
1527 u_longlong_t blk_size, u_longlong_t skip)
1529 diskaddr_t x;
1530 uint_t factor = 1;
1533 * If resync_2_do has not yet been calculated, no need to modify
1534 * resync_done
1536 if (un->un_rs_resync_2_do == 0) {
1537 return;
1539 if (un->un_rs_resync_2_do == resync_2_do)
1540 return; /* No change, so nothing to do */
1542 * If un_rs_startbl is set, another node must have already started
1543 * this resync and hence we can calculate resync_done from
1544 * resync_startbl
1546 if (un->un_resync_startbl) {
1547 un->un_rs_resync_done = (un->un_resync_startbl - initblock) /
1548 (blk_size + skip);
1549 return;
1552 * un_resync_startbl is not set so we must calculate it from
1553 * un_rs_resync_done.
1554 * If the larger of the two values of resync_2_do is greater than 32
1555 * bits, calculate a factor to divide by to ensure that we don't
1556 * overflow 64 bits when calculating the new value for resync_done
1558 x = (un->un_rs_resync_2_do > resync_2_do) ? un->un_rs_resync_2_do :
1559 resync_2_do;
1560 while (x > INT32_MAX) {
1561 x = x >> 1;
1562 factor = factor << 1;
1564 un->un_rs_resync_done = ((un->un_rs_resync_done/factor) *
1565 (resync_2_do/factor)) /
1566 ((un->un_rs_resync_2_do + (factor * factor) - 1)/
1567 (factor * factor));
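/*
 * Illustrative arithmetic: when both counts fit in 32 bits, factor stays 1
 * and the expression above reduces to a simple proportional rescale,
 * resync_done * resync_2_do / un_rs_resync_2_do. For example, 250 of 1000
 * units done under the old buffer size maps to 250 * 500 / 1000 = 125 of
 * 500 units under the new one, i.e. still 25% complete. The while loop
 * halves x and doubles factor only when needed to keep the intermediate
 * products within 64 bits.
 */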
1570 static void
1571 check_comp_4_resync(mm_unit_t *un, int smi, int ci)
1573 mdi_unit_t *ui;
1574 minor_t mnum;
1575 mm_submirror_t *sm;
1576 mm_submirror_ic_t *smic;
1577 size_t count;
1578 u_longlong_t skip;
1579 u_longlong_t size;
1580 u_longlong_t blk_size;
1581 diskaddr_t initblock;
1582 diskaddr_t block;
1583 diskaddr_t frag = 0;
1584 md_m_shared_t *shared;
1585 int err;
1586 set_t setno;
1587 int broke_out = 0;
1588 int blks;
1589 uint_t old_rs_type = un->un_rs_type;
1590 diskaddr_t old_rs_done;
1591 uint_t flags1 = MD_FIRST_RESYNC_NEXT;
1592 diskaddr_t resync_2_do;
1594 mnum = MD_SID(un);
1595 ui = MDI_UNIT(mnum);
1596 sm = &un->un_sm[smi];
1597 smic = &un->un_smic[smi];
1598 setno = MD_UN2SET(un);
1600 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
1601 (sm->sm_dev, sm, ci);
1603 if (shared->ms_state != CS_RESYNC) {
1604 SET_RS_TYPE_NONE(un->un_rs_type);
1605 return;
1608 if (shared->ms_flags & MDM_S_RS_TRIED) {
1609 SET_RS_TYPE_NONE(un->un_rs_type);
1610 return;
1613 (void) (*(smic->sm_get_bcss))
1614 (sm->sm_dev, sm, ci, &initblock, &count, &skip, &size);
1616 if ((count == 1) && (skip == 0)) {
1617 count = (size_t)(size / un->un_rs_copysize);
1618 if ((frag = (size - (count * un->un_rs_copysize))) != 0)
1619 count++;
1620 size = (u_longlong_t)un->un_rs_copysize;
1622 blk_size = size; /* Save block size for this resync */
1624 ASSERT(count >= 1);
1625 resync_2_do = count;
1627 * If part way through a resync, un_rs_resync_done/un_rs_resync_2_do
1628 * gives the proportion of the resync that has already been done.
1629 * If un_rs_copysize has changed since this previous partial resync,
1630 * either because this node has been rebooted with a different value
1631 * for md_resync_bufsz or because another node with a different value
1632 * for md_resync_bufsz performed the previous resync, we need to
1633 * recalculate un_rs_resync_done as a proportion of our value of
1634 * resync_2_do.
1636 recalc_resync_done(un, resync_2_do, initblock, blk_size, skip);
1639 * For MN mirrors we need to send a message to all nodes indicating
1640 * the next region to be resynced. For a component resync, the size of
1641 * the contiguous region that is processed by resync_read_blk_range()
1642 * may be small if there is the interleave size.
1643 * Therefore, rather than sending the message within
1644 * resync_read_blk_range(), we will send a message every
1645 * MD_DEF_RESYNC_BLK_SZ blocks. Calculate the frequency in terms of
1646 * the number of blocks. Then, if we are restarting a resync, round
1647 * un_rs_resync_done down to the previous resync region boundary. This
1648 * ensures that we send a RESYNC_NEXT message before resyncing any
1649 * blocks
1651 if (MD_MNSET_SETNO(setno)) {
1652 blks = ((MD_DEF_RESYNC_BLK_SZ + blk_size + skip - 1)/
1653 (blk_size + skip));
1654 un->un_rs_resync_done = (un->un_rs_resync_done/blks) * blks;
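/*
 * Here blks is the number of (blk_size + skip) units needed to cover
 * MD_DEF_RESYNC_BLK_SZ blocks, rounded up, and un_rs_resync_done is then
 * rounded down to a multiple of blks so that a restarted resync begins on
 * a RESYNC_NEXT region boundary.
 */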
1657 * un_rs_resync_done is the number of ('size' + 'skip') increments
1658 * already resynced from the base 'block'
1659 * un_rs_resync_2_do is the number of iterations in
1660 * this component resync.
1662 ASSERT(count >= un->un_rs_resync_done);
1663 un->un_rs_resync_2_do = (diskaddr_t)count;
1665 un->c.un_status |= MD_UN_WAR;
1666 sm->sm_flags |= MD_SM_RESYNC_TARGET;
1667 md_unit_writerexit(ui);
1669 /* For MN sets, resync NOTIFY is done when processing resync messages */
1670 if (!MD_MNSET_SETNO(setno)) {
1671 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
1672 SVM_TAG_METADEVICE, setno, MD_SID(un));
1674 un = (mm_unit_t *)md_unit_readerlock(ui);
1676 /* check to see if we've been asked to terminate */
1677 if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)), MD_READER_HELD)) {
1678 if (un->c.un_status & MD_UN_RESYNC_CANCEL)
1679 broke_out = RESYNC_ERR;
1682 * Check that we are still performing the same component
1683 * resync. If not, another node must have completed it
1684 * so we have no more work to do.
1686 if (un->un_rs_type != old_rs_type) {
1687 md_unit_readerexit(ui);
1688 (void) md_unit_writerlock(ui);
1689 return;
1692 * Adjust resync_done, resync_2_do, start of resync area and count to
1693 * skip already resync'd data. We need to recalculate resync_done as
1694 * we have dropped the unit lock above and may have lost ownership to
1695 * another node, with a different resync buffer size and it may have
1696 * sent us new values of resync_done and resync_2_do based on its
1697 * resync buffer size
1699 recalc_resync_done(un, resync_2_do, initblock, blk_size, skip);
1700 un->un_rs_resync_2_do = resync_2_do;
1701 count -= un->un_rs_resync_done;
1702 block = initblock + ((blk_size + skip) * (int)un->un_rs_resync_done);
1704 un->un_rs_dropped_lock = 1;
1705 while ((count > 0) && (broke_out != RESYNC_ERR)) {
1706 old_rs_done = un->un_rs_resync_done;
1708 * For MN mirrors send a message to the other nodes. This
1709 * message includes the size of the region that must be blocked
1710 * for all writes
1712 if (MD_MNSET_SETNO(setno)) {
1713 if ((un->un_rs_resync_done%blks == 0)) {
1714 un->un_resync_startbl = block;
1715 send_mn_resync_next_message(un, block,
1716 (blk_size+skip)*blks, flags1);
1717 flags1 = 0;
1719 * check to see if we've been asked to
1720 * terminate
1722 if (resync_kill_pending(un,
1723 MDI_UNIT(MD_SID(un)), MD_READER_HELD)) {
1724 if (un->c.un_status &
1725 MD_UN_RESYNC_CANCEL) {
1726 broke_out = RESYNC_ERR;
1727 break;
1732 * Check that we are still performing the same
1733 * component resync. If not, another node must
1734 * have completed it so we have no more work to
1735 * do. Also reset count to remaining resync as
1736 * we may have lost ownership in in
1737 * send_mn_resync_next_message while another
1738 * node continued with the resync and
1739 * incremented resync_done.
1741 if (un->un_rs_type != old_rs_type) {
1742 md_unit_readerexit(ui);
1743 (void) md_unit_writerlock(ui);
1744 return;
1747 * recalculate resync_done, resync_2_do
1748 * We need to recalculate resync_done as
1749 * we have dropped the unit lock in
1750 * send_mn_resync_next_message above and may
1751 * have lost ownership to another node, with a
1752 * different resync buffer size and it may have
1753 * sent us new values of resync_done and
1754 * resync_2_do based on its resync buffer size
1756 recalc_resync_done(un, resync_2_do, initblock,
1757 blk_size, skip);
1758 un->un_rs_resync_2_do = resync_2_do;
1759 count = un->un_rs_resync_2_do -
1760 un->un_rs_resync_done;
1762 * Adjust start of resync area to skip already
1763 * resync'd data
1765 block = initblock + ((blk_size + skip) *
1766 (int)un->un_rs_resync_done);
1767 old_rs_done = un->un_rs_resync_done;
1770 err = resync_read_blk_range(un, block, block + size,
1771 MD_READER_HELD, MD_RESYNC_FLAG_ERR);
1773 /* resync_read_blk_range releases/grabs a new lock */
1774 un = (mm_unit_t *)MD_UNIT(mnum);
1776 if (err) {
1777 broke_out = RESYNC_ERR;
1778 break;
1781 * If we are no longer resyncing this component, return as
1782 * another node has progressed the resync.
1784 if (un->un_rs_type != old_rs_type) {
1785 md_unit_readerexit(ui);
1786 (void) md_unit_writerlock(ui);
1787 return;
1791 * recalculate resync_done, resync_2_do. We need to recalculate
1792 * resync_done as we have dropped the unit lock in
1793 * resync_read_blk_range above and may have lost ownership to
1794 * another node, with a different resync buffer size and it may
1795 * have sent us new values of resync_done and resync_2_do based
1796 * on its resync buffer size
1798 recalc_resync_done(un, resync_2_do, initblock, blk_size, skip);
1799 un->un_rs_resync_2_do = resync_2_do;
1802 * Reset count to remaining resync as we may have blocked in
1803 * resync_read_blk_range while another node continued
1804 * with the resync and incremented resync_done. Also adjust
1805 * start of resync area to skip already resync'd data.
1807 count = un->un_rs_resync_2_do - un->un_rs_resync_done;
1808 block = initblock +((blk_size + skip) *
1809 (int)un->un_rs_resync_done);
1812 * If we are picking up from another node, we retry the last
1813 * block otherwise step on to the next block
1815 if (old_rs_done == un->un_rs_resync_done) {
1816 block += blk_size + skip;
1817 un->un_rs_resync_done++;
1818 count--;
1821 if ((count == 1) && frag)
1822 size = frag;
1823 if (shared->ms_state == CS_ERRED) {
1824 err = 1;
1825 broke_out = RESYNC_ERR;
1826 break;
1829 /* Check to see if we've completed the resync cleanly */
1830 if (un->un_rs_thread_flags & MD_RI_SHUTDOWN)
1831 break;
1834 md_unit_readerexit(ui);
1835 un = (mm_unit_t *)md_unit_writerlock(ui);
1838 * If MN set send message to all nodes to indicate resync
1839 * phase is complete. The processing of the message will update the
1840 * mirror state
1842 if (MD_MNSET_SETNO(setno)) {
1843 send_mn_resync_done_message(un, broke_out);
1844 } else {
1845 un->c.un_status &= ~MD_UN_WAR;
1846 sm->sm_flags &= ~MD_SM_RESYNC_TARGET;
1848 if (err)
1849 shared->ms_flags |= MDM_S_RS_TRIED;
1850 else
1852 * As we don't transmit the changes,
1853 * no need to drop the lock.
1855 set_sm_comp_state(un, smi, ci, CS_OKAY, 0,
1856 MD_STATE_NO_XMIT, (IOLOCK *)NULL);
1859 /* For MN sets, resync NOTIFY is done when processing resync messages */
1860 if (!MD_MNSET_SETNO(setno)) {
1861 if (broke_out) {
1862 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
1863 SVM_TAG_METADEVICE, setno, MD_SID(un));
1864 } else {
1865 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
1866 SVM_TAG_METADEVICE, setno, MD_SID(un));
1868 SET_RS_TYPE_NONE(un->un_rs_type);
1872 static void
1873 submirror_resync(mm_unit_t *un)
1875 mdi_unit_t *ui;
1876 minor_t mnum;
1877 mm_submirror_t *sm;
1878 mm_submirror_ic_t *smic;
1879 int smi;
1880 diskaddr_t chunk;
1881 diskaddr_t curblk;
1882 int err;
1883 int cnt;
1884 set_t setno;
1885 int broke_out = 0;
1886 int i;
1887 int flags1 = MD_FIRST_RESYNC_NEXT;
1888 int compcnt;
1890 mnum = MD_SID(un);
1891 ui = MDI_UNIT(mnum);
1892 setno = MD_UN2SET(un);
1895 * If the submirror_index is non-zero, we are continuing a resync
1896 * so restart resync from last submirror marked as being resynced.
1898 if (RS_SMI(un->un_rs_type) != 0) {
1899 smi = RS_SMI(un->un_rs_type);
1900 sm = &un->un_sm[smi];
1901 smic = &un->un_smic[smi];
1902 if (!SMS_IS(sm, SMS_ATTACHED_RESYNC)) {
1903 for (smi = 0; smi < NMIRROR; smi++) {
1904 sm = &un->un_sm[smi];
1905 smic = &un->un_smic[smi];
1906 if (SMS_IS(sm, SMS_ATTACHED_RESYNC))
1907 break;
1910 } else {
1911 for (smi = 0; smi < NMIRROR; smi++) {
1912 sm = &un->un_sm[smi];
1913 smic = &un->un_smic[smi];
1914 if (SMS_IS(sm, SMS_ATTACHED_RESYNC))
1915 break;
1918 if (smi == NMIRROR) {
1919 SET_RS_TYPE_NONE(un->un_rs_type);
1920 return;
1924 * If we've only got one component we can fail on a resync write
1925 * if an error is encountered. This stops an unnecessary read of the
1926 * whole mirror on a target write error.
1928 compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
1929 if (compcnt == 1)
1930 flags1 |= MD_RESYNC_FLAG_ERR;
1932 un->c.un_status |= MD_UN_WAR;
1933 sm->sm_flags |= MD_SM_RESYNC_TARGET;
1934 SET_RS_SMI(un->un_rs_type, smi);
1935 md_unit_writerexit(ui);
1937 /* For MN sets, resync NOTIFY is done when processing resync messages */
1938 if (!MD_MNSET_SETNO(setno)) {
1939 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
1940 SVM_TAG_METADEVICE, setno, MD_SID(un));
1942 un = (mm_unit_t *)md_unit_readerlock(ui);
1944 un->un_rs_dropped_lock = 1;
1946 /* check to see if we've been asked to terminate */
1947 if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)), MD_READER_HELD)) {
1948 if (un->c.un_status & MD_UN_RESYNC_CANCEL)
1949 broke_out = RESYNC_ERR;
1952 * Check that we are still performing the same submirror
1953 * resync. If not, another node must have completed it
1954 * so we have no more work to do.
1956 if (RS_TYPE(un->un_rs_type) != MD_RS_SUBMIRROR) {
1957 md_unit_readerexit(ui);
1958 (void) md_unit_writerlock(ui);
1959 return;
1962 /* if > 1TB mirror, increase percent done granularity */
1963 if (un->c.un_total_blocks > MD_MAX_BLKS_FOR_SMALL_DEVS)
1964 chunk = un->c.un_total_blocks / 1000;
1965 else
1966 chunk = un->c.un_total_blocks / 100;
1967 if (chunk == 0)
1968 chunk = un->c.un_total_blocks;
1970 * If a MN set, round the chunk size up to a multiple of
1971 * MD_DEF_RESYNC_BLK_SZ
1973 if (MD_MNSET_SETNO(setno)) {
1974 chunk = ((chunk + MD_DEF_RESYNC_BLK_SZ)/MD_DEF_RESYNC_BLK_SZ)
1975 * MD_DEF_RESYNC_BLK_SZ;
1976 if (chunk > un->c.un_total_blocks)
1977 chunk = un->c.un_total_blocks;
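/*
 * Illustrative sizing note: chunk works out to roughly 0.1% of the
 * mirror for > 1TB devices and roughly 1% otherwise, so progress is
 * tracked in ~1000 (or ~100) steps; for MN sets the chunk is also
 * rounded up to a MD_DEF_RESYNC_BLK_SZ multiple and clamped to the
 * mirror size, as above.
 */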
1980 * Handle restartable resyncs that continue from where the previous
1981 * resync left off. The new resync range is from un_rs_resync_done ..
1982 * un_rs_resync_2_do
1984 curblk = 0;
1985 if (un->un_rs_resync_done == 0) {
1986 un->un_rs_resync_2_do = un->c.un_total_blocks;
1987 } else {
1988 curblk = un->un_rs_resync_done;
1990 while ((curblk != un->c.un_total_blocks) && (broke_out != RESYNC_ERR)) {
1991 diskaddr_t rs_done;
1993 rs_done = un->un_rs_resync_done;
1994 err = resync_read_blk_range(un, curblk, curblk + chunk,
1995 MD_READER_HELD, MD_SEND_MESS_XMIT | flags1);
1996 flags1 = (compcnt == 1 ? MD_RESYNC_FLAG_ERR : 0);
1998 /* resync_read_blk_range releases/grabs a new lock */
1999 un = (mm_unit_t *)MD_UNIT(mnum);
2001 if (err) {
2002 broke_out = RESYNC_ERR;
2003 break;
2007 * If we are no longer executing a submirror resync, return
2008 * as another node has completed the submirror resync.
2010 if (RS_TYPE(un->un_rs_type) != MD_RS_SUBMIRROR) {
2011 md_unit_readerexit(ui);
2012 (void) md_unit_writerlock(ui);
2013 return;
2016 * If resync_done has changed, we must have blocked
2017 * in resync_read_blk_range while another node
2018 * continued with the resync so restart from resync_done.
2020 if (rs_done != un->un_rs_resync_done) {
2021 curblk = un->un_rs_resync_done;
2022 } else {
2023 curblk += chunk;
2024 un->un_rs_resync_done = curblk;
2027 if ((curblk + chunk) > un->c.un_total_blocks)
2028 chunk = un->c.un_total_blocks - curblk;
2029 for (i = 0, cnt = 0; i < NMIRROR; i++)
2030 if (SUBMIRROR_IS_WRITEABLE(un, i) &&
2031 !SMS_BY_INDEX_IS(un, i, SMS_ALL_ERRED) &&
2032 (un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET))
2033 cnt++;
2034 if (cnt == 0) {
2035 broke_out = RESYNC_ERR;
2036 break;
2039 /* Check to see if we've completed the resync cleanly */
2040 if (un->un_rs_thread_flags & MD_RI_SHUTDOWN)
2041 break;
2043 md_unit_readerexit(ui);
2044 un = (mm_unit_t *)md_unit_writerlock(ui);
2047 * If MN set, send a message to all nodes to indicate the resync
2048 * phase is complete. The processing of the message will update the
2049 * mirror state.
2051 if (MD_MNSET_SETNO(setno)) {
2052 send_mn_resync_done_message(un, broke_out);
2053 } else {
2054 sm->sm_flags &= ~MD_SM_RESYNC_TARGET;
2055 if (err) {
2056 mirror_set_sm_state(sm, smic, SMS_ATTACHED, 1);
2057 } else {
2058 mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
2060 un->c.un_status &= ~MD_UN_WAR;
2061 mirror_commit(un, SMI2BIT(smi), 0);
2064 /* For MN sets, resync NOTIFY is done when processing resync messages */
2065 if (!MD_MNSET_SETNO(setno)) {
2066 if (broke_out) {
2067 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
2068 SVM_TAG_METADEVICE, setno, MD_SID(un));
2069 } else {
2070 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
2071 SVM_TAG_METADEVICE, setno, MD_SID(un));
2076 static void
2077 component_resync(mm_unit_t *un)
2079 mm_submirror_t *sm;
2080 mm_submirror_ic_t *smic;
2081 int ci;
2082 int i;
2083 int compcnt;
2086 * Handle the case where we are picking up a partially complete
2087 * component resync. In this case un_rs_type contains the submirror
2088 * and component index of where we should restart the resync.
2090 while (un->un_rs_type != MD_RS_COMPONENT) {
2091 i = RS_SMI(un->un_rs_type);
2092 ci = RS_CI(un->un_rs_type);
2093 check_comp_4_resync(un, i, ci);
2094 if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)),
2095 MD_WRITER_HELD))
2096 return;
2098 * If we have no current resync, continue to scan submirrors and
2099 * components. If the resync has moved on to another component,
2100 * restart it, and if the resync is no longer a component
2101 * resync, just exit.
2103 if (RS_TYPE(un->un_rs_type) == MD_RS_NONE)
2104 break;
2105 if (RS_TYPE(un->un_rs_type) != MD_RS_COMPONENT)
2106 return;
2108 /* Now continue scanning _all_ submirrors and components */
2109 for (i = 0; i < NMIRROR; i++) {
2110 sm = &un->un_sm[i];
2111 smic = &un->un_smic[i];
2112 if (!SMS_IS(sm, SMS_RUNNING | SMS_LIMPING))
2113 continue;
2114 compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
2115 for (ci = 0; ci < compcnt; ci++) {
2116 SET_RS_SMI(un->un_rs_type, i);
2117 SET_RS_CI(un->un_rs_type, ci);
2118 SET_RS_TYPE(un->un_rs_type, MD_RS_COMPONENT);
2119 check_comp_4_resync(un, i, ci);
2120 /* Bail out if we've been asked to abort/shutdown */
2121 if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)),
2122 MD_WRITER_HELD))
2123 return;
2125 * Now check if another node has continued with the
2126 * resync. If we are no longer in a component resync,
2127 * exit; otherwise update to the current component - 1
2128 * so that the next call of check_comp_4_resync() will
2129 * resync the current component.
2131 if ((RS_TYPE(un->un_rs_type) != MD_RS_NONE) &&
2132 (RS_TYPE(un->un_rs_type) != MD_RS_COMPONENT))
2133 return;
2134 else {
2135 if (RS_SMI(un->un_rs_type) != i) {
2136 i = RS_SMI(un->un_rs_type);
2137 ci = RS_CI(un->un_rs_type) - 1;
2138 } else if (RS_CI(un->un_rs_type) != ci)
2139 ci = RS_CI(un->un_rs_type) - 1;
2145 static void
2146 reset_comp_flags(mm_unit_t *un)
2148 mm_submirror_t *sm;
2149 mm_submirror_ic_t *smic;
2150 md_m_shared_t *shared;
2151 int ci;
2152 int i;
2153 int compcnt;
2155 for (i = 0; i < NMIRROR; i++) {
2156 sm = &un->un_sm[i];
2157 smic = &un->un_smic[i];
2158 if (!SMS_IS(sm, SMS_INUSE))
2159 continue;
2160 compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
2161 for (ci = 0; ci < compcnt; ci++) {
2162 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2163 (sm->sm_dev, sm, ci);
2164 shared->ms_flags &= ~MDM_S_RS_TRIED;
2170 * resync_progress_thread:
2171 * ----------------------
2172 * Thread started on first resync of a unit which simply blocks until woken up
2173 * by a cv_signal, and then updates the mddb for the mirror unit record. This
2174 * saves the resync progress information (un_rs_resync_done, un_rs_resync_2_do)
2175 * so that an aborted resync can be continued after an intervening reboot.
2177 static void
2178 resync_progress_thread(minor_t mnum)
2180 mm_unit_t *un = MD_UNIT(mnum);
2181 mdi_unit_t *ui = MDI_UNIT(mnum);
2182 set_t setno = MD_MIN2SET(mnum);
2184 while (un->c.un_status & MD_UN_RESYNC_ACTIVE) {
2185 mutex_enter(&un->un_rs_progress_mx);
2186 cv_wait(&un->un_rs_progress_cv, &un->un_rs_progress_mx);
2187 mutex_exit(&un->un_rs_progress_mx);
2188 if (un->un_rs_progress_flags & MD_RI_KILL)
2189 break;
2192 * Commit mirror unit if we're the Master node in a multi-node
2193 * environment
2195 if (MD_MNSET_SETNO(setno) && md_set[setno].s_am_i_master) {
2196 (void) md_unit_readerlock(ui);
2197 mirror_commit(un, NO_SUBMIRRORS, 0);
2198 md_unit_readerexit(ui);
2201 thread_exit();
2205 * resync_progress:
2206 * ---------------
2207 * Timeout handler for updating the progress of the resync thread.
2208 * Simply wake up the resync progress thread, which will then mirror_commit() the
2209 * unit structure to the mddb. This snapshots the current progress of the resync.
2211 static void
2212 resync_progress(void *arg)
2214 mm_unit_t *un = (mm_unit_t *)arg;
2215 mdi_unit_t *ui = MDI_UNIT(MD_SID(un));
2216 uint_t active;
2218 mutex_enter(&un->un_rs_progress_mx);
2219 cv_signal(&un->un_rs_progress_cv);
2220 mutex_exit(&un->un_rs_progress_mx);
2222 /* schedule the next timeout if the resync is still marked active */
2223 (void) md_unit_readerlock(ui);
2224 active = un->c.un_status & MD_UN_RESYNC_ACTIVE ? 1 : 0;
2225 md_unit_readerexit(ui);
2226 if (active) {
2227 un->un_rs_resync_to_id = timeout(resync_progress, un,
2228 (clock_t)(drv_usectohz(60000000) *
2229 md_mirror_resync_update_intvl));
2234 * resync_unit:
2235 * -----------
2236 * Resync thread which drives all forms of resync (optimized, component,
2237 * submirror). Must handle thread suspension and kill to allow multi-node
2238 * resync to run without undue ownership changes.
2240 * For a MN set, the resync mechanism is as follows:
2242 * When a resync is started, either via metattach, metaonline, metareplace,
2243 * metasync or by a hotspare kicking in, a message is sent to all nodes, which
2244 * calls mirror_resync_thread. If there is currently no mirror owner, the
2245 * master node sends a CHOOSE_OWNER message to the handler on the master. This
2246 * chooses a mirror owner and sends a CHANGE_OWNER message requesting the
2247 * selected node to become the owner.
2248 * If this node is not the owner it sets itself to block in resync_kill_pending
2249 * and if there is no owner all nodes will block until the chosen owner is
2250 * selected, in which case it will unblock itself. So, on entry to this
2251 * function only one node will continue past resync_kill_pending().
2252 * Once the resync thread is started, it basically cycles through the optimized,
2253 * component and submirror resyncs until there is no more work to do.
2255 * For an ABR mirror, once a mirror owner is chosen it will complete the resync
2256 * unless the node dies, in which case a new owner will be chosen and it will
2257 * have to complete the resync from the point at which the previous owner died.
2258 * To do this we broadcast a RESYNC_NEXT message before each region to be
2259 * resynced and this message contains the address and length of the region
2260 * being resynced and the current progress through the resync. The size of
2261 * this region is MD_DEF_RESYNC_BLK_SZ blocks. It is larger than the resync
2262 * block size to limit the amount of inter-node traffic. The RESYNC_NEXT
2263 * message also indicates to all other nodes that all writes to this block
2264 * must be blocked until the next RESYNC_NEXT message is received. This ensures
2265 * that no node can write to a block that is being resynced. For all MN
2266 * mirrors we also block the whole resync region on the resync owner node so
2267 * that all writes to the resync region are blocked on all nodes. There is a
2268 * difference here between a MN set and a regular set in that for a MN set
2269 * we protect the mirror from writes to the current resync block by blocking
2270 * a larger region. For a regular set we just block writes to the current
2271 * resync block.
2273 * For a non-ABR mirror the same RESYNC_NEXT message is sent with an
2274 * additional purpose. In this case, there is only one mirror owner at a time
2275 * and rather than continually switching ownership between the chosen mirror
2276 * owner and the node that is writing to the mirror, we move the resync to the
2277 * mirror owner. When we switch ownership, we block the old owner and unblock
2278 * the resync thread on the new owner. To enable the new owner to continue the
2279 * resync, all nodes need to have the latest resync status. Then, following each
2280 * resync write, we check to see if the resync state has changed and if it
2281 * has this must be because we have lost ownership to another node(s) for a
2282 * period and then have become owner again later in the resync process. If we
2283 * are still dealing with the same resync, we just adjust addresses and counts
2284 * and then continue. If the resync has moved on to a different type, for
2285 * example from an optimized to a submirror resync, we move on to process the
2286 * resync described by rs_type and continue from the position described by
2287 * resync_done and resync_startbl.
2289 * Note that for non-ABR mirrors it is possible for a write to be made on a
2290 * non resync-owner node without a change of ownership. This is the case when
2291 * the mirror has a soft part created on it and a write in ABR mode is made
2292 * to that soft part. Therefore we still need to block writes to the resync
2293 * region on all nodes.
2295 * Sending the latest resync state to all nodes also enables them to continue
2296 * a resync in the event that the mirror owner dies. If a mirror owner for
2297 * a non-ABR mirror has died, there will be dirty resync regions. Therefore,
2298 * regardless of whether another type of resync was in progress, we must first
2299 * do an optimized resync to clean up the dirty regions before continuing
2300 * with the interrupted resync.
2302 * The resync status is held in the unit structure
2303 * On disk
2304 * un_rs_resync_done The number of contiguous resync blocks done so far
2305 * un_rs_resync_2_do The total number of contiguous resync blocks
2306 * un_rs_type The resync type (inc submirror and component numbers)
2307 * In core
2308 * un_resync_startbl The address of the current resync block being processed
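 *
 * un_rs_type packs the resync type together with the submirror and
 * component indices; it is read and updated through the RS_TYPE(),
 * RS_SMI(), RS_CI() and SET_RS_*() macros used throughout this file.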
2310 * In the event that the whole cluster fails, we need to just use
2311 * un_rs_resync_done to restart the resync. To ensure that this is
2312 * periodically written to disk, we have a thread which writes the record
2313 * to disk every 5 minutes. As the granularity of un_rs_resync_done is
2314 * usually coarse (for an optimized resync 1001 is the max value) there is
2315 * little point in writing this more frequently.
2317 static void
2318 resync_unit(minor_t mnum)
2320 mdi_unit_t *ui;
2321 mm_unit_t *un;
2322 md_error_t mde = mdnullerror;
2323 int mn_resync = 0;
2324 int resync_finish = 0;
2325 set_t setno = MD_MIN2SET(mnum);
2326 uint_t old_rs_type = MD_RS_NONE;
2327 uint_t old_rs_done = 0, old_rs_2_do = 0;
2328 uint_t old_rs_startbl = 0;
2329 int block_resync = 1;
2330 char cpr_name[23]; /* Unique CPR name */
2331 int rs_copysize;
2332 char *rs_buffer;
2334 resync_restart:
2335 #ifdef DEBUG
2336 if (mirror_debug_flag)
2337 printf("Resync started (mnum = %x)\n", mnum);
2338 #endif
2340 * increment the mirror resync count
2342 mutex_enter(&md_cpr_resync.md_resync_mutex);
2343 md_cpr_resync.md_mirror_resync++;
2344 mutex_exit(&md_cpr_resync.md_resync_mutex);
2346 ui = MDI_UNIT(mnum);
2347 un = MD_UNIT(mnum);
2349 rs_copysize = un->un_rs_copysize;
2350 if (rs_copysize == 0) {
2352 * Don't allow buffer size to fall outside the
2353 * range 0 < bufsize <= md_max_xfer_bufsz.
2355 if (md_resync_bufsz <= 0)
2356 md_resync_bufsz = MD_DEF_RESYNC_BUF_SIZE;
2357 rs_copysize = MIN(md_resync_bufsz, md_max_xfer_bufsz);
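/*
 * rs_copysize is expressed in 512-byte disk blocks; dbtob() below
 * converts it to a byte count for the resync buffer allocation.
 */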
2359 rs_buffer = kmem_zalloc(dbtob(rs_copysize), KM_SLEEP);
2360 un = md_unit_writerlock(ui);
2361 un->un_rs_copysize = rs_copysize;
2362 un->un_rs_buffer = rs_buffer;
2364 if (MD_MNSET_SETNO(setno)) {
2366 * Register this resync thread with the CPR mechanism. This
2367 * allows us to detect when the system is suspended and so
2368 * keep track of the RPC failure condition.
2370 (void) snprintf(cpr_name, sizeof (cpr_name),
2371 "mirror_resync%x", mnum);
2372 CALLB_CPR_INIT(&un->un_rs_cprinfo, &un->un_rs_cpr_mx,
2373 callb_md_mrs_cpr, cpr_name);
2375 if (ui->ui_tstate & MD_RESYNC_NOT_DONE) {
2377 * If this is the first resync following the initial
2378 * snarf (MD_RESYNC_NOT_DONE still set) and we've
2379 * been started outside a reconfig step (e.g. by being
2380 * added to an existing set) we need to query the
2381 * existing submirror state for this mirror.
2382 * The set_status flags will have MD_MN_SET_MIR_STATE_RC
2383 * set if we've been through a step4 reconfig, so only
2384 * query the master if this isn't (yet) set. In this
2385 * case we must continue the resync thread as there is
2386 * not guaranteed to be a currently running resync on
2387 * any of the other nodes. Worst case is that we will
2388 * initiate an ownership change to this node and then
2389 * find that there is no resync to perform. However, we
2390 * will then have correct status across the cluster.
2392 if (!md_set[setno].s_am_i_master) {
2393 if (!(md_get_setstatus(setno) &
2394 MD_SET_MN_MIR_STATE_RC)) {
2395 mirror_get_status(un, NULL);
2396 block_resync = 0;
2397 #ifdef DEBUG
2398 if (mirror_debug_flag) {
2399 mm_submirror_t *sm;
2400 int i;
2401 for (i = 0; i < NMIRROR; i++) {
2402 sm = &un->un_sm[i];
2403 printf(
2404 "sm[%d] state=%4x"
2405 " flags=%4x\n", i,
2406 sm->sm_state,
2407 sm->sm_flags);
2410 #endif
2413 ui->ui_tstate &= ~MD_RESYNC_NOT_DONE;
2416 * For MN set, if we have an owner, then start the resync on it.
2417 * If there is no owner the master must send a message to
2418 * choose the owner. This message will contain the current
2419 * resync count and it will only be sent to the master, where
2420 * the resync count will be used to choose the next node to
2421 * perform a resync, by cycling through the nodes in the set.
2422 * The message handler will then send a CHANGE_OWNER message to
2423 * all nodes, and on receipt of that message, the chosen owner
2424 * will issue a SET_OWNER ioctl to become the owner. This ioctl
2425 * will be requested to spawn a thread to issue the
2426 * REQUEST_OWNER message to become the owner which avoids the
2427 * need for concurrent ioctl requests.
2428 * After sending the message, we will block waiting for one
2429 * of the nodes to become the owner and start the resync
2431 if (MD_MN_NO_MIRROR_OWNER(un)) {
2433 * There is no owner, block and then the master will
2434 * choose the owner. Only perform this if 'block_resync'
2435 * is set.
2437 if (block_resync) {
2438 mutex_enter(&un->un_rs_thread_mx);
2439 un->un_rs_thread_flags |= MD_RI_BLOCK_OWNER;
2440 mutex_exit(&un->un_rs_thread_mx);
2442 if (md_set[setno].s_am_i_master) {
2443 md_unit_writerexit(ui);
2444 (void) mirror_choose_owner(un, NULL);
2445 (void) md_unit_writerlock(ui);
2447 } else {
2448 /* There is an owner, block if we are not it */
2449 if (!MD_MN_MIRROR_OWNER(un)) {
2450 mutex_enter(&un->un_rs_thread_mx);
2451 un->un_rs_thread_flags |= MD_RI_BLOCK_OWNER;
2452 mutex_exit(&un->un_rs_thread_mx);
2457 * Start a timeout chain to update the resync progress to the mddb.
2458 * This will run every md_mirror_resync_update_intvl minutes and allows
2459 * a resync to be continued over a reboot.
2461 ASSERT(un->un_rs_resync_to_id == 0);
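/*
 * drv_usectohz(60000000) is the tick count for one minute, so the
 * timeout below fires every md_mirror_resync_update_intvl minutes.
 */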
2462 un->un_rs_resync_to_id = timeout(resync_progress, un,
2463 (clock_t)(drv_usectohz(60000000) * md_mirror_resync_update_intvl));
2466 * Handle resync restart from the last logged position. The contents
2467 * of un_rs_resync_2_do and un_rs_resync_done are dependent on the
2468 * type of resync that was in progress.
2470 if (MD_MNSET_SETNO(setno)) {
2471 switch ((uint_t)RS_TYPE(un->un_rs_type)) {
2472 case MD_RS_NONE:
2473 case MD_RS_OPTIMIZED:
2474 case MD_RS_COMPONENT:
2475 case MD_RS_SUBMIRROR:
2476 case MD_RS_ABR:
2477 break;
2478 default:
2479 un->un_rs_type = MD_RS_NONE;
2481 /* Allocate a resync message, if required */
2482 if (un->un_rs_msg == NULL) {
2483 un->un_rs_msg = (md_mn_msg_resync_t *)kmem_zalloc(
2484 sizeof (md_mn_msg_resync_t), KM_SLEEP);
2486 mn_resync = 1;
2489 /* Check to see if we've been requested to block/kill */
2490 if (resync_kill_pending(un, ui, MD_WRITER_HELD)) {
2491 goto bail_out;
2494 do {
2495 un->un_rs_dropped_lock = 0;
2497 * Always perform an optimized resync first as this will bring
2498 * the mirror into an available state in the shortest time.
2499 * If we are resuming an interrupted resync, other than an
2500 * optimized resync, we save the type and amount done so that
2501 * we can resume the appropriate resync after the optimized
2502 * resync has completed.
2504 if ((RS_TYPE(un->un_rs_type) != MD_RS_NONE) &&
2505 (RS_TYPE(un->un_rs_type) != MD_RS_OPTIMIZED)) {
2506 old_rs_type = un->un_rs_type;
2507 old_rs_done = un->un_rs_resync_done;
2508 old_rs_2_do = un->un_rs_resync_2_do;
2509 old_rs_startbl = un->un_resync_startbl;
2511 SET_RS_TYPE(un->un_rs_type, MD_RS_OPTIMIZED);
2513 * If we are continuing a resync that is not an
2514 * OPTIMIZED one, then we start from the beginning when
2515 * doing this optimized resync
2517 if (RS_TYPE(old_rs_type) != MD_RS_OPTIMIZED) {
2518 un->un_rs_resync_done = 0;
2519 un->un_rs_resync_2_do = 0;
2520 un->un_resync_startbl = 0;
2522 optimized_resync(un);
2523 /* Check to see if we've been requested to block/kill */
2524 if (resync_kill_pending(un, ui, MD_WRITER_HELD)) {
2525 goto bail_out;
2527 un = (mm_unit_t *)MD_UNIT(mnum);
2529 * If another node has moved the resync on, we must
2530 * restart the correct resync
2532 if (mn_resync &&
2533 (RS_TYPE(un->un_rs_type) != MD_RS_NONE)) {
2534 old_rs_type = un->un_rs_type;
2535 old_rs_done = un->un_rs_resync_done;
2536 old_rs_2_do = un->un_rs_resync_2_do;
2537 old_rs_startbl = un->un_resync_startbl;
2541 * Restore previous resync progress or move onto a
2542 * component resync.
2544 if (RS_TYPE(old_rs_type) != MD_RS_NONE) {
2545 un->un_rs_type = old_rs_type;
2546 un->un_rs_resync_done = old_rs_done;
2547 un->un_rs_resync_2_do = old_rs_2_do;
2548 un->un_resync_startbl = old_rs_startbl;
2549 } else {
2550 un->un_rs_type = MD_RS_COMPONENT;
2551 un->un_rs_resync_done = 0;
2552 un->un_rs_resync_2_do = 0;
2553 un->un_resync_startbl = 0;
2556 if (RS_TYPE(un->un_rs_type) == MD_RS_COMPONENT) {
2557 component_resync(un);
2558 /* Check to see if we've been requested to block/kill */
2559 if (resync_kill_pending(un, ui, MD_WRITER_HELD)) {
2560 goto bail_out;
2562 un = (mm_unit_t *)MD_UNIT(mnum);
2564 * If we have moved on from a component resync, another
2565 * node must have completed it and started a submirror
2566 * resync, so leave the resync state alone. For non-multi-node
2567 * sets we move on to the submirror resync.
2569 if (mn_resync) {
2570 if (RS_TYPE(un->un_rs_type) == MD_RS_NONE) {
2571 un->un_rs_type = MD_RS_SUBMIRROR;
2572 un->un_rs_resync_done =
2573 un->un_rs_resync_2_do = 0;
2574 un->un_resync_startbl = 0;
2576 } else {
2577 un->un_rs_type = MD_RS_SUBMIRROR;
2578 un->un_rs_resync_done = 0;
2579 un->un_rs_resync_2_do = 0;
2580 un->un_resync_startbl = 0;
2583 if (RS_TYPE(un->un_rs_type) == MD_RS_SUBMIRROR) {
2584 submirror_resync(un);
2585 /* Check to see if we've been requested to block/kill */
2586 if (resync_kill_pending(un, ui, MD_WRITER_HELD)) {
2587 goto bail_out;
2589 un = (mm_unit_t *)MD_UNIT(mnum);
2591 * If we have moved on from a submirror resync, another
2592 * node must have completed it and started a different
2593 * resync, so leave the resync state alone
2595 if (mn_resync) {
2596 if (RS_TYPE(un->un_rs_type) == MD_RS_NONE) {
2597 un->un_rs_resync_done =
2598 un->un_rs_resync_2_do = 0;
2599 un->un_resync_startbl = 0;
2601 } else {
2602 /* If non-MN mirror, reinitialize state */
2603 un->un_rs_type = MD_RS_NONE;
2604 un->un_rs_resync_done = 0;
2605 un->un_rs_resync_2_do = 0;
2606 un->un_resync_startbl = 0;
2609 } while (un->un_rs_dropped_lock);
2610 mutex_enter(&un->un_rs_thread_mx);
2611 un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
2612 mutex_exit(&un->un_rs_thread_mx);
2614 resync_finish = 1;
2615 bail_out:
2616 #ifdef DEBUG
2617 if (mirror_debug_flag)
2618 printf("Resync stopped (mnum = %x), resync_finish = %d\n",
2619 mnum, resync_finish);
2620 #endif
2621 kmem_free(un->un_rs_buffer, dbtob(un->un_rs_copysize));
2623 mutex_enter(&un->un_rs_progress_mx);
2624 un->un_rs_progress_flags |= MD_RI_KILL;
2625 cv_signal(&un->un_rs_progress_cv);
2626 mutex_exit(&un->un_rs_progress_mx);
2629 * For MN Set, send a RESYNC_FINISH if this node completed the resync.
2630 * There is no need to grow the unit here; it will be done in the
2631 * handler for the RESYNC_FINISH message together with resetting
2632 * MD_UN_RESYNC_ACTIVE.
2634 if (mn_resync) {
2635 if (resync_finish) {
2637 * Normal resync completion. Issue a RESYNC_FINISH
2638 * message if we're part of a multi-node set.
2640 md_mn_kresult_t *kres;
2641 md_mn_msg_resync_t *rmsg;
2642 int rval;
2644 rmsg = (md_mn_msg_resync_t *)un->un_rs_msg;
2645 md_unit_writerexit(ui);
2647 rmsg->msg_resync_mnum = mnum;
2648 rmsg->msg_resync_type = 0;
2649 rmsg->msg_resync_done = 0;
2650 rmsg->msg_resync_2_do = 0;
2651 rmsg->msg_originator = md_mn_mynode_id;
2653 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
2655 mutex_enter(&un->un_rs_cpr_mx);
2656 CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo);
2658 rval = mdmn_ksend_message(setno,
2659 MD_MN_MSG_RESYNC_FINISH, MD_MSGF_NO_LOG, 0,
2660 (char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
2662 CALLB_CPR_SAFE_END(&un->un_rs_cprinfo,
2663 &un->un_rs_cpr_mx);
2664 mutex_exit(&un->un_rs_cpr_mx);
2666 if (!MDMN_KSEND_MSG_OK(rval, kres)) {
2667 mdmn_ksend_show_error(rval, kres,
2668 "RESYNC_FINISH");
2669 /* If we're shutting down, pause things here. */
2670 if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
2671 while (!md_mn_is_commd_present()) {
2672 delay(md_hz);
2675 cmn_err(CE_PANIC,
2676 "ksend_message failure: RESYNC_FINISH");
2678 kmem_free(kres, sizeof (md_mn_kresult_t));
2679 (void) md_unit_writerlock(ui);
2682 * If the resync has been cancelled, clear flags, reset owner
2683 * for ABR mirror and release the resync region parent
2684 * structure.
2686 if (un->c.un_status & MD_UN_RESYNC_CANCEL) {
2687 md_mps_t *ps;
2689 if (ui->ui_tstate & MD_ABR_CAP) {
2690 /* Resync finished, if ABR set owner to NULL */
2691 mutex_enter(&un->un_owner_mx);
2692 un->un_mirror_owner = 0;
2693 mutex_exit(&un->un_owner_mx);
2696 un->c.un_status &= ~(MD_UN_RESYNC_CANCEL |
2697 MD_UN_RESYNC_ACTIVE);
2698 ps = un->un_rs_prev_overlap;
2699 if (ps != NULL) {
2700 /* Remove previous overlap resync region */
2701 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
2702 mirror_overlap_tree_remove(ps);
2704 * Release the overlap range reference
2706 un->un_rs_prev_overlap = NULL;
2707 kmem_cache_free(mirror_parent_cache,
2708 ps);
2713 * Release resync message buffer. This will be reallocated on
2714 * the next invocation of the resync_unit thread.
2716 if (un->un_rs_msg) {
2717 kmem_free(un->un_rs_msg, sizeof (md_mn_msg_resync_t));
2718 un->un_rs_msg = NULL;
2720 } else {
2721 /* For non-MN sets deal with any pending grows */
2722 un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
2723 if (un->c.un_status & MD_UN_GROW_PENDING) {
2724 if ((mirror_grow_unit(un, &mde) != 0) ||
2725 (! mdismderror(&mde, MDE_GROW_DELAYED))) {
2726 un->c.un_status &= ~MD_UN_GROW_PENDING;
2731 reset_comp_flags(un);
2732 un->un_resync_completed = 0;
2733 mirror_commit(un, NO_SUBMIRRORS, 0);
2734 md_unit_writerexit(ui);
2737 * Stop the resync progress thread.
2739 if (un->un_rs_resync_to_id != 0) {
2740 (void) untimeout(un->un_rs_resync_to_id);
2741 un->un_rs_resync_to_id = 0;
2745 * Calling mirror_internal_close() makes further reference to un / ui
2746 * dangerous. If we are the only consumer of the mirror it is possible
2747 * for a metaclear to be processed after completion of the m_i_c()
2748 * routine. As we need to handle the case where another resync has been
2749 * scheduled for the mirror, we raise the open count on the device
2750 * which protects against the close / metaclear / lock => panic scenario
2752 (void) md_unit_incopen(MD_SID(un), FREAD|FWRITE, OTYP_LYR);
2753 (void) mirror_internal_close(MD_SID(un), OTYP_LYR, 0, (IOLOCK *)NULL);
2756 * decrement the mirror resync count
2758 mutex_enter(&md_cpr_resync.md_resync_mutex);
2759 md_cpr_resync.md_mirror_resync--;
2760 mutex_exit(&md_cpr_resync.md_resync_mutex);
2763 * Remove the thread reference as we're about to exit. This allows a
2764 * subsequent mirror_resync_unit() to start a new thread.
2765 * If RESYNC_ACTIVE is set, mirror_resync_unit() must have been
2766 * called to start a new resync, so reopen the mirror and go back to
2767 * the start.
2769 (void) md_unit_writerlock(ui);
2770 mutex_enter(&un->un_rs_thread_mx);
2771 un->un_rs_thread_flags &= ~(MD_RI_KILL|MD_RI_SHUTDOWN);
2772 mutex_exit(&un->un_rs_thread_mx);
2773 if (un->c.un_status & MD_UN_RESYNC_ACTIVE) {
2774 md_unit_writerexit(ui);
2775 if (mirror_internal_open(MD_SID(un), (FREAD|FWRITE),
2776 OTYP_LYR, 0, (IOLOCK *)NULL) == 0) {
2777 /* Release the reference grabbed above */
2778 (void) mirror_internal_close(MD_SID(un), OTYP_LYR, 0,
2779 (IOLOCK *)NULL);
2780 goto resync_restart;
2782 (void) md_unit_writerlock(ui);
2783 cmn_err(CE_NOTE,
2784 "Could not open metadevice (%x) for resync\n",
2785 MD_SID(un));
2787 un->un_rs_thread = NULL;
2788 md_unit_writerexit(ui);
2791 * Check for hotspares once we've cleared the resync thread reference.
2792 * If there are any errored units a poke_hotspares() will result in
2793 * a call to mirror_resync_unit() which we need to allow to start.
2795 (void) poke_hotspares();
2798 * Remove this thread from the CPR callback table.
2800 if (mn_resync) {
2801 mutex_enter(&un->un_rs_cpr_mx);
2802 CALLB_CPR_EXIT(&un->un_rs_cprinfo);
2806 * Remove the extra reference to the unit we generated above. After
2807 * this call it is *unsafe* to reference either ui or un as they may
2808 * no longer be allocated.
2810 (void) mirror_internal_close(MD_SID(un), OTYP_LYR, 0, (IOLOCK *)NULL);
2812 thread_exit();
2816 * mirror_resync_unit:
2817 * ------------------
2818 * Start a resync for the given mirror metadevice. Save the resync thread ID in
2819 * un->un_rs_thread for later manipulation.
2821 * Returns:
2822 * 0 Success
2823 * !=0 Error
2825 /*ARGSUSED*/
2827 mirror_resync_unit(
2828 minor_t mnum,
2829 md_resync_ioctl_t *ri,
2830 md_error_t *ep,
2831 IOLOCK *lockp
2834 mdi_unit_t *ui;
2835 mm_unit_t *un;
2836 set_t setno = MD_MIN2SET(mnum);
2838 ui = MDI_UNIT(mnum);
2840 if (md_get_setstatus(setno) & MD_SET_STALE)
2841 return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));
2843 if (mirror_internal_open(mnum, (FREAD|FWRITE), OTYP_LYR, 0, lockp)) {
2844 return (mdmderror(ep, MDE_MIRROR_OPEN_FAILURE, mnum));
2846 if (lockp) {
2847 un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
2848 } else {
2849 un = (mm_unit_t *)md_unit_writerlock(ui);
2853 * Check to see if we're attempting to start a resync while one is
2854 * already running.
2856 if (un->c.un_status & MD_UN_RESYNC_ACTIVE ||
2857 un->un_rs_thread != NULL) {
2859 * Ensure RESYNC_ACTIVE is set; it may not be if the resync thread
2860 * is in the process of terminating. Setting the flag will
2861 * cause the resync thread to return to the beginning.
2863 un->c.un_status |= MD_UN_RESYNC_ACTIVE;
2864 if (lockp) {
2865 md_ioctl_writerexit(lockp);
2866 } else {
2867 md_unit_writerexit(ui);
2869 (void) mirror_internal_close(mnum, OTYP_LYR, 0, lockp);
2870 return (0);
2872 un->c.un_status |= MD_UN_RESYNC_ACTIVE;
2873 un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
2874 if ((ri) && (ri->ri_copysize > 0) &&
2875 (ri->ri_copysize <= md_max_xfer_bufsz))
2876 un->un_rs_copysize = ri->ri_copysize;
2877 else
2878 un->un_rs_copysize = 0;
2880 /* Start the resync progress thread off */
2881 un->un_rs_progress_flags = 0;
2882 (void) thread_create(NULL, 0, resync_progress_thread,
2883 (caddr_t)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);
2886 * We have to store the thread ID in the unit structure, so do not
2887 * drop the writerlock until the thread is active. This means resync_unit
2888 * may spin on its first md_unit_readerlock(), but deadlock won't occur.
2890 mutex_enter(&un->un_rs_thread_mx);
2891 un->un_rs_thread_flags &= ~(MD_RI_KILL|MD_RI_SHUTDOWN);
2892 mutex_exit(&un->un_rs_thread_mx);
2893 un->un_rs_thread = thread_create(NULL, 0, resync_unit,
2894 (caddr_t)(uintptr_t)mnum, 0, &p0, TS_RUN, 60);
2895 if (un->un_rs_thread == (kthread_id_t)NULL) {
2896 un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
2897 if (lockp) {
2898 md_ioctl_writerexit(lockp);
2899 } else {
2900 md_unit_writerexit(ui);
2902 (void) mirror_internal_close(mnum, OTYP_LYR, 0, lockp);
2903 return (mdmderror(ep, MDE_MIRROR_THREAD_FAILURE, mnum));
2904 } else {
2905 if (lockp) {
2906 md_ioctl_writerexit(lockp);
2907 } else {
2908 md_unit_writerexit(ui);
2912 return (0);
2916 * mirror_ioctl_resync:
2917 * -------------------
2918 * Called as a result of an MD_IOCSETSYNC ioctl. Either start, block, unblock
2919 * or kill the resync thread associated with the specified unit.
2920 * Can return with locks held since mdioctl will free any locks
2921 * that are marked in lock->l_flags.
2923 * Returns:
2924 * 0 Success
2925 * !=0 Error Code
2928 mirror_ioctl_resync(
2929 md_resync_ioctl_t *ri,
2930 IOLOCK *lock
2933 minor_t mnum = ri->ri_mnum;
2934 mm_unit_t *un;
2935 uint_t bits;
2936 mm_submirror_t *sm;
2937 mm_submirror_ic_t *smic;
2938 int smi;
2939 kt_did_t tid;
2940 set_t setno = MD_MIN2SET(mnum);
2942 mdclrerror(&ri->mde);
2944 if ((setno >= md_nsets) ||
2945 (MD_MIN2UNIT(mnum) >= md_nunits)) {
2946 return (mdmderror(&ri->mde, MDE_INVAL_UNIT, mnum));
2949 /* RD_LOCK flag grabs the md_ioctl_readerlock */
2950 un = mirror_getun(mnum, &ri->mde, RD_LOCK, lock);
2952 if (un == NULL) {
2953 return (mdmderror(&ri->mde, MDE_UNIT_NOT_SETUP, mnum));
2955 if (un->c.un_type != MD_METAMIRROR) {
2956 return (mdmderror(&ri->mde, MDE_NOT_MM, mnum));
2958 if (un->un_nsm < 2) {
2959 return (0);
2963 * Determine the action to take based on the ri_flags field:
2964 * MD_RI_BLOCK: Block current resync thread
2965 * MD_RI_UNBLOCK: Unblock resync thread
2966 * MD_RI_KILL: Abort resync thread
2967 * MD_RI_RESYNC_FORCE_MNSTART: Directly start resync thread
2968 * without using rpc.mdcommd messages.
2969 * any other: Start resync thread
2971 switch (ri->ri_flags & (MD_RI_BLOCK|MD_RI_UNBLOCK|MD_RI_KILL)) {
2973 case MD_RI_BLOCK:
2974 /* Halt resync thread by setting flag in un_rs_flags */
2975 if (!(un->c.un_status & MD_UN_RESYNC_ACTIVE)) {
2976 return (0);
2978 mutex_enter(&un->un_rs_thread_mx);
2979 un->un_rs_thread_flags |= MD_RI_BLOCK;
2980 mutex_exit(&un->un_rs_thread_mx);
2981 return (0);
2983 case MD_RI_UNBLOCK:
2985 * Restart resync thread by clearing flag in un_rs_flags and
2986 * cv_signal'ing the blocked thread.
2988 if (!(un->c.un_status & MD_UN_RESYNC_ACTIVE)) {
2989 return (0);
2991 mutex_enter(&un->un_rs_thread_mx);
2992 un->un_rs_thread_flags &= ~MD_RI_BLOCK;
2993 cv_signal(&un->un_rs_thread_cv);
2994 mutex_exit(&un->un_rs_thread_mx);
2995 return (0);
2997 case MD_RI_KILL:
2998 /* Abort resync thread. */
2999 if (!(un->c.un_status & MD_UN_RESYNC_ACTIVE)) {
3000 return (0);
3002 mutex_enter(&un->un_rs_thread_mx);
3003 tid = un->un_rs_thread ? (un->un_rs_thread)->t_did : 0;
3004 un->un_rs_thread_flags &= ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
3005 un->un_rs_thread_flags |= MD_RI_KILL;
3006 cv_signal(&un->un_rs_thread_cv);
3007 mutex_exit(&un->un_rs_thread_mx);
3008 if (tid != 0) {
3009 if (!(ri->ri_flags & MD_RI_NO_WAIT)) {
3010 md_ioctl_readerexit(lock);
3011 thread_join(tid);
3012 un->un_rs_thread_flags &= ~MD_RI_KILL;
3013 un->un_rs_thread = NULL;
3014 cmn_err(CE_WARN, "md: %s: Resync cancelled\n",
3015 md_shortname(MD_SID(un)));
3018 return (0);
3021 md_ioctl_readerexit(lock);
3023 bits = 0;
3024 for (smi = 0; smi < NMIRROR; smi++) {
3025 sm = &un->un_sm[smi];
3026 smic = &un->un_smic[smi];
3027 if (!SMS_IS(sm, SMS_ATTACHED))
3028 continue;
3029 mirror_set_sm_state(sm, smic, SMS_ATTACHED_RESYNC, 1);
3030 bits |= SMI2BIT(smi);
3032 if (bits != 0)
3033 mirror_commit(un, bits, 0);
3036 * If we are resyncing a mirror in a MN set and the rpc.mdcommd
3037 * can be used, we do not start the resync at this point.
3038 * Instead, the metasync command that issued the ioctl
3039 * will send a RESYNC_STARTING message to start the resync thread. The
3040 * reason we do it this way is to ensure that the metasync ioctl is
3041 * executed on all nodes before the resync thread is started.
3043 * If a MN set and the MD_RI_RESYNC_FORCE_MNSTART flag is set, then
3044 * don't use rpc.mdcommd, but just start the resync thread. This
3045 * flag is set on a node when it is being added to a diskset
3046 * so that the resync threads are started on the newly added node.
3048 if ((!(MD_MNSET_SETNO(setno))) ||
3049 (ri->ri_flags & MD_RI_RESYNC_FORCE_MNSTART)) {
3050 return (mirror_resync_unit(mnum, ri, &ri->mde, lock));
3051 } else {
3052 return (0);
3057 mirror_mark_resync_region_non_owner(struct mm_unit *un,
3058 diskaddr_t startblk, diskaddr_t endblk, md_mn_nodeid_t source_node)
3060 int no_change;
3061 size_t start_rr;
3062 size_t current_rr;
3063 size_t end_rr;
3064 md_mn_msg_rr_dirty_t *rr;
3065 md_mn_kresult_t *kres;
3066 set_t setno = MD_UN2SET(un);
3067 int rval;
3068 md_mn_nodeid_t node_idx = source_node - 1;
3069 mdi_unit_t *ui = MDI_UNIT(MD_SID(un));
3070 md_mn_nodeid_t owner_node;
3071 minor_t mnum = MD_SID(un);
3073 if (un->un_nsm < 2)
3074 return (0);
3077 * Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
3078 * not, allocate it and then fill the [start..end] entries.
3079 * Update un_pernode_dirty_sum if we've gone 0->1.
3080 * Update un_dirty_bm if the corresponding entries are clear.
3082 rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_WRITER);
3083 if (un->un_pernode_dirty_bm[node_idx] == NULL) {
3084 un->un_pernode_dirty_bm[node_idx] =
3085 (uchar_t *)kmem_zalloc(
3086 (uint_t)howmany(un->un_rrd_num, NBBY), KM_SLEEP);
3088 rw_exit(&un->un_pernode_dirty_mx[node_idx]);
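/*
 * Convert the block range into resync region indices; each region
 * nominally covers un_rrd_blksize blocks and maps to one bit in the
 * dirty bitmaps.
 */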
3090 BLK_TO_RR(end_rr, endblk, un);
3091 BLK_TO_RR(start_rr, startblk, un);
3093 no_change = 1;
3095 mutex_enter(&un->un_resync_mx);
3096 rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_READER);
3097 for (current_rr = start_rr; current_rr <= end_rr; current_rr++) {
3098 un->un_outstanding_writes[current_rr]++;
3099 if (!IS_PERNODE_DIRTY(source_node, current_rr, un)) {
3100 un->un_pernode_dirty_sum[current_rr]++;
3101 SET_PERNODE_DIRTY(source_node, current_rr, un);
3103 CLR_GOING_CLEAN(current_rr, un);
3104 if (!IS_REGION_DIRTY(current_rr, un)) {
3105 no_change = 0;
3106 SET_REGION_DIRTY(current_rr, un);
3107 SET_GOING_DIRTY(current_rr, un);
3108 } else if (IS_GOING_DIRTY(current_rr, un))
3109 no_change = 0;
3111 rw_exit(&un->un_pernode_dirty_mx[node_idx]);
3112 mutex_exit(&un->un_resync_mx);
3114 if (no_change) {
3115 return (0);
3119 * If we have dirty regions to commit, send a
3120 * message to the owning node so that the
3121 * in-core bitmap gets updated appropriately.
3122 * TODO: make this a kmem_cache pool to improve
3123 * alloc/free performance ???
3125 kres = (md_mn_kresult_t *)kmem_zalloc(sizeof (md_mn_kresult_t),
3126 KM_SLEEP);
3127 rr = (md_mn_msg_rr_dirty_t *)kmem_alloc(sizeof (md_mn_msg_rr_dirty_t),
3128 KM_SLEEP);
3130 resend_mmrr:
3131 owner_node = un->un_mirror_owner;
3133 rr->rr_mnum = mnum;
3134 rr->rr_nodeid = md_mn_mynode_id;
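/*
 * Pack the region range into rr_range: the start region index goes
 * in the upper 16 bits, the end region index in the lower 16 bits.
 */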
3135 rr->rr_range = (ushort_t)start_rr << 16;
3136 rr->rr_range |= (ushort_t)end_rr & 0xFFFF;
3138 /* release readerlock before sending message */
3139 md_unit_readerexit(ui);
3141 rval = mdmn_ksend_message(setno, MD_MN_MSG_RR_DIRTY,
3142 MD_MSGF_NO_LOG|MD_MSGF_BLK_SIGNAL|MD_MSGF_DIRECTED,
3143 un->un_mirror_owner, (char *)rr,
3144 sizeof (md_mn_msg_rr_dirty_t), kres);
3146 /* reacquire readerlock on message completion */
3147 (void) md_unit_readerlock(ui);
3149 /* if the message send failed, note it, and pass an error back up */
3150 if (!MDMN_KSEND_MSG_OK(rval, kres)) {
3151 /* if commd is gone, no point in printing a message */
3152 if (md_mn_is_commd_present())
3153 mdmn_ksend_show_error(rval, kres, "RR_DIRTY");
3154 kmem_free(kres, sizeof (md_mn_kresult_t));
3155 kmem_free(rr, sizeof (md_mn_msg_rr_dirty_t));
3156 return (1);
3160 * if the owner changed while we were sending the message, and it's
3161 * not us, the new mirror owner won't yet have done the right thing
3162 * with our data. Let him know. If we became the owner, we'll
3163 * deal with that differently below. Note that receiving a message
3164 * about another node twice won't hurt anything.
3166 if (un->un_mirror_owner != owner_node && !MD_MN_MIRROR_OWNER(un))
3167 goto resend_mmrr;
3169 kmem_free(kres, sizeof (md_mn_kresult_t));
3170 kmem_free(rr, sizeof (md_mn_msg_rr_dirty_t));
3172 mutex_enter(&un->un_resync_mx);
3175 * If we became the owner while we were sending the message,
3176 * we have dirty bits in the un_pernode_bm that aren't yet reflected
3177 * in the un_dirty_bm, as it was re-read from disk, and our bits
3178 * are also not reflected in the on-disk DRL. Fix that now.
3180 if (MD_MN_MIRROR_OWNER(un)) {
3181 rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_WRITER);
3182 mirror_copy_rr(howmany(un->un_rrd_num, NBBY),
3183 un->un_pernode_dirty_bm[node_idx], un->un_dirty_bm);
3184 rw_exit(&un->un_pernode_dirty_mx[node_idx]);
3186 un->un_resync_flg |= MM_RF_COMMITING | MM_RF_GATECLOSED;
3188 mutex_exit(&un->un_resync_mx);
3189 mddb_commitrec_wrapper(un->un_rr_dirty_recid);
3190 mutex_enter(&un->un_resync_mx);
3192 un->un_resync_flg &= ~(MM_RF_COMMITING | MM_RF_GATECLOSED);
3193 cv_broadcast(&un->un_resync_cv);
3196 for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
3197 CLR_GOING_DIRTY(current_rr, un);
3199 mutex_exit(&un->un_resync_mx);
3201 return (0);
3205 mirror_mark_resync_region_owner(struct mm_unit *un,
3206 diskaddr_t startblk, diskaddr_t endblk, md_mn_nodeid_t source_node)
3208 int no_change;
3209 size_t start_rr;
3210 size_t current_rr;
3211 size_t end_rr;
3212 int mnset = MD_MNSET_SETNO(MD_UN2SET(un));
3213 md_mn_nodeid_t node_idx = source_node - 1;
3215 if (un->un_nsm < 2)
3216 return (0);
3219 * Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
3220 * not, allocate it and then fill the [start..end] entries.
3221 * Update un_pernode_dirty_sum if we've gone 0->1.
3222 * Update un_dirty_bm if the corresponding entries are clear.
3224 if (mnset) {
3225 rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_WRITER);
3226 if (un->un_pernode_dirty_bm[node_idx] == NULL) {
3227 un->un_pernode_dirty_bm[node_idx] =
3228 (uchar_t *)kmem_zalloc(
3229 (uint_t)howmany(un->un_rrd_num, NBBY), KM_SLEEP);
3231 rw_exit(&un->un_pernode_dirty_mx[node_idx]);
3234 mutex_enter(&un->un_resync_mx);
3236 if (mnset)
3237 rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_READER);
3239 no_change = 1;
3240 BLK_TO_RR(end_rr, endblk, un);
3241 BLK_TO_RR(start_rr, startblk, un);
3242 for (current_rr = start_rr; current_rr <= end_rr; current_rr++) {
3243 if (!mnset || source_node == md_mn_mynode_id)
3244 un->un_outstanding_writes[current_rr]++;
3245 if (mnset) {
3246 if (!IS_PERNODE_DIRTY(source_node, current_rr, un))
3247 un->un_pernode_dirty_sum[current_rr]++;
3248 SET_PERNODE_DIRTY(source_node, current_rr, un);
3250 CLR_GOING_CLEAN(current_rr, un);
3251 if (!IS_REGION_DIRTY(current_rr, un))
3252 no_change = 0;
3253 if (IS_GOING_DIRTY(current_rr, un))
3254 no_change = 0;
3257 if (mnset)
3258 rw_exit(&un->un_pernode_dirty_mx[node_idx]);
3260 if (no_change) {
3261 mutex_exit(&un->un_resync_mx);
3262 return (0);
3264 un->un_waiting_to_mark++;
3265 while (un->un_resync_flg & MM_RF_GATECLOSED) {
3266 if (panicstr)
3267 return (1);
3268 cv_wait(&un->un_resync_cv, &un->un_resync_mx);
3270 un->un_waiting_to_mark--;
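/*
 * While we waited for the commit gate to open, another marker may
 * have dirtied and committed these regions, so re-check which ones
 * still need the clean->dirty transition.
 */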
3272 no_change = 1;
3273 for (current_rr = start_rr; current_rr <= end_rr; current_rr++) {
3274 if (!IS_REGION_DIRTY(current_rr, un)) {
3275 SET_REGION_DIRTY(current_rr, un);
3276 SET_GOING_DIRTY(current_rr, un);
3277 no_change = 0;
3278 } else {
3279 if (IS_GOING_DIRTY(current_rr, un))
3280 no_change = 0;
3283 if (no_change) {
3284 if (un->un_waiting_to_mark == 0 || un->un_waiting_to_clear != 0)
3285 cv_broadcast(&un->un_resync_cv);
3286 mutex_exit(&un->un_resync_mx);
3287 return (0);
3290 un->un_resync_flg |= MM_RF_COMMIT_NEEDED;
3291 un->un_waiting_to_commit++;
3292 while (un->un_waiting_to_mark != 0 &&
3293 !(un->un_resync_flg & MM_RF_GATECLOSED)) {
3294 if (panicstr)
3295 return (1);
3296 cv_wait(&un->un_resync_cv, &un->un_resync_mx);
3299 if (un->un_resync_flg & MM_RF_COMMIT_NEEDED) {
3300 un->un_resync_flg |= MM_RF_COMMITING | MM_RF_GATECLOSED;
3301 un->un_resync_flg &= ~MM_RF_COMMIT_NEEDED;
3303 mutex_exit(&un->un_resync_mx);
3304 mddb_commitrec_wrapper(un->un_rr_dirty_recid);
3305 mutex_enter(&un->un_resync_mx);
3307 un->un_resync_flg &= ~MM_RF_COMMITING;
3308 cv_broadcast(&un->un_resync_cv);
3310 while (un->un_resync_flg & MM_RF_COMMITING) {
3311 if (panicstr)
3312 return (1);
3313 cv_wait(&un->un_resync_cv, &un->un_resync_mx);
3316 for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
3317 CLR_GOING_DIRTY(current_rr, un);
3319 if (--un->un_waiting_to_commit == 0) {
3320 un->un_resync_flg &= ~MM_RF_GATECLOSED;
3321 cv_broadcast(&un->un_resync_cv);
3323 mutex_exit(&un->un_resync_mx);
3325 return (0);
3329 mirror_mark_resync_region(struct mm_unit *un,
3330 diskaddr_t startblk, diskaddr_t endblk, md_mn_nodeid_t source_node)
3332 int mnset = MD_MNSET_SETNO(MD_UN2SET(un));
3334 if (mnset && !MD_MN_MIRROR_OWNER(un)) {
3335 return (mirror_mark_resync_region_non_owner(un, startblk,
3336 endblk, source_node));
3337 } else {
3338 return (mirror_mark_resync_region_owner(un, startblk, endblk,
3339 source_node));
3344 mirror_resize_resync_regions(mm_unit_t *un, diskaddr_t new_tb)
3346 short *owp;
3347 optim_resync_t *orp;
3348 uint_t rr_mult = 1;
3349 uint_t old_nregions, new_nregions;
3350 int old_bm_size, new_bm_size;
3351 size_t size;
3352 mddb_recid_t recid, old_recid;
3353 uchar_t *old_dirty_bm;
3354 int i, j;
3355 mddb_type_t typ1;
3356 set_t setno = MD_UN2SET(un);
3357 uchar_t *old_pns;
3359 old_nregions = un->un_rrd_num;
3360 new_nregions = (uint_t)((new_tb/un->un_rrd_blksize) + 1);
3362 while (new_nregions > MD_MAX_NUM_RR) {
3363 new_nregions >>= 1;
3364 rr_mult <<= 1;
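/*
 * Each halving of the region count doubles the region size, so an
 * old region index i maps to new index i / rr_mult when the bitmaps
 * are translated below.
 */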
3367 new_bm_size = howmany(new_nregions, NBBY);
3368 old_bm_size = howmany(old_nregions, NBBY);
3370 size = new_bm_size + sizeof (*orp) - sizeof (orp->or_rr);
3372 typ1 = (mddb_type_t)md_getshared_key(setno,
3373 mirror_md_ops.md_driver.md_drivername);
3374 recid = mddb_createrec(size, typ1, RESYNC_REC,
3375 MD_CRO_OPTIMIZE|MD_CRO_32BIT, setno);
3376 if (recid < 0)
3377 return (-1);
3379 orp = (struct optim_resync *)mddb_getrecaddr(recid);
3380 ASSERT(orp != NULL);
3382 orp->or_magic = OR_MAGIC; /* Magic # */
3383 orp->or_blksize = un->un_rrd_blksize; /* Same block size */
3384 orp->or_num = new_nregions; /* New number of regions */
3386 old_dirty_bm = un->un_dirty_bm;
3387 un->un_dirty_bm = orp->or_rr;
3389 kmem_free((caddr_t)un->un_goingdirty_bm, old_bm_size);
3390 un->un_goingdirty_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
3392 kmem_free((caddr_t)un->un_goingclean_bm, old_bm_size);
3393 un->un_goingclean_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
3395 kmem_free((caddr_t)un->un_resync_bm, old_bm_size);
3396 un->un_resync_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
3398 owp = un->un_outstanding_writes;
3399 un->un_outstanding_writes = (short *)kmem_zalloc(
3400 new_nregions * sizeof (short), KM_SLEEP);
3402 old_pns = un->un_pernode_dirty_sum;
3403 if (old_pns)
3404 un->un_pernode_dirty_sum = (uchar_t *)kmem_zalloc(new_nregions,
3405 KM_SLEEP);
3408 * Now translate the old records into the new
3409 * records
3411 for (i = 0; i < old_nregions; i++) {
3413 * only bring forward the
3414 * outstanding write counters and the dirty bits and also
3415 * the pernode_summary counts
3417 if (!isset(old_dirty_bm, i))
3418 continue;
3420 setbit(un->un_dirty_bm, (i / rr_mult));
3421 un->un_outstanding_writes[(i / rr_mult)] += owp[i];
3422 if (old_pns)
3423 un->un_pernode_dirty_sum[(i / rr_mult)] += old_pns[i];
3425 kmem_free((caddr_t)owp, old_nregions * sizeof (short));
3426 if (old_pns)
3427 kmem_free((caddr_t)old_pns, old_nregions);
3430 * Copy all non-zero un_pernode_dirty_bm[] arrays to new versions
3432 for (j = 0; j < MD_MNMAXSIDES; j++) {
3433 rw_enter(&un->un_pernode_dirty_mx[j], RW_WRITER);
3434 old_dirty_bm = un->un_pernode_dirty_bm[j];
3435 if (old_dirty_bm) {
3436 un->un_pernode_dirty_bm[j] = (uchar_t *)kmem_zalloc(
3437 new_bm_size, KM_SLEEP);
3438 for (i = 0; i < old_nregions; i++) {
3439 if (!isset(old_dirty_bm, i))
3440 continue;
3442 setbit(un->un_pernode_dirty_bm[j],
3443 (i / rr_mult));
3445 kmem_free((caddr_t)old_dirty_bm, old_bm_size);
3447 rw_exit(&un->un_pernode_dirty_mx[j]);
3450 /* Save the old record id */
3451 old_recid = un->un_rr_dirty_recid;
3453 /* Update the mirror unit struct */
3454 un->un_rr_dirty_recid = recid;
3455 un->un_rrd_num = new_nregions;
3456 un->un_rrd_blksize = un->un_rrd_blksize * rr_mult;
3458 orp->or_blksize = un->un_rrd_blksize;
3461 * NOTE: The reason there are distinct calls to mddb_commitrec_wrapper
3462 * instead of using mddb_commitrecs_wrapper, is that you cannot
3463 * atomically commit optimized records.
3465 mddb_commitrec_wrapper(recid);
3466 mddb_commitrec_wrapper(un->c.un_record_id);
3467 mddb_deleterec_wrapper(old_recid);
3468 return (0);
3471 /* lockp can be NULL for !MN disksets */
3473 mirror_add_resync_regions(mm_unit_t *un, diskaddr_t new_tb)
3475 uchar_t *old;
3476 short *owp;
3477 optim_resync_t *orp;
3478 uint_t old_nregions, new_nregions;
3479 int old_bm_size, new_bm_size;
3480 size_t size;
3481 mddb_recid_t recid, old_recid;
3482 mddb_type_t typ1;
3483 set_t setno = MD_UN2SET(un);
3484 int i;
3486 old_nregions = un->un_rrd_num;
3487 new_nregions = (uint_t)((new_tb/un->un_rrd_blksize) + 1);
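/*
 * One dirty-region bit covers un_rrd_blksize blocks, so the grown
 * mirror needs (new_tb / un_rrd_blksize) + 1 regions; unlike the
 * resize path above, the region size itself is left unchanged here.
 */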
3489 new_bm_size = howmany(new_nregions, NBBY);
3490 old_bm_size = howmany(old_nregions, NBBY);
3492 size = new_bm_size + sizeof (*orp) - sizeof (orp->or_rr);
3494 typ1 = (mddb_type_t)md_getshared_key(setno,
3495 mirror_md_ops.md_driver.md_drivername);
3497 recid = mddb_createrec(size, typ1, RESYNC_REC,
3498 MD_CRO_OPTIMIZE|MD_CRO_32BIT, setno);
3499 if (recid < 0)
3500 return (-1);
3502 orp = (struct optim_resync *)mddb_getrecaddr(recid);
3503 ASSERT(orp != NULL);
3505 orp->or_magic = OR_MAGIC; /* Magic # */
3506 orp->or_blksize = un->un_rrd_blksize; /* Same block size */
3507 orp->or_num = new_nregions; /* New number of regions */
3509 /* Copy the old bm over the new bm */
3510 bcopy((caddr_t)un->un_dirty_bm, (caddr_t)orp->or_rr, old_bm_size);
3513 * Create new bigger incore arrays, copy, and free old ones:
3514 * un_goingdirty_bm
3515 * un_goingclean_bm
3516 * un_resync_bm
3517 * un_outstanding_writes
3518 * un_pernode_dirty_sum
3519 * un_pernode_dirty_bm[]
3521 old = un->un_goingdirty_bm;
3522 un->un_goingdirty_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
3523 bcopy((caddr_t)old, (caddr_t)un->un_goingdirty_bm, old_bm_size);
3524 kmem_free((caddr_t)old, old_bm_size);
3526 old = un->un_goingclean_bm;
3527 un->un_goingclean_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
3528 bcopy((caddr_t)old, (caddr_t)un->un_goingclean_bm, old_bm_size);
3529 kmem_free((caddr_t)old, old_bm_size);
3531 old = un->un_resync_bm;
3532 un->un_resync_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
3533 bcopy((caddr_t)old, (caddr_t)un->un_resync_bm, old_bm_size);
3534 kmem_free((caddr_t)old, old_bm_size);
3536 owp = un->un_outstanding_writes;
3537 un->un_outstanding_writes = (short *)kmem_zalloc(
3538 (uint_t)new_nregions * sizeof (short), KM_SLEEP);
3539 bcopy((caddr_t)owp, (caddr_t)un->un_outstanding_writes,
3540 old_nregions * sizeof (short));
3541 kmem_free((caddr_t)owp, (old_nregions * sizeof (short)));
3543 old = un->un_pernode_dirty_sum;
3544 if (old) {
3545 un->un_pernode_dirty_sum = (uchar_t *)kmem_zalloc(
3546 new_nregions, KM_SLEEP);
3547 bcopy((caddr_t)old, (caddr_t)un->un_pernode_dirty_sum,
3548 old_nregions);
3549 kmem_free((caddr_t)old, old_nregions);
3552 for (i = 0; i < MD_MNMAXSIDES; i++) {
3553 rw_enter(&un->un_pernode_dirty_mx[i], RW_WRITER);
3554 old = un->un_pernode_dirty_bm[i];
3555 if (old) {
3556 un->un_pernode_dirty_bm[i] = (uchar_t *)kmem_zalloc(
3557 new_bm_size, KM_SLEEP);
3558 bcopy((caddr_t)old, (caddr_t)un->un_pernode_dirty_bm[i],
3559 old_bm_size);
3560 kmem_free((caddr_t)old, old_bm_size);
3562 rw_exit(&un->un_pernode_dirty_mx[i]);
3565 /* Save the old record id */
3566 old_recid = un->un_rr_dirty_recid;
3568 /* Update the mirror unit struct */
3569 un->un_rr_dirty_recid = recid;
3570 un->un_rrd_num = new_nregions;
3571 un->un_dirty_bm = orp->or_rr;
3574 * NOTE: The reason there are distinct calls to mddb_commitrec_wrapper
3575 * instead of using mddb_commitrecs_wrapper, is that you cannot
3576 * atomically commit optimized records.
3578 mddb_commitrec_wrapper(recid);
3579 mddb_commitrec_wrapper(un->c.un_record_id);
3580 mddb_deleterec_wrapper(old_recid);
3581 return (0);
3585 * mirror_copy_rr:
3586 * --------------
3587 * Combine the dirty record bitmap with the in-core resync bitmap. This allows
3588 * us to carry a resync over an ownership change.
3590 void
3591 mirror_copy_rr(int sz, uchar_t *src, uchar_t *dest)
3593 int i;
3595 for (i = 0; i < sz; i++)
3596 *dest++ |= *src++;
3600 * mirror_set_dirty_rr:
3601 * -------------------
3602 * Set the pernode_dirty_bm[node] entries and un_dirty_bm[] if appropriate.
3603 * For the owning node (DRL/mirror owner) update the on-disk RR if needed.
3604 * Called on every clean->dirty transition for the originating writer node.
3605 * Note: only the non-owning nodes will initiate this message and it is only
3606 * the owning node that has to process it.
3609 mirror_set_dirty_rr(md_mn_rr_dirty_params_t *iocp)
3612 minor_t mnum = iocp->rr_mnum;
3613 mm_unit_t *un;
3614 int start = (int)iocp->rr_start;
3615 int end = (int)iocp->rr_end;
3616 set_t setno = MD_MIN2SET(mnum);
3617 md_mn_nodeid_t orignode = iocp->rr_nodeid; /* 1-based */
3618 diskaddr_t startblk, endblk;
3620 mdclrerror(&iocp->mde);
3622 if ((setno >= md_nsets) ||
3623 (MD_MIN2UNIT(mnum) >= md_nunits)) {
3624 return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum));
3627 /* Must have _NO_ ioctl lock set if we update the RR on-disk */
3628 un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
3630 if (un == NULL) {
3631 return (mdmderror(&iocp->mde, MDE_UNIT_NOT_SETUP, mnum));
3633 if (un->c.un_type != MD_METAMIRROR) {
3634 return (mdmderror(&iocp->mde, MDE_NOT_MM, mnum));
3636 if (orignode < 1 || orignode >= MD_MNMAXSIDES) {
3637 return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum));
3639 if (un->un_nsm < 2) {
3640 return (0);
3644 * Only process this message if we're the owner of the mirror.
3646 if (!MD_MN_MIRROR_OWNER(un)) {
3647 return (0);
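/*
 * Convert the region indices carried in the message back into block
 * addresses before handing them to the common region-marking code.
 */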
3650 RR_TO_BLK(startblk, start, un);
3651 RR_TO_BLK(endblk, end, un);
3652 return (mirror_mark_resync_region_owner(un, startblk, endblk,
3653 orignode));
3657 * mirror_clean_rr_bits:
3658 * --------------------
3659 * Clear the pernode_dirty_bm[node] entries which are passed in the bitmap
3660 * Once _all_ references are removed (pernode_dirty_count[x] == 0) this region
3661 * is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all
3662 * nodes. Callable from ioctl / interrupt / whatever context.
3663 * un_resync_mx is held on entry.
3665 static void
3666 mirror_clean_rr_bits(
3667 md_mn_rr_clean_params_t *iocp)
3669 minor_t mnum = iocp->rr_mnum;
3670 mm_unit_t *un;
3671 uint_t cleared_bits;
3672 md_mn_nodeid_t node = iocp->rr_nodeid - 1;
3673 md_mn_nodeid_t orignode = iocp->rr_nodeid;
3674 int i, start, end;
3676 un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
3678 cleared_bits = 0;
3679 start = MDMN_RR_CLEAN_PARAMS_START_BIT(iocp);
3680 end = start + MDMN_RR_CLEAN_PARAMS_DATA_BYTES(iocp) * NBBY;
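/*
 * The request carries a bitmap whose bit 0 corresponds to region
 * 'start'; walk every set bit and drop the originating node's
 * reference, clearing un_dirty_bm for regions whose reference
 * count reaches zero.
 */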
3681 rw_enter(&un->un_pernode_dirty_mx[node], RW_READER);
3682 for (i = start; i < end; i++) {
3683 if (isset(MDMN_RR_CLEAN_PARAMS_DATA(iocp), i - start)) {
3684 if (IS_PERNODE_DIRTY(orignode, i, un)) {
3685 un->un_pernode_dirty_sum[i]--;
3686 CLR_PERNODE_DIRTY(orignode, i, un);
3688 if (un->un_pernode_dirty_sum[i] == 0) {
3689 cleared_bits++;
3690 CLR_REGION_DIRTY(i, un);
3691 CLR_GOING_CLEAN(i, un);
3695 rw_exit(&un->un_pernode_dirty_mx[node]);
3696 if (cleared_bits) {
3698 * We can only be called iff we are the mirror owner; however,
3699 * as this is a (potentially) decoupled routine the ownership
3700 * may have moved from us by the time we get to execute the
3701 * bit clearing. Hence we still need to check for being the
3702 * owner before flushing the DRL to the replica.
3704 if (MD_MN_MIRROR_OWNER(un)) {
3705 mutex_exit(&un->un_resync_mx);
3706 mddb_commitrec_wrapper(un->un_rr_dirty_recid);
3707 mutex_enter(&un->un_resync_mx);
/*
 * mirror_drl_task:
 * ---------------
 * Service routine for clearing the DRL bits on a deferred MD_MN_RR_CLEAN call.
 * We need to obtain exclusive access to the un_resync_cv and then clear the
 * necessary bits.
 * On completion, we must also free the passed-in argument as it is allocated
 * at the end of the ioctl handler and won't be freed on completion.
 */
static void
mirror_drl_task(void *arg)
{
    md_mn_rr_clean_params_t *iocp = (md_mn_rr_clean_params_t *)arg;
    minor_t                 mnum = iocp->rr_mnum;
    mm_unit_t               *un;

    un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);

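    /*
     * Serialize with any mark/clear of the resync regions already in
     * flight: wait while MM_RF_STALL_CLEAN is set, then close the gate
     * ourselves (MM_RF_GATECLOSED) while the bits are cleared.
     */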
    mutex_enter(&un->un_rrp_inflight_mx);
    mutex_enter(&un->un_resync_mx);
    un->un_waiting_to_clear++;
    while (un->un_resync_flg & MM_RF_STALL_CLEAN)
        cv_wait(&un->un_resync_cv, &un->un_resync_mx);
    un->un_waiting_to_clear--;

    un->un_resync_flg |= MM_RF_GATECLOSED;
    mirror_clean_rr_bits(iocp);
    un->un_resync_flg &= ~MM_RF_GATECLOSED;
    if (un->un_waiting_to_mark != 0 || un->un_waiting_to_clear != 0) {
        cv_broadcast(&un->un_resync_cv);
    }
    mutex_exit(&un->un_resync_mx);
    mutex_exit(&un->un_rrp_inflight_mx);

    kmem_free((caddr_t)iocp, MDMN_RR_CLEAN_PARAMS_SIZE(iocp));
}

/*
 * mirror_set_clean_rr:
 * -------------------
 * Clear the pernode_dirty_bm[node] entries which are passed in the bitmap
 * Once _all_ references are removed (pernode_dirty_count[x] == 0) this region
 * is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all
 * nodes.
 *
 * Only the mirror-owner need process this message as it is the only RR updater.
 * Non-owner nodes issue this request, but as we have no point-to-point message
 * support we will receive the message on all nodes.
 */
int
mirror_set_clean_rr(md_mn_rr_clean_params_t *iocp)
{

    minor_t                 mnum = iocp->rr_mnum;
    mm_unit_t               *un;
    set_t                   setno = MD_MIN2SET(mnum);
    md_mn_nodeid_t          node = iocp->rr_nodeid - 1;
    int                     can_clear = 0;
    md_mn_rr_clean_params_t *newiocp;
    int                     rval = 0;

    mdclrerror(&iocp->mde);

    if ((setno >= md_nsets) ||
        (MD_MIN2UNIT(mnum) >= md_nunits)) {
        return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum));
    }

    /* Must have _NO_ ioctl lock set if we update the RR on-disk */
    un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);

    if (un == NULL) {
        return (mdmderror(&iocp->mde, MDE_UNIT_NOT_SETUP, mnum));
    }
    if (un->c.un_type != MD_METAMIRROR) {
        return (mdmderror(&iocp->mde, MDE_NOT_MM, mnum));
    }
    if (un->un_nsm < 2) {
        return (0);
    }

    /*
     * Check to see if we're the mirror owner. If not, there's nothing
     * for us to do.
     */
    if (!MD_MN_MIRROR_OWNER(un)) {
        return (0);
    }

    /*
     * Process the to-be-cleaned bitmap. We need to update the pernode_dirty
     * bits and pernode_dirty_sum[n], and if, and only if, the sum goes to 0
     * we can then mark the un_dirty_bm entry as GOINGCLEAN. Alternatively
     * we can just defer this cleaning until the next process_resync_regions
     * timeout.
     */
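    /* The originating node's per-node dirty bitmap is allocated lazily. */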
    rw_enter(&un->un_pernode_dirty_mx[node], RW_WRITER);
    if (un->un_pernode_dirty_bm[node] == NULL) {
        un->un_pernode_dirty_bm[node] = (uchar_t *)kmem_zalloc(
            un->un_rrd_num, KM_SLEEP);
    }
    rw_exit(&un->un_pernode_dirty_mx[node]);

    /*
     * See if we can simply clear the un_dirty_bm[] entries. If we're not
     * the issuing node _and_ we aren't in the process of marking/clearing
     * the RR bitmaps, we can simply update the bits as needed.
     * If we're the owning node and _not_ the issuing node, we should also
     * sync the RR if we clear any bits in it.
     */
    mutex_enter(&un->un_resync_mx);
    can_clear = (un->un_resync_flg & MM_RF_STALL_CLEAN) ? 0 : 1;
    if (can_clear) {
        un->un_resync_flg |= MM_RF_GATECLOSED;
        mirror_clean_rr_bits(iocp);
        un->un_resync_flg &= ~MM_RF_GATECLOSED;
        if (un->un_waiting_to_mark != 0 ||
            un->un_waiting_to_clear != 0) {
            cv_broadcast(&un->un_resync_cv);
        }
    }
    mutex_exit(&un->un_resync_mx);

    /*
     * If we couldn't clear the bits, due to DRL update from m_m_r_r / p_r_r
     * we must schedule a blocking call to update the DRL on this node.
     * As we're invoked from an ioctl we are going to have the original data
     * disappear (kmem_free) once we return. So, copy the data into a new
     * structure and let the taskq routine release it on completion.
     */
    if (!can_clear) {
        size_t sz = MDMN_RR_CLEAN_PARAMS_SIZE(iocp);

        newiocp = (md_mn_rr_clean_params_t *)kmem_alloc(sz, KM_SLEEP);

        bcopy(iocp, newiocp, sz);

        if (ddi_taskq_dispatch(un->un_drl_task, mirror_drl_task,
            newiocp, DDI_NOSLEEP) != DDI_SUCCESS) {
            kmem_free(newiocp, sz);
            rval = ENOMEM;      /* probably starvation */
        }
    }

    return (rval);