usr/src/uts/common/io/lvm/mirror/mirror_resync.c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
26 #include <sys/param.h>
27 #include <sys/systm.h>
28 #include <sys/conf.h>
29 #include <sys/file.h>
30 #include <sys/user.h>
31 #include <sys/uio.h>
32 #include <sys/t_lock.h>
33 #include <sys/buf.h>
34 #include <sys/dkio.h>
35 #include <sys/vtoc.h>
36 #include <sys/kmem.h>
37 #include <vm/page.h>
38 #include <sys/cmn_err.h>
39 #include <sys/sysmacros.h>
40 #include <sys/types.h>
41 #include <sys/mkdev.h>
42 #include <sys/stat.h>
43 #include <sys/open.h>
44 #include <sys/disp.h>
45 #include <sys/lvm/md_mirror.h>
46 #include <sys/modctl.h>
47 #include <sys/ddi.h>
48 #include <sys/sunddi.h>
49 #include <sys/debug.h>
50 #include <sys/callb.h>
52 #include <sys/sysevent/eventdefs.h>
53 #include <sys/sysevent/svm.h>
54 #include <sys/lvm/mdmn_commd.h>
56 extern int md_status;
57 extern kmutex_t md_status_mx;
58 extern kmutex_t md_mx;
60 extern unit_t md_nunits;
61 extern set_t md_nsets;
62 extern md_set_t md_set[];
63 extern major_t md_major;
65 extern md_ops_t mirror_md_ops;
66 extern kmem_cache_t *mirror_child_cache; /* mirror child memory pool */
67 extern mdq_anchor_t md_mto_daemon;
68 extern daemon_request_t mirror_timeout;
69 extern md_resync_t md_cpr_resync;
70 extern clock_t md_hz;
71 extern int md_mtioctl_cnt;
73 extern kmem_cache_t *mirror_parent_cache;
74 #ifdef DEBUG
75 extern int mirror_debug_flag;
76 #endif
79 * Tunable resync thread timeout. This is used as the time interval for updating
80 * the resync progress to the mddb. This allows restartable resyncs to be
81 * continued across a system reboot.
82 * Default is to update the resync progress every 5 minutes.
84 int md_mirror_resync_update_intvl = MD_DEF_MIRROR_RESYNC_INTVL;
87 * Settable mirror resync buffer size. Specified in 512 byte
88 * blocks. This is set to MD_DEF_RESYNC_BUF_SIZE by default.
90 int md_resync_bufsz = MD_DEF_RESYNC_BUF_SIZE;
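/*
 * A minimal sketch of how a site might override the two tunables above at
 * boot time, assuming the mirror driver is delivered as the "md_mirror"
 * module and the usual /etc/system "set module:variable" syntax applies
 * (values below are illustrative only; md_resync_bufsz is in 512-byte
 * blocks, so 256 would be a 128 KB resync buffer):
 *
 *	set md_mirror:md_mirror_resync_update_intvl = 10
 *	set md_mirror:md_resync_bufsz = 256
 */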
93 * Tunables for dirty region processing when
94 * closing down a mirror.
96 * Dirty region processing during close of a
97 * mirror is basically monitoring the state
98 * of the resync region bitmaps and the number
99 * of outstanding i/o's per submirror to
100 * determine that there are no more dirty
101 * regions left over.
103 * The approach taken is retry logic over
104 * md_mirror_rr_cleans iterations to monitor
105 * the progress.
107 * There are two methods of polling the progress
108 * on dirty bitmap processing: busy-waits and
109 * non-busy-waits.
111 * Busy-waits are used at the beginning to
112 * determine the final state as quickly as
113 * possible; md_mirror_rr_polls defines the
114 * number of busy-waits.
116 * If the busy-waits are exhausted and dirty
117 * regions are still left over, the retry logic
118 * switches over to non-busy-waits, thus giving
119 * relief to an obviously heavily loaded system.
120 * The timeout value is defined by the tunable
121 * md_mirror_rr_sleep_timo in seconds.
123 * The number of non-busy-waits is given by:
124 * md_mirror_rr_cleans - md_mirror_rr_polls.
126 * The values were found by testing on a
127 * 'typical' system and may require tuning
128 * to meet a specific customer's requirements.
131 int md_mirror_rr_cleans = 13;
132 int md_mirror_rr_polls = 3;
133 int md_mirror_rr_sleep_timo = 1;
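/*
 * With the defaults above, the close-time cleanup loop (see
 * mirror_process_unit_resync() below) does at most md_mirror_rr_polls (3)
 * busy-wait checks followed by roughly md_mirror_rr_cleans -
 * md_mirror_rr_polls (10) non-busy waits of md_mirror_rr_sleep_timo (1)
 * second each, so a close with lingering dirty regions gives up after about
 * ten seconds. A compressed sketch of that policy:
 */
#if 0	/* illustrative only; the real loop is in mirror_process_unit_resync() */
	int cleans = 0;

	while (process_resync_regions(un, NULL)) {	/* regions still dirty */
		if (++cleans >= md_mirror_rr_cleans)
			break;				/* give up */
		if (cleans > md_mirror_rr_polls)	/* switch to non-busy wait */
			delay(md_mirror_rr_sleep_timo * md_hz);
	}
#endif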
136 * The value is not #defined because it will be computed
137 * in the future.
139 int md_max_xfer_bufsz = 2048;
142 * mirror_generate_rr_bitmap:
143 * -------------------
144 * Generate a compressed bitmap md_mn_msg_rr_clean_t for the given clean
145 * bitmap associated with mirror 'un'
147 * Input:
148 * un - mirror unit to get bitmap data from
149 * *msgp - location to return newly allocated md_mn_msg_rr_clean_t
150 * *activep - location to return # of active i/os
152 * Returns:
153 * 1 => dirty bits cleared from un_dirty_bm and DRL flush required
154 * *msgp contains bitmap of to-be-cleared bits
155 * 0 => no bits cleared
156 * *msgp == NULL
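/*
 * A minimal sketch of the caller contract described above. The real callers
 * are process_resync_regions_owner()/_non_owner() below, which also hold
 * un_resync_mx and this node's un_pernode_dirty_mx (as reader) across the
 * call:
 */
#if 0	/* illustrative only */
	md_mn_msg_rr_clean_t	*rmsg = NULL;
	int			active = 0;

	if (mirror_generate_rr_bitmap(un, &rmsg, &active)) {
		/* rmsg holds the to-be-cleared bits; transmit/commit it ... */
		kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg));
	}
	/* on a 0 return rmsg is NULL; 'active' counts the still-busy regions */
#endif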
158 static int
159 mirror_generate_rr_bitmap(mm_unit_t *un, md_mn_msg_rr_clean_t **msgp,
160 int *activep)
162 unsigned int i, next_bit, data_bytes, start_bit;
163 int cleared_dirty = 0;
165 /* Skip any initial 0s. */
166 retry_dirty_scan:
167 if ((start_bit = un->un_rr_clean_start_bit) >= un->un_rrd_num)
168 un->un_rr_clean_start_bit = start_bit = 0;
171 * Handle case where NO bits are set in PERNODE_DIRTY but the
172 * un_dirty_bm[] map does have entries set (after a 1st resync)
174 for (; start_bit < un->un_rrd_num &&
175 !IS_PERNODE_DIRTY(md_mn_mynode_id, start_bit, un) &&
176 (un->un_pernode_dirty_sum[start_bit] != (uchar_t)0); start_bit++)
179 if (start_bit >= un->un_rrd_num) {
180 if (un->un_rr_clean_start_bit == 0) {
181 return (0);
182 } else {
183 un->un_rr_clean_start_bit = 0;
184 goto retry_dirty_scan;
188 /* how much to fit into this message */
189 data_bytes = MIN(howmany(un->un_rrd_num - start_bit, NBBY),
190 MDMN_MSG_RR_CLEAN_DATA_MAX_BYTES);
192 (*msgp) = kmem_zalloc(MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes),
193 KM_SLEEP);
195 (*msgp)->rr_nodeid = md_mn_mynode_id;
196 (*msgp)->rr_mnum = MD_SID(un);
197 MDMN_MSG_RR_CLEAN_START_SIZE_SET(*msgp, start_bit, data_bytes);
199 next_bit = MIN(start_bit + data_bytes * NBBY, un->un_rrd_num);
201 for (i = start_bit; i < next_bit; i++) {
202 if (un->c.un_status & MD_UN_KEEP_DIRTY && IS_KEEPDIRTY(i, un)) {
203 continue;
205 if (!IS_REGION_DIRTY(i, un)) {
206 continue;
208 if (un->un_outstanding_writes[i] != 0) {
209 (*activep)++;
210 continue;
214 * Handle the case where a resync has completed and we still
215 * have the un_dirty_bm[] entries marked as dirty (these are
216 * the most recent DRL re-read from the replica). They need
217 * to be cleared from our un_dirty_bm[] but they will not have
218 * corresponding un_pernode_dirty[] entries set unless (and
219 * until) further write()s have been issued to the area.
220 * This handles the case where only the un_dirty_bm[] entry is
221 * set. Without this we'd not clear this region until a local
222 * write is issued to the affected area.
224 if (IS_PERNODE_DIRTY(md_mn_mynode_id, i, un) ||
225 (un->un_pernode_dirty_sum[i] == (uchar_t)0)) {
226 if (!IS_GOING_CLEAN(i, un)) {
227 SET_GOING_CLEAN(i, un);
228 (*activep)++;
229 continue;
232 * Now we've got a flagged pernode_dirty, _or_ a clean
233 * bitmap entry to process. Update the bitmap to flush
234 * the REGION_DIRTY / GOING_CLEAN bits when we send the
235 * cross-cluster message.
237 cleared_dirty++;
238 setbit(MDMN_MSG_RR_CLEAN_DATA(*msgp), i - start_bit);
239 } else {
241 * Not marked as active in the pernode bitmap, so skip
242 * any update to this. We just increment the 0 count
243 * and adjust the active count by any outstanding
244 * un_pernode_dirty_sum[] entries. This means we don't
245 * leave the mirror permanently dirty.
247 (*activep) += (int)un->un_pernode_dirty_sum[i];
250 if (!cleared_dirty) {
251 kmem_free(*msgp, MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes));
252 *msgp = NULL;
254 un->un_rr_clean_start_bit = next_bit;
255 return (cleared_dirty);
259 * There are three paths into here:
261 * md_daemon -> check_resync_regions -> prr
262 * mirror_internal_close -> mirror_process_unit_resync -> prr
263 * mirror_set_capability -> mirror_process_unit_resync -> prr
265 * The first one is a kernel daemon, the other two result from system calls.
266 * Thus, only the first case needs to deal with kernel CPR activity. This
267 * is indicated by the cprinfop being non-NULL for kernel daemon calls, and
268 * NULL for system call paths.
270 static int
271 process_resync_regions_non_owner(mm_unit_t *un, callb_cpr_t *cprinfop)
273 int i, start, end;
274 int cleared_dirty = 0;
275 /* Number of reasons why we cannot proceed with shutting down the mirror. */
276 int active = 0;
277 set_t setno = MD_UN2SET(un);
278 md_mn_msg_rr_clean_t *rmsg;
279 md_mn_kresult_t *kres;
280 int rval;
281 minor_t mnum = MD_SID(un);
282 mdi_unit_t *ui = MDI_UNIT(mnum);
283 md_mn_nodeid_t owner_node;
286 * We drop the readerlock here to assist lock ordering with
287 * update_resync. Once we have the un_rrp_inflight_mx, we
288 * can re-acquire it.
290 md_unit_readerexit(ui);
293 * Resync region processing must be single threaded. We can't use
294 * un_resync_mx for this purpose since this mutex gets released
295 * when blocking on un_resync_cv.
297 mutex_enter(&un->un_rrp_inflight_mx);
299 (void) md_unit_readerlock(ui);
301 mutex_enter(&un->un_resync_mx);
303 rw_enter(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1], RW_READER);
304 cleared_dirty = mirror_generate_rr_bitmap(un, &rmsg, &active);
305 rw_exit(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1]);
307 if (cleared_dirty) {
308 owner_node = un->un_mirror_owner;
309 mutex_exit(&un->un_resync_mx);
312 * Transmit the 'to-be-cleared' bitmap to all cluster nodes.
313 * Receipt of the message will cause the mirror owner to
314 * update the on-disk DRL.
317 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
319 /* release readerlock before sending message */
320 md_unit_readerexit(ui);
322 if (cprinfop) {
323 mutex_enter(&un->un_prr_cpr_mx);
324 CALLB_CPR_SAFE_BEGIN(cprinfop);
327 rval = mdmn_ksend_message(setno, MD_MN_MSG_RR_CLEAN,
328 MD_MSGF_NO_LOG|MD_MSGF_BLK_SIGNAL|MD_MSGF_KSEND_NORETRY|
329 MD_MSGF_DIRECTED, un->un_mirror_owner,
330 (char *)rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg), kres);
332 if (cprinfop) {
333 CALLB_CPR_SAFE_END(cprinfop, &un->un_prr_cpr_mx);
334 mutex_exit(&un->un_prr_cpr_mx);
337 /* reacquire readerlock after message */
338 (void) md_unit_readerlock(ui);
340 if ((!MDMN_KSEND_MSG_OK(rval, kres)) &&
341 (kres->kmmr_comm_state != MDMNE_NOT_JOINED)) {
342 /* if commd is gone, no point in printing a message */
343 if (md_mn_is_commd_present())
344 mdmn_ksend_show_error(rval, kres, "RR_CLEAN");
345 kmem_free(kres, sizeof (md_mn_kresult_t));
346 kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg));
347 mutex_exit(&un->un_rrp_inflight_mx);
348 return (active);
350 kmem_free(kres, sizeof (md_mn_kresult_t));
353 * If ownership changed while we were sending, we probably
354 * sent the message to the wrong node. Leave fixing that for
355 * the next cycle.
357 if (un->un_mirror_owner != owner_node) {
358 mutex_exit(&un->un_rrp_inflight_mx);
359 return (active);
363 * Now that we've sent the message, clear them from the
364 * pernode_dirty arrays. These are ONLY cleared on a
365 * successful send, and failure has no impact.
367 cleared_dirty = 0;
368 start = MDMN_MSG_RR_CLEAN_START_BIT(rmsg);
369 end = start + MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg) * NBBY;
370 mutex_enter(&un->un_resync_mx);
371 rw_enter(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1],
372 RW_READER);
373 for (i = start; i < end; i++) {
374 if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg),
375 i - start)) {
376 if (IS_PERNODE_DIRTY(md_mn_mynode_id, i, un)) {
377 un->un_pernode_dirty_sum[i]--;
378 CLR_PERNODE_DIRTY(md_mn_mynode_id, i,
379 un);
381 if (IS_REGION_DIRTY(i, un)) {
382 cleared_dirty++;
383 CLR_REGION_DIRTY(i, un);
384 CLR_GOING_CLEAN(i, un);
388 rw_exit(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1]);
390 kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg));
392 mutex_exit(&un->un_resync_mx);
394 mutex_exit(&un->un_rrp_inflight_mx);
396 return (active);
399 static int
400 process_resync_regions_owner(mm_unit_t *un)
402 int i, start, end;
403 int cleared_dirty = 0;
404 /* Number of reasons why we cannot proceed with shutting down the mirror. */
405 int active = 0;
406 set_t setno = MD_UN2SET(un);
407 int mnset = MD_MNSET_SETNO(setno);
408 md_mn_msg_rr_clean_t *rmsg;
409 minor_t mnum = MD_SID(un);
410 mdi_unit_t *ui = MDI_UNIT(mnum);
413 * We drop the readerlock here to assist lock ordering with
414 * update_resync. Once we have the un_rrp_inflight_mx, we
415 * can re-acquire it.
417 md_unit_readerexit(ui);
420 * Resync region processing must be single threaded. We can't use
421 * un_resync_mx for this purpose since this mutex gets released
422 * when blocking on un_resync_cv.
424 mutex_enter(&un->un_rrp_inflight_mx);
426 (void) md_unit_readerlock(ui);
428 mutex_enter(&un->un_resync_mx);
429 un->un_waiting_to_clear++;
430 while (un->un_resync_flg & MM_RF_STALL_CLEAN)
431 cv_wait(&un->un_resync_cv, &un->un_resync_mx);
432 un->un_waiting_to_clear--;
434 if (mnset) {
435 rw_enter(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1],
436 RW_READER);
437 cleared_dirty = mirror_generate_rr_bitmap(un, &rmsg, &active);
439 if (cleared_dirty) {
441 * Clear the bits from the pernode_dirty arrays.
442 * If that results in any being cleared from the
443 * un_dirty_bm, commit it.
445 cleared_dirty = 0;
446 start = MDMN_MSG_RR_CLEAN_START_BIT(rmsg);
447 end = start + MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg) * NBBY;
448 for (i = start; i < end; i++) {
449 if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg),
450 i - start)) {
451 if (IS_PERNODE_DIRTY(md_mn_mynode_id, i,
452 un)) {
453 un->un_pernode_dirty_sum[i]--;
454 CLR_PERNODE_DIRTY(
455 md_mn_mynode_id, i, un);
457 if (un->un_pernode_dirty_sum[i] == 0) {
458 cleared_dirty++;
459 CLR_REGION_DIRTY(i, un);
460 CLR_GOING_CLEAN(i, un);
464 kmem_free(rmsg, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg));
466 rw_exit(&un->un_pernode_dirty_mx[md_mn_mynode_id - 1]);
467 } else {
468 for (i = 0; i < un->un_rrd_num; i++) {
469 if (un->c.un_status & MD_UN_KEEP_DIRTY)
470 if (IS_KEEPDIRTY(i, un))
471 continue;
473 if (!IS_REGION_DIRTY(i, un))
474 continue;
475 if (un->un_outstanding_writes[i] != 0) {
476 active++;
477 continue;
480 if (!IS_GOING_CLEAN(i, un)) {
481 SET_GOING_CLEAN(i, un);
482 active++;
483 continue;
485 CLR_REGION_DIRTY(i, un);
486 CLR_GOING_CLEAN(i, un);
487 cleared_dirty++;
491 if (cleared_dirty) {
492 un->un_resync_flg |= MM_RF_GATECLOSED;
493 mutex_exit(&un->un_resync_mx);
494 mddb_commitrec_wrapper(un->un_rr_dirty_recid);
495 mutex_enter(&un->un_resync_mx);
496 un->un_resync_flg &= ~MM_RF_GATECLOSED;
498 if (un->un_waiting_to_mark != 0 ||
499 un->un_waiting_to_clear != 0) {
500 active++;
501 cv_broadcast(&un->un_resync_cv);
504 mutex_exit(&un->un_resync_mx);
506 mutex_exit(&un->un_rrp_inflight_mx);
508 return (active);
511 static int
512 process_resync_regions(mm_unit_t *un, callb_cpr_t *cprinfop)
514 int mnset = MD_MNSET_SETNO(MD_UN2SET(un));
516 * For a mirror we can only update the on-disk resync-record if we
517 * currently own the mirror. If we are called and there is no owner we
518 * bail out before scanning the outstanding_writes[] array.
519 * NOTE: we only need to check here (before scanning the array) as we
520 * are called with the readerlock held. This means that a change
521 * of ownership away from us will block until this resync check
522 * has completed.
524 if (mnset && (MD_MN_NO_MIRROR_OWNER(un) ||
525 (!MD_MN_MIRROR_OWNER(un) && !md_mn_is_commd_present_lite()))) {
526 return (0);
527 } else if (mnset && !MD_MN_MIRROR_OWNER(un)) {
528 return (process_resync_regions_non_owner(un, cprinfop));
529 } else {
530 return (process_resync_regions_owner(un));
535 * Function that is callable from other modules to provide the
536 * ability to clean up the dirty region bitmap on demand. Used
537 * on the last close of a unit to avoid massive device resyncs
538 * when coming back after rolling large amounts of data to
539 * a mirror (e.g. at umount with logging).
542 void
543 mirror_process_unit_resync(mm_unit_t *un)
545 int cleans = 0;
547 while (process_resync_regions(un, NULL)) {
549 cleans++;
550 if (cleans >= md_mirror_rr_cleans) {
551 cmn_err(CE_NOTE,
552 "Could not clean resync regions\n");
553 break;
555 if (cleans > md_mirror_rr_polls) {
557 * We did not make it with md_mirror_rr_polls
558 * iterations. Give the system relief and
559 * switch over to non-busy-wait.
561 delay(md_mirror_rr_sleep_timo * md_hz);
566 static void
567 check_resync_regions(daemon_request_t *timeout)
569 mdi_unit_t *ui;
570 mm_unit_t *un;
571 md_link_t *next;
572 callb_cpr_t cprinfo;
574 rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
575 for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
577 if (md_get_setstatus(next->ln_setno) & MD_SET_STALE)
578 continue;
580 un = MD_UNIT(next->ln_id);
583 * Register this resync thread with the CPR mechanism. This
584 * allows us to detect when the system is suspended and so
585 * keep track of the RPC failure condition.
587 CALLB_CPR_INIT(&cprinfo, &un->un_prr_cpr_mx, callb_md_mrs_cpr,
588 "check_resync_regions");
590 ui = MDI_UNIT(next->ln_id);
591 (void) md_unit_readerlock(ui);
594 * Do not clean up resync regions if it is an ABR
595 * mirror, or if a submirror is offline (we will use the resync
596 * region to resync when back online) or if there is only one
597 * submirror.
599 if ((ui->ui_tstate & MD_ABR_CAP) ||
600 (un->c.un_status & MD_UN_OFFLINE_SM) || (un->un_nsm < 2)) {
601 md_unit_readerexit(ui);
602 /* Remove this thread from the CPR callback table. */
603 mutex_enter(&un->un_prr_cpr_mx);
604 CALLB_CPR_EXIT(&cprinfo);
605 continue;
608 (void) process_resync_regions(un, &cprinfo);
610 md_unit_readerexit(ui);
612 /* Remove this thread from the CPR callback table. */
613 mutex_enter(&un->un_prr_cpr_mx);
614 CALLB_CPR_EXIT(&cprinfo);
617 rw_exit(&mirror_md_ops.md_link_rw.lock);
619 /* We are done */
620 mutex_enter(&mirror_timeout.dr_mx);
621 timeout->dr_pending = 0;
622 mutex_exit(&mirror_timeout.dr_mx);
625 static void
626 md_mirror_timeout(void *throwaway)
629 mutex_enter(&mirror_timeout.dr_mx);
630 if (!mirror_timeout.dr_pending) {
631 mirror_timeout.dr_pending = 1;
632 daemon_request(&md_mto_daemon, check_resync_regions,
633 (daemon_queue_t *)&mirror_timeout, REQ_OLD);
636 if (mirror_md_ops.md_head != NULL)
637 mirror_timeout.dr_timeout_id = timeout(md_mirror_timeout,
638 throwaway, (int)MD_MDELAY*hz);
639 else
640 mirror_timeout.dr_timeout_id = 0;
642 mutex_exit(&mirror_timeout.dr_mx);
645 void
646 resync_start_timeout(set_t setno)
648 if (md_get_setstatus(setno) & MD_SET_STALE)
649 return;
651 mutex_enter(&mirror_timeout.dr_mx);
652 if (mirror_timeout.dr_timeout_id == 0)
653 mirror_timeout.dr_timeout_id = timeout(md_mirror_timeout,
654 (void *)NULL, (int)MD_MDELAY*hz);
655 mutex_exit(&mirror_timeout.dr_mx);
658 static void
659 offlined_to_attached(mm_unit_t *un)
661 int i;
662 int changed = 0;
664 if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
665 return;
667 for (i = 0; i < NMIRROR; i++) {
668 if (SMS_BY_INDEX_IS(un, i, SMS_OFFLINE)) {
669 mirror_set_sm_state(&un->un_sm[i],
670 &un->un_smic[i], SMS_ATTACHED, 1);
671 changed++;
673 if (SMS_BY_INDEX_IS(un, i, SMS_OFFLINE_RESYNC)) {
674 mirror_set_sm_state(&un->un_sm[i],
675 &un->un_smic[i], SMS_ATTACHED_RESYNC, 1);
676 changed++;
680 if (changed != 0) {
681 un->c.un_status &= ~MD_UN_OFFLINE_SM;
682 mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCOM);
686 static void
687 get_unit_resync(mm_unit_t *un)
689 mddb_recstatus_t status;
690 struct optim_resync *orp;
692 if (un->un_rr_dirty_recid == 0) {
693 offlined_to_attached(un);
694 return;
697 status = mddb_getrecstatus(un->un_rr_dirty_recid);
698 if ((status == MDDB_NORECORD) || (status == MDDB_NODATA)) {
699 un->un_rr_dirty_recid = 0;
700 offlined_to_attached(un);
701 return;
704 mddb_setrecprivate(un->un_rr_dirty_recid, MD_PRV_GOTIT);
705 orp = (struct optim_resync *)mddb_getrecaddr(un->un_rr_dirty_recid);
706 un->un_dirty_bm = orp->or_rr;
709 static int
710 create_unit_resync(mm_unit_t *un, int snarfing)
712 diskaddr_t tb;
713 int i;
714 int blksize; /* rr size in blocks */
715 int num_rr;
716 mddb_recid_t recid;
717 size_t size; /* bitmap size */
718 optim_resync_t *orp;
719 mddb_type_t typ1;
720 set_t setno;
722 tb = un->c.un_total_blocks;
724 if (((tb + MD_MIN_RR_SIZE)/ MD_MIN_RR_SIZE) > MD_DEF_NUM_RR) {
725 blksize = (int)(tb / MD_DEF_NUM_RR);
726 num_rr = (int)((tb + (blksize)) / (blksize));
727 } else {
728 blksize = MD_MIN_RR_SIZE;
729 num_rr = (int)((tb + MD_MIN_RR_SIZE) / MD_MIN_RR_SIZE);
732 size = howmany(num_rr, NBBY) + sizeof (*orp) - sizeof (orp->or_rr);
734 setno = MD_UN2SET(un);
736 typ1 = (mddb_type_t)md_getshared_key(setno,
737 mirror_md_ops.md_driver.md_drivername);
739 recid = mddb_createrec(size, typ1, RESYNC_REC,
740 MD_CRO_OPTIMIZE|MD_CRO_32BIT, setno);
741 if (recid < 0) {
742 if (snarfing && !(md_get_setstatus(setno) & MD_SET_STALE)) {
743 md_set_setstatus(setno, MD_SET_STALE);
744 cmn_err(CE_WARN, "md: state database is stale");
746 return (-1);
749 un->un_rr_dirty_recid = recid;
750 orp = (optim_resync_t *)mddb_getrecaddr(recid);
751 orp->or_magic = OR_MAGIC;
752 orp->or_blksize = blksize;
753 orp->or_num = num_rr;
755 un->un_rrd_blksize = blksize;
756 un->un_rrd_num = num_rr;
757 un->un_dirty_bm = orp->or_rr;
759 if (snarfing)
760 for (i = 0; i < howmany(num_rr, NBBY); i++)
761 orp->or_rr[i] = 0xFF;
763 if (!snarfing) {
764 mddb_commitrec_wrapper(recid);
765 mirror_commit(un, NO_SUBMIRRORS, 0);
766 return (0);
768 mddb_setrecprivate(recid, MD_PRV_PENDCOM);
769 mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCOM);
770 return (0);
774 unit_setup_resync(mm_unit_t *un, int snarfing)
776 int err;
777 int syncable;
778 int i;
779 mdi_unit_t *ui = MDI_UNIT(MD_SID(un));
780 int nonABR = 1; /* only set if ABR marked in ui_tstate */
782 un->un_dirty_bm = NULL;
783 un->un_rs_buffer = NULL;
785 mutex_init(&un->un_rrp_inflight_mx, "rrp mx", MUTEX_DEFAULT, NULL);
787 mutex_init(&un->un_resync_mx, NULL, MUTEX_DEFAULT, NULL);
788 cv_init(&un->un_resync_cv, NULL, CV_DEFAULT, NULL);
789 un->un_resync_flg = 0;
790 un->un_waiting_to_mark = 0;
791 un->un_waiting_to_commit = 0;
792 un->un_waiting_to_clear = 0;
794 un->un_goingclean_bm = NULL;
795 un->un_goingdirty_bm = NULL;
796 un->un_outstanding_writes = NULL;
797 un->un_resync_bm = NULL;
799 if (snarfing)
800 get_unit_resync(un);
802 if (un->un_rr_dirty_recid == 0) {
804 * If a MN diskset and snarfing and this node is not the
805 * master, do not delete any records on snarf of the
806 * mirror records (create_unit_resync deletes records).
808 * Master node should have already handled this case.
810 if (MD_MNSET_SETNO(MD_UN2SET(un)) && snarfing &&
811 md_set[MD_UN2SET(un)].s_am_i_master == 0) {
812 #ifdef DEBUG
813 cmn_err(CE_NOTE, "unit_setup_resync: no rr for %s on"
814 " nodeid %d\n", md_shortname(MD_SID(un)),
815 md_set[MD_UN2SET(un)].s_nodeid);
816 #endif
817 return (-1);
819 if ((err = create_unit_resync(un, snarfing)) != 0)
820 return (err);
823 un->un_goingclean_bm = (uchar_t *)kmem_zalloc((uint_t)(howmany(
824 un->un_rrd_num, NBBY)), KM_SLEEP);
825 un->un_goingdirty_bm = (uchar_t *)kmem_zalloc((uint_t)(howmany(
826 un->un_rrd_num, NBBY)), KM_SLEEP);
827 un->un_outstanding_writes = (short *)kmem_zalloc(
828 (uint_t)un->un_rrd_num * sizeof (short), KM_SLEEP);
829 un->un_resync_bm = (uchar_t *)kmem_zalloc((uint_t)(howmany(
830 un->un_rrd_num, NBBY)), KM_SLEEP);
833 * Allocate pernode bitmap for this node. All other nodes' maps will
834 * be created 'on-the-fly' in the ioctl message handler
836 if (MD_MNSET_SETNO(MD_UN2SET(un))) {
837 un->un_pernode_dirty_sum =
838 (uchar_t *)kmem_zalloc(un->un_rrd_num, KM_SLEEP);
839 if (md_mn_mynode_id > 0) {
840 un->un_pernode_dirty_bm[md_mn_mynode_id-1] = (uchar_t *)
841 kmem_zalloc((uint_t)(howmany(un->un_rrd_num, NBBY)),
842 KM_SLEEP);
846 * Allocate taskq to process deferred (due to locking) RR_CLEAN
847 * requests.
849 un->un_drl_task = (ddi_taskq_t *)md_create_taskq(MD_UN2SET(un),
850 MD_SID(un));
853 if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
854 return (0);
857 * Only mark mirror which has an associated DRL as requiring a resync.
858 * For ABR mirrors we need not set the resync record bitmap up.
860 if (ui && (ui->ui_tstate & MD_ABR_CAP))
861 nonABR = 0;
863 for (i = 0, syncable = 0; i < NMIRROR; i++) {
864 if (nonABR) {
865 if ((SUBMIRROR_IS_READABLE(un, i) ||
866 SMS_BY_INDEX_IS(un, i,
867 (SMS_OFFLINE | SMS_OFFLINE_RESYNC))))
868 syncable++;
872 if (snarfing && un->un_pass_num && (syncable > 1)) {
873 bcopy((caddr_t)un->un_dirty_bm, (caddr_t)un->un_resync_bm,
874 howmany(un->un_rrd_num, NBBY));
876 un->c.un_status |= (MD_UN_OPT_NOT_DONE | MD_UN_WAR);
877 un->c.un_status &= ~MD_UN_OFFLINE_SM;
878 for (i = 0; i < NMIRROR; i++) {
879 if ((SUBMIRROR_IS_READABLE(un, i)) ||
880 SMS_BY_INDEX_IS(un, i, SMS_OFFLINE_RESYNC))
881 un->un_sm[i].sm_flags |= MD_SM_RESYNC_TARGET;
883 if (SMS_BY_INDEX_IS(un, i, SMS_OFFLINE)) {
884 un->un_sm[i].sm_flags |= MD_SM_RESYNC_TARGET;
885 mirror_set_sm_state(&un->un_sm[i],
886 &un->un_smic[i], SMS_OFFLINE_RESYNC, 1);
887 mddb_setrecprivate(un->c.un_record_id,
888 MD_PRV_PENDCOM);
892 return (0);
896 * resync_kill_pending:
897 * -------------------
898 * Determine if the resync thread has been requested to terminate.
899 * Block if MD_RI_BLOCK or MD_RI_BLOCK_OWNER is set in un->un_rs_thread_flags.
900 * MD_RI_BLOCK is only set as a result of a user-initiated ioctl via metasync.
901 * MD_RI_BLOCK_OWNER is set by the ownership change of a multi-node mirror.
903 * Returns:
904 * 0 Kill not pending
905 * 1 Kill requested (set MD_UN_RESYNC_CANCEL in un->c.un_status)
907 * Note: this routine may block
908 * the writerlock for <ui> will be dropped and reacquired if <mx_type>
909 * is set to MD_WRITER_HELD.
910 * the readerlock for <ui> will be dropped and reacquired if <mx_type>
911 * is set to MD_READER_HELD.
913 static int
914 resync_kill_pending(
915 mm_unit_t *un,
916 mdi_unit_t *ui,
917 uint_t mx_type)
919 int retval = 0;
921 /* Ensure that we don't block with any mutex held */
922 if (mx_type == MD_WRITER_HELD) {
923 md_unit_writerexit(ui);
924 } else if (mx_type == MD_READER_HELD) {
925 md_unit_readerexit(ui);
927 mutex_enter(&un->un_rs_thread_mx);
928 while (un->un_rs_thread_flags & (MD_RI_BLOCK|MD_RI_BLOCK_OWNER)) {
929 cv_wait(&un->un_rs_thread_cv, &un->un_rs_thread_mx);
930 if (un->un_rs_thread_flags & (MD_RI_KILL|MD_RI_SHUTDOWN))
931 break;
933 /* Determine if we've been asked to abort or shutdown gracefully */
934 if (un->un_rs_thread_flags & MD_RI_KILL) {
935 un->c.un_status |= MD_UN_RESYNC_CANCEL;
936 retval = 1;
937 } else if (un->un_rs_thread_flags & MD_RI_SHUTDOWN) {
938 retval = 1;
940 mutex_exit(&un->un_rs_thread_mx);
942 /* Reacquire mutex if dropped on entry */
943 if (mx_type == MD_WRITER_HELD) {
944 (void) md_unit_writerlock(ui);
945 } else if (mx_type == MD_READER_HELD) {
946 (void) md_unit_readerlock(ui);
948 return (retval);
952 * resync_read_buffer:
953 * ------------------
954 * Issue the resync source read for the specified start block and size.
955 * This will cause the mirror strategy routine to issue a write-after-read
956 * once this request completes successfully.
957 * If 'flag_err' is set we expect to see a write error flagged in the b_error
958 * field of the buffer created for this i/o request. If clear we do not expect
959 * to see the error flagged for write failures.
960 * Read failures will always set the B_ERROR bit which will stop the resync
961 * immediately.
963 static int
964 resync_read_buffer(mm_unit_t *un, diskaddr_t blk, size_t cnt, int flag_err)
966 md_mcs_t *sp;
967 buf_t *bp;
968 int ret = 0;
970 sp = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
971 mirror_child_init(sp);
973 bp = &sp->cs_buf;
974 bp->b_edev = makedevice(md_major, MD_SID(un));
975 bp->b_flags = B_READ;
976 bp->b_lblkno = blk;
977 bp->b_bcount = dbtob(cnt);
978 bp->b_un.b_addr = un->un_rs_buffer;
979 md_unit_readerexit(MDI_UNIT(MD_SID(un)));
981 (void) md_mirror_strategy(bp, MD_STR_NOTTOP | MD_STR_MAPPED |
982 MD_STR_WAR | (flag_err ? MD_STR_FLAG_ERR : 0), NULL);
984 (void) biowait(bp);
986 (void) md_unit_readerlock(MDI_UNIT(MD_SID(un)));
987 if (bp->b_flags & B_ERROR) {
988 ret = 1;
990 kmem_cache_free(mirror_child_cache, sp);
991 return (ret);
995 * send_mn_resync_done_message
997 * At the end of a resync, send a message to all nodes to indicate that
998 * the resync is complete. The argument, flags, has the following values
1000 * RESYNC_ERR - if an error occurred that terminated the resync
1001 * CLEAR_OPT_NOT_DONE - Just need to clear the OPT_NOT_DONE flag
1003 * unit writerlock set on entry
1004 * Only send the message if the thread is not marked as shutting down:
1005 * [un_rs_thread_flags & MD_RI_SHUTDOWN] or being killed:
1006 * [un->c.un_status & MD_UN_RESYNC_CANCEL]
1007 * or if there has been an error that terminated the resync:
1008 * flags & RESYNC_ERR
1011 static void
1012 send_mn_resync_done_message(
1013 mm_unit_t *un,
1014 int flags
1017 md_mn_msg_resync_t *rmsg = un->un_rs_msg;
1018 set_t setno;
1019 mdi_unit_t *ui = MDI_UNIT(MD_SID(un));
1020 md_mn_kresult_t *kres;
1021 int dont_send = 0;
1022 int rval;
1023 int nretries = 0;
1025 rmsg = (md_mn_msg_resync_t *)un->un_rs_msg;
1028 * Only send the message if this resync thread is still active. This
1029 * handles the case where ownership changes to different nodes during
1030 * a resync, which can cause multiple spurious resync_done messages
1031 * when the resync completes. This happens because only one node is
1032 * the resync owner but other nodes will have their resync_unit thread
1033 * blocked in 'resync_kill_pending'.
1035 mutex_enter(&un->un_rs_thread_mx);
1036 dont_send = (un->un_rs_thread_flags & (MD_RI_KILL|MD_RI_SHUTDOWN)) ? 1
1037 : 0;
1038 mutex_exit(&un->un_rs_thread_mx);
1039 dont_send |= (un->c.un_status & MD_UN_RESYNC_CANCEL) ? 1 : 0;
1042 * Always send a message if we've encountered an error that terminated
1043 * the resync.
1045 if (flags & RESYNC_ERR)
1046 dont_send = 0;
1048 if (dont_send) {
1049 #ifdef DEBUG
1050 if (mirror_debug_flag) {
1051 printf("Don't send resync done message, mnum = %x,"
1052 " type = %x, flags = %d\n", MD_SID(un),
1053 un->un_rs_type, flags);
1055 #endif /* DEBUG */
1056 return;
1059 #ifdef DEBUG
1060 if (mirror_debug_flag) {
1061 printf("send resync done message, mnum = %x, type = %x\n",
1062 MD_SID(un), un->un_rs_type);
1064 #endif
1066 rmsg->msg_resync_mnum = MD_SID(un);
1067 rmsg->msg_resync_type = un->un_rs_type;
1068 rmsg->msg_originator = md_mn_mynode_id;
1069 rmsg->msg_resync_flags = 0;
1070 if (flags & RESYNC_ERR)
1071 rmsg->msg_resync_flags |= MD_MN_RS_ERR;
1072 if (flags & CLEAR_OPT_NOT_DONE)
1073 rmsg->msg_resync_flags |= MD_MN_RS_CLEAR_OPT_NOT_DONE;
1075 setno = MD_MIN2SET(MD_SID(un));
1076 md_unit_writerexit(ui);
1077 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
1079 smrd_msg:
1080 mutex_enter(&un->un_rs_cpr_mx);
1081 CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo);
1083 rval = mdmn_ksend_message(setno, MD_MN_MSG_RESYNC_PHASE_DONE,
1084 MD_MSGF_NO_LOG, 0, (char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
1086 CALLB_CPR_SAFE_END(&un->un_rs_cprinfo, &un->un_rs_cpr_mx);
1087 mutex_exit(&un->un_rs_cpr_mx);
1089 /* if the node hasn't yet joined, it's Ok. */
1090 if ((!MDMN_KSEND_MSG_OK(rval, kres)) &&
1091 (kres->kmmr_comm_state != MDMNE_NOT_JOINED)) {
1092 mdmn_ksend_show_error(rval, kres, "RESYNC_PHASE_DONE");
1093 /* If we're shutting down already, pause things here. */
1094 if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
1095 while (!md_mn_is_commd_present()) {
1096 delay(md_hz);
1099 * commd is now available again. Retry the message once.
1100 * If this fails we panic as the system is in an
1101 * unexpected state.
1103 if (nretries++ == 0)
1104 goto smrd_msg;
1106 cmn_err(CE_PANIC, "ksend_message failure: RESYNC_PHASE_DONE");
1108 kmem_free(kres, sizeof (md_mn_kresult_t));
1109 (void) md_unit_writerlock(ui);
1113 * send_mn_resync_next_message
1115 * Send a message to all nodes indicating the next region to be resynced.
1116 * The message contains the region to be resynced and the current position in
1117 * the resync as denoted by un_rs_resync_done and un_rs_resync_2_do.
1118 * On entry the unit readerlock is held.
1120 static void
1121 send_mn_resync_next_message(
1122 mm_unit_t *un,
1123 diskaddr_t currentblk,
1124 size_t rsize,
1125 int flags
1128 md_mn_msg_resync_t *rmsg = un->un_rs_msg;
1129 set_t setno;
1130 md_mn_kresult_t *kres;
1131 mdi_unit_t *ui = MDI_UNIT(MD_SID(un));
1132 int rval;
1133 md_mps_t *ps;
1134 mm_submirror_t *sm;
1135 int smi;
1136 int nretries = 0;
1138 ASSERT(rmsg != NULL);
1139 #ifdef DEBUG
1140 if (mirror_debug_flag) {
1141 printf("send resync next message, mnum = %x, start=%lld, "
1142 "size=%ld, type=%x, done=%lld, 2_do=%lld\n",
1143 MD_SID(un), currentblk, rsize, un->un_rs_type,
1144 un->un_rs_resync_done, un->un_rs_resync_2_do);
1146 #endif
1147 rmsg->msg_resync_mnum = MD_SID(un);
1148 rmsg->msg_resync_type = un->un_rs_type;
1149 rmsg->msg_resync_start = currentblk;
1150 rmsg->msg_resync_rsize = rsize;
1151 rmsg->msg_resync_done = un->un_rs_resync_done;
1152 rmsg->msg_resync_2_do = un->un_rs_resync_2_do;
1153 rmsg->msg_originator = md_mn_mynode_id;
1154 if (flags & MD_FIRST_RESYNC_NEXT)
1155 rmsg->msg_resync_flags = MD_MN_RS_FIRST_RESYNC_NEXT;
1158 * Copy the current submirror state and flags into the message. This provides
1159 * a means of keeping all nodes that are currently active in the cluster
1160 * synchronised with regard to their submirror state settings. If we
1161 * did not pass this information here, the only time every node gets its
1162 * submirror state updated is at the end of a resync phase. That can be
1163 * a significant amount of time for large metadevices.
1165 for (smi = 0; smi < NMIRROR; smi++) {
1166 sm = &un->un_sm[smi];
1167 rmsg->msg_sm_state[smi] = sm->sm_state;
1168 rmsg->msg_sm_flags[smi] = sm->sm_flags;
1170 setno = MD_MIN2SET(MD_SID(un));
1171 md_unit_readerexit(ui);
1172 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
1174 smrn_msg:
1175 mutex_enter(&un->un_rs_cpr_mx);
1176 CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo);
1178 rval = mdmn_ksend_message(setno, MD_MN_MSG_RESYNC_NEXT, MD_MSGF_NO_LOG,
1179 0, (char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
1181 CALLB_CPR_SAFE_END(&un->un_rs_cprinfo, &un->un_rs_cpr_mx);
1182 mutex_exit(&un->un_rs_cpr_mx);
1184 if (!MDMN_KSEND_MSG_OK(rval, kres)) {
1185 mdmn_ksend_show_error(rval, kres, "RESYNC_NEXT");
1186 /* If we're shutting down already, pause things here. */
1187 if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
1188 while (!md_mn_is_commd_present()) {
1189 delay(md_hz);
1192 * commd is now available again. Retry the message once.
1193 * If this fails we panic as the system is in an
1194 * unexpected state.
1196 if (nretries++ == 0)
1197 goto smrn_msg;
1199 cmn_err(CE_PANIC, "ksend_message failure: RESYNC_NEXT");
1201 kmem_free(kres, sizeof (md_mn_kresult_t));
1202 (void) md_unit_readerlock(ui);
1203 ps = un->un_rs_prev_overlap;
1205 /* Allocate previous overlap reference if needed */
1206 if (ps == NULL) {
1207 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
1208 ps->ps_un = un;
1209 ps->ps_ui = ui;
1210 ps->ps_firstblk = 0;
1211 ps->ps_lastblk = 0;
1212 ps->ps_flags = 0;
1213 md_unit_readerexit(ui);
1214 (void) md_unit_writerlock(ui);
1215 un->un_rs_prev_overlap = ps;
1216 md_unit_writerexit(ui);
1217 (void) md_unit_readerlock(ui);
1220 ps->ps_firstblk = currentblk;
1221 ps->ps_lastblk = currentblk + rsize - 1;
1224 static int
1225 resync_read_blk_range(
1226 mm_unit_t *un,
1227 diskaddr_t currentblk,
1228 diskaddr_t stopbefore,
1229 uint_t type,
1230 int flags
1233 size_t copysize; /* limited by max xfer buf size */
1234 size_t rsize; /* size of resync block (for MN) */
1235 set_t setno;
1236 diskaddr_t newstop;
1237 diskaddr_t rs_startblk;
1238 uint_t rs_type;
1239 int flags1 = flags & MD_FIRST_RESYNC_NEXT;
1241 rs_type = un->un_rs_type;
1242 rs_startblk = currentblk;
1243 if (stopbefore > un->c.un_total_blocks)
1244 stopbefore = un->c.un_total_blocks;
1245 if (currentblk < un->un_resync_startbl)
1246 currentblk = un->un_resync_startbl;
1248 copysize = un->un_rs_copysize;
1249 rsize = MD_DEF_RESYNC_BLK_SZ;
1251 setno = MD_MIN2SET(MD_SID(un));
1252 while (currentblk < stopbefore) {
1254 * Split the block range up into units of MD_DEF_RESYNC_BLK_SZ and,
1255 * if this is a MN device and the send flag is set, send a RESYNC_NEXT
1256 * message to all nodes.
1258 if ((currentblk + MD_DEF_RESYNC_BLK_SZ) > stopbefore)
1259 rsize = stopbefore - currentblk;
1260 if (MD_MNSET_SETNO(setno) && (flags & MD_SEND_MESS_XMIT)) {
1261 un->un_resync_startbl = currentblk;
1262 rs_startblk = currentblk;
1263 send_mn_resync_next_message(un, currentblk, rsize,
1264 flags1);
1265 if (flags1)
1266 flags1 = 0;
1267 /* check to see if we've been asked to terminate */
1268 if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)), type))
1269 return ((un->c.un_status & MD_UN_RESYNC_CANCEL)
1270 ? 1:0);
1272 * Check to see if another node has completed this
1273 * block, if so either the type or the resync region
1274 * will have changed. If the resync type has changed,
1275 * just exit.
1276 * If the resync region has changed, reset currentblk
1277 * to the start of the current resync region and
1278 * continue.
1280 if (un->un_rs_type != rs_type)
1281 return (0);
1282 if (un->un_rs_prev_overlap->ps_firstblk >
1283 rs_startblk) {
1284 currentblk =
1285 un->un_rs_prev_overlap->ps_firstblk;
1286 continue;
1289 newstop = currentblk + rsize;
1290 while (currentblk < newstop) {
1291 if ((currentblk + copysize) > stopbefore)
1292 copysize = (size_t)(stopbefore - currentblk);
1293 if (resync_read_buffer(un, currentblk, copysize,
1294 (flags & MD_RESYNC_FLAG_ERR)))
1295 return (1);
1297 /* resync_read_buffer releases/grabs a new lock */
1298 un = (mm_unit_t *)MD_UNIT(MD_SID(un));
1299 currentblk += copysize;
1301 /* check to see if we've been asked to terminate */
1302 if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)), type))
1303 return ((un->c.un_status & MD_UN_RESYNC_CANCEL)
1304 ? 1:0);
1305 if (MD_MNSET_SETNO(setno)) {
1307 * Check to see if another node has completed
1308 * this block, see above
1310 if (un->un_rs_type != rs_type)
1311 return (0);
1312 if (un->un_rs_prev_overlap->ps_firstblk >
1313 rs_startblk)
1314 currentblk =
1315 un->un_rs_prev_overlap->ps_firstblk;
1319 return (0);
1322 static void
1323 optimized_resync(mm_unit_t *un)
1325 mdi_unit_t *ui;
1326 minor_t mnum;
1327 int rr, smi;
1328 int resync_regions;
1329 uchar_t *dirtyregions;
1330 diskaddr_t first, stopbefore;
1331 int err;
1332 int cnt;
1333 sm_state_t state;
1334 int broke_out = 0;
1335 set_t setno;
1336 uint_t old_rs_type = un->un_rs_type;
1337 uint_t old_rs_done;
1338 uint_t flags1 = MD_FIRST_RESYNC_NEXT|MD_RESYNC_FLAG_ERR;
1339 size_t start_rr;
1341 mnum = MD_SID(un);
1342 ui = MDI_UNIT(mnum);
1343 setno = MD_UN2SET(un);
1345 if (!(un->c.un_status & MD_UN_OPT_NOT_DONE)) {
1347 * We aren't marked as needing a resync so for multi-node
1348 * sets we flag the completion so that all nodes see the same
1349 * metadevice state. This is a problem when a new node joins
1350 * an existing set as it has to perform a 'metasync -r' and
1351 * we have to step through all of the resync phases. If we
1352 * don't do this the nodes that were already in the set will
1353 * have the metadevices marked as 'Okay' but the joining node
1354 * will have 'Needs Maintenance' which is unclearable.
1356 if (MD_MNSET_SETNO(setno)) {
1357 send_mn_resync_done_message(un, CLEAR_OPT_NOT_DONE);
1359 return;
1363 * No need for an optimized resync if ABR is set; clear rs_type and flags
1364 * and exit.
1366 if (ui->ui_tstate & MD_ABR_CAP) {
1367 un->un_rs_type = MD_RS_NONE;
1368 un->c.un_status &= ~(MD_UN_OPT_NOT_DONE | MD_UN_WAR);
1369 return;
1372 un->un_rs_dropped_lock = 1;
1373 un->c.un_status |= MD_UN_WAR;
1374 resync_regions = un->un_rrd_num;
1375 dirtyregions = un->un_resync_bm;
1376 md_unit_writerexit(ui);
1378 /* For MN sets, resync NOTIFY is done when processing resync messages */
1379 if (!MD_MNSET_SETNO(setno)) {
1380 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
1381 SVM_TAG_METADEVICE, setno, MD_SID(un));
1383 un = (mm_unit_t *)md_unit_readerlock(ui);
1385 /* check to see if we've been asked to terminate */
1386 if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)), MD_READER_HELD)) {
1387 if (un->c.un_status & MD_UN_RESYNC_CANCEL)
1388 broke_out = RESYNC_ERR;
1391 * Check that we are still performing an optimized
1392 * resync. If not, another node must have completed it
1393 * so we have no more work to do.
1395 if (un->un_rs_type != old_rs_type) {
1396 md_unit_readerexit(ui);
1397 (void) md_unit_writerlock(ui);
1398 return;
1401 * If rs_resync_done is non-zero, we must be completing an optimized
1402 * resync that has already been partially done on another node.
1403 * Therefore clear the bits in resync_bm for the resync regions
1404 * already done. If resync_startbl is zero, calculate 2_do.
1406 if (un->un_rs_resync_done > 0) {
1407 BLK_TO_RR(start_rr, un->un_resync_startbl, un);
1408 for (rr = 0; rr < start_rr && rr < resync_regions; rr++)
1409 CLR_KEEPDIRTY(rr, un);
1410 } else {
1411 un->un_rs_resync_2_do = 0;
1412 for (rr = 0; rr < resync_regions; rr++)
1413 if (isset(dirtyregions, rr))
1414 un->un_rs_resync_2_do++;
1417 for (rr = 0; (rr < resync_regions) && (broke_out != RESYNC_ERR); rr++) {
1418 if (isset(dirtyregions, rr)) {
1419 RR_TO_BLK(first, rr, un);
1420 RR_TO_BLK(stopbefore, rr+1, un);
1421 old_rs_type = un->un_rs_type;
1422 old_rs_done = un->un_rs_resync_done;
1423 err = resync_read_blk_range(un, first, stopbefore,
1424 MD_READER_HELD, MD_SEND_MESS_XMIT | flags1);
1425 flags1 = MD_RESYNC_FLAG_ERR;
1427 /* resync_read_blk_range releases/grabs a new lock */
1428 un = (mm_unit_t *)MD_UNIT(mnum);
1430 if (err) {
1431 broke_out = RESYNC_ERR;
1432 break;
1436 * Check that we are still performing an optimized
1437 * resync. If not, another node must have completed it
1438 * so we have no more work to do.
1440 if (un->un_rs_type != old_rs_type) {
1441 md_unit_readerexit(ui);
1442 (void) md_unit_writerlock(ui);
1443 return;
1447 * If resync_done has increased, we must have
1448 * blocked in resync_read_blk_range while another node
1449 * continued with the resync. Therefore clear resync_bm
1450 * for the blocks that have been resynced on another
1451 * node and update rr to the next RR to be done.
1453 if (old_rs_done < un->un_rs_resync_done) {
1454 int i;
1455 BLK_TO_RR(start_rr, un->un_resync_startbl - 1,
1456 un);
1457 for (i = rr; i < start_rr; i++)
1458 CLR_KEEPDIRTY(i, un);
1459 rr = start_rr;
1460 } else
1461 un->un_rs_resync_done++;
1463 for (smi = 0, cnt = 0; smi < NMIRROR; smi++)
1464 if (SUBMIRROR_IS_WRITEABLE(un, smi) &&
1465 !(SMS_BY_INDEX_IS(un, smi, SMS_ALL_ERRED)))
1466 cnt++;
1467 if (cnt < 2) {
1468 broke_out = RESYNC_ERR;
1469 break;
1471 CLR_KEEPDIRTY(rr, un);
1472 /* Check to see if we've completed the resync cleanly */
1473 if (un->un_rs_thread_flags & MD_RI_SHUTDOWN)
1474 break;
1477 * Check that we haven't exceeded un_rs_resync_2_do. If
1478 * we have we've completed the resync.
1480 if (un->un_rs_resync_done > un->un_rs_resync_2_do)
1481 break;
1484 md_unit_readerexit(ui);
1485 un = (mm_unit_t *)md_unit_writerlock(ui);
1488 * If a MN set, send a message to all nodes to indicate that the resync
1489 * phase is complete. The processing of the message will update the
1490 * mirror state.
1492 if (MD_MNSET_SETNO(setno)) {
1493 send_mn_resync_done_message(un, broke_out);
1494 } else {
1496 if (!broke_out)
1497 un->c.un_status &= ~MD_UN_WAR;
1499 un->c.un_status &= ~MD_UN_KEEP_DIRTY;
1501 setno = MD_UN2SET(un);
1502 for (smi = 0; smi < NMIRROR; smi++) {
1503 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
1504 if (SMS_BY_INDEX_IS(un, smi, SMS_OFFLINE_RESYNC)) {
1505 state = (broke_out ? SMS_OFFLINE : SMS_RUNNING);
1506 mirror_set_sm_state(&un->un_sm[smi],
1507 &un->un_smic[smi], state, broke_out);
1508 mirror_commit(un, NO_SUBMIRRORS, 0);
1510 if (SMS_BY_INDEX_IS(un, smi, SMS_OFFLINE))
1511 un->c.un_status |= MD_UN_OFFLINE_SM;
1515 /* For MN sets, resync NOTIFY is done when processing resync messages */
1516 if (!MD_MNSET_SETNO(setno)) {
1517 if (broke_out) {
1518 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
1519 SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
1520 } else {
1521 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
1522 SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
1528 * recalc_resync_done
1530 * This function deals with a change in the value of un_rs_resync_2_do in a
1531 * component resync. This may change if we are restarting a component
1532 * resync on a single node that has rebooted with a different value of
1533 * md_resync_bufsz, or if we are running in a multi-node set with nodes
1534 * having different values of md_resync_bufsz.
1535 * If un_rs_resync_2_do has changed, we need to recalculate
1536 * the value of un_rs_resync_done against the new value of resync_2_do.
1537 * The new value of resync_done is:
1538 * if un_resync_startbl is set, (un_resync_startbl - initblock)/(blksize + skip)
1539 * or, if it is not set, it is derived from the old un_rs_resync_done as
1540 * (un_rs_resync_done/un_rs_resync_2_do) * resync_2_do
1541 * In addition we need to deal with the overflow case by scaling the operands
1542 * down by a factor (a worked example follows the function below).
1545 static void
1546 recalc_resync_done(mm_unit_t *un, size_t resync_2_do, diskaddr_t initblock,
1547 u_longlong_t blk_size, u_longlong_t skip)
1549 diskaddr_t x;
1550 uint_t factor = 1;
1553 * If resync_2_do has not yet been calculated, no need to modify
1554 * resync_done
1556 if (un->un_rs_resync_2_do == 0) {
1557 return;
1559 if (un->un_rs_resync_2_do == resync_2_do)
1560 return; /* No change, so nothing to do */
1562 * If un_resync_startbl is set, another node must have already started
1563 * this resync and hence we can calculate resync_done from
1564 * resync_startbl
1566 if (un->un_resync_startbl) {
1567 un->un_rs_resync_done = (un->un_resync_startbl - initblock) /
1568 (blk_size + skip);
1569 return;
1572 * un_resync_startbl is not set, so we must calculate the new resync_done
1573 * from the old un_rs_resync_done.
1574 * If the larger of the two values of resync_2_do is greater than 32
1575 * bits, calculate a factor to divide by to ensure that we don't
1576 * overflow 64 bits when calculating the new value for resync_done
1578 x = (un->un_rs_resync_2_do > resync_2_do) ? un->un_rs_resync_2_do :
1579 resync_2_do;
1580 while (x > INT32_MAX) {
1581 x = x >> 1;
1582 factor = factor << 1;
1584 un->un_rs_resync_done = ((un->un_rs_resync_done/factor) *
1585 (resync_2_do/factor)) /
1586 ((un->un_rs_resync_2_do + (factor * factor) - 1)/
1587 (factor * factor));
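/*
 * Worked example of the calculation above (illustrative values): if the old
 * un_rs_resync_2_do was 100 with un_rs_resync_done at 40 (40% complete), and
 * the new resync_2_do is 50, then factor stays 1 and the new resync_done is
 * (40 * 50) / 100 = 20, i.e. still 40% of the way through. factor only grows
 * beyond 1 when the larger resync_2_do value exceeds INT32_MAX; the operands
 * of the multiplication are then scaled down by factor (and the divisor by
 * factor squared) so the 64-bit intermediate product cannot overflow, at the
 * cost of a little precision.
 */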
1590 static void
1591 check_comp_4_resync(mm_unit_t *un, int smi, int ci)
1593 mdi_unit_t *ui;
1594 minor_t mnum;
1595 mm_submirror_t *sm;
1596 mm_submirror_ic_t *smic;
1597 size_t count;
1598 u_longlong_t skip;
1599 u_longlong_t size;
1600 u_longlong_t blk_size;
1601 diskaddr_t initblock;
1602 diskaddr_t block;
1603 diskaddr_t frag = 0;
1604 md_m_shared_t *shared;
1605 int err;
1606 set_t setno;
1607 int broke_out = 0;
1608 int blks;
1609 uint_t old_rs_type = un->un_rs_type;
1610 diskaddr_t old_rs_done;
1611 uint_t flags1 = MD_FIRST_RESYNC_NEXT;
1612 diskaddr_t resync_2_do;
1614 mnum = MD_SID(un);
1615 ui = MDI_UNIT(mnum);
1616 sm = &un->un_sm[smi];
1617 smic = &un->un_smic[smi];
1618 setno = MD_UN2SET(un);
1620 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
1621 (sm->sm_dev, sm, ci);
1623 if (shared->ms_state != CS_RESYNC) {
1624 SET_RS_TYPE_NONE(un->un_rs_type);
1625 return;
1628 if (shared->ms_flags & MDM_S_RS_TRIED) {
1629 SET_RS_TYPE_NONE(un->un_rs_type);
1630 return;
1633 (void) (*(smic->sm_get_bcss))
1634 (sm->sm_dev, sm, ci, &initblock, &count, &skip, &size);
1636 if ((count == 1) && (skip == 0)) {
1637 count = (size_t)(size / un->un_rs_copysize);
1638 if ((frag = (size - (count * un->un_rs_copysize))) != 0)
1639 count++;
1640 size = (u_longlong_t)un->un_rs_copysize;
1642 blk_size = size; /* Save block size for this resync */
1644 ASSERT(count >= 1);
1645 resync_2_do = count;
1647 * If part way through a resync, un_rs_resync_done/un_rs_resync_2_do
1648 * gives the proportion of the resync that has already been done.
1649 * If un_rs_copysize has changed since this previous partial resync,
1650 * either because this node has been rebooted with a different value
1651 * for md_resync_bufsz or because another node with a different value
1652 * for md_resync_bufsz performed the previous resync, we need to
1653 * recalculate un_rs_resync_done as a proportion of our value of
1654 * resync_2_do.
1656 recalc_resync_done(un, resync_2_do, initblock, blk_size, skip);
1659 * For MN mirrors we need to send a message to all nodes indicating
1660 * the next region to be resynced. For a component resync, the size of
1661 * the contiguous region that is processed by resync_read_blk_range()
1662 * may be small (it can be as small as the interleave size).
1663 * Therefore, rather than sending the message within
1664 * resync_read_blk_range(), we will send a message every
1665 * MD_DEF_RESYNC_BLK_SZ blocks. Calculate the frequency in terms of
1666 * the number of blocks. Then, if we are restarting a resync, round
1667 * un_rs_resync_done down to the previous resync region boundary. This
1668 * ensures that we send a RESYNC_NEXT message before resyncing any
1669 * blocks
1671 if (MD_MNSET_SETNO(setno)) {
1672 blks = ((MD_DEF_RESYNC_BLK_SZ + blk_size + skip - 1)/
1673 (blk_size + skip));
1674 un->un_rs_resync_done = (un->un_rs_resync_done/blks) * blks;
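		/*
		 * Worked example with purely hypothetical numbers: if
		 * blk_size + skip is 128 blocks and MD_DEF_RESYNC_BLK_SZ is
		 * 2048 blocks, then blks = (2048 + 128 - 1) / 128 = 16, so a
		 * RESYNC_NEXT message goes out every 16 iterations and a
		 * restarted un_rs_resync_done of 37 rounds down to 32.
		 */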
1677 * un_rs_resync_done is the number of ('size' + 'skip') increments
1678 * already resynced from the base 'block'
1679 * un_rs_resync_2_do is the number of iterations in
1680 * this component resync.
1682 ASSERT(count >= un->un_rs_resync_done);
1683 un->un_rs_resync_2_do = (diskaddr_t)count;
1685 un->c.un_status |= MD_UN_WAR;
1686 sm->sm_flags |= MD_SM_RESYNC_TARGET;
1687 md_unit_writerexit(ui);
1689 /* For MN sets, resync NOTIFY is done when processing resync messages */
1690 if (!MD_MNSET_SETNO(setno)) {
1691 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
1692 SVM_TAG_METADEVICE, setno, MD_SID(un));
1694 un = (mm_unit_t *)md_unit_readerlock(ui);
1696 /* check to see if we've been asked to terminate */
1697 if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)), MD_READER_HELD)) {
1698 if (un->c.un_status & MD_UN_RESYNC_CANCEL)
1699 broke_out = RESYNC_ERR;
1702 * Check that we are still performing the same component
1703 * resync. If not, another node must have completed it
1704 * so we have no more work to do.
1706 if (un->un_rs_type != old_rs_type) {
1707 md_unit_readerexit(ui);
1708 (void) md_unit_writerlock(ui);
1709 return;
1712 * Adjust resync_done, resync_2_do, the start of the resync area and count
1713 * to skip already resync'd data. We need to recalculate resync_done as
1714 * we have dropped the unit lock above and may have lost ownership to
1715 * another node with a different resync buffer size, and it may have
1716 * sent us new values of resync_done and resync_2_do based on its
1717 * resync buffer size.
1719 recalc_resync_done(un, resync_2_do, initblock, blk_size, skip);
1720 un->un_rs_resync_2_do = resync_2_do;
1721 count -= un->un_rs_resync_done;
1722 block = initblock + ((blk_size + skip) * (int)un->un_rs_resync_done);
1724 un->un_rs_dropped_lock = 1;
1725 while ((count > 0) && (broke_out != RESYNC_ERR)) {
1726 old_rs_done = un->un_rs_resync_done;
1728 * For MN mirrors send a message to the other nodes. This
1729 * message includes the size of the region that must be blocked
1730 * for all writes
1732 if (MD_MNSET_SETNO(setno)) {
1733 if ((un->un_rs_resync_done%blks == 0)) {
1734 un->un_resync_startbl = block;
1735 send_mn_resync_next_message(un, block,
1736 (blk_size+skip)*blks, flags1);
1737 flags1 = 0;
1739 * check to see if we've been asked to
1740 * terminate
1742 if (resync_kill_pending(un,
1743 MDI_UNIT(MD_SID(un)), MD_READER_HELD)) {
1744 if (un->c.un_status &
1745 MD_UN_RESYNC_CANCEL) {
1746 broke_out = RESYNC_ERR;
1747 break;
1752 * Check that we are still performing the same
1753 * component resync. If not, another node must
1754 * have completed it so we have no more work to
1755 * do. Also reset count to remaining resync as
1756 * we may have lost ownership in
1757 * send_mn_resync_next_message while another
1758 * node continued with the resync and
1759 * incremented resync_done.
1761 if (un->un_rs_type != old_rs_type) {
1762 md_unit_readerexit(ui);
1763 (void) md_unit_writerlock(ui);
1764 return;
1767 * Recalculate resync_done and resync_2_do.
1768 * We need to recalculate resync_done as
1769 * we have dropped the unit lock in
1770 * send_mn_resync_next_message above and may
1771 * have lost ownership to another node with a
1772 * different resync buffer size, and it may have
1773 * sent us new values of resync_done and
1774 * resync_2_do based on its resync buffer size.
1776 recalc_resync_done(un, resync_2_do, initblock,
1777 blk_size, skip);
1778 un->un_rs_resync_2_do = resync_2_do;
1779 count = un->un_rs_resync_2_do -
1780 un->un_rs_resync_done;
1782 * Adjust start of resync area to skip already
1783 * resync'd data
1785 block = initblock + ((blk_size + skip) *
1786 (int)un->un_rs_resync_done);
1787 old_rs_done = un->un_rs_resync_done;
1790 err = resync_read_blk_range(un, block, block + size,
1791 MD_READER_HELD, MD_RESYNC_FLAG_ERR);
1793 /* resync_read_blk_range releases/grabs a new lock */
1794 un = (mm_unit_t *)MD_UNIT(mnum);
1796 if (err) {
1797 broke_out = RESYNC_ERR;
1798 break;
1801 * If we are no longer resyncing this component, return as
1802 * another node has progressed the resync.
1804 if (un->un_rs_type != old_rs_type) {
1805 md_unit_readerexit(ui);
1806 (void) md_unit_writerlock(ui);
1807 return;
1811 * Recalculate resync_done and resync_2_do. We need to recalculate
1812 * resync_done as we have dropped the unit lock in
1813 * resync_read_blk_range above and may have lost ownership to
1814 * another node with a different resync buffer size, and it may
1815 * have sent us new values of resync_done and resync_2_do based
1816 * on its resync buffer size.
1818 recalc_resync_done(un, resync_2_do, initblock, blk_size, skip);
1819 un->un_rs_resync_2_do = resync_2_do;
1822 * Reset count to remaining resync as we may have blocked in
1823 * resync_read_blk_range while another node continued
1824 * with the resync and incremented resync_done. Also adjust
1825 * start of resync area to skip already resync'd data.
1827 count = un->un_rs_resync_2_do - un->un_rs_resync_done;
1828 block = initblock +((blk_size + skip) *
1829 (int)un->un_rs_resync_done);
1832 * If we are picking up from another node, we retry the last
1833 * block; otherwise we step on to the next block.
1835 if (old_rs_done == un->un_rs_resync_done) {
1836 block += blk_size + skip;
1837 un->un_rs_resync_done++;
1838 count--;
1841 if ((count == 1) && frag)
1842 size = frag;
1843 if (shared->ms_state == CS_ERRED) {
1844 err = 1;
1845 broke_out = RESYNC_ERR;
1846 break;
1849 /* Check to see if we've completed the resync cleanly */
1850 if (un->un_rs_thread_flags & MD_RI_SHUTDOWN)
1851 break;
1854 md_unit_readerexit(ui);
1855 un = (mm_unit_t *)md_unit_writerlock(ui);
1858 * If a MN set, send a message to all nodes to indicate that the resync
1859 * phase is complete. The processing of the message will update the
1860 * mirror state.
1862 if (MD_MNSET_SETNO(setno)) {
1863 send_mn_resync_done_message(un, broke_out);
1864 } else {
1865 un->c.un_status &= ~MD_UN_WAR;
1866 sm->sm_flags &= ~MD_SM_RESYNC_TARGET;
1868 if (err)
1869 shared->ms_flags |= MDM_S_RS_TRIED;
1870 else
1872 * As we don't transmit the changes,
1873 * no need to drop the lock.
1875 set_sm_comp_state(un, smi, ci, CS_OKAY, 0,
1876 MD_STATE_NO_XMIT, (IOLOCK *)NULL);
1879 /* For MN sets, resync NOTIFY is done when processing resync messages */
1880 if (!MD_MNSET_SETNO(setno)) {
1881 if (broke_out) {
1882 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
1883 SVM_TAG_METADEVICE, setno, MD_SID(un));
1884 } else {
1885 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
1886 SVM_TAG_METADEVICE, setno, MD_SID(un));
1888 SET_RS_TYPE_NONE(un->un_rs_type);
1892 static void
1893 submirror_resync(mm_unit_t *un)
1895 mdi_unit_t *ui;
1896 minor_t mnum;
1897 mm_submirror_t *sm;
1898 mm_submirror_ic_t *smic;
1899 int smi;
1900 diskaddr_t chunk;
1901 diskaddr_t curblk;
1902 int err;
1903 int cnt;
1904 set_t setno;
1905 int broke_out = 0;
1906 int i;
1907 int flags1 = MD_FIRST_RESYNC_NEXT;
1908 int compcnt;
1910 mnum = MD_SID(un);
1911 ui = MDI_UNIT(mnum);
1912 setno = MD_UN2SET(un);
1915 * If the submirror_index is non-zero, we are continuing a resync
1916 * so restart resync from last submirror marked as being resynced.
1918 if (RS_SMI(un->un_rs_type) != 0) {
1919 smi = RS_SMI(un->un_rs_type);
1920 sm = &un->un_sm[smi];
1921 smic = &un->un_smic[smi];
1922 if (!SMS_IS(sm, SMS_ATTACHED_RESYNC)) {
1923 for (smi = 0; smi < NMIRROR; smi++) {
1924 sm = &un->un_sm[smi];
1925 smic = &un->un_smic[smi];
1926 if (SMS_IS(sm, SMS_ATTACHED_RESYNC))
1927 break;
1930 } else {
1931 for (smi = 0; smi < NMIRROR; smi++) {
1932 sm = &un->un_sm[smi];
1933 smic = &un->un_smic[smi];
1934 if (SMS_IS(sm, SMS_ATTACHED_RESYNC))
1935 break;
1938 if (smi == NMIRROR) {
1939 SET_RS_TYPE_NONE(un->un_rs_type);
1940 return;
1944 * If we've only got one component we can fail on a resync write
1945 * if an error is encountered. This stops an unnecessary read of the
1946 * whole mirror on a target write error.
1948 compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
1949 if (compcnt == 1)
1950 flags1 |= MD_RESYNC_FLAG_ERR;
1952 un->c.un_status |= MD_UN_WAR;
1953 sm->sm_flags |= MD_SM_RESYNC_TARGET;
1954 SET_RS_SMI(un->un_rs_type, smi);
1955 md_unit_writerexit(ui);
1957 /* For MN sets, resync NOTIFY is done when processing resync messages */
1958 if (!MD_MNSET_SETNO(setno)) {
1959 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
1960 SVM_TAG_METADEVICE, setno, MD_SID(un));
1962 un = (mm_unit_t *)md_unit_readerlock(ui);
1964 un->un_rs_dropped_lock = 1;
1966 /* check to see if we've been asked to terminate */
1967 if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)), MD_READER_HELD)) {
1968 if (un->c.un_status & MD_UN_RESYNC_CANCEL)
1969 broke_out = RESYNC_ERR;
1972 * Check that we are still performing the same submirror
1973 * resync. If not, another node must have completed it
1974 * so we have no more work to do.
1976 if (RS_TYPE(un->un_rs_type) != MD_RS_SUBMIRROR) {
1977 md_unit_readerexit(ui);
1978 (void) md_unit_writerlock(ui);
1979 return;
1982 /* if > 1TB mirror, increase percent done granularity */
1983 if (un->c.un_total_blocks > MD_MAX_BLKS_FOR_SMALL_DEVS)
1984 chunk = un->c.un_total_blocks / 1000;
1985 else
1986 chunk = un->c.un_total_blocks / 100;
1987 if (chunk == 0)
1988 chunk = un->c.un_total_blocks;
1990 * If a MN set, round the chunk size up to a multiple of
1991 * MD_DEF_RESYNC_BLK_SZ
1993 if (MD_MNSET_SETNO(setno)) {
1994 chunk = ((chunk + MD_DEF_RESYNC_BLK_SZ)/MD_DEF_RESYNC_BLK_SZ)
1995 * MD_DEF_RESYNC_BLK_SZ;
1996 if (chunk > un->c.un_total_blocks)
1997 chunk = un->c.un_total_blocks;
2000 * Handle restartable resyncs that continue from where the previous
2001 * resync left off. The new resync range is from un_rs_resync_done ..
2002 * un_rs_resync_2_do
2004 curblk = 0;
2005 if (un->un_rs_resync_done == 0) {
2006 un->un_rs_resync_2_do = un->c.un_total_blocks;
2007 } else {
2008 curblk = un->un_rs_resync_done;
2010 while ((curblk != un->c.un_total_blocks) && (broke_out != RESYNC_ERR)) {
2011 diskaddr_t rs_done;
2013 rs_done = un->un_rs_resync_done;
2014 err = resync_read_blk_range(un, curblk, curblk + chunk,
2015 MD_READER_HELD, MD_SEND_MESS_XMIT | flags1);
2016 flags1 = (compcnt == 1 ? MD_RESYNC_FLAG_ERR : 0);
2018 /* resync_read_blk_range releases/grabs a new lock */
2019 un = (mm_unit_t *)MD_UNIT(mnum);
2021 if (err) {
2022 broke_out = RESYNC_ERR;
2023 break;
2027 * If we are no longer executing a submirror resync, return
2028 * as another node has completed the submirror resync.
2030 if (RS_TYPE(un->un_rs_type) != MD_RS_SUBMIRROR) {
2031 md_unit_readerexit(ui);
2032 (void) md_unit_writerlock(ui);
2033 return;
2036 * If resync_done has changed, we must have blocked
2037 * in resync_read_blk_range while another node
2038 * continued with the resync so restart from resync_done.
2040 if (rs_done != un->un_rs_resync_done) {
2041 curblk = un->un_rs_resync_done;
2042 } else {
2043 curblk += chunk;
2044 un->un_rs_resync_done = curblk;
2047 if ((curblk + chunk) > un->c.un_total_blocks)
2048 chunk = un->c.un_total_blocks - curblk;
2049 for (i = 0, cnt = 0; i < NMIRROR; i++)
2050 if (SUBMIRROR_IS_WRITEABLE(un, i) &&
2051 !SMS_BY_INDEX_IS(un, i, SMS_ALL_ERRED) &&
2052 (un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET))
2053 cnt++;
2054 if (cnt == 0) {
2055 broke_out = RESYNC_ERR;
2056 break;
2059 /* Check to see if we've completed the resync cleanly */
2060 if (un->un_rs_thread_flags & MD_RI_SHUTDOWN)
2061 break;
2063 md_unit_readerexit(ui);
2064 un = (mm_unit_t *)md_unit_writerlock(ui);
2067 * If MN set, send message to all nodes to indicate resync
2068 * phase is complete. The processing of the message will update the
2069 * mirror state
2071 if (MD_MNSET_SETNO(setno)) {
2072 send_mn_resync_done_message(un, broke_out);
2073 } else {
2074 sm->sm_flags &= ~MD_SM_RESYNC_TARGET;
2075 if (err) {
2076 mirror_set_sm_state(sm, smic, SMS_ATTACHED, 1);
2077 } else {
2078 mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
2080 un->c.un_status &= ~MD_UN_WAR;
2081 mirror_commit(un, SMI2BIT(smi), 0);
2084 /* For MN sets, resync NOTIFY is done when processing resync messages */
2085 if (!MD_MNSET_SETNO(setno)) {
2086 if (broke_out) {
2087 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
2088 SVM_TAG_METADEVICE, setno, MD_SID(un));
2089 } else {
2090 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
2091 SVM_TAG_METADEVICE, setno, MD_SID(un));
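/*
 * Illustrative sketch (not part of the driver): the progress chunk used by
 * submirror_resync() above, condensed into one helper.  It gives roughly 1%
 * granularity (0.1% for mirrors larger than MD_MAX_BLKS_FOR_SMALL_DEVS) and,
 * for multi-node sets, rounds the chunk up to a multiple of
 * MD_DEF_RESYNC_BLK_SZ, capped at the mirror size.
 */
static diskaddr_t
resync_chunk_sketch(diskaddr_t total_blocks, int is_mn_set)
{
	diskaddr_t chunk;

	chunk = (total_blocks > MD_MAX_BLKS_FOR_SMALL_DEVS) ?
	    (total_blocks / 1000) : (total_blocks / 100);
	if (chunk == 0)
		chunk = total_blocks;
	if (is_mn_set) {
		chunk = ((chunk + MD_DEF_RESYNC_BLK_SZ) /
		    MD_DEF_RESYNC_BLK_SZ) * MD_DEF_RESYNC_BLK_SZ;
		if (chunk > total_blocks)
			chunk = total_blocks;
	}
	return (chunk);
}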
2096 static void
2097 component_resync(mm_unit_t *un)
2099 mm_submirror_t *sm;
2100 mm_submirror_ic_t *smic;
2101 int ci;
2102 int i;
2103 int compcnt;
2106 * Handle the case where we are picking up a partially complete
2107 * component resync. In this case un_rs_type contains the submirror
2108 * and component index of where we should restart the resync.
2110 while (un->un_rs_type != MD_RS_COMPONENT) {
2111 i = RS_SMI(un->un_rs_type);
2112 ci = RS_CI(un->un_rs_type);
2113 check_comp_4_resync(un, i, ci);
2114 if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)),
2115 MD_WRITER_HELD))
2116 return;
2118 * If we have no current resync, continue to scan submirrors and
2119 * components. If the resync has moved on to another component,
2120 * restart it and if the resync is no longer a component
2121 * resync, just exit
2123 if (RS_TYPE(un->un_rs_type) == MD_RS_NONE)
2124 break;
2125 if (RS_TYPE(un->un_rs_type) != MD_RS_COMPONENT)
2126 return;
2128 /* Now continue scanning _all_ submirrors and components */
2129 for (i = 0; i < NMIRROR; i++) {
2130 sm = &un->un_sm[i];
2131 smic = &un->un_smic[i];
2132 if (!SMS_IS(sm, SMS_RUNNING | SMS_LIMPING))
2133 continue;
2134 compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
2135 for (ci = 0; ci < compcnt; ci++) {
2136 SET_RS_SMI(un->un_rs_type, i);
2137 SET_RS_CI(un->un_rs_type, ci);
2138 SET_RS_TYPE(un->un_rs_type, MD_RS_COMPONENT);
2139 check_comp_4_resync(un, i, ci);
2140 /* Bail out if we've been asked to abort/shutdown */
2141 if (resync_kill_pending(un, MDI_UNIT(MD_SID(un)),
2142 MD_WRITER_HELD))
2143 return;
2145 * Now check if another node has continued with the
2146 * resync. If we are no longer in component resync,
2147 * exit; otherwise update to the current component - 1
2148 * so that the next call of check_comp_4_resync() will
2149 * resync the current component.
2151 if ((RS_TYPE(un->un_rs_type) != MD_RS_NONE) &&
2152 (RS_TYPE(un->un_rs_type) != MD_RS_COMPONENT))
2153 return;
2154 else {
2155 if (RS_SMI(un->un_rs_type) != i) {
2156 i = RS_SMI(un->un_rs_type);
2157 ci = RS_CI(un->un_rs_type) - 1;
2158 } else if (RS_CI(un->un_rs_type) != ci)
2159 ci = RS_CI(un->un_rs_type) - 1;
2165 static void
2166 reset_comp_flags(mm_unit_t *un)
2168 mm_submirror_t *sm;
2169 mm_submirror_ic_t *smic;
2170 md_m_shared_t *shared;
2171 int ci;
2172 int i;
2173 int compcnt;
2175 for (i = 0; i < NMIRROR; i++) {
2176 sm = &un->un_sm[i];
2177 smic = &un->un_smic[i];
2178 if (!SMS_IS(sm, SMS_INUSE))
2179 continue;
2180 compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
2181 for (ci = 0; ci < compcnt; ci++) {
2182 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2183 (sm->sm_dev, sm, ci);
2184 shared->ms_flags &= ~MDM_S_RS_TRIED;
2190 * resync_progress_thread:
2191 * ----------------------
2192 * Thread started on first resync of a unit which simply blocks until woken up
2193 * by a cv_signal, and then updates the mddb for the mirror unit record. This
2194 * saves the resync progress information (un_rs_resync_done, un_rs_resync_2_do)
2195 * so that an aborted resync can be continued after an intervening reboot.
2197 static void
2198 resync_progress_thread(minor_t mnum)
2200 mm_unit_t *un = MD_UNIT(mnum);
2201 mdi_unit_t *ui = MDI_UNIT(mnum);
2202 set_t setno = MD_MIN2SET(mnum);
2204 while (un->c.un_status & MD_UN_RESYNC_ACTIVE) {
2205 mutex_enter(&un->un_rs_progress_mx);
2206 cv_wait(&un->un_rs_progress_cv, &un->un_rs_progress_mx);
2207 mutex_exit(&un->un_rs_progress_mx);
2208 if (un->un_rs_progress_flags & MD_RI_KILL)
2209 break;
2212 * Commit mirror unit if we're the Master node in a multi-node
2213 * environment
2215 if (MD_MNSET_SETNO(setno) && md_set[setno].s_am_i_master) {
2216 (void) md_unit_readerlock(ui);
2217 mirror_commit(un, NO_SUBMIRRORS, 0);
2218 md_unit_readerexit(ui);
2221 thread_exit();
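/*
 * Illustrative sketch (not part of the driver): the fields committed by the
 * progress thread above, un_rs_resync_done and un_rs_resync_2_do, are enough
 * to derive how far a restartable resync has progressed, e.g.:
 */
static int
resync_percent_done_sketch(mm_unit_t *un)
{
	/* un_rs_resync_2_do stays 0 until the resync has been set up */
	if (un->un_rs_resync_2_do == 0)
		return (0);
	return ((int)((un->un_rs_resync_done * 100) / un->un_rs_resync_2_do));
}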
2225 * resync_progress:
2226 * ---------------
2227 * Timeout handler for updating the progress of the resync thread.
2228 * Simply wake up the resync progress daemon which will then mirror_commit() the
2229 * unit structure to the mddb. This snapshots the current progress of the resync
2231 static void
2232 resync_progress(void *arg)
2234 mm_unit_t *un = (mm_unit_t *)arg;
2235 mdi_unit_t *ui = MDI_UNIT(MD_SID(un));
2236 uint_t active;
2238 mutex_enter(&un->un_rs_progress_mx);
2239 cv_signal(&un->un_rs_progress_cv);
2240 mutex_exit(&un->un_rs_progress_mx);
2242 /* schedule the next timeout if the resync is still marked active */
2243 (void) md_unit_readerlock(ui);
2244 active = un->c.un_status & MD_UN_RESYNC_ACTIVE ? 1 : 0;
2245 md_unit_readerexit(ui);
2246 if (active) {
2247 un->un_rs_resync_to_id = timeout(resync_progress, un,
2248 (clock_t)(drv_usectohz(60000000) *
2249 md_mirror_resync_update_intvl));
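/*
 * Illustrative sketch (not part of the driver): the rearm interval used by
 * the timeout above.  md_mirror_resync_update_intvl is expressed in minutes
 * and drv_usectohz() converts microseconds to clock ticks, so one minute is
 * drv_usectohz(60000000) ticks.
 */
static clock_t
resync_update_ticks_sketch(int intvl_minutes)
{
	return ((clock_t)(drv_usectohz(60000000) * intvl_minutes));
}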
2254 * resync_unit:
2255 * -----------
2256 * Resync thread which drives all forms of resync (optimized, component,
2257 * submirror). Must handle thread suspension and kill to allow multi-node
2258 * resync to run without undue ownership changes.
2260 * For a MN set, the resync mechanism is as follows:
2262 * When a resync is started, either via metattach, metaonline, metareplace,
2263 * metasync or by a hotspare kicking in, a message is sent to all nodes, which
2264 * calls mirror_resync_thread. If there is currently no mirror owner, the
2265 * master node sends a CHOOSE_OWNER message to the handler on the master. This
2266 * chooses a mirror owner and sends a CHANGE_OWNER message requesting the
2267 * selected node to become the owner.
2268 * If this node is not the owner it sets itself to block in resync_kill_pending
2269 * and if there is no owner all nodes will block until the chosen owner is
2270 * selected, in which case it will unblock itself. So, on entry to this
2271 * function only one node will continue past resync_kill_pending().
2272 * Once the resync thread is started, it basically cycles through the optimized,
2273 * component and submirror resyncs until there is no more work to do.
2275 * For an ABR mirror, once a mirror owner is chosen it will complete the resync
2276 * unless the node dies, in which case a new owner will be chosen and it will
2277 * have to complete the resync from the point at which the previous owner died.
2278 * To do this we broadcast a RESYNC_NEXT message before each region to be
2279 * resynced and this message contains the address and length of the region
2280 * being resynced and the current progress through the resync. The size of
2281 * this region is MD_DEF_RESYNC_BLK_SZ blocks. It is larger than the resync
2282 * block size to limit the amount of inter node traffic. The RESYNC_NEXT
2283 * message also indicates to all other nodes that all writes to this block
2284 * must be blocked until the next RESYNC_NEXT message is received. This ensures
2285 * that no node can write to a block that is being resynced. For all MN
2286 * mirrors we also block the whole resync region on the resync owner node so
2287 * that all writes to the resync region are blocked on all nodes. There is a
2288 * difference here between a MN set and a regular set in that for a MN set
2289 * we protect the mirror from writes to the current resync block by blocking
2290 * a larger region. For a regular set we just block writes to the current
2291 * resync block.
2293 * For a non-ABR mirror the same RESYNC_NEXT message is sent with an
2294 * additional purpose. In this case, there is only one mirror owner at a time
2295 * and rather than continually switching ownership between the chosen mirror
2296 * owner and the node that is writing to the mirror, we move the resync to the
2297 * mirror owner. When we switch ownership, we block the old owner and unblock
2298 * the resync thread on the new owner. To enable the new owner to continue the
2299 * resync, all nodes need to have the latest resync status. Then, following each
2300 * resync write, we check to see if the resync state has changed and if it
2301 * has this must be because we have lost ownership to another node(s) for a
2302 * period and then have become owner again later in the resync process. If we
2303 * are still dealing with the same resync, we just adjust addresses and counts
2304 * and then continue. If the resync has moved on to a different type, for
2305 * example from an optimized to a submirror resync, we move on to process the
2306 * resync described by rs_type and continue from the position described by
2307 * resync_done and resync_startbl.
2309 * Note that for non-ABR mirrors it is possible for a write to be made on a
2310 * non resync-owner node without a change of ownership. This is the case when
2311 * the mirror has a soft part created on it and a write in ABR mode is made
2312 * to that soft part. Therefore we still need to block writes to the resync
2313 * region on all nodes.
2315 * Sending the latest resync state to all nodes also enables them to continue
2316 * a resync in the event that the mirror owner dies. If a mirror owner for
2317 * a non-ABR mirror has died, there will be dirty resync regions. Therefore,
2318 * regardless of whether another type of resync was in progress, we must first
2319 * do an optimized resync to clean up the dirty regions before continuing
2320 * with the interrupted resync.
2322 * The resync status is held in the unit structure
2323 * On disk
2324 * un_rs_resync_done The number of contiguous resync blocks done so far
2325 * un_rs_resync_2_do The total number of contiguous resync blocks
2326 * un_rs_type The resync type (inc submirror and component numbers)
2327 * In core
2328 * un_resync_startbl The address of the current resync block being processed
2330 * In the event that the whole cluster fails we need to just use
2331 * un_rs_resync_done to restart the resync and to ensure that this is
2332 * periodically written to disk, we have a thread which writes the record
2333 * to disk every 5 minutes. As the granularity of un_rs_resync_done is
2334 * usually coarse (for an optimized resync 1001 is the max value) there is
2335 * little point in writing this more frequently.
2337 static void
2338 resync_unit(minor_t mnum)
2340 mdi_unit_t *ui;
2341 mm_unit_t *un;
2342 md_error_t mde = mdnullerror;
2343 int mn_resync = 0;
2344 int resync_finish = 0;
2345 set_t setno = MD_MIN2SET(mnum);
2346 uint_t old_rs_type = MD_RS_NONE;
2347 uint_t old_rs_done = 0, old_rs_2_do = 0;
2348 uint_t old_rs_startbl = 0;
2349 int block_resync = 1;
2350 char cpr_name[23]; /* Unique CPR name */
2351 int rs_copysize;
2352 char *rs_buffer;
2353 int nretries = 0;
2355 resync_restart:
2356 #ifdef DEBUG
2357 if (mirror_debug_flag)
2358 printf("Resync started (mnum = %x)\n", mnum);
2359 #endif
2361 * increment the mirror resync count
2363 mutex_enter(&md_cpr_resync.md_resync_mutex);
2364 md_cpr_resync.md_mirror_resync++;
2365 mutex_exit(&md_cpr_resync.md_resync_mutex);
2367 ui = MDI_UNIT(mnum);
2368 un = MD_UNIT(mnum);
2370 rs_copysize = un->un_rs_copysize;
2371 if (rs_copysize == 0) {
2373 * Don't allow buffer size to fall outside the
2374 * range 0 < bufsize <= md_max_xfer_bufsz.
2376 if (md_resync_bufsz <= 0)
2377 md_resync_bufsz = MD_DEF_RESYNC_BUF_SIZE;
2378 rs_copysize = MIN(md_resync_bufsz, md_max_xfer_bufsz);
2380 rs_buffer = kmem_zalloc(dbtob(rs_copysize), KM_SLEEP);
2381 un = md_unit_writerlock(ui);
2382 un->un_rs_copysize = rs_copysize;
2383 un->un_rs_buffer = rs_buffer;
2385 if (MD_MNSET_SETNO(setno)) {
2387 * Register this resync thread with the CPR mechanism. This
2388 * allows us to detect when the system is suspended and so
2389 * keep track of the RPC failure condition.
2391 (void) snprintf(cpr_name, sizeof (cpr_name),
2392 "mirror_resync%x", mnum);
2393 CALLB_CPR_INIT(&un->un_rs_cprinfo, &un->un_rs_cpr_mx,
2394 callb_md_mrs_cpr, cpr_name);
2396 if (ui->ui_tstate & MD_RESYNC_NOT_DONE) {
2398 * If this is the first resync following the initial
2399 * snarf (MD_RESYNC_NOT_DONE still set) and we've
2400 * been started outside a reconfig step (e.g. by being
2401 * added to an existing set) we need to query the
2402 * existing submirror state for this mirror.
2403 * The set_status flags will have MD_MN_SET_MIR_STATE_RC
2404 * set if we've been through a step4 reconfig, so only
2405 * query the master if this isn't (yet) set. In this
2406 * case we must continue the resync thread as there is
2407 * not guaranteed to be a currently running resync on
2408 * any of the other nodes. Worst case is that we will
2409 * initiate an ownership change to this node and then
2410 * find that there is no resync to perform. However, we
2411 * will then have correct status across the cluster.
2413 if (!md_set[setno].s_am_i_master) {
2414 if (!(md_get_setstatus(setno) &
2415 MD_SET_MN_MIR_STATE_RC)) {
2416 mirror_get_status(un, NULL);
2417 block_resync = 0;
2418 #ifdef DEBUG
2419 if (mirror_debug_flag) {
2420 mm_submirror_t *sm;
2421 int i;
2422 for (i = 0; i < NMIRROR; i++) {
2423 sm = &un->un_sm[i];
2424 printf(
2425 "sm[%d] state=%4x"
2426 " flags=%4x\n", i,
2427 sm->sm_state,
2428 sm->sm_flags);
2431 #endif
2434 ui->ui_tstate &= ~MD_RESYNC_NOT_DONE;
2437 * For MN set, if we have an owner, then start the resync on it.
2438 * If there is no owner the master must send a message to
2439 * choose the owner. This message will contain the current
2440 * resync count and it will only be sent to the master, where
2441 * the resync count will be used to choose the next node to
2442 * perform a resync, by cycling through the nodes in the set.
2443 * The message handler will then send a CHANGE_OWNER message to
2444 * all nodes, and on receipt of that message, the chosen owner
2445 * will issue a SET_OWNER ioctl to become the owner. This ioctl
2446 * will be requested to spawn a thread to issue the
2447 * REQUEST_OWNER message to become the owner which avoids the
2448 * need for concurrent ioctl requests.
2449 * After sending the message, we will block waiting for one
2450 * of the nodes to become the owner and start the resync
2452 if (MD_MN_NO_MIRROR_OWNER(un)) {
2454 * There is no owner, block and then the master will
2455 * choose the owner. Only perform this if 'block_resync'
2456 * is set.
2458 if (block_resync) {
2459 mutex_enter(&un->un_rs_thread_mx);
2460 un->un_rs_thread_flags |= MD_RI_BLOCK_OWNER;
2461 mutex_exit(&un->un_rs_thread_mx);
2463 if (md_set[setno].s_am_i_master) {
2464 md_unit_writerexit(ui);
2465 (void) mirror_choose_owner(un, NULL);
2466 (void) md_unit_writerlock(ui);
2468 } else {
2469 /* There is an owner, block if we are not it */
2470 if (!MD_MN_MIRROR_OWNER(un)) {
2471 mutex_enter(&un->un_rs_thread_mx);
2472 un->un_rs_thread_flags |= MD_RI_BLOCK_OWNER;
2473 mutex_exit(&un->un_rs_thread_mx);
2478 * Start a timeout chain to update the resync progress to the mddb.
2479 * This will run every md_mirror_resync_update_intvl minutes and allows
2480 * a resync to be continued over a reboot.
2482 ASSERT(un->un_rs_resync_to_id == 0);
2483 un->un_rs_resync_to_id = timeout(resync_progress, un,
2484 (clock_t)(drv_usectohz(60000000) * md_mirror_resync_update_intvl));
2487 * Handle resync restart from the last logged position. The contents
2488 * of un_rs_resync_2_do and un_rs_resync_done are dependent on the
2489 * type of resync that was in progress.
2491 if (MD_MNSET_SETNO(setno)) {
2492 switch ((uint_t)RS_TYPE(un->un_rs_type)) {
2493 case MD_RS_NONE:
2494 case MD_RS_OPTIMIZED:
2495 case MD_RS_COMPONENT:
2496 case MD_RS_SUBMIRROR:
2497 case MD_RS_ABR:
2498 break;
2499 default:
2500 un->un_rs_type = MD_RS_NONE;
2502 /* Allocate a resync message, if required */
2503 if (un->un_rs_msg == NULL) {
2504 un->un_rs_msg = (md_mn_msg_resync_t *)kmem_zalloc(
2505 sizeof (md_mn_msg_resync_t), KM_SLEEP);
2507 mn_resync = 1;
2510 /* Check to see if we've been requested to block/kill */
2511 if (resync_kill_pending(un, ui, MD_WRITER_HELD)) {
2512 goto bail_out;
2515 do {
2516 un->un_rs_dropped_lock = 0;
2518 * Always perform an optimized resync first as this will bring
2519 * the mirror into an available state in the shortest time.
2520 * If we are resuming an interrupted resync, other than an
2521 * optimized resync, we save the type and amount done so that
2522 * we can resume the appropriate resync after the optimized
2523 * resync has completed.
2525 if ((RS_TYPE(un->un_rs_type) != MD_RS_NONE) &&
2526 (RS_TYPE(un->un_rs_type) != MD_RS_OPTIMIZED)) {
2527 old_rs_type = un->un_rs_type;
2528 old_rs_done = un->un_rs_resync_done;
2529 old_rs_2_do = un->un_rs_resync_2_do;
2530 old_rs_startbl = un->un_resync_startbl;
2532 SET_RS_TYPE(un->un_rs_type, MD_RS_OPTIMIZED);
2534 * If we are continuing a resync that is not an
2535 * OPTIMIZED one, then we start from the beginning when
2536 * doing this optimized resync
2538 if (RS_TYPE(old_rs_type) != MD_RS_OPTIMIZED) {
2539 un->un_rs_resync_done = 0;
2540 un->un_rs_resync_2_do = 0;
2541 un->un_resync_startbl = 0;
2543 optimized_resync(un);
2544 /* Check to see if we've been requested to block/kill */
2545 if (resync_kill_pending(un, ui, MD_WRITER_HELD)) {
2546 goto bail_out;
2548 un = (mm_unit_t *)MD_UNIT(mnum);
2550 * If another node has moved the resync on, we must
2551 * restart the correct resync
2553 if (mn_resync &&
2554 (RS_TYPE(un->un_rs_type) != MD_RS_NONE)) {
2555 old_rs_type = un->un_rs_type;
2556 old_rs_done = un->un_rs_resync_done;
2557 old_rs_2_do = un->un_rs_resync_2_do;
2558 old_rs_startbl = un->un_resync_startbl;
2562 * Restore previous resync progress or move onto a
2563 * component resync.
2565 if (RS_TYPE(old_rs_type) != MD_RS_NONE) {
2566 un->un_rs_type = old_rs_type;
2567 un->un_rs_resync_done = old_rs_done;
2568 un->un_rs_resync_2_do = old_rs_2_do;
2569 un->un_resync_startbl = old_rs_startbl;
2570 } else {
2571 un->un_rs_type = MD_RS_COMPONENT;
2572 un->un_rs_resync_done = 0;
2573 un->un_rs_resync_2_do = 0;
2574 un->un_resync_startbl = 0;
2577 if (RS_TYPE(un->un_rs_type) == MD_RS_COMPONENT) {
2578 component_resync(un);
2579 /* Check to see if we've been requested to block/kill */
2580 if (resync_kill_pending(un, ui, MD_WRITER_HELD)) {
2581 goto bail_out;
2583 un = (mm_unit_t *)MD_UNIT(mnum);
2585 * If we have moved on from a component resync, another
2586 * node must have completed it and started a submirror
2587 * resync, so leave the resync state alone. For non
2588 * multi-node sets we move onto the submirror resync.
2590 if (mn_resync) {
2591 if (RS_TYPE(un->un_rs_type) == MD_RS_NONE) {
2592 un->un_rs_type = MD_RS_SUBMIRROR;
2593 un->un_rs_resync_done =
2594 un->un_rs_resync_2_do = 0;
2595 un->un_resync_startbl = 0;
2597 } else {
2598 un->un_rs_type = MD_RS_SUBMIRROR;
2599 un->un_rs_resync_done = 0;
2600 un->un_rs_resync_2_do = 0;
2601 un->un_resync_startbl = 0;
2604 if (RS_TYPE(un->un_rs_type) == MD_RS_SUBMIRROR) {
2605 submirror_resync(un);
2606 /* Check to see if we've been requested to block/kill */
2607 if (resync_kill_pending(un, ui, MD_WRITER_HELD)) {
2608 goto bail_out;
2610 un = (mm_unit_t *)MD_UNIT(mnum);
2612 * If we have moved on from a submirror resync, another
2613 * node must have completed it and started a different
2614 * resync, so leave the resync state alone
2616 if (mn_resync) {
2617 if (RS_TYPE(un->un_rs_type) == MD_RS_NONE) {
2618 un->un_rs_resync_done =
2619 un->un_rs_resync_2_do = 0;
2620 un->un_resync_startbl = 0;
2622 } else {
2623 /* If non-MN mirror, reinitialize state */
2624 un->un_rs_type = MD_RS_NONE;
2625 un->un_rs_resync_done = 0;
2626 un->un_rs_resync_2_do = 0;
2627 un->un_resync_startbl = 0;
2630 } while (un->un_rs_dropped_lock);
2631 mutex_enter(&un->un_rs_thread_mx);
2632 un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
2633 mutex_exit(&un->un_rs_thread_mx);
2635 resync_finish = 1;
2636 bail_out:
2637 #ifdef DEBUG
2638 if (mirror_debug_flag)
2639 printf("Resync stopped (mnum = %x), resync_finish = %d\n",
2640 mnum, resync_finish);
2641 #endif
2642 kmem_free(un->un_rs_buffer, dbtob(un->un_rs_copysize));
2644 mutex_enter(&un->un_rs_progress_mx);
2645 un->un_rs_progress_flags |= MD_RI_KILL;
2646 cv_signal(&un->un_rs_progress_cv);
2647 mutex_exit(&un->un_rs_progress_mx);
2650 * For MN Set, send a RESYNC_FINISH if this node completed the resync.
2651 * There is no need to grow unit here, it will be done in the
2652 * handler for the RESYNC_FINISH message together with resetting
2653 * MD_UN_RESYNC_ACTIVE.
2655 if (mn_resync) {
2656 if (resync_finish) {
2658 * Normal resync completion. Issue a RESYNC_FINISH
2659 * message if we're part of a multi-node set.
2661 md_mn_kresult_t *kres;
2662 md_mn_msg_resync_t *rmsg;
2663 int rval;
2665 rmsg = (md_mn_msg_resync_t *)un->un_rs_msg;
2666 md_unit_writerexit(ui);
2668 rmsg->msg_resync_mnum = mnum;
2669 rmsg->msg_resync_type = 0;
2670 rmsg->msg_resync_done = 0;
2671 rmsg->msg_resync_2_do = 0;
2672 rmsg->msg_originator = md_mn_mynode_id;
2674 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
2676 smrf_msg:
2677 mutex_enter(&un->un_rs_cpr_mx);
2678 CALLB_CPR_SAFE_BEGIN(&un->un_rs_cprinfo);
2680 rval = mdmn_ksend_message(setno,
2681 MD_MN_MSG_RESYNC_FINISH, MD_MSGF_NO_LOG, 0,
2682 (char *)rmsg, sizeof (md_mn_msg_resync_t), kres);
2684 CALLB_CPR_SAFE_END(&un->un_rs_cprinfo,
2685 &un->un_rs_cpr_mx);
2686 mutex_exit(&un->un_rs_cpr_mx);
2688 if (!MDMN_KSEND_MSG_OK(rval, kres)) {
2689 mdmn_ksend_show_error(rval, kres,
2690 "RESYNC_FINISH");
2691 /* If we're shutting down, pause things here. */
2692 if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
2693 while (!md_mn_is_commd_present()) {
2694 delay(md_hz);
2697 * commd is now available again. Retry
2698 * the message once. If this fails we
2699 * panic as the system is in an
2700 * unexpected state.
2702 if (nretries++ == 0)
2703 goto smrf_msg;
2705 cmn_err(CE_PANIC,
2706 "ksend_message failure: RESYNC_FINISH");
2708 kmem_free(kres, sizeof (md_mn_kresult_t));
2709 (void) md_unit_writerlock(ui);
2712 * If the resync has been cancelled, clear flags, reset owner
2713 * for ABR mirror and release the resync region parent
2714 * structure.
2716 if (un->c.un_status & MD_UN_RESYNC_CANCEL) {
2717 md_mps_t *ps;
2719 if (ui->ui_tstate & MD_ABR_CAP) {
2720 /* Resync finished, if ABR set owner to NULL */
2721 mutex_enter(&un->un_owner_mx);
2722 un->un_mirror_owner = 0;
2723 mutex_exit(&un->un_owner_mx);
2726 un->c.un_status &= ~(MD_UN_RESYNC_CANCEL |
2727 MD_UN_RESYNC_ACTIVE);
2728 ps = un->un_rs_prev_overlap;
2729 if (ps != NULL) {
2730 /* Remove previous overlap resync region */
2731 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
2732 mirror_overlap_tree_remove(ps);
2734 * Release the overlap range reference
2736 un->un_rs_prev_overlap = NULL;
2737 kmem_cache_free(mirror_parent_cache,
2738 ps);
2743 * Release resync message buffer. This will be reallocated on
2744 * the next invocation of the resync_unit thread.
2746 if (un->un_rs_msg) {
2747 kmem_free(un->un_rs_msg, sizeof (md_mn_msg_resync_t));
2748 un->un_rs_msg = NULL;
2750 } else {
2751 /* For non-MN sets deal with any pending grows */
2752 un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
2753 if (un->c.un_status & MD_UN_GROW_PENDING) {
2754 if ((mirror_grow_unit(un, &mde) != 0) ||
2755 (! mdismderror(&mde, MDE_GROW_DELAYED))) {
2756 un->c.un_status &= ~MD_UN_GROW_PENDING;
2761 reset_comp_flags(un);
2762 un->un_resync_completed = 0;
2763 mirror_commit(un, NO_SUBMIRRORS, 0);
2764 md_unit_writerexit(ui);
2767 * Stop the resync progress thread.
2769 if (un->un_rs_resync_to_id != 0) {
2770 (void) untimeout(un->un_rs_resync_to_id);
2771 un->un_rs_resync_to_id = 0;
2775 * Calling mirror_internal_close() makes further reference to un / ui
2776 * dangerous. If we are the only consumer of the mirror it is possible
2777 * for a metaclear to be processed after completion of the m_i_c()
2778 * routine. As we need to handle the case where another resync has been
2779 * scheduled for the mirror, we raise the open count on the device
2780 * which protects against the close / metaclear / lock => panic scenario
2782 (void) md_unit_incopen(MD_SID(un), FREAD|FWRITE, OTYP_LYR);
2783 (void) mirror_internal_close(MD_SID(un), OTYP_LYR, 0, (IOLOCK *)NULL);
2786 * decrement the mirror resync count
2788 mutex_enter(&md_cpr_resync.md_resync_mutex);
2789 md_cpr_resync.md_mirror_resync--;
2790 mutex_exit(&md_cpr_resync.md_resync_mutex);
2793 * Remove the thread reference as we're about to exit. This allows a
2794 * subsequent mirror_resync_unit() to start a new thread.
2795 * If RESYNC_ACTIVE is set, mirror_resync_unit() must have been
2796 * called to start a new resync, so reopen the mirror and go back to
2797 * the start.
2799 (void) md_unit_writerlock(ui);
2800 mutex_enter(&un->un_rs_thread_mx);
2801 un->un_rs_thread_flags &= ~(MD_RI_KILL|MD_RI_SHUTDOWN);
2802 mutex_exit(&un->un_rs_thread_mx);
2803 if (un->c.un_status & MD_UN_RESYNC_ACTIVE) {
2804 md_unit_writerexit(ui);
2805 if (mirror_internal_open(MD_SID(un), (FREAD|FWRITE),
2806 OTYP_LYR, 0, (IOLOCK *)NULL) == 0) {
2807 /* Release the reference grabbed above */
2808 (void) mirror_internal_close(MD_SID(un), OTYP_LYR, 0,
2809 (IOLOCK *)NULL);
2810 goto resync_restart;
2812 (void) md_unit_writerlock(ui);
2813 cmn_err(CE_NOTE,
2814 "Could not open metadevice (%x) for resync\n",
2815 MD_SID(un));
2817 un->un_rs_thread = NULL;
2818 md_unit_writerexit(ui);
2821 * Check for hotspares once we've cleared the resync thread reference.
2822 * If there are any errored units a poke_hotspares() will result in
2823 * a call to mirror_resync_unit() which we need to allow to start.
2825 (void) poke_hotspares();
2828 * Remove this thread from the CPR callback table.
2830 if (mn_resync) {
2831 mutex_enter(&un->un_rs_cpr_mx);
2832 CALLB_CPR_EXIT(&un->un_rs_cprinfo);
2836 * Remove the extra reference to the unit we generated above. After
2837 * this call it is *unsafe* to reference either ui or un as they may
2838 * no longer be allocated.
2840 (void) mirror_internal_close(MD_SID(un), OTYP_LYR, 0, (IOLOCK *)NULL);
2842 thread_exit();
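/*
 * Illustrative sketch (not part of the driver): the main loop of
 * resync_unit() above, reduced to its ordering.  The multi-node ownership
 * handling, kill checks and resync-state save/restore are omitted here; the
 * point is simply that an optimized resync always runs first, followed by
 * any component resync and then any submirror resync, and the whole
 * sequence repeats while the unit lock was dropped along the way.
 */
static void
resync_order_sketch(mm_unit_t *un)
{
	do {
		un->un_rs_dropped_lock = 0;
		optimized_resync(un);
		if (RS_TYPE(un->un_rs_type) == MD_RS_COMPONENT)
			component_resync(un);
		if (RS_TYPE(un->un_rs_type) == MD_RS_SUBMIRROR)
			submirror_resync(un);
	} while (un->un_rs_dropped_lock);
}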
2846 * mirror_resync_unit:
2847 * ------------------
2848 * Start a resync for the given mirror metadevice. Save the resync thread ID in
2849 * un->un_rs_thread for later manipulation.
2851 * Returns:
2852 * 0 Success
2853 * !=0 Error
2855 /*ARGSUSED*/
2857 mirror_resync_unit(
2858 minor_t mnum,
2859 md_resync_ioctl_t *ri,
2860 md_error_t *ep,
2861 IOLOCK *lockp
2864 mdi_unit_t *ui;
2865 mm_unit_t *un;
2866 set_t setno = MD_MIN2SET(mnum);
2868 ui = MDI_UNIT(mnum);
2870 if (md_get_setstatus(setno) & MD_SET_STALE)
2871 return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));
2873 if (mirror_internal_open(mnum, (FREAD|FWRITE), OTYP_LYR, 0, lockp)) {
2874 return (mdmderror(ep, MDE_MIRROR_OPEN_FAILURE, mnum));
2876 if (lockp) {
2877 un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
2878 } else {
2879 un = (mm_unit_t *)md_unit_writerlock(ui);
2883 * Check to see if we're attempting to start a resync while one is
2884 * already running.
2886 if (un->c.un_status & MD_UN_RESYNC_ACTIVE ||
2887 un->un_rs_thread != NULL) {
2889 * Ensure RESYNC_ACTIVE set, it may not be if the resync thread
2890 * is in the process of terminating, setting the flag will
2891 * cause the resync thread to return to the beginning
2893 un->c.un_status |= MD_UN_RESYNC_ACTIVE;
2894 if (lockp) {
2895 md_ioctl_writerexit(lockp);
2896 } else {
2897 md_unit_writerexit(ui);
2899 (void) mirror_internal_close(mnum, OTYP_LYR, 0, lockp);
2900 return (0);
2902 un->c.un_status |= MD_UN_RESYNC_ACTIVE;
2903 un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
2904 if ((ri) && (ri->ri_copysize > 0) &&
2905 (ri->ri_copysize <= md_max_xfer_bufsz))
2906 un->un_rs_copysize = ri->ri_copysize;
2907 else
2908 un->un_rs_copysize = 0;
2910 /* Start the resync progress thread off */
2911 un->un_rs_progress_flags = 0;
2912 (void) thread_create(NULL, 0, resync_progress_thread,
2913 (caddr_t)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);
2916 * We have to store the thread ID in the unit structure so do not
2917 * drop writerlock until the thread is active. This means resync_unit
2918 * may spin on its first md_unit_readerlock(), but deadlock won't occur.
2920 mutex_enter(&un->un_rs_thread_mx);
2921 un->un_rs_thread_flags &= ~(MD_RI_KILL|MD_RI_SHUTDOWN);
2922 mutex_exit(&un->un_rs_thread_mx);
2923 un->un_rs_thread = thread_create(NULL, 0, resync_unit,
2924 (caddr_t)(uintptr_t)mnum, 0, &p0, TS_RUN, 60);
2925 if (un->un_rs_thread == (kthread_id_t)NULL) {
2926 un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
2927 if (lockp) {
2928 md_ioctl_writerexit(lockp);
2929 } else {
2930 md_unit_writerexit(ui);
2932 (void) mirror_internal_close(mnum, OTYP_LYR, 0, lockp);
2933 return (mdmderror(ep, MDE_MIRROR_THREAD_FAILURE, mnum));
2934 } else {
2935 if (lockp) {
2936 md_ioctl_writerexit(lockp);
2937 } else {
2938 md_unit_writerexit(ui);
2942 return (0);
2946 * mirror_ioctl_resync:
2947 * -------------------
2948 * Called as a result of an MD_IOCSETSYNC ioctl. Either start, block, unblock
2949 * or kill the resync thread associated with the specified unit.
2950 * Can return with locks held since mdioctl will free any locks
2951 * that are marked in lock->l_flags.
2953 * Returns:
2954 * 0 Success
2955 * !=0 Error Code
2958 mirror_ioctl_resync(
2959 md_resync_ioctl_t *ri,
2960 IOLOCK *lock
2963 minor_t mnum = ri->ri_mnum;
2964 mm_unit_t *un;
2965 uint_t bits;
2966 mm_submirror_t *sm;
2967 mm_submirror_ic_t *smic;
2968 int smi;
2969 kt_did_t tid;
2970 set_t setno = MD_MIN2SET(mnum);
2972 mdclrerror(&ri->mde);
2974 if ((setno >= md_nsets) ||
2975 (MD_MIN2UNIT(mnum) >= md_nunits)) {
2976 return (mdmderror(&ri->mde, MDE_INVAL_UNIT, mnum));
2979 /* RD_LOCK flag grabs the md_ioctl_readerlock */
2980 un = mirror_getun(mnum, &ri->mde, RD_LOCK, lock);
2982 if (un == NULL) {
2983 return (mdmderror(&ri->mde, MDE_UNIT_NOT_SETUP, mnum));
2985 if (un->c.un_type != MD_METAMIRROR) {
2986 return (mdmderror(&ri->mde, MDE_NOT_MM, mnum));
2988 if (un->un_nsm < 2) {
2989 return (0);
2993 * Determine the action to take based on the ri_flags field:
2994 * MD_RI_BLOCK: Block current resync thread
2995 * MD_RI_UNBLOCK: Unblock resync thread
2996 * MD_RI_KILL: Abort resync thread
2997 * MD_RI_RESYNC_FORCE_MNSTART: Directly start resync thread
2998 * without using rpc.mdcommd messages.
2999 * any other: Start resync thread
3001 switch (ri->ri_flags & (MD_RI_BLOCK|MD_RI_UNBLOCK|MD_RI_KILL)) {
3003 case MD_RI_BLOCK:
3004 /* Halt resync thread by setting flag in un_rs_flags */
3005 if (!(un->c.un_status & MD_UN_RESYNC_ACTIVE)) {
3006 return (0);
3008 mutex_enter(&un->un_rs_thread_mx);
3009 un->un_rs_thread_flags |= MD_RI_BLOCK;
3010 mutex_exit(&un->un_rs_thread_mx);
3011 return (0);
3013 case MD_RI_UNBLOCK:
3015 * Restart resync thread by clearing flag in un_rs_flags and
3016 * cv_signal'ing the blocked thread.
3018 if (!(un->c.un_status & MD_UN_RESYNC_ACTIVE)) {
3019 return (0);
3021 mutex_enter(&un->un_rs_thread_mx);
3022 un->un_rs_thread_flags &= ~MD_RI_BLOCK;
3023 cv_signal(&un->un_rs_thread_cv);
3024 mutex_exit(&un->un_rs_thread_mx);
3025 return (0);
3027 case MD_RI_KILL:
3028 /* Abort resync thread. */
3029 if (!(un->c.un_status & MD_UN_RESYNC_ACTIVE)) {
3030 return (0);
3032 mutex_enter(&un->un_rs_thread_mx);
3033 tid = un->un_rs_thread ? (un->un_rs_thread)->t_did : 0;
3034 un->un_rs_thread_flags &= ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
3035 un->un_rs_thread_flags |= MD_RI_KILL;
3036 cv_signal(&un->un_rs_thread_cv);
3037 mutex_exit(&un->un_rs_thread_mx);
3038 if (tid != 0) {
3039 if (!(ri->ri_flags & MD_RI_NO_WAIT)) {
3040 md_ioctl_readerexit(lock);
3041 thread_join(tid);
3042 un->un_rs_thread_flags &= ~MD_RI_KILL;
3043 un->un_rs_thread = NULL;
3044 cmn_err(CE_WARN, "md: %s: Resync cancelled\n",
3045 md_shortname(MD_SID(un)));
3048 return (0);
3051 md_ioctl_readerexit(lock);
3053 bits = 0;
3054 for (smi = 0; smi < NMIRROR; smi++) {
3055 sm = &un->un_sm[smi];
3056 smic = &un->un_smic[smi];
3057 if (!SMS_IS(sm, SMS_ATTACHED))
3058 continue;
3059 mirror_set_sm_state(sm, smic, SMS_ATTACHED_RESYNC, 1);
3060 bits |= SMI2BIT(smi);
3062 if (bits != 0)
3063 mirror_commit(un, bits, 0);
3066 * If we are resyncing a mirror in a MN set and the rpc.mdcommd
3067 * can be used, we do not start the resync at this point.
3068 * Instead, the metasync command that issued the ioctl
3069 * will send a RESYNC_STARTING message to start the resync thread. The
3070 * reason we do it this way is to ensure that the metasync ioctl is
3071 * executed on all nodes before the resync thread is started.
3073 * If a MN set and the MD_RI_RESYNC_FORCE_MNSTART flag is set, then
3074 * don't use rpc.mdcommd, but just start the resync thread. This
3075 * flag is set on a node when it is being added to a diskset
3076 * so that the resync threads are started on the newly added node.
3078 if ((!(MD_MNSET_SETNO(setno))) ||
3079 (ri->ri_flags & MD_RI_RESYNC_FORCE_MNSTART)) {
3080 return (mirror_resync_unit(mnum, ri, &ri->mde, lock));
3081 } else {
3082 return (0);
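/*
 * Illustrative sketch (not part of the driver): the MD_RI_KILL handling
 * above is a cooperative shutdown.  The ioctl marks the thread for death
 * under un_rs_thread_mx, wakes it in case it is blocked, and (unless
 * MD_RI_NO_WAIT was passed) waits for it to call thread_exit().
 */
static void
resync_kill_sketch(mm_unit_t *un)
{
	kt_did_t tid;

	mutex_enter(&un->un_rs_thread_mx);
	tid = un->un_rs_thread ? un->un_rs_thread->t_did : 0;
	un->un_rs_thread_flags |= MD_RI_KILL;
	cv_signal(&un->un_rs_thread_cv);
	mutex_exit(&un->un_rs_thread_mx);
	if (tid != 0)
		thread_join(tid);
}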
3087 mirror_mark_resync_region_non_owner(struct mm_unit *un,
3088 diskaddr_t startblk, diskaddr_t endblk, md_mn_nodeid_t source_node)
3090 int no_change;
3091 size_t start_rr;
3092 size_t current_rr;
3093 size_t end_rr;
3094 md_mn_msg_rr_dirty_t *rr;
3095 md_mn_kresult_t *kres;
3096 set_t setno = MD_UN2SET(un);
3097 int rval;
3098 md_mn_nodeid_t node_idx = source_node - 1;
3099 mdi_unit_t *ui = MDI_UNIT(MD_SID(un));
3100 md_mn_nodeid_t owner_node;
3101 minor_t mnum = MD_SID(un);
3103 if (un->un_nsm < 2)
3104 return (0);
3107 * Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
3108 * not, allocate it and then fill the [start..end] entries.
3109 * Update un_pernode_dirty_sum if we've gone 0->1.
3110 * Update un_dirty_bm if the corresponding entries are clear.
3112 rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_WRITER);
3113 if (un->un_pernode_dirty_bm[node_idx] == NULL) {
3114 un->un_pernode_dirty_bm[node_idx] =
3115 (uchar_t *)kmem_zalloc(
3116 (uint_t)howmany(un->un_rrd_num, NBBY), KM_SLEEP);
3118 rw_exit(&un->un_pernode_dirty_mx[node_idx]);
3120 BLK_TO_RR(end_rr, endblk, un);
3121 BLK_TO_RR(start_rr, startblk, un);
3123 no_change = 1;
3125 mutex_enter(&un->un_resync_mx);
3126 rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_READER);
3127 for (current_rr = start_rr; current_rr <= end_rr; current_rr++) {
3128 un->un_outstanding_writes[current_rr]++;
3129 if (!IS_PERNODE_DIRTY(source_node, current_rr, un)) {
3130 un->un_pernode_dirty_sum[current_rr]++;
3131 SET_PERNODE_DIRTY(source_node, current_rr, un);
3133 CLR_GOING_CLEAN(current_rr, un);
3134 if (!IS_REGION_DIRTY(current_rr, un)) {
3135 no_change = 0;
3136 SET_REGION_DIRTY(current_rr, un);
3137 SET_GOING_DIRTY(current_rr, un);
3138 } else if (IS_GOING_DIRTY(current_rr, un))
3139 no_change = 0;
3141 rw_exit(&un->un_pernode_dirty_mx[node_idx]);
3142 mutex_exit(&un->un_resync_mx);
3144 if (no_change) {
3145 return (0);
3149 * If we have dirty regions to commit, send a
3150 * message to the owning node so that the
3151 * in-core bitmap gets updated appropriately.
3152 * TODO: make this a kmem_cache pool to improve
3153 * alloc/free performance ???
3155 kres = (md_mn_kresult_t *)kmem_alloc(sizeof (md_mn_kresult_t),
3156 KM_SLEEP);
3157 rr = (md_mn_msg_rr_dirty_t *)kmem_alloc(sizeof (md_mn_msg_rr_dirty_t),
3158 KM_SLEEP);
3160 resend_mmrr:
3161 owner_node = un->un_mirror_owner;
3163 rr->rr_mnum = mnum;
3164 rr->rr_nodeid = md_mn_mynode_id;
3165 rr->rr_range = (ushort_t)start_rr << 16;
3166 rr->rr_range |= (ushort_t)end_rr & 0xFFFF;
3168 /* release readerlock before sending message */
3169 md_unit_readerexit(ui);
3171 rval = mdmn_ksend_message(setno, MD_MN_MSG_RR_DIRTY,
3172 MD_MSGF_NO_LOG|MD_MSGF_BLK_SIGNAL|MD_MSGF_DIRECTED,
3173 un->un_mirror_owner, (char *)rr,
3174 sizeof (md_mn_msg_rr_dirty_t), kres);
3176 /* reacquire readerlock on message completion */
3177 (void) md_unit_readerlock(ui);
3179 /* if the message send failed, note it, and pass an error back up */
3180 if (!MDMN_KSEND_MSG_OK(rval, kres)) {
3181 /* if commd is gone, no point in printing a message */
3182 if (md_mn_is_commd_present())
3183 mdmn_ksend_show_error(rval, kres, "RR_DIRTY");
3184 kmem_free(kres, sizeof (md_mn_kresult_t));
3185 kmem_free(rr, sizeof (md_mn_msg_rr_dirty_t));
3186 return (1);
3190 * if the owner changed while we were sending the message, and it's
3191 * not us, the new mirror owner won't yet have done the right thing
3192 * with our data. Let him know. If we became the owner, we'll
3193 * deal with that differently below. Note that receiving a message
3194 * about another node twice won't hurt anything.
3196 if (un->un_mirror_owner != owner_node && !MD_MN_MIRROR_OWNER(un))
3197 goto resend_mmrr;
3199 kmem_free(kres, sizeof (md_mn_kresult_t));
3200 kmem_free(rr, sizeof (md_mn_msg_rr_dirty_t));
3202 mutex_enter(&un->un_resync_mx);
3205 * If we became the owner while we were sending the message,
3206 * we have dirty bits in the un_pernode_dirty_bm that aren't yet reflected
3207 * in the un_dirty_bm, as it was re-read from disk, and our bits
3208 * are also not reflected in the on-disk DRL. Fix that now.
3210 if (MD_MN_MIRROR_OWNER(un)) {
3211 rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_WRITER);
3212 mirror_copy_rr(howmany(un->un_rrd_num, NBBY),
3213 un->un_pernode_dirty_bm[node_idx], un->un_dirty_bm);
3214 rw_exit(&un->un_pernode_dirty_mx[node_idx]);
3216 un->un_resync_flg |= MM_RF_COMMITING | MM_RF_GATECLOSED;
3218 mutex_exit(&un->un_resync_mx);
3219 mddb_commitrec_wrapper(un->un_rr_dirty_recid);
3220 mutex_enter(&un->un_resync_mx);
3222 un->un_resync_flg &= ~(MM_RF_COMMITING | MM_RF_GATECLOSED);
3223 cv_broadcast(&un->un_resync_cv);
3226 for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
3227 CLR_GOING_DIRTY(current_rr, un);
3229 mutex_exit(&un->un_resync_mx);
3231 return (0);
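/*
 * Illustrative sketch (not part of the driver): the RR_DIRTY message built
 * above carries the inclusive [start_rr .. end_rr] region range packed into
 * a single word, start index in the upper 16 bits and end index in the
 * lower 16.  The matching unpacking shown here is an assumption about how
 * the receiving (owner) node would decode it; the real handler lives
 * elsewhere.
 */
static uint_t
rr_range_pack_sketch(size_t start_rr, size_t end_rr)
{
	return (((uint_t)(ushort_t)start_rr << 16) |
	    ((uint_t)(ushort_t)end_rr & 0xFFFF));
}

static void
rr_range_unpack_sketch(uint_t range, size_t *start_rr, size_t *end_rr)
{
	*start_rr = (range >> 16) & 0xFFFF;
	*end_rr = range & 0xFFFF;
}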
3235 mirror_mark_resync_region_owner(struct mm_unit *un,
3236 diskaddr_t startblk, diskaddr_t endblk, md_mn_nodeid_t source_node)
3238 int no_change;
3239 size_t start_rr;
3240 size_t current_rr;
3241 size_t end_rr;
3242 int mnset = MD_MNSET_SETNO(MD_UN2SET(un));
3243 md_mn_nodeid_t node_idx = source_node - 1;
3245 if (un->un_nsm < 2)
3246 return (0);
3249 * Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
3250 * not, allocate it and then fill the [start..end] entries.
3251 * Update un_pernode_dirty_sum if we've gone 0->1.
3252 * Update un_dirty_bm if the corresponding entries are clear.
3254 if (mnset) {
3255 rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_WRITER);
3256 if (un->un_pernode_dirty_bm[node_idx] == NULL) {
3257 un->un_pernode_dirty_bm[node_idx] =
3258 (uchar_t *)kmem_zalloc(
3259 (uint_t)howmany(un->un_rrd_num, NBBY), KM_SLEEP);
3261 rw_exit(&un->un_pernode_dirty_mx[node_idx]);
3264 mutex_enter(&un->un_resync_mx);
3266 if (mnset)
3267 rw_enter(&un->un_pernode_dirty_mx[node_idx], RW_READER);
3269 no_change = 1;
3270 BLK_TO_RR(end_rr, endblk, un);
3271 BLK_TO_RR(start_rr, startblk, un);
3272 for (current_rr = start_rr; current_rr <= end_rr; current_rr++) {
3273 if (!mnset || source_node == md_mn_mynode_id)
3274 un->un_outstanding_writes[current_rr]++;
3275 if (mnset) {
3276 if (!IS_PERNODE_DIRTY(source_node, current_rr, un))
3277 un->un_pernode_dirty_sum[current_rr]++;
3278 SET_PERNODE_DIRTY(source_node, current_rr, un);
3280 CLR_GOING_CLEAN(current_rr, un);
3281 if (!IS_REGION_DIRTY(current_rr, un))
3282 no_change = 0;
3283 if (IS_GOING_DIRTY(current_rr, un))
3284 no_change = 0;
3287 if (mnset)
3288 rw_exit(&un->un_pernode_dirty_mx[node_idx]);
3290 if (no_change) {
3291 mutex_exit(&un->un_resync_mx);
3292 return (0);
3294 un->un_waiting_to_mark++;
3295 while (un->un_resync_flg & MM_RF_GATECLOSED) {
3296 if (panicstr)
3297 return (1);
3298 cv_wait(&un->un_resync_cv, &un->un_resync_mx);
3300 un->un_waiting_to_mark--;
3302 no_change = 1;
3303 for (current_rr = start_rr; current_rr <= end_rr; current_rr++) {
3304 if (!IS_REGION_DIRTY(current_rr, un)) {
3305 SET_REGION_DIRTY(current_rr, un);
3306 SET_GOING_DIRTY(current_rr, un);
3307 no_change = 0;
3308 } else {
3309 if (IS_GOING_DIRTY(current_rr, un))
3310 no_change = 0;
3313 if (no_change) {
3314 if (un->un_waiting_to_mark == 0 || un->un_waiting_to_clear != 0)
3315 cv_broadcast(&un->un_resync_cv);
3316 mutex_exit(&un->un_resync_mx);
3317 return (0);
3320 un->un_resync_flg |= MM_RF_COMMIT_NEEDED;
3321 un->un_waiting_to_commit++;
3322 while (un->un_waiting_to_mark != 0 &&
3323 !(un->un_resync_flg & MM_RF_GATECLOSED)) {
3324 if (panicstr)
3325 return (1);
3326 cv_wait(&un->un_resync_cv, &un->un_resync_mx);
3329 if (un->un_resync_flg & MM_RF_COMMIT_NEEDED) {
3330 un->un_resync_flg |= MM_RF_COMMITING | MM_RF_GATECLOSED;
3331 un->un_resync_flg &= ~MM_RF_COMMIT_NEEDED;
3333 mutex_exit(&un->un_resync_mx);
3334 mddb_commitrec_wrapper(un->un_rr_dirty_recid);
3335 mutex_enter(&un->un_resync_mx);
3337 un->un_resync_flg &= ~MM_RF_COMMITING;
3338 cv_broadcast(&un->un_resync_cv);
3340 while (un->un_resync_flg & MM_RF_COMMITING) {
3341 if (panicstr)
3342 return (1);
3343 cv_wait(&un->un_resync_cv, &un->un_resync_mx);
3346 for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
3347 CLR_GOING_DIRTY(current_rr, un);
3349 if (--un->un_waiting_to_commit == 0) {
3350 un->un_resync_flg &= ~MM_RF_GATECLOSED;
3351 cv_broadcast(&un->un_resync_cv);
3353 mutex_exit(&un->un_resync_mx);
3355 return (0);
3359 mirror_mark_resync_region(struct mm_unit *un,
3360 diskaddr_t startblk, diskaddr_t endblk, md_mn_nodeid_t source_node)
3362 int mnset = MD_MNSET_SETNO(MD_UN2SET(un));
3364 if (mnset && !MD_MN_MIRROR_OWNER(un)) {
3365 return (mirror_mark_resync_region_non_owner(un, startblk,
3366 endblk, source_node));
3367 } else {
3368 return (mirror_mark_resync_region_owner(un, startblk, endblk,
3369 source_node));
3374 mirror_resize_resync_regions(mm_unit_t *un, diskaddr_t new_tb)
3376 short *owp;
3377 optim_resync_t *orp;
3378 uint_t rr_mult = 1;
3379 uint_t old_nregions, new_nregions;
3380 int old_bm_size, new_bm_size;
3381 size_t size;
3382 mddb_recid_t recid, old_recid;
3383 uchar_t *old_dirty_bm;
3384 int i, j;
3385 mddb_type_t typ1;
3386 set_t setno = MD_UN2SET(un);
3387 uchar_t *old_pns;
3389 old_nregions = un->un_rrd_num;
3390 new_nregions = (uint_t)((new_tb/un->un_rrd_blksize) + 1);
3392 while (new_nregions > MD_MAX_NUM_RR) {
3393 new_nregions >>= 1;
3394 rr_mult <<= 1;
3397 new_bm_size = howmany(new_nregions, NBBY);
3398 old_bm_size = howmany(old_nregions, NBBY);
3400 size = new_bm_size + sizeof (*orp) - sizeof (orp->or_rr);
3402 typ1 = (mddb_type_t)md_getshared_key(setno,
3403 mirror_md_ops.md_driver.md_drivername);
3404 recid = mddb_createrec(size, typ1, RESYNC_REC,
3405 MD_CRO_OPTIMIZE|MD_CRO_32BIT, setno);
3406 if (recid < 0)
3407 return (-1);
3409 orp = (struct optim_resync *)mddb_getrecaddr(recid);
3410 ASSERT(orp != NULL);
3412 orp->or_magic = OR_MAGIC; /* Magic # */
3413 orp->or_blksize = un->un_rrd_blksize; /* Same block size */
3414 orp->or_num = new_nregions; /* New number of regions */
3416 old_dirty_bm = un->un_dirty_bm;
3417 un->un_dirty_bm = orp->or_rr;
3419 kmem_free((caddr_t)un->un_goingdirty_bm, old_bm_size);
3420 un->un_goingdirty_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
3422 kmem_free((caddr_t)un->un_goingclean_bm, old_bm_size);
3423 un->un_goingclean_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
3425 kmem_free((caddr_t)un->un_resync_bm, old_bm_size);
3426 un->un_resync_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
3428 owp = un->un_outstanding_writes;
3429 un->un_outstanding_writes = (short *)kmem_zalloc(
3430 new_nregions * sizeof (short), KM_SLEEP);
3432 old_pns = un->un_pernode_dirty_sum;
3433 if (old_pns)
3434 un->un_pernode_dirty_sum = (uchar_t *)kmem_zalloc(new_nregions,
3435 KM_SLEEP);
3438 * Now translate the old records into the new
3439 * records
3441 for (i = 0; i < old_nregions; i++) {
3443 * only bring forward the
3444 * outstanding write counters and the dirty bits and also
3445 * the pernode_summary counts
3447 if (!isset(old_dirty_bm, i))
3448 continue;
3450 setbit(un->un_dirty_bm, (i / rr_mult));
3451 un->un_outstanding_writes[(i / rr_mult)] += owp[i];
3452 if (old_pns)
3453 un->un_pernode_dirty_sum[(i / rr_mult)] += old_pns[i];
3455 kmem_free((caddr_t)owp, old_nregions * sizeof (short));
3456 if (old_pns)
3457 kmem_free((caddr_t)old_pns, old_nregions);
3460 * Copy all non-zero un_pernode_dirty_bm[] arrays to new versions
3462 for (j = 0; j < MD_MNMAXSIDES; j++) {
3463 rw_enter(&un->un_pernode_dirty_mx[j], RW_WRITER);
3464 old_dirty_bm = un->un_pernode_dirty_bm[j];
3465 if (old_dirty_bm) {
3466 un->un_pernode_dirty_bm[j] = (uchar_t *)kmem_zalloc(
3467 new_bm_size, KM_SLEEP);
3468 for (i = 0; i < old_nregions; i++) {
3469 if (!isset(old_dirty_bm, i))
3470 continue;
3472 setbit(un->un_pernode_dirty_bm[j],
3473 (i / rr_mult));
3475 kmem_free((caddr_t)old_dirty_bm, old_bm_size);
3477 rw_exit(&un->un_pernode_dirty_mx[j]);
3480 /* Save the old record id */
3481 old_recid = un->un_rr_dirty_recid;
3483 /* Update the mirror unit struct */
3484 un->un_rr_dirty_recid = recid;
3485 un->un_rrd_num = new_nregions;
3486 un->un_rrd_blksize = un->un_rrd_blksize * rr_mult;
3488 orp->or_blksize = un->un_rrd_blksize;
3491 * NOTE: The reason there are distinct calls to mddb_commitrec_wrapper
3492 * instead of using mddb_commitrecs_wrapper, is that you cannot
3493 * atomically commit optimized records.
3495 mddb_commitrec_wrapper(recid);
3496 mddb_commitrec_wrapper(un->c.un_record_id);
3497 mddb_deleterec_wrapper(old_recid);
3498 return (0);
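/*
 * Illustrative sketch (not part of the driver): the scaling performed by
 * mirror_resize_resync_regions() above.  If the grown mirror would need
 * more than MD_MAX_NUM_RR resync regions, the region count is halved (and
 * the region block size doubled) until it fits; old dirty region i is then
 * folded into new region i / rr_mult.
 */
static void
rr_scale_sketch(diskaddr_t new_tb, uint_t rrd_blksize,
    uint_t *new_nregions, uint_t *rr_mult)
{
	*new_nregions = (uint_t)((new_tb / rrd_blksize) + 1);
	*rr_mult = 1;
	while (*new_nregions > MD_MAX_NUM_RR) {
		*new_nregions >>= 1;	/* half the regions ...            */
		*rr_mult <<= 1;		/* ... each covering twice as much */
	}
}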
3501 /* lockp can be NULL for !MN disksets */
3503 mirror_add_resync_regions(mm_unit_t *un, diskaddr_t new_tb)
3505 uchar_t *old;
3506 short *owp;
3507 optim_resync_t *orp;
3508 uint_t old_nregions, new_nregions;
3509 int old_bm_size, new_bm_size;
3510 size_t size;
3511 mddb_recid_t recid, old_recid;
3512 mddb_type_t typ1;
3513 set_t setno = MD_UN2SET(un);
3514 int i;
3516 old_nregions = un->un_rrd_num;
3517 new_nregions = (uint_t)((new_tb/un->un_rrd_blksize) + 1);
3519 new_bm_size = howmany(new_nregions, NBBY);
3520 old_bm_size = howmany(old_nregions, NBBY);
3522 size = new_bm_size + sizeof (*orp) - sizeof (orp->or_rr);
3524 typ1 = (mddb_type_t)md_getshared_key(setno,
3525 mirror_md_ops.md_driver.md_drivername);
3527 recid = mddb_createrec(size, typ1, RESYNC_REC,
3528 MD_CRO_OPTIMIZE|MD_CRO_32BIT, setno);
3529 if (recid < 0)
3530 return (-1);
3532 orp = (struct optim_resync *)mddb_getrecaddr(recid);
3533 ASSERT(orp != NULL);
3535 orp->or_magic = OR_MAGIC; /* Magic # */
3536 orp->or_blksize = un->un_rrd_blksize; /* Same block size */
3537 orp->or_num = new_nregions; /* New number of regions */
3539 /* Copy the old bm over the new bm */
3540 bcopy((caddr_t)un->un_dirty_bm, (caddr_t)orp->or_rr, old_bm_size);
3543 * Create new bigger incore arrays, copy, and free old ones:
3544 * un_goingdirty_bm
3545 * un_goingclean_bm
3546 * un_resync_bm
3547 * un_outstanding_writes
3548 * un_pernode_dirty_sum
3549 * un_pernode_dirty_bm[]
3551 old = un->un_goingdirty_bm;
3552 un->un_goingdirty_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
3553 bcopy((caddr_t)old, (caddr_t)un->un_goingdirty_bm, old_bm_size);
3554 kmem_free((caddr_t)old, old_bm_size);
3556 old = un->un_goingclean_bm;
3557 un->un_goingclean_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
3558 bcopy((caddr_t)old, (caddr_t)un->un_goingclean_bm, old_bm_size);
3559 kmem_free((caddr_t)old, old_bm_size);
3561 old = un->un_resync_bm;
3562 un->un_resync_bm = (uchar_t *)kmem_zalloc(new_bm_size, KM_SLEEP);
3563 bcopy((caddr_t)old, (caddr_t)un->un_resync_bm, old_bm_size);
3564 kmem_free((caddr_t)old, old_bm_size);
3566 owp = un->un_outstanding_writes;
3567 un->un_outstanding_writes = (short *)kmem_zalloc(
3568 (uint_t)new_nregions * sizeof (short), KM_SLEEP);
3569 bcopy((caddr_t)owp, (caddr_t)un->un_outstanding_writes,
3570 old_nregions * sizeof (short));
3571 kmem_free((caddr_t)owp, (old_nregions * sizeof (short)));
3573 old = un->un_pernode_dirty_sum;
3574 if (old) {
3575 un->un_pernode_dirty_sum = (uchar_t *)kmem_zalloc(
3576 new_nregions, KM_SLEEP);
3577 bcopy((caddr_t)old, (caddr_t)un->un_pernode_dirty_sum,
3578 old_nregions);
3579 kmem_free((caddr_t)old, old_nregions);
3582 for (i = 0; i < MD_MNMAXSIDES; i++) {
3583 rw_enter(&un->un_pernode_dirty_mx[i], RW_WRITER);
3584 old = un->un_pernode_dirty_bm[i];
3585 if (old) {
3586 un->un_pernode_dirty_bm[i] = (uchar_t *)kmem_zalloc(
3587 new_bm_size, KM_SLEEP);
3588 bcopy((caddr_t)old, (caddr_t)un->un_pernode_dirty_bm[i],
3589 old_bm_size);
3590 kmem_free((caddr_t)old, old_bm_size);
3592 rw_exit(&un->un_pernode_dirty_mx[i]);
3595 /* Save the old record id */
3596 old_recid = un->un_rr_dirty_recid;
3598 /* Update the mirror unit struct */
3599 un->un_rr_dirty_recid = recid;
3600 un->un_rrd_num = new_nregions;
3601 un->un_dirty_bm = orp->or_rr;
3604 * NOTE: The reason there are distinct calls to mddb_commitrec_wrapper
3605 * instead of using mddb_commitrecs_wrapper, is that you cannot
3606 * atomically commit optimized records.
3608 mddb_commitrec_wrapper(recid);
3609 mddb_commitrec_wrapper(un->c.un_record_id);
3610 mddb_deleterec_wrapper(old_recid);
3611 return (0);
3615 * mirror_copy_rr:
3616 * --------------
3617 * Combine the dirty record bitmap with the in-core resync bitmap. This allows
3618 * us to carry a resync over an ownership change.
3620 void
3621 mirror_copy_rr(int sz, uchar_t *src, uchar_t *dest)
3623 int i;
3625 for (i = 0; i < sz; i++)
3626 *dest++ |= *src++;
3630 * mirror_set_dirty_rr:
3631 * -------------------
3632 * Set the pernode_dirty_bm[node] entries and un_dirty_bm[] if appropriate.
3633 * For the owning node (DRL/mirror owner) update the on-disk RR if needed.
3634 * Called on every clean->dirty transition for the originating writer node.
3635 * Note: only the non-owning nodes will initiate this message and it is only
3636 * the owning node that has to process it.
3639 mirror_set_dirty_rr(md_mn_rr_dirty_params_t *iocp)
3642 minor_t mnum = iocp->rr_mnum;
3643 mm_unit_t *un;
3644 int start = (int)iocp->rr_start;
3645 int end = (int)iocp->rr_end;
3646 set_t setno = MD_MIN2SET(mnum);
3647 md_mn_nodeid_t orignode = iocp->rr_nodeid; /* 1-based */
3648 diskaddr_t startblk, endblk;
3650 mdclrerror(&iocp->mde);
3652 if ((setno >= md_nsets) ||
3653 (MD_MIN2UNIT(mnum) >= md_nunits)) {
3654 return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum));
3657 /* Must have _NO_ ioctl lock set if we update the RR on-disk */
3658 un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
3660 if (un == NULL) {
3661 return (mdmderror(&iocp->mde, MDE_UNIT_NOT_SETUP, mnum));
3663 if (un->c.un_type != MD_METAMIRROR) {
3664 return (mdmderror(&iocp->mde, MDE_NOT_MM, mnum));
3666 if (orignode < 1 || orignode >= MD_MNMAXSIDES) {
3667 return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum));
3669 if (un->un_nsm < 2) {
3670 return (0);
3674 * Only process this message if we're the owner of the mirror.
3676 if (!MD_MN_MIRROR_OWNER(un)) {
3677 return (0);
3680 RR_TO_BLK(startblk, start, un);
3681 RR_TO_BLK(endblk, end, un);
3682 return (mirror_mark_resync_region_owner(un, startblk, endblk,
3683 orignode));
3687 * mirror_clean_rr_bits:
3688 * --------------------
3689 * Clear the pernode_dirty_bm[node] entries which are passed in the bitmap.
3690 * Once _all_ references are removed (pernode_dirty_count[x] == 0) this region
3691 * is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all
3692 * nodes. Callable from ioctl / interrupt / whatever context.
3693 * un_resync_mx is held on entry.
3695 static void
3696 mirror_clean_rr_bits(
3697 md_mn_rr_clean_params_t *iocp)
3699 minor_t mnum = iocp->rr_mnum;
3700 mm_unit_t *un;
3701 uint_t cleared_bits;
3702 md_mn_nodeid_t node = iocp->rr_nodeid - 1;
3703 md_mn_nodeid_t orignode = iocp->rr_nodeid;
3704 int i, start, end;
3706 un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
3708 cleared_bits = 0;
3709 start = MDMN_RR_CLEAN_PARAMS_START_BIT(iocp);
3710 end = start + MDMN_RR_CLEAN_PARAMS_DATA_BYTES(iocp) * NBBY;
3711 rw_enter(&un->un_pernode_dirty_mx[node], RW_READER);
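	/*
	 * For each region flagged in the message data, drop the originating
	 * node's dirty reference. Only when no node holds a reference
	 * (un_pernode_dirty_sum[] reaches zero) is the region itself marked
	 * clean in un_dirty_bm[].
	 */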
	for (i = start; i < end; i++) {
		if (isset(MDMN_RR_CLEAN_PARAMS_DATA(iocp), i - start)) {
			if (IS_PERNODE_DIRTY(orignode, i, un)) {
				un->un_pernode_dirty_sum[i]--;
				CLR_PERNODE_DIRTY(orignode, i, un);
			}
			if (un->un_pernode_dirty_sum[i] == 0) {
				cleared_bits++;
				CLR_REGION_DIRTY(i, un);
				CLR_GOING_CLEAN(i, un);
			}
		}
	}
	rw_exit(&un->un_pernode_dirty_mx[node]);
	if (cleared_bits) {
		/*
		 * We can only be called iff we are the mirror owner, however
		 * as this is a (potentially) decoupled routine the ownership
		 * may have moved from us by the time we get to execute the
		 * bit clearing. Hence we still need to check for being the
		 * owner before flushing the DRL to the replica.
		 */
		if (MD_MN_MIRROR_OWNER(un)) {
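			/*
			 * un_resync_mx is dropped around the commit as the
			 * flush of the dirty-region record to the replica
			 * does I/O; it is reacquired before returning so the
			 * caller still holds it, as required on entry.
			 */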
			mutex_exit(&un->un_resync_mx);
			mddb_commitrec_wrapper(un->un_rr_dirty_recid);
			mutex_enter(&un->un_resync_mx);
		}
	}
}

/*
 * mirror_drl_task:
 * ---------------
 * Service routine for clearing the DRL bits on a deferred MD_MN_RR_CLEAN
 * call. We need to obtain exclusive access to the un_resync_cv and then
 * clear the necessary bits.
 * On completion, we must also free the passed-in argument as it is allocated
 * at the end of the ioctl handler and won't be freed on completion.
 */
static void
mirror_drl_task(void *arg)
{
	md_mn_rr_clean_params_t	*iocp = (md_mn_rr_clean_params_t *)arg;
	minor_t			mnum = iocp->rr_mnum;
	mm_unit_t		*un;

	un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);
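
	/*
	 * Wait until any in-progress resync-region mark/clear has drained
	 * (MM_RF_STALL_CLEAN), then close the gate (MM_RF_GATECLOSED) while
	 * the bits are cleared, and wake any waiters once the gate reopens.
	 */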
	mutex_enter(&un->un_rrp_inflight_mx);
	mutex_enter(&un->un_resync_mx);
	un->un_waiting_to_clear++;
	while (un->un_resync_flg & MM_RF_STALL_CLEAN)
		cv_wait(&un->un_resync_cv, &un->un_resync_mx);
	un->un_waiting_to_clear--;

	un->un_resync_flg |= MM_RF_GATECLOSED;
	mirror_clean_rr_bits(iocp);
	un->un_resync_flg &= ~MM_RF_GATECLOSED;
	if (un->un_waiting_to_mark != 0 || un->un_waiting_to_clear != 0) {
		cv_broadcast(&un->un_resync_cv);
	}
	mutex_exit(&un->un_resync_mx);
	mutex_exit(&un->un_rrp_inflight_mx);

	kmem_free((caddr_t)iocp, MDMN_RR_CLEAN_PARAMS_SIZE(iocp));
}

/*
 * mirror_set_clean_rr:
 * -------------------
 * Clear the pernode_dirty_bm[node] entries which are passed in the bitmap.
 * Once _all_ references are removed (pernode_dirty_sum[x] == 0) this region
 * is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all
 * nodes.
 *
 * Only the mirror-owner need process this message as it is the only RR
 * updater. Non-owner nodes issue this request, but as we have no
 * point-to-point message support we will receive the message on all nodes.
 */
int
mirror_set_clean_rr(md_mn_rr_clean_params_t *iocp)
{

	minor_t			mnum = iocp->rr_mnum;
	mm_unit_t		*un;
	set_t			setno = MD_MIN2SET(mnum);
	md_mn_nodeid_t		node = iocp->rr_nodeid - 1;
	int			can_clear = 0;
	md_mn_rr_clean_params_t	*newiocp;
	int			rval = 0;

	mdclrerror(&iocp->mde);

	if ((setno >= md_nsets) ||
	    (MD_MIN2UNIT(mnum) >= md_nunits)) {
		return (mdmderror(&iocp->mde, MDE_INVAL_UNIT, mnum));
	}

	/* Must have _NO_ ioctl lock set if we update the RR on-disk */
	un = mirror_getun(mnum, &iocp->mde, NO_LOCK, NULL);

	if (un == NULL) {
		return (mdmderror(&iocp->mde, MDE_UNIT_NOT_SETUP, mnum));
	}
	if (un->c.un_type != MD_METAMIRROR) {
		return (mdmderror(&iocp->mde, MDE_NOT_MM, mnum));
	}
	if (un->un_nsm < 2) {
		return (0);
	}

	/*
	 * Check to see if we're the mirror owner. If not, there's nothing
	 * for us to do.
	 */
	if (!MD_MN_MIRROR_OWNER(un)) {
		return (0);
	}

	/*
	 * Process the to-be-cleaned bitmap. We need to update the
	 * pernode_dirty bits and pernode_dirty_sum[n], and if, and only if,
	 * the sum goes to 0 we can then mark the un_dirty_bm entry as
	 * GOINGCLEAN. Alternatively we can just defer this cleaning until
	 * the next process_resync_regions timeout.
	 */
	rw_enter(&un->un_pernode_dirty_mx[node], RW_WRITER);
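	/*
	 * The per-node dirty bitmap is allocated lazily, the first time a
	 * given node is referenced, so create it here if it does not yet
	 * exist.
	 */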
	if (un->un_pernode_dirty_bm[node] == NULL) {
		un->un_pernode_dirty_bm[node] = (uchar_t *)kmem_zalloc(
		    howmany(un->un_rrd_num, NBBY), KM_SLEEP);
	}
	rw_exit(&un->un_pernode_dirty_mx[node]);

	/*
	 * See if we can simply clear the un_dirty_bm[] entries. If we're not
	 * the issuing node _and_ we aren't in the process of marking/clearing
	 * the RR bitmaps, we can simply update the bits as needed.
	 * If we're the owning node and _not_ the issuing node, we should also
	 * sync the RR if we clear any bits in it.
	 */
	mutex_enter(&un->un_resync_mx);
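	/*
	 * If no mark/clear is currently stalled (MM_RF_STALL_CLEAN is not
	 * set) the bits are cleared synchronously under the GATECLOSED flag;
	 * otherwise fall through to the deferred taskq path below.
	 */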
	can_clear = (un->un_resync_flg & MM_RF_STALL_CLEAN) ? 0 : 1;
	if (can_clear) {
		un->un_resync_flg |= MM_RF_GATECLOSED;
		mirror_clean_rr_bits(iocp);
		un->un_resync_flg &= ~MM_RF_GATECLOSED;
		if (un->un_waiting_to_mark != 0 ||
		    un->un_waiting_to_clear != 0) {
			cv_broadcast(&un->un_resync_cv);
		}
	}
	mutex_exit(&un->un_resync_mx);

	/*
	 * If we couldn't clear the bits due to a DRL update from m_m_r_r /
	 * p_r_r, we must schedule a blocking call to update the DRL on this
	 * node. As we're invoked from an ioctl we are going to have the
	 * original data disappear (kmem_free) once we return. So, copy the
	 * data into a new structure and let the taskq routine release it on
	 * completion.
	 */
	if (!can_clear) {
		size_t	sz = MDMN_RR_CLEAN_PARAMS_SIZE(iocp);

		newiocp = (md_mn_rr_clean_params_t *)kmem_alloc(sz, KM_SLEEP);

		bcopy(iocp, newiocp, sz);

		if (ddi_taskq_dispatch(un->un_drl_task, mirror_drl_task,
		    newiocp, DDI_NOSLEEP) != DDI_SUCCESS) {
			kmem_free(newiocp, sz);
			rval = ENOMEM;	/* probably starvation */
		}
	}

	return (rval);
}