/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
27 #include <sys/param.h>
28 #include <sys/systm.h>
33 #include <sys/t_lock.h>
39 #include <sys/cmn_err.h>
40 #include <sys/sysmacros.h>
41 #include <sys/types.h>
42 #include <sys/mkdev.h>
46 #include <sys/lvm/md_mirror.h>
47 #include <sys/modctl.h>
49 #include <sys/sunddi.h>
50 #include <sys/debug.h>
51 #include <sys/callb.h>
53 #include <sys/sysevent/eventdefs.h>
54 #include <sys/sysevent/svm.h>
55 #include <sys/lvm/mdmn_commd.h>
58 extern kmutex_t md_status_mx
;
59 extern kmutex_t md_mx
;
61 extern unit_t md_nunits
;
62 extern set_t md_nsets
;
63 extern md_set_t md_set
[];
64 extern major_t md_major
;
66 extern md_ops_t mirror_md_ops
;
67 extern kmem_cache_t
*mirror_child_cache
; /* mirror child memory pool */
68 extern mdq_anchor_t md_mto_daemon
;
69 extern daemon_request_t mirror_timeout
;
70 extern md_resync_t md_cpr_resync
;
72 extern int md_mtioctl_cnt
;
74 extern kmem_cache_t
*mirror_parent_cache
;
76 extern int mirror_debug_flag
;
80 * Tunable resync thread timeout. This is used as the time interval for updating
81 * the resync progress to the mddb. This allows restartable resyncs to be
82 * continued across a system reboot.
83 * Default is to update the resync progress every 5 minutes.
85 int md_mirror_resync_update_intvl
= MD_DEF_MIRROR_RESYNC_INTVL
;
88 * Settable mirror resync buffer size. Specified in 512 byte
89 * blocks. This is set to MD_DEF_RESYNC_BUF_SIZE by default.
91 int md_resync_bufsz
= MD_DEF_RESYNC_BUF_SIZE
;
94 * Tunables for dirty region processing when
95 * closing down a mirror.
97 * Dirty region processing during close of a
98 * mirror is basically monitoring the state
99 * of the resync region bitmaps and the number
100 * of outstanding i/o's per submirror to
101 * determine that there are no more dirty
104 * The approach taken is a retry logic over
105 * md_mirror_rr_cleans iterations to monitor
108 * There are two methods of polling the progress
109 * on dirty bitmap processing: busy-waits and
112 * Busy-waits are used at the beginning to
113 * determine the final state as quick as
114 * possible; md_mirror_rr_polls defines the
115 * number of busy-waits.
117 * In case the number of busy-waits got exhausted
118 * with dirty regions left over, the retry logic
119 * switches over to non-busy-waits, thus giving
120 * relief to an obviously heavily loaded system.
121 * The timeout value is defined by the tunable
122 * md_mirror_rr_sleep_timo in seconds.
124 * The number of non-busy-waits is given by:
125 * md_mirror_rr_cleans - md_mirror_rr_polls.
127 * The values were found by testing on a
128 * 'typical' system and may require tuning
129 * to meet specific customer's requirements.
132 int md_mirror_rr_cleans
= 13;
133 int md_mirror_rr_polls
= 3;
134 int md_mirror_rr_sleep_timo
= 1;
137 * The value is not #defined because it will be computed
140 int md_max_xfer_bufsz
= 2048;
143 * mirror_generate_rr_bitmap:
144 * -------------------
145 * Generate a compressed bitmap md_mn_msg_rr_clean_t for the given clean
146 * bitmap associated with mirror 'un'
149 * un - mirror unit to get bitmap data from
150 * *msgp - location to return newly allocated md_mn_msg_rr_clean_t
151 * *activep- location to return # of active i/os
154 * 1 => dirty bits cleared from un_dirty_bm and DRL flush required
155 * *msgp contains bitmap of to-be-cleared bits
156 * 0 => no bits cleared
160 mirror_generate_rr_bitmap(mm_unit_t
*un
, md_mn_msg_rr_clean_t
**msgp
,
163 unsigned int i
, next_bit
, data_bytes
, start_bit
;
164 int cleared_dirty
= 0;
166 /* Skip any initial 0s. */
168 if ((start_bit
= un
->un_rr_clean_start_bit
) >= un
->un_rrd_num
)
169 un
->un_rr_clean_start_bit
= start_bit
= 0;
172 * Handle case where NO bits are set in PERNODE_DIRTY but the
173 * un_dirty_bm[] map does have entries set (after a 1st resync)
175 for (; start_bit
< un
->un_rrd_num
&&
176 !IS_PERNODE_DIRTY(md_mn_mynode_id
, start_bit
, un
) &&
177 (un
->un_pernode_dirty_sum
[start_bit
] != (uchar_t
)0); start_bit
++)
180 if (start_bit
>= un
->un_rrd_num
) {
181 if (un
->un_rr_clean_start_bit
== 0) {
184 un
->un_rr_clean_start_bit
= 0;
185 goto retry_dirty_scan
;
189 /* how much to fit into this message */
190 data_bytes
= MIN(howmany(un
->un_rrd_num
- start_bit
, NBBY
),
191 MDMN_MSG_RR_CLEAN_DATA_MAX_BYTES
);
193 (*msgp
) = kmem_zalloc(MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes
),
196 (*msgp
)->rr_nodeid
= md_mn_mynode_id
;
197 (*msgp
)->rr_mnum
= MD_SID(un
);
198 MDMN_MSG_RR_CLEAN_START_SIZE_SET(*msgp
, start_bit
, data_bytes
);
200 next_bit
= MIN(start_bit
+ data_bytes
* NBBY
, un
->un_rrd_num
);
202 for (i
= start_bit
; i
< next_bit
; i
++) {
203 if (un
->c
.un_status
& MD_UN_KEEP_DIRTY
&& IS_KEEPDIRTY(i
, un
)) {
206 if (!IS_REGION_DIRTY(i
, un
)) {
209 if (un
->un_outstanding_writes
[i
] != 0) {
215 * Handle the case where a resync has completed and we still
216 * have the un_dirty_bm[] entries marked as dirty (these are
217 * the most recent DRL re-read from the replica). They need
218 * to be cleared from our un_dirty_bm[] but they will not have
219 * corresponding un_pernode_dirty[] entries set unless (and
220 * until) further write()s have been issued to the area.
221 * This handles the case where only the un_dirty_bm[] entry is
222 * set. Without this we'd not clear this region until a local
223 * write is issued to the affected area.
225 if (IS_PERNODE_DIRTY(md_mn_mynode_id
, i
, un
) ||
226 (un
->un_pernode_dirty_sum
[i
] == (uchar_t
)0)) {
227 if (!IS_GOING_CLEAN(i
, un
)) {
228 SET_GOING_CLEAN(i
, un
);
233 * Now we've got a flagged pernode_dirty, _or_ a clean
234 * bitmap entry to process. Update the bitmap to flush
235 * the REGION_DIRTY / GOING_CLEAN bits when we send the
236 * cross-cluster message.
239 setbit(MDMN_MSG_RR_CLEAN_DATA(*msgp
), i
- start_bit
);
242 * Not marked as active in the pernode bitmap, so skip
243 * any update to this. We just increment the 0 count
244 * and adjust the active count by any outstanding
245 * un_pernode_dirty_sum[] entries. This means we don't
246 * leave the mirror permanently dirty.
248 (*activep
) += (int)un
->un_pernode_dirty_sum
[i
];
251 if (!cleared_dirty
) {
252 kmem_free(*msgp
, MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes
));
255 un
->un_rr_clean_start_bit
= next_bit
;
256 return (cleared_dirty
);
260 * There are three paths into here:
262 * md_daemon -> check_resync_regions -> prr
263 * mirror_internal_close -> mirror_process_unit_resync -> prr
264 * mirror_set_capability -> mirror_process_unit_resync -> prr
266 * The first one is a kernel daemon, the other two result from system calls.
267 * Thus, only the first case needs to deal with kernel CPR activity. This
268 * is indicated by the cprinfop being non-NULL for kernel daemon calls, and
269 * NULL for system call paths.
272 process_resync_regions_non_owner(mm_unit_t
*un
, callb_cpr_t
*cprinfop
)
275 int cleared_dirty
= 0;
276 /* Number of reasons why we can not proceed shutting down the mirror. */
278 set_t setno
= MD_UN2SET(un
);
279 md_mn_msg_rr_clean_t
*rmsg
;
280 md_mn_kresult_t
*kres
;
282 minor_t mnum
= MD_SID(un
);
283 mdi_unit_t
*ui
= MDI_UNIT(mnum
);
284 md_mn_nodeid_t owner_node
;
287 * We drop the readerlock here to assist lock ordering with
288 * update_resync. Once we have the un_rrp_inflight_mx, we
291 md_unit_readerexit(ui
);
294 * Resync region processing must be single threaded. We can't use
295 * un_resync_mx for this purpose since this mutex gets released
296 * when blocking on un_resync_cv.
298 mutex_enter(&un
->un_rrp_inflight_mx
);
300 (void) md_unit_readerlock(ui
);
302 mutex_enter(&un
->un_resync_mx
);
304 rw_enter(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1], RW_READER
);
305 cleared_dirty
= mirror_generate_rr_bitmap(un
, &rmsg
, &active
);
306 rw_exit(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1]);
309 owner_node
= un
->un_mirror_owner
;
310 mutex_exit(&un
->un_resync_mx
);
313 * Transmit the 'to-be-cleared' bitmap to all cluster nodes.
314 * Receipt of the message will cause the mirror owner to
315 * update the on-disk DRL.
318 kres
= kmem_alloc(sizeof (md_mn_kresult_t
), KM_SLEEP
);
320 /* release readerlock before sending message */
321 md_unit_readerexit(ui
);
324 mutex_enter(&un
->un_prr_cpr_mx
);
325 CALLB_CPR_SAFE_BEGIN(cprinfop
);
328 rval
= mdmn_ksend_message(setno
, MD_MN_MSG_RR_CLEAN
,
329 MD_MSGF_NO_LOG
|MD_MSGF_BLK_SIGNAL
|MD_MSGF_KSEND_NORETRY
|
330 MD_MSGF_DIRECTED
, un
->un_mirror_owner
,
331 (char *)rmsg
, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg
), kres
);
334 CALLB_CPR_SAFE_END(cprinfop
, &un
->un_prr_cpr_mx
);
335 mutex_exit(&un
->un_prr_cpr_mx
);
338 /* reacquire readerlock after message */
339 (void) md_unit_readerlock(ui
);
341 if ((!MDMN_KSEND_MSG_OK(rval
, kres
)) &&
342 (kres
->kmmr_comm_state
!= MDMNE_NOT_JOINED
)) {
343 /* if commd is gone, no point in printing a message */
344 if (md_mn_is_commd_present())
345 mdmn_ksend_show_error(rval
, kres
, "RR_CLEAN");
346 kmem_free(kres
, sizeof (md_mn_kresult_t
));
347 kmem_free(rmsg
, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg
));
348 mutex_exit(&un
->un_rrp_inflight_mx
);
351 kmem_free(kres
, sizeof (md_mn_kresult_t
));
354 * If ownership changed while we were sending, we probably
355 * sent the message to the wrong node. Leave fixing that for
358 if (un
->un_mirror_owner
!= owner_node
) {
359 mutex_exit(&un
->un_rrp_inflight_mx
);
364 * Now that we've sent the message, clear them from the
365 * pernode_dirty arrays. These are ONLY cleared on a
366 * successful send, and failure has no impact.
369 start
= MDMN_MSG_RR_CLEAN_START_BIT(rmsg
);
370 end
= start
+ MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg
) * NBBY
;
371 mutex_enter(&un
->un_resync_mx
);
372 rw_enter(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1],
374 for (i
= start
; i
< end
; i
++) {
375 if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg
),
377 if (IS_PERNODE_DIRTY(md_mn_mynode_id
, i
, un
)) {
378 un
->un_pernode_dirty_sum
[i
]--;
379 CLR_PERNODE_DIRTY(md_mn_mynode_id
, i
,
382 if (IS_REGION_DIRTY(i
, un
)) {
384 CLR_REGION_DIRTY(i
, un
);
385 CLR_GOING_CLEAN(i
, un
);
389 rw_exit(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1]);
391 kmem_free(rmsg
, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg
));
393 mutex_exit(&un
->un_resync_mx
);
395 mutex_exit(&un
->un_rrp_inflight_mx
);
401 process_resync_regions_owner(mm_unit_t
*un
)
404 int cleared_dirty
= 0;
405 /* Number of reasons why we can not proceed shutting down the mirror. */
407 set_t setno
= MD_UN2SET(un
);
408 int mnset
= MD_MNSET_SETNO(setno
);
409 md_mn_msg_rr_clean_t
*rmsg
;
410 minor_t mnum
= MD_SID(un
);
411 mdi_unit_t
*ui
= MDI_UNIT(mnum
);
414 * We drop the readerlock here to assist lock ordering with
415 * update_resync. Once we have the un_rrp_inflight_mx, we
418 md_unit_readerexit(ui
);
421 * Resync region processing must be single threaded. We can't use
422 * un_resync_mx for this purpose since this mutex gets released
423 * when blocking on un_resync_cv.
425 mutex_enter(&un
->un_rrp_inflight_mx
);
427 (void) md_unit_readerlock(ui
);
429 mutex_enter(&un
->un_resync_mx
);
430 un
->un_waiting_to_clear
++;
431 while (un
->un_resync_flg
& MM_RF_STALL_CLEAN
)
432 cv_wait(&un
->un_resync_cv
, &un
->un_resync_mx
);
433 un
->un_waiting_to_clear
--;
436 rw_enter(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1],
438 cleared_dirty
= mirror_generate_rr_bitmap(un
, &rmsg
, &active
);
442 * Clear the bits from the pernode_dirty arrays.
443 * If that results in any being cleared from the
444 * un_dirty_bm, commit it.
447 start
= MDMN_MSG_RR_CLEAN_START_BIT(rmsg
);
448 end
= start
+ MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg
) * NBBY
;
449 for (i
= start
; i
< end
; i
++) {
450 if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg
),
452 if (IS_PERNODE_DIRTY(md_mn_mynode_id
, i
,
454 un
->un_pernode_dirty_sum
[i
]--;
456 md_mn_mynode_id
, i
, un
);
458 if (un
->un_pernode_dirty_sum
[i
] == 0) {
460 CLR_REGION_DIRTY(i
, un
);
461 CLR_GOING_CLEAN(i
, un
);
465 kmem_free(rmsg
, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg
));
467 rw_exit(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1]);
469 for (i
= 0; i
< un
->un_rrd_num
; i
++) {
470 if (un
->c
.un_status
& MD_UN_KEEP_DIRTY
)
471 if (IS_KEEPDIRTY(i
, un
))
474 if (!IS_REGION_DIRTY(i
, un
))
476 if (un
->un_outstanding_writes
[i
] != 0) {
481 if (!IS_GOING_CLEAN(i
, un
)) {
482 SET_GOING_CLEAN(i
, un
);
486 CLR_REGION_DIRTY(i
, un
);
487 CLR_GOING_CLEAN(i
, un
);
493 un
->un_resync_flg
|= MM_RF_GATECLOSED
;
494 mutex_exit(&un
->un_resync_mx
);
495 mddb_commitrec_wrapper(un
->un_rr_dirty_recid
);
496 mutex_enter(&un
->un_resync_mx
);
497 un
->un_resync_flg
&= ~MM_RF_GATECLOSED
;
499 if (un
->un_waiting_to_mark
!= 0 ||
500 un
->un_waiting_to_clear
!= 0) {
502 cv_broadcast(&un
->un_resync_cv
);
505 mutex_exit(&un
->un_resync_mx
);
507 mutex_exit(&un
->un_rrp_inflight_mx
);
513 process_resync_regions(mm_unit_t
*un
, callb_cpr_t
*cprinfop
)
515 int mnset
= MD_MNSET_SETNO(MD_UN2SET(un
));
517 * For a mirror we can only update the on-disk resync-record if we
518 * currently own the mirror. If we are called and there is no owner we
519 * bail out before scanning the outstanding_writes[] array.
520 * NOTE: we only need to check here (before scanning the array) as we
521 * are called with the readerlock held. This means that a change
522 * of ownership away from us will block until this resync check
525 if (mnset
&& (MD_MN_NO_MIRROR_OWNER(un
) ||
526 (!MD_MN_MIRROR_OWNER(un
) && !md_mn_is_commd_present_lite()))) {
528 } else if (mnset
&& !MD_MN_MIRROR_OWNER(un
)) {
529 return (process_resync_regions_non_owner(un
, cprinfop
));
531 return (process_resync_regions_owner(un
));
536 * Function that is callable from other modules to provide
537 * ability to cleanup dirty region bitmap on demand. Used
538 * on last close of a unit to avoid massive device resyncs
539 * when coming back after rolling large amounts of data to
540 * a mirror (e.g. at umount with logging).
544 mirror_process_unit_resync(mm_unit_t
*un
)
548 while (process_resync_regions(un
, NULL
)) {
551 if (cleans
>= md_mirror_rr_cleans
) {
553 "Could not clean resync regions\n");
556 if (cleans
> md_mirror_rr_polls
) {
558 * We did not make it with md_mirror_rr_polls
559 * iterations. Give the system relief and
560 * switch over to non-busy-wait.
562 delay(md_mirror_rr_sleep_timo
* md_hz
);
568 check_resync_regions(daemon_request_t
*timeout
)
575 rw_enter(&mirror_md_ops
.md_link_rw
.lock
, RW_READER
);
576 for (next
= mirror_md_ops
.md_head
; next
!= NULL
; next
= next
->ln_next
) {
578 if (md_get_setstatus(next
->ln_setno
) & MD_SET_STALE
)
581 un
= MD_UNIT(next
->ln_id
);
584 * Register this resync thread with the CPR mechanism. This
585 * allows us to detect when the system is suspended and so
586 * keep track of the RPC failure condition.
588 CALLB_CPR_INIT(&cprinfo
, &un
->un_prr_cpr_mx
, callb_md_mrs_cpr
,
589 "check_resync_regions");
591 ui
= MDI_UNIT(next
->ln_id
);
592 (void) md_unit_readerlock(ui
);
595 * Do not clean up resync regions if it is an ABR
596 * mirror, or if a submirror is offline (we will use the resync
597 * region to resync when back online) or if there is only one
600 if ((ui
->ui_tstate
& MD_ABR_CAP
) ||
601 (un
->c
.un_status
& MD_UN_OFFLINE_SM
) || (un
->un_nsm
< 2)) {
602 md_unit_readerexit(ui
);
606 (void) process_resync_regions(un
, &cprinfo
);
608 md_unit_readerexit(ui
);
610 /* Remove this thread from the CPR callback table. */
611 mutex_enter(&un
->un_prr_cpr_mx
);
612 CALLB_CPR_EXIT(&cprinfo
);
615 rw_exit(&mirror_md_ops
.md_link_rw
.lock
);
618 mutex_enter(&mirror_timeout
.dr_mx
);
619 timeout
->dr_pending
= 0;
620 mutex_exit(&mirror_timeout
.dr_mx
);
624 md_mirror_timeout(void *throwaway
)
627 mutex_enter(&mirror_timeout
.dr_mx
);
628 if (!mirror_timeout
.dr_pending
) {
629 mirror_timeout
.dr_pending
= 1;
630 daemon_request(&md_mto_daemon
, check_resync_regions
,
631 (daemon_queue_t
*)&mirror_timeout
, REQ_OLD
);
634 if (mirror_md_ops
.md_head
!= NULL
)
635 mirror_timeout
.dr_timeout_id
= timeout(md_mirror_timeout
,
636 throwaway
, (int)MD_MDELAY
*hz
);
638 mirror_timeout
.dr_timeout_id
= 0;
640 mutex_exit(&mirror_timeout
.dr_mx
);
644 resync_start_timeout(set_t setno
)
646 if (md_get_setstatus(setno
) & MD_SET_STALE
)
649 mutex_enter(&mirror_timeout
.dr_mx
);
650 if (mirror_timeout
.dr_timeout_id
== 0)
651 mirror_timeout
.dr_timeout_id
= timeout(md_mirror_timeout
,
652 (void *)NULL
, (int)MD_MDELAY
*hz
);
653 mutex_exit(&mirror_timeout
.dr_mx
);
657 offlined_to_attached(mm_unit_t
*un
)
662 if (md_get_setstatus(MD_UN2SET(un
)) & MD_SET_STALE
)
665 for (i
= 0; i
< NMIRROR
; i
++) {
666 if (SMS_BY_INDEX_IS(un
, i
, SMS_OFFLINE
)) {
667 mirror_set_sm_state(&un
->un_sm
[i
],
668 &un
->un_smic
[i
], SMS_ATTACHED
, 1);
671 if (SMS_BY_INDEX_IS(un
, i
, SMS_OFFLINE_RESYNC
)) {
672 mirror_set_sm_state(&un
->un_sm
[i
],
673 &un
->un_smic
[i
], SMS_ATTACHED_RESYNC
, 1);
679 un
->c
.un_status
&= ~MD_UN_OFFLINE_SM
;
680 mddb_setrecprivate(un
->c
.un_record_id
, MD_PRV_PENDCOM
);
685 get_unit_resync(mm_unit_t
*un
)
687 mddb_recstatus_t status
;
688 struct optim_resync
*orp
;
690 if (un
->un_rr_dirty_recid
== 0) {
691 offlined_to_attached(un
);
695 status
= mddb_getrecstatus(un
->un_rr_dirty_recid
);
696 if ((status
== MDDB_NORECORD
) || (status
== MDDB_NODATA
)) {
697 un
->un_rr_dirty_recid
= 0;
698 offlined_to_attached(un
);
702 mddb_setrecprivate(un
->un_rr_dirty_recid
, MD_PRV_GOTIT
);
703 orp
= (struct optim_resync
*)mddb_getrecaddr(un
->un_rr_dirty_recid
);
704 un
->un_dirty_bm
= orp
->or_rr
;
708 create_unit_resync(mm_unit_t
*un
, int snarfing
)
712 int blksize
; /* rr size in blocks */
715 size_t size
; /* bitmap size */
720 tb
= un
->c
.un_total_blocks
;
722 if (((tb
+ MD_MIN_RR_SIZE
)/ MD_MIN_RR_SIZE
) > MD_DEF_NUM_RR
) {
723 blksize
= (int)(tb
/ MD_DEF_NUM_RR
);
724 num_rr
= (int)((tb
+ (blksize
)) / (blksize
));
726 blksize
= MD_MIN_RR_SIZE
;
727 num_rr
= (int)((tb
+ MD_MIN_RR_SIZE
) / MD_MIN_RR_SIZE
);
730 size
= howmany(num_rr
, NBBY
) + sizeof (*orp
) - sizeof (orp
->or_rr
);
732 setno
= MD_UN2SET(un
);
734 typ1
= (mddb_type_t
)md_getshared_key(setno
,
735 mirror_md_ops
.md_driver
.md_drivername
);
737 recid
= mddb_createrec(size
, typ1
, RESYNC_REC
,
738 MD_CRO_OPTIMIZE
|MD_CRO_32BIT
, setno
);
740 if (snarfing
&& !(md_get_setstatus(setno
) & MD_SET_STALE
)) {
741 md_set_setstatus(setno
, MD_SET_STALE
);
742 cmn_err(CE_WARN
, "md: state database is stale");
747 un
->un_rr_dirty_recid
= recid
;
748 orp
= (optim_resync_t
*)mddb_getrecaddr(recid
);
749 orp
->or_magic
= OR_MAGIC
;
750 orp
->or_blksize
= blksize
;
751 orp
->or_num
= num_rr
;
753 un
->un_rrd_blksize
= blksize
;
754 un
->un_rrd_num
= num_rr
;
755 un
->un_dirty_bm
= orp
->or_rr
;
758 for (i
= 0; i
< howmany(num_rr
, NBBY
); i
++)
759 orp
->or_rr
[i
] = 0xFF;
762 mddb_commitrec_wrapper(recid
);
763 mirror_commit(un
, NO_SUBMIRRORS
, 0);
766 mddb_setrecprivate(recid
, MD_PRV_PENDCOM
);
767 mddb_setrecprivate(un
->c
.un_record_id
, MD_PRV_PENDCOM
);
772 unit_setup_resync(mm_unit_t
*un
, int snarfing
)
777 mdi_unit_t
*ui
= MDI_UNIT(MD_SID(un
));
778 int nonABR
= 1; /* only set if ABR marked in ui_tstate */
780 un
->un_dirty_bm
= NULL
;
781 un
->un_rs_buffer
= NULL
;
783 mutex_init(&un
->un_rrp_inflight_mx
, "rrp mx", MUTEX_DEFAULT
, NULL
);
785 mutex_init(&un
->un_resync_mx
, NULL
, MUTEX_DEFAULT
, NULL
);
786 cv_init(&un
->un_resync_cv
, NULL
, CV_DEFAULT
, NULL
);
787 un
->un_resync_flg
= 0;
788 un
->un_waiting_to_mark
= 0;
789 un
->un_waiting_to_commit
= 0;
790 un
->un_waiting_to_clear
= 0;
792 un
->un_goingclean_bm
= NULL
;
793 un
->un_goingdirty_bm
= NULL
;
794 un
->un_outstanding_writes
= NULL
;
795 un
->un_resync_bm
= NULL
;
800 if (un
->un_rr_dirty_recid
== 0) {
802 * If a MN diskset and snarfing and this node is not the
803 * master, do not delete any records on snarf of the
804 * mirror records (create_unit_resync deletes records).
806 * Master node should have already handled this case.
808 if (MD_MNSET_SETNO(MD_UN2SET(un
)) && snarfing
&&
809 md_set
[MD_UN2SET(un
)].s_am_i_master
== 0) {
811 cmn_err(CE_NOTE
, "unit_setup_resync: no rr for %s on"
812 " nodeid %d\n", md_shortname(MD_SID(un
)),
813 md_set
[MD_UN2SET(un
)].s_nodeid
);
817 if ((err
= create_unit_resync(un
, snarfing
)) != 0)
821 un
->un_goingclean_bm
= (uchar_t
*)kmem_zalloc((uint_t
)(howmany(
822 un
->un_rrd_num
, NBBY
)), KM_SLEEP
);
823 un
->un_goingdirty_bm
= (uchar_t
*)kmem_zalloc((uint_t
)(howmany(
824 un
->un_rrd_num
, NBBY
)), KM_SLEEP
);
825 un
->un_outstanding_writes
= (short *)kmem_zalloc(
826 (uint_t
)un
->un_rrd_num
* sizeof (short), KM_SLEEP
);
827 un
->un_resync_bm
= (uchar_t
*)kmem_zalloc((uint_t
)(howmany(
828 un
->un_rrd_num
, NBBY
)), KM_SLEEP
);
831 * Allocate pernode bitmap for this node. All other nodes' maps will
832 * be created 'on-the-fly' in the ioctl message handler
834 if (MD_MNSET_SETNO(MD_UN2SET(un
))) {
835 un
->un_pernode_dirty_sum
=
836 (uchar_t
*)kmem_zalloc(un
->un_rrd_num
, KM_SLEEP
);
837 if (md_mn_mynode_id
> 0) {
838 un
->un_pernode_dirty_bm
[md_mn_mynode_id
-1] = (uchar_t
*)
839 kmem_zalloc((uint_t
)(howmany(un
->un_rrd_num
, NBBY
)),
844 * Allocate taskq to process deferred (due to locking) RR_CLEAN
847 un
->un_drl_task
= (ddi_taskq_t
*)md_create_taskq(MD_UN2SET(un
),
851 if (md_get_setstatus(MD_UN2SET(un
)) & MD_SET_STALE
)
855 * Only mark mirror which has an associated DRL as requiring a resync.
856 * For ABR mirrors we need not set the resync record bitmap up.
858 if (ui
&& (ui
->ui_tstate
& MD_ABR_CAP
))
861 for (i
= 0, syncable
= 0; i
< NMIRROR
; i
++) {
863 if ((SUBMIRROR_IS_READABLE(un
, i
) ||
864 SMS_BY_INDEX_IS(un
, i
,
865 (SMS_OFFLINE
| SMS_OFFLINE_RESYNC
))))
870 if (snarfing
&& un
->un_pass_num
&& (syncable
> 1)) {
871 bcopy((caddr_t
)un
->un_dirty_bm
, (caddr_t
)un
->un_resync_bm
,
872 howmany(un
->un_rrd_num
, NBBY
));
874 un
->c
.un_status
|= (MD_UN_OPT_NOT_DONE
| MD_UN_WAR
);
875 un
->c
.un_status
&= ~MD_UN_OFFLINE_SM
;
876 for (i
= 0; i
< NMIRROR
; i
++) {
877 if ((SUBMIRROR_IS_READABLE(un
, i
)) ||
878 SMS_BY_INDEX_IS(un
, i
, SMS_OFFLINE_RESYNC
))
879 un
->un_sm
[i
].sm_flags
|= MD_SM_RESYNC_TARGET
;
881 if (SMS_BY_INDEX_IS(un
, i
, SMS_OFFLINE
)) {
882 un
->un_sm
[i
].sm_flags
|= MD_SM_RESYNC_TARGET
;
883 mirror_set_sm_state(&un
->un_sm
[i
],
884 &un
->un_smic
[i
], SMS_OFFLINE_RESYNC
, 1);
885 mddb_setrecprivate(un
->c
.un_record_id
,
894 * resync_kill_pending:
895 * -------------------
896 * Determine if the resync thread has been requested to terminate.
897 * Block if MD_RI_BLOCK or MD_RI_BLOCK_OWNER is set in un->un_rs_thread_flags.
898 * MD_RI_BLOCK is only set as a result of a user-initiated ioctl via metasync.
899 * MD_RI_BLOCK_OWNER is set by the ownership change of a multi-node mirror.
903 * 1 Kill requested (set MD_UN_RESYNC_CANCEL in un->c.un_status)
905 * Note: this routine may block
906 * the writerlock for <ui> will be dropped and reacquired if <mx_type>
907 * is set to MD_WRITER_HELD.
908 * the readerlock for <ui> will be dropped and reacquired if <mx_type>
909 * is set to MD_READER_HELD.
919 /* Ensure that we don't block with any mutex held */
920 if (mx_type
== MD_WRITER_HELD
) {
921 md_unit_writerexit(ui
);
922 } else if (mx_type
== MD_READER_HELD
) {
923 md_unit_readerexit(ui
);
925 mutex_enter(&un
->un_rs_thread_mx
);
926 while (un
->un_rs_thread_flags
& (MD_RI_BLOCK
|MD_RI_BLOCK_OWNER
)) {
927 cv_wait(&un
->un_rs_thread_cv
, &un
->un_rs_thread_mx
);
928 if (un
->un_rs_thread_flags
& (MD_RI_KILL
|MD_RI_SHUTDOWN
))
931 /* Determine if we've been asked to abort or shutdown gracefully */
932 if (un
->un_rs_thread_flags
& MD_RI_KILL
) {
933 un
->c
.un_status
|= MD_UN_RESYNC_CANCEL
;
935 } else if (un
->un_rs_thread_flags
& MD_RI_SHUTDOWN
) {
938 mutex_exit(&un
->un_rs_thread_mx
);
940 /* Reacquire mutex if dropped on entry */
941 if (mx_type
== MD_WRITER_HELD
) {
942 (void) md_unit_writerlock(ui
);
943 } else if (mx_type
== MD_READER_HELD
) {
944 (void) md_unit_readerlock(ui
);
950 * resync_read_buffer:
952 * Issue the resync source read for the specified start block and size.
953 * This will cause the mirror strategy routine to issue a write-after-read
954 * once this request completes successfully.
955 * If 'flag_err' is set we expect to see a write error flagged in the b_error
956 * field of the buffer created for this i/o request. If clear we do not expect
957 * to see the error flagged for write failures.
958 * Read failures will always set the B_ERROR bit which will stop the resync
962 resync_read_buffer(mm_unit_t
*un
, diskaddr_t blk
, size_t cnt
, int flag_err
)
968 sp
= kmem_cache_alloc(mirror_child_cache
, MD_ALLOCFLAGS
);
969 mirror_child_init(sp
);
972 bp
->b_edev
= makedevice(md_major
, MD_SID(un
));
973 bp
->b_flags
= B_READ
;
975 bp
->b_bcount
= dbtob(cnt
);
976 bp
->b_un
.b_addr
= un
->un_rs_buffer
;
977 md_unit_readerexit(MDI_UNIT(MD_SID(un
)));
979 (void) md_mirror_strategy(bp
, MD_STR_NOTTOP
| MD_STR_MAPPED
|
980 MD_STR_WAR
| (flag_err
? MD_STR_FLAG_ERR
: 0), NULL
);
984 (void) md_unit_readerlock(MDI_UNIT(MD_SID(un
)));
985 if (bp
->b_flags
& B_ERROR
) {
988 kmem_cache_free(mirror_child_cache
, sp
);
993 * send_mn_resync_done_message
995 * At the end of a resync, send a message to all nodes to indicate that
996 * the resync is complete. The argument, flags, has the following values
998 * RESYNC_ERR - if an error occurred that terminated the resync
999 * CLEAR_OPT_NOT_DONE - Just need to clear the OPT_NOT_DONE flag
1001 * unit writerlock set on entry
1002 * Only send the message if the thread is not marked as shutting down:
1003 * [un_rs_thread_flags & MD_RI_SHUTDOWN] or being killed:
1004 * [un->c.un_status & MD_UN_RESYNC_CANCEL]
1005 * or if there has been an error that terminated the resync:
1006 * flags & RESYNC_ERR
1010 send_mn_resync_done_message(
1015 md_mn_msg_resync_t
*rmsg
= un
->un_rs_msg
;
1017 mdi_unit_t
*ui
= MDI_UNIT(MD_SID(un
));
1018 md_mn_kresult_t
*kres
;
1022 rmsg
= (md_mn_msg_resync_t
*)un
->un_rs_msg
;
1025 * Only send the message if this resync thread is still active. This
1026 * handles the case where ownership changes to different nodes during
1027 * a resync can cause multiple spurious resync_done messages to occur
1028 * when the resync completes. This happens because only one node is
1029 * the resync owner but other nodes will have their resync_unit thread
1030 * blocked in 'resync_kill_pending'
1032 mutex_enter(&un
->un_rs_thread_mx
);
1033 dont_send
= (un
->un_rs_thread_flags
& (MD_RI_KILL
|MD_RI_SHUTDOWN
)) ? 1
1035 mutex_exit(&un
->un_rs_thread_mx
);
1036 dont_send
|= (un
->c
.un_status
& MD_UN_RESYNC_CANCEL
) ? 1 : 0;
1039 * Always send a message if we've encountered an error that terminated
1042 if (flags
& RESYNC_ERR
)
1047 if (mirror_debug_flag
) {
1048 printf("Don't send resync done message, mnum = %x,"
1049 " type = %x, flags = %d\n", MD_SID(un
),
1050 un
->un_rs_type
, flags
);
1057 if (mirror_debug_flag
) {
1058 printf("send resync done message, mnum = %x, type = %x\n",
1059 MD_SID(un
), un
->un_rs_type
);
1063 rmsg
->msg_resync_mnum
= MD_SID(un
);
1064 rmsg
->msg_resync_type
= un
->un_rs_type
;
1065 rmsg
->msg_originator
= md_mn_mynode_id
;
1066 rmsg
->msg_resync_flags
= 0;
1067 if (flags
& RESYNC_ERR
)
1068 rmsg
->msg_resync_flags
|= MD_MN_RS_ERR
;
1069 if (flags
& CLEAR_OPT_NOT_DONE
)
1070 rmsg
->msg_resync_flags
|= MD_MN_RS_CLEAR_OPT_NOT_DONE
;
1072 setno
= MD_MIN2SET(MD_SID(un
));
1073 md_unit_writerexit(ui
);
1074 kres
= kmem_alloc(sizeof (md_mn_kresult_t
), KM_SLEEP
);
1076 mutex_enter(&un
->un_rs_cpr_mx
);
1077 CALLB_CPR_SAFE_BEGIN(&un
->un_rs_cprinfo
);
1079 rval
= mdmn_ksend_message(setno
, MD_MN_MSG_RESYNC_PHASE_DONE
,
1080 MD_MSGF_NO_LOG
, 0, (char *)rmsg
, sizeof (md_mn_msg_resync_t
), kres
);
1082 CALLB_CPR_SAFE_END(&un
->un_rs_cprinfo
, &un
->un_rs_cpr_mx
);
1083 mutex_exit(&un
->un_rs_cpr_mx
);
1085 /* if the node hasn't yet joined, it's Ok. */
1086 if ((!MDMN_KSEND_MSG_OK(rval
, kres
)) &&
1087 (kres
->kmmr_comm_state
!= MDMNE_NOT_JOINED
)) {
1088 mdmn_ksend_show_error(rval
, kres
, "RESYNC_PHASE_DONE");
1089 /* If we're shutting down already, pause things here. */
1090 if (kres
->kmmr_comm_state
== MDMNE_RPC_FAIL
) {
1091 while (!md_mn_is_commd_present()) {
1095 cmn_err(CE_PANIC
, "ksend_message failure: RESYNC_PHASE_DONE");
1097 kmem_free(kres
, sizeof (md_mn_kresult_t
));
1098 (void) md_unit_writerlock(ui
);
1102 * send_mn_resync_next_message
1104 * Sent a message to all nodes indicating the next region to be resynced.
1105 * The message contains the region to be resynced and the current position in
1106 * the resync as denoted by un_rs_resync_done and un_rs_resync_2_do.
1107 * On entry the unit readerlock is held.
1110 send_mn_resync_next_message(
1112 diskaddr_t currentblk
,
1117 md_mn_msg_resync_t
*rmsg
= un
->un_rs_msg
;
1119 md_mn_kresult_t
*kres
;
1120 mdi_unit_t
*ui
= MDI_UNIT(MD_SID(un
));
1126 ASSERT(rmsg
!= NULL
);
1128 if (mirror_debug_flag
) {
1129 printf("send resync next message, mnum = %x, start=%lld, "
1130 "size=%ld, type=%x, done=%lld, 2_do=%lld\n",
1131 MD_SID(un
), currentblk
, rsize
, un
->un_rs_type
,
1132 un
->un_rs_resync_done
, un
->un_rs_resync_2_do
);
1135 rmsg
->msg_resync_mnum
= MD_SID(un
);
1136 rmsg
->msg_resync_type
= un
->un_rs_type
;
1137 rmsg
->msg_resync_start
= currentblk
;
1138 rmsg
->msg_resync_rsize
= rsize
;
1139 rmsg
->msg_resync_done
= un
->un_rs_resync_done
;
1140 rmsg
->msg_resync_2_do
= un
->un_rs_resync_2_do
;
1141 rmsg
->msg_originator
= md_mn_mynode_id
;
1142 if (flags
& MD_FIRST_RESYNC_NEXT
)
1143 rmsg
->msg_resync_flags
= MD_MN_RS_FIRST_RESYNC_NEXT
;
1146 * Copy current submirror state and flags into message. This provides
1147 * a means of keeping all nodes that are currently active in the cluster
1148 * synchronised with regards to their submirror state settings. If we
1149 * did not pass this information here, the only time every node gets
1150 * submirror state updated is at the end of a resync phase. This can be
1151 * a significant amount of time for large metadevices.
1153 for (smi
= 0; smi
< NMIRROR
; smi
++) {
1154 sm
= &un
->un_sm
[smi
];
1155 rmsg
->msg_sm_state
[smi
] = sm
->sm_state
;
1156 rmsg
->msg_sm_flags
[smi
] = sm
->sm_flags
;
1158 setno
= MD_MIN2SET(MD_SID(un
));
1159 md_unit_readerexit(ui
);
1160 kres
= kmem_alloc(sizeof (md_mn_kresult_t
), KM_SLEEP
);
1162 mutex_enter(&un
->un_rs_cpr_mx
);
1163 CALLB_CPR_SAFE_BEGIN(&un
->un_rs_cprinfo
);
1165 rval
= mdmn_ksend_message(setno
, MD_MN_MSG_RESYNC_NEXT
, MD_MSGF_NO_LOG
,
1166 0, (char *)rmsg
, sizeof (md_mn_msg_resync_t
), kres
);
1168 CALLB_CPR_SAFE_END(&un
->un_rs_cprinfo
, &un
->un_rs_cpr_mx
);
1169 mutex_exit(&un
->un_rs_cpr_mx
);
1171 if (!MDMN_KSEND_MSG_OK(rval
, kres
)) {
1172 mdmn_ksend_show_error(rval
, kres
, "RESYNC_NEXT");
1173 /* If we're shutting down already, pause things here. */
1174 if (kres
->kmmr_comm_state
== MDMNE_RPC_FAIL
) {
1175 while (!md_mn_is_commd_present()) {
1179 cmn_err(CE_PANIC
, "ksend_message failure: RESYNC_NEXT");
1181 kmem_free(kres
, sizeof (md_mn_kresult_t
));
1182 (void) md_unit_readerlock(ui
);
1183 ps
= un
->un_rs_prev_overlap
;
1185 /* Allocate previous overlap reference if needed */
1187 ps
= kmem_cache_alloc(mirror_parent_cache
, MD_ALLOCFLAGS
);
1190 ps
->ps_firstblk
= 0;
1193 md_unit_readerexit(ui
);
1194 (void) md_unit_writerlock(ui
);
1195 un
->un_rs_prev_overlap
= ps
;
1196 md_unit_writerexit(ui
);
1197 (void) md_unit_readerlock(ui
);
1200 ps
->ps_firstblk
= currentblk
;
1201 ps
->ps_lastblk
= currentblk
+ rsize
- 1;
1205 resync_read_blk_range(
1207 diskaddr_t currentblk
,
1208 diskaddr_t stopbefore
,
1213 size_t copysize
; /* limited by max xfer buf size */
1214 size_t rsize
; /* size of resync block (for MN) */
1217 diskaddr_t rs_startblk
;
1219 int flags1
= flags
& MD_FIRST_RESYNC_NEXT
;
1221 rs_type
= un
->un_rs_type
;
1222 rs_startblk
= currentblk
;
1223 if (stopbefore
> un
->c
.un_total_blocks
)
1224 stopbefore
= un
->c
.un_total_blocks
;
1225 if (currentblk
< un
->un_resync_startbl
)
1226 currentblk
= un
->un_resync_startbl
;
1228 copysize
= un
->un_rs_copysize
;
1229 rsize
= MD_DEF_RESYNC_BLK_SZ
;
1231 setno
= MD_MIN2SET(MD_SID(un
));
1232 while (currentblk
< stopbefore
) {
1234 * Split the block up into units of MD_DEF_RESYNC_BLK_SZ and
1235 * if a MN device and sendflag is set, send a RESYNC_MESSAGE
1238 if ((currentblk
+ MD_DEF_RESYNC_BLK_SZ
) > stopbefore
)
1239 rsize
= stopbefore
- currentblk
;
1240 if (MD_MNSET_SETNO(setno
) && (flags
& MD_SEND_MESS_XMIT
)) {
1241 un
->un_resync_startbl
= currentblk
;
1242 rs_startblk
= currentblk
;
1243 send_mn_resync_next_message(un
, currentblk
, rsize
,
1247 /* check to see if we've been asked to terminate */
1248 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)), type
))
1249 return ((un
->c
.un_status
& MD_UN_RESYNC_CANCEL
)
1252 * Check to see if another node has completed this
1253 * block, if so either the type or the resync region
1254 * will have changed. If the resync type has changed,
1256 * If the resync region has changed, reset currentblk
1257 * to the start of the current resync region and
1260 if (un
->un_rs_type
!= rs_type
)
1262 if (un
->un_rs_prev_overlap
->ps_firstblk
>
1265 un
->un_rs_prev_overlap
->ps_firstblk
;
1269 newstop
= currentblk
+ rsize
;
1270 while (currentblk
< newstop
) {
1271 if ((currentblk
+ copysize
) > stopbefore
)
1272 copysize
= (size_t)(stopbefore
- currentblk
);
1273 if (resync_read_buffer(un
, currentblk
, copysize
,
1274 (flags
& MD_RESYNC_FLAG_ERR
)))
1277 /* resync_read_buffer releases/grabs a new lock */
1278 un
= (mm_unit_t
*)MD_UNIT(MD_SID(un
));
1279 currentblk
+= copysize
;
1281 /* check to see if we've been asked to terminate */
1282 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)), type
))
1283 return ((un
->c
.un_status
& MD_UN_RESYNC_CANCEL
)
1285 if (MD_MNSET_SETNO(setno
)) {
1287 * Check to see if another node has completed
1288 * this block, see above
1290 if (un
->un_rs_type
!= rs_type
)
1292 if (un
->un_rs_prev_overlap
->ps_firstblk
>
1295 un
->un_rs_prev_overlap
->ps_firstblk
;
1303 optimized_resync(mm_unit_t
*un
)
1309 uchar_t
*dirtyregions
;
1310 diskaddr_t first
, stopbefore
;
1316 uint_t old_rs_type
= un
->un_rs_type
;
1318 uint_t flags1
= MD_FIRST_RESYNC_NEXT
|MD_RESYNC_FLAG_ERR
;
1322 ui
= MDI_UNIT(mnum
);
1323 setno
= MD_UN2SET(un
);
1325 if (!(un
->c
.un_status
& MD_UN_OPT_NOT_DONE
)) {
1327 * We aren't marked as needing a resync so for multi-node
1328 * sets we flag the completion so that all nodes see the same
1329 * metadevice state. This is a problem when a new node joins
1330 * an existing set as it has to perform a 'metasync -r' and
1331 * we have to step through all of the resync phases. If we
1332 * don't do this the nodes that were already in the set will
1333 * have the metadevices marked as 'Okay' but the joining node
1334 * will have 'Needs Maintenance' which is unclearable.
1336 if (MD_MNSET_SETNO(setno
)) {
1337 send_mn_resync_done_message(un
, CLEAR_OPT_NOT_DONE
);
1343 * No need for optimized resync if ABR set, clear rs_type and flags
1346 if (ui
->ui_tstate
& MD_ABR_CAP
) {
1347 un
->un_rs_type
= MD_RS_NONE
;
1348 un
->c
.un_status
&= ~(MD_UN_OPT_NOT_DONE
| MD_UN_WAR
);
1352 un
->un_rs_dropped_lock
= 1;
1353 un
->c
.un_status
|= MD_UN_WAR
;
1354 resync_regions
= un
->un_rrd_num
;
1355 dirtyregions
= un
->un_resync_bm
;
1356 md_unit_writerexit(ui
);
1358 /* For MN sets, resync NOTIFY is done when processing resync messages */
1359 if (!MD_MNSET_SETNO(setno
)) {
1360 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_START
,
1361 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
1363 un
= (mm_unit_t
*)md_unit_readerlock(ui
);
1365 /* check to see if we've been asked to terminate */
1366 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)), MD_READER_HELD
)) {
1367 if (un
->c
.un_status
& MD_UN_RESYNC_CANCEL
)
1368 broke_out
= RESYNC_ERR
;
1371 * Check that we are still performing an optimized
1372 * resync. If not, another node must have completed it
1373 * so we have no more work to do.
1375 if (un
->un_rs_type
!= old_rs_type
) {
1376 md_unit_readerexit(ui
);
1377 (void) md_unit_writerlock(ui
);
1381 * If rs_resync_done is non-zero, we must be completing an optimized
1382 * resync that has already been partially done on another node.
1383 * Therefore clear the bits in resync_bm for the resync regions
1384 * already done. If resync_startbl is zero, calculate 2_do.
1386 if (un
->un_rs_resync_done
> 0) {
1387 BLK_TO_RR(start_rr
, un
->un_resync_startbl
, un
);
1388 for (rr
= 0; rr
< start_rr
&& rr
< resync_regions
; rr
++)
1389 CLR_KEEPDIRTY(rr
, un
);
1391 un
->un_rs_resync_2_do
= 0;
1392 for (rr
= 0; rr
< resync_regions
; rr
++)
1393 if (isset(dirtyregions
, rr
))
1394 un
->un_rs_resync_2_do
++;
1397 for (rr
= 0; (rr
< resync_regions
) && (broke_out
!= RESYNC_ERR
); rr
++) {
1398 if (isset(dirtyregions
, rr
)) {
1399 RR_TO_BLK(first
, rr
, un
);
1400 RR_TO_BLK(stopbefore
, rr
+1, un
);
1401 old_rs_type
= un
->un_rs_type
;
1402 old_rs_done
= un
->un_rs_resync_done
;
1403 err
= resync_read_blk_range(un
, first
, stopbefore
,
1404 MD_READER_HELD
, MD_SEND_MESS_XMIT
| flags1
);
1405 flags1
= MD_RESYNC_FLAG_ERR
;
1407 /* resync_read_blk_range releases/grabs a new lock */
1408 un
= (mm_unit_t
*)MD_UNIT(mnum
);
1411 broke_out
= RESYNC_ERR
;
1416 * Check that we are still performing an optimized
1417 * resync. If not, another node must have completed it
1418 * so we have no more work to do.
1420 if (un
->un_rs_type
!= old_rs_type
) {
1421 md_unit_readerexit(ui
);
1422 (void) md_unit_writerlock(ui
);
1427 * If resync_done has increased, we must have
1428 * blocked in resync_read_blk_range while another node
1429 * continued with the resync. Therefore clear resync_bm
1430 * for the blocks that have been resynced on another
1431 * node and update rr to the next RR to be done.
1433 if (old_rs_done
< un
->un_rs_resync_done
) {
1435 BLK_TO_RR(start_rr
, un
->un_resync_startbl
- 1,
1437 for (i
= rr
; i
< start_rr
; i
++)
1438 CLR_KEEPDIRTY(i
, un
);
1441 un
->un_rs_resync_done
++;
1443 for (smi
= 0, cnt
= 0; smi
< NMIRROR
; smi
++)
1444 if (SUBMIRROR_IS_WRITEABLE(un
, smi
) &&
1445 !(SMS_BY_INDEX_IS(un
, smi
, SMS_ALL_ERRED
)))
1448 broke_out
= RESYNC_ERR
;
1451 CLR_KEEPDIRTY(rr
, un
);
1452 /* Check to see if we've completed the resync cleanly */
1453 if (un
->un_rs_thread_flags
& MD_RI_SHUTDOWN
)
1457 * Check that we haven't exceeded un_rs_resync_2_do. If
1458 * we have we've completed the resync.
1460 if (un
->un_rs_resync_done
> un
->un_rs_resync_2_do
)
1464 md_unit_readerexit(ui
);
1465 un
= (mm_unit_t
*)md_unit_writerlock(ui
);
1468 * If MN set send message to all nodes to indicate resync
1469 * phase is complete. The processing of the message will update the
1472 if (MD_MNSET_SETNO(setno
)) {
1473 send_mn_resync_done_message(un
, broke_out
);
1477 un
->c
.un_status
&= ~MD_UN_WAR
;
1479 un
->c
.un_status
&= ~MD_UN_KEEP_DIRTY
;
1481 setno
= MD_UN2SET(un
);
1482 for (smi
= 0; smi
< NMIRROR
; smi
++) {
1483 un
->un_sm
[smi
].sm_flags
&= ~MD_SM_RESYNC_TARGET
;
1484 if (SMS_BY_INDEX_IS(un
, smi
, SMS_OFFLINE_RESYNC
)) {
1485 state
= (broke_out
? SMS_OFFLINE
: SMS_RUNNING
);
1486 mirror_set_sm_state(&un
->un_sm
[smi
],
1487 &un
->un_smic
[smi
], state
, broke_out
);
1488 mirror_commit(un
, NO_SUBMIRRORS
, 0);
1490 if (SMS_BY_INDEX_IS(un
, smi
, SMS_OFFLINE
))
1491 un
->c
.un_status
|= MD_UN_OFFLINE_SM
;
1495 /* For MN sets, resync NOTIFY is done when processing resync messages */
1496 if (!MD_MNSET_SETNO(setno
)) {
1498 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_FAILED
,
1499 SVM_TAG_METADEVICE
, MD_UN2SET(un
), MD_SID(un
));
1501 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_DONE
,
1502 SVM_TAG_METADEVICE
, MD_UN2SET(un
), MD_SID(un
));
1508 * recalc_resync_done
1510 * This function deals with a change in value of un_rs_resync_2_do in a
1511 * component resync. This may change if we are restarting a component
1512 * resync on a single node having rebooted with a different value of
1513 * md_resync_bufsz or if we are running in a multi-node with nodes having
1514 * different values of md_resync_bufsz.
1515 * If there is a change in un_rs_resync_2_do, we need to recalculate
1516 * the value of un_rs_resync_done given the new value for resync_2_do.
1517 * We have to calculate a new value for resync_done to be either
1518 * if un_resync_startbl is set, (un_resync_startbl - initblock)/(blksize + skip)
1519 * or if it is not set, we need to calculate it from un_rs_resync_done,
1520 * (un_rs_resync_done/un_rs_resync_2_do) * resync_2_do
1521 * In addition we need to deal with the overflow case by using a factor to
1526 recalc_resync_done(mm_unit_t
*un
, size_t resync_2_do
, diskaddr_t initblock
,
1527 u_longlong_t blk_size
, u_longlong_t skip
)
1533 * If resync_2_do has not yet been calculated, no need to modify
1536 if (un
->un_rs_resync_2_do
== 0) {
1539 if (un
->un_rs_resync_2_do
== resync_2_do
)
1540 return; /* No change, so nothing to do */
1542 * If un_rs_startbl is set, another node must have already started
1543 * this resync and hence we can calculate resync_done from
1546 if (un
->un_resync_startbl
) {
1547 un
->un_rs_resync_done
= (un
->un_resync_startbl
- initblock
) /
1552 * un_resync_startbl is not set so we must calculate it from
1553 * un_rs_resync_done.
1554 * If the larger of the two values of resync_2_do is greater than 32
1555 * bits, calculate a factor to divide by to ensure that we don't
1556 * overflow 64 bits when calculating the new value for resync_done
1558 x
= (un
->un_rs_resync_2_do
> resync_2_do
) ? un
->un_rs_resync_2_do
:
1560 while (x
> INT32_MAX
) {
1562 factor
= factor
<< 1;
1564 un
->un_rs_resync_done
= ((un
->un_rs_resync_done
/factor
) *
1565 (resync_2_do
/factor
)) /
1566 ((un
->un_rs_resync_2_do
+ (factor
* factor
) - 1)/
1571 check_comp_4_resync(mm_unit_t
*un
, int smi
, int ci
)
1576 mm_submirror_ic_t
*smic
;
1580 u_longlong_t blk_size
;
1581 diskaddr_t initblock
;
1583 diskaddr_t frag
= 0;
1584 md_m_shared_t
*shared
;
1589 uint_t old_rs_type
= un
->un_rs_type
;
1590 diskaddr_t old_rs_done
;
1591 uint_t flags1
= MD_FIRST_RESYNC_NEXT
;
1592 diskaddr_t resync_2_do
;
1595 ui
= MDI_UNIT(mnum
);
1596 sm
= &un
->un_sm
[smi
];
1597 smic
= &un
->un_smic
[smi
];
1598 setno
= MD_UN2SET(un
);
1600 shared
= (md_m_shared_t
*)(*(smic
->sm_shared_by_indx
))
1601 (sm
->sm_dev
, sm
, ci
);
1603 if (shared
->ms_state
!= CS_RESYNC
) {
1604 SET_RS_TYPE_NONE(un
->un_rs_type
);
1608 if (shared
->ms_flags
& MDM_S_RS_TRIED
) {
1609 SET_RS_TYPE_NONE(un
->un_rs_type
);
1613 (void) (*(smic
->sm_get_bcss
))
1614 (sm
->sm_dev
, sm
, ci
, &initblock
, &count
, &skip
, &size
);
1616 if ((count
== 1) && (skip
== 0)) {
1617 count
= (size_t)(size
/ un
->un_rs_copysize
);
1618 if ((frag
= (size
- (count
* un
->un_rs_copysize
))) != 0)
1620 size
= (u_longlong_t
)un
->un_rs_copysize
;
1622 blk_size
= size
; /* Save block size for this resync */
1625 resync_2_do
= count
;
1627 * If part way through a resync, un_rs_resync_done/un_rs_resync_2_do
1628 * gives the proportion of the resync that has already been done.
1629 * If un_rs_copysize has changed since this previous partial resync,
1630 * either because this node has been rebooted with a different value
1631 * for md_resync_bufsz or because another node with a different value
1632 * for md_resync_bufsz performed the previous resync, we need to
1633 * recalculate un_rs_resync_done as a proportion of our value of
1636 recalc_resync_done(un
, resync_2_do
, initblock
, blk_size
, skip
);
1639 * For MN mirrors we need to send a message to all nodes indicating
1640 * the next region to be resynced. For a component resync, the size of
1641 * the contiguous region that is processed by resync_read_blk_range()
1642 * may be small if there is the interleave size.
1643 * Therefore, rather than sending the message within
1644 * resync_read_blk_range(), we will send a message every
1645 * MD_DEF_RESYNC_BLK_SZ blocks. Calculate the frequency in terms of
1646 * the number of blocks. Then, if we are restarting a resync, round
1647 * un_rs_resync_done down to the previous resync region boundary. This
1648 * ensures that we send a RESYNC_NEXT message before resyncing any
1651 if (MD_MNSET_SETNO(setno
)) {
1652 blks
= ((MD_DEF_RESYNC_BLK_SZ
+ blk_size
+ skip
- 1)/
1654 un
->un_rs_resync_done
= (un
->un_rs_resync_done
/blks
) * blks
;
1657 * un_rs_resync_done is the number of ('size' + 'skip') increments
1658 * already resynced from the base 'block'
1659 * un_rs_resync_2_do is the number of iterations in
1660 * this component resync.
1662 ASSERT(count
>= un
->un_rs_resync_done
);
1663 un
->un_rs_resync_2_do
= (diskaddr_t
)count
;
1665 un
->c
.un_status
|= MD_UN_WAR
;
1666 sm
->sm_flags
|= MD_SM_RESYNC_TARGET
;
1667 md_unit_writerexit(ui
);
1669 /* For MN sets, resync NOTIFY is done when processing resync messages */
1670 if (!MD_MNSET_SETNO(setno
)) {
1671 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_START
,
1672 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
1674 un
= (mm_unit_t
*)md_unit_readerlock(ui
);
1676 /* check to see if we've been asked to terminate */
1677 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)), MD_READER_HELD
)) {
1678 if (un
->c
.un_status
& MD_UN_RESYNC_CANCEL
)
1679 broke_out
= RESYNC_ERR
;
1682 * Check that we are still performing the same component
1683 * resync. If not, another node must have completed it
1684 * so we have no more work to do.
1686 if (un
->un_rs_type
!= old_rs_type
) {
1687 md_unit_readerexit(ui
);
1688 (void) md_unit_writerlock(ui
);
1692 * Adjust resync_done, resync_2_do, start of resync area and count to
1693 * skip already resync'd data. We need to recalculate resync_done as
1694 * we have dropped the unit lock above and may have lost ownership to
1695 * another node, with a different resync buffer size and it may have
1696 * sent us new values of resync_done and resync_2_do based on its
1697 * resync buffer size
1699 recalc_resync_done(un
, resync_2_do
, initblock
, blk_size
, skip
);
1700 un
->un_rs_resync_2_do
= resync_2_do
;
1701 count
-= un
->un_rs_resync_done
;
1702 block
= initblock
+ ((blk_size
+ skip
) * (int)un
->un_rs_resync_done
);
1704 un
->un_rs_dropped_lock
= 1;
1705 while ((count
> 0) && (broke_out
!= RESYNC_ERR
)) {
1706 old_rs_done
= un
->un_rs_resync_done
;
1708 * For MN mirrors send a message to the other nodes. This
1709 * message includes the size of the region that must be blocked
1712 if (MD_MNSET_SETNO(setno
)) {
1713 if ((un
->un_rs_resync_done
%blks
== 0)) {
1714 un
->un_resync_startbl
= block
;
1715 send_mn_resync_next_message(un
, block
,
1716 (blk_size
+skip
)*blks
, flags1
);
1719 * check to see if we've been asked to
1722 if (resync_kill_pending(un
,
1723 MDI_UNIT(MD_SID(un
)), MD_READER_HELD
)) {
1724 if (un
->c
.un_status
&
1725 MD_UN_RESYNC_CANCEL
) {
1726 broke_out
= RESYNC_ERR
;
1732 * Check that we are still performing the same
1733 * component resync. If not, another node must
1734 * have completed it so we have no more work to
1735 * do. Also reset count to remaining resync as
1736 * we may have lost ownership in in
1737 * send_mn_resync_next_message while another
1738 * node continued with the resync and
1739 * incremented resync_done.
1741 if (un
->un_rs_type
!= old_rs_type
) {
1742 md_unit_readerexit(ui
);
1743 (void) md_unit_writerlock(ui
);
1747 * recalculate resync_done, resync_2_do
1748 * We need to recalculate resync_done as
1749 * we have dropped the unit lock in
1750 * send_mn_resync_next_message above and may
1751 * have lost ownership to another node, with a
1752 * different resync buffer size and it may have
1753 * sent us new values of resync_done and
1754 * resync_2_do based on its resync buffer size
1756 recalc_resync_done(un
, resync_2_do
, initblock
,
1758 un
->un_rs_resync_2_do
= resync_2_do
;
1759 count
= un
->un_rs_resync_2_do
-
1760 un
->un_rs_resync_done
;
1762 * Adjust start of resync area to skip already
1765 block
= initblock
+ ((blk_size
+ skip
) *
1766 (int)un
->un_rs_resync_done
);
1767 old_rs_done
= un
->un_rs_resync_done
;
1770 err
= resync_read_blk_range(un
, block
, block
+ size
,
1771 MD_READER_HELD
, MD_RESYNC_FLAG_ERR
);
1773 /* resync_read_blk_range releases/grabs a new lock */
1774 un
= (mm_unit_t
*)MD_UNIT(mnum
);
1777 broke_out
= RESYNC_ERR
;
1781 * If we are no longer resyncing this component, return as
1782 * another node has progressed the resync.
1784 if (un
->un_rs_type
!= old_rs_type
) {
1785 md_unit_readerexit(ui
);
1786 (void) md_unit_writerlock(ui
);
1791 * recalculate resync_done, resync_2_do. We need to recalculate
1792 * resync_done as we have dropped the unit lock in
1793 * resync_read_blk_range above and may have lost ownership to
1794 * another node, with a different resync buffer size and it may
1795 * have sent us new values of resync_done and resync_2_do based
1796 * on its resync buffer size
1798 recalc_resync_done(un
, resync_2_do
, initblock
, blk_size
, skip
);
1799 un
->un_rs_resync_2_do
= resync_2_do
;
1802 * Reset count to remaining resync as we may have blocked in
1803 * resync_read_blk_range while another node continued
1804 * with the resync and incremented resync_done. Also adjust
1805 * start of resync area to skip already resync'd data.
1807 count
= un
->un_rs_resync_2_do
- un
->un_rs_resync_done
;
1808 block
= initblock
+((blk_size
+ skip
) *
1809 (int)un
->un_rs_resync_done
);
1812 * If we are picking up from another node, we retry the last
1813 * block otherwise step on to the next block
1815 if (old_rs_done
== un
->un_rs_resync_done
) {
1816 block
+= blk_size
+ skip
;
1817 un
->un_rs_resync_done
++;
1821 if ((count
== 1) && frag
)
1823 if (shared
->ms_state
== CS_ERRED
) {
1825 broke_out
= RESYNC_ERR
;
1829 /* Check to see if we've completed the resync cleanly */
1830 if (un
->un_rs_thread_flags
& MD_RI_SHUTDOWN
)
1834 md_unit_readerexit(ui
);
1835 un
= (mm_unit_t
*)md_unit_writerlock(ui
);
1838 * If MN set send message to all nodes to indicate resync
1839 * phase is complete. The processing of the message will update the
1842 if (MD_MNSET_SETNO(setno
)) {
1843 send_mn_resync_done_message(un
, broke_out
);
1845 un
->c
.un_status
&= ~MD_UN_WAR
;
1846 sm
->sm_flags
&= ~MD_SM_RESYNC_TARGET
;
1849 shared
->ms_flags
|= MDM_S_RS_TRIED
;
1852 * As we don't transmit the changes,
1853 * no need to drop the lock.
1855 set_sm_comp_state(un
, smi
, ci
, CS_OKAY
, 0,
1856 MD_STATE_NO_XMIT
, (IOLOCK
*)NULL
);
1859 /* For MN sets, resync NOTIFY is done when processing resync messages */
1860 if (!MD_MNSET_SETNO(setno
)) {
1862 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_FAILED
,
1863 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
1865 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_DONE
,
1866 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
1868 SET_RS_TYPE_NONE(un
->un_rs_type
);
1873 submirror_resync(mm_unit_t
*un
)
1878 mm_submirror_ic_t
*smic
;
1887 int flags1
= MD_FIRST_RESYNC_NEXT
;
1891 ui
= MDI_UNIT(mnum
);
1892 setno
= MD_UN2SET(un
);
1895 * If the submirror_index is non-zero, we are continuing a resync
1896 * so restart resync from last submirror marked as being resynced.
1898 if (RS_SMI(un
->un_rs_type
) != 0) {
1899 smi
= RS_SMI(un
->un_rs_type
);
1900 sm
= &un
->un_sm
[smi
];
1901 smic
= &un
->un_smic
[smi
];
1902 if (!SMS_IS(sm
, SMS_ATTACHED_RESYNC
)) {
1903 for (smi
= 0; smi
< NMIRROR
; smi
++) {
1904 sm
= &un
->un_sm
[smi
];
1905 smic
= &un
->un_smic
[smi
];
1906 if (SMS_IS(sm
, SMS_ATTACHED_RESYNC
))
1911 for (smi
= 0; smi
< NMIRROR
; smi
++) {
1912 sm
= &un
->un_sm
[smi
];
1913 smic
= &un
->un_smic
[smi
];
1914 if (SMS_IS(sm
, SMS_ATTACHED_RESYNC
))
1918 if (smi
== NMIRROR
) {
1919 SET_RS_TYPE_NONE(un
->un_rs_type
);
1924 * If we've only got one component we can fail on a resync write
1925 * if an error is encountered. This stops an unnecessary read of the
1926 * whole mirror on a target write error.
1928 compcnt
= (*(smic
->sm_get_component_count
))(sm
->sm_dev
, sm
);
1930 flags1
|= MD_RESYNC_FLAG_ERR
;
1932 un
->c
.un_status
|= MD_UN_WAR
;
1933 sm
->sm_flags
|= MD_SM_RESYNC_TARGET
;
1934 SET_RS_SMI(un
->un_rs_type
, smi
);
1935 md_unit_writerexit(ui
);
1937 /* For MN sets, resync NOTIFY is done when processing resync messages */
1938 if (!MD_MNSET_SETNO(setno
)) {
1939 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_START
,
1940 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
1942 un
= (mm_unit_t
*)md_unit_readerlock(ui
);
1944 un
->un_rs_dropped_lock
= 1;
1946 /* check to see if we've been asked to terminate */
1947 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)), MD_READER_HELD
)) {
1948 if (un
->c
.un_status
& MD_UN_RESYNC_CANCEL
)
1949 broke_out
= RESYNC_ERR
;
1952 * Check that we are still performing the same submirror
1953 * resync. If not, another node must have completed it
1954 * so we have no more work to do.
1956 if (RS_TYPE(un
->un_rs_type
) != MD_RS_SUBMIRROR
) {
1957 md_unit_readerexit(ui
);
1958 (void) md_unit_writerlock(ui
);
1962 /* if > 1TB mirror, increase percent done granularity */
1963 if (un
->c
.un_total_blocks
> MD_MAX_BLKS_FOR_SMALL_DEVS
)
1964 chunk
= un
->c
.un_total_blocks
/ 1000;
1966 chunk
= un
->c
.un_total_blocks
/ 100;
1968 chunk
= un
->c
.un_total_blocks
;
1970 * If a MN set, round the chunk size up to a multiple of
1971 * MD_DEF_RESYNC_BLK_SZ
1973 if (MD_MNSET_SETNO(setno
)) {
1974 chunk
= ((chunk
+ MD_DEF_RESYNC_BLK_SZ
)/MD_DEF_RESYNC_BLK_SZ
)
1975 * MD_DEF_RESYNC_BLK_SZ
;
1976 if (chunk
> un
->c
.un_total_blocks
)
1977 chunk
= un
->c
.un_total_blocks
;
1980 * Handle restartable resyncs that continue from where the previous
1981 * resync left off. The new resync range is from un_rs_resync_done ..
1985 if (un
->un_rs_resync_done
== 0) {
1986 un
->un_rs_resync_2_do
= un
->c
.un_total_blocks
;
1988 curblk
= un
->un_rs_resync_done
;
1990 while ((curblk
!= un
->c
.un_total_blocks
) && (broke_out
!= RESYNC_ERR
)) {
1993 rs_done
= un
->un_rs_resync_done
;
1994 err
= resync_read_blk_range(un
, curblk
, curblk
+ chunk
,
1995 MD_READER_HELD
, MD_SEND_MESS_XMIT
| flags1
);
1996 flags1
= (compcnt
== 1 ? MD_RESYNC_FLAG_ERR
: 0);
1998 /* resync_read_blk_range releases/grabs a new lock */
1999 un
= (mm_unit_t
*)MD_UNIT(mnum
);
2002 broke_out
= RESYNC_ERR
;
2007 * If we are no longer executing a submirror resync, return
2008 * as another node has completed the submirror resync.
2010 if (RS_TYPE(un
->un_rs_type
) != MD_RS_SUBMIRROR
) {
2011 md_unit_readerexit(ui
);
2012 (void) md_unit_writerlock(ui
);
2016 * If resync_done has changed, we must have blocked
2017 * in resync_read_blk_range while another node
2018 * continued with the resync so restart from resync_done.
2020 if (rs_done
!= un
->un_rs_resync_done
) {
2021 curblk
= un
->un_rs_resync_done
;
2024 un
->un_rs_resync_done
= curblk
;
2027 if ((curblk
+ chunk
) > un
->c
.un_total_blocks
)
2028 chunk
= un
->c
.un_total_blocks
- curblk
;
2029 for (i
= 0, cnt
= 0; i
< NMIRROR
; i
++)
2030 if (SUBMIRROR_IS_WRITEABLE(un
, i
) &&
2031 !SMS_BY_INDEX_IS(un
, i
, SMS_ALL_ERRED
) &&
2032 (un
->un_sm
[i
].sm_flags
& MD_SM_RESYNC_TARGET
))
2035 broke_out
= RESYNC_ERR
;
2039 /* Check to see if we've completed the resync cleanly */
2040 if (un
->un_rs_thread_flags
& MD_RI_SHUTDOWN
)
2043 md_unit_readerexit(ui
);
2044 un
= (mm_unit_t
*)md_unit_writerlock(ui
);
2047 * If MN set send message to all nodes to indicate resync
2048 * phase is complete. The processing of the message will update the
2051 if (MD_MNSET_SETNO(setno
)) {
2052 send_mn_resync_done_message(un
, broke_out
);
2054 sm
->sm_flags
&= ~MD_SM_RESYNC_TARGET
;
2056 mirror_set_sm_state(sm
, smic
, SMS_ATTACHED
, 1);
2058 mirror_set_sm_state(sm
, smic
, SMS_RUNNING
, 0);
2060 un
->c
.un_status
&= ~MD_UN_WAR
;
2061 mirror_commit(un
, SMI2BIT(smi
), 0);
2064 /* For MN sets, resync NOTIFY is done when processing resync messages */
2065 if (!MD_MNSET_SETNO(setno
)) {
2067 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_FAILED
,
2068 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
2070 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_DONE
,
2071 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
2077 component_resync(mm_unit_t
*un
)
2080 mm_submirror_ic_t
*smic
;
2086 * Handle the case where we are picking up a partially complete
2087 * component resync. In this case un_rs_type contains the submirror
2088 * and component index of where we should restart the resync.
2090 while (un
->un_rs_type
!= MD_RS_COMPONENT
) {
2091 i
= RS_SMI(un
->un_rs_type
);
2092 ci
= RS_CI(un
->un_rs_type
);
2093 check_comp_4_resync(un
, i
, ci
);
2094 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)),
2098 * If we have no current resync, contine to scan submirror and
2099 * components. If the resync has moved on to another component,
2100 * restart it and if the resync is no longer a component
2103 if (RS_TYPE(un
->un_rs_type
) == MD_RS_NONE
)
2105 if (RS_TYPE(un
->un_rs_type
) != MD_RS_COMPONENT
)
2108 /* Now continue scanning _all_ submirrors and components */
2109 for (i
= 0; i
< NMIRROR
; i
++) {
2111 smic
= &un
->un_smic
[i
];
2112 if (!SMS_IS(sm
, SMS_RUNNING
| SMS_LIMPING
))
2114 compcnt
= (*(smic
->sm_get_component_count
))(sm
->sm_dev
, sm
);
2115 for (ci
= 0; ci
< compcnt
; ci
++) {
2116 SET_RS_SMI(un
->un_rs_type
, i
);
2117 SET_RS_CI(un
->un_rs_type
, ci
);
2118 SET_RS_TYPE(un
->un_rs_type
, MD_RS_COMPONENT
);
2119 check_comp_4_resync(un
, i
, ci
);
2120 /* Bail out if we've been asked to abort/shutdown */
2121 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)),
2125 * Now check if another node has continued with the
2126 * resync, if we are no longer in component resync,
2127 * exit, otherwise update to the current component - 1
2128 * so that the next call of check_comp_4 resync() will
2129 * resync the current component.
2131 if ((RS_TYPE(un
->un_rs_type
) != MD_RS_NONE
) &&
2132 (RS_TYPE(un
->un_rs_type
) != MD_RS_COMPONENT
))
2135 if (RS_SMI(un
->un_rs_type
) != i
) {
2136 i
= RS_SMI(un
->un_rs_type
);
2137 ci
= RS_CI(un
->un_rs_type
) - 1;
2138 } else if (RS_CI(un
->un_rs_type
) != ci
)
2139 ci
= RS_CI(un
->un_rs_type
) - 1;
2146 reset_comp_flags(mm_unit_t
*un
)
2149 mm_submirror_ic_t
*smic
;
2150 md_m_shared_t
*shared
;
2155 for (i
= 0; i
< NMIRROR
; i
++) {
2157 smic
= &un
->un_smic
[i
];
2158 if (!SMS_IS(sm
, SMS_INUSE
))
2160 compcnt
= (*(smic
->sm_get_component_count
))(sm
->sm_dev
, sm
);
2161 for (ci
= 0; ci
< compcnt
; ci
++) {
2162 shared
= (md_m_shared_t
*)(*(smic
->sm_shared_by_indx
))
2163 (sm
->sm_dev
, sm
, ci
);
2164 shared
->ms_flags
&= ~MDM_S_RS_TRIED
;
2170 * resync_progress_thread:
2171 * ----------------------
2172 * Thread started on first resync of a unit which simply blocks until woken up
2173 * by a cv_signal, and then updates the mddb for the mirror unit record. This
2174 * saves the resync progress information (un_rs_resync_done, un_rs_resync_2_do)
2175 * so that an aborted resync can be continued after an intervening reboot.
2178 resync_progress_thread(minor_t mnum
)
2180 mm_unit_t
*un
= MD_UNIT(mnum
);
2181 mdi_unit_t
*ui
= MDI_UNIT(mnum
);
2182 set_t setno
= MD_MIN2SET(mnum
);
2184 while (un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
) {
2185 mutex_enter(&un
->un_rs_progress_mx
);
2186 cv_wait(&un
->un_rs_progress_cv
, &un
->un_rs_progress_mx
);
2187 mutex_exit(&un
->un_rs_progress_mx
);
2188 if (un
->un_rs_progress_flags
& MD_RI_KILL
)
2192 * Commit mirror unit if we're the Master node in a multi-node
2195 if (MD_MNSET_SETNO(setno
) && md_set
[setno
].s_am_i_master
) {
2196 (void) md_unit_readerlock(ui
);
2197 mirror_commit(un
, NO_SUBMIRRORS
, 0);
2198 md_unit_readerexit(ui
);
2207 * Timeout handler for updating the progress of the resync thread.
2208 * Simply wake up the resync progress daemon which will then mirror_commit() the
2209 * unit structure to the mddb. This snapshots the current progress of the resync
2212 resync_progress(void *arg
)
2214 mm_unit_t
*un
= (mm_unit_t
*)arg
;
2215 mdi_unit_t
*ui
= MDI_UNIT(MD_SID(un
));
2218 mutex_enter(&un
->un_rs_progress_mx
);
2219 cv_signal(&un
->un_rs_progress_cv
);
2220 mutex_exit(&un
->un_rs_progress_mx
);
2222 /* schedule the next timeout if the resync is still marked active */
2223 (void) md_unit_readerlock(ui
);
2224 active
= un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
? 1 : 0;
2225 md_unit_readerexit(ui
);
2227 un
->un_rs_resync_to_id
= timeout(resync_progress
, un
,
2228 (clock_t)(drv_usectohz(60000000) *
2229 md_mirror_resync_update_intvl
));
2236 * Resync thread which drives all forms of resync (optimized, component,
2237 * submirror). Must handle thread suspension and kill to allow multi-node
2238 * resync to run without undue ownership changes.
2240 * For a MN set, the reync mechanism is as follows:
2242 * When a resync is started, either via metattach, metaonline, metareplace,
2243 * metasync or by a hotspare kicking in, a message is sent to all nodes, which
2244 * calls mirror_resync_thread. If there is currently no mirror owner, the
2245 * master node sends a CHOOSE_OWNER message to the handler on the master. This
2246 * chooses a mirror owner and sends a CHANGE_OWNER message requesting the
2247 * selected node to become the owner.
2248 * If this node is not the owner it sets itself to block in resync_kill_pending
2249 * and if there is no owner all nodes will block until the chosen owner is
2250 * selected, in which case it will unblock itself. So, on entry to this
2251 * function only one node will continue past resync_kill_pending().
2252 * Once the resync thread is started, it basically cycles through the optimized,
2253 * component and submirrors resyncs until there is no more work to do.
2255 * For an ABR mirror, once a mirror owner is chosen it will complete the resync
2256 * unless the nodes dies in which case a new owner will be chosen and it will
2257 * have to complete the resync from the point at which the previous owner died.
2258 * To do this we broadcast a RESYNC_NEXT message before each region to be
2259 * resynced and this message contains the address and length of the region
2260 * being resynced and the current progress through the resync. The size of
2261 * this region is MD_DEF_RESYNC_BLK_SZ blocks. It is larger than the resync
2262 * block size to limit the amount of inter node traffic. The RESYNC_NEXT
2263 * message also indicates to all other nodes that all writes to this block
2264 * must be blocked until the next RESYNC_NEXT message is received. This ensures
2265 * that no node can write to a block that is being resynced. For all MN
2266 * mirrors we also block the whole resync region on the resync owner node so
2267 * that all writes to the resync region are blocked on all nodes. There is a
2268 * difference here between a MN set and a regular set in that for a MN set
2269 * we protect the mirror from writes to the current resync block by blocking
2270 * a larger region. For a regular set we just block writes to the current
2273 * For a non-ABR mirror the same RESYNC_NEXT message is sent with an
2274 * additional purpose. In this case, there is only one mirror owner at a time
2275 * and rather than continually switching ownership between the chosen mirror
2276 * owner and the node that is writing to the mirror, we move the resync to the
2277 * mirror owner. When we swich ownership, we block the old owner and unblock
2278 * the resync thread on the new owner. To enable the new owner to continue the
2279 * resync, all nodes need to have the latest resync status, Then, following each
2280 * resync write, we check to see if the resync state has changed and if it
2281 * has this must be because we have lost ownership to another node(s) for a
2282 * period and then have become owner again later in the resync process. If we
2283 * are still dealing with the same resync, we just adjust addresses and counts
2284 * and then continue. If the resync has moved on to a different type, for
2285 * example from an optimized to a submirror resync, we move on to process the
2286 * resync described by rs_type and continue from the position described by
2287 * resync_done and resync_startbl.
2289 * Note that for non-ABR mirrors it is possible for a write to be made on a
2290 * non resync-owner node without a change of ownership. This is the case when
2291 * the mirror has a soft part created on it and a write in ABR mode is made
2292 * to that soft part. Therefore we still need to block writes to the resync
2293 * region on all nodes.
2295 * Sending the latest resync state to all nodes also enables them to continue
2296 * a resync in the event that the mirror owner dies. If a mirror owner for
2297 * a non-ABR mirror has died, there will be dirty resync regions. Therefore,
2298 * regardless of whether another type of resync was in progress, we must first
2299 * do an optimized resync to clean up the dirty regions before continuing
2300 * with the interrupted resync.
2302 * The resync status is held in the unit structure
2304 * un_rs_resync_done The number of contiguous resync blocks done so far
2305 * un_rs_resync_2_do The total number of contiguous resync blocks
2306 * un_rs_type The resync type (inc submirror and component numbers)
2308 * un_resync_startbl The address of the current resync block being processed
2310 * In the event that the whole cluster fails we need to just use
2311 * un_rs_resync_done to restart the resync and to ensure that this is
2312 * periodically written to disk, we have a thread which writes the record
2313 * to disk every 5 minutes. As the granularity of un_rs_resync_done is
2314 * usually coarse ( for an optimized resync 1001 is the max value) there is
2315 * little point in writing this more frequently.
2318 resync_unit(minor_t mnum
)
2322 md_error_t mde
= mdnullerror
;
2324 int resync_finish
= 0;
2325 set_t setno
= MD_MIN2SET(mnum
);
2326 uint_t old_rs_type
= MD_RS_NONE
;
2327 uint_t old_rs_done
= 0, old_rs_2_do
= 0;
2328 uint_t old_rs_startbl
= 0;
2329 int block_resync
= 1;
2330 char cpr_name
[23]; /* Unique CPR name */
2336 if (mirror_debug_flag
)
2337 printf("Resync started (mnum = %x)\n", mnum
);
2340 * increment the mirror resync count
2342 mutex_enter(&md_cpr_resync
.md_resync_mutex
);
2343 md_cpr_resync
.md_mirror_resync
++;
2344 mutex_exit(&md_cpr_resync
.md_resync_mutex
);
2346 ui
= MDI_UNIT(mnum
);
2349 rs_copysize
= un
->un_rs_copysize
;
2350 if (rs_copysize
== 0) {
2352 * Don't allow buffer size to fall outside the
2353 * range 0 < bufsize <= md_max_xfer_bufsz.
2355 if (md_resync_bufsz
<= 0)
2356 md_resync_bufsz
= MD_DEF_RESYNC_BUF_SIZE
;
2357 rs_copysize
= MIN(md_resync_bufsz
, md_max_xfer_bufsz
);
2359 rs_buffer
= kmem_zalloc(dbtob(rs_copysize
), KM_SLEEP
);
2360 un
= md_unit_writerlock(ui
);
2361 un
->un_rs_copysize
= rs_copysize
;
2362 un
->un_rs_buffer
= rs_buffer
;
2364 if (MD_MNSET_SETNO(setno
)) {
2366 * Register this resync thread with the CPR mechanism. This
2367 * allows us to detect when the system is suspended and so
2368 * keep track of the RPC failure condition.
2370 (void) snprintf(cpr_name
, sizeof (cpr_name
),
2371 "mirror_resync%x", mnum
);
2372 CALLB_CPR_INIT(&un
->un_rs_cprinfo
, &un
->un_rs_cpr_mx
,
2373 callb_md_mrs_cpr
, cpr_name
);
2375 if (ui
->ui_tstate
& MD_RESYNC_NOT_DONE
) {
2377 * If this is the first resync following the initial
2378 * snarf (MD_RESYNC_NOT_DONE still set) and we've
2379 * been started outside a reconfig step (e.g. by being
2380 * added to an existing set) we need to query the
2381 * existing submirror state for this mirror.
2382 * The set_status flags will have MD_MN_SET_MIR_STATE_RC
2383 * set if we've been through a step4 reconfig, so only
2384 * query the master if this isn't (yet) set. In this
2385 * case we must continue the resync thread as there is
2386 * not guaranteed to be a currently running resync on
2387 * any of the other nodes. Worst case is that we will
2388 * initiate an ownership change to this node and then
2389 * find that there is no resync to perform. However, we
2390 * will then have correct status across the cluster.
2392 if (!md_set
[setno
].s_am_i_master
) {
2393 if (!(md_get_setstatus(setno
) &
2394 MD_SET_MN_MIR_STATE_RC
)) {
2395 mirror_get_status(un
, NULL
);
2398 if (mirror_debug_flag
) {
2401 for (i
= 0; i
< NMIRROR
; i
++) {
2413 ui
->ui_tstate
&= ~MD_RESYNC_NOT_DONE
;
2416 * For MN set, if we have an owner, then start the resync on it.
2417 * If there is no owner the master must send a message to
2418 * choose the owner. This message will contain the current
2419 * resync count and it will only be sent to the master, where
2420 * the resync count will be used to choose the next node to
2421 * perform a resync, by cycling through the nodes in the set.
2422 * The message handler will then send a CHANGE_OWNER message to
2423 * all nodes, and on receipt of that message, the chosen owner
2424 * will issue a SET_OWNER ioctl to become the owner. This ioctl
2425 * will be requested to spawn a thread to issue the
2426 * REQUEST_OWNER message to become the owner which avoids the
2427 * need for concurrent ioctl requests.
2428 * After sending the message, we will block waiting for one
2429 * of the nodes to become the owner and start the resync
2431 if (MD_MN_NO_MIRROR_OWNER(un
)) {
2433 * There is no owner, block and then the master will
2434 * choose the owner. Only perform this if 'block_resync'
2438 mutex_enter(&un
->un_rs_thread_mx
);
2439 un
->un_rs_thread_flags
|= MD_RI_BLOCK_OWNER
;
2440 mutex_exit(&un
->un_rs_thread_mx
);
2442 if (md_set
[setno
].s_am_i_master
) {
2443 md_unit_writerexit(ui
);
2444 (void) mirror_choose_owner(un
, NULL
);
2445 (void) md_unit_writerlock(ui
);
2448 /* There is an owner, block if we are not it */
2449 if (!MD_MN_MIRROR_OWNER(un
)) {
2450 mutex_enter(&un
->un_rs_thread_mx
);
2451 un
->un_rs_thread_flags
|= MD_RI_BLOCK_OWNER
;
2452 mutex_exit(&un
->un_rs_thread_mx
);
2457 * Start a timeout chain to update the resync progress to the mddb.
2458 * This will run every md_mirror_resync_update_intvl minutes and allows
2459 * a resync to be continued over a reboot.
2461 ASSERT(un
->un_rs_resync_to_id
== 0);
2462 un
->un_rs_resync_to_id
= timeout(resync_progress
, un
,
2463 (clock_t)(drv_usectohz(60000000) * md_mirror_resync_update_intvl
));
2466 * Handle resync restart from the last logged position. The contents
2467 * of un_rs_resync_2_do and un_rs_resync_done are dependent on the
2468 * type of resync that was in progress.
2470 if (MD_MNSET_SETNO(setno
)) {
2471 switch ((uint_t
)RS_TYPE(un
->un_rs_type
)) {
2473 case MD_RS_OPTIMIZED
:
2474 case MD_RS_COMPONENT
:
2475 case MD_RS_SUBMIRROR
:
2479 un
->un_rs_type
= MD_RS_NONE
;
2481 /* Allocate a resync message, if required */
2482 if (un
->un_rs_msg
== NULL
) {
2483 un
->un_rs_msg
= (md_mn_msg_resync_t
*)kmem_zalloc(
2484 sizeof (md_mn_msg_resync_t
), KM_SLEEP
);
2489 /* Check to see if we've been requested to block/kill */
2490 if (resync_kill_pending(un
, ui
, MD_WRITER_HELD
)) {
2495 un
->un_rs_dropped_lock
= 0;
2497 * Always perform an optimized resync first as this will bring
2498 * the mirror into an available state in the shortest time.
2499 * If we are resuming an interrupted resync, other than an
2500 * optimized resync, we save the type and amount done so that
2501 * we can resume the appropriate resync after the optimized
2502 * resync has completed.
2504 if ((RS_TYPE(un
->un_rs_type
) != MD_RS_NONE
) &&
2505 (RS_TYPE(un
->un_rs_type
) != MD_RS_OPTIMIZED
)) {
2506 old_rs_type
= un
->un_rs_type
;
2507 old_rs_done
= un
->un_rs_resync_done
;
2508 old_rs_2_do
= un
->un_rs_resync_2_do
;
2509 old_rs_startbl
= un
->un_resync_startbl
;
2511 SET_RS_TYPE(un
->un_rs_type
, MD_RS_OPTIMIZED
);
2513 * If we are continuing a resync that is not an
2514 * OPTIMIZED one, then we start from the beginning when
2515 * doing this optimized resync
2517 if (RS_TYPE(old_rs_type
) != MD_RS_OPTIMIZED
) {
2518 un
->un_rs_resync_done
= 0;
2519 un
->un_rs_resync_2_do
= 0;
2520 un
->un_resync_startbl
= 0;
2522 optimized_resync(un
);
2523 /* Check to see if we've been requested to block/kill */
2524 if (resync_kill_pending(un
, ui
, MD_WRITER_HELD
)) {
2527 un
= (mm_unit_t
*)MD_UNIT(mnum
);
2529 * If another node has moved the resync on, we must
2530 * restart the correct resync
2533 (RS_TYPE(un
->un_rs_type
) != MD_RS_NONE
)) {
2534 old_rs_type
= un
->un_rs_type
;
2535 old_rs_done
= un
->un_rs_resync_done
;
2536 old_rs_2_do
= un
->un_rs_resync_2_do
;
2537 old_rs_startbl
= un
->un_resync_startbl
;
2541 * Restore previous resync progress or move onto a
2544 if (RS_TYPE(old_rs_type
) != MD_RS_NONE
) {
2545 un
->un_rs_type
= old_rs_type
;
2546 un
->un_rs_resync_done
= old_rs_done
;
2547 un
->un_rs_resync_2_do
= old_rs_2_do
;
2548 un
->un_resync_startbl
= old_rs_startbl
;
2550 un
->un_rs_type
= MD_RS_COMPONENT
;
2551 un
->un_rs_resync_done
= 0;
2552 un
->un_rs_resync_2_do
= 0;
2553 un
->un_resync_startbl
= 0;
2556 if (RS_TYPE(un
->un_rs_type
) == MD_RS_COMPONENT
) {
2557 component_resync(un
);
2558 /* Check to see if we've been requested to block/kill */
2559 if (resync_kill_pending(un
, ui
, MD_WRITER_HELD
)) {
2562 un
= (mm_unit_t
*)MD_UNIT(mnum
);
2564 * If we have moved on from a component resync, another
2565 * node must have completed it and started a submirror
2566 * resync, so leave the resync state alone. For non
2567 * multi-node sets we move onto the submirror resync.
2570 if (RS_TYPE(un
->un_rs_type
) == MD_RS_NONE
) {
2571 un
->un_rs_type
= MD_RS_SUBMIRROR
;
2572 un
->un_rs_resync_done
=
2573 un
->un_rs_resync_2_do
= 0;
2574 un
->un_resync_startbl
= 0;
2577 un
->un_rs_type
= MD_RS_SUBMIRROR
;
2578 un
->un_rs_resync_done
= 0;
2579 un
->un_rs_resync_2_do
= 0;
2580 un
->un_resync_startbl
= 0;
2583 if (RS_TYPE(un
->un_rs_type
) == MD_RS_SUBMIRROR
) {
2584 submirror_resync(un
);
2585 /* Check to see if we've been requested to block/kill */
2586 if (resync_kill_pending(un
, ui
, MD_WRITER_HELD
)) {
2589 un
= (mm_unit_t
*)MD_UNIT(mnum
);
2591 * If we have moved on from a submirror resync, another
2592 * node must have completed it and started a different
2593 * resync, so leave the resync state alone
2596 if (RS_TYPE(un
->un_rs_type
) == MD_RS_NONE
) {
2597 un
->un_rs_resync_done
=
2598 un
->un_rs_resync_2_do
= 0;
2599 un
->un_resync_startbl
= 0;
2602 /* If non-MN mirror, reinitialize state */
2603 un
->un_rs_type
= MD_RS_NONE
;
2604 un
->un_rs_resync_done
= 0;
2605 un
->un_rs_resync_2_do
= 0;
2606 un
->un_resync_startbl
= 0;
2609 } while (un
->un_rs_dropped_lock
);
2610 mutex_enter(&un
->un_rs_thread_mx
);
2611 un
->un_rs_thread_flags
|= MD_RI_SHUTDOWN
;
2612 mutex_exit(&un
->un_rs_thread_mx
);
2617 if (mirror_debug_flag
)
2618 printf("Resync stopped (mnum = %x), resync_finish = %d\n",
2619 mnum
, resync_finish
);
2621 kmem_free(un
->un_rs_buffer
, dbtob(un
->un_rs_copysize
));
2623 mutex_enter(&un
->un_rs_progress_mx
);
2624 un
->un_rs_progress_flags
|= MD_RI_KILL
;
2625 cv_signal(&un
->un_rs_progress_cv
);
2626 mutex_exit(&un
->un_rs_progress_mx
);
2629 * For MN Set, send a RESYNC_FINISH if this node completed the resync.
2630 * There is no need to grow unit here, it will be done in the
2631 * handler for the RESYNC_FINISH message together with resetting
2632 * MD_UN_RESYNC_ACTIVE.
2635 if (resync_finish
) {
2637 * Normal resync completion. Issue a RESYNC_FINISH
2638 * message if we're part of a multi-node set.
2640 md_mn_kresult_t
*kres
;
2641 md_mn_msg_resync_t
*rmsg
;
2644 rmsg
= (md_mn_msg_resync_t
*)un
->un_rs_msg
;
2645 md_unit_writerexit(ui
);
2647 rmsg
->msg_resync_mnum
= mnum
;
2648 rmsg
->msg_resync_type
= 0;
2649 rmsg
->msg_resync_done
= 0;
2650 rmsg
->msg_resync_2_do
= 0;
2651 rmsg
->msg_originator
= md_mn_mynode_id
;
2653 kres
= kmem_alloc(sizeof (md_mn_kresult_t
), KM_SLEEP
);
2655 mutex_enter(&un
->un_rs_cpr_mx
);
2656 CALLB_CPR_SAFE_BEGIN(&un
->un_rs_cprinfo
);
2658 rval
= mdmn_ksend_message(setno
,
2659 MD_MN_MSG_RESYNC_FINISH
, MD_MSGF_NO_LOG
, 0,
2660 (char *)rmsg
, sizeof (md_mn_msg_resync_t
), kres
);
2662 CALLB_CPR_SAFE_END(&un
->un_rs_cprinfo
,
2664 mutex_exit(&un
->un_rs_cpr_mx
);
2666 if (!MDMN_KSEND_MSG_OK(rval
, kres
)) {
2667 mdmn_ksend_show_error(rval
, kres
,
2669 /* If we're shutting down, pause things here. */
2670 if (kres
->kmmr_comm_state
== MDMNE_RPC_FAIL
) {
2671 while (!md_mn_is_commd_present()) {
2676 "ksend_message failure: RESYNC_FINISH");
2678 kmem_free(kres
, sizeof (md_mn_kresult_t
));
2679 (void) md_unit_writerlock(ui
);
2682 * If the resync has been cancelled, clear flags, reset owner
2683 * for ABR mirror and release the resync region parent
2686 if (un
->c
.un_status
& MD_UN_RESYNC_CANCEL
) {
2689 if (ui
->ui_tstate
& MD_ABR_CAP
) {
2690 /* Resync finished, if ABR set owner to NULL */
2691 mutex_enter(&un
->un_owner_mx
);
2692 un
->un_mirror_owner
= 0;
2693 mutex_exit(&un
->un_owner_mx
);
2696 un
->c
.un_status
&= ~(MD_UN_RESYNC_CANCEL
|
2697 MD_UN_RESYNC_ACTIVE
);
2698 ps
= un
->un_rs_prev_overlap
;
2700 /* Remove previous overlap resync region */
2701 if (ps
->ps_flags
& MD_MPS_ON_OVERLAP
)
2702 mirror_overlap_tree_remove(ps
);
2704 * Release the overlap range reference
2706 un
->un_rs_prev_overlap
= NULL
;
2707 kmem_cache_free(mirror_parent_cache
,
2713 * Release resync message buffer. This will be reallocated on
2714 * the next invocation of the resync_unit thread.
2716 if (un
->un_rs_msg
) {
2717 kmem_free(un
->un_rs_msg
, sizeof (md_mn_msg_resync_t
));
2718 un
->un_rs_msg
= NULL
;
2721 /* For non-MN sets deal with any pending grows */
2722 un
->c
.un_status
&= ~MD_UN_RESYNC_ACTIVE
;
2723 if (un
->c
.un_status
& MD_UN_GROW_PENDING
) {
2724 if ((mirror_grow_unit(un
, &mde
) != 0) ||
2725 (! mdismderror(&mde
, MDE_GROW_DELAYED
))) {
2726 un
->c
.un_status
&= ~MD_UN_GROW_PENDING
;
2731 reset_comp_flags(un
);
2732 un
->un_resync_completed
= 0;
2733 mirror_commit(un
, NO_SUBMIRRORS
, 0);
2734 md_unit_writerexit(ui
);
2737 * Stop the resync progress thread.
2739 if (un
->un_rs_resync_to_id
!= 0) {
2740 (void) untimeout(un
->un_rs_resync_to_id
);
2741 un
->un_rs_resync_to_id
= 0;
2745 * Calling mirror_internal_close() makes further reference to un / ui
2746 * dangerous. If we are the only consumer of the mirror it is possible
2747 * for a metaclear to be processed after completion of the m_i_c()
2748 * routine. As we need to handle the case where another resync has been
2749 * scheduled for the mirror, we raise the open count on the device
2750 * which protects against the close / metaclear / lock => panic scenario
2752 (void) md_unit_incopen(MD_SID(un
), FREAD
|FWRITE
, OTYP_LYR
);
2753 (void) mirror_internal_close(MD_SID(un
), OTYP_LYR
, 0, (IOLOCK
*)NULL
);
2756 * deccrement the mirror resync count
2758 mutex_enter(&md_cpr_resync
.md_resync_mutex
);
2759 md_cpr_resync
.md_mirror_resync
--;
2760 mutex_exit(&md_cpr_resync
.md_resync_mutex
);
2763 * Remove the thread reference as we're about to exit. This allows a
2764 * subsequent mirror_resync_unit() to start a new thread.
2765 * If RESYNC_ACTIVE is set, mirror_resync_unit() must have been
2766 * called to start a new resync, so reopen the mirror and go back to
2769 (void) md_unit_writerlock(ui
);
2770 mutex_enter(&un
->un_rs_thread_mx
);
2771 un
->un_rs_thread_flags
&= ~(MD_RI_KILL
|MD_RI_SHUTDOWN
);
2772 mutex_exit(&un
->un_rs_thread_mx
);
2773 if (un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
) {
2774 md_unit_writerexit(ui
);
2775 if (mirror_internal_open(MD_SID(un
), (FREAD
|FWRITE
),
2776 OTYP_LYR
, 0, (IOLOCK
*)NULL
) == 0) {
2777 /* Release the reference grabbed above */
2778 (void) mirror_internal_close(MD_SID(un
), OTYP_LYR
, 0,
2780 goto resync_restart
;
2782 (void) md_unit_writerlock(ui
);
2784 "Could not open metadevice (%x) for resync\n",
2787 un
->un_rs_thread
= NULL
;
2788 md_unit_writerexit(ui
);
2791 * Check for hotspares once we've cleared the resync thread reference.
2792 * If there are any errored units a poke_hotspares() will result in
2793 * a call to mirror_resync_unit() which we need to allow to start.
2795 (void) poke_hotspares();
2798 * Remove this thread from the CPR callback table.
2801 mutex_enter(&un
->un_rs_cpr_mx
);
2802 CALLB_CPR_EXIT(&un
->un_rs_cprinfo
);
2806 * Remove the extra reference to the unit we generated above. After
2807 * this call it is *unsafe* to reference either ui or un as they may
2808 * no longer be allocated.
2810 (void) mirror_internal_close(MD_SID(un
), OTYP_LYR
, 0, (IOLOCK
*)NULL
);
2816 * mirror_resync_unit:
2817 * ------------------
2818 * Start a resync for the given mirror metadevice. Save the resync thread ID in
2819 * un->un_rs_thread for later manipulation.
2829 md_resync_ioctl_t
*ri
,
2836 set_t setno
= MD_MIN2SET(mnum
);
2838 ui
= MDI_UNIT(mnum
);
2840 if (md_get_setstatus(setno
) & MD_SET_STALE
)
2841 return (mdmddberror(ep
, MDE_DB_STALE
, mnum
, setno
));
2843 if (mirror_internal_open(mnum
, (FREAD
|FWRITE
), OTYP_LYR
, 0, lockp
)) {
2844 return (mdmderror(ep
, MDE_MIRROR_OPEN_FAILURE
, mnum
));
2847 un
= (mm_unit_t
*)md_ioctl_writerlock(lockp
, ui
);
2849 un
= (mm_unit_t
*)md_unit_writerlock(ui
);
2853 * Check to see if we're attempting to start a resync while one is
2856 if (un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
||
2857 un
->un_rs_thread
!= NULL
) {
2859 * Ensure RESYNC_ACTIVE set, it may not be if the resync thread
2860 * is in the process of terminating, setting the flag will
2861 * cause the resync thread to return to the beginning
2863 un
->c
.un_status
|= MD_UN_RESYNC_ACTIVE
;
2865 md_ioctl_writerexit(lockp
);
2867 md_unit_writerexit(ui
);
2869 (void) mirror_internal_close(mnum
, OTYP_LYR
, 0, lockp
);
2872 un
->c
.un_status
|= MD_UN_RESYNC_ACTIVE
;
2873 un
->c
.un_status
&= ~MD_UN_RESYNC_CANCEL
;
2874 if ((ri
) && (ri
->ri_copysize
> 0) &&
2875 (ri
->ri_copysize
<= md_max_xfer_bufsz
))
2876 un
->un_rs_copysize
= ri
->ri_copysize
;
2878 un
->un_rs_copysize
= 0;
2880 /* Start the resync progress thread off */
2881 un
->un_rs_progress_flags
= 0;
2882 (void) thread_create(NULL
, 0, resync_progress_thread
,
2883 (caddr_t
)(uintptr_t)mnum
, 0, &p0
, TS_RUN
, minclsyspri
);
2886 * We have to store the thread ID in the unit structure so do not
2887 * drop writerlock until the thread is active. This means resync_unit
2888 * may spin on its first md_unit_readerlock(), but deadlock won't occur.
2890 mutex_enter(&un
->un_rs_thread_mx
);
2891 un
->un_rs_thread_flags
&= ~(MD_RI_KILL
|MD_RI_SHUTDOWN
);
2892 mutex_exit(&un
->un_rs_thread_mx
);
2893 un
->un_rs_thread
= thread_create(NULL
, 0, resync_unit
,
2894 (caddr_t
)(uintptr_t)mnum
, 0, &p0
, TS_RUN
, 60);
2895 if (un
->un_rs_thread
== (kthread_id_t
)NULL
) {
2896 un
->c
.un_status
&= ~MD_UN_RESYNC_ACTIVE
;
2898 md_ioctl_writerexit(lockp
);
2900 md_unit_writerexit(ui
);
2902 (void) mirror_internal_close(mnum
, OTYP_LYR
, 0, lockp
);
2903 return (mdmderror(ep
, MDE_MIRROR_THREAD_FAILURE
, mnum
));
2906 md_ioctl_writerexit(lockp
);
2908 md_unit_writerexit(ui
);
2916 * mirror_ioctl_resync:
2917 * -------------------
2918 * Called as a result of an MD_IOCSETSYNC ioctl. Either start, block, unblock
2919 * or kill the resync thread associated with the specified unit.
2920 * Can return with locks held since mdioctl will free any locks
2921 * that are marked in lock->l_flags.
2928 mirror_ioctl_resync(
2929 md_resync_ioctl_t
*ri
,
2933 minor_t mnum
= ri
->ri_mnum
;
2937 mm_submirror_ic_t
*smic
;
2940 set_t setno
= MD_MIN2SET(mnum
);
2942 mdclrerror(&ri
->mde
);
2944 if ((setno
>= md_nsets
) ||
2945 (MD_MIN2UNIT(mnum
) >= md_nunits
)) {
2946 return (mdmderror(&ri
->mde
, MDE_INVAL_UNIT
, mnum
));
2949 /* RD_LOCK flag grabs the md_ioctl_readerlock */
2950 un
= mirror_getun(mnum
, &ri
->mde
, RD_LOCK
, lock
);
2953 return (mdmderror(&ri
->mde
, MDE_UNIT_NOT_SETUP
, mnum
));
2955 if (un
->c
.un_type
!= MD_METAMIRROR
) {
2956 return (mdmderror(&ri
->mde
, MDE_NOT_MM
, mnum
));
2958 if (un
->un_nsm
< 2) {
2963 * Determine the action to take based on the ri_flags field:
2964 * MD_RI_BLOCK: Block current resync thread
2965 * MD_RI_UNBLOCK: Unblock resync thread
2966 * MD_RI_KILL: Abort resync thread
2967 * MD_RI_RESYNC_FORCE_MNSTART: Directly start resync thread
2968 * without using rpc.mdcommd messages.
2969 * any other: Start resync thread
2971 switch (ri
->ri_flags
& (MD_RI_BLOCK
|MD_RI_UNBLOCK
|MD_RI_KILL
)) {
2974 /* Halt resync thread by setting flag in un_rs_flags */
2975 if (!(un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
)) {
2978 mutex_enter(&un
->un_rs_thread_mx
);
2979 un
->un_rs_thread_flags
|= MD_RI_BLOCK
;
2980 mutex_exit(&un
->un_rs_thread_mx
);
2985 * Restart resync thread by clearing flag in un_rs_flags and
2986 * cv_signal'ing the blocked thread.
2988 if (!(un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
)) {
2991 mutex_enter(&un
->un_rs_thread_mx
);
2992 un
->un_rs_thread_flags
&= ~MD_RI_BLOCK
;
2993 cv_signal(&un
->un_rs_thread_cv
);
2994 mutex_exit(&un
->un_rs_thread_mx
);
2998 /* Abort resync thread. */
2999 if (!(un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
)) {
3002 mutex_enter(&un
->un_rs_thread_mx
);
3003 tid
= un
->un_rs_thread
? (un
->un_rs_thread
)->t_did
: 0;
3004 un
->un_rs_thread_flags
&= ~(MD_RI_BLOCK
|MD_RI_BLOCK_OWNER
);
3005 un
->un_rs_thread_flags
|= MD_RI_KILL
;
3006 cv_signal(&un
->un_rs_thread_cv
);
3007 mutex_exit(&un
->un_rs_thread_mx
);
3009 if (!(ri
->ri_flags
& MD_RI_NO_WAIT
)) {
3010 md_ioctl_readerexit(lock
);
3012 un
->un_rs_thread_flags
&= ~MD_RI_KILL
;
3013 un
->un_rs_thread
= NULL
;
3014 cmn_err(CE_WARN
, "md: %s: Resync cancelled\n",
3015 md_shortname(MD_SID(un
)));
3021 md_ioctl_readerexit(lock
);
3024 for (smi
= 0; smi
< NMIRROR
; smi
++) {
3025 sm
= &un
->un_sm
[smi
];
3026 smic
= &un
->un_smic
[smi
];
3027 if (!SMS_IS(sm
, SMS_ATTACHED
))
3029 mirror_set_sm_state(sm
, smic
, SMS_ATTACHED_RESYNC
, 1);
3030 bits
|= SMI2BIT(smi
);
3033 mirror_commit(un
, bits
, 0);
3036 * If we are resyncing a mirror in a MN set and the rpc.mdcommd
3037 * can be used, we do not start the resync at this point.
3038 * Instead, the metasync command that issued the ioctl
3039 * will send a RESYNC_STARTING message to start the resync thread. The
3040 * reason we do it this way is to ensure that the metasync ioctl is
3041 * executed on all nodes before the resync thread is started.
3043 * If a MN set and the MD_RI_RESYNC_FORCE_MNSTART flag is set, then
3044 * don't use rpc.mdcommd, but just start the resync thread. This
3045 * flag is set on a node when it is being added to a diskset
3046 * so that the resync threads are started on the newly added node.
3048 if ((!(MD_MNSET_SETNO(setno
))) ||
3049 (ri
->ri_flags
& MD_RI_RESYNC_FORCE_MNSTART
)) {
3050 return (mirror_resync_unit(mnum
, ri
, &ri
->mde
, lock
));
3057 mirror_mark_resync_region_non_owner(struct mm_unit
*un
,
3058 diskaddr_t startblk
, diskaddr_t endblk
, md_mn_nodeid_t source_node
)
3064 md_mn_msg_rr_dirty_t
*rr
;
3065 md_mn_kresult_t
*kres
;
3066 set_t setno
= MD_UN2SET(un
);
3068 md_mn_nodeid_t node_idx
= source_node
- 1;
3069 mdi_unit_t
*ui
= MDI_UNIT(MD_SID(un
));
3070 md_mn_nodeid_t owner_node
;
3071 minor_t mnum
= MD_SID(un
);
3077 * Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
3078 * not, allocate it and then fill the [start..end] entries.
3079 * Update un_pernode_dirty_sum if we've gone 0->1.
3080 * Update un_dirty_bm if the corresponding entries are clear.
3082 rw_enter(&un
->un_pernode_dirty_mx
[node_idx
], RW_WRITER
);
3083 if (un
->un_pernode_dirty_bm
[node_idx
] == NULL
) {
3084 un
->un_pernode_dirty_bm
[node_idx
] =
3085 (uchar_t
*)kmem_zalloc(
3086 (uint_t
)howmany(un
->un_rrd_num
, NBBY
), KM_SLEEP
);
3088 rw_exit(&un
->un_pernode_dirty_mx
[node_idx
]);
3090 BLK_TO_RR(end_rr
, endblk
, un
);
3091 BLK_TO_RR(start_rr
, startblk
, un
);
3095 mutex_enter(&un
->un_resync_mx
);
3096 rw_enter(&un
->un_pernode_dirty_mx
[node_idx
], RW_READER
);
3097 for (current_rr
= start_rr
; current_rr
<= end_rr
; current_rr
++) {
3098 un
->un_outstanding_writes
[current_rr
]++;
3099 if (!IS_PERNODE_DIRTY(source_node
, current_rr
, un
)) {
3100 un
->un_pernode_dirty_sum
[current_rr
]++;
3101 SET_PERNODE_DIRTY(source_node
, current_rr
, un
);
3103 CLR_GOING_CLEAN(current_rr
, un
);
3104 if (!IS_REGION_DIRTY(current_rr
, un
)) {
3106 SET_REGION_DIRTY(current_rr
, un
);
3107 SET_GOING_DIRTY(current_rr
, un
);
3108 } else if (IS_GOING_DIRTY(current_rr
, un
))
3111 rw_exit(&un
->un_pernode_dirty_mx
[node_idx
]);
3112 mutex_exit(&un
->un_resync_mx
);
3119 * If we have dirty regions to commit, send a
3120 * message to the owning node so that the
3121 * in-core bitmap gets updated appropriately.
3122 * TODO: make this a kmem_cache pool to improve
3123 * alloc/free performance ???
3125 kres
= (md_mn_kresult_t
*)kmem_zalloc(sizeof (md_mn_kresult_t
),
3127 rr
= (md_mn_msg_rr_dirty_t
*)kmem_alloc(sizeof (md_mn_msg_rr_dirty_t
),
3131 owner_node
= un
->un_mirror_owner
;
3134 rr
->rr_nodeid
= md_mn_mynode_id
;
3135 rr
->rr_range
= (ushort_t
)start_rr
<< 16;
3136 rr
->rr_range
|= (ushort_t
)end_rr
& 0xFFFF;
3138 /* release readerlock before sending message */
3139 md_unit_readerexit(ui
);
3141 rval
= mdmn_ksend_message(setno
, MD_MN_MSG_RR_DIRTY
,
3142 MD_MSGF_NO_LOG
|MD_MSGF_BLK_SIGNAL
|MD_MSGF_DIRECTED
,
3143 un
->un_mirror_owner
, (char *)rr
,
3144 sizeof (md_mn_msg_rr_dirty_t
), kres
);
3146 /* reaquire readerlock on message completion */
3147 (void) md_unit_readerlock(ui
);
3149 /* if the message send failed, note it, and pass an error back up */
3150 if (!MDMN_KSEND_MSG_OK(rval
, kres
)) {
3151 /* if commd is gone, no point in printing a message */
3152 if (md_mn_is_commd_present())
3153 mdmn_ksend_show_error(rval
, kres
, "RR_DIRTY");
3154 kmem_free(kres
, sizeof (md_mn_kresult_t
));
3155 kmem_free(rr
, sizeof (md_mn_msg_rr_dirty_t
));
3160 * if the owner changed while we were sending the message, and it's
3161 * not us, the new mirror owner won't yet have done the right thing
3162 * with our data. Let him know. If we became the owner, we'll
3163 * deal with that differently below. Note that receiving a message
3164 * about another node twice won't hurt anything.
3166 if (un
->un_mirror_owner
!= owner_node
&& !MD_MN_MIRROR_OWNER(un
))
3169 kmem_free(kres
, sizeof (md_mn_kresult_t
));
3170 kmem_free(rr
, sizeof (md_mn_msg_rr_dirty_t
));
3172 mutex_enter(&un
->un_resync_mx
);
3175 * If we became the owner changed while we were sending the message,
3176 * we have dirty bits in the un_pernode_bm that aren't yet reflected
3177 * in the un_dirty_bm, as it was re-read from disk, and our bits
3178 * are also not reflected in the on-disk DRL. Fix that now.
3180 if (MD_MN_MIRROR_OWNER(un
)) {
3181 rw_enter(&un
->un_pernode_dirty_mx
[node_idx
], RW_WRITER
);
3182 mirror_copy_rr(howmany(un
->un_rrd_num
, NBBY
),
3183 un
->un_pernode_dirty_bm
[node_idx
], un
->un_dirty_bm
);
3184 rw_exit(&un
->un_pernode_dirty_mx
[node_idx
]);
3186 un
->un_resync_flg
|= MM_RF_COMMITING
| MM_RF_GATECLOSED
;
3188 mutex_exit(&un
->un_resync_mx
);
3189 mddb_commitrec_wrapper(un
->un_rr_dirty_recid
);
3190 mutex_enter(&un
->un_resync_mx
);
3192 un
->un_resync_flg
&= ~(MM_RF_COMMITING
| MM_RF_GATECLOSED
);
3193 cv_broadcast(&un
->un_resync_cv
);
3196 for (current_rr
= start_rr
; current_rr
<= end_rr
; current_rr
++)
3197 CLR_GOING_DIRTY(current_rr
, un
);
3199 mutex_exit(&un
->un_resync_mx
);
3205 mirror_mark_resync_region_owner(struct mm_unit
*un
,
3206 diskaddr_t startblk
, diskaddr_t endblk
, md_mn_nodeid_t source_node
)
3212 int mnset
= MD_MNSET_SETNO(MD_UN2SET(un
));
3213 md_mn_nodeid_t node_idx
= source_node
- 1;
3219 * Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
3220 * not, allocate it and then fill the [start..end] entries.
3221 * Update un_pernode_dirty_sum if we've gone 0->1.
3222 * Update un_dirty_bm if the corresponding entries are clear.
3225 rw_enter(&un
->un_pernode_dirty_mx
[node_idx
], RW_WRITER
);
3226 if (un
->un_pernode_dirty_bm
[node_idx
] == NULL
) {
3227 un
->un_pernode_dirty_bm
[node_idx
] =
3228 (uchar_t
*)kmem_zalloc(
3229 (uint_t
)howmany(un
->un_rrd_num
, NBBY
), KM_SLEEP
);
3231 rw_exit(&un
->un_pernode_dirty_mx
[node_idx
]);
3234 mutex_enter(&un
->un_resync_mx
);
3237 rw_enter(&un
->un_pernode_dirty_mx
[node_idx
], RW_READER
);
3240 BLK_TO_RR(end_rr
, endblk
, un
);
3241 BLK_TO_RR(start_rr
, startblk
, un
);
3242 for (current_rr
= start_rr
; current_rr
<= end_rr
; current_rr
++) {
3243 if (!mnset
|| source_node
== md_mn_mynode_id
)
3244 un
->un_outstanding_writes
[current_rr
]++;
3246 if (!IS_PERNODE_DIRTY(source_node
, current_rr
, un
))
3247 un
->un_pernode_dirty_sum
[current_rr
]++;
3248 SET_PERNODE_DIRTY(source_node
, current_rr
, un
);
3250 CLR_GOING_CLEAN(current_rr
, un
);
3251 if (!IS_REGION_DIRTY(current_rr
, un
))
3253 if (IS_GOING_DIRTY(current_rr
, un
))
3258 rw_exit(&un
->un_pernode_dirty_mx
[node_idx
]);
3261 mutex_exit(&un
->un_resync_mx
);
3264 un
->un_waiting_to_mark
++;
3265 while (un
->un_resync_flg
& MM_RF_GATECLOSED
) {
3268 cv_wait(&un
->un_resync_cv
, &un
->un_resync_mx
);
3270 un
->un_waiting_to_mark
--;
3273 for (current_rr
= start_rr
; current_rr
<= end_rr
; current_rr
++) {
3274 if (!IS_REGION_DIRTY(current_rr
, un
)) {
3275 SET_REGION_DIRTY(current_rr
, un
);
3276 SET_GOING_DIRTY(current_rr
, un
);
3279 if (IS_GOING_DIRTY(current_rr
, un
))
3284 if (un
->un_waiting_to_mark
== 0 || un
->un_waiting_to_clear
!= 0)
3285 cv_broadcast(&un
->un_resync_cv
);
3286 mutex_exit(&un
->un_resync_mx
);
3290 un
->un_resync_flg
|= MM_RF_COMMIT_NEEDED
;
3291 un
->un_waiting_to_commit
++;
3292 while (un
->un_waiting_to_mark
!= 0 &&
3293 !(un
->un_resync_flg
& MM_RF_GATECLOSED
)) {
3296 cv_wait(&un
->un_resync_cv
, &un
->un_resync_mx
);
3299 if (un
->un_resync_flg
& MM_RF_COMMIT_NEEDED
) {
3300 un
->un_resync_flg
|= MM_RF_COMMITING
| MM_RF_GATECLOSED
;
3301 un
->un_resync_flg
&= ~MM_RF_COMMIT_NEEDED
;
3303 mutex_exit(&un
->un_resync_mx
);
3304 mddb_commitrec_wrapper(un
->un_rr_dirty_recid
);
3305 mutex_enter(&un
->un_resync_mx
);
3307 un
->un_resync_flg
&= ~MM_RF_COMMITING
;
3308 cv_broadcast(&un
->un_resync_cv
);
3310 while (un
->un_resync_flg
& MM_RF_COMMITING
) {
3313 cv_wait(&un
->un_resync_cv
, &un
->un_resync_mx
);
3316 for (current_rr
= start_rr
; current_rr
<= end_rr
; current_rr
++)
3317 CLR_GOING_DIRTY(current_rr
, un
);
3319 if (--un
->un_waiting_to_commit
== 0) {
3320 un
->un_resync_flg
&= ~MM_RF_GATECLOSED
;
3321 cv_broadcast(&un
->un_resync_cv
);
3323 mutex_exit(&un
->un_resync_mx
);
3329 mirror_mark_resync_region(struct mm_unit
*un
,
3330 diskaddr_t startblk
, diskaddr_t endblk
, md_mn_nodeid_t source_node
)
3332 int mnset
= MD_MNSET_SETNO(MD_UN2SET(un
));
3334 if (mnset
&& !MD_MN_MIRROR_OWNER(un
)) {
3335 return (mirror_mark_resync_region_non_owner(un
, startblk
,
3336 endblk
, source_node
));
3338 return (mirror_mark_resync_region_owner(un
, startblk
, endblk
,
3344 mirror_resize_resync_regions(mm_unit_t
*un
, diskaddr_t new_tb
)
3347 optim_resync_t
*orp
;
3349 uint_t old_nregions
, new_nregions
;
3350 int old_bm_size
, new_bm_size
;
3352 mddb_recid_t recid
, old_recid
;
3353 uchar_t
*old_dirty_bm
;
3356 set_t setno
= MD_UN2SET(un
);
3359 old_nregions
= un
->un_rrd_num
;
3360 new_nregions
= (uint_t
)((new_tb
/un
->un_rrd_blksize
) + 1);
3362 while (new_nregions
> MD_MAX_NUM_RR
) {
3367 new_bm_size
= howmany(new_nregions
, NBBY
);
3368 old_bm_size
= howmany(old_nregions
, NBBY
);
3370 size
= new_bm_size
+ sizeof (*orp
) - sizeof (orp
->or_rr
);
3372 typ1
= (mddb_type_t
)md_getshared_key(setno
,
3373 mirror_md_ops
.md_driver
.md_drivername
);
3374 recid
= mddb_createrec(size
, typ1
, RESYNC_REC
,
3375 MD_CRO_OPTIMIZE
|MD_CRO_32BIT
, setno
);
3379 orp
= (struct optim_resync
*)mddb_getrecaddr(recid
);
3380 ASSERT(orp
!= NULL
);
3382 orp
->or_magic
= OR_MAGIC
; /* Magic # */
3383 orp
->or_blksize
= un
->un_rrd_blksize
; /* Same block size */
3384 orp
->or_num
= new_nregions
; /* New number of regions */
3386 old_dirty_bm
= un
->un_dirty_bm
;
3387 un
->un_dirty_bm
= orp
->or_rr
;
3389 kmem_free((caddr_t
)un
->un_goingdirty_bm
, old_bm_size
);
3390 un
->un_goingdirty_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3392 kmem_free((caddr_t
)un
->un_goingclean_bm
, old_bm_size
);
3393 un
->un_goingclean_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3395 kmem_free((caddr_t
)un
->un_resync_bm
, old_bm_size
);
3396 un
->un_resync_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3398 owp
= un
->un_outstanding_writes
;
3399 un
->un_outstanding_writes
= (short *)kmem_zalloc(
3400 new_nregions
* sizeof (short), KM_SLEEP
);
3402 old_pns
= un
->un_pernode_dirty_sum
;
3404 un
->un_pernode_dirty_sum
= (uchar_t
*)kmem_zalloc(new_nregions
,
3408 * Now translate the old records into the new
3411 for (i
= 0; i
< old_nregions
; i
++) {
3413 * only bring forward the
3414 * outstanding write counters and the dirty bits and also
3415 * the pernode_summary counts
3417 if (!isset(old_dirty_bm
, i
))
3420 setbit(un
->un_dirty_bm
, (i
/ rr_mult
));
3421 un
->un_outstanding_writes
[(i
/ rr_mult
)] += owp
[i
];
3423 un
->un_pernode_dirty_sum
[(i
/ rr_mult
)] += old_pns
[i
];
3425 kmem_free((caddr_t
)owp
, old_nregions
* sizeof (short));
3427 kmem_free((caddr_t
)old_pns
, old_nregions
);
3430 * Copy all non-zero un_pernode_dirty_bm[] arrays to new versions
3432 for (j
= 0; j
< MD_MNMAXSIDES
; j
++) {
3433 rw_enter(&un
->un_pernode_dirty_mx
[j
], RW_WRITER
);
3434 old_dirty_bm
= un
->un_pernode_dirty_bm
[j
];
3436 un
->un_pernode_dirty_bm
[j
] = (uchar_t
*)kmem_zalloc(
3437 new_bm_size
, KM_SLEEP
);
3438 for (i
= 0; i
< old_nregions
; i
++) {
3439 if (!isset(old_dirty_bm
, i
))
3442 setbit(un
->un_pernode_dirty_bm
[j
],
3445 kmem_free((caddr_t
)old_dirty_bm
, old_bm_size
);
3447 rw_exit(&un
->un_pernode_dirty_mx
[j
]);
3450 /* Save the old record id */
3451 old_recid
= un
->un_rr_dirty_recid
;
3453 /* Update the mirror unit struct */
3454 un
->un_rr_dirty_recid
= recid
;
3455 un
->un_rrd_num
= new_nregions
;
3456 un
->un_rrd_blksize
= un
->un_rrd_blksize
* rr_mult
;
3458 orp
->or_blksize
= un
->un_rrd_blksize
;
3461 * NOTE: The reason there are distinct calls to mddb_commitrec_wrapper
3462 * instead of using mddb_commitrecs_wrapper, is that you cannot
3463 * atomically commit optimized records.
3465 mddb_commitrec_wrapper(recid
);
3466 mddb_commitrec_wrapper(un
->c
.un_record_id
);
3467 mddb_deleterec_wrapper(old_recid
);
/* lockp can be NULL for !MN disksets */
3473 mirror_add_resync_regions(mm_unit_t
*un
, diskaddr_t new_tb
)
3477 optim_resync_t
*orp
;
3478 uint_t old_nregions
, new_nregions
;
3479 int old_bm_size
, new_bm_size
;
3481 mddb_recid_t recid
, old_recid
;
3483 set_t setno
= MD_UN2SET(un
);
3486 old_nregions
= un
->un_rrd_num
;
3487 new_nregions
= (uint_t
)((new_tb
/un
->un_rrd_blksize
) + 1);
3489 new_bm_size
= howmany(new_nregions
, NBBY
);
3490 old_bm_size
= howmany(old_nregions
, NBBY
);
3492 size
= new_bm_size
+ sizeof (*orp
) - sizeof (orp
->or_rr
);
3494 typ1
= (mddb_type_t
)md_getshared_key(setno
,
3495 mirror_md_ops
.md_driver
.md_drivername
);
3497 recid
= mddb_createrec(size
, typ1
, RESYNC_REC
,
3498 MD_CRO_OPTIMIZE
|MD_CRO_32BIT
, setno
);
3502 orp
= (struct optim_resync
*)mddb_getrecaddr(recid
);
3503 ASSERT(orp
!= NULL
);
3505 orp
->or_magic
= OR_MAGIC
; /* Magic # */
3506 orp
->or_blksize
= un
->un_rrd_blksize
; /* Same block size */
3507 orp
->or_num
= new_nregions
; /* New number of regions */
3509 /* Copy the old bm over the new bm */
3510 bcopy((caddr_t
)un
->un_dirty_bm
, (caddr_t
)orp
->or_rr
, old_bm_size
);
3513 * Create new bigger incore arrays, copy, and free old ones:
3517 * un_outstanding_writes
3518 * un_pernode_dirty_sum
3519 * un_pernode_dirty_bm[]
3521 old
= un
->un_goingdirty_bm
;
3522 un
->un_goingdirty_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3523 bcopy((caddr_t
)old
, (caddr_t
)un
->un_goingdirty_bm
, old_bm_size
);
3524 kmem_free((caddr_t
)old
, old_bm_size
);
3526 old
= un
->un_goingclean_bm
;
3527 un
->un_goingclean_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3528 bcopy((caddr_t
)old
, (caddr_t
)un
->un_goingclean_bm
, old_bm_size
);
3529 kmem_free((caddr_t
)old
, old_bm_size
);
3531 old
= un
->un_resync_bm
;
3532 un
->un_resync_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3533 bcopy((caddr_t
)old
, (caddr_t
)un
->un_resync_bm
, old_bm_size
);
3534 kmem_free((caddr_t
)old
, old_bm_size
);
3536 owp
= un
->un_outstanding_writes
;
3537 un
->un_outstanding_writes
= (short *)kmem_zalloc(
3538 (uint_t
)new_nregions
* sizeof (short), KM_SLEEP
);
3539 bcopy((caddr_t
)owp
, (caddr_t
)un
->un_outstanding_writes
,
3540 old_nregions
* sizeof (short));
3541 kmem_free((caddr_t
)owp
, (old_nregions
* sizeof (short)));
3543 old
= un
->un_pernode_dirty_sum
;
3545 un
->un_pernode_dirty_sum
= (uchar_t
*)kmem_zalloc(
3546 new_nregions
, KM_SLEEP
);
3547 bcopy((caddr_t
)old
, (caddr_t
)un
->un_pernode_dirty_sum
,
3549 kmem_free((caddr_t
)old
, old_nregions
);
3552 for (i
= 0; i
< MD_MNMAXSIDES
; i
++) {
3553 rw_enter(&un
->un_pernode_dirty_mx
[i
], RW_WRITER
);
3554 old
= un
->un_pernode_dirty_bm
[i
];
3556 un
->un_pernode_dirty_bm
[i
] = (uchar_t
*)kmem_zalloc(
3557 new_bm_size
, KM_SLEEP
);
3558 bcopy((caddr_t
)old
, (caddr_t
)un
->un_pernode_dirty_bm
[i
],
3560 kmem_free((caddr_t
)old
, old_bm_size
);
3562 rw_exit(&un
->un_pernode_dirty_mx
[i
]);
3565 /* Save the old record id */
3566 old_recid
= un
->un_rr_dirty_recid
;
3568 /* Update the mirror unit struct */
3569 un
->un_rr_dirty_recid
= recid
;
3570 un
->un_rrd_num
= new_nregions
;
3571 un
->un_dirty_bm
= orp
->or_rr
;
3574 * NOTE: The reason there are distinct calls to mddb_commitrec_wrapper
3575 * instead of using mddb_commitrecs_wrapper, is that you cannot
3576 * atomically commit optimized records.
3578 mddb_commitrec_wrapper(recid
);
3579 mddb_commitrec_wrapper(un
->c
.un_record_id
);
3580 mddb_deleterec_wrapper(old_recid
);
3587 * Combine the dirty record bitmap with the in-core resync bitmap. This allows
3588 * us to carry a resync over an ownership change.
3591 mirror_copy_rr(int sz
, uchar_t
*src
, uchar_t
*dest
)
3595 for (i
= 0; i
< sz
; i
++)
3600 * mirror_set_dirty_rr:
3601 * -------------------
3602 * Set the pernode_dirty_bm[node] entries and un_dirty_bm[] if appropriate.
3603 * For the owning node (DRL/mirror owner) update the on-disk RR if needed.
3604 * Called on every clean->dirty transition for the originating writer node.
3605 * Note: only the non-owning nodes will initiate this message and it is only
3606 * the owning node that has to process it.
3609 mirror_set_dirty_rr(md_mn_rr_dirty_params_t
*iocp
)
3612 minor_t mnum
= iocp
->rr_mnum
;
3614 int start
= (int)iocp
->rr_start
;
3615 int end
= (int)iocp
->rr_end
;
3616 set_t setno
= MD_MIN2SET(mnum
);
3617 md_mn_nodeid_t orignode
= iocp
->rr_nodeid
; /* 1-based */
3618 diskaddr_t startblk
, endblk
;
3620 mdclrerror(&iocp
->mde
);
3622 if ((setno
>= md_nsets
) ||
3623 (MD_MIN2UNIT(mnum
) >= md_nunits
)) {
3624 return (mdmderror(&iocp
->mde
, MDE_INVAL_UNIT
, mnum
));
3627 /* Must have _NO_ ioctl lock set if we update the RR on-disk */
3628 un
= mirror_getun(mnum
, &iocp
->mde
, NO_LOCK
, NULL
);
3631 return (mdmderror(&iocp
->mde
, MDE_UNIT_NOT_SETUP
, mnum
));
3633 if (un
->c
.un_type
!= MD_METAMIRROR
) {
3634 return (mdmderror(&iocp
->mde
, MDE_NOT_MM
, mnum
));
3636 if (orignode
< 1 || orignode
>= MD_MNMAXSIDES
) {
3637 return (mdmderror(&iocp
->mde
, MDE_INVAL_UNIT
, mnum
));
3639 if (un
->un_nsm
< 2) {
3644 * Only process this message if we're the owner of the mirror.
3646 if (!MD_MN_MIRROR_OWNER(un
)) {
3650 RR_TO_BLK(startblk
, start
, un
);
3651 RR_TO_BLK(endblk
, end
, un
);
3652 return (mirror_mark_resync_region_owner(un
, startblk
, endblk
,
3657 * mirror_clean_rr_bits:
3658 * --------------------
3659 * Clear the pernode_dirty_bm[node] entries which are passed in the bitmap
3660 * Once _all_ references are removed (pernode_dirty_count[x] == 0) this region
3661 * is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all
3662 * nodes. Callable from ioctl / interrupt / whatever context.
3663 * un_resync_mx is held on entry.
3666 mirror_clean_rr_bits(
3667 md_mn_rr_clean_params_t
*iocp
)
3669 minor_t mnum
= iocp
->rr_mnum
;
3671 uint_t cleared_bits
;
3672 md_mn_nodeid_t node
= iocp
->rr_nodeid
- 1;
3673 md_mn_nodeid_t orignode
= iocp
->rr_nodeid
;
3676 un
= mirror_getun(mnum
, &iocp
->mde
, NO_LOCK
, NULL
);
3679 start
= MDMN_RR_CLEAN_PARAMS_START_BIT(iocp
);
3680 end
= start
+ MDMN_RR_CLEAN_PARAMS_DATA_BYTES(iocp
) * NBBY
;
3681 rw_enter(&un
->un_pernode_dirty_mx
[node
], RW_READER
);
3682 for (i
= start
; i
< end
; i
++) {
3683 if (isset(MDMN_RR_CLEAN_PARAMS_DATA(iocp
), i
- start
)) {
3684 if (IS_PERNODE_DIRTY(orignode
, i
, un
)) {
3685 un
->un_pernode_dirty_sum
[i
]--;
3686 CLR_PERNODE_DIRTY(orignode
, i
, un
);
3688 if (un
->un_pernode_dirty_sum
[i
] == 0) {
3690 CLR_REGION_DIRTY(i
, un
);
3691 CLR_GOING_CLEAN(i
, un
);
3695 rw_exit(&un
->un_pernode_dirty_mx
[node
]);
3698 * We can only be called iff we are the mirror owner, however
3699 * as this is a (potentially) decoupled routine the ownership
3700 * may have moved from us by the time we get to execute the
3701 * bit clearing. Hence we still need to check for being the
3702 * owner before flushing the DRL to the replica.
3704 if (MD_MN_MIRROR_OWNER(un
)) {
3705 mutex_exit(&un
->un_resync_mx
);
3706 mddb_commitrec_wrapper(un
->un_rr_dirty_recid
);
3707 mutex_enter(&un
->un_resync_mx
);
3715 * Service routine for clearing the DRL bits on a deferred MD_MN_RR_CLEAN call
3716 * We need to obtain exclusive access to the un_resync_cv and then clear the
3718 * On completion, we must also free the passed in argument as it is allocated
3719 * at the end of the ioctl handler and won't be freed on completion.
3722 mirror_drl_task(void *arg
)
3724 md_mn_rr_clean_params_t
*iocp
= (md_mn_rr_clean_params_t
*)arg
;
3725 minor_t mnum
= iocp
->rr_mnum
;
3728 un
= mirror_getun(mnum
, &iocp
->mde
, NO_LOCK
, NULL
);
3730 mutex_enter(&un
->un_rrp_inflight_mx
);
3731 mutex_enter(&un
->un_resync_mx
);
3732 un
->un_waiting_to_clear
++;
3733 while (un
->un_resync_flg
& MM_RF_STALL_CLEAN
)
3734 cv_wait(&un
->un_resync_cv
, &un
->un_resync_mx
);
3735 un
->un_waiting_to_clear
--;
3737 un
->un_resync_flg
|= MM_RF_GATECLOSED
;
3738 mirror_clean_rr_bits(iocp
);
3739 un
->un_resync_flg
&= ~MM_RF_GATECLOSED
;
3740 if (un
->un_waiting_to_mark
!= 0 || un
->un_waiting_to_clear
!= 0) {
3741 cv_broadcast(&un
->un_resync_cv
);
3743 mutex_exit(&un
->un_resync_mx
);
3744 mutex_exit(&un
->un_rrp_inflight_mx
);
3746 kmem_free((caddr_t
)iocp
, MDMN_RR_CLEAN_PARAMS_SIZE(iocp
));
3750 * mirror_set_clean_rr:
3751 * -------------------
3752 * Clear the pernode_dirty_bm[node] entries which are passed in the bitmap
3753 * Once _all_ references are removed (pernode_dirty_count[x] == 0) this region
3754 * is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all
3757 * Only the mirror-owner need process this message as it is the only RR updater.
3758 * Non-owner nodes issue this request, but as we have no point-to-point message
3759 * support we will receive the message on all nodes.
3762 mirror_set_clean_rr(md_mn_rr_clean_params_t
*iocp
)
3765 minor_t mnum
= iocp
->rr_mnum
;
3767 set_t setno
= MD_MIN2SET(mnum
);
3768 md_mn_nodeid_t node
= iocp
->rr_nodeid
- 1;
3770 md_mn_rr_clean_params_t
*newiocp
;
3773 mdclrerror(&iocp
->mde
);
3775 if ((setno
>= md_nsets
) ||
3776 (MD_MIN2UNIT(mnum
) >= md_nunits
)) {
3777 return (mdmderror(&iocp
->mde
, MDE_INVAL_UNIT
, mnum
));
3780 /* Must have _NO_ ioctl lock set if we update the RR on-disk */
3781 un
= mirror_getun(mnum
, &iocp
->mde
, NO_LOCK
, NULL
);
3784 return (mdmderror(&iocp
->mde
, MDE_UNIT_NOT_SETUP
, mnum
));
3786 if (un
->c
.un_type
!= MD_METAMIRROR
) {
3787 return (mdmderror(&iocp
->mde
, MDE_NOT_MM
, mnum
));
3789 if (un
->un_nsm
< 2) {
3794 * Check to see if we're the mirror owner. If not, there's nothing
3797 if (!MD_MN_MIRROR_OWNER(un
)) {
3802 * Process the to-be-cleaned bitmap. We need to update the pernode_dirty
3803 * bits and pernode_dirty_sum[n], and if, and only if, the sum goes 0
3804 * we can then mark the un_dirty_bm entry as GOINGCLEAN. Alternatively
3805 * we can just defer this cleaning until the next process_resync_regions
3808 rw_enter(&un
->un_pernode_dirty_mx
[node
], RW_WRITER
);
3809 if (un
->un_pernode_dirty_bm
[node
] == NULL
) {
3810 un
->un_pernode_dirty_bm
[node
] = (uchar_t
*)kmem_zalloc(
3811 un
->un_rrd_num
, KM_SLEEP
);
3813 rw_exit(&un
->un_pernode_dirty_mx
[node
]);
3816 * See if we can simply clear the un_dirty_bm[] entries. If we're not
3817 * the issuing node _and_ we aren't in the process of marking/clearing
3818 * the RR bitmaps, we can simply update the bits as needed.
3819 * If we're the owning node and _not_ the issuing node, we should also
3820 * sync the RR if we clear any bits in it.
3822 mutex_enter(&un
->un_resync_mx
);
3823 can_clear
= (un
->un_resync_flg
& MM_RF_STALL_CLEAN
) ? 0 : 1;
3825 un
->un_resync_flg
|= MM_RF_GATECLOSED
;
3826 mirror_clean_rr_bits(iocp
);
3827 un
->un_resync_flg
&= ~MM_RF_GATECLOSED
;
3828 if (un
->un_waiting_to_mark
!= 0 ||
3829 un
->un_waiting_to_clear
!= 0) {
3830 cv_broadcast(&un
->un_resync_cv
);
3833 mutex_exit(&un
->un_resync_mx
);
3836 * If we couldn't clear the bits, due to DRL update from m_m_r_r / p_r_r
3837 * we must schedule a blocking call to update the DRL on this node.
3838 * As we're invoked from an ioctl we are going to have the original data
3839 * disappear (kmem_free) once we return. So, copy the data into a new
3840 * structure and let the taskq routine release it on completion.
3843 size_t sz
= MDMN_RR_CLEAN_PARAMS_SIZE(iocp
);
3845 newiocp
= (md_mn_rr_clean_params_t
*)kmem_alloc(sz
, KM_SLEEP
);
3847 bcopy(iocp
, newiocp
, sz
);
3849 if (ddi_taskq_dispatch(un
->un_drl_task
, mirror_drl_task
,
3850 newiocp
, DDI_NOSLEEP
) != DDI_SUCCESS
) {
3851 kmem_free(newiocp
, sz
);
3852 rval
= ENOMEM
; /* probably starvation */