4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
26 #include <sys/param.h>
27 #include <sys/systm.h>
32 #include <sys/t_lock.h>
38 #include <sys/cmn_err.h>
39 #include <sys/sysmacros.h>
40 #include <sys/types.h>
41 #include <sys/mkdev.h>
45 #include <sys/lvm/md_mirror.h>
46 #include <sys/modctl.h>
48 #include <sys/sunddi.h>
49 #include <sys/debug.h>
50 #include <sys/callb.h>
52 #include <sys/sysevent/eventdefs.h>
53 #include <sys/sysevent/svm.h>
54 #include <sys/lvm/mdmn_commd.h>
57 extern kmutex_t md_status_mx
;
58 extern kmutex_t md_mx
;
60 extern unit_t md_nunits
;
61 extern set_t md_nsets
;
62 extern md_set_t md_set
[];
63 extern major_t md_major
;
65 extern md_ops_t mirror_md_ops
;
66 extern kmem_cache_t
*mirror_child_cache
; /* mirror child memory pool */
67 extern mdq_anchor_t md_mto_daemon
;
68 extern daemon_request_t mirror_timeout
;
69 extern md_resync_t md_cpr_resync
;
71 extern int md_mtioctl_cnt
;
73 extern kmem_cache_t
*mirror_parent_cache
;
75 extern int mirror_debug_flag
;
/*
 * NOTE(review): this span defines the module's resync tunables. The text is a
 * line-mangled extraction (original source line numbers are fused into the
 * content and comment delimiters are missing); code kept byte-identical,
 * only this note added.
 */
79 * Tunable resync thread timeout. This is used as the time interval for updating
80 * the resync progress to the mddb. This allows restartable resyncs to be
81 * continued across a system reboot.
82 * Default is to update the resync progress every 5 minutes.
84 int md_mirror_resync_update_intvl
= MD_DEF_MIRROR_RESYNC_INTVL
;
87 * Settable mirror resync buffer size. Specified in 512 byte
88 * blocks. This is set to MD_DEF_RESYNC_BUF_SIZE by default.
90 int md_resync_bufsz
= MD_DEF_RESYNC_BUF_SIZE
;
93 * Tunables for dirty region processing when
94 * closing down a mirror.
96 * Dirty region processing during close of a
97 * mirror is basically monitoring the state
98 * of the resync region bitmaps and the number
99 * of outstanding i/o's per submirror to
100 * determine that there are no more dirty
103 * The approach taken is a retry logic over
104 * md_mirror_rr_cleans iterations to monitor
107 * There are two methods of polling the progress
108 * on dirty bitmap processing: busy-waits and
111 * Busy-waits are used at the beginning to
112 * determine the final state as quick as
113 * possible; md_mirror_rr_polls defines the
114 * number of busy-waits.
116 * In case the number of busy-waits got exhausted
117 * with dirty regions left over, the retry logic
118 * switches over to non-busy-waits, thus giving
119 * relief to an obviously heavily loaded system.
120 * The timeout value is defined by the tunable
121 * md_mirror_rr_sleep_timo in seconds.
123 * The number of non-busy-waits is given by:
124 * md_mirror_rr_cleans - md_mirror_rr_polls.
126 * The values were found by testing on a
127 * 'typical' system and may require tuning
128 * to meet specific customer's requirements.
131 int md_mirror_rr_cleans
= 13;
132 int md_mirror_rr_polls
= 3;
133 int md_mirror_rr_sleep_timo
= 1;
136 * The value is not #defined because it will be computed
139 int md_max_xfer_bufsz
= 2048;
/*
 * NOTE(review): mirror_generate_rr_bitmap() — builds a compressed
 * "to-be-cleared" region bitmap (md_mn_msg_rr_clean_t) from the mirror's
 * dirty-region state. Visible behavior: scans from un_rr_clean_start_bit,
 * allocates *msgp with kmem_zalloc(), fills nodeid/mnum/start/size, sets
 * bits via setbit() for regions that can go clean, accumulates active i/o
 * counts into *activep, frees *msgp when nothing was cleared, advances
 * un_rr_clean_start_bit and returns cleared_dirty. This text is a
 * line-mangled extraction with interior lines missing (e.g. the
 * retry_dirty_scan label target and several brace/argument lines) —
 * do not treat it as compilable; kept byte-identical, comments only added.
 */
142 * mirror_generate_rr_bitmap:
143 * -------------------
144 * Generate a compressed bitmap md_mn_msg_rr_clean_t for the given clean
145 * bitmap associated with mirror 'un'
148 * un - mirror unit to get bitmap data from
149 * *msgp - location to return newly allocated md_mn_msg_rr_clean_t
150 * *activep- location to return # of active i/os
153 * 1 => dirty bits cleared from un_dirty_bm and DRL flush required
154 * *msgp contains bitmap of to-be-cleared bits
155 * 0 => no bits cleared
159 mirror_generate_rr_bitmap(mm_unit_t
*un
, md_mn_msg_rr_clean_t
**msgp
,
162 unsigned int i
, next_bit
, data_bytes
, start_bit
;
163 int cleared_dirty
= 0;
165 /* Skip any initial 0s. */
167 if ((start_bit
= un
->un_rr_clean_start_bit
) >= un
->un_rrd_num
)
168 un
->un_rr_clean_start_bit
= start_bit
= 0;
171 * Handle case where NO bits are set in PERNODE_DIRTY but the
172 * un_dirty_bm[] map does have entries set (after a 1st resync)
174 for (; start_bit
< un
->un_rrd_num
&&
175 !IS_PERNODE_DIRTY(md_mn_mynode_id
, start_bit
, un
) &&
176 (un
->un_pernode_dirty_sum
[start_bit
] != (uchar_t
)0); start_bit
++)
179 if (start_bit
>= un
->un_rrd_num
) {
180 if (un
->un_rr_clean_start_bit
== 0) {
183 un
->un_rr_clean_start_bit
= 0;
184 goto retry_dirty_scan
;
/* NOTE(review): the retry_dirty_scan label itself is not visible in this
 * extraction — presumably it precedes the scan above; confirm in original. */
188 /* how much to fit into this message */
189 data_bytes
= MIN(howmany(un
->un_rrd_num
- start_bit
, NBBY
),
190 MDMN_MSG_RR_CLEAN_DATA_MAX_BYTES
);
192 (*msgp
) = kmem_zalloc(MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes
),
195 (*msgp
)->rr_nodeid
= md_mn_mynode_id
;
196 (*msgp
)->rr_mnum
= MD_SID(un
);
197 MDMN_MSG_RR_CLEAN_START_SIZE_SET(*msgp
, start_bit
, data_bytes
);
199 next_bit
= MIN(start_bit
+ data_bytes
* NBBY
, un
->un_rrd_num
);
201 for (i
= start_bit
; i
< next_bit
; i
++) {
202 if (un
->c
.un_status
& MD_UN_KEEP_DIRTY
&& IS_KEEPDIRTY(i
, un
)) {
205 if (!IS_REGION_DIRTY(i
, un
)) {
208 if (un
->un_outstanding_writes
[i
] != 0) {
214 * Handle the case where a resync has completed and we still
215 * have the un_dirty_bm[] entries marked as dirty (these are
216 * the most recent DRL re-read from the replica). They need
217 * to be cleared from our un_dirty_bm[] but they will not have
218 * corresponding un_pernode_dirty[] entries set unless (and
219 * until) further write()s have been issued to the area.
220 * This handles the case where only the un_dirty_bm[] entry is
221 * set. Without this we'd not clear this region until a local
222 * write is issued to the affected area.
224 if (IS_PERNODE_DIRTY(md_mn_mynode_id
, i
, un
) ||
225 (un
->un_pernode_dirty_sum
[i
] == (uchar_t
)0)) {
226 if (!IS_GOING_CLEAN(i
, un
)) {
227 SET_GOING_CLEAN(i
, un
);
232 * Now we've got a flagged pernode_dirty, _or_ a clean
233 * bitmap entry to process. Update the bitmap to flush
234 * the REGION_DIRTY / GOING_CLEAN bits when we send the
235 * cross-cluster message.
238 setbit(MDMN_MSG_RR_CLEAN_DATA(*msgp
), i
- start_bit
);
241 * Not marked as active in the pernode bitmap, so skip
242 * any update to this. We just increment the 0 count
243 * and adjust the active count by any outstanding
244 * un_pernode_dirty_sum[] entries. This means we don't
245 * leave the mirror permanently dirty.
247 (*activep
) += (int)un
->un_pernode_dirty_sum
[i
];
250 if (!cleared_dirty
) {
251 kmem_free(*msgp
, MDMN_MSG_RR_CLEAN_SIZE_DATA(data_bytes
));
254 un
->un_rr_clean_start_bit
= next_bit
;
255 return (cleared_dirty
);
/*
 * NOTE(review): process_resync_regions_non_owner() — non-owner node path of
 * resync-region cleanup. Visible behavior: serializes on un_rrp_inflight_mx,
 * builds a clean bitmap via mirror_generate_rr_bitmap(), sends an
 * MD_MN_MSG_RR_CLEAN message (directed to un_mirror_owner) between
 * CALLB_CPR_SAFE_BEGIN/END, and on successful send clears the corresponding
 * pernode-dirty / region-dirty bits before freeing kres and rmsg.
 * Line-mangled extraction with interior lines (returns, closing braces,
 * some rw_enter arguments) missing; kept byte-identical, comments only.
 */
259 * There are three paths into here:
261 * md_daemon -> check_resync_regions -> prr
262 * mirror_internal_close -> mirror_process_unit_resync -> prr
263 * mirror_set_capability -> mirror_process_unit_resync -> prr
265 * The first one is a kernel daemon, the other two result from system calls.
266 * Thus, only the first case needs to deal with kernel CPR activity. This
267 * is indicated by the cprinfop being non-NULL for kernel daemon calls, and
268 * NULL for system call paths.
271 process_resync_regions_non_owner(mm_unit_t
*un
, callb_cpr_t
*cprinfop
)
274 int cleared_dirty
= 0;
275 /* Number of reasons why we can not proceed shutting down the mirror. */
277 set_t setno
= MD_UN2SET(un
);
278 md_mn_msg_rr_clean_t
*rmsg
;
279 md_mn_kresult_t
*kres
;
281 minor_t mnum
= MD_SID(un
);
282 mdi_unit_t
*ui
= MDI_UNIT(mnum
);
283 md_mn_nodeid_t owner_node
;
286 * We drop the readerlock here to assist lock ordering with
287 * update_resync. Once we have the un_rrp_inflight_mx, we
290 md_unit_readerexit(ui
);
293 * Resync region processing must be single threaded. We can't use
294 * un_resync_mx for this purpose since this mutex gets released
295 * when blocking on un_resync_cv.
297 mutex_enter(&un
->un_rrp_inflight_mx
);
299 (void) md_unit_readerlock(ui
);
301 mutex_enter(&un
->un_resync_mx
);
303 rw_enter(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1], RW_READER
);
304 cleared_dirty
= mirror_generate_rr_bitmap(un
, &rmsg
, &active
);
305 rw_exit(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1]);
308 owner_node
= un
->un_mirror_owner
;
309 mutex_exit(&un
->un_resync_mx
);
312 * Transmit the 'to-be-cleared' bitmap to all cluster nodes.
313 * Receipt of the message will cause the mirror owner to
314 * update the on-disk DRL.
317 kres
= kmem_alloc(sizeof (md_mn_kresult_t
), KM_SLEEP
);
319 /* release readerlock before sending message */
320 md_unit_readerexit(ui
);
323 mutex_enter(&un
->un_prr_cpr_mx
);
324 CALLB_CPR_SAFE_BEGIN(cprinfop
);
327 rval
= mdmn_ksend_message(setno
, MD_MN_MSG_RR_CLEAN
,
328 MD_MSGF_NO_LOG
|MD_MSGF_BLK_SIGNAL
|MD_MSGF_KSEND_NORETRY
|
329 MD_MSGF_DIRECTED
, un
->un_mirror_owner
,
330 (char *)rmsg
, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg
), kres
);
333 CALLB_CPR_SAFE_END(cprinfop
, &un
->un_prr_cpr_mx
);
334 mutex_exit(&un
->un_prr_cpr_mx
);
337 /* reacquire readerlock after message */
338 (void) md_unit_readerlock(ui
);
340 if ((!MDMN_KSEND_MSG_OK(rval
, kres
)) &&
341 (kres
->kmmr_comm_state
!= MDMNE_NOT_JOINED
)) {
342 /* if commd is gone, no point in printing a message */
343 if (md_mn_is_commd_present())
344 mdmn_ksend_show_error(rval
, kres
, "RR_CLEAN");
345 kmem_free(kres
, sizeof (md_mn_kresult_t
));
346 kmem_free(rmsg
, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg
));
347 mutex_exit(&un
->un_rrp_inflight_mx
);
/* NOTE(review): the return statement that presumably follows this early-exit
 * path is missing from this extraction — confirm against the original. */
350 kmem_free(kres
, sizeof (md_mn_kresult_t
));
353 * If ownership changed while we were sending, we probably
354 * sent the message to the wrong node. Leave fixing that for
357 if (un
->un_mirror_owner
!= owner_node
) {
358 mutex_exit(&un
->un_rrp_inflight_mx
);
363 * Now that we've sent the message, clear them from the
364 * pernode_dirty arrays. These are ONLY cleared on a
365 * successful send, and failure has no impact.
368 start
= MDMN_MSG_RR_CLEAN_START_BIT(rmsg
);
369 end
= start
+ MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg
) * NBBY
;
370 mutex_enter(&un
->un_resync_mx
);
371 rw_enter(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1],
373 for (i
= start
; i
< end
; i
++) {
374 if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg
),
376 if (IS_PERNODE_DIRTY(md_mn_mynode_id
, i
, un
)) {
377 un
->un_pernode_dirty_sum
[i
]--;
378 CLR_PERNODE_DIRTY(md_mn_mynode_id
, i
,
381 if (IS_REGION_DIRTY(i
, un
)) {
383 CLR_REGION_DIRTY(i
, un
);
384 CLR_GOING_CLEAN(i
, un
);
388 rw_exit(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1]);
390 kmem_free(rmsg
, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg
));
392 mutex_exit(&un
->un_resync_mx
);
394 mutex_exit(&un
->un_rrp_inflight_mx
);
/*
 * NOTE(review): process_resync_regions_owner() — owner-node path of
 * resync-region cleanup. Visible behavior: serializes on un_rrp_inflight_mx,
 * waits out MM_RF_STALL_CLEAN on un_resync_cv, generates the clean bitmap,
 * clears pernode/region dirty bits, scans all regions (honoring
 * MD_UN_KEEP_DIRTY / outstanding writes), then commits the DRL record via
 * mddb_commitrec_wrapper() with MM_RF_GATECLOSED set, and wakes waiters.
 * Line-mangled extraction with interior lines missing (returns, braces,
 * rw_enter lock-mode argument); kept byte-identical, comments only.
 */
400 process_resync_regions_owner(mm_unit_t
*un
)
403 int cleared_dirty
= 0;
404 /* Number of reasons why we can not proceed shutting down the mirror. */
406 set_t setno
= MD_UN2SET(un
);
407 int mnset
= MD_MNSET_SETNO(setno
);
408 md_mn_msg_rr_clean_t
*rmsg
;
409 minor_t mnum
= MD_SID(un
);
410 mdi_unit_t
*ui
= MDI_UNIT(mnum
);
413 * We drop the readerlock here to assist lock ordering with
414 * update_resync. Once we have the un_rrp_inflight_mx, we
417 md_unit_readerexit(ui
);
420 * Resync region processing must be single threaded. We can't use
421 * un_resync_mx for this purpose since this mutex gets released
422 * when blocking on un_resync_cv.
424 mutex_enter(&un
->un_rrp_inflight_mx
);
426 (void) md_unit_readerlock(ui
);
428 mutex_enter(&un
->un_resync_mx
);
429 un
->un_waiting_to_clear
++;
430 while (un
->un_resync_flg
& MM_RF_STALL_CLEAN
)
431 cv_wait(&un
->un_resync_cv
, &un
->un_resync_mx
);
432 un
->un_waiting_to_clear
--;
435 rw_enter(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1],
437 cleared_dirty
= mirror_generate_rr_bitmap(un
, &rmsg
, &active
);
441 * Clear the bits from the pernode_dirty arrays.
442 * If that results in any being cleared from the
443 * un_dirty_bm, commit it.
446 start
= MDMN_MSG_RR_CLEAN_START_BIT(rmsg
);
447 end
= start
+ MDMN_MSG_RR_CLEAN_DATA_BYTES(rmsg
) * NBBY
;
448 for (i
= start
; i
< end
; i
++) {
449 if (isset(MDMN_MSG_RR_CLEAN_DATA(rmsg
),
451 if (IS_PERNODE_DIRTY(md_mn_mynode_id
, i
,
453 un
->un_pernode_dirty_sum
[i
]--;
455 md_mn_mynode_id
, i
, un
);
457 if (un
->un_pernode_dirty_sum
[i
] == 0) {
459 CLR_REGION_DIRTY(i
, un
);
460 CLR_GOING_CLEAN(i
, un
);
464 kmem_free(rmsg
, MDMN_MSG_RR_CLEAN_MSG_SIZE(rmsg
));
466 rw_exit(&un
->un_pernode_dirty_mx
[md_mn_mynode_id
- 1]);
/* NOTE(review): second pass over all regions — transitions regions through
 * GOING_CLEAN before finally clearing REGION_DIRTY on a later pass. */
468 for (i
= 0; i
< un
->un_rrd_num
; i
++) {
469 if (un
->c
.un_status
& MD_UN_KEEP_DIRTY
)
470 if (IS_KEEPDIRTY(i
, un
))
473 if (!IS_REGION_DIRTY(i
, un
))
475 if (un
->un_outstanding_writes
[i
] != 0) {
480 if (!IS_GOING_CLEAN(i
, un
)) {
481 SET_GOING_CLEAN(i
, un
);
485 CLR_REGION_DIRTY(i
, un
);
486 CLR_GOING_CLEAN(i
, un
);
492 un
->un_resync_flg
|= MM_RF_GATECLOSED
;
493 mutex_exit(&un
->un_resync_mx
);
494 mddb_commitrec_wrapper(un
->un_rr_dirty_recid
);
495 mutex_enter(&un
->un_resync_mx
);
496 un
->un_resync_flg
&= ~MM_RF_GATECLOSED
;
498 if (un
->un_waiting_to_mark
!= 0 ||
499 un
->un_waiting_to_clear
!= 0) {
501 cv_broadcast(&un
->un_resync_cv
);
504 mutex_exit(&un
->un_resync_mx
);
506 mutex_exit(&un
->un_rrp_inflight_mx
);
/*
 * NOTE(review): process_resync_regions() — dispatcher. For a multi-node set
 * with no owner (or no commd while not owner) it bails out; for a multi-node
 * non-owner it delegates to process_resync_regions_non_owner(); otherwise to
 * process_resync_regions_owner(). Line-mangled extraction (the early-return
 * body of the first branch is missing); kept byte-identical, comments only.
 */
512 process_resync_regions(mm_unit_t
*un
, callb_cpr_t
*cprinfop
)
514 int mnset
= MD_MNSET_SETNO(MD_UN2SET(un
));
516 * For a mirror we can only update the on-disk resync-record if we
517 * currently own the mirror. If we are called and there is no owner we
518 * bail out before scanning the outstanding_writes[] array.
519 * NOTE: we only need to check here (before scanning the array) as we
520 * are called with the readerlock held. This means that a change
521 * of ownership away from us will block until this resync check
524 if (mnset
&& (MD_MN_NO_MIRROR_OWNER(un
) ||
525 (!MD_MN_MIRROR_OWNER(un
) && !md_mn_is_commd_present_lite()))) {
527 } else if (mnset
&& !MD_MN_MIRROR_OWNER(un
)) {
528 return (process_resync_regions_non_owner(un
, cprinfop
));
530 return (process_resync_regions_owner(un
));
/*
 * NOTE(review): mirror_process_unit_resync() — retry loop around
 * process_resync_regions(NULL) bounded by md_mirror_rr_cleans; after
 * md_mirror_rr_polls busy iterations it switches to sleeping
 * md_mirror_rr_sleep_timo * md_hz ticks per iteration via delay().
 * Line-mangled extraction (the cmn_err call site around the "Could not
 * clean" string and the cleans counter increment are partially missing);
 * kept byte-identical, comments only.
 */
535 * Function that is callable from other modules to provide
536 * ability to cleanup dirty region bitmap on demand. Used
537 * on last close of a unit to avoid massive device resyncs
538 * when coming back after rolling large amounts of data to
539 * a mirror (e.g. at umount with logging).
543 mirror_process_unit_resync(mm_unit_t
*un
)
547 while (process_resync_regions(un
, NULL
)) {
550 if (cleans
>= md_mirror_rr_cleans
) {
552 "Could not clean resync regions\n");
555 if (cleans
> md_mirror_rr_polls
) {
557 * We did not make it with md_mirror_rr_polls
558 * iterations. Give the system relief and
559 * switch over to non-busy-wait.
561 delay(md_mirror_rr_sleep_timo
* md_hz
);
/*
 * NOTE(review): check_resync_regions() — daemon callback. Walks
 * mirror_md_ops.md_head under md_link_rw as reader, skips stale sets,
 * registers a CPR callback per unit (CALLB_CPR_INIT/EXIT around
 * un_prr_cpr_mx), skips ABR / offline-submirror / single-submirror units,
 * calls process_resync_regions(), then clears timeout->dr_pending under
 * mirror_timeout.dr_mx. Line-mangled extraction with continue/brace lines
 * missing; kept byte-identical, comments only.
 */
567 check_resync_regions(daemon_request_t
*timeout
)
574 rw_enter(&mirror_md_ops
.md_link_rw
.lock
, RW_READER
);
575 for (next
= mirror_md_ops
.md_head
; next
!= NULL
; next
= next
->ln_next
) {
577 if (md_get_setstatus(next
->ln_setno
) & MD_SET_STALE
)
580 un
= MD_UNIT(next
->ln_id
);
583 * Register this resync thread with the CPR mechanism. This
584 * allows us to detect when the system is suspended and so
585 * keep track of the RPC failure condition.
587 CALLB_CPR_INIT(&cprinfo
, &un
->un_prr_cpr_mx
, callb_md_mrs_cpr
,
588 "check_resync_regions");
590 ui
= MDI_UNIT(next
->ln_id
);
591 (void) md_unit_readerlock(ui
);
594 * Do not clean up resync regions if it is an ABR
595 * mirror, or if a submirror is offline (we will use the resync
596 * region to resync when back online) or if there is only one
599 if ((ui
->ui_tstate
& MD_ABR_CAP
) ||
600 (un
->c
.un_status
& MD_UN_OFFLINE_SM
) || (un
->un_nsm
< 2)) {
601 md_unit_readerexit(ui
);
602 /* Remove this thread from the CPR callback table. */
603 mutex_enter(&un
->un_prr_cpr_mx
);
604 CALLB_CPR_EXIT(&cprinfo
);
608 (void) process_resync_regions(un
, &cprinfo
);
610 md_unit_readerexit(ui
);
612 /* Remove this thread from the CPR callback table. */
613 mutex_enter(&un
->un_prr_cpr_mx
);
614 CALLB_CPR_EXIT(&cprinfo
);
617 rw_exit(&mirror_md_ops
.md_link_rw
.lock
);
620 mutex_enter(&mirror_timeout
.dr_mx
);
621 timeout
->dr_pending
= 0;
622 mutex_exit(&mirror_timeout
.dr_mx
);
/*
 * NOTE(review): md_mirror_timeout() — periodic timeout handler. Under
 * mirror_timeout.dr_mx: queues check_resync_regions on md_mto_daemon when
 * no request is pending, then re-arms itself via timeout() while any mirror
 * units exist (mirror_md_ops.md_head != NULL), else zeroes dr_timeout_id.
 * Line-mangled extraction (else keyword/braces missing between the two
 * timeout-id assignments); kept byte-identical, comments only.
 */
626 md_mirror_timeout(void *throwaway
)
629 mutex_enter(&mirror_timeout
.dr_mx
);
630 if (!mirror_timeout
.dr_pending
) {
631 mirror_timeout
.dr_pending
= 1;
632 daemon_request(&md_mto_daemon
, check_resync_regions
,
633 (daemon_queue_t
*)&mirror_timeout
, REQ_OLD
);
636 if (mirror_md_ops
.md_head
!= NULL
)
637 mirror_timeout
.dr_timeout_id
= timeout(md_mirror_timeout
,
638 throwaway
, (int)MD_MDELAY
*hz
);
640 mirror_timeout
.dr_timeout_id
= 0;
642 mutex_exit(&mirror_timeout
.dr_mx
);
/*
 * NOTE(review): resync_start_timeout() — arms the md_mirror_timeout()
 * callback (once, if dr_timeout_id == 0) for a non-stale set, under
 * mirror_timeout.dr_mx. Line-mangled extraction (early-return body for the
 * stale-set check missing); kept byte-identical, comments only.
 */
646 resync_start_timeout(set_t setno
)
648 if (md_get_setstatus(setno
) & MD_SET_STALE
)
651 mutex_enter(&mirror_timeout
.dr_mx
);
652 if (mirror_timeout
.dr_timeout_id
== 0)
653 mirror_timeout
.dr_timeout_id
= timeout(md_mirror_timeout
,
654 (void *)NULL
, (int)MD_MDELAY
*hz
);
655 mutex_exit(&mirror_timeout
.dr_mx
);
/*
 * NOTE(review): offlined_to_attached() — for a non-stale set, converts each
 * submirror's state: SMS_OFFLINE -> SMS_ATTACHED and SMS_OFFLINE_RESYNC ->
 * SMS_ATTACHED_RESYNC via mirror_set_sm_state(), then clears
 * MD_UN_OFFLINE_SM and marks the unit record MD_PRV_PENDCOM.
 * Line-mangled extraction (early return and brace lines missing);
 * kept byte-identical, comments only.
 */
659 offlined_to_attached(mm_unit_t
*un
)
664 if (md_get_setstatus(MD_UN2SET(un
)) & MD_SET_STALE
)
667 for (i
= 0; i
< NMIRROR
; i
++) {
668 if (SMS_BY_INDEX_IS(un
, i
, SMS_OFFLINE
)) {
669 mirror_set_sm_state(&un
->un_sm
[i
],
670 &un
->un_smic
[i
], SMS_ATTACHED
, 1);
673 if (SMS_BY_INDEX_IS(un
, i
, SMS_OFFLINE_RESYNC
)) {
674 mirror_set_sm_state(&un
->un_sm
[i
],
675 &un
->un_smic
[i
], SMS_ATTACHED_RESYNC
, 1);
681 un
->c
.un_status
&= ~MD_UN_OFFLINE_SM
;
682 mddb_setrecprivate(un
->c
.un_record_id
, MD_PRV_PENDCOM
);
/*
 * NOTE(review): get_unit_resync() — attaches the existing on-disk resync
 * (DRL) record to the unit. If un_rr_dirty_recid is 0, or the record is
 * missing/empty (MDDB_NORECORD / MDDB_NODATA), falls back to
 * offlined_to_attached(); otherwise marks the record MD_PRV_GOTIT and
 * points un_dirty_bm at the record's or_rr bitmap. Line-mangled extraction
 * (return statements after the fallback calls are missing);
 * kept byte-identical, comments only.
 */
687 get_unit_resync(mm_unit_t
*un
)
689 mddb_recstatus_t status
;
690 struct optim_resync
*orp
;
692 if (un
->un_rr_dirty_recid
== 0) {
693 offlined_to_attached(un
);
697 status
= mddb_getrecstatus(un
->un_rr_dirty_recid
);
698 if ((status
== MDDB_NORECORD
) || (status
== MDDB_NODATA
)) {
699 un
->un_rr_dirty_recid
= 0;
700 offlined_to_attached(un
);
704 mddb_setrecprivate(un
->un_rr_dirty_recid
, MD_PRV_GOTIT
);
705 orp
= (struct optim_resync
*)mddb_getrecaddr(un
->un_rr_dirty_recid
);
706 un
->un_dirty_bm
= orp
->or_rr
;
/*
 * NOTE(review): create_unit_resync() — creates a fresh optimized-resync
 * (DRL) record. Computes region block size / count from total blocks
 * (MD_MIN_RR_SIZE / MD_DEF_NUM_RR), allocates the mddb record
 * (MD_CRO_OPTIMIZE|MD_CRO_32BIT), initializes or_magic/or_blksize/or_num,
 * marks the whole bitmap dirty (0xFF), and commits (immediately when not
 * snarfing; deferred via MD_PRV_PENDCOM when snarfing). The snarfing branch
 * visible here marks the set stale on record-creation failure — the failure
 * check itself is missing from this extraction; confirm in original.
 * Kept byte-identical, comments only.
 */
710 create_unit_resync(mm_unit_t
*un
, int snarfing
)
714 int blksize
; /* rr size in blocks */
717 size_t size
; /* bitmap size */
722 tb
= un
->c
.un_total_blocks
;
724 if (((tb
+ MD_MIN_RR_SIZE
)/ MD_MIN_RR_SIZE
) > MD_DEF_NUM_RR
) {
725 blksize
= (int)(tb
/ MD_DEF_NUM_RR
);
726 num_rr
= (int)((tb
+ (blksize
)) / (blksize
));
728 blksize
= MD_MIN_RR_SIZE
;
729 num_rr
= (int)((tb
+ MD_MIN_RR_SIZE
) / MD_MIN_RR_SIZE
);
732 size
= howmany(num_rr
, NBBY
) + sizeof (*orp
) - sizeof (orp
->or_rr
);
734 setno
= MD_UN2SET(un
);
736 typ1
= (mddb_type_t
)md_getshared_key(setno
,
737 mirror_md_ops
.md_driver
.md_drivername
);
739 recid
= mddb_createrec(size
, typ1
, RESYNC_REC
,
740 MD_CRO_OPTIMIZE
|MD_CRO_32BIT
, setno
);
742 if (snarfing
&& !(md_get_setstatus(setno
) & MD_SET_STALE
)) {
743 md_set_setstatus(setno
, MD_SET_STALE
);
744 cmn_err(CE_WARN
, "md: state database is stale");
749 un
->un_rr_dirty_recid
= recid
;
750 orp
= (optim_resync_t
*)mddb_getrecaddr(recid
);
751 orp
->or_magic
= OR_MAGIC
;
752 orp
->or_blksize
= blksize
;
753 orp
->or_num
= num_rr
;
755 un
->un_rrd_blksize
= blksize
;
756 un
->un_rrd_num
= num_rr
;
757 un
->un_dirty_bm
= orp
->or_rr
;
760 for (i
= 0; i
< howmany(num_rr
, NBBY
); i
++)
761 orp
->or_rr
[i
] = 0xFF;
764 mddb_commitrec_wrapper(recid
);
765 mirror_commit(un
, NO_SUBMIRRORS
, 0);
768 mddb_setrecprivate(recid
, MD_PRV_PENDCOM
);
769 mddb_setrecprivate(un
->c
.un_record_id
, MD_PRV_PENDCOM
);
/*
 * NOTE(review): unit_setup_resync() — initializes all per-unit resync state:
 * mutexes/CV, zeroed counters, the going-clean/going-dirty/outstanding-writes
 * /resync bitmaps (kmem_zalloc sized by un_rrd_num), the per-node dirty
 * bitmap + sum for multi-node sets, a deferred-RR_CLEAN taskq, and — when
 * snarfing a previously-written mirror with more than one syncable
 * submirror — copies un_dirty_bm into un_resync_bm and flags the unit and
 * submirrors as resync targets (MD_UN_OPT_NOT_DONE | MD_UN_WAR,
 * MD_SM_RESYNC_TARGET). Line-mangled extraction; calls to
 * get_unit_resync()/create_unit_resync() selection and several returns are
 * missing — confirm against the original. Kept byte-identical, comments only.
 */
774 unit_setup_resync(mm_unit_t
*un
, int snarfing
)
779 mdi_unit_t
*ui
= MDI_UNIT(MD_SID(un
));
780 int nonABR
= 1; /* only set if ABR marked in ui_tstate */
782 un
->un_dirty_bm
= NULL
;
783 un
->un_rs_buffer
= NULL
;
785 mutex_init(&un
->un_rrp_inflight_mx
, "rrp mx", MUTEX_DEFAULT
, NULL
);
787 mutex_init(&un
->un_resync_mx
, NULL
, MUTEX_DEFAULT
, NULL
);
788 cv_init(&un
->un_resync_cv
, NULL
, CV_DEFAULT
, NULL
);
789 un
->un_resync_flg
= 0;
790 un
->un_waiting_to_mark
= 0;
791 un
->un_waiting_to_commit
= 0;
792 un
->un_waiting_to_clear
= 0;
794 un
->un_goingclean_bm
= NULL
;
795 un
->un_goingdirty_bm
= NULL
;
796 un
->un_outstanding_writes
= NULL
;
797 un
->un_resync_bm
= NULL
;
802 if (un
->un_rr_dirty_recid
== 0) {
804 * If a MN diskset and snarfing and this node is not the
805 * master, do not delete any records on snarf of the
806 * mirror records (create_unit_resync deletes records).
808 * Master node should have already handled this case.
810 if (MD_MNSET_SETNO(MD_UN2SET(un
)) && snarfing
&&
811 md_set
[MD_UN2SET(un
)].s_am_i_master
== 0) {
813 cmn_err(CE_NOTE
, "unit_setup_resync: no rr for %s on"
814 " nodeid %d\n", md_shortname(MD_SID(un
)),
815 md_set
[MD_UN2SET(un
)].s_nodeid
);
819 if ((err
= create_unit_resync(un
, snarfing
)) != 0)
823 un
->un_goingclean_bm
= (uchar_t
*)kmem_zalloc((uint_t
)(howmany(
824 un
->un_rrd_num
, NBBY
)), KM_SLEEP
);
825 un
->un_goingdirty_bm
= (uchar_t
*)kmem_zalloc((uint_t
)(howmany(
826 un
->un_rrd_num
, NBBY
)), KM_SLEEP
);
827 un
->un_outstanding_writes
= (short *)kmem_zalloc(
828 (uint_t
)un
->un_rrd_num
* sizeof (short), KM_SLEEP
);
829 un
->un_resync_bm
= (uchar_t
*)kmem_zalloc((uint_t
)(howmany(
830 un
->un_rrd_num
, NBBY
)), KM_SLEEP
);
833 * Allocate pernode bitmap for this node. All other nodes' maps will
834 * be created 'on-the-fly' in the ioctl message handler
836 if (MD_MNSET_SETNO(MD_UN2SET(un
))) {
837 un
->un_pernode_dirty_sum
=
838 (uchar_t
*)kmem_zalloc(un
->un_rrd_num
, KM_SLEEP
);
839 if (md_mn_mynode_id
> 0) {
840 un
->un_pernode_dirty_bm
[md_mn_mynode_id
-1] = (uchar_t
*)
841 kmem_zalloc((uint_t
)(howmany(un
->un_rrd_num
, NBBY
)),
846 * Allocate taskq to process deferred (due to locking) RR_CLEAN
849 un
->un_drl_task
= (ddi_taskq_t
*)md_create_taskq(MD_UN2SET(un
),
853 if (md_get_setstatus(MD_UN2SET(un
)) & MD_SET_STALE
)
857 * Only mark mirror which has an associated DRL as requiring a resync.
858 * For ABR mirrors we need not set the resync record bitmap up.
860 if (ui
&& (ui
->ui_tstate
& MD_ABR_CAP
))
863 for (i
= 0, syncable
= 0; i
< NMIRROR
; i
++) {
865 if ((SUBMIRROR_IS_READABLE(un
, i
) ||
866 SMS_BY_INDEX_IS(un
, i
,
867 (SMS_OFFLINE
| SMS_OFFLINE_RESYNC
))))
872 if (snarfing
&& un
->un_pass_num
&& (syncable
> 1)) {
873 bcopy((caddr_t
)un
->un_dirty_bm
, (caddr_t
)un
->un_resync_bm
,
874 howmany(un
->un_rrd_num
, NBBY
));
876 un
->c
.un_status
|= (MD_UN_OPT_NOT_DONE
| MD_UN_WAR
);
877 un
->c
.un_status
&= ~MD_UN_OFFLINE_SM
;
878 for (i
= 0; i
< NMIRROR
; i
++) {
879 if ((SUBMIRROR_IS_READABLE(un
, i
)) ||
880 SMS_BY_INDEX_IS(un
, i
, SMS_OFFLINE_RESYNC
))
881 un
->un_sm
[i
].sm_flags
|= MD_SM_RESYNC_TARGET
;
883 if (SMS_BY_INDEX_IS(un
, i
, SMS_OFFLINE
)) {
884 un
->un_sm
[i
].sm_flags
|= MD_SM_RESYNC_TARGET
;
885 mirror_set_sm_state(&un
->un_sm
[i
],
886 &un
->un_smic
[i
], SMS_OFFLINE_RESYNC
, 1);
887 mddb_setrecprivate(un
->c
.un_record_id
,
/*
 * NOTE(review): resync_kill_pending() — visible body only; the function
 * signature (and its return-value plumbing) is missing from this
 * extraction. Visible behavior: drops the unit writer/reader lock per
 * mx_type, waits on un_rs_thread_cv while MD_RI_BLOCK/MD_RI_BLOCK_OWNER
 * are set (breaking out on MD_RI_KILL/MD_RI_SHUTDOWN), sets
 * MD_UN_RESYNC_CANCEL for a kill, then reacquires the dropped lock.
 * Kept byte-identical, comments only.
 */
896 * resync_kill_pending:
897 * -------------------
898 * Determine if the resync thread has been requested to terminate.
899 * Block if MD_RI_BLOCK or MD_RI_BLOCK_OWNER is set in un->un_rs_thread_flags.
900 * MD_RI_BLOCK is only set as a result of a user-initiated ioctl via metasync.
901 * MD_RI_BLOCK_OWNER is set by the ownership change of a multi-node mirror.
905 * 1 Kill requested (set MD_UN_RESYNC_CANCEL in un->c.un_status)
907 * Note: this routine may block
908 * the writerlock for <ui> will be dropped and reacquired if <mx_type>
909 * is set to MD_WRITER_HELD.
910 * the readerlock for <ui> will be dropped and reacquired if <mx_type>
911 * is set to MD_READER_HELD.
921 /* Ensure that we don't block with any mutex held */
922 if (mx_type
== MD_WRITER_HELD
) {
923 md_unit_writerexit(ui
);
924 } else if (mx_type
== MD_READER_HELD
) {
925 md_unit_readerexit(ui
);
927 mutex_enter(&un
->un_rs_thread_mx
);
928 while (un
->un_rs_thread_flags
& (MD_RI_BLOCK
|MD_RI_BLOCK_OWNER
)) {
929 cv_wait(&un
->un_rs_thread_cv
, &un
->un_rs_thread_mx
);
930 if (un
->un_rs_thread_flags
& (MD_RI_KILL
|MD_RI_SHUTDOWN
))
933 /* Determine if we've been asked to abort or shutdown gracefully */
934 if (un
->un_rs_thread_flags
& MD_RI_KILL
) {
935 un
->c
.un_status
|= MD_UN_RESYNC_CANCEL
;
937 } else if (un
->un_rs_thread_flags
& MD_RI_SHUTDOWN
) {
940 mutex_exit(&un
->un_rs_thread_mx
);
942 /* Reacquire mutex if dropped on entry */
943 if (mx_type
== MD_WRITER_HELD
) {
944 (void) md_unit_writerlock(ui
);
945 } else if (mx_type
== MD_READER_HELD
) {
946 (void) md_unit_readerlock(ui
);
/*
 * NOTE(review): resync_read_buffer() — allocates a mirror child structure
 * from mirror_child_cache, sets up its buf (B_READ, dbtob(cnt) bytes into
 * un_rs_buffer on the md device), drops the unit readerlock, issues
 * md_mirror_strategy() with write-after-read (MD_STR_WAR) flags, reacquires
 * the readerlock, checks B_ERROR, and frees the child structure.
 * Line-mangled extraction (buf initialization lines, biowait-style
 * completion, and error-return lines are missing — confirm in original).
 * Kept byte-identical, comments only.
 */
952 * resync_read_buffer:
954 * Issue the resync source read for the specified start block and size.
955 * This will cause the mirror strategy routine to issue a write-after-read
956 * once this request completes successfully.
957 * If 'flag_err' is set we expect to see a write error flagged in the b_error
958 * field of the buffer created for this i/o request. If clear we do not expect
959 * to see the error flagged for write failures.
960 * Read failures will always set the B_ERROR bit which will stop the resync
964 resync_read_buffer(mm_unit_t
*un
, diskaddr_t blk
, size_t cnt
, int flag_err
)
970 sp
= kmem_cache_alloc(mirror_child_cache
, MD_ALLOCFLAGS
);
971 mirror_child_init(sp
);
974 bp
->b_edev
= makedevice(md_major
, MD_SID(un
));
975 bp
->b_flags
= B_READ
;
977 bp
->b_bcount
= dbtob(cnt
);
978 bp
->b_un
.b_addr
= un
->un_rs_buffer
;
979 md_unit_readerexit(MDI_UNIT(MD_SID(un
)));
981 (void) md_mirror_strategy(bp
, MD_STR_NOTTOP
| MD_STR_MAPPED
|
982 MD_STR_WAR
| (flag_err
? MD_STR_FLAG_ERR
: 0), NULL
);
986 (void) md_unit_readerlock(MDI_UNIT(MD_SID(un
)));
987 if (bp
->b_flags
& B_ERROR
) {
990 kmem_cache_free(mirror_child_cache
, sp
);
/*
 * NOTE(review): send_mn_resync_done_message() — broadcasts
 * MD_MN_MSG_RESYNC_PHASE_DONE to all nodes at the end of a resync phase.
 * Suppresses the send when the resync thread is being killed/shut down or
 * MD_UN_RESYNC_CANCEL is set, unless RESYNC_ERR forces it. Drops the unit
 * writerlock around the CPR-safe mdmn_ksend_message() call, retries once
 * after waiting for commd (MDMNE_RPC_FAIL), and panics if the retry fails.
 * Line-mangled extraction (parameter list, returns, and the commd wait-loop
 * body are missing — confirm in original). Kept byte-identical, comments
 * only.
 */
995 * send_mn_resync_done_message
997 * At the end of a resync, send a message to all nodes to indicate that
998 * the resync is complete. The argument, flags, has the following values
1000 * RESYNC_ERR - if an error occurred that terminated the resync
1001 * CLEAR_OPT_NOT_DONE - Just need to clear the OPT_NOT_DONE flag
1003 * unit writerlock set on entry
1004 * Only send the message if the thread is not marked as shutting down:
1005 * [un_rs_thread_flags & MD_RI_SHUTDOWN] or being killed:
1006 * [un->c.un_status & MD_UN_RESYNC_CANCEL]
1007 * or if there has been an error that terminated the resync:
1008 * flags & RESYNC_ERR
1012 send_mn_resync_done_message(
1017 md_mn_msg_resync_t
*rmsg
= un
->un_rs_msg
;
1019 mdi_unit_t
*ui
= MDI_UNIT(MD_SID(un
));
1020 md_mn_kresult_t
*kres
;
1025 rmsg
= (md_mn_msg_resync_t
*)un
->un_rs_msg
;
1028 * Only send the message if this resync thread is still active. This
1029 * handles the case where ownership changes to different nodes during
1030 * a resync can cause multiple spurious resync_done messages to occur
1031 * when the resync completes. This happens because only one node is
1032 * the resync owner but other nodes will have their resync_unit thread
1033 * blocked in 'resync_kill_pending'
1035 mutex_enter(&un
->un_rs_thread_mx
);
1036 dont_send
= (un
->un_rs_thread_flags
& (MD_RI_KILL
|MD_RI_SHUTDOWN
)) ? 1
1038 mutex_exit(&un
->un_rs_thread_mx
);
1039 dont_send
|= (un
->c
.un_status
& MD_UN_RESYNC_CANCEL
) ? 1 : 0;
1042 * Always send a message if we've encountered an error that terminated
1045 if (flags
& RESYNC_ERR
)
1050 if (mirror_debug_flag
) {
1051 printf("Don't send resync done message, mnum = %x,"
1052 " type = %x, flags = %d\n", MD_SID(un
),
1053 un
->un_rs_type
, flags
);
1060 if (mirror_debug_flag
) {
1061 printf("send resync done message, mnum = %x, type = %x\n",
1062 MD_SID(un
), un
->un_rs_type
);
1066 rmsg
->msg_resync_mnum
= MD_SID(un
);
1067 rmsg
->msg_resync_type
= un
->un_rs_type
;
1068 rmsg
->msg_originator
= md_mn_mynode_id
;
1069 rmsg
->msg_resync_flags
= 0;
1070 if (flags
& RESYNC_ERR
)
1071 rmsg
->msg_resync_flags
|= MD_MN_RS_ERR
;
1072 if (flags
& CLEAR_OPT_NOT_DONE
)
1073 rmsg
->msg_resync_flags
|= MD_MN_RS_CLEAR_OPT_NOT_DONE
;
1075 setno
= MD_MIN2SET(MD_SID(un
));
1076 md_unit_writerexit(ui
);
1077 kres
= kmem_alloc(sizeof (md_mn_kresult_t
), KM_SLEEP
);
1080 mutex_enter(&un
->un_rs_cpr_mx
);
1081 CALLB_CPR_SAFE_BEGIN(&un
->un_rs_cprinfo
);
1083 rval
= mdmn_ksend_message(setno
, MD_MN_MSG_RESYNC_PHASE_DONE
,
1084 MD_MSGF_NO_LOG
, 0, (char *)rmsg
, sizeof (md_mn_msg_resync_t
), kres
);
1086 CALLB_CPR_SAFE_END(&un
->un_rs_cprinfo
, &un
->un_rs_cpr_mx
);
1087 mutex_exit(&un
->un_rs_cpr_mx
);
1089 /* if the node hasn't yet joined, it's Ok. */
1090 if ((!MDMN_KSEND_MSG_OK(rval
, kres
)) &&
1091 (kres
->kmmr_comm_state
!= MDMNE_NOT_JOINED
)) {
1092 mdmn_ksend_show_error(rval
, kres
, "RESYNC_PHASE_DONE");
1093 /* If we're shutting down already, pause things here. */
1094 if (kres
->kmmr_comm_state
== MDMNE_RPC_FAIL
) {
1095 while (!md_mn_is_commd_present()) {
1099 * commd is now available again. Retry the message once.
1100 * If this fails we panic as the system is in an
1103 if (nretries
++ == 0)
1106 cmn_err(CE_PANIC
, "ksend_message failure: RESYNC_PHASE_DONE");
1108 kmem_free(kres
, sizeof (md_mn_kresult_t
));
1109 (void) md_unit_writerlock(ui
);
/*
 * NOTE(review): send_mn_resync_next_message() — broadcasts
 * MD_MN_MSG_RESYNC_NEXT describing the next region to resync (start, size,
 * done/2_do positions) plus a snapshot of all submirror states/flags.
 * Drops the unit readerlock around the CPR-safe mdmn_ksend_message() call,
 * retries once after waiting for commd (MDMNE_RPC_FAIL) and panics if the
 * retry fails; then records the region in un_rs_prev_overlap (allocating
 * the parent structure under writerlock on first use) as the previous
 * overlap range [currentblk, currentblk + rsize - 1]. Line-mangled
 * extraction (parameter list tail, goto/retry plumbing, and brace lines are
 * missing — confirm in original). Kept byte-identical, comments only.
 */
1113 * send_mn_resync_next_message
1115 * Sent a message to all nodes indicating the next region to be resynced.
1116 * The message contains the region to be resynced and the current position in
1117 * the resync as denoted by un_rs_resync_done and un_rs_resync_2_do.
1118 * On entry the unit readerlock is held.
1121 send_mn_resync_next_message(
1123 diskaddr_t currentblk
,
1128 md_mn_msg_resync_t
*rmsg
= un
->un_rs_msg
;
1130 md_mn_kresult_t
*kres
;
1131 mdi_unit_t
*ui
= MDI_UNIT(MD_SID(un
));
1138 ASSERT(rmsg
!= NULL
);
1140 if (mirror_debug_flag
) {
1141 printf("send resync next message, mnum = %x, start=%lld, "
1142 "size=%ld, type=%x, done=%lld, 2_do=%lld\n",
1143 MD_SID(un
), currentblk
, rsize
, un
->un_rs_type
,
1144 un
->un_rs_resync_done
, un
->un_rs_resync_2_do
);
1147 rmsg
->msg_resync_mnum
= MD_SID(un
);
1148 rmsg
->msg_resync_type
= un
->un_rs_type
;
1149 rmsg
->msg_resync_start
= currentblk
;
1150 rmsg
->msg_resync_rsize
= rsize
;
1151 rmsg
->msg_resync_done
= un
->un_rs_resync_done
;
1152 rmsg
->msg_resync_2_do
= un
->un_rs_resync_2_do
;
1153 rmsg
->msg_originator
= md_mn_mynode_id
;
1154 if (flags
& MD_FIRST_RESYNC_NEXT
)
1155 rmsg
->msg_resync_flags
= MD_MN_RS_FIRST_RESYNC_NEXT
;
1158 * Copy current submirror state and flags into message. This provides
1159 * a means of keeping all nodes that are currently active in the cluster
1160 * synchronised with regards to their submirror state settings. If we
1161 * did not pass this information here, the only time every node gets
1162 * submirror state updated is at the end of a resync phase. This can be
1163 * a significant amount of time for large metadevices.
1165 for (smi
= 0; smi
< NMIRROR
; smi
++) {
1166 sm
= &un
->un_sm
[smi
];
1167 rmsg
->msg_sm_state
[smi
] = sm
->sm_state
;
1168 rmsg
->msg_sm_flags
[smi
] = sm
->sm_flags
;
1170 setno
= MD_MIN2SET(MD_SID(un
));
1171 md_unit_readerexit(ui
);
1172 kres
= kmem_alloc(sizeof (md_mn_kresult_t
), KM_SLEEP
);
1175 mutex_enter(&un
->un_rs_cpr_mx
);
1176 CALLB_CPR_SAFE_BEGIN(&un
->un_rs_cprinfo
);
1178 rval
= mdmn_ksend_message(setno
, MD_MN_MSG_RESYNC_NEXT
, MD_MSGF_NO_LOG
,
1179 0, (char *)rmsg
, sizeof (md_mn_msg_resync_t
), kres
);
1181 CALLB_CPR_SAFE_END(&un
->un_rs_cprinfo
, &un
->un_rs_cpr_mx
);
1182 mutex_exit(&un
->un_rs_cpr_mx
);
1184 if (!MDMN_KSEND_MSG_OK(rval
, kres
)) {
1185 mdmn_ksend_show_error(rval
, kres
, "RESYNC_NEXT");
1186 /* If we're shutting down already, pause things here. */
1187 if (kres
->kmmr_comm_state
== MDMNE_RPC_FAIL
) {
1188 while (!md_mn_is_commd_present()) {
1192 * commd is now available again. Retry the message once.
1193 * If this fails we panic as the system is in an
1196 if (nretries
++ == 0)
1199 cmn_err(CE_PANIC
, "ksend_message failure: RESYNC_NEXT");
1201 kmem_free(kres
, sizeof (md_mn_kresult_t
));
1202 (void) md_unit_readerlock(ui
);
1203 ps
= un
->un_rs_prev_overlap
;
1205 /* Allocate previous overlap reference if needed */
1207 ps
= kmem_cache_alloc(mirror_parent_cache
, MD_ALLOCFLAGS
);
1210 ps
->ps_firstblk
= 0;
1213 md_unit_readerexit(ui
);
1214 (void) md_unit_writerlock(ui
);
1215 un
->un_rs_prev_overlap
= ps
;
1216 md_unit_writerexit(ui
);
1217 (void) md_unit_readerlock(ui
);
1220 ps
->ps_firstblk
= currentblk
;
1221 ps
->ps_lastblk
= currentblk
+ rsize
- 1;
1225 resync_read_blk_range(
1227 diskaddr_t currentblk
,
1228 diskaddr_t stopbefore
,
1233 size_t copysize
; /* limited by max xfer buf size */
1234 size_t rsize
; /* size of resync block (for MN) */
1237 diskaddr_t rs_startblk
;
1239 int flags1
= flags
& MD_FIRST_RESYNC_NEXT
;
1241 rs_type
= un
->un_rs_type
;
1242 rs_startblk
= currentblk
;
1243 if (stopbefore
> un
->c
.un_total_blocks
)
1244 stopbefore
= un
->c
.un_total_blocks
;
1245 if (currentblk
< un
->un_resync_startbl
)
1246 currentblk
= un
->un_resync_startbl
;
1248 copysize
= un
->un_rs_copysize
;
1249 rsize
= MD_DEF_RESYNC_BLK_SZ
;
1251 setno
= MD_MIN2SET(MD_SID(un
));
1252 while (currentblk
< stopbefore
) {
1254 * Split the block up into units of MD_DEF_RESYNC_BLK_SZ and
1255 * if a MN device and sendflag is set, send a RESYNC_MESSAGE
1258 if ((currentblk
+ MD_DEF_RESYNC_BLK_SZ
) > stopbefore
)
1259 rsize
= stopbefore
- currentblk
;
1260 if (MD_MNSET_SETNO(setno
) && (flags
& MD_SEND_MESS_XMIT
)) {
1261 un
->un_resync_startbl
= currentblk
;
1262 rs_startblk
= currentblk
;
1263 send_mn_resync_next_message(un
, currentblk
, rsize
,
1267 /* check to see if we've been asked to terminate */
1268 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)), type
))
1269 return ((un
->c
.un_status
& MD_UN_RESYNC_CANCEL
)
1272 * Check to see if another node has completed this
1273 * block, if so either the type or the resync region
1274 * will have changed. If the resync type has changed,
1276 * If the resync region has changed, reset currentblk
1277 * to the start of the current resync region and
1280 if (un
->un_rs_type
!= rs_type
)
1282 if (un
->un_rs_prev_overlap
->ps_firstblk
>
1285 un
->un_rs_prev_overlap
->ps_firstblk
;
1289 newstop
= currentblk
+ rsize
;
1290 while (currentblk
< newstop
) {
1291 if ((currentblk
+ copysize
) > stopbefore
)
1292 copysize
= (size_t)(stopbefore
- currentblk
);
1293 if (resync_read_buffer(un
, currentblk
, copysize
,
1294 (flags
& MD_RESYNC_FLAG_ERR
)))
1297 /* resync_read_buffer releases/grabs a new lock */
1298 un
= (mm_unit_t
*)MD_UNIT(MD_SID(un
));
1299 currentblk
+= copysize
;
1301 /* check to see if we've been asked to terminate */
1302 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)), type
))
1303 return ((un
->c
.un_status
& MD_UN_RESYNC_CANCEL
)
1305 if (MD_MNSET_SETNO(setno
)) {
1307 * Check to see if another node has completed
1308 * this block, see above
1310 if (un
->un_rs_type
!= rs_type
)
1312 if (un
->un_rs_prev_overlap
->ps_firstblk
>
1315 un
->un_rs_prev_overlap
->ps_firstblk
;
1323 optimized_resync(mm_unit_t
*un
)
1329 uchar_t
*dirtyregions
;
1330 diskaddr_t first
, stopbefore
;
1336 uint_t old_rs_type
= un
->un_rs_type
;
1338 uint_t flags1
= MD_FIRST_RESYNC_NEXT
|MD_RESYNC_FLAG_ERR
;
1342 ui
= MDI_UNIT(mnum
);
1343 setno
= MD_UN2SET(un
);
1345 if (!(un
->c
.un_status
& MD_UN_OPT_NOT_DONE
)) {
1347 * We aren't marked as needing a resync so for multi-node
1348 * sets we flag the completion so that all nodes see the same
1349 * metadevice state. This is a problem when a new node joins
1350 * an existing set as it has to perform a 'metasync -r' and
1351 * we have to step through all of the resync phases. If we
1352 * don't do this the nodes that were already in the set will
1353 * have the metadevices marked as 'Okay' but the joining node
1354 * will have 'Needs Maintenance' which is unclearable.
1356 if (MD_MNSET_SETNO(setno
)) {
1357 send_mn_resync_done_message(un
, CLEAR_OPT_NOT_DONE
);
1363 * No need for optimized resync if ABR set, clear rs_type and flags
1366 if (ui
->ui_tstate
& MD_ABR_CAP
) {
1367 un
->un_rs_type
= MD_RS_NONE
;
1368 un
->c
.un_status
&= ~(MD_UN_OPT_NOT_DONE
| MD_UN_WAR
);
1372 un
->un_rs_dropped_lock
= 1;
1373 un
->c
.un_status
|= MD_UN_WAR
;
1374 resync_regions
= un
->un_rrd_num
;
1375 dirtyregions
= un
->un_resync_bm
;
1376 md_unit_writerexit(ui
);
1378 /* For MN sets, resync NOTIFY is done when processing resync messages */
1379 if (!MD_MNSET_SETNO(setno
)) {
1380 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_START
,
1381 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
1383 un
= (mm_unit_t
*)md_unit_readerlock(ui
);
1385 /* check to see if we've been asked to terminate */
1386 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)), MD_READER_HELD
)) {
1387 if (un
->c
.un_status
& MD_UN_RESYNC_CANCEL
)
1388 broke_out
= RESYNC_ERR
;
1391 * Check that we are still performing an optimized
1392 * resync. If not, another node must have completed it
1393 * so we have no more work to do.
1395 if (un
->un_rs_type
!= old_rs_type
) {
1396 md_unit_readerexit(ui
);
1397 (void) md_unit_writerlock(ui
);
1401 * If rs_resync_done is non-zero, we must be completing an optimized
1402 * resync that has already been partially done on another node.
1403 * Therefore clear the bits in resync_bm for the resync regions
1404 * already done. If resync_startbl is zero, calculate 2_do.
1406 if (un
->un_rs_resync_done
> 0) {
1407 BLK_TO_RR(start_rr
, un
->un_resync_startbl
, un
);
1408 for (rr
= 0; rr
< start_rr
&& rr
< resync_regions
; rr
++)
1409 CLR_KEEPDIRTY(rr
, un
);
1411 un
->un_rs_resync_2_do
= 0;
1412 for (rr
= 0; rr
< resync_regions
; rr
++)
1413 if (isset(dirtyregions
, rr
))
1414 un
->un_rs_resync_2_do
++;
1417 for (rr
= 0; (rr
< resync_regions
) && (broke_out
!= RESYNC_ERR
); rr
++) {
1418 if (isset(dirtyregions
, rr
)) {
1419 RR_TO_BLK(first
, rr
, un
);
1420 RR_TO_BLK(stopbefore
, rr
+1, un
);
1421 old_rs_type
= un
->un_rs_type
;
1422 old_rs_done
= un
->un_rs_resync_done
;
1423 err
= resync_read_blk_range(un
, first
, stopbefore
,
1424 MD_READER_HELD
, MD_SEND_MESS_XMIT
| flags1
);
1425 flags1
= MD_RESYNC_FLAG_ERR
;
1427 /* resync_read_blk_range releases/grabs a new lock */
1428 un
= (mm_unit_t
*)MD_UNIT(mnum
);
1431 broke_out
= RESYNC_ERR
;
1436 * Check that we are still performing an optimized
1437 * resync. If not, another node must have completed it
1438 * so we have no more work to do.
1440 if (un
->un_rs_type
!= old_rs_type
) {
1441 md_unit_readerexit(ui
);
1442 (void) md_unit_writerlock(ui
);
1447 * If resync_done has increased, we must have
1448 * blocked in resync_read_blk_range while another node
1449 * continued with the resync. Therefore clear resync_bm
1450 * for the blocks that have been resynced on another
1451 * node and update rr to the next RR to be done.
1453 if (old_rs_done
< un
->un_rs_resync_done
) {
1455 BLK_TO_RR(start_rr
, un
->un_resync_startbl
- 1,
1457 for (i
= rr
; i
< start_rr
; i
++)
1458 CLR_KEEPDIRTY(i
, un
);
1461 un
->un_rs_resync_done
++;
1463 for (smi
= 0, cnt
= 0; smi
< NMIRROR
; smi
++)
1464 if (SUBMIRROR_IS_WRITEABLE(un
, smi
) &&
1465 !(SMS_BY_INDEX_IS(un
, smi
, SMS_ALL_ERRED
)))
1468 broke_out
= RESYNC_ERR
;
1471 CLR_KEEPDIRTY(rr
, un
);
1472 /* Check to see if we've completed the resync cleanly */
1473 if (un
->un_rs_thread_flags
& MD_RI_SHUTDOWN
)
1477 * Check that we haven't exceeded un_rs_resync_2_do. If
1478 * we have we've completed the resync.
1480 if (un
->un_rs_resync_done
> un
->un_rs_resync_2_do
)
1484 md_unit_readerexit(ui
);
1485 un
= (mm_unit_t
*)md_unit_writerlock(ui
);
1488 * If MN set send message to all nodes to indicate resync
1489 * phase is complete. The processing of the message will update the
1492 if (MD_MNSET_SETNO(setno
)) {
1493 send_mn_resync_done_message(un
, broke_out
);
1497 un
->c
.un_status
&= ~MD_UN_WAR
;
1499 un
->c
.un_status
&= ~MD_UN_KEEP_DIRTY
;
1501 setno
= MD_UN2SET(un
);
1502 for (smi
= 0; smi
< NMIRROR
; smi
++) {
1503 un
->un_sm
[smi
].sm_flags
&= ~MD_SM_RESYNC_TARGET
;
1504 if (SMS_BY_INDEX_IS(un
, smi
, SMS_OFFLINE_RESYNC
)) {
1505 state
= (broke_out
? SMS_OFFLINE
: SMS_RUNNING
);
1506 mirror_set_sm_state(&un
->un_sm
[smi
],
1507 &un
->un_smic
[smi
], state
, broke_out
);
1508 mirror_commit(un
, NO_SUBMIRRORS
, 0);
1510 if (SMS_BY_INDEX_IS(un
, smi
, SMS_OFFLINE
))
1511 un
->c
.un_status
|= MD_UN_OFFLINE_SM
;
1515 /* For MN sets, resync NOTIFY is done when processing resync messages */
1516 if (!MD_MNSET_SETNO(setno
)) {
1518 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_FAILED
,
1519 SVM_TAG_METADEVICE
, MD_UN2SET(un
), MD_SID(un
));
1521 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_DONE
,
1522 SVM_TAG_METADEVICE
, MD_UN2SET(un
), MD_SID(un
));
1528 * recalc_resync_done
1530 * This function deals with a change in value of un_rs_resync_2_do in a
1531 * component resync. This may change if we are restarting a component
1532 * resync on a single node having rebooted with a different value of
1533 * md_resync_bufsz or if we are running in a multi-node with nodes having
1534 * different values of md_resync_bufsz.
1535 * If there is a change in un_rs_resync_2_do, we need to recalculate
1536 * the value of un_rs_resync_done given the new value for resync_2_do.
1537 * We have to calculate a new value for resync_done to be either
1538 * if un_resync_startbl is set, (un_resync_startbl - initblock)/(blksize + skip)
1539 * or if it is not set, we need to calculate it from un_rs_resync_done,
1540 * (un_rs_resync_done/un_rs_resync_2_do) * resync_2_do
1541 * In addition we need to deal with the overflow case by using a factor to
1546 recalc_resync_done(mm_unit_t
*un
, size_t resync_2_do
, diskaddr_t initblock
,
1547 u_longlong_t blk_size
, u_longlong_t skip
)
1553 * If resync_2_do has not yet been calculated, no need to modify
1556 if (un
->un_rs_resync_2_do
== 0) {
1559 if (un
->un_rs_resync_2_do
== resync_2_do
)
1560 return; /* No change, so nothing to do */
1562 * If un_rs_startbl is set, another node must have already started
1563 * this resync and hence we can calculate resync_done from
1566 if (un
->un_resync_startbl
) {
1567 un
->un_rs_resync_done
= (un
->un_resync_startbl
- initblock
) /
1572 * un_resync_startbl is not set so we must calculate it from
1573 * un_rs_resync_done.
1574 * If the larger of the two values of resync_2_do is greater than 32
1575 * bits, calculate a factor to divide by to ensure that we don't
1576 * overflow 64 bits when calculating the new value for resync_done
1578 x
= (un
->un_rs_resync_2_do
> resync_2_do
) ? un
->un_rs_resync_2_do
:
1580 while (x
> INT32_MAX
) {
1582 factor
= factor
<< 1;
1584 un
->un_rs_resync_done
= ((un
->un_rs_resync_done
/factor
) *
1585 (resync_2_do
/factor
)) /
1586 ((un
->un_rs_resync_2_do
+ (factor
* factor
) - 1)/
1591 check_comp_4_resync(mm_unit_t
*un
, int smi
, int ci
)
1596 mm_submirror_ic_t
*smic
;
1600 u_longlong_t blk_size
;
1601 diskaddr_t initblock
;
1603 diskaddr_t frag
= 0;
1604 md_m_shared_t
*shared
;
1609 uint_t old_rs_type
= un
->un_rs_type
;
1610 diskaddr_t old_rs_done
;
1611 uint_t flags1
= MD_FIRST_RESYNC_NEXT
;
1612 diskaddr_t resync_2_do
;
1615 ui
= MDI_UNIT(mnum
);
1616 sm
= &un
->un_sm
[smi
];
1617 smic
= &un
->un_smic
[smi
];
1618 setno
= MD_UN2SET(un
);
1620 shared
= (md_m_shared_t
*)(*(smic
->sm_shared_by_indx
))
1621 (sm
->sm_dev
, sm
, ci
);
1623 if (shared
->ms_state
!= CS_RESYNC
) {
1624 SET_RS_TYPE_NONE(un
->un_rs_type
);
1628 if (shared
->ms_flags
& MDM_S_RS_TRIED
) {
1629 SET_RS_TYPE_NONE(un
->un_rs_type
);
1633 (void) (*(smic
->sm_get_bcss
))
1634 (sm
->sm_dev
, sm
, ci
, &initblock
, &count
, &skip
, &size
);
1636 if ((count
== 1) && (skip
== 0)) {
1637 count
= (size_t)(size
/ un
->un_rs_copysize
);
1638 if ((frag
= (size
- (count
* un
->un_rs_copysize
))) != 0)
1640 size
= (u_longlong_t
)un
->un_rs_copysize
;
1642 blk_size
= size
; /* Save block size for this resync */
1645 resync_2_do
= count
;
1647 * If part way through a resync, un_rs_resync_done/un_rs_resync_2_do
1648 * gives the proportion of the resync that has already been done.
1649 * If un_rs_copysize has changed since this previous partial resync,
1650 * either because this node has been rebooted with a different value
1651 * for md_resync_bufsz or because another node with a different value
1652 * for md_resync_bufsz performed the previous resync, we need to
1653 * recalculate un_rs_resync_done as a proportion of our value of
1656 recalc_resync_done(un
, resync_2_do
, initblock
, blk_size
, skip
);
1659 * For MN mirrors we need to send a message to all nodes indicating
1660 * the next region to be resynced. For a component resync, the size of
1661 * the contiguous region that is processed by resync_read_blk_range()
1662 * may be small if there is the interleave size.
1663 * Therefore, rather than sending the message within
1664 * resync_read_blk_range(), we will send a message every
1665 * MD_DEF_RESYNC_BLK_SZ blocks. Calculate the frequency in terms of
1666 * the number of blocks. Then, if we are restarting a resync, round
1667 * un_rs_resync_done down to the previous resync region boundary. This
1668 * ensures that we send a RESYNC_NEXT message before resyncing any
1671 if (MD_MNSET_SETNO(setno
)) {
1672 blks
= ((MD_DEF_RESYNC_BLK_SZ
+ blk_size
+ skip
- 1)/
1674 un
->un_rs_resync_done
= (un
->un_rs_resync_done
/blks
) * blks
;
1677 * un_rs_resync_done is the number of ('size' + 'skip') increments
1678 * already resynced from the base 'block'
1679 * un_rs_resync_2_do is the number of iterations in
1680 * this component resync.
1682 ASSERT(count
>= un
->un_rs_resync_done
);
1683 un
->un_rs_resync_2_do
= (diskaddr_t
)count
;
1685 un
->c
.un_status
|= MD_UN_WAR
;
1686 sm
->sm_flags
|= MD_SM_RESYNC_TARGET
;
1687 md_unit_writerexit(ui
);
1689 /* For MN sets, resync NOTIFY is done when processing resync messages */
1690 if (!MD_MNSET_SETNO(setno
)) {
1691 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_START
,
1692 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
1694 un
= (mm_unit_t
*)md_unit_readerlock(ui
);
1696 /* check to see if we've been asked to terminate */
1697 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)), MD_READER_HELD
)) {
1698 if (un
->c
.un_status
& MD_UN_RESYNC_CANCEL
)
1699 broke_out
= RESYNC_ERR
;
1702 * Check that we are still performing the same component
1703 * resync. If not, another node must have completed it
1704 * so we have no more work to do.
1706 if (un
->un_rs_type
!= old_rs_type
) {
1707 md_unit_readerexit(ui
);
1708 (void) md_unit_writerlock(ui
);
1712 * Adjust resync_done, resync_2_do, start of resync area and count to
1713 * skip already resync'd data. We need to recalculate resync_done as
1714 * we have dropped the unit lock above and may have lost ownership to
1715 * another node, with a different resync buffer size and it may have
1716 * sent us new values of resync_done and resync_2_do based on its
1717 * resync buffer size
1719 recalc_resync_done(un
, resync_2_do
, initblock
, blk_size
, skip
);
1720 un
->un_rs_resync_2_do
= resync_2_do
;
1721 count
-= un
->un_rs_resync_done
;
1722 block
= initblock
+ ((blk_size
+ skip
) * (int)un
->un_rs_resync_done
);
1724 un
->un_rs_dropped_lock
= 1;
1725 while ((count
> 0) && (broke_out
!= RESYNC_ERR
)) {
1726 old_rs_done
= un
->un_rs_resync_done
;
1728 * For MN mirrors send a message to the other nodes. This
1729 * message includes the size of the region that must be blocked
1732 if (MD_MNSET_SETNO(setno
)) {
1733 if ((un
->un_rs_resync_done
%blks
== 0)) {
1734 un
->un_resync_startbl
= block
;
1735 send_mn_resync_next_message(un
, block
,
1736 (blk_size
+skip
)*blks
, flags1
);
1739 * check to see if we've been asked to
1742 if (resync_kill_pending(un
,
1743 MDI_UNIT(MD_SID(un
)), MD_READER_HELD
)) {
1744 if (un
->c
.un_status
&
1745 MD_UN_RESYNC_CANCEL
) {
1746 broke_out
= RESYNC_ERR
;
1752 * Check that we are still performing the same
1753 * component resync. If not, another node must
1754 * have completed it so we have no more work to
1755 * do. Also reset count to remaining resync as
1756 * we may have lost ownership in in
1757 * send_mn_resync_next_message while another
1758 * node continued with the resync and
1759 * incremented resync_done.
1761 if (un
->un_rs_type
!= old_rs_type
) {
1762 md_unit_readerexit(ui
);
1763 (void) md_unit_writerlock(ui
);
1767 * recalculate resync_done, resync_2_do
1768 * We need to recalculate resync_done as
1769 * we have dropped the unit lock in
1770 * send_mn_resync_next_message above and may
1771 * have lost ownership to another node, with a
1772 * different resync buffer size and it may have
1773 * sent us new values of resync_done and
1774 * resync_2_do based on its resync buffer size
1776 recalc_resync_done(un
, resync_2_do
, initblock
,
1778 un
->un_rs_resync_2_do
= resync_2_do
;
1779 count
= un
->un_rs_resync_2_do
-
1780 un
->un_rs_resync_done
;
1782 * Adjust start of resync area to skip already
1785 block
= initblock
+ ((blk_size
+ skip
) *
1786 (int)un
->un_rs_resync_done
);
1787 old_rs_done
= un
->un_rs_resync_done
;
1790 err
= resync_read_blk_range(un
, block
, block
+ size
,
1791 MD_READER_HELD
, MD_RESYNC_FLAG_ERR
);
1793 /* resync_read_blk_range releases/grabs a new lock */
1794 un
= (mm_unit_t
*)MD_UNIT(mnum
);
1797 broke_out
= RESYNC_ERR
;
1801 * If we are no longer resyncing this component, return as
1802 * another node has progressed the resync.
1804 if (un
->un_rs_type
!= old_rs_type
) {
1805 md_unit_readerexit(ui
);
1806 (void) md_unit_writerlock(ui
);
1811 * recalculate resync_done, resync_2_do. We need to recalculate
1812 * resync_done as we have dropped the unit lock in
1813 * resync_read_blk_range above and may have lost ownership to
1814 * another node, with a different resync buffer size and it may
1815 * have sent us new values of resync_done and resync_2_do based
1816 * on its resync buffer size
1818 recalc_resync_done(un
, resync_2_do
, initblock
, blk_size
, skip
);
1819 un
->un_rs_resync_2_do
= resync_2_do
;
1822 * Reset count to remaining resync as we may have blocked in
1823 * resync_read_blk_range while another node continued
1824 * with the resync and incremented resync_done. Also adjust
1825 * start of resync area to skip already resync'd data.
1827 count
= un
->un_rs_resync_2_do
- un
->un_rs_resync_done
;
1828 block
= initblock
+((blk_size
+ skip
) *
1829 (int)un
->un_rs_resync_done
);
1832 * If we are picking up from another node, we retry the last
1833 * block otherwise step on to the next block
1835 if (old_rs_done
== un
->un_rs_resync_done
) {
1836 block
+= blk_size
+ skip
;
1837 un
->un_rs_resync_done
++;
1841 if ((count
== 1) && frag
)
1843 if (shared
->ms_state
== CS_ERRED
) {
1845 broke_out
= RESYNC_ERR
;
1849 /* Check to see if we've completed the resync cleanly */
1850 if (un
->un_rs_thread_flags
& MD_RI_SHUTDOWN
)
1854 md_unit_readerexit(ui
);
1855 un
= (mm_unit_t
*)md_unit_writerlock(ui
);
1858 * If MN set send message to all nodes to indicate resync
1859 * phase is complete. The processing of the message will update the
1862 if (MD_MNSET_SETNO(setno
)) {
1863 send_mn_resync_done_message(un
, broke_out
);
1865 un
->c
.un_status
&= ~MD_UN_WAR
;
1866 sm
->sm_flags
&= ~MD_SM_RESYNC_TARGET
;
1869 shared
->ms_flags
|= MDM_S_RS_TRIED
;
1872 * As we don't transmit the changes,
1873 * no need to drop the lock.
1875 set_sm_comp_state(un
, smi
, ci
, CS_OKAY
, 0,
1876 MD_STATE_NO_XMIT
, (IOLOCK
*)NULL
);
1879 /* For MN sets, resync NOTIFY is done when processing resync messages */
1880 if (!MD_MNSET_SETNO(setno
)) {
1882 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_FAILED
,
1883 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
1885 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_DONE
,
1886 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
1888 SET_RS_TYPE_NONE(un
->un_rs_type
);
1893 submirror_resync(mm_unit_t
*un
)
1898 mm_submirror_ic_t
*smic
;
1907 int flags1
= MD_FIRST_RESYNC_NEXT
;
1911 ui
= MDI_UNIT(mnum
);
1912 setno
= MD_UN2SET(un
);
1915 * If the submirror_index is non-zero, we are continuing a resync
1916 * so restart resync from last submirror marked as being resynced.
1918 if (RS_SMI(un
->un_rs_type
) != 0) {
1919 smi
= RS_SMI(un
->un_rs_type
);
1920 sm
= &un
->un_sm
[smi
];
1921 smic
= &un
->un_smic
[smi
];
1922 if (!SMS_IS(sm
, SMS_ATTACHED_RESYNC
)) {
1923 for (smi
= 0; smi
< NMIRROR
; smi
++) {
1924 sm
= &un
->un_sm
[smi
];
1925 smic
= &un
->un_smic
[smi
];
1926 if (SMS_IS(sm
, SMS_ATTACHED_RESYNC
))
1931 for (smi
= 0; smi
< NMIRROR
; smi
++) {
1932 sm
= &un
->un_sm
[smi
];
1933 smic
= &un
->un_smic
[smi
];
1934 if (SMS_IS(sm
, SMS_ATTACHED_RESYNC
))
1938 if (smi
== NMIRROR
) {
1939 SET_RS_TYPE_NONE(un
->un_rs_type
);
1944 * If we've only got one component we can fail on a resync write
1945 * if an error is encountered. This stops an unnecessary read of the
1946 * whole mirror on a target write error.
1948 compcnt
= (*(smic
->sm_get_component_count
))(sm
->sm_dev
, sm
);
1950 flags1
|= MD_RESYNC_FLAG_ERR
;
1952 un
->c
.un_status
|= MD_UN_WAR
;
1953 sm
->sm_flags
|= MD_SM_RESYNC_TARGET
;
1954 SET_RS_SMI(un
->un_rs_type
, smi
);
1955 md_unit_writerexit(ui
);
1957 /* For MN sets, resync NOTIFY is done when processing resync messages */
1958 if (!MD_MNSET_SETNO(setno
)) {
1959 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_START
,
1960 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
1962 un
= (mm_unit_t
*)md_unit_readerlock(ui
);
1964 un
->un_rs_dropped_lock
= 1;
1966 /* check to see if we've been asked to terminate */
1967 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)), MD_READER_HELD
)) {
1968 if (un
->c
.un_status
& MD_UN_RESYNC_CANCEL
)
1969 broke_out
= RESYNC_ERR
;
1972 * Check that we are still performing the same submirror
1973 * resync. If not, another node must have completed it
1974 * so we have no more work to do.
1976 if (RS_TYPE(un
->un_rs_type
) != MD_RS_SUBMIRROR
) {
1977 md_unit_readerexit(ui
);
1978 (void) md_unit_writerlock(ui
);
1982 /* if > 1TB mirror, increase percent done granularity */
1983 if (un
->c
.un_total_blocks
> MD_MAX_BLKS_FOR_SMALL_DEVS
)
1984 chunk
= un
->c
.un_total_blocks
/ 1000;
1986 chunk
= un
->c
.un_total_blocks
/ 100;
1988 chunk
= un
->c
.un_total_blocks
;
1990 * If a MN set, round the chunk size up to a multiple of
1991 * MD_DEF_RESYNC_BLK_SZ
1993 if (MD_MNSET_SETNO(setno
)) {
1994 chunk
= ((chunk
+ MD_DEF_RESYNC_BLK_SZ
)/MD_DEF_RESYNC_BLK_SZ
)
1995 * MD_DEF_RESYNC_BLK_SZ
;
1996 if (chunk
> un
->c
.un_total_blocks
)
1997 chunk
= un
->c
.un_total_blocks
;
2000 * Handle restartable resyncs that continue from where the previous
2001 * resync left off. The new resync range is from un_rs_resync_done ..
2005 if (un
->un_rs_resync_done
== 0) {
2006 un
->un_rs_resync_2_do
= un
->c
.un_total_blocks
;
2008 curblk
= un
->un_rs_resync_done
;
2010 while ((curblk
!= un
->c
.un_total_blocks
) && (broke_out
!= RESYNC_ERR
)) {
2013 rs_done
= un
->un_rs_resync_done
;
2014 err
= resync_read_blk_range(un
, curblk
, curblk
+ chunk
,
2015 MD_READER_HELD
, MD_SEND_MESS_XMIT
| flags1
);
2016 flags1
= (compcnt
== 1 ? MD_RESYNC_FLAG_ERR
: 0);
2018 /* resync_read_blk_range releases/grabs a new lock */
2019 un
= (mm_unit_t
*)MD_UNIT(mnum
);
2022 broke_out
= RESYNC_ERR
;
2027 * If we are no longer executing a submirror resync, return
2028 * as another node has completed the submirror resync.
2030 if (RS_TYPE(un
->un_rs_type
) != MD_RS_SUBMIRROR
) {
2031 md_unit_readerexit(ui
);
2032 (void) md_unit_writerlock(ui
);
2036 * If resync_done has changed, we must have blocked
2037 * in resync_read_blk_range while another node
2038 * continued with the resync so restart from resync_done.
2040 if (rs_done
!= un
->un_rs_resync_done
) {
2041 curblk
= un
->un_rs_resync_done
;
2044 un
->un_rs_resync_done
= curblk
;
2047 if ((curblk
+ chunk
) > un
->c
.un_total_blocks
)
2048 chunk
= un
->c
.un_total_blocks
- curblk
;
2049 for (i
= 0, cnt
= 0; i
< NMIRROR
; i
++)
2050 if (SUBMIRROR_IS_WRITEABLE(un
, i
) &&
2051 !SMS_BY_INDEX_IS(un
, i
, SMS_ALL_ERRED
) &&
2052 (un
->un_sm
[i
].sm_flags
& MD_SM_RESYNC_TARGET
))
2055 broke_out
= RESYNC_ERR
;
2059 /* Check to see if we've completed the resync cleanly */
2060 if (un
->un_rs_thread_flags
& MD_RI_SHUTDOWN
)
2063 md_unit_readerexit(ui
);
2064 un
= (mm_unit_t
*)md_unit_writerlock(ui
);
2067 * If MN set send message to all nodes to indicate resync
2068 * phase is complete. The processing of the message will update the
2071 if (MD_MNSET_SETNO(setno
)) {
2072 send_mn_resync_done_message(un
, broke_out
);
2074 sm
->sm_flags
&= ~MD_SM_RESYNC_TARGET
;
2076 mirror_set_sm_state(sm
, smic
, SMS_ATTACHED
, 1);
2078 mirror_set_sm_state(sm
, smic
, SMS_RUNNING
, 0);
2080 un
->c
.un_status
&= ~MD_UN_WAR
;
2081 mirror_commit(un
, SMI2BIT(smi
), 0);
2084 /* For MN sets, resync NOTIFY is done when processing resync messages */
2085 if (!MD_MNSET_SETNO(setno
)) {
2087 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_FAILED
,
2088 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
2090 SE_NOTIFY(EC_SVM_STATE
, ESC_SVM_RESYNC_DONE
,
2091 SVM_TAG_METADEVICE
, setno
, MD_SID(un
));
2097 component_resync(mm_unit_t
*un
)
2100 mm_submirror_ic_t
*smic
;
2106 * Handle the case where we are picking up a partially complete
2107 * component resync. In this case un_rs_type contains the submirror
2108 * and component index of where we should restart the resync.
2110 while (un
->un_rs_type
!= MD_RS_COMPONENT
) {
2111 i
= RS_SMI(un
->un_rs_type
);
2112 ci
= RS_CI(un
->un_rs_type
);
2113 check_comp_4_resync(un
, i
, ci
);
2114 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)),
2118 * If we have no current resync, contine to scan submirror and
2119 * components. If the resync has moved on to another component,
2120 * restart it and if the resync is no longer a component
2123 if (RS_TYPE(un
->un_rs_type
) == MD_RS_NONE
)
2125 if (RS_TYPE(un
->un_rs_type
) != MD_RS_COMPONENT
)
2128 /* Now continue scanning _all_ submirrors and components */
2129 for (i
= 0; i
< NMIRROR
; i
++) {
2131 smic
= &un
->un_smic
[i
];
2132 if (!SMS_IS(sm
, SMS_RUNNING
| SMS_LIMPING
))
2134 compcnt
= (*(smic
->sm_get_component_count
))(sm
->sm_dev
, sm
);
2135 for (ci
= 0; ci
< compcnt
; ci
++) {
2136 SET_RS_SMI(un
->un_rs_type
, i
);
2137 SET_RS_CI(un
->un_rs_type
, ci
);
2138 SET_RS_TYPE(un
->un_rs_type
, MD_RS_COMPONENT
);
2139 check_comp_4_resync(un
, i
, ci
);
2140 /* Bail out if we've been asked to abort/shutdown */
2141 if (resync_kill_pending(un
, MDI_UNIT(MD_SID(un
)),
2145 * Now check if another node has continued with the
2146 * resync, if we are no longer in component resync,
2147 * exit, otherwise update to the current component - 1
2148 * so that the next call of check_comp_4 resync() will
2149 * resync the current component.
2151 if ((RS_TYPE(un
->un_rs_type
) != MD_RS_NONE
) &&
2152 (RS_TYPE(un
->un_rs_type
) != MD_RS_COMPONENT
))
2155 if (RS_SMI(un
->un_rs_type
) != i
) {
2156 i
= RS_SMI(un
->un_rs_type
);
2157 ci
= RS_CI(un
->un_rs_type
) - 1;
2158 } else if (RS_CI(un
->un_rs_type
) != ci
)
2159 ci
= RS_CI(un
->un_rs_type
) - 1;
2166 reset_comp_flags(mm_unit_t
*un
)
2169 mm_submirror_ic_t
*smic
;
2170 md_m_shared_t
*shared
;
2175 for (i
= 0; i
< NMIRROR
; i
++) {
2177 smic
= &un
->un_smic
[i
];
2178 if (!SMS_IS(sm
, SMS_INUSE
))
2180 compcnt
= (*(smic
->sm_get_component_count
))(sm
->sm_dev
, sm
);
2181 for (ci
= 0; ci
< compcnt
; ci
++) {
2182 shared
= (md_m_shared_t
*)(*(smic
->sm_shared_by_indx
))
2183 (sm
->sm_dev
, sm
, ci
);
2184 shared
->ms_flags
&= ~MDM_S_RS_TRIED
;
2190 * resync_progress_thread:
2191 * ----------------------
2192 * Thread started on first resync of a unit which simply blocks until woken up
2193 * by a cv_signal, and then updates the mddb for the mirror unit record. This
2194 * saves the resync progress information (un_rs_resync_done, un_rs_resync_2_do)
2195 * so that an aborted resync can be continued after an intervening reboot.
2198 resync_progress_thread(minor_t mnum
)
2200 mm_unit_t
*un
= MD_UNIT(mnum
);
2201 mdi_unit_t
*ui
= MDI_UNIT(mnum
);
2202 set_t setno
= MD_MIN2SET(mnum
);
2204 while (un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
) {
2205 mutex_enter(&un
->un_rs_progress_mx
);
2206 cv_wait(&un
->un_rs_progress_cv
, &un
->un_rs_progress_mx
);
2207 mutex_exit(&un
->un_rs_progress_mx
);
2208 if (un
->un_rs_progress_flags
& MD_RI_KILL
)
2212 * Commit mirror unit if we're the Master node in a multi-node
2215 if (MD_MNSET_SETNO(setno
) && md_set
[setno
].s_am_i_master
) {
2216 (void) md_unit_readerlock(ui
);
2217 mirror_commit(un
, NO_SUBMIRRORS
, 0);
2218 md_unit_readerexit(ui
);
2227 * Timeout handler for updating the progress of the resync thread.
2228 * Simply wake up the resync progress daemon which will then mirror_commit() the
2229 * unit structure to the mddb. This snapshots the current progress of the resync
2232 resync_progress(void *arg
)
2234 mm_unit_t
*un
= (mm_unit_t
*)arg
;
2235 mdi_unit_t
*ui
= MDI_UNIT(MD_SID(un
));
2238 mutex_enter(&un
->un_rs_progress_mx
);
2239 cv_signal(&un
->un_rs_progress_cv
);
2240 mutex_exit(&un
->un_rs_progress_mx
);
2242 /* schedule the next timeout if the resync is still marked active */
2243 (void) md_unit_readerlock(ui
);
2244 active
= un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
? 1 : 0;
2245 md_unit_readerexit(ui
);
2247 un
->un_rs_resync_to_id
= timeout(resync_progress
, un
,
2248 (clock_t)(drv_usectohz(60000000) *
2249 md_mirror_resync_update_intvl
));
2256 * Resync thread which drives all forms of resync (optimized, component,
2257 * submirror). Must handle thread suspension and kill to allow multi-node
2258 * resync to run without undue ownership changes.
2260 * For a MN set, the reync mechanism is as follows:
2262 * When a resync is started, either via metattach, metaonline, metareplace,
2263 * metasync or by a hotspare kicking in, a message is sent to all nodes, which
2264 * calls mirror_resync_thread. If there is currently no mirror owner, the
2265 * master node sends a CHOOSE_OWNER message to the handler on the master. This
2266 * chooses a mirror owner and sends a CHANGE_OWNER message requesting the
2267 * selected node to become the owner.
2268 * If this node is not the owner it sets itself to block in resync_kill_pending
2269 * and if there is no owner all nodes will block until the chosen owner is
2270 * selected, in which case it will unblock itself. So, on entry to this
2271 * function only one node will continue past resync_kill_pending().
2272 * Once the resync thread is started, it basically cycles through the optimized,
2273 * component and submirrors resyncs until there is no more work to do.
2275 * For an ABR mirror, once a mirror owner is chosen it will complete the resync
2276 * unless the nodes dies in which case a new owner will be chosen and it will
2277 * have to complete the resync from the point at which the previous owner died.
2278 * To do this we broadcast a RESYNC_NEXT message before each region to be
2279 * resynced and this message contains the address and length of the region
2280 * being resynced and the current progress through the resync. The size of
2281 * this region is MD_DEF_RESYNC_BLK_SZ blocks. It is larger than the resync
2282 * block size to limit the amount of inter node traffic. The RESYNC_NEXT
2283 * message also indicates to all other nodes that all writes to this block
2284 * must be blocked until the next RESYNC_NEXT message is received. This ensures
2285 * that no node can write to a block that is being resynced. For all MN
2286 * mirrors we also block the whole resync region on the resync owner node so
2287 * that all writes to the resync region are blocked on all nodes. There is a
2288 * difference here between a MN set and a regular set in that for a MN set
2289 * we protect the mirror from writes to the current resync block by blocking
2290 * a larger region. For a regular set we just block writes to the current
2293 * For a non-ABR mirror the same RESYNC_NEXT message is sent with an
2294 * additional purpose. In this case, there is only one mirror owner at a time
2295 * and rather than continually switching ownership between the chosen mirror
2296 * owner and the node that is writing to the mirror, we move the resync to the
2297 * mirror owner. When we switch ownership, we block the old owner and unblock
2298 * the resync thread on the new owner. To enable the new owner to continue the
2299 * resync, all nodes need to have the latest resync status. Then, following each
2300 * resync write, we check to see if the resync state has changed and if it
2301 * has this must be because we have lost ownership to another node(s) for a
2302 * period and then have become owner again later in the resync process. If we
2303 * are still dealing with the same resync, we just adjust addresses and counts
2304 * and then continue. If the resync has moved on to a different type, for
2305 * example from an optimized to a submirror resync, we move on to process the
2306 * resync described by rs_type and continue from the position described by
2307 * resync_done and resync_startbl.
2309 * Note that for non-ABR mirrors it is possible for a write to be made on a
2310 * non resync-owner node without a change of ownership. This is the case when
2311 * the mirror has a soft part created on it and a write in ABR mode is made
2312 * to that soft part. Therefore we still need to block writes to the resync
2313 * region on all nodes.
2315 * Sending the latest resync state to all nodes also enables them to continue
2316 * a resync in the event that the mirror owner dies. If a mirror owner for
2317 * a non-ABR mirror has died, there will be dirty resync regions. Therefore,
2318 * regardless of whether another type of resync was in progress, we must first
2319 * do an optimized resync to clean up the dirty regions before continuing
2320 * with the interrupted resync.
2322 * The resync status is held in the unit structure
2324 * un_rs_resync_done The number of contiguous resync blocks done so far
2325 * un_rs_resync_2_do The total number of contiguous resync blocks
2326 * un_rs_type The resync type (inc submirror and component numbers)
2328 * un_resync_startbl The address of the current resync block being processed
2330 * In the event that the whole cluster fails we need to just use
2331 * un_rs_resync_done to restart the resync and to ensure that this is
2332 * periodically written to disk, we have a thread which writes the record
2333 * to disk every 5 minutes. As the granularity of un_rs_resync_done is
2334 * usually coarse ( for an optimized resync 1001 is the max value) there is
2335 * little point in writing this more frequently.
2338 resync_unit(minor_t mnum
)
2342 md_error_t mde
= mdnullerror
;
2344 int resync_finish
= 0;
2345 set_t setno
= MD_MIN2SET(mnum
);
2346 uint_t old_rs_type
= MD_RS_NONE
;
2347 uint_t old_rs_done
= 0, old_rs_2_do
= 0;
2348 uint_t old_rs_startbl
= 0;
2349 int block_resync
= 1;
2350 char cpr_name
[23]; /* Unique CPR name */
2357 if (mirror_debug_flag
)
2358 printf("Resync started (mnum = %x)\n", mnum
);
2361 * increment the mirror resync count
2363 mutex_enter(&md_cpr_resync
.md_resync_mutex
);
2364 md_cpr_resync
.md_mirror_resync
++;
2365 mutex_exit(&md_cpr_resync
.md_resync_mutex
);
2367 ui
= MDI_UNIT(mnum
);
2370 rs_copysize
= un
->un_rs_copysize
;
2371 if (rs_copysize
== 0) {
2373 * Don't allow buffer size to fall outside the
2374 * range 0 < bufsize <= md_max_xfer_bufsz.
2376 if (md_resync_bufsz
<= 0)
2377 md_resync_bufsz
= MD_DEF_RESYNC_BUF_SIZE
;
2378 rs_copysize
= MIN(md_resync_bufsz
, md_max_xfer_bufsz
);
2380 rs_buffer
= kmem_zalloc(dbtob(rs_copysize
), KM_SLEEP
);
2381 un
= md_unit_writerlock(ui
);
2382 un
->un_rs_copysize
= rs_copysize
;
2383 un
->un_rs_buffer
= rs_buffer
;
2385 if (MD_MNSET_SETNO(setno
)) {
2387 * Register this resync thread with the CPR mechanism. This
2388 * allows us to detect when the system is suspended and so
2389 * keep track of the RPC failure condition.
2391 (void) snprintf(cpr_name
, sizeof (cpr_name
),
2392 "mirror_resync%x", mnum
);
2393 CALLB_CPR_INIT(&un
->un_rs_cprinfo
, &un
->un_rs_cpr_mx
,
2394 callb_md_mrs_cpr
, cpr_name
);
2396 if (ui
->ui_tstate
& MD_RESYNC_NOT_DONE
) {
2398 * If this is the first resync following the initial
2399 * snarf (MD_RESYNC_NOT_DONE still set) and we've
2400 * been started outside a reconfig step (e.g. by being
2401 * added to an existing set) we need to query the
2402 * existing submirror state for this mirror.
2403 * The set_status flags will have MD_MN_SET_MIR_STATE_RC
2404 * set if we've been through a step4 reconfig, so only
2405 * query the master if this isn't (yet) set. In this
2406 * case we must continue the resync thread as there is
2407 * not guaranteed to be a currently running resync on
2408 * any of the other nodes. Worst case is that we will
2409 * initiate an ownership change to this node and then
2410 * find that there is no resync to perform. However, we
2411 * will then have correct status across the cluster.
2413 if (!md_set
[setno
].s_am_i_master
) {
2414 if (!(md_get_setstatus(setno
) &
2415 MD_SET_MN_MIR_STATE_RC
)) {
2416 mirror_get_status(un
, NULL
);
2419 if (mirror_debug_flag
) {
2422 for (i
= 0; i
< NMIRROR
; i
++) {
2434 ui
->ui_tstate
&= ~MD_RESYNC_NOT_DONE
;
2437 * For MN set, if we have an owner, then start the resync on it.
2438 * If there is no owner the master must send a message to
2439 * choose the owner. This message will contain the current
2440 * resync count and it will only be sent to the master, where
2441 * the resync count will be used to choose the next node to
2442 * perform a resync, by cycling through the nodes in the set.
2443 * The message handler will then send a CHANGE_OWNER message to
2444 * all nodes, and on receipt of that message, the chosen owner
2445 * will issue a SET_OWNER ioctl to become the owner. This ioctl
2446 * will be requested to spawn a thread to issue the
2447 * REQUEST_OWNER message to become the owner which avoids the
2448 * need for concurrent ioctl requests.
2449 * After sending the message, we will block waiting for one
2450 * of the nodes to become the owner and start the resync
2452 if (MD_MN_NO_MIRROR_OWNER(un
)) {
2454 * There is no owner, block and then the master will
2455 * choose the owner. Only perform this if 'block_resync'
2459 mutex_enter(&un
->un_rs_thread_mx
);
2460 un
->un_rs_thread_flags
|= MD_RI_BLOCK_OWNER
;
2461 mutex_exit(&un
->un_rs_thread_mx
);
2463 if (md_set
[setno
].s_am_i_master
) {
2464 md_unit_writerexit(ui
);
2465 (void) mirror_choose_owner(un
, NULL
);
2466 (void) md_unit_writerlock(ui
);
2469 /* There is an owner, block if we are not it */
2470 if (!MD_MN_MIRROR_OWNER(un
)) {
2471 mutex_enter(&un
->un_rs_thread_mx
);
2472 un
->un_rs_thread_flags
|= MD_RI_BLOCK_OWNER
;
2473 mutex_exit(&un
->un_rs_thread_mx
);
2478 * Start a timeout chain to update the resync progress to the mddb.
2479 * This will run every md_mirror_resync_update_intvl minutes and allows
2480 * a resync to be continued over a reboot.
2482 ASSERT(un
->un_rs_resync_to_id
== 0);
2483 un
->un_rs_resync_to_id
= timeout(resync_progress
, un
,
2484 (clock_t)(drv_usectohz(60000000) * md_mirror_resync_update_intvl
));
2487 * Handle resync restart from the last logged position. The contents
2488 * of un_rs_resync_2_do and un_rs_resync_done are dependent on the
2489 * type of resync that was in progress.
2491 if (MD_MNSET_SETNO(setno
)) {
2492 switch ((uint_t
)RS_TYPE(un
->un_rs_type
)) {
2494 case MD_RS_OPTIMIZED
:
2495 case MD_RS_COMPONENT
:
2496 case MD_RS_SUBMIRROR
:
2500 un
->un_rs_type
= MD_RS_NONE
;
2502 /* Allocate a resync message, if required */
2503 if (un
->un_rs_msg
== NULL
) {
2504 un
->un_rs_msg
= (md_mn_msg_resync_t
*)kmem_zalloc(
2505 sizeof (md_mn_msg_resync_t
), KM_SLEEP
);
2510 /* Check to see if we've been requested to block/kill */
2511 if (resync_kill_pending(un
, ui
, MD_WRITER_HELD
)) {
2516 un
->un_rs_dropped_lock
= 0;
2518 * Always perform an optimized resync first as this will bring
2519 * the mirror into an available state in the shortest time.
2520 * If we are resuming an interrupted resync, other than an
2521 * optimized resync, we save the type and amount done so that
2522 * we can resume the appropriate resync after the optimized
2523 * resync has completed.
2525 if ((RS_TYPE(un
->un_rs_type
) != MD_RS_NONE
) &&
2526 (RS_TYPE(un
->un_rs_type
) != MD_RS_OPTIMIZED
)) {
2527 old_rs_type
= un
->un_rs_type
;
2528 old_rs_done
= un
->un_rs_resync_done
;
2529 old_rs_2_do
= un
->un_rs_resync_2_do
;
2530 old_rs_startbl
= un
->un_resync_startbl
;
2532 SET_RS_TYPE(un
->un_rs_type
, MD_RS_OPTIMIZED
);
2534 * If we are continuing a resync that is not an
2535 * OPTIMIZED one, then we start from the beginning when
2536 * doing this optimized resync
2538 if (RS_TYPE(old_rs_type
) != MD_RS_OPTIMIZED
) {
2539 un
->un_rs_resync_done
= 0;
2540 un
->un_rs_resync_2_do
= 0;
2541 un
->un_resync_startbl
= 0;
2543 optimized_resync(un
);
2544 /* Check to see if we've been requested to block/kill */
2545 if (resync_kill_pending(un
, ui
, MD_WRITER_HELD
)) {
2548 un
= (mm_unit_t
*)MD_UNIT(mnum
);
2550 * If another node has moved the resync on, we must
2551 * restart the correct resync
2554 (RS_TYPE(un
->un_rs_type
) != MD_RS_NONE
)) {
2555 old_rs_type
= un
->un_rs_type
;
2556 old_rs_done
= un
->un_rs_resync_done
;
2557 old_rs_2_do
= un
->un_rs_resync_2_do
;
2558 old_rs_startbl
= un
->un_resync_startbl
;
2562 * Restore previous resync progress or move onto a
2565 if (RS_TYPE(old_rs_type
) != MD_RS_NONE
) {
2566 un
->un_rs_type
= old_rs_type
;
2567 un
->un_rs_resync_done
= old_rs_done
;
2568 un
->un_rs_resync_2_do
= old_rs_2_do
;
2569 un
->un_resync_startbl
= old_rs_startbl
;
2571 un
->un_rs_type
= MD_RS_COMPONENT
;
2572 un
->un_rs_resync_done
= 0;
2573 un
->un_rs_resync_2_do
= 0;
2574 un
->un_resync_startbl
= 0;
2577 if (RS_TYPE(un
->un_rs_type
) == MD_RS_COMPONENT
) {
2578 component_resync(un
);
2579 /* Check to see if we've been requested to block/kill */
2580 if (resync_kill_pending(un
, ui
, MD_WRITER_HELD
)) {
2583 un
= (mm_unit_t
*)MD_UNIT(mnum
);
2585 * If we have moved on from a component resync, another
2586 * node must have completed it and started a submirror
2587 * resync, so leave the resync state alone. For non
2588 * multi-node sets we move onto the submirror resync.
2591 if (RS_TYPE(un
->un_rs_type
) == MD_RS_NONE
) {
2592 un
->un_rs_type
= MD_RS_SUBMIRROR
;
2593 un
->un_rs_resync_done
=
2594 un
->un_rs_resync_2_do
= 0;
2595 un
->un_resync_startbl
= 0;
2598 un
->un_rs_type
= MD_RS_SUBMIRROR
;
2599 un
->un_rs_resync_done
= 0;
2600 un
->un_rs_resync_2_do
= 0;
2601 un
->un_resync_startbl
= 0;
2604 if (RS_TYPE(un
->un_rs_type
) == MD_RS_SUBMIRROR
) {
2605 submirror_resync(un
);
2606 /* Check to see if we've been requested to block/kill */
2607 if (resync_kill_pending(un
, ui
, MD_WRITER_HELD
)) {
2610 un
= (mm_unit_t
*)MD_UNIT(mnum
);
2612 * If we have moved on from a submirror resync, another
2613 * node must have completed it and started a different
2614 * resync, so leave the resync state alone
2617 if (RS_TYPE(un
->un_rs_type
) == MD_RS_NONE
) {
2618 un
->un_rs_resync_done
=
2619 un
->un_rs_resync_2_do
= 0;
2620 un
->un_resync_startbl
= 0;
2623 /* If non-MN mirror, reinitialize state */
2624 un
->un_rs_type
= MD_RS_NONE
;
2625 un
->un_rs_resync_done
= 0;
2626 un
->un_rs_resync_2_do
= 0;
2627 un
->un_resync_startbl
= 0;
2630 } while (un
->un_rs_dropped_lock
);
2631 mutex_enter(&un
->un_rs_thread_mx
);
2632 un
->un_rs_thread_flags
|= MD_RI_SHUTDOWN
;
2633 mutex_exit(&un
->un_rs_thread_mx
);
2638 if (mirror_debug_flag
)
2639 printf("Resync stopped (mnum = %x), resync_finish = %d\n",
2640 mnum
, resync_finish
);
2642 kmem_free(un
->un_rs_buffer
, dbtob(un
->un_rs_copysize
));
2644 mutex_enter(&un
->un_rs_progress_mx
);
2645 un
->un_rs_progress_flags
|= MD_RI_KILL
;
2646 cv_signal(&un
->un_rs_progress_cv
);
2647 mutex_exit(&un
->un_rs_progress_mx
);
2650 * For MN Set, send a RESYNC_FINISH if this node completed the resync.
2651 * There is no need to grow unit here, it will be done in the
2652 * handler for the RESYNC_FINISH message together with resetting
2653 * MD_UN_RESYNC_ACTIVE.
2656 if (resync_finish
) {
2658 * Normal resync completion. Issue a RESYNC_FINISH
2659 * message if we're part of a multi-node set.
2661 md_mn_kresult_t
*kres
;
2662 md_mn_msg_resync_t
*rmsg
;
2665 rmsg
= (md_mn_msg_resync_t
*)un
->un_rs_msg
;
2666 md_unit_writerexit(ui
);
2668 rmsg
->msg_resync_mnum
= mnum
;
2669 rmsg
->msg_resync_type
= 0;
2670 rmsg
->msg_resync_done
= 0;
2671 rmsg
->msg_resync_2_do
= 0;
2672 rmsg
->msg_originator
= md_mn_mynode_id
;
2674 kres
= kmem_alloc(sizeof (md_mn_kresult_t
), KM_SLEEP
);
2677 mutex_enter(&un
->un_rs_cpr_mx
);
2678 CALLB_CPR_SAFE_BEGIN(&un
->un_rs_cprinfo
);
2680 rval
= mdmn_ksend_message(setno
,
2681 MD_MN_MSG_RESYNC_FINISH
, MD_MSGF_NO_LOG
, 0,
2682 (char *)rmsg
, sizeof (md_mn_msg_resync_t
), kres
);
2684 CALLB_CPR_SAFE_END(&un
->un_rs_cprinfo
,
2686 mutex_exit(&un
->un_rs_cpr_mx
);
2688 if (!MDMN_KSEND_MSG_OK(rval
, kres
)) {
2689 mdmn_ksend_show_error(rval
, kres
,
2691 /* If we're shutting down, pause things here. */
2692 if (kres
->kmmr_comm_state
== MDMNE_RPC_FAIL
) {
2693 while (!md_mn_is_commd_present()) {
2697 * commd is now available again. Retry
2698 * the message once. If this fails we
2699 * panic as the system is in an
2702 if (nretries
++ == 0)
2706 "ksend_message failure: RESYNC_FINISH");
2708 kmem_free(kres
, sizeof (md_mn_kresult_t
));
2709 (void) md_unit_writerlock(ui
);
2712 * If the resync has been cancelled, clear flags, reset owner
2713 * for ABR mirror and release the resync region parent
2716 if (un
->c
.un_status
& MD_UN_RESYNC_CANCEL
) {
2719 if (ui
->ui_tstate
& MD_ABR_CAP
) {
2720 /* Resync finished, if ABR set owner to NULL */
2721 mutex_enter(&un
->un_owner_mx
);
2722 un
->un_mirror_owner
= 0;
2723 mutex_exit(&un
->un_owner_mx
);
2726 un
->c
.un_status
&= ~(MD_UN_RESYNC_CANCEL
|
2727 MD_UN_RESYNC_ACTIVE
);
2728 ps
= un
->un_rs_prev_overlap
;
2730 /* Remove previous overlap resync region */
2731 if (ps
->ps_flags
& MD_MPS_ON_OVERLAP
)
2732 mirror_overlap_tree_remove(ps
);
2734 * Release the overlap range reference
2736 un
->un_rs_prev_overlap
= NULL
;
2737 kmem_cache_free(mirror_parent_cache
,
2743 * Release resync message buffer. This will be reallocated on
2744 * the next invocation of the resync_unit thread.
2746 if (un
->un_rs_msg
) {
2747 kmem_free(un
->un_rs_msg
, sizeof (md_mn_msg_resync_t
));
2748 un
->un_rs_msg
= NULL
;
2751 /* For non-MN sets deal with any pending grows */
2752 un
->c
.un_status
&= ~MD_UN_RESYNC_ACTIVE
;
2753 if (un
->c
.un_status
& MD_UN_GROW_PENDING
) {
2754 if ((mirror_grow_unit(un
, &mde
) != 0) ||
2755 (! mdismderror(&mde
, MDE_GROW_DELAYED
))) {
2756 un
->c
.un_status
&= ~MD_UN_GROW_PENDING
;
2761 reset_comp_flags(un
);
2762 un
->un_resync_completed
= 0;
2763 mirror_commit(un
, NO_SUBMIRRORS
, 0);
2764 md_unit_writerexit(ui
);
2767 * Stop the resync progress thread.
2769 if (un
->un_rs_resync_to_id
!= 0) {
2770 (void) untimeout(un
->un_rs_resync_to_id
);
2771 un
->un_rs_resync_to_id
= 0;
2775 * Calling mirror_internal_close() makes further reference to un / ui
2776 * dangerous. If we are the only consumer of the mirror it is possible
2777 * for a metaclear to be processed after completion of the m_i_c()
2778 * routine. As we need to handle the case where another resync has been
2779 * scheduled for the mirror, we raise the open count on the device
2780 * which protects against the close / metaclear / lock => panic scenario
2782 (void) md_unit_incopen(MD_SID(un
), FREAD
|FWRITE
, OTYP_LYR
);
2783 (void) mirror_internal_close(MD_SID(un
), OTYP_LYR
, 0, (IOLOCK
*)NULL
);
2786 * decrement the mirror resync count
2788 mutex_enter(&md_cpr_resync
.md_resync_mutex
);
2789 md_cpr_resync
.md_mirror_resync
--;
2790 mutex_exit(&md_cpr_resync
.md_resync_mutex
);
2793 * Remove the thread reference as we're about to exit. This allows a
2794 * subsequent mirror_resync_unit() to start a new thread.
2795 * If RESYNC_ACTIVE is set, mirror_resync_unit() must have been
2796 * called to start a new resync, so reopen the mirror and go back to
2799 (void) md_unit_writerlock(ui
);
2800 mutex_enter(&un
->un_rs_thread_mx
);
2801 un
->un_rs_thread_flags
&= ~(MD_RI_KILL
|MD_RI_SHUTDOWN
);
2802 mutex_exit(&un
->un_rs_thread_mx
);
2803 if (un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
) {
2804 md_unit_writerexit(ui
);
2805 if (mirror_internal_open(MD_SID(un
), (FREAD
|FWRITE
),
2806 OTYP_LYR
, 0, (IOLOCK
*)NULL
) == 0) {
2807 /* Release the reference grabbed above */
2808 (void) mirror_internal_close(MD_SID(un
), OTYP_LYR
, 0,
2810 goto resync_restart
;
2812 (void) md_unit_writerlock(ui
);
2814 "Could not open metadevice (%x) for resync\n",
2817 un
->un_rs_thread
= NULL
;
2818 md_unit_writerexit(ui
);
2821 * Check for hotspares once we've cleared the resync thread reference.
2822 * If there are any errored units a poke_hotspares() will result in
2823 * a call to mirror_resync_unit() which we need to allow to start.
2825 (void) poke_hotspares();
2828 * Remove this thread from the CPR callback table.
2831 mutex_enter(&un
->un_rs_cpr_mx
);
2832 CALLB_CPR_EXIT(&un
->un_rs_cprinfo
);
2836 * Remove the extra reference to the unit we generated above. After
2837 * this call it is *unsafe* to reference either ui or un as they may
2838 * no longer be allocated.
2840 (void) mirror_internal_close(MD_SID(un
), OTYP_LYR
, 0, (IOLOCK
*)NULL
);
2846 * mirror_resync_unit:
2847 * ------------------
2848 * Start a resync for the given mirror metadevice. Save the resync thread ID in
2849 * un->un_rs_thread for later manipulation.
2859 md_resync_ioctl_t
*ri
,
2866 set_t setno
= MD_MIN2SET(mnum
);
2868 ui
= MDI_UNIT(mnum
);
2870 if (md_get_setstatus(setno
) & MD_SET_STALE
)
2871 return (mdmddberror(ep
, MDE_DB_STALE
, mnum
, setno
));
2873 if (mirror_internal_open(mnum
, (FREAD
|FWRITE
), OTYP_LYR
, 0, lockp
)) {
2874 return (mdmderror(ep
, MDE_MIRROR_OPEN_FAILURE
, mnum
));
2877 un
= (mm_unit_t
*)md_ioctl_writerlock(lockp
, ui
);
2879 un
= (mm_unit_t
*)md_unit_writerlock(ui
);
2883 * Check to see if we're attempting to start a resync while one is
2886 if (un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
||
2887 un
->un_rs_thread
!= NULL
) {
2889 * Ensure RESYNC_ACTIVE set, it may not be if the resync thread
2890 * is in the process of terminating, setting the flag will
2891 * cause the resync thread to return to the beginning
2893 un
->c
.un_status
|= MD_UN_RESYNC_ACTIVE
;
2895 md_ioctl_writerexit(lockp
);
2897 md_unit_writerexit(ui
);
2899 (void) mirror_internal_close(mnum
, OTYP_LYR
, 0, lockp
);
2902 un
->c
.un_status
|= MD_UN_RESYNC_ACTIVE
;
2903 un
->c
.un_status
&= ~MD_UN_RESYNC_CANCEL
;
2904 if ((ri
) && (ri
->ri_copysize
> 0) &&
2905 (ri
->ri_copysize
<= md_max_xfer_bufsz
))
2906 un
->un_rs_copysize
= ri
->ri_copysize
;
2908 un
->un_rs_copysize
= 0;
2910 /* Start the resync progress thread off */
2911 un
->un_rs_progress_flags
= 0;
2912 (void) thread_create(NULL
, 0, resync_progress_thread
,
2913 (caddr_t
)(uintptr_t)mnum
, 0, &p0
, TS_RUN
, minclsyspri
);
2916 * We have to store the thread ID in the unit structure so do not
2917 * drop writerlock until the thread is active. This means resync_unit
2918 * may spin on its first md_unit_readerlock(), but deadlock won't occur.
2920 mutex_enter(&un
->un_rs_thread_mx
);
2921 un
->un_rs_thread_flags
&= ~(MD_RI_KILL
|MD_RI_SHUTDOWN
);
2922 mutex_exit(&un
->un_rs_thread_mx
);
2923 un
->un_rs_thread
= thread_create(NULL
, 0, resync_unit
,
2924 (caddr_t
)(uintptr_t)mnum
, 0, &p0
, TS_RUN
, 60);
2925 if (un
->un_rs_thread
== (kthread_id_t
)NULL
) {
2926 un
->c
.un_status
&= ~MD_UN_RESYNC_ACTIVE
;
2928 md_ioctl_writerexit(lockp
);
2930 md_unit_writerexit(ui
);
2932 (void) mirror_internal_close(mnum
, OTYP_LYR
, 0, lockp
);
2933 return (mdmderror(ep
, MDE_MIRROR_THREAD_FAILURE
, mnum
));
2936 md_ioctl_writerexit(lockp
);
2938 md_unit_writerexit(ui
);
2946 * mirror_ioctl_resync:
2947 * -------------------
2948 * Called as a result of an MD_IOCSETSYNC ioctl. Either start, block, unblock
2949 * or kill the resync thread associated with the specified unit.
2950 * Can return with locks held since mdioctl will free any locks
2951 * that are marked in lock->l_flags.
2958 mirror_ioctl_resync(
2959 md_resync_ioctl_t
*ri
,
2963 minor_t mnum
= ri
->ri_mnum
;
2967 mm_submirror_ic_t
*smic
;
2970 set_t setno
= MD_MIN2SET(mnum
);
2972 mdclrerror(&ri
->mde
);
2974 if ((setno
>= md_nsets
) ||
2975 (MD_MIN2UNIT(mnum
) >= md_nunits
)) {
2976 return (mdmderror(&ri
->mde
, MDE_INVAL_UNIT
, mnum
));
2979 /* RD_LOCK flag grabs the md_ioctl_readerlock */
2980 un
= mirror_getun(mnum
, &ri
->mde
, RD_LOCK
, lock
);
2983 return (mdmderror(&ri
->mde
, MDE_UNIT_NOT_SETUP
, mnum
));
2985 if (un
->c
.un_type
!= MD_METAMIRROR
) {
2986 return (mdmderror(&ri
->mde
, MDE_NOT_MM
, mnum
));
2988 if (un
->un_nsm
< 2) {
2993 * Determine the action to take based on the ri_flags field:
2994 * MD_RI_BLOCK: Block current resync thread
2995 * MD_RI_UNBLOCK: Unblock resync thread
2996 * MD_RI_KILL: Abort resync thread
2997 * MD_RI_RESYNC_FORCE_MNSTART: Directly start resync thread
2998 * without using rpc.mdcommd messages.
2999 * any other: Start resync thread
3001 switch (ri
->ri_flags
& (MD_RI_BLOCK
|MD_RI_UNBLOCK
|MD_RI_KILL
)) {
3004 /* Halt resync thread by setting flag in un_rs_flags */
3005 if (!(un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
)) {
3008 mutex_enter(&un
->un_rs_thread_mx
);
3009 un
->un_rs_thread_flags
|= MD_RI_BLOCK
;
3010 mutex_exit(&un
->un_rs_thread_mx
);
3015 * Restart resync thread by clearing flag in un_rs_flags and
3016 * cv_signal'ing the blocked thread.
3018 if (!(un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
)) {
3021 mutex_enter(&un
->un_rs_thread_mx
);
3022 un
->un_rs_thread_flags
&= ~MD_RI_BLOCK
;
3023 cv_signal(&un
->un_rs_thread_cv
);
3024 mutex_exit(&un
->un_rs_thread_mx
);
3028 /* Abort resync thread. */
3029 if (!(un
->c
.un_status
& MD_UN_RESYNC_ACTIVE
)) {
3032 mutex_enter(&un
->un_rs_thread_mx
);
3033 tid
= un
->un_rs_thread
? (un
->un_rs_thread
)->t_did
: 0;
3034 un
->un_rs_thread_flags
&= ~(MD_RI_BLOCK
|MD_RI_BLOCK_OWNER
);
3035 un
->un_rs_thread_flags
|= MD_RI_KILL
;
3036 cv_signal(&un
->un_rs_thread_cv
);
3037 mutex_exit(&un
->un_rs_thread_mx
);
3039 if (!(ri
->ri_flags
& MD_RI_NO_WAIT
)) {
3040 md_ioctl_readerexit(lock
);
3042 un
->un_rs_thread_flags
&= ~MD_RI_KILL
;
3043 un
->un_rs_thread
= NULL
;
3044 cmn_err(CE_WARN
, "md: %s: Resync cancelled\n",
3045 md_shortname(MD_SID(un
)));
3051 md_ioctl_readerexit(lock
);
3054 for (smi
= 0; smi
< NMIRROR
; smi
++) {
3055 sm
= &un
->un_sm
[smi
];
3056 smic
= &un
->un_smic
[smi
];
3057 if (!SMS_IS(sm
, SMS_ATTACHED
))
3059 mirror_set_sm_state(sm
, smic
, SMS_ATTACHED_RESYNC
, 1);
3060 bits
|= SMI2BIT(smi
);
3063 mirror_commit(un
, bits
, 0);
3066 * If we are resyncing a mirror in a MN set and the rpc.mdcommd
3067 * can be used, we do not start the resync at this point.
3068 * Instead, the metasync command that issued the ioctl
3069 * will send a RESYNC_STARTING message to start the resync thread. The
3070 * reason we do it this way is to ensure that the metasync ioctl is
3071 * executed on all nodes before the resync thread is started.
3073 * If a MN set and the MD_RI_RESYNC_FORCE_MNSTART flag is set, then
3074 * don't use rpc.mdcommd, but just start the resync thread. This
3075 * flag is set on a node when it is being added to a diskset
3076 * so that the resync threads are started on the newly added node.
3078 if ((!(MD_MNSET_SETNO(setno
))) ||
3079 (ri
->ri_flags
& MD_RI_RESYNC_FORCE_MNSTART
)) {
3080 return (mirror_resync_unit(mnum
, ri
, &ri
->mde
, lock
));
3087 mirror_mark_resync_region_non_owner(struct mm_unit
*un
,
3088 diskaddr_t startblk
, diskaddr_t endblk
, md_mn_nodeid_t source_node
)
3094 md_mn_msg_rr_dirty_t
*rr
;
3095 md_mn_kresult_t
*kres
;
3096 set_t setno
= MD_UN2SET(un
);
3098 md_mn_nodeid_t node_idx
= source_node
- 1;
3099 mdi_unit_t
*ui
= MDI_UNIT(MD_SID(un
));
3100 md_mn_nodeid_t owner_node
;
3101 minor_t mnum
= MD_SID(un
);
3107 * Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
3108 * not, allocate it and then fill the [start..end] entries.
3109 * Update un_pernode_dirty_sum if we've gone 0->1.
3110 * Update un_dirty_bm if the corresponding entries are clear.
3112 rw_enter(&un
->un_pernode_dirty_mx
[node_idx
], RW_WRITER
);
3113 if (un
->un_pernode_dirty_bm
[node_idx
] == NULL
) {
3114 un
->un_pernode_dirty_bm
[node_idx
] =
3115 (uchar_t
*)kmem_zalloc(
3116 (uint_t
)howmany(un
->un_rrd_num
, NBBY
), KM_SLEEP
);
3118 rw_exit(&un
->un_pernode_dirty_mx
[node_idx
]);
3120 BLK_TO_RR(end_rr
, endblk
, un
);
3121 BLK_TO_RR(start_rr
, startblk
, un
);
3125 mutex_enter(&un
->un_resync_mx
);
3126 rw_enter(&un
->un_pernode_dirty_mx
[node_idx
], RW_READER
);
3127 for (current_rr
= start_rr
; current_rr
<= end_rr
; current_rr
++) {
3128 un
->un_outstanding_writes
[current_rr
]++;
3129 if (!IS_PERNODE_DIRTY(source_node
, current_rr
, un
)) {
3130 un
->un_pernode_dirty_sum
[current_rr
]++;
3131 SET_PERNODE_DIRTY(source_node
, current_rr
, un
);
3133 CLR_GOING_CLEAN(current_rr
, un
);
3134 if (!IS_REGION_DIRTY(current_rr
, un
)) {
3136 SET_REGION_DIRTY(current_rr
, un
);
3137 SET_GOING_DIRTY(current_rr
, un
);
3138 } else if (IS_GOING_DIRTY(current_rr
, un
))
3141 rw_exit(&un
->un_pernode_dirty_mx
[node_idx
]);
3142 mutex_exit(&un
->un_resync_mx
);
3149 * If we have dirty regions to commit, send a
3150 * message to the owning node so that the
3151 * in-core bitmap gets updated appropriately.
3152 * TODO: make this a kmem_cache pool to improve
3153 * alloc/free performance ???
3155 kres
= (md_mn_kresult_t
*)kmem_alloc(sizeof (md_mn_kresult_t
),
3157 rr
= (md_mn_msg_rr_dirty_t
*)kmem_alloc(sizeof (md_mn_msg_rr_dirty_t
),
3161 owner_node
= un
->un_mirror_owner
;
3164 rr
->rr_nodeid
= md_mn_mynode_id
;
3165 rr
->rr_range
= (ushort_t
)start_rr
<< 16;
3166 rr
->rr_range
|= (ushort_t
)end_rr
& 0xFFFF;
3168 /* release readerlock before sending message */
3169 md_unit_readerexit(ui
);
3171 rval
= mdmn_ksend_message(setno
, MD_MN_MSG_RR_DIRTY
,
3172 MD_MSGF_NO_LOG
|MD_MSGF_BLK_SIGNAL
|MD_MSGF_DIRECTED
,
3173 un
->un_mirror_owner
, (char *)rr
,
3174 sizeof (md_mn_msg_rr_dirty_t
), kres
);
3176 /* reacquire readerlock on message completion */
3177 (void) md_unit_readerlock(ui
);
3179 /* if the message send failed, note it, and pass an error back up */
3180 if (!MDMN_KSEND_MSG_OK(rval
, kres
)) {
3181 /* if commd is gone, no point in printing a message */
3182 if (md_mn_is_commd_present())
3183 mdmn_ksend_show_error(rval
, kres
, "RR_DIRTY");
3184 kmem_free(kres
, sizeof (md_mn_kresult_t
));
3185 kmem_free(rr
, sizeof (md_mn_msg_rr_dirty_t
));
3190 * if the owner changed while we were sending the message, and it's
3191 * not us, the new mirror owner won't yet have done the right thing
3192 * with our data. Let him know. If we became the owner, we'll
3193 * deal with that differently below. Note that receiving a message
3194 * about another node twice won't hurt anything.
3196 if (un
->un_mirror_owner
!= owner_node
&& !MD_MN_MIRROR_OWNER(un
))
3199 kmem_free(kres
, sizeof (md_mn_kresult_t
));
3200 kmem_free(rr
, sizeof (md_mn_msg_rr_dirty_t
));
3202 mutex_enter(&un
->un_resync_mx
);
3205 * If we became the owner while we were sending the message,
3206 * we have dirty bits in the un_pernode_bm that aren't yet reflected
3207 * in the un_dirty_bm, as it was re-read from disk, and our bits
3208 * are also not reflected in the on-disk DRL. Fix that now.
3210 if (MD_MN_MIRROR_OWNER(un
)) {
3211 rw_enter(&un
->un_pernode_dirty_mx
[node_idx
], RW_WRITER
);
3212 mirror_copy_rr(howmany(un
->un_rrd_num
, NBBY
),
3213 un
->un_pernode_dirty_bm
[node_idx
], un
->un_dirty_bm
);
3214 rw_exit(&un
->un_pernode_dirty_mx
[node_idx
]);
3216 un
->un_resync_flg
|= MM_RF_COMMITING
| MM_RF_GATECLOSED
;
3218 mutex_exit(&un
->un_resync_mx
);
3219 mddb_commitrec_wrapper(un
->un_rr_dirty_recid
);
3220 mutex_enter(&un
->un_resync_mx
);
3222 un
->un_resync_flg
&= ~(MM_RF_COMMITING
| MM_RF_GATECLOSED
);
3223 cv_broadcast(&un
->un_resync_cv
);
3226 for (current_rr
= start_rr
; current_rr
<= end_rr
; current_rr
++)
3227 CLR_GOING_DIRTY(current_rr
, un
);
3229 mutex_exit(&un
->un_resync_mx
);
3235 mirror_mark_resync_region_owner(struct mm_unit
*un
,
3236 diskaddr_t startblk
, diskaddr_t endblk
, md_mn_nodeid_t source_node
)
3242 int mnset
= MD_MNSET_SETNO(MD_UN2SET(un
));
3243 md_mn_nodeid_t node_idx
= source_node
- 1;
3249 * Check to see if we have a un_pernode_dirty_bm[] entry allocated. If
3250 * not, allocate it and then fill the [start..end] entries.
3251 * Update un_pernode_dirty_sum if we've gone 0->1.
3252 * Update un_dirty_bm if the corresponding entries are clear.
3255 rw_enter(&un
->un_pernode_dirty_mx
[node_idx
], RW_WRITER
);
3256 if (un
->un_pernode_dirty_bm
[node_idx
] == NULL
) {
3257 un
->un_pernode_dirty_bm
[node_idx
] =
3258 (uchar_t
*)kmem_zalloc(
3259 (uint_t
)howmany(un
->un_rrd_num
, NBBY
), KM_SLEEP
);
3261 rw_exit(&un
->un_pernode_dirty_mx
[node_idx
]);
3264 mutex_enter(&un
->un_resync_mx
);
3267 rw_enter(&un
->un_pernode_dirty_mx
[node_idx
], RW_READER
);
3270 BLK_TO_RR(end_rr
, endblk
, un
);
3271 BLK_TO_RR(start_rr
, startblk
, un
);
3272 for (current_rr
= start_rr
; current_rr
<= end_rr
; current_rr
++) {
3273 if (!mnset
|| source_node
== md_mn_mynode_id
)
3274 un
->un_outstanding_writes
[current_rr
]++;
3276 if (!IS_PERNODE_DIRTY(source_node
, current_rr
, un
))
3277 un
->un_pernode_dirty_sum
[current_rr
]++;
3278 SET_PERNODE_DIRTY(source_node
, current_rr
, un
);
3280 CLR_GOING_CLEAN(current_rr
, un
);
3281 if (!IS_REGION_DIRTY(current_rr
, un
))
3283 if (IS_GOING_DIRTY(current_rr
, un
))
3288 rw_exit(&un
->un_pernode_dirty_mx
[node_idx
]);
3291 mutex_exit(&un
->un_resync_mx
);
3294 un
->un_waiting_to_mark
++;
3295 while (un
->un_resync_flg
& MM_RF_GATECLOSED
) {
3298 cv_wait(&un
->un_resync_cv
, &un
->un_resync_mx
);
3300 un
->un_waiting_to_mark
--;
3303 for (current_rr
= start_rr
; current_rr
<= end_rr
; current_rr
++) {
3304 if (!IS_REGION_DIRTY(current_rr
, un
)) {
3305 SET_REGION_DIRTY(current_rr
, un
);
3306 SET_GOING_DIRTY(current_rr
, un
);
3309 if (IS_GOING_DIRTY(current_rr
, un
))
3314 if (un
->un_waiting_to_mark
== 0 || un
->un_waiting_to_clear
!= 0)
3315 cv_broadcast(&un
->un_resync_cv
);
3316 mutex_exit(&un
->un_resync_mx
);
3320 un
->un_resync_flg
|= MM_RF_COMMIT_NEEDED
;
3321 un
->un_waiting_to_commit
++;
3322 while (un
->un_waiting_to_mark
!= 0 &&
3323 !(un
->un_resync_flg
& MM_RF_GATECLOSED
)) {
3326 cv_wait(&un
->un_resync_cv
, &un
->un_resync_mx
);
3329 if (un
->un_resync_flg
& MM_RF_COMMIT_NEEDED
) {
3330 un
->un_resync_flg
|= MM_RF_COMMITING
| MM_RF_GATECLOSED
;
3331 un
->un_resync_flg
&= ~MM_RF_COMMIT_NEEDED
;
3333 mutex_exit(&un
->un_resync_mx
);
3334 mddb_commitrec_wrapper(un
->un_rr_dirty_recid
);
3335 mutex_enter(&un
->un_resync_mx
);
3337 un
->un_resync_flg
&= ~MM_RF_COMMITING
;
3338 cv_broadcast(&un
->un_resync_cv
);
3340 while (un
->un_resync_flg
& MM_RF_COMMITING
) {
3343 cv_wait(&un
->un_resync_cv
, &un
->un_resync_mx
);
3346 for (current_rr
= start_rr
; current_rr
<= end_rr
; current_rr
++)
3347 CLR_GOING_DIRTY(current_rr
, un
);
3349 if (--un
->un_waiting_to_commit
== 0) {
3350 un
->un_resync_flg
&= ~MM_RF_GATECLOSED
;
3351 cv_broadcast(&un
->un_resync_cv
);
3353 mutex_exit(&un
->un_resync_mx
);
3359 mirror_mark_resync_region(struct mm_unit
*un
,
3360 diskaddr_t startblk
, diskaddr_t endblk
, md_mn_nodeid_t source_node
)
3362 int mnset
= MD_MNSET_SETNO(MD_UN2SET(un
));
3364 if (mnset
&& !MD_MN_MIRROR_OWNER(un
)) {
3365 return (mirror_mark_resync_region_non_owner(un
, startblk
,
3366 endblk
, source_node
));
3368 return (mirror_mark_resync_region_owner(un
, startblk
, endblk
,
3374 mirror_resize_resync_regions(mm_unit_t
*un
, diskaddr_t new_tb
)
3377 optim_resync_t
*orp
;
3379 uint_t old_nregions
, new_nregions
;
3380 int old_bm_size
, new_bm_size
;
3382 mddb_recid_t recid
, old_recid
;
3383 uchar_t
*old_dirty_bm
;
3386 set_t setno
= MD_UN2SET(un
);
3389 old_nregions
= un
->un_rrd_num
;
3390 new_nregions
= (uint_t
)((new_tb
/un
->un_rrd_blksize
) + 1);
3392 while (new_nregions
> MD_MAX_NUM_RR
) {
3397 new_bm_size
= howmany(new_nregions
, NBBY
);
3398 old_bm_size
= howmany(old_nregions
, NBBY
);
3400 size
= new_bm_size
+ sizeof (*orp
) - sizeof (orp
->or_rr
);
3402 typ1
= (mddb_type_t
)md_getshared_key(setno
,
3403 mirror_md_ops
.md_driver
.md_drivername
);
3404 recid
= mddb_createrec(size
, typ1
, RESYNC_REC
,
3405 MD_CRO_OPTIMIZE
|MD_CRO_32BIT
, setno
);
3409 orp
= (struct optim_resync
*)mddb_getrecaddr(recid
);
3410 ASSERT(orp
!= NULL
);
3412 orp
->or_magic
= OR_MAGIC
; /* Magic # */
3413 orp
->or_blksize
= un
->un_rrd_blksize
; /* Same block size */
3414 orp
->or_num
= new_nregions
; /* New number of regions */
3416 old_dirty_bm
= un
->un_dirty_bm
;
3417 un
->un_dirty_bm
= orp
->or_rr
;
3419 kmem_free((caddr_t
)un
->un_goingdirty_bm
, old_bm_size
);
3420 un
->un_goingdirty_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3422 kmem_free((caddr_t
)un
->un_goingclean_bm
, old_bm_size
);
3423 un
->un_goingclean_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3425 kmem_free((caddr_t
)un
->un_resync_bm
, old_bm_size
);
3426 un
->un_resync_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3428 owp
= un
->un_outstanding_writes
;
3429 un
->un_outstanding_writes
= (short *)kmem_zalloc(
3430 new_nregions
* sizeof (short), KM_SLEEP
);
3432 old_pns
= un
->un_pernode_dirty_sum
;
3434 un
->un_pernode_dirty_sum
= (uchar_t
*)kmem_zalloc(new_nregions
,
3438 * Now translate the old records into the new
3441 for (i
= 0; i
< old_nregions
; i
++) {
3443 * only bring forward the
3444 * outstanding write counters and the dirty bits and also
3445 * the pernode_summary counts
3447 if (!isset(old_dirty_bm
, i
))
3450 setbit(un
->un_dirty_bm
, (i
/ rr_mult
));
3451 un
->un_outstanding_writes
[(i
/ rr_mult
)] += owp
[i
];
3453 un
->un_pernode_dirty_sum
[(i
/ rr_mult
)] += old_pns
[i
];
3455 kmem_free((caddr_t
)owp
, old_nregions
* sizeof (short));
3457 kmem_free((caddr_t
)old_pns
, old_nregions
);
3460 * Copy all non-zero un_pernode_dirty_bm[] arrays to new versions
3462 for (j
= 0; j
< MD_MNMAXSIDES
; j
++) {
3463 rw_enter(&un
->un_pernode_dirty_mx
[j
], RW_WRITER
);
3464 old_dirty_bm
= un
->un_pernode_dirty_bm
[j
];
3466 un
->un_pernode_dirty_bm
[j
] = (uchar_t
*)kmem_zalloc(
3467 new_bm_size
, KM_SLEEP
);
3468 for (i
= 0; i
< old_nregions
; i
++) {
3469 if (!isset(old_dirty_bm
, i
))
3472 setbit(un
->un_pernode_dirty_bm
[j
],
3475 kmem_free((caddr_t
)old_dirty_bm
, old_bm_size
);
3477 rw_exit(&un
->un_pernode_dirty_mx
[j
]);
3480 /* Save the old record id */
3481 old_recid
= un
->un_rr_dirty_recid
;
3483 /* Update the mirror unit struct */
3484 un
->un_rr_dirty_recid
= recid
;
3485 un
->un_rrd_num
= new_nregions
;
3486 un
->un_rrd_blksize
= un
->un_rrd_blksize
* rr_mult
;
3488 orp
->or_blksize
= un
->un_rrd_blksize
;
3491 * NOTE: The reason there are distinct calls to mddb_commitrec_wrapper
3492 * instead of using mddb_commitrecs_wrapper, is that you cannot
3493 * atomically commit optimized records.
3495 mddb_commitrec_wrapper(recid
);
3496 mddb_commitrec_wrapper(un
->c
.un_record_id
);
3497 mddb_deleterec_wrapper(old_recid
);
3501 /* lockp can be NULL for !MN diksets */
3503 mirror_add_resync_regions(mm_unit_t
*un
, diskaddr_t new_tb
)
3507 optim_resync_t
*orp
;
3508 uint_t old_nregions
, new_nregions
;
3509 int old_bm_size
, new_bm_size
;
3511 mddb_recid_t recid
, old_recid
;
3513 set_t setno
= MD_UN2SET(un
);
3516 old_nregions
= un
->un_rrd_num
;
3517 new_nregions
= (uint_t
)((new_tb
/un
->un_rrd_blksize
) + 1);
3519 new_bm_size
= howmany(new_nregions
, NBBY
);
3520 old_bm_size
= howmany(old_nregions
, NBBY
);
3522 size
= new_bm_size
+ sizeof (*orp
) - sizeof (orp
->or_rr
);
3524 typ1
= (mddb_type_t
)md_getshared_key(setno
,
3525 mirror_md_ops
.md_driver
.md_drivername
);
3527 recid
= mddb_createrec(size
, typ1
, RESYNC_REC
,
3528 MD_CRO_OPTIMIZE
|MD_CRO_32BIT
, setno
);
3532 orp
= (struct optim_resync
*)mddb_getrecaddr(recid
);
3533 ASSERT(orp
!= NULL
);
3535 orp
->or_magic
= OR_MAGIC
; /* Magic # */
3536 orp
->or_blksize
= un
->un_rrd_blksize
; /* Same block size */
3537 orp
->or_num
= new_nregions
; /* New number of regions */
3539 /* Copy the old bm over the new bm */
3540 bcopy((caddr_t
)un
->un_dirty_bm
, (caddr_t
)orp
->or_rr
, old_bm_size
);
3543 * Create new bigger incore arrays, copy, and free old ones:
3547 * un_outstanding_writes
3548 * un_pernode_dirty_sum
3549 * un_pernode_dirty_bm[]
3551 old
= un
->un_goingdirty_bm
;
3552 un
->un_goingdirty_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3553 bcopy((caddr_t
)old
, (caddr_t
)un
->un_goingdirty_bm
, old_bm_size
);
3554 kmem_free((caddr_t
)old
, old_bm_size
);
3556 old
= un
->un_goingclean_bm
;
3557 un
->un_goingclean_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3558 bcopy((caddr_t
)old
, (caddr_t
)un
->un_goingclean_bm
, old_bm_size
);
3559 kmem_free((caddr_t
)old
, old_bm_size
);
3561 old
= un
->un_resync_bm
;
3562 un
->un_resync_bm
= (uchar_t
*)kmem_zalloc(new_bm_size
, KM_SLEEP
);
3563 bcopy((caddr_t
)old
, (caddr_t
)un
->un_resync_bm
, old_bm_size
);
3564 kmem_free((caddr_t
)old
, old_bm_size
);
3566 owp
= un
->un_outstanding_writes
;
3567 un
->un_outstanding_writes
= (short *)kmem_zalloc(
3568 (uint_t
)new_nregions
* sizeof (short), KM_SLEEP
);
3569 bcopy((caddr_t
)owp
, (caddr_t
)un
->un_outstanding_writes
,
3570 old_nregions
* sizeof (short));
3571 kmem_free((caddr_t
)owp
, (old_nregions
* sizeof (short)));
3573 old
= un
->un_pernode_dirty_sum
;
3575 un
->un_pernode_dirty_sum
= (uchar_t
*)kmem_zalloc(
3576 new_nregions
, KM_SLEEP
);
3577 bcopy((caddr_t
)old
, (caddr_t
)un
->un_pernode_dirty_sum
,
3579 kmem_free((caddr_t
)old
, old_nregions
);
3582 for (i
= 0; i
< MD_MNMAXSIDES
; i
++) {
3583 rw_enter(&un
->un_pernode_dirty_mx
[i
], RW_WRITER
);
3584 old
= un
->un_pernode_dirty_bm
[i
];
3586 un
->un_pernode_dirty_bm
[i
] = (uchar_t
*)kmem_zalloc(
3587 new_bm_size
, KM_SLEEP
);
3588 bcopy((caddr_t
)old
, (caddr_t
)un
->un_pernode_dirty_bm
[i
],
3590 kmem_free((caddr_t
)old
, old_bm_size
);
3592 rw_exit(&un
->un_pernode_dirty_mx
[i
]);
3595 /* Save the old record id */
3596 old_recid
= un
->un_rr_dirty_recid
;
3598 /* Update the mirror unit struct */
3599 un
->un_rr_dirty_recid
= recid
;
3600 un
->un_rrd_num
= new_nregions
;
3601 un
->un_dirty_bm
= orp
->or_rr
;
3604 * NOTE: The reason there are distinct calls to mddb_commitrec_wrapper
3605 * instead of using mddb_commitrecs_wrapper, is that you cannot
3606 * atomically commit optimized records.
3608 mddb_commitrec_wrapper(recid
);
3609 mddb_commitrec_wrapper(un
->c
.un_record_id
);
3610 mddb_deleterec_wrapper(old_recid
);
3617 * Combine the dirty record bitmap with the in-core resync bitmap. This allows
3618 * us to carry a resync over an ownership change.
3621 mirror_copy_rr(int sz
, uchar_t
*src
, uchar_t
*dest
)
3625 for (i
= 0; i
< sz
; i
++)
3630 * mirror_set_dirty_rr:
3631 * -------------------
3632 * Set the pernode_dirty_bm[node] entries and un_dirty_bm[] if appropriate.
3633 * For the owning node (DRL/mirror owner) update the on-disk RR if needed.
3634 * Called on every clean->dirty transition for the originating writer node.
3635 * Note: only the non-owning nodes will initiate this message and it is only
3636 * the owning node that has to process it.
3639 mirror_set_dirty_rr(md_mn_rr_dirty_params_t
*iocp
)
3642 minor_t mnum
= iocp
->rr_mnum
;
3644 int start
= (int)iocp
->rr_start
;
3645 int end
= (int)iocp
->rr_end
;
3646 set_t setno
= MD_MIN2SET(mnum
);
3647 md_mn_nodeid_t orignode
= iocp
->rr_nodeid
; /* 1-based */
3648 diskaddr_t startblk
, endblk
;
3650 mdclrerror(&iocp
->mde
);
3652 if ((setno
>= md_nsets
) ||
3653 (MD_MIN2UNIT(mnum
) >= md_nunits
)) {
3654 return (mdmderror(&iocp
->mde
, MDE_INVAL_UNIT
, mnum
));
3657 /* Must have _NO_ ioctl lock set if we update the RR on-disk */
3658 un
= mirror_getun(mnum
, &iocp
->mde
, NO_LOCK
, NULL
);
3661 return (mdmderror(&iocp
->mde
, MDE_UNIT_NOT_SETUP
, mnum
));
3663 if (un
->c
.un_type
!= MD_METAMIRROR
) {
3664 return (mdmderror(&iocp
->mde
, MDE_NOT_MM
, mnum
));
3666 if (orignode
< 1 || orignode
>= MD_MNMAXSIDES
) {
3667 return (mdmderror(&iocp
->mde
, MDE_INVAL_UNIT
, mnum
));
3669 if (un
->un_nsm
< 2) {
3674 * Only process this message if we're the owner of the mirror.
3676 if (!MD_MN_MIRROR_OWNER(un
)) {
3680 RR_TO_BLK(startblk
, start
, un
);
3681 RR_TO_BLK(endblk
, end
, un
);
3682 return (mirror_mark_resync_region_owner(un
, startblk
, endblk
,
3687 * mirror_clean_rr_bits:
3688 * --------------------
3689 * Clear the pernode_dirty_bm[node] entries which are passed in the bitmap
3690 * Once _all_ references are removed (pernode_dirty_count[x] == 0) this region
3691 * is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all
3692 * nodes. Callable from ioctl / interrupt / whatever context.
3693 * un_resync_mx is held on entry.
3696 mirror_clean_rr_bits(
3697 md_mn_rr_clean_params_t
*iocp
)
3699 minor_t mnum
= iocp
->rr_mnum
;
3701 uint_t cleared_bits
;
3702 md_mn_nodeid_t node
= iocp
->rr_nodeid
- 1;
3703 md_mn_nodeid_t orignode
= iocp
->rr_nodeid
;
3706 un
= mirror_getun(mnum
, &iocp
->mde
, NO_LOCK
, NULL
);
3709 start
= MDMN_RR_CLEAN_PARAMS_START_BIT(iocp
);
3710 end
= start
+ MDMN_RR_CLEAN_PARAMS_DATA_BYTES(iocp
) * NBBY
;
3711 rw_enter(&un
->un_pernode_dirty_mx
[node
], RW_READER
);
3712 for (i
= start
; i
< end
; i
++) {
3713 if (isset(MDMN_RR_CLEAN_PARAMS_DATA(iocp
), i
- start
)) {
3714 if (IS_PERNODE_DIRTY(orignode
, i
, un
)) {
3715 un
->un_pernode_dirty_sum
[i
]--;
3716 CLR_PERNODE_DIRTY(orignode
, i
, un
);
3718 if (un
->un_pernode_dirty_sum
[i
] == 0) {
3720 CLR_REGION_DIRTY(i
, un
);
3721 CLR_GOING_CLEAN(i
, un
);
3725 rw_exit(&un
->un_pernode_dirty_mx
[node
]);
3728 * We can only be called iff we are the mirror owner, however
3729 * as this is a (potentially) decoupled routine the ownership
3730 * may have moved from us by the time we get to execute the
3731 * bit clearing. Hence we still need to check for being the
3732 * owner before flushing the DRL to the replica.
3734 if (MD_MN_MIRROR_OWNER(un
)) {
3735 mutex_exit(&un
->un_resync_mx
);
3736 mddb_commitrec_wrapper(un
->un_rr_dirty_recid
);
3737 mutex_enter(&un
->un_resync_mx
);
3745 * Service routine for clearing the DRL bits on a deferred MD_MN_RR_CLEAN call
3746 * We need to obtain exclusive access to the un_resync_cv and then clear the
3748 * On completion, we must also free the passed in argument as it is allocated
3749 * at the end of the ioctl handler and won't be freed on completion.
3752 mirror_drl_task(void *arg
)
3754 md_mn_rr_clean_params_t
*iocp
= (md_mn_rr_clean_params_t
*)arg
;
3755 minor_t mnum
= iocp
->rr_mnum
;
3758 un
= mirror_getun(mnum
, &iocp
->mde
, NO_LOCK
, NULL
);
3760 mutex_enter(&un
->un_rrp_inflight_mx
);
3761 mutex_enter(&un
->un_resync_mx
);
3762 un
->un_waiting_to_clear
++;
3763 while (un
->un_resync_flg
& MM_RF_STALL_CLEAN
)
3764 cv_wait(&un
->un_resync_cv
, &un
->un_resync_mx
);
3765 un
->un_waiting_to_clear
--;
3767 un
->un_resync_flg
|= MM_RF_GATECLOSED
;
3768 mirror_clean_rr_bits(iocp
);
3769 un
->un_resync_flg
&= ~MM_RF_GATECLOSED
;
3770 if (un
->un_waiting_to_mark
!= 0 || un
->un_waiting_to_clear
!= 0) {
3771 cv_broadcast(&un
->un_resync_cv
);
3773 mutex_exit(&un
->un_resync_mx
);
3774 mutex_exit(&un
->un_rrp_inflight_mx
);
3776 kmem_free((caddr_t
)iocp
, MDMN_RR_CLEAN_PARAMS_SIZE(iocp
));
3780 * mirror_set_clean_rr:
3781 * -------------------
3782 * Clear the pernode_dirty_bm[node] entries which are passed in the bitmap
3783 * Once _all_ references are removed (pernode_dirty_count[x] == 0) this region
3784 * is 'cleanable' and will get flushed out by clearing un_dirty_bm[] on all
3787 * Only the mirror-owner need process this message as it is the only RR updater.
3788 * Non-owner nodes issue this request, but as we have no point-to-point message
3789 * support we will receive the message on all nodes.
3792 mirror_set_clean_rr(md_mn_rr_clean_params_t
*iocp
)
3795 minor_t mnum
= iocp
->rr_mnum
;
3797 set_t setno
= MD_MIN2SET(mnum
);
3798 md_mn_nodeid_t node
= iocp
->rr_nodeid
- 1;
3800 md_mn_rr_clean_params_t
*newiocp
;
3803 mdclrerror(&iocp
->mde
);
3805 if ((setno
>= md_nsets
) ||
3806 (MD_MIN2UNIT(mnum
) >= md_nunits
)) {
3807 return (mdmderror(&iocp
->mde
, MDE_INVAL_UNIT
, mnum
));
3810 /* Must have _NO_ ioctl lock set if we update the RR on-disk */
3811 un
= mirror_getun(mnum
, &iocp
->mde
, NO_LOCK
, NULL
);
3814 return (mdmderror(&iocp
->mde
, MDE_UNIT_NOT_SETUP
, mnum
));
3816 if (un
->c
.un_type
!= MD_METAMIRROR
) {
3817 return (mdmderror(&iocp
->mde
, MDE_NOT_MM
, mnum
));
3819 if (un
->un_nsm
< 2) {
3824 * Check to see if we're the mirror owner. If not, there's nothing
3827 if (!MD_MN_MIRROR_OWNER(un
)) {
3832 * Process the to-be-cleaned bitmap. We need to update the pernode_dirty
3833 * bits and pernode_dirty_sum[n], and if, and only if, the sum goes 0
3834 * we can then mark the un_dirty_bm entry as GOINGCLEAN. Alternatively
3835 * we can just defer this cleaning until the next process_resync_regions
3838 rw_enter(&un
->un_pernode_dirty_mx
[node
], RW_WRITER
);
3839 if (un
->un_pernode_dirty_bm
[node
] == NULL
) {
3840 un
->un_pernode_dirty_bm
[node
] = (uchar_t
*)kmem_zalloc(
3841 howmany(un
->un_rrd_num
, NBBY
), KM_SLEEP
);
3843 rw_exit(&un
->un_pernode_dirty_mx
[node
]);
3846 * See if we can simply clear the un_dirty_bm[] entries. If we're not
3847 * the issuing node _and_ we aren't in the process of marking/clearing
3848 * the RR bitmaps, we can simply update the bits as needed.
3849 * If we're the owning node and _not_ the issuing node, we should also
3850 * sync the RR if we clear any bits in it.
3852 mutex_enter(&un
->un_resync_mx
);
3853 can_clear
= (un
->un_resync_flg
& MM_RF_STALL_CLEAN
) ? 0 : 1;
3855 un
->un_resync_flg
|= MM_RF_GATECLOSED
;
3856 mirror_clean_rr_bits(iocp
);
3857 un
->un_resync_flg
&= ~MM_RF_GATECLOSED
;
3858 if (un
->un_waiting_to_mark
!= 0 ||
3859 un
->un_waiting_to_clear
!= 0) {
3860 cv_broadcast(&un
->un_resync_cv
);
3863 mutex_exit(&un
->un_resync_mx
);
3866 * If we couldn't clear the bits, due to DRL update from m_m_r_r / p_r_r
3867 * we must schedule a blocking call to update the DRL on this node.
3868 * As we're invoked from an ioctl we are going to have the original data
3869 * disappear (kmem_free) once we return. So, copy the data into a new
3870 * structure and let the taskq routine release it on completion.
3873 size_t sz
= MDMN_RR_CLEAN_PARAMS_SIZE(iocp
);
3875 newiocp
= (md_mn_rr_clean_params_t
*)kmem_alloc(sz
, KM_SLEEP
);
3877 bcopy(iocp
, newiocp
, sz
);
3879 if (ddi_taskq_dispatch(un
->un_drl_task
, mirror_drl_task
,
3880 newiocp
, DDI_NOSLEEP
) != DDI_SUCCESS
) {
3881 kmem_free(newiocp
, sz
);
3882 rval
= ENOMEM
; /* probably starvation */