usr/src/uts/common/io/lvm/raid/raid_ioctl.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  * Copyright 2012 Milan Jurik. All rights reserved.
  26  */
  27
  28 /*
  29  * NAME:        raid_ioctl.c
  30  *
  31  * DESCRIPTION: RAID driver source file containing IOCTL operations.
  32  *
  33  * ROUTINES PROVIDED FOR EXTERNAL USE:
  34  *        raid_commit() - commits MD database updates for a RAID metadevice
  35  *      md_raid_ioctl() - RAID metadevice IOCTL operations entry point.
  36  *
  37  * ROUTINES PROVIDED FOR INTERNAL USE:
  38  *       raid_getun() - Performs unit checking on a RAID metadevice
  39  *    init_col_nextio() - normal backend when zeroing column of RAID metadevice.
  40  *       init_col_int() - I/O interrupt while zeroing column of RAID metadevice.
  41  *  raid_init_columns() - Zero one or more columns of a RAID metadevice.
  42  *           raid_set() - used to create a RAID metadevice
  43  *           raid_get() - used to get the unit structure of a RAID metadevice
  44  *       raid_replace() - used to replace a component of a RAID metadevice
  45  *          raid_grow() - Concatenate to a RAID metadevice
  46  *        raid_change() - change dynamic values of a RAID metadevice
  47  *         raid_reset() - used to reset (clear / remove) a RAID metadevice
  48  *      raid_get_geom() - used to get the geometry of a RAID metadevice
  49  *      raid_get_vtoc() - used to get the VTOC on a RAID metadevice
  50  *      raid_set_vtoc() - used to set the VTOC on a RAID metadevice
  51  *      raid_get_extvtoc() - used to get the extended VTOC on a RAID metadevice
  52  *      raid_set_extvtoc() - used to set the extended VTOC on a RAID metadevice
  53  *       raid_getdevs() - return all devices within a RAID metadevice
  54  *   raid_admin_ioctl() - IOCTL operations unique to metadevices and RAID
  55  */
  56
  57
  58 #include <sys/param.h>
  59 #include <sys/systm.h>
  60 #include <sys/conf.h>
  61 #include <sys/file.h>
  62 #include <sys/user.h>
  63 #include <sys/uio.h>
  64 #include <sys/t_lock.h>
  65 #include <sys/buf.h>
  66 #include <sys/dkio.h>
  67 #include <sys/vtoc.h>
  68 #include <sys/kmem.h>
  69 #include <vm/page.h>
  70 #include <sys/sysmacros.h>
  71 #include <sys/types.h>
  72 #include <sys/mkdev.h>
  73 #include <sys/stat.h>
  74 #include <sys/open.h>
  75 #include <sys/disp.h>
  76 #include <sys/modctl.h>
  77 #include <sys/ddi.h>
  78 #include <sys/sunddi.h>
  79 #include <sys/cred.h>
  80 #include <sys/lvm/mdvar.h>
  81 #include <sys/lvm/md_names.h>
  82 #include <sys/lvm/md_mddb.h>
  83 #include <sys/lvm/md_raid.h>
  84 #include <sys/lvm/md_convert.h>
  85
  86 #include <sys/sysevent/eventdefs.h>
  87 #include <sys/sysevent/svm.h>
  88
  89 extern int              md_status;
  90 extern unit_t           md_nunits;
  91 extern set_t            md_nsets;
  92 extern md_set_t         md_set[];
  93 extern md_ops_t         raid_md_ops;
  94 extern major_t          md_major;
  95 extern md_krwlock_t     md_unit_array_rw;
  96 extern mdq_anchor_t     md_done_daemon;
  97 extern mdq_anchor_t     md_ff_daemonq;
  98 extern  int             mdopen();
  99 extern  int             mdclose();
 100 extern  void            md_probe_one(probe_req_t *);
 101 extern int              md_init_probereq(md_probedev_impl_t *,
 102                                 daemon_queue_t **);
 103 extern md_resync_t      md_cpr_resync;
 104
 105
 106 extern void dump_mr_unit(mr_unit_t *);
 107
 108 typedef struct raid_ci {
 109         DAEMON_QUEUE
 110         struct raid_ci  *ci_next;
 111         mr_unit_t       *ci_un;
 112         int             ci_col;
 113         int             ci_err;
 114         int             ci_flag;
 115         size_t          ci_zerosize;
 116         diskaddr_t      ci_blkno;
 117         diskaddr_t      ci_lastblk;
 118         buf_t           ci_buf;
 119 } raid_ci_t;
 120 /* values for the ci_flag */
 121 #define COL_INITING     (0x0001)
 122 #define COL_INIT_DONE   (0x0002)
 123 #define COL_READY       (0x0004)
 124
 125 /*
 126  * NAME:        raid_getun
 127  * DESCRIPTION: performs a lot of unit checking on a RAID metadevice
 128  * PARAMETERS:  minor_t       mnum - minor device number for RAID unit
 129  *              md_error_t    *mde - pointer to error reporting structure
 130  *              int          flags - pointer to error reporting structure
 131  *                                      STALE_OK - allow stale MD memory
 132  *                                        NO_OLD - unit must not exist
 133  *                                       NO_LOCK - no IOCTL lock needed
 134  *                                       WR_LOCK - write IOCTL lock needed
 135  *                                       RD_LOCK - read IOCTL lock needed
 136  *              IOLOCK       *lock - pointer to IOCTL lock
 137  *
 138  * LOCKS:       obtains unit reader or writer lock via IOLOCK
 139  *
 140  */
 141 static mr_unit_t *
 142 raid_getun(minor_t mnum, md_error_t *mde, int flags, IOLOCK *lock)
 143 {
 144         mr_unit_t       *un;
 145         mdi_unit_t      *ui;
 146         set_t           setno = MD_MIN2SET(mnum);
 147
 148         if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits)) {
 149                 (void) mdmderror(mde, MDE_INVAL_UNIT, mnum);
 150                 return (NULL);
 151         }
 152
 153         if (!(flags & STALE_OK)) {
 154                 if (md_get_setstatus(setno) & MD_SET_STALE) {
 155                         (void) mdmddberror(mde, MDE_DB_STALE, mnum, setno);
 156                         return (NULL);
 157                 }
 158         }
 159
 160         ui = MDI_UNIT(mnum);
 161         if (flags & NO_OLD) {
 162                 if (ui != NULL) {
 163                         (void) mdmderror(mde, MDE_UNIT_ALREADY_SETUP, mnum);
 164                         return (NULL);
 165                 }
 166                 return ((mr_unit_t *)1);
 167         }
 168
 169         if (ui == NULL) {
 170                 (void) mdmderror(mde, MDE_UNIT_NOT_SETUP, mnum);
 171                 return (NULL);
 172         }
 173         if (flags & ARRAY_WRITER)
 174                 md_array_writer(lock);
 175         else if (flags & ARRAY_READER)
 176                 md_array_reader(lock);
 177
 178         if (!(flags & NO_LOCK)) {
 179                 if (flags & WR_LOCK) {
 180                         (void) md_ioctl_io_lock(lock, ui);
 181                         (void) md_ioctl_writerlock(lock, ui);
 182                 } else /* RD_LOCK */
 183                         (void) md_ioctl_readerlock(lock, ui);
 184         }
 185         un = (mr_unit_t *)MD_UNIT(mnum);
 186
 187         if (un->c.un_type != MD_METARAID) {
 188                 (void) mdmderror(mde, MDE_NOT_RAID, mnum);
 189                 return (NULL);
 190         }
 191
 192         return (un);
 193 }
 194
 195
 196 /*
 197  * NAME:        raid_commit
 198  * DESCRIPTION: commits MD database updates for a RAID metadevice
 199  * PARAMETERS:  mr_unit_t        *un - RAID unit to update in the MD database
 200  *              mddb_recid_t *extras - array of other record IDs to update
 201  *
 202  * LOCKS:       assumes caller holds unit writer lock
 203  *
 204  */
 205 void
 206 raid_commit(mr_unit_t *un, mddb_recid_t *extras)
 207 {
 208         mddb_recid_t    *recids;
 209         int             ri = 0;
 210         int             nrecids = 0;
 211
 212         if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
 213                 return;
 214
 215         /* Count the extra recids */
 216         if (extras != NULL) {
 217                 while (extras[nrecids] != 0) {
 218                         nrecids++;
 219                 }
 220         }
 221
 222         /*
 223          * Allocate space for two recids in addition to the extras:
 224          * one for the unit structure, one for the null terminator.
 225          */
 226         nrecids += 2;
 227         recids = (mddb_recid_t *)
 228             kmem_zalloc(nrecids * sizeof (mddb_recid_t), KM_SLEEP);
 229
 230         if (un != NULL) {
 231                 ASSERT(MDI_UNIT(MD_SID(un)) ? UNIT_WRITER_HELD(un) : 1);
 232                 recids[ri++] = un->c.un_record_id;
 233         }
 234
 235         if (extras != NULL) {
 236                 while (*extras != 0) {
 237                         recids[ri++] = *extras;
 238                         extras++;
 239                 }
 240         }
 241
 242         if (ri > 0) {
 243                 mddb_commitrecs_wrapper(recids);
 244         }
 245
 246         kmem_free(recids, nrecids * sizeof (mddb_recid_t));
 247 }
 248
 249 static int
 250 raid_check_pw(mr_unit_t *un)
 251 {
 252         buf_t           bp;
 253         char            *buf;
 254         mr_column_t     *colptr;
 255         minor_t         mnum = MD_SID(un);
 256         int             i;
 257         int             err = 0;
 258         minor_t         unit;
 259
 260         buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP);
 261
 262         for (i = 0; i < un->un_totalcolumncnt; i++) {
 263                 md_dev64_t tmpdev;
 264
 265                 colptr = &un->un_column[i];
 266
 267                 tmpdev = colptr->un_dev;
 268                 /*
 269                  * Open by device id
 270                  * If this device is hotspared
 271                  * use the hotspare key
 272                  */
 273                 tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ?
 274                     colptr->un_hs_key : colptr->un_orig_key);
 275                 if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL)) {
 276                         colptr->un_dev = tmpdev;
 277                         return (1);
 278                 }
 279                 colptr->un_dev = tmpdev;
 280
 281                 bzero((caddr_t)&bp, sizeof (buf_t));
 282                 bp.b_back = &bp;
 283                 bp.b_forw = &bp;
 284                 bp.b_flags = B_READ | B_BUSY;
 285                 sema_init(&bp.b_io, 0, NULL,
 286                     SEMA_DEFAULT, NULL);
 287                 sema_init(&bp.b_sem, 0, NULL,
 288                     SEMA_DEFAULT, NULL);
 289                 bp.b_edev = md_dev64_to_dev(colptr->un_dev);
 290                 bp.b_lblkno = colptr->un_pwstart;
 291                 bp.b_bcount = DEV_BSIZE;
 292                 bp.b_bufsize = DEV_BSIZE;
 293                 bp.b_un.b_addr = (caddr_t)buf;
 294                 bp.b_offset = -1;
 295                 (void) md_call_strategy(&bp, 0, NULL);
 296                 if (biowait(&bp))
 297                         err = 1;
 298                 if (i == 0) {
 299                         if (un->c.un_revision & MD_64BIT_META_DEV) {
 300                                 unit = ((raid_pwhdr_t *)buf)->rpw_unit;
 301                         } else {
 302                                 unit = ((raid_pwhdr32_od_t *)buf)->rpw_unit;
 303                         }
 304                 }
 305                 /*
 306                  * depending upon being an 64bit or 32 bit raid, the
 307                  * pre write headers have different layout
 308                  */
 309                 if (un->c.un_revision & MD_64BIT_META_DEV) {
 310                         if ((((raid_pwhdr_t *)buf)->rpw_column != i) ||
 311                             (((raid_pwhdr_t *)buf)->rpw_unit != unit))
 312                                 err = 1;
 313                 } else {
 314                         if ((((raid_pwhdr32_od_t *)buf)->rpw_column != i) ||
 315                             (((raid_pwhdr32_od_t *)buf)->rpw_unit != unit))
 316                                 err = 1;
 317                 }
 318                 md_layered_close(colptr->un_dev, MD_OFLG_NULL);
 319                 if (err)
 320                         break;
 321         }
 322         kmem_free(buf, DEV_BSIZE);
 323         return (err);
 324 }
 325
 326 /*
 327  * NAME:        init_col_nextio
 328  * DESCRIPTION: normal backend process when zeroing column of a RAID metadevice.
 329  * PARAMETERS:  raid_ci_t *cur - struct for column being zeroed
 330  *
 331  * LOCKS:       assumes caller holds unit reader lock,
 332  *              preiodically releases and reacquires unit reader lock,
 333  *              broadcasts on unit conditional variable (un_cv)
 334  *
 335  */
 336 #define INIT_RLS_CNT    10
 337 static void
 338 init_col_nextio(raid_ci_t *cur)
 339 {
 340         mr_unit_t       *un;
 341
 342         un = cur->ci_un;
 343
 344         cur->ci_blkno += cur->ci_zerosize;
 345
 346         mutex_enter(&un->un_mx);
 347         /* ===> update un_percent_done */
 348         un->un_init_iocnt += btodb(cur->ci_buf.b_bcount);
 349         mutex_exit(&un->un_mx);
 350
 351         /*
 352          * When gorwing a device, normal I/O is still going on.
 353          * The init thread still holds the unit reader lock which
 354          * prevents I/O from doing state changes.
 355          * So every INIT_RLS_CNT init I/Os, we will release the
 356          * unit reader lock.
 357          *
 358          * CAVEAT:
 359          * We know we are in the middle of a grow operation and the
 360          * unit cannot be grown or removed (through reset or halt)
 361          * so the mr_unit_t structure will not move or disappear.
 362          * In addition, we know that only one of the init I/Os
 363          * can be in col_init_nextio at a time because they are
 364          * placed on the md_done_daemon queue and md only processes
 365          * one element of this queue at a time. In addition, any
 366          * code that needs to acquire the unit writer lock to change
 367          * state is supposed to be on the md_mstr_daemon queue so
 368          * it can be processing while we sit here waiting to get the
 369          * unit reader lock back.
 370          */
 371
 372         if (cur->ci_blkno < cur->ci_lastblk) {
 373                 /* truncate last chunk to end_addr if needed */
 374                 if (cur->ci_blkno + cur->ci_zerosize > cur->ci_lastblk) {
 375                         cur->ci_zerosize = (size_t)
 376                             (cur->ci_lastblk - cur->ci_blkno);
 377                 }
 378
 379                 /* set address and length for I/O bufs */
 380                 cur->ci_buf.b_bufsize = dbtob(cur->ci_zerosize);
 381                 cur->ci_buf.b_bcount = dbtob(cur->ci_zerosize);
 382                 cur->ci_buf.b_lblkno = cur->ci_blkno;
 383
 384                 (void) md_call_strategy(&cur->ci_buf, MD_STR_NOTTOP, NULL);
 385                 return;
 386         }
 387         /* finished initializing this column */
 388         mutex_enter(&un->un_mx);
 389         cur->ci_flag = COL_INIT_DONE;
 390         uniqtime32(&un->un_column[cur->ci_col].un_devtimestamp);
 391         mutex_exit(&un->un_mx);
 392         cv_broadcast(&un->un_cv);
 393 }
 394
 395 /*
 396  * NAME:        init_col_int
 397  * DESCRIPTION: I/O interrupt while zeroing column of a RAID metadevice.
 398  * PARAMETERS:  buf_t     *cb - I/O buffer for which interrupt occurred
 399  *
 400  * LOCKS:       assumes caller holds unit reader or writer lock
 401  *
 402  */
 403 static int
 404 init_col_int(buf_t *cb)
 405 {
 406         raid_ci_t       *cur;
 407
 408         cur = (raid_ci_t *)cb->b_chain;
 409         if (cb->b_flags & B_ERROR) {
 410                 mutex_enter(&cur->ci_un->un_mx);
 411                 cur->ci_err = EIO;
 412                 mutex_exit(&cur->ci_un->un_mx);
 413                 cv_broadcast(&cur->ci_un->un_cv);
 414                 return (1);
 415         }
 416         daemon_request(&md_done_daemon, init_col_nextio,
 417             (daemon_queue_t *)cur, REQ_OLD);
 418         return (1);
 419 }
 420
 421 /*
 422  * NAME:        raid_init_columns
 423  * DESCRIPTION: Zero one or more columns of a RAID metadevice.
 424  * PARAMETERS:  minor_t  mnum - RAID unit minor identifier
 425  *
 426  * LOCKS:       obtains and releases unit reader lock,
 427  *              obtains and releases unit writer lock,
 428  *              obtains and releases md_unit_array_rw write lock,
 429  *              obtains and releases unit mutex (un_mx) lock,
 430  *              waits on unit conditional variable (un_cv)
 431  *
 432  */
 433 static void
 434 raid_init_columns(minor_t mnum)
 435 {
 436         mr_unit_t       *un;
 437         mdi_unit_t      *ui;
 438         raid_ci_t       *ci_chain = NULL, *cur;
 439         rus_state_t     state;
 440         caddr_t         zero_addr;
 441         diskaddr_t      end_off;
 442         size_t          zerosize;
 443         int             err = 0;
 444         int             ix;
 445         int             colcnt = 0;
 446         int             col;
 447         set_t           setno = MD_MIN2SET(mnum);
 448
 449         /*
 450          * Increment the raid resync count for cpr
 451          */
 452         mutex_enter(&md_cpr_resync.md_resync_mutex);
 453         md_cpr_resync.md_raid_resync++;
 454         mutex_exit(&md_cpr_resync.md_resync_mutex);
 455
 456         /*
 457          * initialization is a multiple step process.  The first step
 458          * is to go through the unit structure and start each device
 459          * in the init state writing zeros over the component.
 460          * Next initialize the prewrite areas, so the device can be
 461          * used if a metainit -k is done.  Now close the componenets.
 462          *
 463          * Once this complete set the state of each component being
 464          * zeroed and set the correct state for the unit.
 465          *
 466          * last commit the records.
 467          */
 468
 469         ui = MDI_UNIT(mnum);
 470         un = md_unit_readerlock(ui);
 471
 472         /* check for active init on this column */
 473         /* exiting is cpr safe */
 474         if ((un->un_init_colcnt > 0) && (un->un_resync_index != -1)) {
 475                 md_unit_readerexit(ui);
 476                 (void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
 477                 /*
 478                  * Decrement the raid resync count for cpr
 479                  */
 480                 mutex_enter(&md_cpr_resync.md_resync_mutex);
 481                 md_cpr_resync.md_raid_resync--;
 482                 mutex_exit(&md_cpr_resync.md_resync_mutex);
 483                 thread_exit();
 484         }
 485
 486         SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_START, SVM_TAG_METADEVICE, setno,
 487             MD_SID(un));
 488         un->un_init_colcnt = 0;
 489         un->un_init_iocnt = 0;
 490         end_off = un->un_pwsize + (un->un_segsize * un->un_segsincolumn);
 491         zerosize = (size_t)MIN((diskaddr_t)un->un_maxio, end_off);
 492
 493         /* allocate zero-filled buffer */
 494         zero_addr = kmem_zalloc(dbtob(zerosize), KM_SLEEP);
 495
 496         for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
 497                 if (un->un_column[ix].un_devstate != RCS_INIT)
 498                         continue;
 499                 /* allocate new column init structure */
 500                 cur = (raid_ci_t *)kmem_zalloc((sizeof (raid_ci_t)), KM_SLEEP);
 501                 ASSERT(cur != NULL);
 502                 un->un_init_colcnt++;
 503                 cur->ci_next = ci_chain;
 504                 ci_chain = cur;
 505                 cur->ci_un = un;
 506                 cur->ci_col = ix;
 507                 cur->ci_err = 0;
 508                 cur->ci_flag = COL_INITING;
 509                 cur->ci_zerosize = zerosize;
 510                 cur->ci_blkno = un->un_column[ix].un_pwstart;
 511                 cur->ci_lastblk = cur->ci_blkno + un->un_pwsize
 512                     + (un->un_segsize * un->un_segsincolumn);
 513                 /* initialize static buf fields */
 514                 cur->ci_buf.b_un.b_addr = zero_addr;
 515                 cur->ci_buf.b_chain = (buf_t *)cur;
 516                 cur->ci_buf.b_back = &cur->ci_buf;
 517                 cur->ci_buf.b_forw = &cur->ci_buf;
 518                 cur->ci_buf.b_iodone = init_col_int;
 519                 cur->ci_buf.b_flags = B_BUSY | B_WRITE;
 520                 cur->ci_buf.b_edev = md_dev64_to_dev(un->un_column[ix].un_dev);
 521                 sema_init(&cur->ci_buf.b_io, 0, NULL, SEMA_DEFAULT, NULL);
 522                 sema_init(&cur->ci_buf.b_sem, 0, NULL, SEMA_DEFAULT, NULL);
 523                 /* set address and length for I/O bufs */
 524                 cur->ci_buf.b_bufsize = dbtob(zerosize);
 525                 cur->ci_buf.b_bcount = dbtob(zerosize);
 526                 cur->ci_buf.b_lblkno = un->un_column[ix].un_pwstart;
 527                 cur->ci_buf.b_offset = -1;
 528
 529                 if (! (un->un_column[ix].un_devflags & MD_RAID_DEV_ISOPEN)) {
 530                         md_dev64_t tmpdev = un->un_column[ix].un_dev;
 531                         /*
 532                          * Open by device id
 533                          * If this column is hotspared then
 534                          * use the hotspare key
 535                          */
 536                         tmpdev = md_resolve_bydevid(mnum, tmpdev,
 537                             HOTSPARED(un, ix) ?
 538                             un->un_column[ix].un_hs_key :
 539                             un->un_column[ix].un_orig_key);
 540                         if ((cur->ci_err = md_layered_open(mnum, &tmpdev,
 541                             MD_OFLG_NULL)) == 0)
 542                                 un->un_column[ix].un_devflags |=
 543                                     MD_RAID_DEV_ISOPEN;
 544                         un->un_column[ix].un_dev = tmpdev;
 545                 }
 546                 if (cur->ci_err == 0)
 547                         md_call_strategy(&cur->ci_buf, MD_STR_NOTTOP, NULL);
 548         }
 549
 550         md_unit_readerexit(ui);
 551         state = un->un_state;
 552         colcnt = un->un_init_colcnt;
 553         mutex_enter(&un->un_mx);
 554         while (colcnt) {
 555                 cv_wait(&un->un_cv, &un->un_mx);
 556
 557                 colcnt = 0;
 558                 for (cur = ci_chain; cur != NULL; cur = cur->ci_next) {
 559                         col = cur->ci_col;
 560                         if ((cur->ci_flag != COL_INITING) || (cur->ci_err)) {
 561                                 if (cur->ci_err)
 562                                         err = cur->ci_err;
 563                                 else if (cur->ci_flag == COL_INIT_DONE) {
 564                                         (void) init_pw_area(un,
 565                                             un->un_column[col].un_dev,
 566                                             un->un_column[col].un_pwstart,
 567                                             col);
 568                                         cur->ci_flag = COL_READY;
 569                                 }
 570                         } else {
 571                                 colcnt++;
 572                         }
 573                 }
 574         }
 575         mutex_exit(&un->un_mx);
 576
 577         /* This prevents new opens */
 578         rw_enter(&md_unit_array_rw.lock, RW_WRITER);
 579         (void) md_io_writerlock(ui);
 580         un = (mr_unit_t *)md_unit_writerlock(ui);
 581         while (ci_chain) {
 582                 cur = ci_chain;
 583
 584                 /* take this element out of the chain */
 585                 ci_chain = cur->ci_next;
 586                 /* free this element */
 587                 sema_destroy(&cur->ci_buf.b_io);
 588                 sema_destroy(&cur->ci_buf.b_sem);
 589                 if (cur->ci_err)
 590                         raid_set_state(cur->ci_un, cur->ci_col,
 591                             RCS_INIT_ERRED, 0);
 592                 else
 593                         raid_set_state(cur->ci_un, cur->ci_col,
 594                             RCS_OKAY, 0);
 595                 kmem_free(cur, sizeof (raid_ci_t));
 596         }
 597
 598         /* free the zeroed buffer */
 599         kmem_free(zero_addr, dbtob(zerosize));
 600
 601         /* determine new unit state */
 602         if (err == 0) {
 603                 if (state == RUS_INIT)
 604                         un->un_state = RUS_OKAY;
 605                 else {
 606                         un->c.un_total_blocks = un->un_grow_tb;
 607                         md_nblocks_set(mnum, un->c.un_total_blocks);
 608                         un->un_grow_tb = 0;
 609                         if (raid_state_cnt(un, RCS_OKAY) ==
 610                             un->un_totalcolumncnt)
 611                                 un->un_state = RUS_OKAY;
 612                 }
 613         } else {  /* error orcurred */
 614                 if (state & RUS_INIT)
 615                         un->un_state = RUS_DOI;
 616         }
 617         uniqtime32(&un->un_timestamp);
 618         MD_STATUS(un) &= ~MD_UN_GROW_PENDING;
 619         un->un_init_colcnt = 0;
 620         un->un_init_iocnt = 0;
 621         raid_commit(un, NULL);
 622         md_unit_writerexit(ui);
 623         (void) md_io_writerexit(ui);
 624         rw_exit(&md_unit_array_rw.lock);
 625         if (err) {
 626                 if (un->un_state & RUS_DOI) {
 627                         SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FATAL,
 628                             SVM_TAG_METADEVICE, setno, MD_SID(un));
 629                 } else {
 630                         SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FAILED,
 631                             SVM_TAG_METADEVICE, setno, MD_SID(un));
 632                 }
 633         } else {
 634                 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_SUCCESS,
 635                     SVM_TAG_METADEVICE, setno, MD_SID(un));
 636         }
 637         (void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
 638         /*
 639          * Decrement the raid resync count for cpr
 640          */
 641         mutex_enter(&md_cpr_resync.md_resync_mutex);
 642         md_cpr_resync.md_raid_resync--;
 643         mutex_exit(&md_cpr_resync.md_resync_mutex);
 644         thread_exit();
 645         /*NOTREACHED*/
 646 }
 647
 648 static int
 649 raid_init_unit(minor_t mnum, md_error_t *ep)
 650 {
 651         mdi_unit_t      *ui;
 652         mr_unit_t       *un;
 653         int             rval, i;
 654         set_t           setno = MD_MIN2SET(mnum);
 655
 656         ui = MDI_UNIT(mnum);
 657         if (md_get_setstatus(setno) & MD_SET_STALE)
 658                 return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));
 659
 660         /* Don't start an init if the device is not available */
 661         if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) {
 662                 return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
 663         }
 664
 665         if (raid_internal_open(mnum, (FREAD | FWRITE),
 666             OTYP_LYR, MD_OFLG_ISINIT)) {
 667                 rval = mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum);
 668                 goto out;
 669         }
 670
 671         un = md_unit_readerlock(ui);
 672         un->un_percent_done = 0;
 673         md_unit_readerexit(ui);
 674         /* start resync_unit thread */
 675         (void) thread_create(NULL, 0, raid_init_columns,
 676             (void *)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);
 677
 678         return (0);
 679
 680 out:
 681         un = md_unit_writerlock(ui);
 682         MD_STATUS(un) &= ~MD_UN_GROW_PENDING;
 683         /* recover state */
 684         for (i = 0; i < un->un_totalcolumncnt; i++)
 685                 if (COLUMN_STATE(un, i) == RCS_INIT)
 686                         raid_set_state(un, i, RCS_ERRED, 0);
 687         if (un->un_state & RUS_INIT)
 688                 un->un_state = RUS_DOI;
 689         raid_commit(un, NULL);
 690         md_unit_writerexit(ui);
 691         if (un->un_state & RUS_DOI) {
 692                 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FATAL,
 693                     SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
 694         } else {
 695                 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FAILED,
 696                     SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
 697         }
 698         return (rval);
 699 }
 700
 701 /*
 702  * NAME:        raid_regen
 703  *
 704  * DESCRIPTION: regenerate all the parity on the raid device.  This
 705  *              routine starts a thread that will regenerate the
 706  *              parity on a raid device.  If an I/O error occurs during
 707  *              this process the entire device is placed in error.
 708  *
 709  * PARAMETERS:  md_set_params_t *msp - ioctl packet
 710  */
 711 static void
 712 regen_unit(minor_t mnum)
 713 {
 714         mdi_unit_t      *ui = MDI_UNIT(mnum);
 715         mr_unit_t       *un = MD_UNIT(mnum);
 716         buf_t           buf, *bp;
 717         caddr_t         buffer;
 718         int             err = 0;
 719         diskaddr_t      total_segments;
 720         diskaddr_t      line;
 721         size_t          iosize;
 722
 723         /*
 724          * Increment raid resync count for cpr
 725          */
 726         mutex_enter(&md_cpr_resync.md_resync_mutex);
 727         md_cpr_resync.md_raid_resync++;
 728         mutex_exit(&md_cpr_resync.md_resync_mutex);
 729
 730         iosize = dbtob(un->un_segsize);
 731         buffer = kmem_alloc(iosize, KM_SLEEP);
 732         bp = &buf;
 733         total_segments = un->un_segsincolumn;
 734         SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_START, SVM_TAG_METADEVICE,
 735             MD_UN2SET(un), MD_SID(un));
 736         un->un_percent_done = 0;
 737         init_buf(bp, B_READ | B_BUSY, iosize);
 738
 739         for (line = 0; line < total_segments; line++) {
 740                 bp->b_lblkno = line *
 741                     ((un->un_origcolumncnt - 1) * un->un_segsize);
 742                 bp->b_un.b_addr = buffer;
 743                 bp->b_bcount = iosize;
 744                 bp->b_iodone = NULL;
 745                 /*
 746                  * The following assignment is only correct because
 747                  * md_raid_strategy is fine when it's only a minor number
 748                  * and not a real dev_t. Yuck.
 749                  */
 750                 bp->b_edev = mnum;
 751                 md_raid_strategy(bp, MD_STR_NOTTOP, NULL);
 752                 if (biowait(bp)) {
 753                         err = 1;
 754                         break;
 755                 }
 756                 un->un_percent_done = (uint_t)((line * 1000) /
 757                     un->un_segsincolumn);
 758                 /* just to avoid rounding errors */
 759                 if (un->un_percent_done > 1000)
 760                         un->un_percent_done = 1000;
 761                 reset_buf(bp, B_READ | B_BUSY, iosize);
 762         }
 763         destroy_buf(bp);
 764         kmem_free(buffer, iosize);
 765
 766         (void) md_io_writerlock(ui);
 767         (void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
 768         (void) md_io_writerexit(ui);
 769         un = md_unit_writerlock(ui);
 770         if (!err &&
 771             (raid_state_cnt(un, RCS_OKAY) == un->un_totalcolumncnt))
 772                         un->un_state = RUS_OKAY;
 773         raid_commit(un, NULL);
 774         md_unit_writerexit(ui);
 775         if (err ||
 776             raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) {
 777                 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_FAILED,
 778                     SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
 779         } else {
 780                 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_DONE, SVM_TAG_METADEVICE,
 781                     MD_UN2SET(un), MD_SID(un));
 782         }
 783
 784         /*
 785          * Decrement the raid resync count for cpr
 786          */
 787         mutex_enter(&md_cpr_resync.md_resync_mutex);
 788         md_cpr_resync.md_raid_resync--;
 789         mutex_exit(&md_cpr_resync.md_resync_mutex);
 790         thread_exit();
 791 }
 792
 793 static int
 794 raid_regen_unit(minor_t mnum, md_error_t *ep)
 795 {
 796         mdi_unit_t      *ui;
 797         mr_unit_t       *un;
 798         int             i;
 799         set_t           setno = MD_MIN2SET(mnum);
 800
 801         ui = MDI_UNIT(mnum);
 802         un = (mr_unit_t *)MD_UNIT(mnum);
 803
 804         if (md_get_setstatus(setno) & MD_SET_STALE)
 805                 return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));
 806
 807         /* Don't start a regen if the device is not available */
 808         if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) {
 809                 return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
 810         }
 811
 812         if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0)) {
 813                 (void) md_unit_writerlock(ui);
 814                 for (i = 0; i < un->un_totalcolumncnt; i++)
 815                         raid_set_state(un, i, RCS_ERRED, 0);
 816                 md_unit_writerexit(ui);
 817                 return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
 818         }
 819
 820         /* start resync_unit thread */
 821         (void) thread_create(NULL, 0, regen_unit,
 822             (void *)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);
 823
 824         return (0);
 825 }
 826
 827 static int
 828 raid_regen(md_regen_param_t *mrp, IOLOCK *lock)
 829 {
 830         minor_t         mnum = mrp->mnum;
 831         mr_unit_t       *un;
 832
 833         mdclrerror(&mrp->mde);
 834
 835         un = md_unit_readerlock(MDI_UNIT(mnum));
 836
 837         if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
 838                 md_unit_readerexit(MDI_UNIT(mnum));
 839                 return (mdmderror(&mrp->mde, MDE_IN_USE, mnum));
 840         }
 841
 842         if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
 843             (raid_state_cnt(un, RCS_RESYNC))) {
 844                 md_unit_readerexit(MDI_UNIT(mnum));
 845                 return (mdmderror(&mrp->mde, MDE_RESYNC_ACTIVE, mnum));
 846         }
 847
 848         if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT)) {
 849                 md_unit_readerexit(MDI_UNIT(mnum));
 850                 return (mdmderror(&mrp->mde, MDE_IN_USE, mnum));
 851         }
 852
 853         if ((raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) ||
 854             (! (un->un_state & RUS_OKAY))) {
 855                 md_unit_readerexit(MDI_UNIT(mnum));
 856                 return (mdmderror(&mrp->mde, MDE_RAID_NOT_OKAY, mnum));
 857         }
 858
 859         md_unit_readerexit(MDI_UNIT(mnum));
 860
 861         /* get locks and recheck to be sure something did not change */
 862         if ((un = raid_getun(mnum, &mrp->mde, WRITERS, lock)) == NULL)
 863                 return (0);
 864
 865         if ((raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) ||
 866             (! (un->un_state & RUS_OKAY))) {
 867                 return (mdmderror(&mrp->mde, MDE_RAID_NOT_OKAY, mnum));
 868         }
 869
 870         raid_set_state(un, 0, RCS_REGEN, 0);
 871         raid_commit(un, NULL);
 872         md_ioctl_droplocks(lock);
 873         return (raid_regen_unit(mnum, &mrp->mde));
 874 }
 875
 876 /*
 877  * NAME:        raid_set
 878  * DESCRIPTION: used to create a RAID metadevice
 879  * PARAMETERS:  md_set_params_t *d   - pointer to set data structure
 880  *              int             mode - must be FWRITE
 881  *
 882  * LOCKS:       none
 883  *
 884  */
 885 static int
 886 raid_set(void   *d, int mode)
 887 {
 888         minor_t         mnum;
 889         mr_unit_t       *un;
 890         mddb_recid_t    mr_recid;
 891         mddb_recid_t    *recids;
 892         mddb_type_t     typ1;
 893         int             err;
 894         set_t           setno;
 895         int             num_recs;
 896         int             rid;
 897         int             col;
 898         md_set_params_t *msp = d;
 899
 900
 901         mnum = msp->mnum;
 902         setno = MD_MIN2SET(mnum);
 903
 904         mdclrerror(&msp->mde);
 905
 906         if (raid_getun(mnum, &msp->mde, NO_OLD, NULL) == NULL)
 907                 return (0);
 908
 909         typ1 = (mddb_type_t)md_getshared_key(setno,
 910             raid_md_ops.md_driver.md_drivername);
 911
 912         /* create the db record for this mdstruct */
 913
 914         if (msp->options & MD_CRO_64BIT) {
 915 #if defined(_ILP32)
 916                 return (mdmderror(&msp->mde, MDE_UNIT_TOO_LARGE, mnum));
 917 #else
 918                 mr_recid = mddb_createrec(msp->size, typ1, 0,
 919                     MD_CRO_64BIT | MD_CRO_RAID | MD_CRO_FN, setno);
 920 #endif
 921         } else {
 922                 mr_recid = mddb_createrec(msp->size, typ1, 0,
 923                     MD_CRO_32BIT | MD_CRO_RAID | MD_CRO_FN, setno);
 924         }
 925
 926         if (mr_recid < 0)
 927                 return (mddbstatus2error(&msp->mde,
 928                     (int)mr_recid, mnum, setno));
 929
 930         /* get the address of the mdstruct */
 931         un = (mr_unit_t *)mddb_getrecaddr(mr_recid);
 932         /*
 933          * It is okay that we muck with the mdstruct here,
 934          * since no one else will know about the mdstruct
 935          * until we commit it. If we crash, the record will
 936          * be automatically purged, since we haven't
 937          * committed it yet.
 938          */
 939
 940         /* copy in the user's mdstruct */
 941         if (err = ddi_copyin((caddr_t)(uintptr_t)msp->mdp, un,
 942             msp->size, mode)) {
 943                 mddb_deleterec_wrapper(mr_recid);
 944                 return (EFAULT);
 945         }
 946         /* All 64 bit metadevices only support EFI labels. */
 947         if (msp->options & MD_CRO_64BIT) {
 948                 un->c.un_flag |= MD_EFILABEL;
 949         }
 950
 951         /*
 952          * allocate the real recids array.  since we may have to commit
 953          * underlying metadevice records, we need an array of size:
 954          * total number of components in raid + 3 (1 for the raid itself,
 955          * one for the hotspare, one for the end marker).
 956          */
 957         num_recs = un->un_totalcolumncnt + 3;
 958         rid = 0;
 959         recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP);
 960         recids[rid++] = mr_recid;
 961
 962         MD_SID(un) = mnum;
 963         MD_RECID(un) = recids[0];
 964         MD_CAPAB(un) = MD_CAN_PARENT | MD_CAN_SP;
 965         MD_PARENT(un) = MD_NO_PARENT;
 966         un->un_resync_copysize = 0;
 967         un->c.un_revision |= MD_FN_META_DEV;
 968
 969         if (UNIT_STATE(un) == RUS_INIT)
 970                 MD_STATUS(un) |= MD_UN_GROW_PENDING;
 971
 972         if ((UNIT_STATE(un) != RUS_INIT) && raid_check_pw(un)) {
 973                 mddb_deleterec_wrapper(mr_recid);
 974                 err = mderror(&msp->mde, MDE_RAID_INVALID);
 975                 goto out;
 976         }
 977
 978         if (err = raid_build_incore(un, 0)) {
 979                 if (un->mr_ic) {
 980                         kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) *
 981                             un->un_totalcolumncnt);
 982                         kmem_free(un->mr_ic, sizeof (*un->mr_ic));
 983                 }
 984
 985                 md_nblocks_set(mnum, -1ULL);
 986                 MD_UNIT(mnum) = NULL;
 987
 988                 mddb_deleterec_wrapper(mr_recid);
 989                 goto out;
 990         }
 991
 992         /*
 993          * Update unit availability
 994          */
 995         md_set[setno].s_un_avail--;
 996
 997         recids[rid] = 0;
 998         if (un->un_hsp_id != -1) {
 999                 /* increment the reference count of the hot spare pool */
1000                 err = md_hot_spare_ifc(HSP_INCREF, un->un_hsp_id, 0, 0,
1001                     &recids[rid], NULL, NULL, NULL);
1002                 if (err) {
1003                         md_nblocks_set(mnum, -1ULL);
1004                         MD_UNIT(mnum) = NULL;
1005
1006                         mddb_deleterec_wrapper(mr_recid);
1007                         goto out;
1008                 }
1009                 rid++;
1010         }
1011
1012         /*
1013          * set the parent on any metadevice components.
1014          * NOTE: currently soft partitions are the only metadevices
1015          * which can appear within a RAID metadevice.
1016          */
1017         for (col = 0; col < un->un_totalcolumncnt; col++) {
1018                 mr_column_t     *mr_col = &un->un_column[col];
1019                 md_unit_t       *comp_un;
1020
1021                 if (md_getmajor(mr_col->un_dev) == md_major) {
1022                         comp_un = MD_UNIT(md_getminor(mr_col->un_dev));
1023                         recids[rid++] = MD_RECID(comp_un);
1024                         md_set_parent(mr_col->un_dev, MD_SID(un));
1025                 }
1026         }
1027
1028         /* set the end marker */
1029         recids[rid] = 0;
1030
1031         mddb_commitrecs_wrapper(recids);
1032         md_create_unit_incore(mnum, &raid_md_ops, 1);
1033
1034         SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_METADEVICE, setno,
1035             MD_SID(un));
1036
1037 out:
1038         kmem_free(recids, (num_recs * sizeof (mddb_recid_t)));
1039         if (err)
1040                 return (err);
1041
1042         /* only attempt to init a device that is in the init state */
1043         if (UNIT_STATE(un) != RUS_INIT)
1044                 return (0);
1045
1046         return (raid_init_unit(mnum, &msp->mde));
1047 }
1048
1049 /*
1050  * NAME:        raid_get
1051  * DESCRIPTION: used to get the unit structure of a RAID metadevice
1052  * PARAMETERS:  md_i_get_t   *migp - pointer to get data structure
1053  *              int           mode - must be FREAD
1054  *              IOLOCK       *lock - pointer to IOCTL lock
1055  *
1056  * LOCKS:       obtains unit reader lock via IOLOCK
1057  *
1058  */
1059 static int
1060 raid_get(
1061         void            *migp,
1062         int             mode,
1063         IOLOCK          *lock
1064 )
1065 {
1066         minor_t         mnum;
1067         mr_unit_t       *un;
1068         md_i_get_t      *migph = migp;
1069
1070
1071         mnum = migph->id;
1072
1073         mdclrerror(&migph->mde);
1074
1075         if ((un = raid_getun(mnum, &migph->mde,
1076             RD_LOCK, lock)) == NULL)
1077                 return (0);
1078
1079         if (migph->size == 0) {
1080                 migph->size = un->c.un_size;
1081                 return (0);
1082         }
1083
1084         if (migph->size < un->c.un_size) {
1085                 return (EFAULT);
1086         }
1087         if (ddi_copyout(un, (void *)(uintptr_t)migph->mdp,
1088             un->c.un_size, mode))
1089                 return (EFAULT);
1090
1091         return (0);
1092 }
1093
1094
1095 /*
1096  * NAME:        raid_replace
1097  * DESCRIPTION: used to replace a component of a RAID metadevice
1098  * PARAMETERS:  replace_params_t *mrp - pointer to replace data structure
1099  *              IOLOCK       *lock - pointer to IOCTL lock
1100  *
1101  * LOCKS:       obtains unit writer lock via IOLOCK (through raid_getun),
1102  *              obtains and releases md_unit_array_rw write lock
1103  *
1104  */
1105 static int
1106 raid_replace(
1107         replace_params_t        *mrp,
1108         IOLOCK                  *lock
1109 )
1110 {
1111         minor_t         mnum = mrp->mnum;
1112         md_dev64_t      odev = mrp->old_dev;
1113         md_error_t      *ep = &mrp->mde;
1114         mr_unit_t       *un;
1115         rcs_state_t     state;
1116         int             ix, col = -1;
1117         int             force = 0;
1118         int             err = 0;
1119         replace_cmd_t   cmd;
1120         set_t           setno;
1121         side_t          side;
1122         mdkey_t         devkey;
1123         int             nkeys;
1124         mddb_recid_t    extra_recids[3] = { 0, 0, 0 };
1125         int             extra_rids = 0;
1126         md_error_t      mde = mdnullerror;
1127         sv_dev_t        sv = {MD_SET_BAD, MD_SIDEWILD, MD_KEYWILD};
1128
1129         mdclrerror(ep);
1130         setno = MD_MIN2SET(mnum);
1131         side = mddb_getsidenum(setno);
1132
1133         un = md_unit_readerlock(MDI_UNIT(mnum));
1134
1135         if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
1136             (raid_state_cnt(un, RCS_RESYNC) != 0)) {
1137                 md_unit_readerexit(MDI_UNIT(mnum));
1138                 return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));
1139         }
1140
1141         if (un->un_state & RUS_DOI) {
1142                 md_unit_readerexit(MDI_UNIT(mnum));
1143                 return (mdmderror(ep, MDE_RAID_DOI, mnum));
1144         }
1145
1146         if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT) ||
1147             (MD_STATUS(un) & MD_UN_GROW_PENDING)) {
1148                 md_unit_readerexit(MDI_UNIT(mnum));
1149                 return (mdmderror(ep, MDE_IN_USE, mnum));
1150         }
1151
1152         md_unit_readerexit(MDI_UNIT(mnum));
1153
1154         /* get locks and recheck to be sure something did not change */
1155         if ((un = raid_getun(mnum, ep, WRITERS, lock)) == NULL)
1156                 return (0);
1157
1158         if (md_getkeyfromdev(setno, side, odev, &devkey, &nkeys) != 0) {
1159                 return (mddeverror(ep, MDE_NAME_SPACE, odev));
1160         }
1161
1162         for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
1163                 md_dev64_t tmpdevt = un->un_column[ix].un_orig_dev;
1164                 /*
1165                  * Try to resolve devt again if NODEV64
1166                  */
1167                 if (tmpdevt == NODEV64) {
1168                         tmpdevt = md_resolve_bydevid(mnum, tmpdevt,
1169                             un->un_column[ix].un_orig_key);
1170                         un->un_column[ix].un_orig_dev = tmpdevt;
1171                 }
1172
1173                 if (un->un_column[ix].un_orig_dev == odev) {
1174                         col = ix;
1175                         break;
1176                 } else {
1177                         if (un->un_column[ix].un_orig_dev == NODEV64) {
1178                                 /*
1179                                  * Now we use the keys to match.
1180                                  * If no key found, continue.
1181                                  */
1182                                 if (nkeys == 0) {
1183                                         continue;
1184                                 }
1185                                 if (un->un_column[ix].un_orig_key == devkey) {
1186                                         if (nkeys > 1)
1187                                                 return (mddeverror(ep,
1188                                                     MDE_MULTNM, odev));
1189                                         col = ix;
1190                                         break;
1191                                 }
1192                         }
1193                 }
1194         }
1195
1196         if (col == -1)
1197                 return (mdcomperror(ep, MDE_CANT_FIND_COMP,
1198                     mnum, odev));
1199
1200         if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
1201             (raid_state_cnt(un, RCS_RESYNC) != 0))
1202                 return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));
1203
1204         if (un->un_state & RUS_DOI)
1205                 return (mdcomperror(ep, MDE_REPL_INVAL_STATE, mnum,
1206                     un->un_column[col].un_dev));
1207
1208         if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT) ||
1209             (MD_STATUS(un) & MD_UN_GROW_PENDING))
1210                 return (mdmderror(ep, MDE_IN_USE, mnum));
1211
1212         if ((mrp->cmd == FORCE_ENABLE_COMP) || (mrp->cmd == FORCE_REPLACE_COMP))
1213                 force = 1;
1214         if ((mrp->cmd == FORCE_ENABLE_COMP) || (mrp->cmd == ENABLE_COMP))
1215                 cmd = ENABLE_COMP;
1216         if ((mrp->cmd == FORCE_REPLACE_COMP) || (mrp->cmd == REPLACE_COMP))
1217                 cmd = REPLACE_COMP;
1218
1219         if (un->un_state == RUS_LAST_ERRED) {
1220                 /* Must use -f force flag for unit in LAST_ERRED state */
1221                 if (!force)
1222                         return (mdmderror(ep, MDE_RAID_NEED_FORCE, mnum));
1223
1224                 /* Must use -f force flag on ERRED column first */
1225                 if (un->un_column[col].un_devstate != RCS_ERRED) {
1226                         for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
1227                                 if (un->un_column[ix].un_devstate & RCS_ERRED)
1228                                         return (mdcomperror(ep,
1229                                             MDE_RAID_COMP_ERRED, mnum,
1230                                             un->un_column[ix].un_dev));
1231                         }
1232                 }
1233
1234                 /* must use -f force flag on LAST_ERRED columns next */
1235                 if ((un->un_column[col].un_devstate != RCS_LAST_ERRED) &&
1236                     (un->un_column[col].un_devstate != RCS_ERRED))
1237                         return (mdcomperror(ep, MDE_RAID_COMP_ERRED,
1238                             mnum, un->un_column[col].un_dev));
1239         }
1240
1241         if (un->un_state == RUS_ERRED) {
1242                 if (! (un->un_column[col].un_devstate &
1243                     (RCS_ERRED | RCS_INIT_ERRED)))
1244                         return (mdcomperror(ep, MDE_RAID_COMP_ERRED,
1245                             mnum, un->un_column[ix].un_dev));
1246         }
1247
1248         ASSERT(!(un->un_column[col].un_devflags & MD_RAID_ALT_ISOPEN));
1249         ASSERT(!(un->un_column[col].un_devflags & MD_RAID_WRITE_ALT));
1250
1251         state = un->un_column[col].un_devstate;
1252         if (state & RCS_INIT_ERRED) {
1253                 MD_STATUS(un) |= MD_UN_GROW_PENDING;
1254                 un->un_percent_done = 0;
1255                 raid_set_state(un, col, RCS_INIT, 0);
1256         } else if (((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0) &&
1257             resync_request(mnum, col, 0, ep))
1258                 return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));
1259
1260
1261         if (cmd == REPLACE_COMP) {
1262                 md_dev64_t tmpdev = mrp->new_dev;
1263
1264                 /*
1265                  * open the device by device id
1266                  */
1267                 tmpdev = md_resolve_bydevid(mnum, tmpdev, mrp->new_key);
1268                 if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL)) {
1269                         return (mdcomperror(ep, MDE_COMP_OPEN_ERR, mnum,
1270                             tmpdev));
1271                 }
1272
1273                 /*
1274                  * If it's a metadevice, make sure it gets reparented
1275                  */
1276                 if (md_getmajor(tmpdev) == md_major) {
1277                         minor_t         new_mnum = md_getminor(tmpdev);
1278                         md_unit_t       *new_un = MD_UNIT(new_mnum);
1279
1280                         md_set_parent(tmpdev, MD_SID(un));
1281                         extra_recids[extra_rids++] = MD_RECID(new_un);
1282                 }
1283
1284                 mrp->new_dev = tmpdev;
1285                 un->un_column[col].un_orig_dev = tmpdev;
1286                 un->un_column[col].un_orig_key = mrp->new_key;
1287                 un->un_column[col].un_orig_pwstart = mrp->start_blk;
1288                 un->un_column[col].un_orig_devstart =
1289                     mrp->start_blk + un->un_pwsize;
1290
1291                 /*
1292                  * If the old device was a metadevice, make sure to
1293                  * reset its parent.
1294                  */
1295                 if (md_getmajor(odev) == md_major) {
1296                         minor_t         old_mnum = md_getminor(odev);
1297                         md_unit_t       *old_un = MD_UNIT(old_mnum);
1298
1299                         md_reset_parent(odev);
1300                         extra_recids[extra_rids++] =
1301                             MD_RECID(old_un);
1302                 }
1303
1304                 if (HOTSPARED(un, col)) {
1305                         md_layered_close(mrp->new_dev, MD_OFLG_NULL);
1306                         un->un_column[col].un_alt_dev = mrp->new_dev;
1307                         un->un_column[col].un_alt_pwstart = mrp->start_blk;
1308                         un->un_column[col].un_alt_devstart =
1309                             mrp->start_blk + un->un_pwsize;
1310                         un->un_column[col].un_devflags |= MD_RAID_COPY_RESYNC;
1311                 } else {
1312                         /*
1313                          * not hot spared.  Close the old device and
1314                          * move the new device in.
1315                          */
1316                         if (un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN)
1317                                 md_layered_close(odev, MD_OFLG_NULL);
1318                         un->un_column[col].un_devflags |= MD_RAID_DEV_ISOPEN;
1319                         un->un_column[col].un_dev = mrp->new_dev;
1320                         un->un_column[col].un_pwstart = mrp->start_blk;
1321                         un->un_column[col].un_devstart =
1322                             mrp->start_blk + un->un_pwsize;
1323                         if ((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0) {
1324                                 un->un_column[col].un_devflags |=
1325                                     MD_RAID_REGEN_RESYNC;
1326                         }
1327                 }
1328                 /*
1329                  * If the old device is not a metadevice then
1330                  * save off the set number and key so that it
1331                  * can be removed from the namespace later.
1332                  */
1333                 if (md_getmajor(odev) != md_major) {
1334                         sv.setno = setno;
1335                         sv.key = devkey;
1336                 }
1337         }
1338
1339         if (cmd == ENABLE_COMP) {
1340                 md_dev64_t tmpdev = un->un_column[col].un_orig_dev;
1341                 mdkey_t raidkey =  un->un_column[col].un_orig_key;
1342
1343                 /*
1344                  * We trust the dev_t because we cannot determine the
1345                  * dev_t from the device id since a new disk is in the
1346                  * same location. Since this is a call from metareplace -e dx
1347                  * AND it is SCSI a new dev_t is not generated.  So the
1348                  * dev_t from the mddb is used. Before enabling the device
1349                  * we check to make sure that multiple entries for the same
1350                  * device does not exist in the namespace. If they do we
1351                  * fail the ioctl.
1352                  * One of the many ways multiple entries in the name space
1353                  * can occur is if one removed the failed component in a
1354                  * RAID metadevice and put another disk that was part of
1355                  * another metadevice. After reboot metadevadm would correctly
1356                  * update the device name for the metadevice whose component
1357                  * has moved. However now in the metadb there are two entries
1358                  * for the same name (ctds) that belong to different
1359                  * metadevices. One is valid, the other is a ghost or "last
1360                  * know as" ctds.
1361                  */
1362                 tmpdev = md_resolve_bydevid(mnum, tmpdev, raidkey);
1363                 if (tmpdev == NODEV64)
1364                         tmpdev = md_getdevnum(setno, side, raidkey,
1365                             MD_TRUST_DEVT);
1366                 /*
1367                  * check for multiple entries in namespace for the
1368                  * same dev
1369                  */
1370
1371                 if (md_getkeyfromdev(setno, side, tmpdev, &devkey,
1372                     &nkeys) != 0)
1373                         return (mddeverror(ep, MDE_NAME_SPACE, tmpdev));
1374                 /*
1375                  * If number of keys are greater that
1376                  * 1, then we have an invalid
1377                  * namespace. STOP and return.
1378                  */
1379                 if (nkeys > 1)
1380                         return (mddeverror(ep, MDE_MULTNM, tmpdev));
1381                 if (devkey != raidkey)
1382                         return (mdcomperror(ep, MDE_CANT_FIND_COMP,
1383                             mnum, tmpdev));
1384
1385                 if (un->un_column[col].un_orig_dev == NODEV64)
1386                         un->un_column[col].un_orig_dev = tmpdev;
1387
1388                 if (HOTSPARED(un, col)) {
1389                         un->un_column[col].un_alt_dev =
1390                             un->un_column[col].un_orig_dev;
1391                         un->un_column[col].un_alt_pwstart =
1392                             un->un_column[col].un_orig_pwstart;
1393                         un->un_column[col].un_alt_devstart =
1394                             un->un_column[col].un_orig_devstart;
1395                         un->un_column[col].un_devflags |= MD_RAID_COPY_RESYNC;
1396                 } else {
1397                         if (!(un->un_column[col].un_devflags &
1398                             MD_RAID_DEV_ISOPEN)) {
1399                                 if (md_layered_open(mnum, &tmpdev,
1400                                     MD_OFLG_NULL)) {
1401                                         un->un_column[col].un_dev = tmpdev;
1402                                         return (mdcomperror(ep,
1403                                             MDE_COMP_OPEN_ERR, mnum, tmpdev));
1404                                 }
1405                                 ASSERT(tmpdev != NODEV64 &&
1406                                     tmpdev != 0);
1407
1408                                 if ((md_getmajor(tmpdev) != md_major) &&
1409                                     (md_devid_found(setno, side, raidkey)
1410                                     == 1)) {
1411                                         if (md_update_namespace_did(setno, side,
1412                                             raidkey, &mde) != 0) {
1413                                                 cmn_err(CE_WARN,
1414                                                     "md: could not"
1415                                                     " update namespace\n");
1416                                         }
1417                                 }
1418                                 un->un_column[col].un_dev =
1419                                     un->un_column[col].un_orig_dev;
1420                         }
1421                         un->un_column[col].un_devflags |= MD_RAID_DEV_ISOPEN;
1422                         un->un_column[col].un_devflags |= MD_RAID_REGEN_RESYNC;
1423                 }
1424         }
1425         if (mrp->has_label) {
1426                 un->un_column[col].un_devflags |= MD_RAID_HAS_LABEL;
1427         } else {
1428                 un->un_column[col].un_devflags &= ~MD_RAID_HAS_LABEL;
1429         }
1430
1431         raid_commit(un, extra_recids);
1432
1433         /* If the component has been replaced - clean up the name space */
1434         if (sv.setno != MD_SET_BAD) {
1435                 md_rem_names(&sv, 1);
1436         }
1437
1438         md_ioctl_droplocks(lock);
1439
1440         if ((cmd == ENABLE_COMP) || (cmd == FORCE_ENABLE_COMP)) {
1441                 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ENABLE, SVM_TAG_METADEVICE,
1442                     setno, MD_SID(un));
1443         } else {
1444                 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REPLACE, SVM_TAG_METADEVICE,
1445                     setno, MD_SID(un));
1446         }
1447
1448         if (un->un_column[col].un_devstate & RCS_INIT)
1449                 err = raid_init_unit(mnum, ep);
1450         else if ((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0)
1451                 err = raid_resync_unit(mnum, ep);
1452
1453         mdclrerror(ep);
1454         if (!err)
1455                 return (0);
1456
1457         /* be sure state */
1458         /* is already set by this time */
1459         /* fix state  and commit record */
1460         un = md_unit_writerlock(MDI_UNIT(mnum));
1461         if (state & RCS_INIT_ERRED)
1462                 raid_set_state(un, col, state, 1);
1463         else if (state & RCS_OKAY)
1464                 raid_set_state(un, col, RCS_ERRED, 0);
1465         else
1466                 raid_set_state(un, col, state, 1);
1467         raid_commit(un, NULL);
1468         md_unit_writerexit(MDI_UNIT(mnum));
1469         mdclrerror(ep);
1470         return (0);
1471 }
1472
1473
1474 /*
1475  * NAME:        raid_set_sync
1476  * DESCRIPTION: used to sync a component of a RAID metadevice
1477  * PARAMETERS:  md_resync_ioctl_t *mrp - pointer to resync data structure
1478  *              int           mode - must be FWRITE
1479  *              IOLOCK       *lock - pointer to IOCTL lock
1480  *
1481  * LOCKS:       obtains unit writer lock via IOLOCK (through raid_getun),
1482  *              obtains and releases md_unit_array_rw write lock
1483  *
1484  */
1485 static int
1486 raid_set_sync(
1487         md_resync_ioctl_t       *rip,
1488         IOLOCK                  *lock
1489 )
1490 {
1491         minor_t                 mnum = rip->ri_mnum;
1492         mr_unit_t               *un;
1493         int                     init = 0;
1494         int                     resync = 0;
1495         int                     regen = 0;
1496         int                     ix;
1497         int                     err;
1498
1499         mdclrerror(&rip->mde);
1500
1501         if ((un = raid_getun(mnum, &rip->mde, WRITERS, lock)) == NULL)
1502                 return (0);
1503
1504         if (un->un_state & RUS_DOI)
1505                 return (mdmderror(&rip->mde, MDE_RAID_DOI, mnum));
1506
1507         if (un->c.un_status & MD_UN_RESYNC_ACTIVE)
1508                 return (mdmderror(&rip->mde, MDE_RESYNC_ACTIVE, mnum));
1509
1510         /* This prevents new opens */
1511
1512         rip->ri_flags = 0;
1513         if (un->un_state & RUS_REGEN)
1514                 regen++;
1515
1516         if (raid_state_cnt(un, RCS_RESYNC))
1517                 resync++;
1518
1519         if (raid_state_cnt(un, RCS_INIT) || (un->un_state & RUS_INIT))
1520                 init++;
1521
1522         ASSERT(!(resync && init && regen));
1523         md_ioctl_droplocks(lock);
1524         rip->ri_percent_done = 0;
1525
1526         if (init) {
1527                 MD_STATUS(un) |= MD_UN_GROW_PENDING;
1528                 return (raid_init_unit(mnum, &rip->mde));
1529         }
1530
1531         /*
1532          * If resync is needed, it will call raid_internal_open forcing
1533          * replay before the open completes.
1534          * Otherwise, call raid_internal_open directly to force
1535          * replay to complete during boot (metasync -r).
1536          * NOTE: the unit writer lock must remain held while setting
1537          *       MD_UN_RESYNC_ACTIVE but must be released before
1538          *       calling raid_resync_unit or raid_internal_open.
1539          */
1540         if (resync) {
1541                 ASSERT(resync < 2);
1542                 un = md_unit_writerlock(MDI_UNIT(mnum));
1543                 MD_STATUS(un) |= MD_UN_RESYNC_ACTIVE;
1544                 /* Must release unit writer lock for resync */
1545                 /*
1546                  * correctly setup the devices before trying to start the
1547                  * resync operation.
1548                  */
1549                 for (ix = 0; un->un_totalcolumncnt; ix++) {
1550                         if (un->un_column[ix].un_devstate & RCS_RESYNC) {
1551                                 if ((un->un_column[ix].un_devflags &
1552                                     MD_RAID_COPY_RESYNC) &&
1553                                     HOTSPARED(un, ix)) {
1554                                         un->un_column[ix].un_alt_dev =
1555                                             un->un_column[ix].un_orig_dev;
1556                                         un->un_column[ix].un_alt_devstart =
1557                                             un->un_column[ix].un_orig_devstart;
1558                                         un->un_column[ix].un_alt_pwstart =
1559                                             un->un_column[ix].un_orig_pwstart;
1560                                 }
1561                                 break;
1562                         }
1563                 }
1564                 ASSERT(un->un_column[ix].un_devflags &
1565                     (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC));
1566                 rip->ri_percent_done = 0;
1567                 un->un_column[ix].un_devflags |= MD_RAID_RESYNC;
1568                 (void) resync_request(mnum, ix, 0, NULL);
1569                 md_unit_writerexit(MDI_UNIT(mnum));
1570                 err = raid_resync_unit(mnum, &rip->mde);
1571                 return (err);
1572         }
1573
1574         if (regen) {
1575                 err = raid_regen_unit(mnum, &rip->mde);
1576                 return (err);
1577         }
1578
1579         /* The unit requires not work so just force replay of the device */
1580         if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0))
1581                 return (mdmderror(&rip->mde,
1582                     MDE_RAID_OPEN_FAILURE, mnum));
1583         (void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
1584
1585         return (0);
1586 }
1587
1588 /*
1589  * NAME:        raid_get_resync
1590  * DESCRIPTION: used to check resync status on a component of a RAID metadevice
1591  * PARAMETERS:  md_resync_ioctl_t *mrp - pointer to resync data structure
1592  *              int           mode - must be FWRITE
1593  *              IOLOCK       *lock - pointer to IOCTL lock
1594  *
1595  * LOCKS:       none
1596  *
1597  */
1598 static int
1599 raid_get_resync(
1600         md_resync_ioctl_t       *rip,
1601         IOLOCK                  *lock
1602 )
1603 {
1604         minor_t                 mnum = rip->ri_mnum;
1605         mr_unit_t               *un;
1606         u_longlong_t            percent;
1607         int                     cnt;
1608         int                     ix;
1609         uint64_t                d;
1610
1611         mdclrerror(&rip->mde);
1612
1613         if ((un = raid_getun(mnum, &rip->mde, RD_LOCK, lock)) == NULL)
1614                 return (0);
1615
1616         rip->ri_flags = 0;
1617         if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
1618                 d = un->un_segsincolumn;
1619                 percent = d ? ((1000 * un->un_resync_line_index) / d) : 0;
1620                 if (percent > 1000)
1621                         percent = 1000; /* can't go over 100% */
1622                 rip->ri_percent_done = (int)percent;
1623                 rip->ri_flags |= MD_RI_INPROGRESS;
1624         }
1625
1626         if (UNIT_STATE(un) & RUS_INIT) {
1627                 d = un->un_segsize * un->un_segsincolumn *
1628                     un->un_totalcolumncnt;
1629                 percent =
1630                     d ? ((1000 * (u_longlong_t)un->un_init_iocnt) / d) : 0;
1631                 if (percent > 1000)
1632                         percent = 1000; /* can't go over 100% */
1633                 rip->ri_percent_done = (int)percent;
1634                 rip->ri_flags |= MD_GROW_INPROGRESS;
1635         } else if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
1636                 d = un->un_segsize * un->un_segsincolumn * un->un_init_colcnt;
1637                 percent =
1638                     d ? (((u_longlong_t)un->un_init_iocnt * 1000) / d) : 0;
1639                 if (percent > 1000)
1640                         percent = 1000;
1641                 rip->ri_percent_done = (int)percent;
1642                 rip->ri_flags |= MD_GROW_INPROGRESS;
1643         }
1644
1645         if (un->un_state & RUS_REGEN)
1646                 rip->ri_percent_done = un->un_percent_done;
1647
1648         cnt = 0;
1649         for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
1650                 switch (un->un_column[ix].un_devstate) {
1651                 case RCS_INIT:
1652                 case RCS_ERRED:
1653                 case RCS_LAST_ERRED:
1654                         cnt++;
1655                         break;
1656                 default:
1657                         break;
1658                 }
1659         }
1660         d = un->un_totalcolumncnt;
1661         rip->ri_percent_dirty = d ? (((u_longlong_t)cnt * 100) / d) : 0;
1662         return (0);
1663 }
1664
1665 /*
1666  * NAME:        raid_grow
1667  * DESCRIPTION: Concatenate to a RAID metadevice
1668  * PARAMETERS:  md_grow_params_t *mgp
1669  *                            - pointer to IOCGROW data structure
1670  *              int      mode - must be FWRITE
1671  *              IOLOCK *lockp - IOCTL read/write and unit_array_rw lock
1672  *
1673  * LOCKS:       obtains unit writer lock via IOLOCK (through raid_getun),
1674  *              obtains and releases md_unit_array_rw write lock
1675  *
1676  */
1677 static int
1678 raid_grow(void *mgp, int mode, IOLOCK *lock)
1679 {
1680         minor_t         mnum;
1681         mr_unit_t       *un, *new_un;
1682         mdi_unit_t      *ui;
1683         mddb_type_t     typ1;
1684         mddb_recid_t    mr_recid;
1685         mddb_recid_t    old_vtoc = 0;
1686         mddb_recid_t    *recids;
1687         md_create_rec_option_t options;
1688         int             err;
1689         int             col, i;
1690         int64_t         tb, atb;
1691         u_longlong_t    unrev;
1692         int             tc;
1693         int             rval = 0;
1694         set_t           setno;
1695         mr_column_ic_t  *mrc;
1696         int             num_recs, rid;
1697         md_grow_params_t        *mgph = mgp;
1698
1699
1700         mnum = mgph->mnum;
1701
1702         mdclrerror(&mgph->mde);
1703
1704         ui = MDI_UNIT(mnum);
1705         un = md_unit_readerlock(ui);
1706
1707         if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
1708                 md_unit_readerexit(ui);
1709                 return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
1710         }
1711
1712         if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
1713                 md_unit_readerexit(ui);
1714                 return (mdmderror(&mgph->mde, MDE_RESYNC_ACTIVE, mnum));
1715         }
1716
1717         if (UNIT_STATE(un) & RUS_LAST_ERRED) {
1718                 md_unit_readerexit(ui);
1719                 return (mdmderror(&mgph->mde, MDE_RAID_LAST_ERRED, mnum));
1720         }
1721
1722         if (UNIT_STATE(un) & RUS_DOI) {
1723                 md_unit_readerexit(ui);
1724                 return (mdmderror(&mgph->mde, MDE_RAID_DOI, mnum));
1725         }
1726
1727         if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT)) {
1728                 md_unit_readerexit(ui);
1729                 return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
1730         }
1731
1732         md_unit_readerexit(ui);
1733
1734         if ((un = raid_getun(mnum, &mgph->mde, WRITERS, lock)) ==
1735             NULL)
1736                 return (0);
1737
1738         if (MD_STATUS(un) & MD_UN_GROW_PENDING)
1739                 return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
1740
1741         if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
1742                 return (mdmderror(&mgph->mde, MDE_RESYNC_ACTIVE, mnum));
1743
1744         if (un->c.un_size >= mgph->size)
1745                 return (EINVAL);
1746
1747         if (UNIT_STATE(un) & RUS_LAST_ERRED)
1748                 return (mdmderror(&mgph->mde, MDE_RAID_LAST_ERRED, mnum));
1749
1750         if (UNIT_STATE(un) & RUS_DOI)
1751                 return (mdmderror(&mgph->mde, MDE_RAID_DOI, mnum));
1752
1753         if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT))
1754                 return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
1755
1756         setno = MD_MIN2SET(mnum);
1757
1758         typ1 = (mddb_type_t)md_getshared_key(setno,
1759             raid_md_ops.md_driver.md_drivername);
1760
1761         /*
1762          * Preserve the friendly name nature of the device that is
1763          * growing.
1764          */
1765         options = MD_CRO_RAID;
1766         if (un->c.un_revision & MD_FN_META_DEV)
1767                 options |= MD_CRO_FN;
1768         if (mgph->options & MD_CRO_64BIT) {
1769 #if defined(_ILP32)
1770                 return (mdmderror(&mgph->mde, MDE_UNIT_TOO_LARGE, mnum));
1771 #else
1772                 mr_recid = mddb_createrec(mgph->size, typ1, 0,
1773                     MD_CRO_64BIT | options, setno);
1774 #endif
1775         } else {
1776                 mr_recid = mddb_createrec(mgph->size, typ1, 0,
1777                     MD_CRO_32BIT | options, setno);
1778         }
1779         if (mr_recid < 0) {
1780                 rval = mddbstatus2error(&mgph->mde, (int)mr_recid,
1781                     mnum, setno);
1782                 return (rval);
1783         }
1784
1785         /* get the address of the new unit */
1786         new_un = (mr_unit_t *)mddb_getrecaddr(mr_recid);
1787
1788         /*
1789          * It is okay that we muck with the new unit here,
1790          * since no one else will know about the unit struct
1791          * until we commit it. If we crash, the record will
1792          * be automatically purged, since we haven't
1793          * committed it yet and the old unit struct will be found.
1794          */
1795
1796         /* copy in the user's unit struct */
1797         err = ddi_copyin((void *)(uintptr_t)mgph->mdp, new_un,
1798             mgph->size, mode);
1799         if (err) {
1800                 mddb_deleterec_wrapper(mr_recid);
1801                 return (EFAULT);
1802         }
1803
1804         /* make sure columns are being added */
1805         if (un->un_totalcolumncnt >= new_un->un_totalcolumncnt) {
1806                 mddb_deleterec_wrapper(mr_recid);
1807                 return (EINVAL);
1808         }
1809
1810         /*
1811          * Save a few of the new unit structs fields.
1812          * Before they get clobbered.
1813          */
1814         tc = new_un->un_totalcolumncnt;
1815         tb = new_un->c.un_total_blocks;
1816         atb = new_un->c.un_actual_tb;
1817         unrev = new_un->c.un_revision;
1818
1819         /*
1820          * Copy the old unit struct (static stuff)
1821          * into new unit struct
1822          */
1823         bcopy((caddr_t)un, (caddr_t)new_un, un->c.un_size);
1824
1825         /*
1826          * Restore a few of the new unit struct values.
1827          */
1828         new_un->un_totalcolumncnt = tc;
1829         new_un->c.un_actual_tb = atb;
1830         new_un->un_grow_tb = tb;
1831         new_un->c.un_revision = unrev;
1832         new_un->c.un_record_id = mr_recid;
1833         new_un->c.un_size = mgph->size;
1834
1835         ASSERT(new_un->mr_ic == un->mr_ic);
1836
1837         /*
1838          * Save old column slots
1839          */
1840         mrc = un->un_column_ic;
1841
1842         /*
1843          * Allocate new column slot
1844          */
1845         new_un->un_column_ic = (mr_column_ic_t *)
1846             kmem_zalloc(sizeof (mr_column_ic_t) * new_un->un_totalcolumncnt,
1847             KM_SLEEP);
1848
1849         /*
1850          * Restore old column slots
1851          * Free the old column slots
1852          */
1853         bcopy(mrc, new_un->un_column_ic,
1854             sizeof (mr_column_ic_t) * un->un_totalcolumncnt);
1855         kmem_free(mrc, sizeof (mr_column_ic_t) * un->un_totalcolumncnt);
1856
1857         /* All 64 bit metadevices only support EFI labels. */
1858         if (mgph->options & MD_CRO_64BIT) {
1859                 new_un->c.un_flag |= MD_EFILABEL;
1860                 /*
1861                  * If the device was previously smaller than a terabyte,
1862                  * and had a vtoc record attached to it, we remove the
1863                  * vtoc record, because the layout has changed completely.
1864                  */
1865                 if (((un->c.un_revision & MD_64BIT_META_DEV) == 0) &&
1866                     (un->c.un_vtoc_id != 0)) {
1867                         old_vtoc = un->c.un_vtoc_id;
1868                         new_un->c.un_vtoc_id =
1869                             md_vtoc_to_efi_record(old_vtoc, setno);
1870                 }
1871         }
1872
1873
1874         /*
1875          * allocate the real recids array.  since we may have to commit
1876          * underlying metadevice records, we need an array of size:
1877          * total number of new components being attach + 2 (one for the
1878          * raid itself, one for the end marker).
1879          */
1880         num_recs = new_un->un_totalcolumncnt + 2;
1881         rid = 0;
1882         recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP);
1883         recids[rid++] = mr_recid;
1884
1885         for (col = un->un_totalcolumncnt;
1886             (col < new_un->un_totalcolumncnt); col++) {
1887                 mr_column_t     *mr_col = &new_un->un_column[col];
1888                 md_unit_t       *comp_un;
1889
1890                 if (raid_build_pw_reservation(new_un, col) != 0) {
1891                         /* release pwslots already allocated by grow */
1892                         for (i = un->un_totalcolumncnt; i < col; i++) {
1893                                 raid_free_pw_reservation(new_un, i);
1894                         }
1895                         kmem_free(new_un->un_column_ic,
1896                             sizeof (mr_column_ic_t) *
1897                             new_un->un_totalcolumncnt);
1898                         kmem_free(new_un->mr_ic, sizeof (*un->mr_ic));
1899                         kmem_free(recids, num_recs * sizeof (mddb_recid_t));
1900                         mddb_deleterec_wrapper(mr_recid);
1901                         return (EINVAL);
1902                 }
1903                 /*
1904                  * set parent on metadevices being added.
1905                  * NOTE: currently soft partitions are the only metadevices
1906                  * which can appear within a RAID metadevice.
1907                  */
1908                 if (md_getmajor(mr_col->un_dev) == md_major) {
1909                         comp_un = MD_UNIT(md_getminor(mr_col->un_dev));
1910                         recids[rid++] = MD_RECID(comp_un);
1911                         md_set_parent(mr_col->un_dev, MD_SID(new_un));
1912                 }
1913                 new_un->un_column[col].un_devflags = 0;
1914         }
1915
1916         /* set end marker */
1917         recids[rid] = 0;
1918
1919         /* commit new unit struct */
1920         mddb_commitrecs_wrapper(recids);
1921
1922         /* delete old unit struct */
1923         mddb_deleterec_wrapper(un->c.un_record_id);
1924
1925         /* place new unit in in-core array */
1926         md_nblocks_set(mnum, new_un->c.un_total_blocks);
1927         MD_UNIT(mnum) = new_un;
1928
1929         /*
1930          * If old_vtoc has a non zero value, we know:
1931          * - This unit crossed the border from smaller to larger one TB
1932          * - There was a vtoc record for the unit,
1933          * - This vtoc record is no longer needed, because
1934          *   a new efi record has been created for this un.
1935          */
1936         if (old_vtoc != 0) {
1937                 mddb_deleterec_wrapper(old_vtoc);
1938         }
1939
1940         /* free recids */
1941         kmem_free(recids, num_recs * sizeof (mddb_recid_t));
1942
1943         SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_GROW, SVM_TAG_METADEVICE,
1944             MD_UN2SET(new_un), MD_SID(new_un));
1945         MD_STATUS(new_un) |= MD_UN_GROW_PENDING;
1946
1947         /*
1948          * Since the md_ioctl_writelock aquires the unit write lock
1949          * and open/close aquires the unit reader lock it is necessary
1950          * to drop the unit write lock and then reaquire it as needed
1951          * later.
1952          */
1953         md_unit_writerexit(ui);
1954
1955         if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0)) {
1956                 rval = mdmderror(&mgph->mde, MDE_RAID_OPEN_FAILURE, mnum);
1957                 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
1958                     MD_UN2SET(new_un), MD_SID(new_un));
1959                 return (rval);
1960         }
1961         (void) md_unit_writerlock(ui);
1962         for (i = 0; i < new_un->un_totalcolumncnt; i++) {
1963                 if (new_un->un_column[i].un_devstate & RCS_OKAY)
1964                         (void) init_pw_area(new_un, new_un->un_column[i].un_dev,
1965                             new_un->un_column[i].un_pwstart, i);
1966         }
1967         md_unit_writerexit(ui);
1968         (void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
1969         (void) md_unit_writerlock(ui);
1970         /* create a background thread to initialize the columns */
1971         md_ioctl_droplocks(lock);
1972
1973         return (raid_init_unit(mnum, &mgph->mde));
1974 }
1975
1976 /*
1977  * NAME:        raid_reset
1978  * DESCRIPTION: used to reset (clear / remove) a RAID metadevice
1979  * PARAMETERS:  md_i_reset_t *mirp - pointer to reset data structure
1980  *
1981  * LOCKS:       obtains and releases md_unit_array_rw write lock
1982  *
1983  */
1984 static int
1985 raid_reset(md_i_reset_t *mirp)
1986 {
1987         minor_t         mnum = mirp->mnum;
1988         mr_unit_t       *un;
1989         mdi_unit_t      *ui;
1990         set_t           setno = MD_MIN2SET(mnum);
1991
1992         mdclrerror(&mirp->mde);
1993
1994         rw_enter(&md_unit_array_rw.lock, RW_WRITER);
1995         /*
1996          * NOTE: need to get md_unit_writerlock to avoid conflict
1997          * with raid_init thread.
1998          */
1999         if ((un = raid_getun(mnum, &mirp->mde, NO_LOCK, NULL)) ==
2000             NULL) {
2001                 rw_exit(&md_unit_array_rw.lock);
2002                 return (0);
2003         }
2004         ui = MDI_UNIT(mnum);
2005
2006         if (MD_HAS_PARENT(MD_PARENT(un))) {
2007                 rw_exit(&md_unit_array_rw.lock);
2008                 return (mdmderror(&mirp->mde, MDE_IN_USE, mnum));
2009         }
2010
2011         un = (mr_unit_t *)md_unit_openclose_enter(ui);
2012         if (md_unit_isopen(MDI_UNIT(mnum))) {
2013                 md_unit_openclose_exit(ui);
2014                 rw_exit(&md_unit_array_rw.lock);
2015                 return (mdmderror(&mirp->mde, MDE_IS_OPEN, mnum));
2016         }
2017         md_unit_openclose_exit(ui);
2018         if (UNIT_STATE(un) != RUS_OKAY && !mirp->force) {
2019                 rw_exit(&md_unit_array_rw.lock);
2020                 return (mdmderror(&mirp->mde, MDE_RAID_NEED_FORCE, mnum));
2021         }
2022
2023         reset_raid(un, mnum, 1);
2024
2025         /*
2026          * Update unit availability
2027          */
2028         md_set[setno].s_un_avail++;
2029
2030         /*
2031          * If MN set, reset s_un_next so all nodes can have
2032          * the same view of the next available slot when
2033          * nodes are -w and -j
2034          */
2035         if (MD_MNSET_SETNO(setno)) {
2036                 (void) md_upd_set_unnext(setno, MD_MIN2UNIT(mnum));
2037         }
2038
2039         rw_exit(&md_unit_array_rw.lock);
2040
2041         return (0);
2042 }
2043
2044 /*
2045  * NAME:        raid_get_geom
2046  * DESCRIPTION: used to get the geometry of a RAID metadevice
2047  * PARAMETERS:  mr_unit_t    *un - RAID unit to get the geometry for
2048  *              struct dk_geom *gp - pointer to geometry data structure
2049  *
2050  * LOCKS:       none
2051  *
2052  */
2053 static int
2054 raid_get_geom(
2055         mr_unit_t       *un,
2056         struct dk_geom  *geomp
2057 )
2058 {
2059         md_get_geom((md_unit_t *)un, geomp);
2060
2061         return (0);
2062 }
2063
2064 /*
2065  * NAME:        raid_get_vtoc
2066  * DESCRIPTION: used to get the VTOC on a RAID metadevice
2067  * PARAMETERS:  mr_unit_t    *un - RAID unit to get the VTOC from
2068  *              struct vtoc *vtocp - pointer to VTOC data structure
2069  *
2070  * LOCKS:       none
2071  *
2072  */
2073 static int
2074 raid_get_vtoc(
2075         mr_unit_t       *un,
2076         struct vtoc     *vtocp
2077 )
2078 {
2079         md_get_vtoc((md_unit_t *)un, vtocp);
2080
2081         return (0);
2082 }
2083
2084 /*
2085  * NAME:        raid_set_vtoc
2086  * DESCRIPTION: used to set the VTOC on a RAID metadevice
2087  * PARAMETERS:  mr_unit_t    *un - RAID unit to set the VTOC on
2088  *              struct vtoc *vtocp - pointer to VTOC data structure
2089  *
2090  * LOCKS:       none
2091  *
2092  */
2093 static int
2094 raid_set_vtoc(
2095         mr_unit_t       *un,
2096         struct vtoc     *vtocp
2097 )
2098 {
2099         return (md_set_vtoc((md_unit_t *)un, vtocp));
2100 }
2101
2102
2103 /*
2104  * NAME:        raid_get_extvtoc
2105  * DESCRIPTION: used to get the extended VTOC on a RAID metadevice
2106  * PARAMETERS:  mr_unit_t    *un - RAID unit to get the VTOC from
2107  *              struct extvtoc *vtocp - pointer to extended VTOC data structure
2108  *
2109  * LOCKS:       none
2110  *
2111  */
2112 static int
2113 raid_get_extvtoc(
2114         mr_unit_t       *un,
2115         struct extvtoc  *vtocp
2116 )
2117 {
2118         md_get_extvtoc((md_unit_t *)un, vtocp);
2119
2120         return (0);
2121 }
2122
2123 /*
2124  * NAME:        raid_set_extvtoc
2125  * DESCRIPTION: used to set the extended VTOC on a RAID metadevice
2126  * PARAMETERS:  mr_unit_t    *un - RAID unit to set the VTOC on
2127  *              struct extvtoc *vtocp - pointer to extended VTOC data structure
2128  *
2129  * LOCKS:       none
2130  *
2131  */
2132 static int
2133 raid_set_extvtoc(
2134         mr_unit_t       *un,
2135         struct extvtoc  *vtocp
2136 )
2137 {
2138         return (md_set_extvtoc((md_unit_t *)un, vtocp));
2139 }
2140
2141
2142
2143 /*
2144  * NAME:        raid_get_cgapart
2145  * DESCRIPTION: used to get the dk_map on a RAID metadevice
2146  * PARAMETERS:  mr_unit_t    *un - RAID unit to set the VTOC on
2147  *              struct vtoc *dkmapp - pointer to dk_map data structure
2148  *
2149  * LOCKS:       none
2150  *
2151  */
2152
2153 static int
2154 raid_get_cgapart(
2155         mr_unit_t       *un,
2156         struct dk_map   *dkmapp
2157 )
2158 {
2159         md_get_cgapart((md_unit_t *)un, dkmapp);
2160         return (0);
2161 }
2162
2163 /*
2164  * NAME:        raid_getdevs
2165  * DESCRIPTION: return all devices within a RAID metadevice
2166  * PARAMETERS:  md_getdevs_params_t *mgdp
2167  *                            - pointer to getdevs IOCTL data structure
2168  *              int      mode - should be FREAD
2169  *              IOLOCK *lockp - IOCTL read/write lock
2170  *
2171  * LOCKS:       obtains unit reader lock via IOLOCK
2172  *
2173  */
2174 static int
2175 raid_getdevs(
2176         void                    *mgdp,
2177         int                     mode,
2178         IOLOCK                  *lock
2179 )
2180 {
2181         minor_t                 mnum;
2182         mr_unit_t               *un;
2183         md_dev64_t              *udevs;
2184         int                     i, cnt;
2185         md_dev64_t              unit_dev;
2186         md_getdevs_params_t     *mgdph = mgdp;
2187
2188
2189         mnum = mgdph->mnum;
2190
2191         /* check out unit */
2192         mdclrerror(&mgdph->mde);
2193
2194         if ((un = raid_getun(mnum, &mgdph->mde, RD_LOCK, lock)) == NULL)
2195                 return (0);
2196
2197         udevs = (md_dev64_t *)(uintptr_t)mgdph->devs;
2198
2199         for (cnt = 0, i = 0; i < un->un_totalcolumncnt; i++, cnt++) {
2200                 if (cnt < mgdph->cnt) {
2201                         unit_dev = un->un_column[i].un_orig_dev;
2202                         if (md_getmajor(unit_dev) != md_major) {
2203                                 if ((unit_dev = md_xlate_mini_2_targ
2204                                     (unit_dev)) == NODEV64)
2205                                         return (ENODEV);
2206                         }
2207
2208                         if (ddi_copyout((caddr_t)&unit_dev,
2209                             (caddr_t)&udevs[cnt], sizeof (*udevs), mode) != 0)
2210                                 return (EFAULT);
2211                 }
2212                 if (HOTSPARED(un, i)) {
2213                         cnt++;
2214                         if (cnt >= mgdph->cnt)
2215                                 continue;
2216
2217                         unit_dev = un->un_column[i].un_dev;
2218                         if (md_getmajor(unit_dev) != md_major) {
2219                                 if ((unit_dev = md_xlate_mini_2_targ
2220                                     (unit_dev)) == NODEV64)
2221                                         return (ENODEV);
2222                         }
2223
2224                         if (ddi_copyout((caddr_t)&unit_dev,
2225                             (caddr_t)&udevs[cnt], sizeof (*udevs), mode) != 0)
2226                                 return (EFAULT);
2227                 }
2228         }
2229         mgdph->cnt = cnt;
2230         return (0);
2231 }
2232
2233 /*
2234  * NAME:        raid_change
2235  * DESCRIPTION: used to change the following dynamic values:
2236  *                      the hot spare pool
2237  *              in the unit structure of a RAID metadevice
2238  * PARAMETERS:  md_change_params_t   *mcp - pointer to change data structure
2239  *              IOLOCK       *lock - pointer to IOCTL lock
2240  *
2241  * LOCKS:       obtains unit writer lock via IOLOCK (through raid_getun)
2242  *
2243  */
2244 static int
2245 raid_change(
2246         md_raid_params_t        *mrp,
2247         IOLOCK                  *lock
2248 )
2249 {
2250         minor_t         mnum = mrp->mnum;
2251         mr_unit_t       *un;
2252         int             ix;
2253         mddb_recid_t    recids[3] = {0, 0, 0};
2254         int             err;
2255         int             irecid;
2256         int             inc_new_hsp = 0;
2257
2258         mdclrerror(&mrp->mde);
2259
2260         if ((un = raid_getun(mnum, &mrp->mde, WR_LOCK, lock)) == NULL)
2261                 return (0);
2262
2263         if (!mrp->params.change_hsp_id)
2264                 return (0);
2265
2266         /* verify that no hotspare is in use */
2267         for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
2268                 if (HOTSPARED(un, ix)) {
2269                         return (mdmderror(&mrp->mde, MDE_HS_IN_USE, mnum));
2270                 }
2271         }
2272
2273         /* replace the hot spare pool */
2274
2275         irecid = 0;
2276         if (mrp->params.hsp_id != -1) {
2277                 /* increment the reference count of the new hsp */
2278                 err = md_hot_spare_ifc(HSP_INCREF, mrp->params.hsp_id, 0, 0,
2279                     &recids[0], NULL, NULL, NULL);
2280                 if (err) {
2281                         return (mdhsperror(&mrp->mde, MDE_INVAL_HSP,
2282                             mrp->params.hsp_id));
2283                 }
2284                 inc_new_hsp = 1;
2285                 irecid++;
2286         }
2287
2288         if (un->un_hsp_id != -1) {
2289                 /* decrement the reference count of the old hsp */
2290                 err = md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
2291                     &recids[irecid], NULL, NULL, NULL);
2292                 if (err) {
2293                         err = mdhsperror(&mrp->mde, MDE_INVAL_HSP,
2294                             mrp->params.hsp_id);
2295                         if (inc_new_hsp) {
2296                                 (void) md_hot_spare_ifc(HSP_DECREF,
2297                                     mrp->params.hsp_id, 0, 0,
2298                                     &recids[0], NULL, NULL, NULL);
2299                                 /*
2300                                  * Don't need to commit the record,
2301                                  * because it wasn't committed before
2302                                  */
2303                         }
2304                         return (err);
2305                 }
2306         }
2307
2308         un->un_hsp_id = mrp->params.hsp_id;
2309
2310         raid_commit(un, recids);
2311         SE_NOTIFY(EC_SVM_STATE, ESC_SVM_CHANGE, SVM_TAG_METADEVICE,
2312             MD_UN2SET(un), MD_SID(un));
2313
2314         /* Now trigger hot spare processing in case one is needed. */
2315         if ((un->un_hsp_id != -1) && (un->un_state == RUS_ERRED))
2316                 (void) raid_hotspares();
2317
2318         return (0);
2319 }
2320
2321 /*
2322  * NAME:        raid_admin_ioctl
2323  * DESCRIPTION: IOCTL operations unique to metadevices and RAID
2324  * PARAMETERS:  int       cmd - IOCTL command to be executed
2325  *              void    *data - pointer to IOCTL data structure
2326  *              int      mode - either FREAD or FWRITE
2327  *              IOLOCK *lockp - IOCTL read/write lock
2328  *
2329  * LOCKS:       none
2330  *
2331  */
2332 static int
2333 raid_admin_ioctl(
2334         int             cmd,
2335         void            *data,
2336         int             mode,
2337         IOLOCK          *lockp
2338 )
2339 {
2340         size_t          sz = 0;
2341         void            *d = NULL;
2342         int             err = 0;
2343
2344         /* We can only handle 32-bit clients for internal commands */
2345         if ((mode & DATAMODEL_MASK) != DATAMODEL_ILP32) {
2346                 return (EINVAL);
2347         }
2348
2349
2350         /* dispatch ioctl */
2351         switch (cmd) {
2352
2353         case MD_IOCSET:
2354         {
2355                 if (! (mode & FWRITE))
2356                         return (EACCES);
2357
2358                 sz = sizeof (md_set_params_t);
2359                 d = kmem_alloc(sz, KM_SLEEP);
2360
2361                 if (ddi_copyin(data, d, sz, mode)) {
2362                         err = EFAULT;
2363                         break;
2364                 }
2365
2366                 err = raid_set(d, mode);
2367                 break;
2368         }
2369
2370         case MD_IOCGET:
2371         {
2372                 if (! (mode & FREAD))
2373                         return (EACCES);
2374
2375                 sz = sizeof (md_i_get_t);
2376                 d = kmem_alloc(sz, KM_SLEEP);
2377
2378                 if (ddi_copyin(data, d, sz, mode)) {
2379                         err = EFAULT;
2380                         break;
2381                 }
2382
2383                 err = raid_get(d, mode, lockp);
2384                 break;
2385         }
2386
2387         case MD_IOCREPLACE:
2388         {
2389                 if (! (mode & FWRITE))
2390                         return (EACCES);
2391
2392                 sz = sizeof (replace_params_t);
2393                 d = kmem_alloc(sz, KM_SLEEP);
2394
2395                 if (ddi_copyin(data, d, sz, mode)) {
2396                         err = EFAULT;
2397                         break;
2398                 }
2399
2400                 err = raid_replace((replace_params_t *)d, lockp);
2401                 break;
2402         }
2403
2404         case MD_IOCSETSYNC:
2405         {
2406                 if (! (mode & FWRITE))
2407                         return (EACCES);
2408
2409                 sz = sizeof (md_resync_ioctl_t);
2410                 d = kmem_alloc(sz, KM_SLEEP);
2411
2412                 if (ddi_copyin(data, d, sz, mode)) {
2413                         err = EFAULT;
2414                         break;
2415                 }
2416
2417                 err = raid_set_sync((md_resync_ioctl_t *)d, lockp);
2418                 break;
2419         }
2420
2421         case MD_IOCGETSYNC:
2422         {
2423                 if (! (mode & FREAD))
2424                         return (EACCES);
2425
2426                 sz = sizeof (md_resync_ioctl_t);
2427                 d = kmem_alloc(sz, KM_SLEEP);
2428
2429                 if (ddi_copyin(data, d, sz, mode)) {
2430                         err = EFAULT;
2431                         break;
2432                 }
2433                 err = raid_get_resync((md_resync_ioctl_t *)d, lockp);
2434
2435                 break;
2436         }
2437
2438         case MD_IOCGROW:
2439         {
2440                 if (! (mode & FWRITE))
2441                         return (EACCES);
2442
2443                 sz = sizeof (md_grow_params_t);
2444                 d = kmem_alloc(sz, KM_SLEEP);
2445
2446                 if (ddi_copyin(data, d, sz, mode)) {
2447                         err = EFAULT;
2448                         break;
2449                 }
2450
2451                 err = raid_grow(d, mode, lockp);
2452                 break;
2453         }
2454
2455         case MD_IOCCHANGE:
2456         {
2457                 if (! (mode & FWRITE))
2458                         return (EACCES);
2459
2460                 sz = sizeof (md_raid_params_t);
2461                 d = kmem_alloc(sz, KM_SLEEP);
2462
2463                 if (ddi_copyin(data, d, sz, mode)) {
2464                         err = EFAULT;
2465                         break;
2466                 }
2467
2468                 err = raid_change((md_raid_params_t *)d, lockp);
2469                 break;
2470         }
2471
2472         case MD_IOCRESET:
2473         {
2474                 if (! (mode & FWRITE))
2475                         return (EACCES);
2476
2477                 sz = sizeof (md_i_reset_t);
2478                 d = kmem_alloc(sz, KM_SLEEP);
2479
2480                 if (ddi_copyin(data, d, sz, mode)) {
2481                         err = EFAULT;
2482                         break;
2483                 }
2484
2485                 err = raid_reset((md_i_reset_t *)d);
2486                 break;
2487         }
2488
2489         case MD_IOCGET_DEVS:
2490         {
2491                 if (! (mode & FREAD))
2492                         return (EACCES);
2493
2494                 sz = sizeof (md_getdevs_params_t);
2495                 d = kmem_alloc(sz, KM_SLEEP);
2496
2497                 if (ddi_copyin(data, d, sz, mode)) {
2498                         err = EFAULT;
2499                         break;
2500                 }
2501
2502                 err = raid_getdevs(d, mode, lockp);
2503                 break;
2504         }
2505
2506         case MD_IOCSETREGEN:
2507         {
2508                 if (! (mode & FWRITE))
2509                         return (EACCES);
2510
2511                 sz = sizeof (md_regen_param_t);
2512                 d = kmem_alloc(sz, KM_SLEEP);
2513
2514                 if (ddi_copyin(data, d, sz, mode)) {
2515                         err = EFAULT;
2516                         break;
2517                 }
2518
2519                 err = raid_regen((md_regen_param_t *)d, lockp);
2520                 break;
2521         }
2522
2523         case MD_IOCPROBE_DEV:
2524         {
2525                 md_probedev_impl_t      *p = NULL;
2526                 md_probedev_t           *ph = NULL;
2527                 daemon_queue_t          *hdr = NULL;
2528                 int                     i;
2529                 size_t                  sz1 = 0;
2530
2531
2532                 if (! (mode & FREAD))
2533                         return (EACCES);
2534
2535                 sz = sizeof (md_probedev_t);
2536
2537                 d = kmem_alloc(sz, KM_SLEEP);
2538
2539                 /* now copy in the data */
2540                 if (ddi_copyin(data, d, sz, mode)) {
2541                         err = EFAULT;
2542                         goto free_mem;
2543                 }
2544
2545                 /*
2546                  * Sanity test the args. Test name should have the keyword
2547                  * probe.
2548                  */
2549                 p = kmem_alloc(sizeof (md_probedev_impl_t), KM_SLEEP);
2550                 p->probe_sema = NULL;
2551                 p->probe_mx = NULL;
2552                 p->probe.mnum_list = (uint64_t)NULL;
2553
2554                 ph = (md_probedev_t *)d;
2555                 p->probe.nmdevs = ph->nmdevs;
2556                 (void) strcpy(p->probe.test_name, ph->test_name);
2557                 bcopy(&ph->md_driver, &(p->probe.md_driver),
2558                     sizeof (md_driver_t));
2559
2560                 if ((p->probe.nmdevs < 1) ||
2561                     (strstr(p->probe.test_name, "probe") == NULL)) {
2562                         err = EINVAL;
2563                         goto free_mem;
2564                 }
2565
2566                 sz1 = sizeof (minor_t) * p->probe.nmdevs;
2567
2568                 p->probe.mnum_list = (uint64_t)(uintptr_t)kmem_alloc(sz1,
2569                     KM_SLEEP);
2570
2571                 if (ddi_copyin((caddr_t)(uintptr_t)ph->mnum_list,
2572                     (caddr_t)(uintptr_t)p->probe.mnum_list, sz1, mode)) {
2573                         err = EFAULT;
2574                         goto free_mem;
2575                 }
2576
2577                 if (err = md_init_probereq(p, &hdr))
2578                         goto free_mem;
2579
2580                 /*
2581                  * put the request on the queue and wait.
2582                  */
2583
2584                 daemon_request_new(&md_ff_daemonq, md_probe_one, hdr, REQ_NEW);
2585
2586                 (void) IOLOCK_RETURN(0, lockp);
2587                 /* wait for the events to occur */
2588                 for (i = 0; i < p->probe.nmdevs; i++) {
2589                         sema_p(PROBE_SEMA(p));
2590                 }
2591                 while (md_ioctl_lock_enter() == EINTR)
2592                         ;
2593
2594                 /*
2595                  * clean up. The hdr list is freed in the probe routines
2596                  * since the list is NULL by the time we get here.
2597                  */
2598 free_mem:
2599                 if (p) {
2600                         if (p->probe_sema != NULL) {
2601                                 sema_destroy(PROBE_SEMA(p));
2602                                 kmem_free(p->probe_sema, sizeof (ksema_t));
2603                         }
2604                         if (p->probe_mx != NULL) {
2605                                 mutex_destroy(PROBE_MX(p));
2606                                 kmem_free(p->probe_mx, sizeof (kmutex_t));
2607                         }
2608                         if (p->probe.mnum_list)
2609                                 kmem_free((caddr_t)(uintptr_t)
2610                                     p->probe.mnum_list, sz1);
2611
2612                         kmem_free(p, sizeof (md_probedev_impl_t));
2613                 }
2614                 break;
2615         }
2616
2617         default:
2618                 return (ENOTTY);
2619         }
2620
2621         /*
2622          * copyout and free any args
2623          */
2624         if (sz != 0) {
2625                 if (err == 0) {
2626                         if (ddi_copyout(d, data, sz, mode) != 0) {
2627                                 err = EFAULT;
2628                         }
2629                 }
2630                 kmem_free(d, sz);
2631         }
2632         return (err);
2633 }
2634
2635 /*
2636  * NAME:        md_raid_ioctl
2637  * DESCRIPTION: RAID metadevice IOCTL operations entry point.
2638  * PARAMETERS:  md_dev64_t dev - RAID device identifier
2639  *              int       cmd  - IOCTL command to be executed
2640  *              void    *data  - pointer to IOCTL data structure
2641  *              int      mode  - either FREAD or FWRITE
2642  *              IOLOCK *lockp  - IOCTL read/write lock
2643  *
2644  * LOCKS:       none
2645  *
2646  */
2647 int
2648 md_raid_ioctl(
2649         dev_t           dev,
2650         int             cmd,
2651         void            *data,
2652         int             mode,
2653         IOLOCK          *lockp
2654 )
2655 {
2656         minor_t         mnum = getminor(dev);
2657         mr_unit_t       *un;
2658         int             err = 0;
2659
2660         /* handle admin ioctls */
2661         if (mnum == MD_ADM_MINOR)
2662                 return (raid_admin_ioctl(cmd, data, mode, lockp));
2663
2664         /* check unit */
2665         if ((MD_MIN2SET(mnum) >= md_nsets) ||
2666             (MD_MIN2UNIT(mnum) >= md_nunits) ||
2667             ((un = MD_UNIT(mnum)) == NULL))
2668                 return (ENXIO);
2669
2670         /* is this a supported ioctl? */
2671         err = md_check_ioctl_against_unit(cmd, un->c);
2672         if (err != 0) {
2673                 return (err);
2674         }
2675
2676         /* dispatch ioctl */
2677         switch (cmd) {
2678
2679         case DKIOCINFO:
2680         {
2681                 struct dk_cinfo *p;
2682
2683                 if (! (mode & FREAD))
2684                         return (EACCES);
2685
2686                 p = kmem_alloc(sizeof (*p), KM_SLEEP);
2687
2688                 get_info(p, mnum);
2689                 if (ddi_copyout((caddr_t)p, data, sizeof (*p), mode) != 0)
2690                         err = EFAULT;
2691
2692                 kmem_free(p, sizeof (*p));
2693                 return (err);
2694         }
2695
2696         case DKIOCGMEDIAINFO:
2697         {
2698                 struct dk_minfo p;
2699
2700                 if (! (mode & FREAD))
2701                         return (EACCES);
2702
2703                 get_minfo(&p, mnum);
2704                 if (ddi_copyout(&p, data, sizeof (struct dk_minfo), mode) != 0)
2705                         err = EFAULT;
2706
2707                 return (err);
2708         }
2709
2710         case DKIOCGGEOM:
2711         {
2712                 struct dk_geom  *p;
2713
2714                 if (! (mode & FREAD))
2715                         return (EACCES);
2716
2717                 p = kmem_alloc(sizeof (*p), KM_SLEEP);
2718
2719                 if ((err = raid_get_geom(un, p)) == 0) {
2720                         if (ddi_copyout((caddr_t)p, data, sizeof (*p),
2721                             mode) != 0)
2722                                 err = EFAULT;
2723                 }
2724
2725                 kmem_free(p, sizeof (*p));
2726                 return (err);
2727         }
2728
2729         case DKIOCGVTOC:
2730         {
2731                 struct vtoc     *vtoc;
2732
2733                 if (! (mode & FREAD))
2734                         return (EACCES);
2735
2736                 vtoc = kmem_zalloc(sizeof (*vtoc), KM_SLEEP);
2737                 if ((err = raid_get_vtoc(un, vtoc)) != 0) {
2738                         kmem_free(vtoc, sizeof (*vtoc));
2739                         return (err);
2740                 }
2741
2742                 if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
2743                         if (ddi_copyout(vtoc, data, sizeof (*vtoc), mode))
2744                                 err = EFAULT;
2745                 }
2746 #ifdef _SYSCALL32
2747                 else {
2748                         struct vtoc32   *vtoc32;
2749
2750                         vtoc32 = kmem_zalloc(sizeof (*vtoc32), KM_SLEEP);
2751
2752                         vtoctovtoc32((*vtoc), (*vtoc32));
2753                         if (ddi_copyout(vtoc32, data, sizeof (*vtoc32), mode))
2754                                 err = EFAULT;
2755                         kmem_free(vtoc32, sizeof (*vtoc32));
2756                 }
2757 #endif /* _SYSCALL32 */
2758
2759                 kmem_free(vtoc, sizeof (*vtoc));
2760                 return (err);
2761         }
2762
2763         case DKIOCSVTOC:
2764         {
2765                 struct vtoc     *vtoc;
2766
2767                 if (! (mode & FWRITE))
2768                         return (EACCES);
2769
2770                 vtoc = kmem_zalloc(sizeof (*vtoc), KM_SLEEP);
2771                 if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
2772                         if (ddi_copyin(data, vtoc, sizeof (*vtoc), mode)) {
2773                                 err = EFAULT;
2774                         }
2775                 }
2776 #ifdef _SYSCALL32
2777                 else {
2778                         struct vtoc32   *vtoc32;
2779
2780                         vtoc32 = kmem_zalloc(sizeof (*vtoc32), KM_SLEEP);
2781
2782                         if (ddi_copyin(data, vtoc32, sizeof (*vtoc32), mode)) {
2783                                 err = EFAULT;
2784                         } else {
2785                                 vtoc32tovtoc((*vtoc32), (*vtoc));
2786                         }
2787                         kmem_free(vtoc32, sizeof (*vtoc32));
2788                 }
2789 #endif /* _SYSCALL32 */
2790
2791                 if (err == 0)
2792                         err = raid_set_vtoc(un, vtoc);
2793
2794                 kmem_free(vtoc, sizeof (*vtoc));
2795                 return (err);
2796         }
2797
2798         case DKIOCGEXTVTOC:
2799         {
2800                 struct extvtoc  *extvtoc;
2801
2802                 if (! (mode & FREAD))
2803                         return (EACCES);
2804
2805                 extvtoc = kmem_zalloc(sizeof (*extvtoc), KM_SLEEP);
2806                 if ((err = raid_get_extvtoc(un, extvtoc)) != 0) {
2807                         kmem_free(extvtoc, sizeof (*extvtoc));
2808                         return (err);
2809                 }
2810
2811                 if (ddi_copyout(extvtoc, data, sizeof (*extvtoc), mode))
2812                         err = EFAULT;
2813
2814                 kmem_free(extvtoc, sizeof (*extvtoc));
2815                 return (err);
2816         }
2817
2818         case DKIOCSEXTVTOC:
2819         {
2820                 struct extvtoc  *extvtoc;
2821
2822                 if (! (mode & FWRITE))
2823                         return (EACCES);
2824
2825                 extvtoc = kmem_zalloc(sizeof (*extvtoc), KM_SLEEP);
2826                 if (ddi_copyin(data, extvtoc, sizeof (*extvtoc), mode)) {
2827                         err = EFAULT;
2828                 }
2829
2830                 if (err == 0)
2831                         err = raid_set_extvtoc(un, extvtoc);
2832
2833                 kmem_free(extvtoc, sizeof (*extvtoc));
2834                 return (err);
2835         }
2836
2837         case DKIOCGAPART:
2838         {
2839                 struct dk_map   dmp;
2840
2841                 if ((err = raid_get_cgapart(un, &dmp)) != 0) {
2842                         return (err);
2843                 }
2844
2845                 if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
2846                         if (ddi_copyout((caddr_t)&dmp, data, sizeof (dmp),
2847                             mode) != 0)
2848                                 err = EFAULT;
2849                 }
2850 #ifdef _SYSCALL32
2851                 else {
2852                         struct dk_map32 dmp32;
2853
2854                         dmp32.dkl_cylno = dmp.dkl_cylno;
2855                         dmp32.dkl_nblk = dmp.dkl_nblk;
2856
2857                         if (ddi_copyout((caddr_t)&dmp32, data, sizeof (dmp32),
2858                             mode) != 0)
2859                                 err = EFAULT;
2860                 }
2861 #endif /* _SYSCALL32 */
2862
2863                 return (err);
2864         }
2865         case DKIOCGETEFI:
2866         {
2867                 /*
2868                  * This one can be done centralized,
2869                  * no need to put in the same code for all types of metadevices
2870                  */
2871                 return (md_dkiocgetefi(mnum, data, mode));
2872         }
2873
2874         case DKIOCSETEFI:
2875         {
2876                 /*
2877                  * This one can be done centralized,
2878                  * no need to put in the same code for all types of metadevices
2879                  */
2880                 return (md_dkiocsetefi(mnum, data, mode));
2881         }
2882
2883         case DKIOCPARTITION:
2884         {
2885                 return (md_dkiocpartition(mnum, data, mode));
2886         }
2887
2888         default:
2889                 return (ENOTTY);
2890         }
2891 }
2892
2893 /*
2894  * rename/exchange named service entry points and support functions follow.
2895  * Most functions are handled generically, except for raid-specific locking
2896  * and checking
2897  */
2898
2899 /*
2900  * NAME:        raid_may_renexch_self
2901  * DESCRIPTION: support routine for rename check ("MDRNM_CHECK") named service
2902  * PARAMETERS:  mr_unit_t       *un - unit struct of raid unit to be renamed
2903  *              mdi_unit_t      *ui - in-core unit struct of same raid unit
2904  *              md_rentxn_t     *rtxnp - rename transaction state
2905  *
2906  * LOCKS:       none
2907  *
2908  */
2909 static int
2910 raid_may_renexch_self(
2911         mr_unit_t       *un,
2912         mdi_unit_t      *ui,
2913         md_rentxn_t     *rtxnp)
2914 {
2915         minor_t from_min;
2916         minor_t to_min;
2917         bool_t  toplevel;
2918         bool_t  related;
2919
2920         from_min = rtxnp->from.mnum;
2921         to_min = rtxnp->to.mnum;
2922
2923         if (!un || !ui) {
2924                 (void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
2925                     from_min);
2926                 return (EINVAL);
2927         }
2928
2929         ASSERT(!(MD_CAPAB(un) & MD_CAN_META_CHILD));
2930         if (MD_CAPAB(un) & MD_CAN_META_CHILD) {
2931                 (void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
2932                 return (EINVAL);
2933         }
2934
2935         if (MD_PARENT(un) == MD_MULTI_PARENT) {
2936                 (void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
2937                 return (EINVAL);
2938         }
2939
2940         toplevel = !MD_HAS_PARENT(MD_PARENT(un));
2941
2942         /* we're related if trying to swap with our parent */
2943         related = (!toplevel) && (MD_PARENT(un) == to_min);
2944
2945         switch (rtxnp->op) {
2946         case MDRNOP_EXCHANGE:
2947
2948                 if (!related) {
2949                         (void) mdmderror(&rtxnp->mde,
2950                             MDE_RENAME_TARGET_UNRELATED, to_min);
2951                         return (EINVAL);
2952                 }
2953
2954                 break;
2955
2956         case MDRNOP_RENAME:
2957                 /*
2958                  * if from is top-level and is open, then the kernel is using
2959                  * the md_dev64_t.
2960                  */
2961
2962                 if (toplevel && md_unit_isopen(ui)) {
2963                         (void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
2964                             from_min);
2965                         return (EBUSY);
2966                 }
2967                 break;
2968
2969         default:
2970                 (void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
2971                     from_min);
2972                 return (EINVAL);
2973         }
2974
2975         return (0);     /* ok */
2976 }
2977
2978 /*
2979  * NAME:        raid_rename_check
2980  * DESCRIPTION: ("MDRNM_CHECK") rename/exchange named service entry point
2981  * PARAMETERS:  md_rendelta_t   *delta - describes changes to be made to this
2982  *                                       raid device for rename transaction
2983  *              md_rentxn_t     *rtxnp - rename transaction state
2984  *
2985  * LOCKS:       none
2986  *
2987  */
2988 intptr_t
2989 raid_rename_check(
2990         md_rendelta_t   *delta,
2991         md_rentxn_t     *rtxnp)
2992 {
2993         int              err    = 0;
2994         int              column;
2995         mr_unit_t       *un;
2996
2997         ASSERT(delta);
2998         ASSERT(rtxnp);
2999         ASSERT(delta->unp);
3000         ASSERT(delta->uip);
3001
3002         if (!delta || !rtxnp || !delta->unp || !delta->uip) {
3003                 (void) mdsyserror(&rtxnp->mde, EINVAL);
3004                 return (EINVAL);
3005         }
3006
3007         un = (mr_unit_t *)delta->unp;
3008
3009         for (column = 0; column < un->un_totalcolumncnt; column++) {
3010                 rcs_state_t     colstate;
3011
3012                 colstate = un->un_column[column].un_devstate;
3013
3014                 if (colstate & RCS_LAST_ERRED) {
3015                         (void) mdmderror(&rtxnp->mde, MDE_RAID_LAST_ERRED,
3016                             md_getminor(delta->dev));
3017                         return (EINVAL);
3018                 }
3019
3020                 if (colstate & RCS_INIT_ERRED) {
3021                         (void) mdmderror(&rtxnp->mde, MDE_RAID_DOI,
3022                             md_getminor(delta->dev));
3023                         return (EINVAL);
3024                 }
3025
3026                 /* How did we get this far before detecting this? */
3027                 if (colstate & RCS_RESYNC) {
3028                         (void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
3029                             md_getminor(delta->dev));
3030                         return (EBUSY);
3031                 }
3032
3033                 if (colstate & RCS_ERRED) {
3034                         (void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
3035                             md_getminor(delta->dev));
3036                         return (EINVAL);
3037                 }
3038
3039                 if (!(colstate & RCS_OKAY)) {
3040                         (void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
3041                             md_getminor(delta->dev));
3042                         return (EINVAL);
3043                 }
3044
3045                 if (HOTSPARED(un, column)) {
3046                         (void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
3047                             md_getminor(delta->dev));
3048                         return (EINVAL);
3049                 }
3050         }
3051
3052         /* self does additional checks */
3053         if (delta->old_role == MDRR_SELF) {
3054                 err = raid_may_renexch_self((mr_unit_t *)delta->unp,
3055                     delta->uip, rtxnp);
3056         }
3057         return (err);
3058 }
3059
3060 /*
3061  * NAME:        raid_rename_lock
3062  * DESCRIPTION: ("MDRNM_LOCK") rename/exchange named service entry point
3063  * PARAMETERS:  md_rendelta_t   *delta - describes changes to be made to this
3064  *                                       raid device for rename transaction
3065  *              md_rentxn_t     *rtxnp - rename transaction state
3066  *
3067  * LOCKS:       io and unit locks (taken explicitly *not* via ioctl wrappers)
3068  *
3069  */
3070 intptr_t
3071 raid_rename_lock(
3072         md_rendelta_t   *delta,
3073         md_rentxn_t     *rtxnp)
3074 {
3075         minor_t         mnum;
3076
3077         ASSERT(delta);
3078         ASSERT(rtxnp);
3079
3080         mnum = md_getminor(delta->dev);
3081         if (mnum == rtxnp->to.mnum && rtxnp->op == MDRNOP_RENAME) {
3082                 return (0);
3083         }
3084
3085         ASSERT(delta->uip);
3086         if (!delta->uip) {
3087                 (void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, mnum);
3088                 return (ENODEV);
3089         }
3090
3091         ASSERT(delta->unp);
3092         if (!delta->unp) {
3093
3094                 return (ENODEV);
3095         }
3096
3097         ASSERT(!IO_WRITER_HELD(delta->unp));
3098         (void) md_io_writerlock(delta->uip);
3099         ASSERT(IO_WRITER_HELD(delta->unp));
3100
3101
3102         ASSERT(!UNIT_WRITER_HELD(delta->unp));
3103         (void) md_unit_writerlock(delta->uip);
3104         ASSERT(UNIT_WRITER_HELD(delta->unp));
3105
3106         return (0);
3107 }
3108
3109 /*
3110  * NAME:        raid_rename_unlock
3111  * DESCRIPTION: ("MDRNM_UNLOCK") rename/exchange named service entry point
3112  * PARAMETERS:  md_rendelta_t   *delta - describes changes to be made to this
3113  *                                       raid device for rename transaction
3114  *              md_rentxn_t     *rtxnp - rename transaction state
3115  *
3116  * LOCKS:       drops io and unit locks
3117  *
3118  */
3119 /* ARGSUSED */
3120 void
3121 raid_rename_unlock(
3122         md_rendelta_t   *delta,
3123         md_rentxn_t     *rtxnp)
3124 {
3125         mr_unit_t       *un = (mr_unit_t *)delta->unp;
3126         minor_t         mnum = MD_SID(un);
3127         int             col;
3128
3129         ASSERT(delta);
3130         ASSERT(delta->unp);
3131         ASSERT(delta->uip);
3132
3133         ASSERT(UNIT_WRITER_HELD(delta->unp));
3134         md_unit_writerexit(delta->uip);
3135         ASSERT(!UNIT_WRITER_HELD(delta->unp));
3136
3137         if (! (delta->txn_stat.role_swapped) || ! (delta->txn_stat.is_open)) {
3138                 goto out;
3139         }
3140         if (raid_internal_open(mnum, (FREAD | FWRITE),
3141             OTYP_LYR, MD_OFLG_ISINIT) == 0) {
3142                 for (col = 0; col < un->un_totalcolumncnt; col++) {
3143                         if (un->un_column[col].un_devstate & RCS_OKAY)
3144                                 (void) init_pw_area(un,
3145                                     un->un_column[col].un_dev,
3146                                     un->un_column[col].un_pwstart, col);
3147                 }
3148                 (void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
3149         }
3150
3151 out:
3152         ASSERT(IO_WRITER_HELD(delta->unp));
3153         md_io_writerexit(delta->uip);
3154         ASSERT(!IO_WRITER_HELD(delta->unp));
3155 }
3156 /* end of rename/exchange named service and support functions */