7712 mandoc -Tlint does always exit with error code 0
[unleashed.git] / usr / src / lib / lvm / libmeta / common / meta_set_drv.c
blob4c4eaf0d78c3943e9f34d584173e1a9ccddd9041
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #pragma ident "%Z%%M% %I% %E% SMI"
/*
 * Metadevice diskset interfaces
 */
32 #include <meta.h>
33 #include <mdmn_changelog.h>
34 #include "meta_set_prv.h"
35 #include "meta_repartition.h"
37 static int
38 check_setnodes_againstdrivelist(
39 mdsetname_t *sp,
40 mddrivenamelist_t *dnlp,
41 md_error_t *ep
44 md_set_desc *sd;
45 mddrivenamelist_t *p;
46 int i;
47 md_mnnode_desc *nd;
49 if ((sd = metaget_setdesc(sp, ep)) == NULL)
50 return (-1);
52 if (MD_MNSET_DESC(sd)) {
53 nd = sd->sd_nodelist;
54 while (nd) {
55 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
56 nd = nd->nd_next;
57 continue;
59 for (p = dnlp; p != NULL; p = p->next)
60 if (checkdrive_onnode(sp, p->drivenamep,
61 nd->nd_nodename, ep))
62 return (-1);
63 nd = nd->nd_next;
65 } else {
66 for (i = 0; i < MD_MAXSIDES; i++) {
67 /* Skip empty slots */
68 if (sd->sd_nodes[i][0] == '\0')
69 continue;
71 for (p = dnlp; p != NULL; p = p->next)
72 if (checkdrive_onnode(sp, p->drivenamep,
73 sd->sd_nodes[i], ep))
74 return (-1);
77 return (0);
80 static int
81 drvsuniq(mdsetname_t *sp, mddrivenamelist_t *dnlp, md_error_t *ep)
83 mddrivenamelist_t *dl1, *dl2;
84 mddrivename_t *dn1, *dn2;
86 for (dl1 = dnlp; dl1 != NULL; dl1 = dl1->next) {
87 dn1 = dl1->drivenamep;
89 for (dl2 = dl1->next; dl2 != NULL; dl2 = dl2->next) {
90 dn2 = dl2->drivenamep;
91 if (strcmp(dn1->cname, dn2->cname) != 0)
92 continue;
94 return (mddserror(ep, MDE_DS_DUPDRIVE, sp->setno,
95 NULL, dn1->cname, sp->setname));
98 return (0);
101 static md_drive_desc *
102 metaget_drivedesc_fromdrivelist(
103 mdsetname_t *sp,
104 mddrivenamelist_t *dnlp,
105 uint_t flags,
106 md_error_t *ep
109 mddrivenamelist_t *p;
110 md_drive_desc *dd = NULL;
111 md_set_desc *sd;
113 if ((sd = metaget_setdesc(sp, ep)) == NULL)
114 return (NULL);
116 for (p = dnlp; p != NULL; p = p->next) {
117 (void) metadrivedesc_append(&dd, p->drivenamep, 0, 0,
118 sd->sd_ctime, sd->sd_genid, flags);
121 return (dd);
/*
 * Exported Entry Points
 */
129 meta_make_sidenmlist(
130 mdsetname_t *sp,
131 mddrivename_t *dnp,
132 int import_flag, /* flags partial import */
133 md_im_drive_info_t *midp, /* import drive information */
134 md_error_t *ep
137 mdsidenames_t *sn, **sn_next;
138 mdname_t *np;
139 int done;
140 side_t sideno = MD_SIDEWILD;
141 uint_t rep_slice;
142 char *bname;
144 if (!import_flag) {
146 * Normal (aka NOT partial import) code path.
148 if (meta_replicaslice(dnp, &rep_slice, ep) != 0) {
149 return (-1);
152 dnp->side_names_key = MD_KEYWILD;
154 if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
155 return (-1);
156 bname = Strdup(np->bname);
157 } else {
159 * When doing a partial import, we'll get the needed
160 * information from somewhere other than the system.
162 dnp->side_names_key = MD_KEYWILD;
163 bname = Strdup(midp->mid_devname);
165 metaflushsidenames(dnp);
166 sn_next = &dnp->side_names;
167 /*CONSTCOND*/
168 while (1) {
169 sn = Zalloc(sizeof (*sn));
171 if ((done = meta_getnextside_devinfo(sp, bname, &sideno,
172 &sn->cname, &sn->dname, &sn->mnum, ep)) == -1) {
173 if (import_flag) {
174 mdclrerror(ep);
175 sn->dname = Strdup(midp->mid_driver_name);
176 sn->mnum = midp->mid_mnum;
177 } else {
178 Free(sn);
179 Free(bname);
180 return (-1);
184 if (done == 0) {
185 Free(sn);
186 Free(bname);
187 return (0);
190 sn->sideno = sideno;
192 /* Add to the end of the linked list */
193 assert(*sn_next == NULL);
194 *sn_next = sn;
195 sn_next = &sn->next;
197 /*NOTREACHED*/
201 meta_set_adddrives(
202 mdsetname_t *sp,
203 mddrivenamelist_t *dnlp,
204 daddr_t dbsize,
205 int force_label,
206 md_error_t *ep
209 md_set_desc *sd;
210 md_drive_desc *dd = NULL, *curdd = NULL, *ddp;
211 int i;
212 mddrivenamelist_t *p;
213 mhd_mhiargs_t mhiargs;
214 int rval = 0;
215 md_timeval32_t now;
216 sigset_t oldsigs;
217 ulong_t genid;
218 ulong_t max_genid = 0;
219 md_setkey_t *cl_sk;
220 int rb_level = 0;
221 md_error_t xep = mdnullerror;
222 md_mnnode_desc *nd;
223 int suspendall_flag = 0;
224 int suspend1_flag = 0;
225 int lock_flag = 0;
226 int flush_set_onerr = 0;
227 md_replicalist_t *rlp = NULL, *rl;
229 if ((sd = metaget_setdesc(sp, ep)) == NULL)
230 return (-1);
232 /* Make sure we own the set */
233 if (meta_check_ownership(sp, ep) != 0)
234 return (-1);
237 * The drive and node records are stored in the local mddbs of each
238 * node in the diskset. Each node's rpc.metad daemon reads in the set,
239 * drive and node records from that node's local mddb and caches them
240 * internally. Any process needing diskset information contacts its
241 * local rpc.metad to get this information. Since each node in the
242 * diskset is independently reading the set information from its local
243 * mddb, the set, drive and node records in the local mddbs must stay
244 * in-sync, so that all nodes have a consistent view of the diskset.
246 * For a multinode diskset, explicitly verify that all nodes in the
247 * diskset are ALIVE (i.e. are in the API membership list). Otherwise,
248 * fail this operation since all nodes must be ALIVE in order to add
249 * the new drive record to their local mddb. If a panic of this node
250 * leaves the local mddbs set, node and drive records out-of-sync, the
251 * reconfig cycle will fix the local mddbs and force them back into
252 * synchronization.
254 if (MD_MNSET_DESC(sd)) {
255 nd = sd->sd_nodelist;
256 while (nd) {
257 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
258 (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
259 sp->setno,
260 nd->nd_nodename, NULL, sp->setname);
261 return (-1);
263 nd = nd->nd_next;
267 if (drvsuniq(sp, dnlp, ep) == -1)
268 return (-1);
271 * Lock the set on current set members.
272 * Set locking done much earlier for MN diskset than for traditional
273 * diskset since lock_set and SUSPEND are used to protect against
274 * other meta* commands running on the other nodes.
276 if (MD_MNSET_DESC(sd)) {
277 /* Make sure we are blocking all signals */
278 if (procsigs(TRUE, &oldsigs, &xep) < 0)
279 mdclrerror(&xep);
281 nd = sd->sd_nodelist;
282 /* All nodes are guaranteed to be ALIVE */
283 while (nd) {
284 if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
285 rval = -1;
286 goto out;
288 lock_flag = 1;
289 nd = nd->nd_next;
292 * Lock out other meta* commands by suspending
293 * class 1 messages across the diskset.
295 nd = sd->sd_nodelist;
296 /* All nodes are guaranteed to be ALIVE */
297 while (nd) {
298 if (clnt_mdcommdctl(nd->nd_nodename,
299 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
300 MD_MSCF_NO_FLAGS, ep)) {
301 rval = -1;
302 goto out;
304 suspend1_flag = 1;
305 nd = nd->nd_next;
309 if (check_setnodes_againstdrivelist(sp, dnlp, ep)) {
310 rval = -1;
311 goto out;
314 for (p = dnlp; p != NULL; p = p->next) {
315 mdsetname_t *tmp;
317 if (meta_is_drive_in_anyset(p->drivenamep, &tmp, FALSE,
318 ep) == -1) {
319 rval = -1;
320 goto out;
323 if (tmp != NULL) {
324 (void) mddserror(ep, MDE_DS_DRIVEINSET, sp->setno,
325 tmp->setname, p->drivenamep->cname, sp->setname);
326 rval = -1;
327 goto out;
331 /* END CHECK CODE */
334 * This is a separate loop (from above) so that we validate all the
335 * drives handed to us before we repartition any one drive.
337 for (p = dnlp; p != NULL; p = p->next) {
338 if (meta_repartition_drive(sp,
339 p->drivenamep, force_label == TRUE ? MD_REPART_FORCE : 0,
340 NULL, /* Don't return the VTOC. */
341 ep) != 0) {
342 rval = -1;
343 goto out;
346 * Create the names for the drives we are adding per side.
348 if (meta_make_sidenmlist(sp, p->drivenamep, 0, NULL,
349 ep) == -1) {
350 rval = -1;
351 goto out;
356 * Get the list of drives descriptors that we are adding.
358 dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_ADD, ep);
360 if (! mdisok(ep)) {
361 rval = -1;
362 goto out;
366 * Get the set timeout information.
368 (void) memset(&mhiargs, '\0', sizeof (mhiargs));
369 if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
370 rval = -1;
371 goto out;
375 * Get timestamp and generation id for new records
377 now = sd->sd_ctime;
378 genid = sd->sd_genid;
381 /* At this point, in case of error, set should be flushed. */
382 flush_set_onerr = 1;
384 /* Lock the set on current set members */
385 if (!(MD_MNSET_DESC(sd))) {
386 md_rb_sig_handling_on();
387 for (i = 0; i < MD_MAXSIDES; i++) {
388 /* Skip empty slots */
389 if (sd->sd_nodes[i][0] == '\0')
390 continue;
392 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
393 rval = -1;
394 goto out;
396 lock_flag = 1;
401 * Get drive descriptors for the drives that are currently in the set.
403 curdd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep);
404 if (! mdisok(ep))
405 goto rollback;
408 * If first drive being added to set, set the mastership
409 * of the multinode diskset to be this node.
410 * Only set it on this node. If all goes well
411 * and there are no errors, the mastership of this node will be set
412 * on all nodes in user space and in the kernel.
414 if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
415 if (clnt_mnsetmaster(mynode(), sp,
416 sd->sd_mn_mynode->nd_nodename,
417 sd->sd_mn_mynode->nd_nodeid, ep)) {
418 goto rollback;
421 * Set this up in my local cache of the set desc so that
422 * the set descriptor won't have to be gotten again from
423 * rpc.metad. If it is flushed and gotten again, these
424 * values will be set in sr2setdesc.
426 sd->sd_mn_master_nodeid = sd->sd_mn_mynode->nd_nodeid;
427 (void) strcpy(sd->sd_mn_master_nodenm,
428 sd->sd_mn_mynode->nd_nodename);
429 sd->sd_mn_am_i_master = 1;
432 RB_TEST(1, "adddrives", ep)
434 RB_PREEMPT;
435 rb_level = 1; /* level 1 */
437 RB_TEST(2, "adddrives", ep)
440 * Add the drive records for the drives that we are adding to
441 * each host in the set. Marks the drive as MD_DR_ADD.
443 if (MD_MNSET_DESC(sd)) {
444 nd = sd->sd_nodelist;
445 /* All nodes are guaranteed to be ALIVE */
446 while (nd) {
447 if (clnt_adddrvs(nd->nd_nodename, sp, dd, now, genid,
448 ep) == -1)
449 goto rollback;
451 RB_TEST(3, "adddrives", ep)
452 nd = nd->nd_next;
454 } else {
455 for (i = 0; i < MD_MAXSIDES; i++) {
456 /* Skip empty slots */
457 if (sd->sd_nodes[i][0] == '\0')
458 continue;
460 if (clnt_adddrvs(sd->sd_nodes[i], sp, dd, now, genid,
461 ep) == -1)
462 goto rollback;
464 RB_TEST(3, "adddrives", ep)
468 RB_TEST(4, "adddrives", ep)
470 RB_PREEMPT;
471 rb_level = 2; /* level 2 */
473 RB_TEST(5, "adddrives", ep)
476 * Take ownership of the added drives.
478 if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
479 if (tk_own_bydd(sp, dd, &mhiargs, TRUE, ep))
480 goto rollback;
484 * If this is not a MN set and the state flags do not indicate the
485 * presence of devids, update the set records on all nodes.
487 if (!(sd->sd_flags & MD_SR_MB_DEVID) && !(MD_MNSET_DESC(sd))) {
488 if (meta_update_mb(sp, dd, ep) == 0) {
489 mdclrerror(ep);
491 /* update the sr_flags on all hosts */
492 for (i = 0; i < MD_MAXSIDES; i++) {
493 if (sd->sd_nodes[i][0] == '\0')
494 continue;
496 if (clnt_upd_sr_flags(sd->sd_nodes[i],
497 sp, (sd->sd_flags | MD_SR_MB_DEVID), ep))
498 goto rollback;
503 RB_TEST(6, "adddrives", ep)
505 RB_PREEMPT;
506 rb_level = 3; /* level 3 */
508 RB_TEST(7, "adddrives", ep)
511 * Balance the DB's according to the list of existing drives and the
512 * list of added drives.
514 if ((rval = meta_db_balance(sp, dd, curdd, dbsize, ep)) == -1)
515 goto rollback;
518 * Slam a dummy master block on all the disks that we are adding
519 * that don't have replicas on them.
520 * Used by diskset import if the disksets are remotely replicated
522 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) >= 0) {
523 for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
524 uint_t rep_slice;
525 int fd = -1;
526 mdname_t *np = NULL;
527 char *drive_name;
529 drive_name = ddp->dd_dnp->cname;
531 for (rl = rlp; rl != NULL; rl = rl->rl_next) {
532 char *rep_name;
534 rep_name =
535 rl->rl_repp->r_namep->drivenamep->cname;
537 if (strcmp(drive_name, rep_name) == 0) {
539 * Disk has a replica on it so don't
540 * add dummy master block.
542 break;
545 if (rl == NULL) {
547 * Drive doesn't have a replica on it so
548 * we need a dummy master block. Add it.
550 if (meta_replicaslice(ddp->dd_dnp, &rep_slice,
551 &xep) != 0) {
552 mdclrerror(&xep);
553 continue;
556 if ((np = metaslicename(ddp->dd_dnp, rep_slice,
557 &xep)) == NULL) {
558 mdclrerror(&xep);
559 continue;
562 if ((fd = open(np->rname, O_RDWR)) >= 0) {
563 meta_mkdummymaster(sp, fd, 16);
564 (void) close(fd);
570 if ((curdd == NULL) && (MD_MNSET_DESC(sd))) {
572 * Notify rpc.mdcommd on all nodes of a nodelist change.
573 * Start by suspending rpc.mdcommd (which drains it of all
574 * messages), then change the nodelist followed by a reinit
575 * and resume.
577 nd = sd->sd_nodelist;
578 /* All nodes are guaranteed to be ALIVE */
579 while (nd) {
580 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
581 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
582 rval = -1;
583 goto out;
585 suspendall_flag = 1;
586 nd = nd->nd_next;
591 * If a MN diskset and this is the first disk(s) being added
592 * to set, then pre-allocate change log records here.
593 * When the other nodes are joined into the MN diskset, the
594 * USER records will just be snarfed in.
596 if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
597 if (mdmn_allocate_changelog(sp, ep) != 0)
598 goto rollback;
602 * Mark the drives MD_DR_OK.
603 * If first drive being added to MN diskset, then set
604 * master on all nodes to be this node and then join
605 * all alive nodes (nodes in membership list) to set.
607 if (MD_MNSET_DESC(sd)) {
608 nd = sd->sd_nodelist;
609 /* All nodes are guaranteed to be ALIVE */
610 while (nd) {
611 /* don't set master on this node - done earlier */
612 if ((curdd == NULL) && (nd->nd_nodeid !=
613 sd->sd_mn_mynode->nd_nodeid)) {
615 * Set master on all alive nodes since
616 * all alive nodes will become joined nodes.
618 if (clnt_mnsetmaster(nd->nd_nodename, sp,
619 sd->sd_mn_mynode->nd_nodename,
620 sd->sd_mn_mynode->nd_nodeid, ep)) {
621 goto rollback;
625 if (curdd == NULL) {
627 * No special flags for join set. Since
628 * all nodes are joining if 1st drive is being
629 * added to set then all nodes will be either
630 * STALE or non-STALE and each node can
631 * determine this on its own.
633 if (clnt_joinset(nd->nd_nodename, sp,
634 NULL, ep)) {
635 goto rollback;
637 /* Sets join node flag on all nodes in list */
638 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
639 sd->sd_nodelist, MD_NR_JOIN, NULL, ep)) {
640 goto rollback;
645 * Set MD_DR_OK as last thing before unlock.
646 * In case of panic on this node, recovery
647 * code can check for MD_DR_OK to determine
648 * status of diskset.
650 if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
651 MD_DR_OK, ep) == -1)
652 goto rollback;
655 RB_TEST(8, "adddrives", ep)
656 nd = nd->nd_next;
658 } else {
659 for (i = 0; i < MD_MAXSIDES; i++) {
660 /* Skip empty slots */
661 if (sd->sd_nodes[i][0] == '\0')
662 continue;
664 if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd, MD_DR_OK,
665 ep) == -1)
666 goto rollback;
668 RB_TEST(8, "adddrives", ep)
672 RB_TEST(9, "adddrives", ep)
674 out:
676 * Notify rpc.mdcommd on all nodes of a nodelist change.
677 * Send reinit command to mdcommd which forces it to get
678 * fresh set description.
680 if (suspendall_flag) {
681 /* Send reinit */
682 nd = sd->sd_nodelist;
683 /* All nodes are guaranteed to be ALIVE */
684 while (nd) {
685 /* Class is ignored for REINIT */
686 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
687 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
688 if (rval == 0)
689 (void) mdstealerror(ep, &xep);
690 rval = -1;
691 mde_perror(ep, dgettext(TEXT_DOMAIN,
692 "Unable to reinit rpc.mdcommd.\n"));
694 nd = nd->nd_next;
698 * Unlock diskset by resuming messages across the diskset.
699 * Just resume all classes so that resume is the same whether
700 * just one class was locked or all classes were locked.
702 if ((suspend1_flag) || (suspendall_flag)) {
703 nd = sd->sd_nodelist;
704 /* All nodes are guaranteed to be ALIVE */
705 while (nd) {
706 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
707 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
708 if (rval == 0)
709 (void) mdstealerror(ep, &xep);
710 rval = -1;
711 mde_perror(ep, dgettext(TEXT_DOMAIN,
712 "Unable to resume rpc.mdcommd.\n"));
714 nd = nd->nd_next;
716 meta_ping_mnset(sp->setno);
719 if (lock_flag) {
720 cl_sk = cl_get_setkey(sp->setno, sp->setname);
721 if (MD_MNSET_DESC(sd)) {
722 nd = sd->sd_nodelist;
723 /* All nodes are guaranteed to be ALIVE */
724 while (nd) {
725 if (clnt_unlock_set(nd->nd_nodename,
726 cl_sk, &xep)) {
727 if (rval == 0)
728 (void) mdstealerror(ep, &xep);
729 rval = -1;
731 nd = nd->nd_next;
733 } else {
734 for (i = 0; i < MD_MAXSIDES; i++) {
735 /* Skip empty slots */
736 if (sd->sd_nodes[i][0] == '\0')
737 continue;
739 if (clnt_unlock_set(sd->sd_nodes[i],
740 cl_sk, &xep)) {
741 if (rval == 0)
742 (void) mdstealerror(ep, &xep);
743 rval = -1;
747 cl_set_setkey(NULL);
750 metafreedrivedesc(&dd);
752 if (flush_set_onerr) {
753 metaflushsetname(sp);
754 if (!(MD_MNSET_DESC(sd))) {
755 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
759 if (MD_MNSET_DESC(sd)) {
760 /* release signals back to what they were on entry */
761 if (procsigs(FALSE, &oldsigs, &xep) < 0)
762 mdclrerror(&xep);
765 return (rval);
767 rollback:
768 /* all signals already blocked for MN disket */
769 if (!(MD_MNSET_DESC(sd))) {
770 /* Make sure we are blocking all signals */
771 if (procsigs(TRUE, &oldsigs, &xep) < 0)
772 mdclrerror(&xep);
775 rval = -1;
777 max_genid = sd->sd_genid;
779 /* level 3 */
780 if (rb_level > 2) {
782 * Since the add drive operation is failing, need
783 * to reset config back to the way it was
784 * before the add drive opration.
785 * If a MN diskset and this is the first drive being added,
786 * then reset master on all ALIVE nodes (which is all nodes)
787 * since the master would have not been set previously.
788 * Don't reset master on this node, since this
789 * is done later.
790 * This is ok to fail since next node to add first
791 * disk to diskset will also set the master on all nodes.
793 * Also, if this is the first drive being added,
794 * need to have each node withdraw itself from the set.
796 if ((MD_MNSET_DESC(sd)) && (curdd == NULL)) {
797 nd = sd->sd_nodelist;
798 /* All nodes are guaranteed to be ALIVE */
799 while (nd) {
801 * Be careful with ordering in case of
802 * panic between the steps and the
803 * effect on recovery during reconfig.
805 if (clnt_withdrawset(nd->nd_nodename, sp, &xep))
806 mdclrerror(&xep);
808 /* Sets withdraw flag on all nodes in list */
809 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
810 sd->sd_nodelist, MD_NR_WITHDRAW,
811 NULL, &xep)) {
812 mdclrerror(&xep);
815 /* Skip this node */
816 if (nd->nd_nodeid ==
817 sd->sd_mn_mynode->nd_nodeid) {
818 nd = nd->nd_next;
819 continue;
821 /* Reset master on all of the other nodes. */
822 if (clnt_mnsetmaster(nd->nd_nodename, sp,
823 "", MD_MN_INVALID_NID, &xep))
824 mdclrerror(&xep);
825 nd = nd->nd_next;
831 * Send resume command to mdcommd. Don't send reinit command
832 * since nodelist should not have changed.
833 * If suspendall_flag is set, then user would have been adding
834 * first drives to set. Since this failed, there is certainly
835 * no reinit message to send to rpc.commd since no nodes will
836 * be joined to set at the end of this metaset command.
838 if (suspendall_flag) {
839 /* Send resume */
840 nd = sd->sd_nodelist;
841 /* All nodes are guaranteed to be ALIVE */
842 while (nd) {
844 * Resume all classes but class 1 so that lock is held
845 * against meta* commands.
846 * To later resume class1, must issue a class0 resume.
848 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
849 sp, MD_MSG_CLASS0,
850 MD_MSCF_DONT_RESUME_CLASS1, &xep)) {
851 mde_perror(&xep, dgettext(TEXT_DOMAIN,
852 "Unable to resume rpc.mdcommd.\n"));
853 mdclrerror(&xep);
855 nd = nd->nd_next;
857 meta_ping_mnset(sp->setno);
860 /* level 3 */
861 if (rb_level > 2) {
862 mdnamelist_t *nlp;
863 mdname_t *np;
865 for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
866 uint_t rep_slice;
868 if ((meta_replicaslice(ddp->dd_dnp,
869 &rep_slice, &xep) != 0) ||
870 ((np = metaslicename(ddp->dd_dnp, rep_slice,
871 &xep)) == NULL)) {
872 mdclrerror(&xep);
873 continue;
875 nlp = NULL;
876 (void) metanamelist_append(&nlp, np);
878 if (meta_db_detach(sp, nlp,
879 (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL, &xep))
880 mdclrerror(&xep);
882 metafreenamelist(nlp);
885 /* Re-balance */
886 if (meta_db_balance(sp, NULL, curdd, 0, &xep) == -1)
887 mdclrerror(&xep);
889 /* Only if we are adding the first drive */
890 /* Handled MN diskset above. */
891 if ((curdd == NULL) && !(MD_MNSET_DESC(sd))) {
892 if (clnt_stimeout(mynode(), sp, &defmhiargs,
893 &xep) == -1)
894 mdclrerror(&xep);
896 /* This is needed because of a corner case */
897 if (halt_set(sp, &xep))
898 mdclrerror(&xep);
900 max_genid++;
903 /* level 2 */
904 if (rb_level > 1) {
905 if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
906 if (rel_own_bydd(sp, dd, TRUE, &xep))
907 mdclrerror(&xep);
911 /* level 1 */
912 if (rb_level > 0) {
913 if (MD_MNSET_DESC(sd)) {
914 nd = sd->sd_nodelist;
915 /* All nodes are guaranteed to be ALIVE */
916 while (nd) {
917 if (clnt_deldrvs(nd->nd_nodename, sp, dd,
918 &xep) == -1)
919 mdclrerror(&xep);
920 nd = nd->nd_next;
922 } else {
923 for (i = 0; i < MD_MAXSIDES; i++) {
924 /* Skip empty slots */
925 if (sd->sd_nodes[i][0] == '\0')
926 continue;
928 if (clnt_deldrvs(sd->sd_nodes[i], sp, dd,
929 &xep) == -1)
930 mdclrerror(&xep);
933 max_genid += 2;
934 resync_genid(sp, sd, max_genid, 0, NULL);
937 if ((suspend1_flag) || (suspendall_flag)) {
938 /* Send resume */
939 nd = sd->sd_nodelist;
940 /* All nodes are guaranteed to be ALIVE */
941 while (nd) {
943 * Just resume all classes so that resume is the
944 * same whether just one class was locked or all
945 * classes were locked.
947 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
948 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
949 mdclrerror(&xep);
951 nd = nd->nd_next;
953 meta_ping_mnset(sp->setno);
956 /* level 0 */
957 cl_sk = cl_get_setkey(sp->setno, sp->setname);
958 /* Don't test lock flag since guaranteed to be set if in rollback */
959 if (MD_MNSET_DESC(sd)) {
961 * Since the add drive operation is failing, need
962 * to reset config back to the way it was
963 * before the add drive opration.
964 * If a MN diskset and this is the first drive being
965 * added, then reset master on this node since
966 * the master would have not been set previously.
967 * This is ok to fail since next node to add first
968 * disk to diskset will also set the master on all nodes.
970 if (curdd == NULL) {
971 /* Reset master on mynode */
972 if (clnt_mnsetmaster(mynode(), sp, "",
973 MD_MN_INVALID_NID, &xep))
974 mdclrerror(&xep);
976 nd = sd->sd_nodelist;
977 /* All nodes are guaranteed to be ALIVE */
978 while (nd) {
979 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
980 mdclrerror(&xep);
981 nd = nd->nd_next;
983 } else {
984 for (i = 0; i < MD_MAXSIDES; i++) {
985 /* Skip empty slots */
986 if (sd->sd_nodes[i][0] == '\0')
987 continue;
989 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
990 mdclrerror(&xep);
993 cl_set_setkey(NULL);
995 /* release signals back to what they were on entry */
996 if (procsigs(FALSE, &oldsigs, &xep) < 0)
997 mdclrerror(&xep);
999 metafreedrivedesc(&dd);
1001 if (flush_set_onerr) {
1002 metaflushsetname(sp);
1003 if (!(MD_MNSET_DESC(sd))) {
1004 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1008 return (rval);
1012 * Add drives routine used during import of a diskset.
1015 meta_imp_set_adddrives(
1016 mdsetname_t *sp,
1017 mddrivenamelist_t *dnlp,
1018 md_im_set_desc_t *misp,
1019 md_error_t *ep
1022 md_set_desc *sd;
1023 mddrivenamelist_t *p;
1024 md_drive_desc *dd = NULL, *ddp;
1025 int flush_set_onerr = 0;
1026 md_timeval32_t now;
1027 ulong_t genid;
1028 mhd_mhiargs_t mhiargs;
1029 md_im_replica_info_t *mirp;
1030 md_im_drive_info_t *midp;
1031 int rval = 0;
1032 sigset_t oldsigs;
1033 ulong_t max_genid = 0;
1034 int rb_level = 0;
1035 md_error_t xep = mdnullerror;
1037 if ((sd = metaget_setdesc(sp, ep)) == NULL)
1038 return (-1);
1040 for (p = dnlp; p != NULL; p = p->next) {
1041 int imp_flag = 0;
1044 * If we have a partial diskset, meta_make_sidenmlist will
1045 * need information from midp to complete making the
1046 * side name structure.
1048 if (misp->mis_partial) {
1049 imp_flag = MDDB_C_IMPORT;
1050 for (midp = misp->mis_drives; midp != NULL;
1051 midp = midp->mid_next) {
1052 if (midp->mid_dnp == p->drivenamep)
1053 break;
1055 if (midp == NULL) {
1056 (void) mddserror(ep, MDE_DS_SETNOTIMP,
1057 MD_SET_BAD, mynode(), NULL, sp->setname);
1058 rval = -1;
1059 goto out;
1063 * Create the names for the drives we are adding per side.
1065 if (meta_make_sidenmlist(sp, p->drivenamep, imp_flag,
1066 midp, ep) == -1) {
1067 rval = -1;
1068 goto out;
1073 * Get the list of drives descriptors that we are adding.
1075 dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_ADD, ep);
1077 if (! mdisok(ep)) {
1078 rval = -1;
1079 goto out;
1083 * Get the set timeout information.
1085 (void) memset(&mhiargs, '\0', sizeof (mhiargs));
1086 if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
1087 rval = -1;
1088 goto out;
1092 * Get timestamp and generation id for new records
1094 now = sd->sd_ctime;
1095 genid = sd->sd_genid;
1097 /* At this point, in case of error, set should be flushed. */
1098 flush_set_onerr = 1;
1100 rb_level = 1; /* level 1 */
1102 for (midp = misp->mis_drives; midp != NULL; midp = midp->mid_next) {
1103 for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
1104 if (ddp->dd_dnp == midp->mid_dnp) {
1105 /* same disk */
1106 ddp->dd_dnp->devid =
1107 devid_str_encode(midp->mid_devid,
1108 midp->mid_minor_name);
1110 ddp->dd_dbcnt = 0;
1111 mirp = midp->mid_replicas;
1112 if (mirp) {
1113 ddp->dd_dbsize = mirp->mir_length;
1114 for (; mirp != NULL;
1115 mirp = mirp->mir_next) {
1116 ddp->dd_dbcnt++;
1119 if ((midp->mid_available &
1120 MD_IM_DISK_NOT_AVAILABLE) &&
1121 (misp->mis_flags & MD_IM_SET_REPLICATED)) {
1122 ddp->dd_flags = MD_DR_UNRSLV_REPLICATED;
1129 * Add the drive records for the drives that we are adding to
1130 * each host in the set. Marks the drive records as MD_DR_ADD.
1131 * May also mark a drive record as MD_DR_UNRSLV_REPLICATED if
1132 * this flag was set in the dd_flags for that drive.
1134 if (clnt_imp_adddrvs(mynode(), sp, dd, now, genid, ep) == -1)
1135 goto rollback;
1137 rb_level = 2; /* level 2 */
1140 * Take ownership of the added drives.
1142 if (tk_own_bydd(sp, dd, &mhiargs, TRUE, ep))
1143 goto rollback;
1145 out:
1146 metafreedrivedesc(&dd);
1148 if (flush_set_onerr) {
1149 metaflushsetname(sp);
1152 return (rval);
1154 rollback:
1155 /* Make sure we are blocking all signals */
1156 if (procsigs(TRUE, &oldsigs, &xep) < 0)
1157 mdclrerror(&xep);
1159 rval = -1;
1161 max_genid = sd->sd_genid;
1163 /* level 2 */
1164 if (rb_level > 1) {
1165 if (!MD_ATSET_DESC(sd)) {
1166 if (rel_own_bydd(sp, dd, TRUE, &xep)) {
1167 mdclrerror(&xep);
1172 /* level 1 */
1173 if (rb_level > 0) {
1174 if (clnt_deldrvs(mynode(), sp, dd, &xep) == -1) {
1175 mdclrerror(&xep);
1177 max_genid += 2;
1178 resync_genid(sp, sd, max_genid, 0, NULL);
1181 /* level 0 */
1183 /* release signals back to what they were on entry */
1184 if (procsigs(FALSE, &oldsigs, &xep) < 0)
1185 mdclrerror(&xep);
1187 metafreedrivedesc(&dd);
1189 if (flush_set_onerr) {
1190 metaflushsetname(sp);
1191 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1194 return (rval);
1198 meta_set_deletedrives(
1199 mdsetname_t *sp,
1200 mddrivenamelist_t *dnlp,
1201 int forceflg,
1202 md_error_t *ep
1205 md_set_desc *sd;
1206 md_drive_desc *ddp, *dd = NULL, *curdd = NULL;
1207 md_replicalist_t *rlp = NULL, *rl;
1208 mddrivenamelist_t *p;
1209 int deldrvcnt = 0;
1210 int rval = 0;
1211 mhd_mhiargs_t mhiargs;
1212 int i;
1213 sigset_t oldsigs;
1214 md_setkey_t *cl_sk;
1215 ulong_t max_genid = 0;
1216 int rb_level = 0;
1217 md_error_t xep = mdnullerror;
1218 md_mnnode_desc *nd;
1219 int has_set;
1220 int current_drv_cnt = 0;
1221 int suspendall_flag = 0, suspendall_flag_rb = 0;
1222 int suspend1_flag = 0;
1223 int lock_flag = 0;
1224 bool_t stale_bool = FALSE;
1225 int flush_set_onerr = 0;
1226 mdnamelist_t *nlp;
1227 mdname_t *np;
1229 if ((sd = metaget_setdesc(sp, ep)) == NULL)
1230 return (-1);
1232 /* Make sure we own the set */
1233 if (meta_check_ownership(sp, ep) != 0)
1234 return (-1);
1236 if (drvsuniq(sp, dnlp, ep) == -1)
1237 return (-1);
1240 * Check and see if all the nodes have the set.
1242 * The drive and node records are stored in the local mddbs of each
1243 * node in the diskset. Each node's rpc.metad daemon reads in the set,
1244 * drive and node records from that node's local mddb and caches them
1245 * internally. Any process needing diskset information contacts its
1246 * local rpc.metad to get this information. Since each node in the
1247 * diskset is independently reading the set information from its local
1248 * mddb, the set, drive and node records in the local mddbs must stay
1249 * in-sync, so that all nodes have a consistent view of the diskset.
1251 * For a multinode diskset, explicitly verify that all nodes in the
1252 * diskset are ALIVE (i.e. are in the API membership list). Otherwise,
1253 * fail this operation since all nodes must be ALIVE in order to delete
1254 * a drive record from their local mddb. If a panic of this node
1255 * leaves the local mddbs set, node and drive records out-of-sync, the
1256 * reconfig cycle will fix the local mddbs and force them back into
1257 * synchronization.
1259 if (MD_MNSET_DESC(sd)) {
1260 nd = sd->sd_nodelist;
1261 while (nd) {
1262 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1263 (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
1264 sp->setno,
1265 nd->nd_nodename, NULL, sp->setname);
1266 return (-1);
1268 nd = nd->nd_next;
1271 /* Make sure we are blocking all signals */
1272 if (procsigs(TRUE, &oldsigs, &xep) < 0)
1273 mdclrerror(&xep);
1276 * Lock the set on current set members.
1277 * Set locking done much earlier for MN diskset than for
1278 * traditional diskset since lock_set and SUSPEND are used
1279 * to protect against other meta* commands running on the
1280 * other nodes.
1282 nd = sd->sd_nodelist;
1283 /* All nodes are guaranteed to be ALIVE */
1284 while (nd) {
1285 if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
1286 rval = -1;
1287 goto out;
1289 lock_flag = 1;
1290 nd = nd->nd_next;
1293 * Lock out other meta* commands by suspending
1294 * class 1 messages across the diskset.
1296 nd = sd->sd_nodelist;
1297 /* All nodes are guaranteed to be ALIVE */
1298 while (nd) {
1299 if (clnt_mdcommdctl(nd->nd_nodename,
1300 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
1301 MD_MSCF_NO_FLAGS, ep)) {
1302 rval = -1;
1303 goto out;
1305 suspend1_flag = 1;
1306 nd = nd->nd_next;
1309 nd = sd->sd_nodelist;
1310 /* All nodes are guaranteed to be ALIVE */
1311 while (nd) {
1312 if (strcmp(nd->nd_nodename, mynode()) == 0) {
1313 nd = nd->nd_next;
1314 continue;
1317 has_set = nodehasset(sp, nd->nd_nodename,
1318 NHS_NSTG_EQ, ep);
1319 if (has_set < 0) {
1320 rval = -1;
1321 goto out;
1324 if (! has_set) {
1325 (void) mddserror(ep, MDE_DS_NODENOSET,
1326 sp->setno, nd->nd_nodename,
1327 NULL, sp->setname);
1328 rval = -1;
1329 goto out;
1331 nd = nd->nd_next;
1333 } else {
1334 for (i = 0; i < MD_MAXSIDES; i++) {
1335 /* Skip empty slots */
1336 if (sd->sd_nodes[i][0] == '\0')
1337 continue;
1339 if (strcmp(sd->sd_nodes[i], mynode()) == 0)
1340 continue;
1342 has_set = nodehasset(sp, sd->sd_nodes[i], NHS_NSTG_EQ,
1343 ep);
1344 if (has_set < 0) {
1346 * Can directly return since !MN diskset;
1347 * nothing to unlock.
1349 return (-1);
1352 if (! has_set) {
1354 * Can directly return since !MN diskset;
1355 * nothing to unlock.
1357 return (mddserror(ep, MDE_DS_NODENOSET,
1358 sp->setno, sd->sd_nodes[i], NULL,
1359 sp->setname));
1364 for (p = dnlp; p != NULL; p = p->next) {
1365 int is_it;
1366 mddrivename_t *dnp;
1368 dnp = p->drivenamep;
1370 if ((is_it = meta_is_drive_in_thisset(sp, dnp, FALSE, ep))
1371 == -1) {
1372 rval = -1;
1373 goto out;
1376 if (! is_it) {
1377 (void) mddserror(ep, MDE_DS_DRIVENOTINSET, sp->setno,
1378 NULL, dnp->cname, sp->setname);
1379 rval = -1;
1380 goto out;
1383 if ((meta_check_drive_inuse(sp, dnp, FALSE, ep)) == -1) {
1384 rval = -1;
1385 goto out;
1388 deldrvcnt++;
1390 current_drv_cnt = deldrvcnt;
1393 * Get drive descriptors for the drives that are currently in the set.
1395 curdd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
1396 if (! mdisok(ep)) {
1397 rval = -1;
1398 goto out;
1402 * Decrement the delete drive count for each drive currently in the
1403 * set.
1405 for (ddp = curdd; ddp != NULL; ddp = ddp->dd_next)
1406 deldrvcnt--;
1409 * If the count of drives we are deleting is equal to the drives in the
1410 * set, and we haven't specified forceflg, return an error
1412 if (deldrvcnt == 0 && forceflg == FALSE) {
1413 (void) mderror(ep, MDE_FORCE_DEL_ALL_DRV, NULL);
1414 rval = -1;
1415 goto out;
1419 * Get the list of drive descriptors that we are deleting.
1421 dd = metaget_drivedesc_fromdrivelist(sp, dnlp, MD_DR_DEL, ep);
1422 if (! mdisok(ep)) {
1423 rval = -1;
1424 goto out;
1428 * Get the set timeout information in case we have to roll back.
1430 (void) memset(&mhiargs, '\0', sizeof (mhiargs));
1431 if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) == -1) {
1432 rval = -1;
1433 goto out;
1436 /* At this point, in case of error, set should be flushed. */
1437 flush_set_onerr = 1;
1439 /* END CHECK CODE */
1441 /* Lock the set on current set members */
1442 if (!(MD_MNSET_DESC(sd))) {
1443 md_rb_sig_handling_on();
1444 for (i = 0; i < MD_MAXSIDES; i++) {
1445 /* Skip empty slots */
1446 if (sd->sd_nodes[i][0] == '\0')
1447 continue;
1449 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1450 rval = -1;
1451 goto out;
1453 lock_flag = 1;
1457 if ((deldrvcnt == 0) && (MD_MNSET_DESC(sd))) {
1458 mddb_config_t c;
1460 * Is current set STALE?
1462 (void) memset(&c, 0, sizeof (c));
1463 c.c_id = 0;
1464 c.c_setno = sp->setno;
1465 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
1466 (void) mdstealerror(ep, &c.c_mde);
1467 rval = -1;
1468 goto out;
1470 if (c.c_flags & MDDB_C_STALE) {
1471 stale_bool = TRUE;
1475 RB_TEST(1, "deletedrives", ep)
1477 RB_PREEMPT;
1478 rb_level = 1; /* level 1 */
1480 RB_TEST(2, "deletedrives", ep)
1483 * Mark the drives MD_DR_DEL
1485 if (MD_MNSET_DESC(sd)) {
1486 nd = sd->sd_nodelist;
1487 /* All nodes are guaranteed to be ALIVE */
1488 while (nd) {
1489 if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
1490 MD_DR_DEL, ep) == -1)
1491 goto rollback;
1493 RB_TEST(3, "deletedrives", ep)
1494 nd = nd->nd_next;
1496 } else {
1497 for (i = 0; i < MD_MAXSIDES; i++) {
1498 /* Skip empty slots */
1499 if (sd->sd_nodes[i][0] == '\0')
1500 continue;
1502 if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd,
1503 MD_DR_DEL, ep) == -1)
1504 goto rollback;
1506 RB_TEST(3, "deletedrives", ep)
1510 RB_TEST(4, "deletedrives", ep)
1512 RB_PREEMPT;
1513 rb_level = 2; /* level 2 */
1515 RB_TEST(5, "deletedrives", ep)
1518 * Balance the DB's according to the list of existing drives and the
1519 * list of deleted drives.
1521 if (meta_db_balance(sp, dd, curdd, 0, ep) == -1)
1522 goto rollback;
1525 * If the drive(s) to be deleted cannot be accessed,
1526 * they haven't really been deleted yet. Check and delete now
1527 * if need be.
1529 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) >= 0) {
1530 nlp = NULL;
1531 for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
1532 char *delete_name;
1534 delete_name = ddp->dd_dnp->cname;
1536 for (rl = rlp; rl != NULL; rl = rl->rl_next) {
1537 char *cur_name;
1539 cur_name =
1540 rl->rl_repp->r_namep->drivenamep->cname;
1542 if (strcmp(delete_name, cur_name) == 0) {
1543 /* put it on the delete list */
1544 np = rl->rl_repp->r_namep;
1545 (void) metanamelist_append(&nlp, np);
1551 if (nlp != NULL) {
1552 if (meta_db_detach(sp, nlp,
1553 (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL,
1554 ep) == -1) {
1555 metafreenamelist(nlp);
1556 goto rollback;
1558 metafreenamelist(nlp);
1562 RB_TEST(6, "deletedrives", ep)
1564 RB_PREEMPT;
1565 rb_level = 3; /* level 3 */
1567 RB_TEST(7, "deletedrives", ep)
1570 * Cannot suspend set until after meta_db_balance since
1571 * meta_db_balance uses META_DB_ATTACH/DETACH messages.
1573 if ((deldrvcnt == 0) && (MD_MNSET_DESC(sd))) {
1575 * Notify rpc.mdcommd on all nodes of a nodelist change.
1576 * Start by suspending rpc.mdcommd (which drains it of all
1577 * messages), then change the nodelist followed by a reinit
1578 * and resume.
1580 nd = sd->sd_nodelist;
1581 /* All nodes are guaranteed to be ALIVE */
1582 while (nd) {
1583 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
1584 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
1585 rval = -1;
1586 goto out;
1588 suspendall_flag = 1;
1589 nd = nd->nd_next;
1594 * Remove the drive records for the drives that were deleted from
1595 * each host in the set. This removes the record and dr_flags.
1597 if (MD_MNSET_DESC(sd)) {
1598 nd = sd->sd_nodelist;
1599 /* All nodes are guaranteed to be ALIVE */
1600 while (nd) {
1601 if (clnt_deldrvs(nd->nd_nodename, sp, dd, ep) == -1)
1602 goto rollback;
1604 RB_TEST(8, "deletedrives", ep)
1605 nd = nd->nd_next;
1607 } else {
1608 for (i = 0; i < MD_MAXSIDES; i++) {
1609 /* Skip empty slots */
1610 if (sd->sd_nodes[i][0] == '\0')
1611 continue;
1613 if (clnt_deldrvs(sd->sd_nodes[i], sp, dd, ep) == -1)
1614 goto rollback;
1616 RB_TEST(8, "deletedrives", ep)
1620 RB_TEST(9, "deletedrives", ep)
1622 RB_PREEMPT;
1623 rb_level = 4; /* level 4 */
1625 RB_TEST(10, "deletedrives", ep)
1627 if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
1628 if (rel_own_bydd(sp, dd, TRUE, ep))
1629 goto rollback;
1632 /* If we deleted all the drives, then we need to halt the set. */
1633 if (deldrvcnt == 0) {
1634 RB_TEST(11, "deletedrives", ep)
1636 RB_PREEMPT;
1637 rb_level = 5; /* level 5 */
1639 RB_TEST(12, "deletedrives", ep)
1641 if (clnt_stimeout(mynode(), sp, &defmhiargs, ep) == -1)
1642 goto rollback;
1644 RB_TEST(13, "deletedrives", ep)
1646 RB_PREEMPT;
1647 rb_level = 6; /* level 6 */
1649 RB_TEST(14, "deletedrives", ep)
1651 /* Halt MN diskset on all nodes by having node withdraw */
1652 if (MD_MNSET_DESC(sd)) {
1653 nd = sd->sd_nodelist;
1654 /* All nodes are guaranteed to be ALIVE */
1655 while (nd) {
1656 /* Only withdraw nodes that are joined */
1657 if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
1658 nd = nd->nd_next;
1659 continue;
1662 * Going to set locally cached node flags to
1663 * rollback join so in case of error, the
1664 * rollback code knows which nodes to re-join.
1666 nd->nd_flags |= MD_MN_NODE_RB_JOIN;
1669 * Be careful in ordering of following steps
1670 * so that recovery from a panic between
1671 * the steps is viable.
1672 * Only reset master info in rpc.metad -
1673 * don't reset local cached information
1674 * which will be used to set master information
1675 * back in case of failure (rollback).
1677 if (clnt_withdrawset(nd->nd_nodename, sp, ep))
1678 goto rollback;
1679 /* Sets withdraw flag on all nodes in list */
1680 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
1681 sd->sd_nodelist, MD_NR_WITHDRAW,
1682 NULL, ep)) {
1683 goto rollback;
1685 if (clnt_mnsetmaster(nd->nd_nodename, sp,
1686 "", MD_MN_INVALID_NID, ep)) {
1687 goto rollback;
1689 nd = nd->nd_next;
1691 } else {
1692 if (halt_set(sp, ep))
1693 goto rollback;
1696 RB_TEST(15, "deletedrives", ep)
1699 RB_TEST(16, "deletedrives", ep)
1701 out:
1703 * Notify rpc.mdcommd on all nodes of a nodelist change.
1704 * Send reinit command to mdcommd which forces it to get
1705 * fresh set description.
1707 if (suspendall_flag) {
1708 /* Send reinit */
1709 nd = sd->sd_nodelist;
1710 /* All nodes are guaranteed to be ALIVE */
1711 while (nd) {
1712 /* Class is ignored for REINIT */
1713 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
1714 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
1715 if (rval == 0)
1716 (void) mdstealerror(ep, &xep);
1717 rval = -1;
1718 mde_perror(ep, dgettext(TEXT_DOMAIN,
1719 "Unable to reinit rpc.mdcommd.\n"));
1721 nd = nd->nd_next;
1726 * Just resume all classes so that resume is the same whether
1727 * just one class was locked or all classes were locked.
1729 if ((suspend1_flag) || (suspendall_flag)) {
1730 /* Send resume */
1731 nd = sd->sd_nodelist;
1732 /* All nodes are guaranteed to be ALIVE */
1733 while (nd) {
1734 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
1735 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
1736 if (rval == 0)
1737 (void) mdstealerror(ep, &xep);
1738 rval = -1;
1739 mde_perror(ep, dgettext(TEXT_DOMAIN,
1740 "Unable to resume rpc.mdcommd.\n"));
1742 nd = nd->nd_next;
1744 meta_ping_mnset(sp->setno);
1746 if (lock_flag) {
1747 cl_sk = cl_get_setkey(sp->setno, sp->setname);
1748 if (MD_MNSET_DESC(sd)) {
1749 nd = sd->sd_nodelist;
1750 /* All nodes are guaranteed to be ALIVE */
1751 while (nd) {
1752 if (clnt_unlock_set(nd->nd_nodename,
1753 cl_sk, &xep)) {
1754 if (rval == 0)
1755 (void) mdstealerror(ep, &xep);
1756 rval = -1;
1758 nd = nd->nd_next;
1760 } else {
1761 for (i = 0; i < MD_MAXSIDES; i++) {
1762 /* Skip empty slots */
1763 if (sd->sd_nodes[i][0] == '\0')
1764 continue;
1766 if (clnt_unlock_set(sd->sd_nodes[i],
1767 cl_sk, &xep)) {
1768 if (rval == 0)
1769 (void) mdstealerror(ep, &xep);
1770 rval = -1;
1774 cl_set_setkey(NULL);
1777 metafreedrivedesc(&dd);
1779 if (flush_set_onerr) {
1780 metaflushsetname(sp);
1781 if (!(MD_MNSET_DESC(sd))) {
1782 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1786 if (MD_MNSET_DESC(sd)) {
1787 /* release signals back to what they were on entry */
1788 if (procsigs(FALSE, &oldsigs, &xep) < 0)
1789 mdclrerror(&xep);
1792 return (rval);
1794 rollback:
1795 /* all signals already blocked for MN diskset */
1796 if (!(MD_MNSET_DESC(sd))) {
1797 /* Make sure we are blocking all signals */
1798 if (procsigs(TRUE, &oldsigs, &xep) < 0)
1799 mdclrerror(&xep);
1802 rval = -1;
1804 max_genid = sd->sd_genid;
1806 /* Set the master on all nodes first thing */
1807 if (rb_level > 5) {
1808 if (MD_MNSET_DESC(sd)) {
1809 nd = sd->sd_nodelist;
1810 /* All nodes are guaranteed to be ALIVE */
1811 while (nd) {
1812 if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
1813 continue;
1816 * Set master on all re-joining nodes to be
1817 * my cached view of master.
1819 if (clnt_mnsetmaster(nd->nd_nodename, sp,
1820 sd->sd_mn_master_nodenm,
1821 sd->sd_mn_master_nodeid, &xep)) {
1822 mdclrerror(&xep);
1828 /* level 3 */
1829 if (rb_level > 2) {
1830 md_set_record *sr;
1831 md_mnset_record *mnsr;
1832 md_drive_record *dr;
1833 int sr_drive_cnt;
1836 * See if we have to re-add the drives specified.
1838 if (MD_MNSET_DESC(sd)) {
1839 nd = sd->sd_nodelist;
1840 /* All nodes are guaranteed to be ALIVE */
1841 while (nd) {
1843 * Must get current set record from each
1844 * node to see what else must be done
1845 * to recover.
1846 * Record should be for a multi-node diskset.
1848 if (clnt_mngetset(nd->nd_nodename, sp->setname,
1849 MD_SET_BAD, &mnsr, &xep) == -1) {
1850 mdclrerror(&xep);
1851 nd = nd->nd_next;
1852 continue;
1856 * If all drives are already there, skip
1857 * to next node.
1859 sr_drive_cnt = 0;
1860 dr = mnsr->sr_drivechain;
1861 while (dr) {
1862 sr_drive_cnt++;
1863 dr = dr->dr_next;
1865 if (sr_drive_cnt == current_drv_cnt) {
1866 free_sr((md_set_record *)mnsr);
1867 nd = nd->nd_next;
1868 continue;
1871 /* Readd all drives */
1872 if (clnt_adddrvs(nd->nd_nodename, sp, dd,
1873 mnsr->sr_ctime, mnsr->sr_genid, &xep) == -1)
1874 mdclrerror(&xep);
1876 free_sr((struct md_set_record *)mnsr);
1877 nd = nd->nd_next;
1879 } else {
1880 for (i = 0; i < MD_MAXSIDES; i++) {
1881 /* Skip empty slots */
1882 if (sd->sd_nodes[i][0] == '\0')
1883 continue;
1885 /* Record should be for a non-multi-node set */
1886 if (clnt_getset(sd->sd_nodes[i], sp->setname,
1887 MD_SET_BAD, &sr, &xep) == -1) {
1888 mdclrerror(&xep);
1889 continue;
1893 * Set record structure was allocated from RPC
1894 * routine getset so this structure is only of
1895 * size md_set_record even if the MN flag is
1896 * set. So, clear the flag so that the free
1897 * code doesn't attempt to free a structure
1898 * the size of md_mnset_record.
1900 if (MD_MNSET_REC(sr)) {
1901 sr->sr_flags &= ~MD_SR_MN;
1902 free_sr(sr);
1903 continue;
1906 /* Drive already added, skip to next node */
1907 if (sr->sr_drivechain != NULL) {
1908 free_sr(sr);
1909 continue;
1912 if (clnt_adddrvs(sd->sd_nodes[i], sp, dd,
1913 sr->sr_ctime, sr->sr_genid, &xep) == -1)
1914 mdclrerror(&xep);
1916 free_sr(sr);
1919 max_genid += 2;
1923 * Notify rpc.mdcommd on all nodes of a nodelist change.
1924 * At this point in time, don't know which nodes are joined
1925 * to the set. So, send a reinit command to mdcommd
1926 * which forces it to get fresh set description. Then send resume.
1928 * Later, this code will use rpc.mdcommd messages to reattach disks
1929 * and then rpc.mdcommd may be suspended again, rest of the nodes
1930 * joined, rpc.mdcommd reinited and then resumed.
1932 if (suspendall_flag) {
1933 /* Send reinit */
1934 nd = sd->sd_nodelist;
1935 /* All nodes are guaranteed to be ALIVE */
1936 while (nd) {
1937 /* Class is ignored for REINIT */
1938 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
1939 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
1940 mde_perror(&xep, dgettext(TEXT_DOMAIN,
1941 "Unable to reinit rpc.mdcommd.\n"));
1942 mdclrerror(&xep);
1944 nd = nd->nd_next;
1947 /* Send resume */
1948 nd = sd->sd_nodelist;
1949 /* All nodes are guaranteed to be ALIVE */
1950 while (nd) {
1952 * Resume all classes but class 1 so that lock is held
1953 * against meta* commands.
1954 * To later resume class1, must issue a class0 resume.
1956 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
1957 sp, MD_MSG_CLASS0,
1958 MD_MSCF_DONT_RESUME_CLASS1, &xep)) {
1959 mde_perror(&xep, dgettext(TEXT_DOMAIN,
1960 "Unable to resume rpc.mdcommd.\n"));
1961 mdclrerror(&xep);
1963 nd = nd->nd_next;
1965 meta_ping_mnset(sp->setno);
1968 /* level 2 */
1969 if (rb_level > 1) {
1970 mdnamelist_t *nlp;
1971 mdname_t *np;
1973 for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
1974 uint_t rep_slice;
1976 if ((meta_replicaslice(ddp->dd_dnp,
1977 &rep_slice, &xep) != 0) ||
1978 ((np = metaslicename(ddp->dd_dnp, rep_slice,
1979 &xep)) == NULL)) {
1980 mdclrerror(&xep);
1981 continue;
1983 nlp = NULL;
1984 (void) metanamelist_append(&nlp, np);
1986 if (meta_db_attach(sp, nlp,
1987 (MDCHK_DRVINSET | MDCHK_SET_LOCKED),
1988 &sd->sd_ctime, ddp->dd_dbcnt, ddp->dd_dbsize,
1989 NULL, &xep) == -1)
1990 mdclrerror(&xep);
1992 metafreenamelist(nlp);
1994 /* Re-balance */
1995 if (meta_db_balance(sp, NULL, curdd, 0, &xep) == -1)
1996 mdclrerror(&xep);
1999 /* level 4 */
2000 if (rb_level > 3) {
2001 if (!(MD_MNSET_DESC(sd)) && !MD_ATSET_DESC(sd)) {
2002 if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep))
2003 mdclrerror(&xep);
2007 /* level 5 */
2008 if (rb_level > 4) {
2009 if (clnt_stimeout(mynode(), sp, &mhiargs, &xep) == -1)
2010 mdclrerror(&xep);
2014 * If at least one node needs to be rejoined to MN diskset,
2015 * then suspend commd again.
2017 if (MD_MNSET_DESC(sd)) {
2018 nd = sd->sd_nodelist;
2019 /* All nodes are guaranteed to be ALIVE */
2020 while (nd) {
2021 if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
2022 nd = nd->nd_next;
2023 continue;
2025 break;
2027 if (nd) {
2029 * Found node that will be rejoined so
2030 * notify rpc.mdcommd on all nodes of a nodelist change.
2031 * Start by suspending rpc.mdcommd (which drains it of
2032 * all messages), then change the nodelist followed by
2033 * a reinit and resume.
2035 nd = sd->sd_nodelist;
2036 /* All nodes are guaranteed to be ALIVE */
2037 while (nd) {
2038 if (clnt_mdcommdctl(nd->nd_nodename,
2039 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS0,
2040 MD_MSCF_NO_FLAGS, &xep)) {
2041 mdclrerror(&xep);
2043 suspendall_flag_rb = 1;
2044 nd = nd->nd_next;
2051 /* level 6 */
2052 if (rb_level > 5) {
2053 if (MD_MNSET_DESC(sd)) {
2054 int join_flags = 0;
2056 nd = sd->sd_nodelist;
2057 /* All nodes are guaranteed to be ALIVE */
2058 while (nd) {
2059 /* Only rejoin nodes that were joined before */
2060 if (!(nd->nd_flags & MD_MN_NODE_RB_JOIN)) {
2061 nd = nd->nd_next;
2062 continue;
2065 * Rejoin nodes to same state as before -
2066 * either STALE or non-STALE.
2068 if (stale_bool == TRUE)
2069 join_flags = MNSET_IS_STALE;
2070 if (clnt_joinset(nd->nd_nodename, sp,
2071 join_flags, &xep))
2072 mdclrerror(&xep);
2073 /* Sets OWN flag on all nodes in list */
2074 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2075 sd->sd_nodelist, MD_NR_JOIN, NULL, &xep)) {
2076 mdclrerror(&xep);
2078 nd = nd->nd_next;
2080 } else {
2081 if (setup_db_bydd(sp, dd, TRUE, &xep) == -1)
2082 mdclrerror(&xep);
2084 /* No special flag for traditional diskset */
2085 if (snarf_set(sp, NULL, &xep))
2086 mdclrerror(&xep);
2090 /* level 1 */
2091 if (rb_level > 0) {
2093 * Mark the drives as OK.
2095 if (MD_MNSET_DESC(sd)) {
2096 nd = sd->sd_nodelist;
2097 /* All nodes are guaranteed to be ALIVE */
2098 while (nd) {
2100 * Must be last action before unlock.
2101 * In case of panic, recovery code checks
2102 * for MD_DR_OK to know that drive
2103 * and possible master are fully added back.
2105 if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
2106 MD_DR_OK, &xep) == -1)
2107 mdclrerror(&xep);
2108 nd = nd->nd_next;
2110 } else {
2111 for (i = 0; i < MD_MAXSIDES; i++) {
2112 /* Skip empty slots */
2113 if (sd->sd_nodes[i][0] == '\0')
2114 continue;
2116 if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd,
2117 MD_DR_OK, &xep) == -1)
2118 mdclrerror(&xep);
2122 max_genid += 2;
2123 resync_genid(sp, sd, max_genid, 0, NULL);
2126 * Notify rpc.mdcommd on all nodes of a nodelist change.
2127 * Send a reinit command to mdcommd which forces it to get
2128 * fresh set description.
2130 if (suspendall_flag_rb) {
2131 /* Send reinit */
2132 nd = sd->sd_nodelist;
2133 /* All nodes are guaranteed to be ALIVE */
2134 while (nd) {
2135 /* Class is ignored for REINIT */
2136 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
2137 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2138 mde_perror(&xep, dgettext(TEXT_DOMAIN,
2139 "Unable to reinit rpc.mdcommd.\n"));
2140 mdclrerror(&xep);
2142 nd = nd->nd_next;
2147 * Just resume all classes so that resume is the same whether
2148 * just one class was locked or all classes were locked.
2150 if ((suspend1_flag) || (suspendall_flag_rb) || (suspendall_flag)) {
2151 /* Send resume */
2152 nd = sd->sd_nodelist;
2153 /* All nodes are guaranteed to be ALIVE */
2154 while (nd) {
2155 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2156 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2157 mde_perror(&xep, dgettext(TEXT_DOMAIN,
2158 "Unable to resume rpc.mdcommd.\n"));
2159 mdclrerror(&xep);
2161 nd = nd->nd_next;
2163 meta_ping_mnset(sp->setno);
2167 /* level 0 */
2168 cl_sk = cl_get_setkey(sp->setno, sp->setname);
2169 /* Don't test lock flag since guaranteed to be set if in rollback */
2170 if (MD_MNSET_DESC(sd)) {
2171 nd = sd->sd_nodelist;
2172 /* All nodes are guaranteed to be ALIVE */
2173 while (nd) {
2174 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
2175 mdclrerror(&xep);
2176 nd = nd->nd_next;
2178 } else {
2179 for (i = 0; i < MD_MAXSIDES; i++) {
2180 /* Skip empty slots */
2181 if (sd->sd_nodes[i][0] == '\0')
2182 continue;
2184 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
2185 mdclrerror(&xep);
2188 cl_set_setkey(NULL);
2190 /* release signals back to what they were on entry */
2191 if (procsigs(FALSE, &oldsigs, &xep) < 0)
2192 mdclrerror(&xep);
2194 metafreedrivedesc(&dd);
2196 if (flush_set_onerr) {
2197 metaflushsetname(sp);
2198 if (!(MD_MNSET_DESC(sd))) {
2199 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
2203 return (rval);