7712 mandoc -Tlint does always exit with error code 0
[unleashed.git] / usr / src / lib / lvm / libmeta / common / meta_set_hst.c
blob364b463c84f8ea720cd5b07fdfa73139e1e5e6a0
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
28 * Just in case we're not in a build environment, make sure that
29 * TEXT_DOMAIN gets set to something.
31 #if !defined(TEXT_DOMAIN)
32 #define TEXT_DOMAIN "SYS_TEST"
33 #endif
36 * Metadevice diskset interfaces
39 #include "meta_set_prv.h"
40 #include <meta.h>
41 #include <sys/lvm/md_crc.h>
42 #include <sys/time.h>
43 #include <sdssc.h>
45 static int
46 add_db_sidenms(
47 mdsetname_t *sp,
48 md_error_t *ep
51 md_replicalist_t *rlp = NULL;
52 md_replicalist_t *rl;
53 int rval = 0;
55 if (metareplicalist(sp, MD_FULLNAME_ONLY, &rlp, ep) < 0)
56 return (-1);
58 for (rl = rlp; rl != NULL; rl = rl->rl_next) {
59 md_replica_t *r = rl->rl_repp;
62 * This is not the first replica being added to the
63 * diskset so call with ADDSIDENMS_BCAST. If this
64 * is a traditional diskset, the bcast flag is ignored
65 * since traditional disksets don't use the rpc.mdcommd.
67 if (meta_db_addsidenms(sp, r->r_namep, r->r_blkno,
68 DB_ADDSIDENMS_BCAST, ep)) {
69 rval = -1;
70 goto out;
74 out:
75 metafreereplicalist(rlp);
76 return (rval);
79 static int
80 add_drvs_to_hosts(
81 mdsetname_t *sp,
82 int node_c,
83 char **node_v,
84 md_error_t *ep
87 int i;
88 md_set_desc *sd;
89 md_drive_desc *dd;
90 md_timeval32_t now;
91 ulong_t genid;
93 if ((sd = metaget_setdesc(sp, ep)) == NULL)
94 return (-1);
96 if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL) {
97 if (! mdisok(ep))
98 return (-1);
99 return (0);
102 now = sd->sd_ctime;
103 genid = sd->sd_genid - 1;
105 for (i = 0; i < node_c; i++) {
106 if (clnt_adddrvs(node_v[i], sp, dd, now, genid, ep) == -1)
107 return (-1);
110 return (0);
113 static int
114 add_md_sidenms(mdsetname_t *sp, side_t sideno, side_t otherside, md_error_t *ep)
116 mdnm_params_t nm;
117 char *cname, *dname;
118 side_t tmp_sideno;
119 minor_t mnum;
120 int done, i;
121 int rval = 0;
122 md_set_desc *sd;
124 (void) memset(&nm, '\0', sizeof (nm));
125 nm.key = MD_KEYWILD;
127 if (!metaislocalset(sp)) {
128 if ((sd = metaget_setdesc(sp, ep)) == NULL)
129 return (-1);
131 /* Use rpc.mdcommd to add md side info from all nodes */
132 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
133 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
134 md_mn_result_t *resultp = NULL;
135 md_mn_msg_meta_md_addside_t md_as;
136 int send_rval;
138 md_as.msg_sideno = sideno;
139 md_as.msg_otherside = otherside;
141 * If reconfig cycle has been started, this node is stuck in
142 * in the return step until this command has completed. If
143 * mdcommd is suspended, ask send_message to fail (instead of
144 * retrying) so that metaset can finish allowing the
145 * reconfig cycle to proceed.
147 send_rval = mdmn_send_message(sp->setno,
148 MD_MN_MSG_META_MD_ADDSIDE,
149 MD_MSGF_FAIL_ON_SUSPEND | MD_MSGF_PANIC_WHEN_INCONSISTENT,
150 0, (char *)&md_as, sizeof (md_mn_msg_meta_md_addside_t),
151 &resultp, ep);
152 if (send_rval != 0) {
153 (void) mdstealerror(ep, &(resultp->mmr_ep));
154 if (resultp)
155 free_result(resultp);
156 return (-1);
158 if (resultp)
159 free_result(resultp);
160 return (0);
161 } else {
162 /*CONSTCOND*/
163 while (1) {
164 char *drvnm = NULL;
166 nm.mde = mdnullerror;
167 nm.setno = sp->setno;
168 nm.side = otherside;
169 if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0)
170 return (mdstealerror(ep, &nm.mde));
172 if (nm.key == MD_KEYWILD)
173 return (0);
176 * Okay we have a valid key
177 * Let's see if it is hsp or not
179 nm.devname = (uintptr_t)meta_getnmentbykey(sp->setno,
180 otherside, nm.key, &drvnm, NULL, NULL, ep);
181 if (nm.devname == NULL || drvnm == NULL) {
182 if (nm.devname)
183 Free((void *)(uintptr_t)nm.devname);
184 if (drvnm)
185 Free((void *)(uintptr_t)drvnm);
186 return (-1);
190 * If it is hsp add here
192 if (strcmp(drvnm, MD_HOTSPARES) == 0) {
193 if (add_name(sp, sideno, nm.key, MD_HOTSPARES,
194 minor(NODEV), (char *)(uintptr_t)nm.devname,
195 NULL, NULL, ep) == -1) {
196 Free((void *)(uintptr_t)nm.devname);
197 Free((void *)(uintptr_t)drvnm);
198 return (-1);
199 } else {
200 Free((void *)(uintptr_t)nm.devname);
201 Free((void *)(uintptr_t)drvnm);
202 continue;
206 nm.side = sideno;
207 if (MD_MNSET_DESC(sd)) {
208 tmp_sideno = sideno;
209 } else {
210 tmp_sideno = sideno - 1;
213 if ((done = meta_getnextside_devinfo(sp,
214 (char *)(uintptr_t)nm.devname, &tmp_sideno,
215 &cname, &dname, &mnum, ep)) == -1) {
216 Free((void *)(uintptr_t)nm.devname);
217 return (-1);
220 assert(done == 1);
221 Free((void *)(uintptr_t)nm.devname);
222 Free((void *)(uintptr_t)drvnm);
225 * The device reference count can be greater than 1 if
226 * more than one softpart is configured on top of the
227 * same device. If this is the case then we want to
228 * increment the count to sync up with the other sides.
230 for (i = 0; i < nm.ref_count; i++) {
231 if (add_name(sp, sideno, nm.key, dname, mnum,
232 cname, NULL, NULL, ep) == -1)
233 rval = -1;
236 Free(cname);
237 Free(dname);
239 if (rval != 0)
240 return (rval);
244 /*NOTREACHED*/
247 static int
248 check_setdrvs_againstnode(mdsetname_t *sp, char *node, md_error_t *ep)
250 mddrivename_t *dp;
251 md_drive_desc *dd, *ddp;
253 if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL)
254 if (! mdisok(ep))
255 return (-1);
257 for (ddp = dd; ddp != NULL; ddp = ddp->dd_next) {
258 dp = ddp->dd_dnp;
260 if (checkdrive_onnode(sp, dp, node, ep))
261 return (-1);
264 return (0);
267 static int
268 create_multinode_set_on_hosts(
269 mdsetname_t *sp,
270 int node_c, /* Number of new nodes */
271 char **node_v, /* Nodes which are being added */
272 int new_set,
273 md_error_t *ep
276 int i;
277 md_set_desc *sd;
278 md_timeval32_t now;
279 ulong_t genid;
280 int rval = 0;
281 md_mnnode_desc *nd, *ndm = NULL;
282 md_mnnode_desc *nd_prev, *nd_curr;
283 int nodecnt;
284 mndiskset_membershiplist_t *nl, *nl2;
286 if (!new_set) {
287 if ((sd = metaget_setdesc(sp, ep)) == NULL)
288 return (-1);
289 now = sd->sd_ctime;
290 genid = sd->sd_genid - 1;
291 if (sd->sd_drvs)
292 genid--;
293 } else {
294 sd = Zalloc(sizeof (*sd));
296 if (meta_gettimeofday(&now) == -1) {
297 (void) mdsyserror(ep, errno,
298 dgettext(TEXT_DOMAIN, "meta_gettimeofday()"));
299 rval = -1;
300 goto out;
303 /* Put the new entries into the set */
305 * Get membershiplist from API routine. If there's
306 * an error, fail to create set and pass back error.
308 if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
309 rval = -1;
310 goto out;
314 * meta_set_addhosts has already verified that
315 * this node list is in the membership list
316 * so set ALIVE flag.
317 * Since this is a new set, all hosts being
318 * added are new to the set, so also set ADD flag.
320 for (i = 0; i < node_c; i++) {
321 nd = Zalloc(sizeof (*nd));
322 (void) strcpy(nd->nd_nodename, node_v[i]);
323 nd->nd_ctime = now;
324 nd->nd_flags = (MD_MN_NODE_ALIVE |
325 MD_MN_NODE_ADD);
326 nl2 = nl;
327 while (nl2) {
328 if (strcmp(nl2->msl_node_name,
329 node_v[i]) == 0) {
330 nd->nd_nodeid = nl2->msl_node_id;
331 (void) strcpy(nd->nd_priv_ic,
332 nl2->msl_node_addr);
333 break;
335 nl2 = nl2->next;
339 * Nodelist must be kept in ascending
340 * nodeid order.
342 if (sd->sd_nodelist == NULL) {
343 /* Nothing in list, just add it */
344 sd->sd_nodelist = nd;
345 } else if (nd->nd_nodeid < sd->sd_nodelist->nd_nodeid) {
346 /* Add to head of list */
347 nd->nd_next = sd->sd_nodelist;
348 sd->sd_nodelist = nd;
349 } else {
350 nd_curr = sd->sd_nodelist->nd_next;
351 nd_prev = sd->sd_nodelist;
352 /* Search for place ot add it */
353 while (nd_curr) {
354 if (nd->nd_nodeid <
355 nd_curr->nd_nodeid) {
356 /* Add before nd_curr */
357 nd->nd_next = nd_curr;
358 nd_prev->nd_next = nd;
359 break;
361 nd_prev = nd_curr;
362 nd_curr = nd_curr->nd_next;
364 /* Add to end of list */
365 if (nd_curr == NULL) {
366 nd_prev->nd_next = nd;
370 /* Set master to be first node added */
371 if (ndm == NULL)
372 ndm = nd;
375 meta_free_nodelist(nl);
377 * Creating mnset for first time.
378 * Set master to be invalid until first drive is
379 * in set.
381 (void) strcpy(sd->sd_mn_master_nodenm, "");
382 sd->sd_mn_master_nodeid = MD_MN_INVALID_NID;
383 sd->sd_mn_masternode = ndm;
384 sd->sd_ctime = now;
385 genid = sd->sd_genid = 0;
388 /* Create the set where needed */
389 for (i = 0; i < node_c; i++) {
391 * Create the set on each new node. If the set already
392 * exists, then the node list being created on each new node
393 * is the current node list from before the new nodes
394 * were added. If the set doesn't exist, then the node
395 * list being created on each new node is the entire
396 * new node list.
398 if (clnt_mncreateset(node_v[i], sp, sd->sd_nodelist,
399 now, genid, sd->sd_mn_master_nodenm,
400 sd->sd_mn_master_nodeid, ep) == -1) {
401 rval = -1;
402 break;
406 out:
407 if (new_set) {
408 nd = sd->sd_nodelist;
409 while (nd) {
410 sd->sd_nodelist = nd->nd_next;
411 Free(nd);
412 nd = sd->sd_nodelist;
414 Free(sd);
417 if (rval != 0 || new_set)
418 return (rval);
421 * Add the drive records to the new sets
422 * and names for the new sides.
424 return (add_drvs_to_hosts(sp, node_c, node_v, ep));
428 static int
429 create_traditional_set_on_hosts(
430 mdsetname_t *sp,
431 int node_c, /* Number of new nodes */
432 char **node_v, /* Nodes which are being added */
433 int new_set,
434 md_error_t *ep
437 int i;
438 md_set_desc *sd;
439 md_timeval32_t now;
440 ulong_t genid;
441 int rval = 0;
443 if (!new_set) {
445 if ((sd = metaget_setdesc(sp, ep)) == NULL)
446 return (-1);
447 now = sd->sd_ctime;
449 genid = sd->sd_genid;
451 if (sd->sd_drvs)
452 genid--;
453 } else {
454 if (node_c > MD_MAXSIDES)
455 return (mddserror(ep, MDE_DS_SIDENUMNOTAVAIL,
456 sp->setno, NULL, NULL, sp->setname));
458 sd = Zalloc(sizeof (*sd));
460 /* Put the new entries into the set */
461 for (i = 0; i < node_c; i++) {
462 (void) strcpy(sd->sd_nodes[i], node_v[i]);
465 if (meta_gettimeofday(&now) == -1) {
466 (void) mdsyserror(ep, errno, "meta_gettimeofday()");
467 rval = -1;
468 goto out;
471 sd->sd_ctime = now;
472 genid = sd->sd_genid = 0;
475 /* Create the set where needed */
476 for (i = 0; i < node_c; i++) {
478 * Create the set on each new host
480 if (clnt_createset(node_v[i], sp, sd->sd_nodes, now, genid,
481 ep) == -1) {
482 rval = -1;
483 break;
487 out:
488 if (new_set)
489 Free(sd);
491 if (rval != 0 || new_set)
492 return (rval);
495 * Add the drive records to the new sets
496 * and names for the new sides.
498 return (add_drvs_to_hosts(sp, node_c, node_v, ep));
501 static int
502 create_set_on_hosts(
503 mdsetname_t *sp,
504 int multi_node, /* Multi_node diskset or not? */
505 int node_c, /* Number of new nodes */
506 char **node_v, /* Nodes which are being added */
507 int new_set,
508 md_error_t *ep
511 if (multi_node)
512 return (create_multinode_set_on_hosts(sp, node_c, node_v,
513 new_set, ep));
514 else
515 return (create_traditional_set_on_hosts(sp, node_c, node_v,
516 new_set, ep));
519 static int
520 create_set(
521 mdsetname_t *sp,
522 int multi_node, /* Multi-node diskset or not? */
523 int node_c,
524 char **node_v,
525 int auto_take,
526 md_error_t *ep
529 int i;
530 int rval = 0;
531 set_t max_sets;
532 set_t setno;
533 int bool;
534 uint_t sr_flags;
535 sigset_t oldsigs;
536 md_setkey_t *cl_sk;
537 int rb_level = 0;
538 md_error_t xep = mdnullerror;
539 rval_e sdssc_rval;
540 int lock_flag = 0;
541 int sig_flag = 0;
543 if ((max_sets = get_max_sets(ep)) == 0)
544 return (-1);
546 /* We must be a member of the set we are creating */
547 if (! strinlst(mynode(), node_c, node_v))
548 return (mddserror(ep, MDE_DS_SELFNOTIN,
549 sp->setno, mynode(), NULL, sp->setname));
552 * If auto_take then we must be the only member of the set
553 * that we are creating.
555 if (auto_take && node_c > 1)
556 return (mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, NULL,
557 sp->setname));
560 * If we're part of SC3.0 we'll already have allocated the
561 * set number so we can skip the allocation algorithm used.
562 * Set number is unique across traditional and MN disksets.
564 if ((sdssc_rval = sdssc_get_index(sp->setname, &setno))
565 == SDSSC_NOT_BOUND) {
567 for (i = 0; i < node_c; i++) {
568 int has_set;
570 /* Skip my node */
571 if (strcmp(mynode(), node_v[i]) == 0)
572 continue;
575 * Make sure this set name is not used on the
576 * other hosts
578 has_set = nodehasset(sp, node_v[i], NHS_N_EQ, ep);
579 if (has_set < 0) {
580 if (! mdiserror(ep, MDE_NO_SET)) {
581 rval = -1;
582 goto out;
584 mdclrerror(ep);
585 continue;
588 if (has_set) {
589 (void) mddserror(ep, MDE_DS_NODEHASSET,
590 sp->setno, node_v[i], NULL, sp->setname);
591 rval = -1;
592 goto out;
596 for (setno = 1; setno < max_sets; setno++) {
597 for (i = 0; i < node_c; i++) {
598 if (clnt_setnumbusy(node_v[i], setno,
599 &bool, ep) == -1) {
600 rval = -1;
601 goto out;
604 if (bool == TRUE)
605 break;
607 if (i == node_c)
608 break;
610 } else if (sdssc_rval != SDSSC_OKAY) {
611 (void) mddserror(ep, MDE_DS_SETNUMNOTAVAIL, MD_SET_BAD, NULL,
612 NULL, sp->setname);
613 rval = -1;
614 goto out;
617 if (setno == max_sets) {
618 (void) mddserror(ep, MDE_DS_SETNUMNOTAVAIL, MD_SET_BAD, NULL,
619 NULL, sp->setname);
620 rval = -1;
621 goto out;
624 sp->setno = setno;
627 * Lock the set on current set members.
628 * Set locking done much earlier for MN diskset than for traditional
629 * diskset since lock_set is used to protect against
630 * other meta* commands running on the other nodes.
631 * Don't issue mdcommd SUSPEND command since there is nothing
632 * to suspend since there currently is no set.
634 if (multi_node) {
635 /* Make sure we are blocking all signals */
636 if (procsigs(TRUE, &oldsigs, &xep) < 0)
637 mdclrerror(&xep);
638 sig_flag = 1;
640 /* Lock the set on new set members */
641 for (i = 0; i < node_c; i++) {
642 if (clnt_lock_set(node_v[i], sp, ep)) {
643 rval = -1;
644 goto out;
646 lock_flag = 1;
648 /* Now have the diskset locked, verify set number is still ok */
649 for (i = 0; i < node_c; i++) {
650 if (clnt_setnumbusy(node_v[i], setno,
651 &bool, ep) == -1) {
652 rval = -1;
653 goto out;
659 if (meta_set_checkname(sp->setname, ep)) {
660 rval = -1;
661 goto out;
664 for (i = 0; i < node_c; i++) {
665 if (clnt_setnameok(node_v[i], sp, &bool, ep) == -1) {
666 rval = -1;
667 goto out;
669 if (bool == FALSE) {
670 (void) mddserror(ep, MDE_DS_SETNAMEBUSY, sp->setno,
671 node_v[i], NULL, sp->setname);
672 rval = -1;
673 goto out;
677 /* END CHECK CODE */
679 /* Lock the set on new set members */
680 if (!multi_node) {
681 md_rb_sig_handling_on();
682 sig_flag = 1;
683 for (i = 0; i < node_c; i++) {
684 if (clnt_lock_set(node_v[i], sp, ep)) {
685 rval = -1;
686 goto out;
688 lock_flag = 1;
692 RB_TEST(1, "create_set", ep)
694 RB_PREEMPT;
695 rb_level = 1; /* level 1 */
697 RB_TEST(2, "create_set", ep)
699 if ((rval = create_set_on_hosts(sp, multi_node, node_c, node_v,
700 1, ep)) == -1)
701 goto rollback;
703 RB_TEST(3, "create_set", ep)
705 if (auto_take)
706 sr_flags = MD_SR_OK | MD_SR_AUTO_TAKE;
707 else
708 sr_flags = MD_SR_OK;
711 * Mark the set record MD_SR_OK
713 for (i = 0; i < node_c; i++)
714 if (clnt_upd_sr_flags(node_v[i], sp, sr_flags, ep))
715 goto rollback;
717 rb_level = 2; /* level 2 */
720 * For MN diskset:
721 * On each added node, set the node record for that node
722 * to OK. Then set all node records for the newly added
723 * nodes on all nodes to ok.
725 * By setting a node's own node record to ok first, even if
726 * the node adding the hosts panics, the rest of the nodes can
727 * determine the same node list during the choosing of the master
728 * during reconfig. So, only nodes considered for mastership
729 * are nodes that have both MD_MN_NODE_OK and MD_SR_OK set
730 * on that node's rpc.metad. If all nodes have MD_SR_OK set,
731 * but no node has its own MD_MN_NODE_OK set, then the set will
732 * be removed during reconfig since a panic occurred during the
733 * creation of the initial diskset.
736 if (multi_node) {
737 md_mnnode_desc *nd, *saved_nd_next;
738 md_set_desc *sd;
740 if ((sd = metaget_setdesc(sp, ep)) == NULL) {
741 goto rollback;
744 for (i = 0; i < node_c; i++) {
745 nd = sd->sd_nodelist;
746 /* All nodes are guaranteed to be ALIVE */
747 while (nd) {
748 if (strcmp(nd->nd_nodename, node_v[i]) == 0)
749 break;
750 nd = nd->nd_next;
752 /* Something wrong, will pick this up in next loop */
753 if (nd == NULL)
754 continue;
756 /* Only changing my local cache of node list */
757 saved_nd_next = nd->nd_next;
758 nd->nd_next = NULL;
760 /* Set node record for added host to ok on that host */
761 if (clnt_upd_nr_flags(node_v[i], sp,
762 nd, MD_NR_OK, NULL, ep)) {
763 nd->nd_next = saved_nd_next;
764 goto rollback;
766 nd->nd_next = saved_nd_next;
769 /* Now set all node records on all nodes to be ok */
770 nd = sd->sd_nodelist;
771 /* All nodes are guaranteed to be ALIVE */
772 while (nd) {
773 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
774 sd->sd_nodelist, MD_NR_OK, NULL, ep)) {
775 goto rollback;
777 nd = nd->nd_next;
781 RB_TEST(4, "create_set", ep)
783 out:
784 if ((rval == 0) && multi_node) {
786 * Set successfully created.
787 * Notify rpc.mdcommd on all nodes of a nodelist change.
788 * Send reinit command to mdcommd which forces it to get
789 * fresh set description. Then send resume.
790 * Resume on class 0 will resume all classes.
792 for (i = 0; i < node_c; i++) {
793 /* Class is ignored for REINIT */
794 if (clnt_mdcommdctl(node_v[i], COMMDCTL_REINIT,
795 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
796 if (rval == 0)
797 (void) mdstealerror(ep, &xep);
798 rval = -1;
799 mde_perror(ep, dgettext(TEXT_DOMAIN,
800 "Unable to reinit rpc.mdcommd.\n"));
803 for (i = 0; i < node_c; i++) {
804 if (clnt_mdcommdctl(node_v[i], COMMDCTL_RESUME,
805 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
806 if (rval == 0)
807 (void) mdstealerror(ep, &xep);
808 rval = -1;
809 mde_perror(ep, dgettext(TEXT_DOMAIN,
810 "Unable to resume rpc.mdcommd.\n"));
813 meta_ping_mnset(sp->setno);
815 if (lock_flag) {
816 cl_sk = cl_get_setkey(sp->setno, sp->setname);
817 for (i = 0; i < node_c; i++) {
818 if (clnt_unlock_set(node_v[i], cl_sk, &xep)) {
819 if (rval == 0)
820 (void) mdstealerror(ep, &xep);
821 rval = -1;
824 cl_set_setkey(NULL);
827 if (sig_flag) {
828 if (multi_node) {
829 /* release signals back to what they were on entry */
830 if (procsigs(FALSE, &oldsigs, &xep) < 0)
831 mdclrerror(&xep);
832 } else {
833 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
837 return (rval);
839 rollback:
840 /* all signals already blocked for MN disket */
841 if (!multi_node) {
842 /* Make sure we are blocking all signals */
843 if (procsigs(TRUE, &oldsigs, &xep) < 0)
844 mdclrerror(&xep);
847 rval = -1;
850 * For MN diskset:
851 * On each added node (which is now each node to be deleted),
852 * set the node record for that node to DEL. Then set all
853 * node records for the newly added (soon to be deleted) nodes
854 * on all nodes to ok.
856 * By setting a node's own node record to DEL first, even if
857 * the node doing the rollback panics, the rest of the nodes can
858 * determine the same node list during the choosing of the master
859 * during reconfig.
862 /* level 3 */
863 if ((rb_level > 1) && (multi_node)) {
864 md_mnnode_desc *nd, *saved_nd_next;
865 md_set_desc *sd;
867 if ((sd = metaget_setdesc(sp, &xep)) == NULL) {
868 mdclrerror(&xep);
871 for (i = 0; i < node_c; i++) {
872 nd = sd->sd_nodelist;
873 /* All nodes are guaranteed to be ALIVE */
874 while (nd) {
875 if (strcmp(nd->nd_nodename, node_v[i]) == 0)
876 break;
877 nd = nd->nd_next;
879 /* Something wrong, will pick this up in next loop */
880 if (nd == NULL)
881 continue;
883 /* Only changing my local cache of node list */
884 saved_nd_next = nd->nd_next;
885 nd->nd_next = NULL;
887 /* Set node record for added host to DEL on that host */
888 if (clnt_upd_nr_flags(node_v[i], sp,
889 nd, MD_NR_DEL, NULL, &xep)) {
890 nd->nd_next = saved_nd_next;
891 mdclrerror(&xep);
893 nd->nd_next = saved_nd_next;
896 /* Now set all node records on all nodes to be DEL */
897 nd = sd->sd_nodelist;
898 /* All nodes are guaranteed to be ALIVE */
899 while (nd) {
900 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
901 sd->sd_nodelist, MD_NR_DEL, NULL, &xep)) {
902 mdclrerror(&xep);
904 nd = nd->nd_next;
907 /* Mark set record on all hosts to be DELETED */
908 for (i = 0; i < node_c; i++) {
909 if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, &xep)) {
910 mdclrerror(&xep);
914 /* level 1 */
915 if (rb_level > 0) {
916 for (i = 0; i < node_c; i++) {
917 if (clnt_delset(node_v[i], sp, &xep) == -1)
918 mdclrerror(&xep);
922 /* level 0 */
923 /* Don't test lock flag since guaranteed to be set if in rollback */
924 cl_sk = cl_get_setkey(sp->setno, sp->setname);
925 for (i = 0; i < node_c; i++) {
926 if (clnt_unlock_set(node_v[i], cl_sk, &xep))
927 mdclrerror(&xep);
929 cl_set_setkey(NULL);
931 /* release signals back to what they were on entry */
932 if (procsigs(FALSE, &oldsigs, &xep) < 0)
933 mdclrerror(&xep);
935 if ((sig_flag) && (!multi_node))
936 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
938 return (rval);
941 static int
942 del_db_sidenms(
943 mdsetname_t *sp,
944 side_t sideno,
945 md_error_t *ep
948 md_replicalist_t *rlp = NULL;
949 md_replicalist_t *rl;
950 int rval = 0;
952 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0)
953 return (-1);
955 for (rl = rlp; rl != NULL; rl = rl->rl_next) {
956 md_replica_t *r = rl->rl_repp;
958 if (meta_db_delsidenm(sp, sideno, r->r_namep, r->r_blkno, ep)) {
959 rval = -1;
960 goto out;
964 out:
965 metafreereplicalist(rlp);
966 return (rval);
969 static int
970 del_drvs_from_hosts(
971 mdsetname_t *sp,
972 md_set_desc *sd,
973 md_drive_desc *dd,
974 int node_c,
975 char **node_v,
976 int oha,
977 md_error_t *ep
980 int i;
981 md_mnnode_desc *nd;
983 for (i = 0; i < node_c; i++) {
984 if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
986 * During OHA mode, don't issue RPCs to
987 * non-alive nodes since there is no reason to
988 * wait for RPC timeouts.
990 nd = sd->sd_nodelist;
991 while (nd) {
992 if (strcmp(nd->nd_nodename, node_v[i]) == 0)
993 break;
994 nd = nd->nd_next;
996 if (nd == NULL) {
997 return (mddserror(ep, MDE_DS_NOTINMEMBERLIST,
998 sp->setno, nd->nd_nodename,
999 NULL, sp->setname));
1002 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1003 continue;
1005 if (clnt_deldrvs(node_v[i], sp, dd, ep)) {
1006 return (-1);
1008 } else if (MD_MNSET_DESC(sd) && (oha == FALSE)) {
1010 * All nodes should be alive in non-oha mode.
1012 if (clnt_deldrvs(node_v[i], sp, dd, ep)) {
1013 return (-1);
1015 } else {
1017 * For traditional diskset, issue the RPC and
1018 * ignore RPC failure if in OHA mode.
1020 if (clnt_deldrvs(node_v[i], sp, dd, ep)) {
1021 if (oha == TRUE && mdanyrpcerror(ep)) {
1022 mdclrerror(ep);
1023 continue;
1025 return (-1);
1030 return (0);
1033 static int
1034 del_host_noset(
1035 mdsetname_t *sp,
1036 char **anode,
1037 md_error_t *ep
1040 int rval = 0;
1041 md_setkey_t *cl_sk;
1042 md_drive_desc *dd;
1043 md_error_t xep = mdnullerror;
1044 md_set_desc *sd;
1046 if ((sd = metaget_setdesc(sp, ep)) == NULL)
1047 return (-1);
1049 /* Make sure we own the set */
1050 if (meta_check_ownership(sp, ep) != 0)
1051 return (-1);
1053 /* Lock the set on our side */
1054 if (clnt_lock_set(mynode(), sp, ep)) {
1055 rval = -1;
1056 goto out;
1059 if (clnt_delhosts(mynode(), sp, 1, anode, ep)) {
1060 rval = -1;
1061 goto out;
1064 if (!MD_MNSET_DESC(sd)) {
1065 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
1066 ep)) == NULL) {
1067 if (! mdisok(ep)) {
1068 rval = -1;
1069 goto out;
1073 /* If we have drives */
1074 if (dd != NULL) {
1075 if (clnt_del_drv_sidenms(mynode(), sp, ep)) {
1076 rval = -1;
1077 goto out;
1082 out:
1083 cl_sk = cl_get_setkey(sp->setno, sp->setname);
1084 if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
1085 if (rval == 0)
1086 (void) mdstealerror(ep, &xep);
1087 rval = -1;
1089 cl_set_setkey(NULL);
1091 metaflushsetname(sp);
1093 return (rval);
1096 static int
1097 del_md_sidenms(mdsetname_t *sp, side_t sideno, md_error_t *ep)
1099 mdnm_params_t nm;
1100 md_set_desc *sd;
1101 int i;
1103 if (!metaislocalset(sp)) {
1104 if ((sd = metaget_setdesc(sp, ep)) == NULL)
1105 return (-1);
1107 /* Use rpc.mdcommd to add md side info from all nodes */
1108 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
1109 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
1110 md_mn_result_t *resultp = NULL;
1111 md_mn_msg_meta_md_delside_t md_ds;
1112 int send_rval;
1114 md_ds.msg_sideno = sideno;
1116 * If reconfig cycle has been started, this node is stuck in
1117 * in the return step until this command has completed. If
1118 * mdcommd is suspended, ask send_message to fail (instead of
1119 * retrying) so that metaset can finish allowing the
1120 * reconfig cycle to proceed.
1122 send_rval = mdmn_send_message(sp->setno,
1123 MD_MN_MSG_META_MD_DELSIDE,
1124 MD_MSGF_FAIL_ON_SUSPEND | MD_MSGF_PANIC_WHEN_INCONSISTENT,
1125 0, (char *)&md_ds, sizeof (md_mn_msg_meta_md_delside_t),
1126 &resultp, ep);
1127 if (send_rval != 0) {
1128 (void) mdstealerror(ep, &(resultp->mmr_ep));
1129 if (resultp)
1130 free_result(resultp);
1131 return (-1);
1133 if (resultp)
1134 free_result(resultp);
1135 } else {
1136 (void) memset(&nm, '\0', sizeof (nm));
1137 nm.key = MD_KEYWILD;
1139 /*CONSTCOND*/
1140 while (1) {
1141 nm.mde = mdnullerror;
1142 nm.setno = sp->setno;
1143 nm.side = MD_SIDEWILD;
1144 if (metaioctl(MD_IOCNXTKEY_NM, &nm, &nm.mde, NULL) != 0)
1145 return (mdstealerror(ep, &nm.mde));
1147 if (nm.key == MD_KEYWILD)
1148 return (0);
1151 * The device reference count can be greater than 1 if
1152 * more than one softpart is configured on top of the
1153 * same device. If this is the case then we want to
1154 * decrement the count to zero so the entry can be
1155 * actually removed.
1157 for (i = 0; i < nm.ref_count; i++) {
1158 if (del_name(sp, sideno, nm.key, ep) == -1)
1159 return (-1);
1163 return (0);
1166 static void
1167 recreate_set(
1168 mdsetname_t *sp,
1169 md_set_desc *sd
1172 int i;
1173 int has_set;
1174 md_error_t xep = mdnullerror;
1175 md_mnnode_desc *nd;
1177 if (MD_MNSET_DESC(sd)) {
1178 nd = sd->sd_nodelist;
1179 while (nd) {
1180 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1181 nd = nd->nd_next;
1182 continue;
1184 has_set = nodehasset(sp, nd->nd_nodename,
1185 NHS_NST_EQ, &xep);
1187 if (has_set >= 0) {
1188 nd = nd->nd_next;
1189 continue;
1192 mdclrerror(&xep);
1194 if (clnt_mncreateset(nd->nd_nodename, sp,
1195 sd->sd_nodelist,
1196 sd->sd_ctime, sd->sd_genid,
1197 sd->sd_mn_master_nodenm,
1198 sd->sd_mn_master_nodeid, &xep) == -1)
1199 mdclrerror(&xep);
1200 nd = nd->nd_next;
1202 } else {
1203 for (i = 0; i < MD_MAXSIDES; i++) {
1204 /* Skip empty slots */
1205 if (sd->sd_nodes[i][0] == '\0')
1206 continue;
1208 has_set = nodehasset(sp, sd->sd_nodes[i],
1209 NHS_NST_EQ, &xep);
1211 if (has_set >= 0)
1212 continue;
1214 mdclrerror(&xep);
1216 if (clnt_createset(sd->sd_nodes[i], sp, sd->sd_nodes,
1217 sd->sd_ctime, sd->sd_genid, &xep) == -1)
1218 mdclrerror(&xep);
1224 * If a MN diskset, set is already locked on all nodes via clnt_lock_set.
1226 static int
1227 del_set_nodrives(
1228 mdsetname_t *sp,
1229 int node_c,
1230 char **node_v,
1231 int oha,
1232 md_error_t *ep
1235 md_set_desc *sd;
1236 int i;
1237 sigset_t oldsigs;
1238 md_setkey_t *cl_sk;
1239 int rb_level = 0;
1240 ulong_t max_genid = 0;
1241 int rval = 0;
1242 md_error_t xep = mdnullerror;
1243 md_mnnode_desc *nd;
1244 int delete_end = 1;
1246 if ((sd = metaget_setdesc(sp, ep)) == NULL)
1247 return (-1);
1249 if (MD_MNSET_DESC(sd)) {
1250 /* Make sure we are blocking all signals */
1251 if (procsigs(TRUE, &oldsigs, &xep) < 0)
1252 mdclrerror(&xep);
1253 } else {
1254 md_rb_sig_handling_on();
1258 * Lock the set on current set members for traditional disksets.
1260 if (!(MD_MNSET_DESC(sd))) {
1261 for (i = 0; i < node_c; i++) {
1263 * For traditional diskset, issue the RPC and
1264 * ignore RPC failure if in OHA mode.
1266 if (clnt_lock_set(node_v[i], sp, ep)) {
1267 if (oha == TRUE && mdanyrpcerror(ep)) {
1268 mdclrerror(ep);
1269 continue;
1271 rval = -1;
1272 goto out;
1278 RB_TEST(1, "deletehosts", ep)
1280 RB_PREEMPT;
1281 rb_level = 1; /* level 1 */
1283 RB_TEST(2, "deletehosts", ep)
1286 * Mark the set record MD_SR_DEL
1288 for (i = 0; i < node_c; i++) {
1290 RB_TEST(3, "deletehosts", ep)
1292 if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
1294 * During OHA mode, don't issue RPCs to
1295 * non-alive nodes since there is no reason to
1296 * wait for RPC timeouts.
1298 nd = sd->sd_nodelist;
1299 while (nd) {
1300 if (strcmp(nd->nd_nodename, node_v[i]) == 0)
1301 break;
1302 nd = nd->nd_next;
1304 if (nd == NULL) {
1305 (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
1306 sp->setno, nd->nd_nodename,
1307 NULL, sp->setname);
1308 goto rollback;
1311 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1312 continue;
1315 if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
1316 goto rollback;
1318 } else if (MD_MNSET_DESC(sd) && (oha == FALSE)) {
1320 * All nodes should be alive in non-oha mode.
1322 if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
1323 goto rollback;
1325 } else {
1327 * For traditional diskset, issue the RPC and
1328 * ignore RPC failure if in OHA mode.
1330 if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
1331 if (oha == TRUE && mdanyrpcerror(ep)) {
1332 mdclrerror(ep);
1333 continue;
1335 goto rollback;
1339 RB_TEST(4, "deletehosts", ep)
1342 RB_TEST(5, "deletehosts", ep)
1344 RB_PREEMPT;
1345 rb_level = 2; /* level 2 */
1347 RB_TEST(6, "deletehosts", ep)
1349 if (sdssc_delete_begin(sp->setname) == SDSSC_ERROR)
1350 if (metad_isautotakebyname(sp->setname))
1351 delete_end = 0;
1352 else
1353 goto rollback;
1355 /* The set is OK to delete, make it so. */
1356 for (i = 0; i < node_c; i++) {
1358 RB_TEST(7, "deletehosts", ep)
1360 if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
1362 * During OHA mode, don't issue RPCs to
1363 * non-alive nodes since there is no reason to
1364 * wait for RPC timeouts.
1366 nd = sd->sd_nodelist;
1367 while (nd) {
1368 if (strcmp(nd->nd_nodename, node_v[i]) == 0)
1369 break;
1370 nd = nd->nd_next;
1372 if (nd == NULL) {
1373 (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
1374 sp->setno, nd->nd_nodename,
1375 NULL, sp->setname);
1376 goto rollback;
1379 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1380 continue;
1383 if (clnt_delset(node_v[i], sp, ep) == -1) {
1384 goto rollback;
1386 } else if (MD_MNSET_DESC(sd) && (oha == FALSE)) {
1388 * All nodes should be alive in non-oha mode.
1390 if (clnt_delset(node_v[i], sp, ep) == -1) {
1391 goto rollback;
1393 } else {
1395 * For traditional diskset, issue the RPC and
1396 * ignore RPC failure if in OHA mode.
1398 if (clnt_delset(node_v[i], sp, ep) == -1) {
1399 if (oha == TRUE && mdanyrpcerror(ep)) {
1400 mdclrerror(ep);
1401 continue;
1403 goto rollback;
1407 RB_TEST(8, "deletehosts", ep)
1410 RB_TEST(9, "deletehosts", ep)
1412 out:
1414 * Unlock the set on current set members
1415 * for traditional disksets.
1417 if (!(MD_MNSET_DESC(sd))) {
1418 cl_sk = cl_get_setkey(sp->setno, sp->setname);
1419 for (i = 0; i < node_c; i++) {
1421 * For traditional diskset, issue the RPC and
1422 * ignore RPC failure if in OHA mode.
1424 if (clnt_unlock_set(node_v[i], cl_sk, &xep)) {
1425 if (oha == TRUE && mdanyrpcerror(&xep)) {
1426 mdclrerror(&xep);
1427 continue;
1429 if (rval == 0)
1430 (void) mdstealerror(ep, &xep);
1431 rval = -1;
1434 cl_set_setkey(NULL);
1438 * A MN diskset has the clnt_locks held by meta_set_deletehosts so
1439 * don't flush that data until meta_set_deletehosts has finished
1440 * with it. meta_set_deletehosts will handle the flush of the
1441 * setname.
1443 if (!(MD_MNSET_DESC(sd))) {
1444 metaflushsetname(sp);
1447 if (delete_end &&
1448 sdssc_delete_end(sp->setname, SDSSC_COMMIT) == SDSSC_ERROR)
1449 rval = -1;
1451 if (MD_MNSET_DESC(sd)) {
1452 /* release signals back to what they were on entry */
1453 if (procsigs(FALSE, &oldsigs, &xep) < 0)
1454 mdclrerror(&xep);
1455 } else {
1456 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1459 return (rval);
1461 rollback:
1462 /* all signals already blocked for MN diskset */
1463 if (!(MD_MNSET_DESC(sd))) {
1464 /* Make sure we are blocking all signals */
1465 if (procsigs(TRUE, &oldsigs, &xep) < 0)
1466 mdclrerror(&xep);
1469 rval = -1;
1471 max_genid = sd->sd_genid;
1473 /* level 2 */
1474 if (rb_level > 1) {
1475 recreate_set(sp, sd);
1476 max_genid++;
1478 if (delete_end)
1479 (void) sdssc_delete_end(sp->setname, SDSSC_CLEANUP);
1482 /* level 1 */
1483 if (rb_level > 0) {
1484 max_genid++;
1485 resync_genid(sp, sd, max_genid, node_c, node_v);
1488 /* level 0 */
1490 * Unlock the set on current set members
1491 * for traditional disksets.
1493 if (!(MD_MNSET_DESC(sd))) {
1494 cl_sk = cl_get_setkey(sp->setno, sp->setname);
1495 for (i = 0; i < node_c; i++) {
1497 * For traditional diskset, issue the RPC and
1498 * ignore RPC failure if in OHA mode.
1500 if (clnt_unlock_set(node_v[i], cl_sk, &xep))
1501 mdclrerror(&xep);
1503 cl_set_setkey(NULL);
1506 /* release signals back to what they were on entry */
1507 if (procsigs(FALSE, &oldsigs, &xep) < 0)
1508 mdclrerror(&xep);
1511 * A MN diskset has the clnt_locks held by meta_set_deletehosts so
1512 * don't flush that data until meta_set_deletehosts has finished
1513 * with it. meta_set_deletehosts will handle the flush of the
1514 * setname.
1516 if (!(MD_MNSET_DESC(sd))) {
1517 metaflushsetname(sp);
1518 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1521 return (rval);
1525 * On entry:
1526 * procsigs already called for MN diskset.
1527 * md_rb_sig_handling already called for traditional diskset.
1529 static int
1530 del_set_on_hosts(
1531 mdsetname_t *sp,
1532 md_set_desc *sd,
1533 md_drive_desc *dd,
1534 int node_c, /* Number of nodes */
1535 char **node_v, /* Nodes being deleted */
1536 int oha,
1537 md_error_t *ep
1540 int i;
1541 int j;
1542 side_t sideno;
1543 md_replicalist_t *rlp = NULL;
1544 sigset_t oldsigs;
1545 md_setkey_t *cl_sk;
1546 ulong_t max_genid = 0;
1547 int rb_level = 1; /* This is a special case */
1548 md_error_t xep = mdnullerror;
1549 md_mnnode_desc *nd;
1551 RB_PREEMPT;
1553 RB_TEST(7, "deletehosts", ep)
1555 if (dd != NULL) {
1557 * May need this to re-add sidenames on roll back.
1559 if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp,
1560 ep) < 0)
1561 goto rollback;
1563 RB_TEST(8, "deletehosts", ep)
1565 RB_PREEMPT;
1566 rb_level = 2; /* level 2 */
1568 RB_TEST(9, "deletehosts", ep)
1570 if (del_drvs_from_hosts(sp, sd, dd, node_c, node_v, oha, ep))
1571 goto rollback;
1573 RB_TEST(10, "deletehosts", ep)
1575 RB_PREEMPT;
1576 rb_level = 3; /* level 3 */
1578 RB_TEST(11, "deletehosts", ep)
1581 * Delete the db replica sides
1582 * This is done before the next loop, so that
1583 * the db does not get unloaded before we are finished
1584 * deleting the sides.
1586 if (MD_MNSET_DESC(sd)) {
1587 nd = sd->sd_nodelist;
1588 while (nd) {
1589 /* Skip hosts not being deleted */
1590 if (! strinlst(nd->nd_nodename, node_c,
1591 node_v)) {
1592 nd = nd->nd_next;
1593 continue;
1596 if (del_db_sidenms(sp, nd->nd_nodeid, ep))
1597 goto rollback;
1599 RB_TEST(12, "deletehosts", ep)
1600 nd = nd->nd_next;
1602 } else {
1603 for (sideno = 0; sideno < MD_MAXSIDES; sideno++) {
1604 /* Skip empty slots */
1605 if (sd->sd_nodes[sideno][0] == '\0')
1606 continue;
1608 /* Skip hosts not being deleted */
1609 if (! strinlst(sd->sd_nodes[sideno], node_c,
1610 node_v))
1611 continue;
1613 if (del_db_sidenms(sp, sideno, ep))
1614 goto rollback;
1616 RB_TEST(12, "deletehosts", ep)
1620 RB_TEST(13, "deletehosts", ep)
1622 RB_PREEMPT;
1623 rb_level = 4; /* level 4 */
1625 RB_TEST(14, "deletehosts", ep)
1627 /* Delete the names from the namespace */
1628 if (MD_MNSET_DESC(sd)) {
1629 nd = sd->sd_nodelist;
1630 while (nd) {
1631 /* Skip hosts not being deleted */
1632 if (! strinlst(nd->nd_nodename, node_c,
1633 node_v)) {
1634 nd = nd->nd_next;
1635 continue;
1638 if (del_md_sidenms(sp, nd->nd_nodeid, ep))
1639 goto rollback;
1641 RB_TEST(15, "deletehosts", ep)
1642 nd = nd->nd_next;
1644 } else {
1645 for (sideno = 0; sideno < MD_MAXSIDES; sideno++) {
1646 /* Skip empty slots */
1647 if (sd->sd_nodes[sideno][0] == '\0')
1648 continue;
1650 /* Skip hosts not being deleted */
1651 if (! strinlst(sd->sd_nodes[sideno], node_c,
1652 node_v))
1653 continue;
1655 if (del_md_sidenms(sp, sideno, ep))
1656 goto rollback;
1658 RB_TEST(15, "deletehosts", ep)
1663 RB_TEST(16, "deletehosts", ep)
1665 RB_PREEMPT;
1666 rb_level = 5; /* level 6 */
1668 RB_TEST(17, "deletehosts", ep)
1670 for (i = 0; i < node_c; i++) {
1671 if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
1673 * During OHA mode, don't issue RPCs to
1674 * non-alive nodes since there is no reason to
1675 * wait for RPC timeouts.
1677 nd = sd->sd_nodelist;
1678 while (nd) {
1679 if (strcmp(nd->nd_nodename, node_v[i]) == 0)
1680 break;
1681 nd = nd->nd_next;
1683 if (nd == NULL) {
1684 (void) mddserror(ep, MDE_DS_NOTINMEMBERLIST,
1685 sp->setno, nd->nd_nodename,
1686 NULL, sp->setname);
1687 goto rollback;
1690 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
1691 continue;
1694 if (clnt_delset(node_v[i], sp, ep) == -1) {
1695 goto rollback;
1697 } else if (MD_MNSET_DESC(sd) && (oha == FALSE)) {
1699 * All nodes should be alive in non-oha mode.
1701 if (clnt_delset(node_v[i], sp, ep) == -1) {
1702 goto rollback;
1704 } else {
1706 * For traditional diskset, issue the RPC and
1707 * ignore RPC failure if in OHA mode.
1709 if (clnt_delset(node_v[i], sp, ep) == -1) {
1710 if (oha == TRUE && mdanyrpcerror(ep)) {
1711 mdclrerror(ep);
1712 continue;
1714 goto rollback;
1718 RB_TEST(18, "deletehosts", ep)
1721 metafreereplicalist(rlp);
1723 if (MD_MNSET_DESC(sd)) {
1724 /* release signals back to what they were on entry */
1725 if (procsigs(FALSE, &oldsigs, &xep) < 0)
1726 mdclrerror(&xep);
1727 } else {
1728 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1731 return (0);
1733 rollback:
1734 /* all signals already blocked for MN disket */
1735 if (!(MD_MNSET_DESC(sd))) {
1736 /* Make sure we are blocking all signals */
1737 if (procsigs(TRUE, &oldsigs, &xep) < 0)
1738 mdclrerror(&xep);
1741 max_genid = sd->sd_genid;
1743 /* level 5 */
1744 if (rb_level > 4) {
1745 recreate_set(sp, sd);
1746 max_genid++;
1749 /* level 2 */
1750 if (rb_level > 1 && dd != NULL) {
1752 * See if we have to re-add the drives specified.
1754 for (i = 0; i < node_c; i++) {
1755 md_set_record *sr;
1757 if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
1759 * During OHA mode, don't issue RPCs to
1760 * non-alive nodes since there is no reason to
1761 * wait for RPC timeouts.
1763 nd = sd->sd_nodelist;
1764 while (nd) {
1765 if (strcmp(nd->nd_nodename, node_v[i])
1766 == 0)
1767 break;
1768 nd = nd->nd_next;
1770 if (nd == NULL)
1771 continue;
1773 if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
1774 continue;
1777 /* Don't care if set record is MN or not */
1778 if (clnt_getset(node_v[i], sp->setname,
1779 MD_SET_BAD, &sr, &xep) == -1) {
1780 mdclrerror(&xep);
1781 continue;
1784 /* Drive already added, skip to next node */
1785 if (sr->sr_drivechain != NULL) {
1787 * Set record structure was allocated from RPC
1788 * routine getset so this structure is only of
1789 * size md_set_record even if the MN flag is
1790 * set. So, clear the flag so that the free
1791 * code doesn't attempt to free a structure
1792 * the size of md_mnset_record.
1794 sr->sr_flags &= ~MD_SR_MN;
1795 free_sr(sr);
1796 continue;
1799 if (clnt_adddrvs(node_v[i], sp, dd,
1800 sr->sr_ctime, sr->sr_genid, &xep) == -1)
1801 mdclrerror(&xep);
1803 if (clnt_upd_dr_flags(node_v[i], sp, dd,
1804 MD_DR_OK, &xep) == -1)
1805 mdclrerror(&xep);
1808 * Set record structure was allocated from RPC routine
1809 * getset so this structure is only of size
1810 * md_set_record even if the MN flag is set. So,
1811 * clear the flag so that the free code doesn't
1812 * attempt to free a structure the size of
1813 * md_mnset_record.
1815 sr->sr_flags &= ~MD_SR_MN;
1816 free_sr(sr);
1818 max_genid += 3;
1821 /* level 3 */
1822 if (rb_level > 2 && dd != NULL) {
1823 md_replicalist_t *rl;
1825 for (rl = rlp; rl != NULL; rl = rl->rl_next) {
1826 md_replica_t *r = rl->rl_repp;
1829 * This is not the first replica being added to the
1830 * diskset so call with ADDSIDENMS_BCAST. If this
1831 * is a traditional diskset, the bcast flag is ignored
1832 * since traditional disksets don't use the rpc.mdcommd.
1834 if (meta_db_addsidenms(sp, r->r_namep, r->r_blkno,
1835 DB_ADDSIDENMS_BCAST, &xep))
1836 mdclrerror(&xep);
1840 /* level 4 */
1841 if (rb_level > 3 && dd != NULL) {
1842 int nodeid_addsides = 0;
1844 * Add the device names for the new sides into the namespace,
1845 * on all hosts not being deleted.
1847 if (MD_MNSET_DESC(sd)) {
1848 nd = sd->sd_nodelist;
1849 while (nd) {
1850 /* Find a node that is not being deleted */
1851 if (! strinlst(nd->nd_nodename, node_c,
1852 node_v)) {
1853 nodeid_addsides = nd->nd_nodeid;
1854 break;
1856 nd = nd->nd_next;
1858 } else {
1859 for (j = 0; j < MD_MAXSIDES; j++) {
1860 /* Skip empty slots */
1861 if (sd->sd_nodes[j][0] == '\0')
1862 continue;
1864 /* Find a node that is not being deleted */
1865 if (! strinlst(sd->sd_nodes[j], node_c,
1866 node_v))
1867 break;
1869 nodeid_addsides = j;
1872 if (MD_MNSET_DESC(sd)) {
1873 nd = sd->sd_nodelist;
1874 while (nd) {
1875 /* Skip nodes not being deleted */
1876 if (!strinlst(nd->nd_nodename, node_c,
1877 node_v)) {
1878 nd = nd->nd_next;
1879 continue;
1882 /* this side was just created, add the names */
1883 if (add_md_sidenms(sp, nd->nd_nodeid,
1884 nodeid_addsides, &xep))
1885 mdclrerror(&xep);
1886 nd = nd->nd_next;
1888 } else {
1889 for (i = 0; i < MD_MAXSIDES; i++) {
1890 /* Skip empty slots */
1891 if (sd->sd_nodes[i][0] == '\0')
1892 continue;
1894 /* Skip nodes not being deleted */
1895 if (!strinlst(sd->sd_nodes[i], node_c, node_v))
1896 continue;
1898 /* this side was just created, add the names */
1899 if (add_md_sidenms(sp, i, nodeid_addsides,
1900 &xep))
1901 mdclrerror(&xep);
1906 /* level 1 */
1907 if (rb_level > 0) {
1908 max_genid++;
1909 resync_genid(sp, sd, max_genid, node_c, node_v);
1912 /* level 0 */
1913 cl_sk = cl_get_setkey(sp->setno, sp->setname);
1914 if (MD_MNSET_DESC(sd)) {
1915 nd = sd->sd_nodelist;
1916 while (nd) {
1917 if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
1918 continue;
1919 /* To balance lock/unlock; can send to dead node */
1920 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
1921 mdclrerror(&xep);
1922 nd = nd->nd_next;
1924 } else {
1925 for (i = 0; i < MD_MAXSIDES; i++) {
1926 /* Skip empty slots */
1927 if (sd->sd_nodes[i][0] == '\0')
1928 continue;
1930 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
1931 mdclrerror(&xep);
1934 cl_set_setkey(NULL);
1936 /* release signals back to what they were on entry */
1937 if (procsigs(FALSE, &oldsigs, &xep) < 0)
1938 mdclrerror(&xep);
1940 metafreereplicalist(rlp);
1942 if (!(MD_MNSET_DESC(sd))) {
1943 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1946 return (-1);
1949 static int
1950 make_sideno_sidenm(
1951 mdsetname_t *sp,
1952 mddrivename_t *dnp,
1953 side_t sideno,
1954 md_error_t *ep
1957 mdsidenames_t *sn, **sn_next;
1958 md_set_desc *sd;
1959 mdname_t *np;
1960 uint_t rep_slice;
1961 int err = 0;
1963 assert(dnp->side_names_key != MD_KEYWILD);
1965 if ((sd = metaget_setdesc(sp, ep)) == NULL)
1966 return (-1);
1968 /* find the end of the link list */
1969 for (sn = dnp->side_names; sn->next != NULL; sn = sn->next)
1971 sn_next = &sn->next;
1973 if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
1974 return (-1);
1976 if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
1977 return (-1);
1979 sn = Zalloc(sizeof (*sn));
1980 sn->sideno = sideno;
1982 if (MD_MNSET_DESC(sd)) {
1984 * For MO diskset the sideno is not an index into
1985 * the array of nodes. Hence getside_devinfo is
1986 * used instead of meta_getnextside_devinfo.
1988 if (meta_getside_devinfo(sp, np->bname, sideno, &sn->cname,
1989 &sn->dname, &sn->mnum, ep) == -1)
1990 err = -1;
1991 } else {
1992 /* decrement sideno, to look like the previous sideno */
1993 sideno--;
1994 if (meta_getnextside_devinfo(sp, np->bname, &sideno,
1995 &sn->cname, &sn->dname, &sn->mnum, ep) == -1)
1996 err = -1;
1999 if (err) {
2000 Free(sn);
2001 return (err);
2003 assert(sn->sideno == sideno);
2005 /* Add to the end of the linked list */
2006 *sn_next = sn;
2007 return (0);
2010 static int
2011 validate_nodes(
2012 mdsetname_t *sp,
2013 int node_c,
2014 char **node_v,
2015 md_error_t *ep
2018 char *hostname;
2019 int i;
2022 for (i = 0; i < node_c; i++) {
2023 if (strlen(node_v[i]) > (size_t)MD_MAX_NODENAME)
2024 return (mddserror(ep, MDE_DS_NODENAMETOOLONG,
2025 sp->setno, node_v[i], NULL, sp->setname));
2026 if (clnt_hostname(node_v[i], &hostname, ep))
2027 return (-1);
2028 if (strcmp(node_v[i], hostname) != 0) {
2029 Free(hostname);
2030 return (mddserror(ep, MDE_DS_NOTNODENAME, sp->setno,
2031 node_v[i], NULL, sp->setname));
2033 Free(hostname);
2035 return (0);
2039 * Exported Entry Points
2043 * Check the given disk set name for syntactic correctness.
2046 meta_set_checkname(char *setname, md_error_t *ep)
2048 char *cp;
2050 if (strlen(setname) > (size_t)MD_MAX_SETNAME)
2051 return (mddserror(ep, MDE_DS_SETNAMETOOLONG,
2052 MD_SET_BAD, NULL, NULL, setname));
2054 for (cp = setname; *cp; cp++)
2055 if (!isprint(*cp) || strchr(INVALID_IN_NAMES, *cp) != NULL)
2056 return (mddserror(ep, MDE_DS_INVALIDSETNAME,
2057 MD_SET_BAD, NULL, NULL, setname));
2058 return (0);
2062 * Add host(s) to the multi-node diskset provided in sp.
2063 * - create set if non-existent.
2065 static int
2066 meta_multinode_set_addhosts(
2067 mdsetname_t *sp,
2068 int multi_node,
2069 int node_c,
2070 char **node_v,
2071 int auto_take,
2072 md_error_t *ep
2075 md_set_desc *sd;
2076 md_drive_desc *dd, *p;
2077 int rval = 0;
2078 int bool;
2079 int nodeindex;
2080 int i;
2081 int has_set;
2082 sigset_t oldsigs;
2083 md_setkey_t *cl_sk;
2084 int rb_level = 0;
2085 md_error_t xep = mdnullerror;
2086 md_mnnode_desc *nd, *nd_curr, *nd_prev;
2087 md_timeval32_t now;
2088 int nodecnt;
2089 mndiskset_membershiplist_t *nl, *nl2;
2090 int suspendall_flag = 0;
2091 int suspend1_flag = 0;
2092 int lock_flag = 0;
2093 int stale_flag = 0;
2094 md_mnnode_desc *saved_nd_next;
2095 int remote_sets_created = 0;
2098 * Check membershiplist first. If there's
2099 * an error, fail to create set and pass back error.
2101 if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
2102 return (-1);
2104 /* Verify that all nodes are in member list */
2105 for (i = 0; i < node_c; i++) {
2107 * If node in list isn't a member of the membership,
2108 * just return error.
2110 if (meta_is_member(node_v[i], NULL, nl) == 0) {
2111 meta_free_nodelist(nl);
2112 return (mddserror(ep, MDE_DS_NOTINMEMBERLIST,
2113 sp->setno, node_v[i], NULL, sp->setname));
2117 * Node list is needed later, but there is a lot of error
2118 * checking and possible failures between here and there, so
2119 * just re-get the list later if there are no errors.
2121 meta_free_nodelist(nl);
2122 nl = NULL;
2125 * Verify that list of nodes being added contains no
2126 * duplicates.
2128 if (nodesuniq(sp, node_c, node_v, ep))
2129 return (-1);
2132 * Verify that each node being added thinks that its nodename
2133 * is the same as the nodename given.
2135 if (validate_nodes(sp, node_c, node_v, ep))
2136 return (-1);
2138 if ((sd = metaget_setdesc(sp, ep)) == NULL) {
2139 if (! mdiserror(ep, MDE_NO_SET))
2140 return (-1);
2141 mdclrerror(ep);
2142 return (create_set(sp, multi_node, node_c, node_v, auto_take,
2143 ep));
2144 } else {
2146 * If this node and another node were both attempting to
2147 * create the same setname at the same time, and the other
2148 * node has just created the set on this node then sd would
2149 * be non-NULL, but sp->setno would be null (setno is filled
2150 * in by the create_set). If this is true, then fail since
2151 * the other node has already won this race.
2153 if (sp->setno == NULL) {
2154 return (mddserror(ep, MDE_DS_NODEINSET,
2155 NULL, mynode(), NULL, sp->setname));
2159 /* The auto_take behavior is inconsistent with multiple hosts. */
2160 if (auto_take || sd->sd_flags & MD_SR_AUTO_TAKE) {
2161 (void) mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, NULL,
2162 sp->setname);
2163 return (-1);
2167 * We already have the set.
2170 /* Make sure we own the set */
2171 if (meta_check_ownership(sp, ep) != 0)
2172 return (-1);
2175 * The drive and node records are stored in the local mddbs of each
2176 * node in the diskset. Each node's rpc.metad daemon reads in the set,
2177 * drive and node records from that node's local mddb and caches them
2178 * internally. Any process needing diskset information contacts its
2179 * local rpc.metad to get this information. Since each node in the
2180 * diskset is independently reading the set information from its local
2181 * mddb, the set, drive and node records in the local mddbs must stay
2182 * in-sync, so that all nodes have a consistent view of the diskset.
2184 * For a multinode diskset, explicitly verify that all nodes in the
2185 * diskset are ALIVE (i.e. are in the API membership list). Otherwise,
2186 * fail this operation since all nodes must be ALIVE in order to add
2187 * the new node record to their local mddb. If a panic of this node
2188 * leaves the local mddbs set, node and drive records out-of-sync, the
2189 * reconfig cycle will fix the local mddbs and force them back into
2190 * synchronization.
2192 nd = sd->sd_nodelist;
2193 while (nd) {
2194 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
2195 return (mddserror(ep, MDE_DS_NOTINMEMBERLIST,
2196 sp->setno, nd->nd_nodename, NULL,
2197 sp->setname));
2199 nd = nd->nd_next;
2203 * Check if node is already in set.
2205 for (i = 0; i < node_c; i++) {
2206 /* Is node already in set? */
2207 nd = sd->sd_nodelist;
2208 while (nd) {
2209 if (strcmp(nd->nd_nodename, node_v[i]) == 0)
2210 break;
2211 nd = nd->nd_next;
2213 if (nd) {
2214 return (mddserror(ep, MDE_DS_NODEINSET,
2215 sp->setno, node_v[i], NULL,
2216 sp->setname));
2221 * Lock the set on current set members.
2222 * Set locking done much earlier for MN diskset than for traditional
2223 * diskset since lock_set and SUSPEND are used to protect against
2224 * other meta* commands running on the other nodes.
2226 /* Make sure we are blocking all signals */
2227 if (procsigs(TRUE, &oldsigs, &xep) < 0)
2228 mdclrerror(&xep);
2230 nd = sd->sd_nodelist;
2231 /* All nodes are guaranteed to be ALIVE */
2232 while (nd) {
2233 if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
2234 rval = -1;
2235 goto out;
2237 lock_flag = 1;
2238 nd = nd->nd_next;
2241 * Lock out other meta* commands by suspending
2242 * class 1 messages across the diskset.
2244 nd = sd->sd_nodelist;
2245 /* Send suspend to nodes in nodelist before addhosts call */
2246 /* All nodes are guaranteed to be ALIVE */
2247 while (nd) {
2248 if (clnt_mdcommdctl(nd->nd_nodename,
2249 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
2250 MD_MSCF_NO_FLAGS, ep)) {
2251 rval = -1;
2252 goto out;
2254 suspend1_flag = 1;
2255 nd = nd->nd_next;
2258 /* Lock the set on new set members */
2259 for (i = 0; i < node_c; i++) {
2260 /* Already verified to be alive */
2261 if (clnt_lock_set(node_v[i], sp, ep)) {
2262 rval = -1;
2263 goto out;
2265 lock_flag = 1;
2269 * Perform the required checks for new hosts
2271 for (i = 0; i < node_c; i++) {
2272 /* Make sure this set name is not used on the other hosts */
2273 has_set = nodehasset(sp, node_v[i], NHS_N_EQ, ep);
2274 if (has_set < 0) {
2275 if (! mdiserror(ep, MDE_NO_SET)) {
2276 rval = -1;
2277 goto out;
2279 /* Keep on truck'n */
2280 mdclrerror(ep);
2281 } else if (has_set) {
2282 (void) mddserror(ep, MDE_DS_NODEHASSET, sp->setno,
2283 node_v[i], NULL, sp->setname);
2284 rval = -1;
2285 goto out;
2288 if (clnt_setnumbusy(node_v[i], sp->setno, &bool, ep) == -1) {
2289 rval = -1;
2290 goto out;
2293 if (bool == TRUE) {
2294 (void) mddserror(ep, MDE_DS_SETNUMBUSY, sp->setno,
2295 node_v[i], NULL, sp->setname);
2296 rval = -1;
2297 goto out;
2300 if (clnt_setnameok(node_v[i], sp, &bool, ep) == -1) {
2301 rval = -1;
2302 goto out;
2305 if (bool == FALSE) {
2306 (void) mddserror(ep, MDE_DS_SETNAMEBUSY, sp->setno,
2307 node_v[i], NULL, sp->setname);
2308 rval = -1;
2309 goto out;
2312 if (check_setdrvs_againstnode(sp, node_v[i], ep)) {
2313 rval = -1;
2314 goto out;
2318 /* Get drive descriptors for the set */
2319 if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL) {
2320 if (! mdisok(ep)) {
2321 rval = -1;
2322 goto out;
2326 /* END CHECK CODE */
2328 RB_TEST(1, "addhosts", ep)
2330 RB_PREEMPT;
2331 rb_level = 1; /* level 1 */
2333 RB_TEST(2, "addhosts", ep)
2336 * Create the set where needed
2338 if (create_set_on_hosts(sp, multi_node, node_c, node_v, 0, ep)) {
2339 goto rollback;
2343 * Send suspend to rpc.mdcommd on nodes where a set has been
2344 * created since rpc.mdcommd must now be running on the remote nodes.
2346 remote_sets_created = 1;
2347 for (i = 0; i < node_c; i++) {
2349 * Lock out other meta* commands by suspending
2350 * class 1 messages across the diskset.
2352 if (clnt_mdcommdctl(node_v[i],
2353 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
2354 MD_MSCF_NO_FLAGS, ep)) {
2355 rval = -1;
2356 goto rollback;
2361 * Merge the new entries into the set with the existing sides.
2362 * Get membershiplist from API routine. If there's
2363 * an error, fail to create set and pass back error.
2365 if (meta_read_nodelist(&nodecnt, &nl, ep) == -1) {
2366 goto rollback;
2368 if (meta_gettimeofday(&now) == -1) {
2369 meta_free_nodelist(nl);
2370 (void) mdsyserror(ep, errno,
2371 dgettext(TEXT_DOMAIN, "meta_gettimeofday()"));
2372 goto rollback;
2374 for (nodeindex = 0; nodeindex < node_c; nodeindex++) {
2375 nd = Zalloc(sizeof (*nd));
2376 (void) strcpy(nd->nd_nodename, node_v[nodeindex]);
2377 nd->nd_ctime = now;
2378 nl2 = nl;
2379 while (nl2) {
2380 if (strcmp(nl2->msl_node_name,
2381 node_v[nodeindex]) == 0) {
2382 nd->nd_nodeid = nl2->msl_node_id;
2383 (void) strcpy(nd->nd_priv_ic,
2384 nl2->msl_node_addr);
2385 break;
2387 nl2 = nl2->next;
2391 * Nodelist must be kept in ascending nodeid order.
2393 if (sd->sd_nodelist == NULL) {
2394 /* Nothing in list, just add it */
2395 sd->sd_nodelist = nd;
2396 } else if (nd->nd_nodeid <
2397 sd->sd_nodelist->nd_nodeid) {
2398 /* Add to head of list */
2399 nd->nd_next = sd->sd_nodelist;
2400 sd->sd_nodelist = nd;
2401 } else {
2402 nd_curr = sd->sd_nodelist->nd_next;
2403 nd_prev = sd->sd_nodelist;
2404 /* Search for place to add it */
2405 while (nd_curr) {
2406 if (nd->nd_nodeid < nd_curr->nd_nodeid) {
2407 /* Add before nd_curr */
2408 nd->nd_next = nd_curr;
2409 nd_prev->nd_next = nd;
2410 break;
2412 nd_prev = nd_curr;
2413 nd_curr = nd_curr->nd_next;
2415 /* Add to end of list */
2416 if (nd_curr == NULL) {
2417 nd_prev->nd_next = nd;
2421 /* Node already verified to be in membership */
2422 nd->nd_flags |= MD_MN_NODE_ALIVE;
2424 meta_free_nodelist(nl);
2426 /* If we have drives */
2427 if (dd != NULL) {
2429 * For all the hosts being added, create a sidename structure
2431 nd = sd->sd_nodelist;
2432 while (nd) {
2433 /* Skip nodes not being added */
2434 if (!strinlst(nd->nd_nodename, node_c, node_v)) {
2435 nd = nd->nd_next;
2436 continue;
2438 for (p = dd; p != NULL; p = p->dd_next) {
2439 if (make_sideno_sidenm(sp, p->dd_dnp,
2440 nd->nd_nodeid, ep) != 0)
2441 goto rollback;
2443 nd = nd->nd_next;
2446 RB_PREEMPT;
2447 rb_level = 2; /* level 2 */
2449 RB_TEST(4, "addhosts", ep)
2452 * Add the new sidename for each drive to all the hosts
2454 * If a multi-node diskset, each host only stores
2455 * the side information for itself. So, only send
2456 * side information to the new hosts where each host
2457 * will add the appropriate side information to its
2458 * local mddb.
2460 nd = sd->sd_nodelist;
2461 while (nd) {
2462 /* Skip nodes not being added */
2463 if (!strinlst(nd->nd_nodename, node_c,
2464 node_v)) {
2465 nd = nd->nd_next;
2466 continue;
2469 /* Add side info to new hosts */
2470 if (clnt_add_drv_sidenms(nd->nd_nodename,
2471 mynode(), sp, sd, node_c, node_v, ep))
2472 goto rollback;
2474 nd = nd->nd_next;
2477 RB_TEST(5, "addhosts", ep)
2479 RB_PREEMPT;
2480 rb_level = 3; /* level 3 */
2482 RB_TEST(6, "addhosts", ep)
2485 * Add the device names for the new sides into the namespace
2486 * for all hosts being added. This is adding the side
2487 * names to the diskset's mddb so add sidenames for all
2488 * of the new hosts.
2490 nd = sd->sd_nodelist;
2491 while (nd) {
2492 /* Skip nodes not being added */
2493 if (!strinlst(nd->nd_nodename, node_c, node_v)) {
2494 nd = nd->nd_next;
2495 continue;
2498 /* this side was just created, add the names */
2499 if (add_md_sidenms(sp, nd->nd_nodeid,
2500 MD_SIDEWILD, ep))
2501 goto rollback;
2503 nd = nd->nd_next;
2506 RB_TEST(7, "addhosts", ep)
2508 RB_PREEMPT;
2509 rb_level = 4; /* level 4 */
2511 RB_TEST(8, "addhosts", ep)
2513 if (add_db_sidenms(sp, ep))
2514 goto rollback;
2516 } else {
2517 RB_PREEMPT;
2518 rb_level = 4;
2521 RB_TEST(9, "addhosts", ep)
2523 RB_PREEMPT;
2524 rb_level = 5; /* level 5 */
2526 RB_TEST(10, "addhosts", ep)
2528 if (dd != NULL) {
2530 * Notify rpc.mdcommd on all nodes of a nodelist change.
2531 * Start by suspending rpc.mdcommd (which drains it of all
2532 * messages), then change the nodelist followed by a reinit
2533 * and resume.
2535 nd = sd->sd_nodelist;
2536 /* Send suspend_all to nodes in nodelist (existing + new) */
2537 /* All nodes are guaranteed to be ALIVE */
2538 while (nd) {
2539 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_SUSPEND,
2540 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, ep)) {
2541 rval = -1;
2542 goto rollback;
2544 suspendall_flag = 1;
2545 nd = nd->nd_next;
2549 /* Add the node(s) to each host that is currently in the set */
2550 nd = sd->sd_nodelist;
2551 /* All nodes are guaranteed to be ALIVE */
2552 while (nd) {
2553 if (clnt_addhosts(nd->nd_nodename, sp, node_c, node_v, ep)) {
2554 goto rollback;
2556 nd = nd->nd_next;
2559 RB_TEST(11, "addhosts", ep)
2561 if (dd != NULL) {
2563 * Mark the drives MD_DR_OK.
2565 nd = sd->sd_nodelist;
2566 /* All nodes are guaranteed to be ALIVE */
2567 while (nd) {
2568 if (clnt_upd_dr_flags(nd->nd_nodename, sp, dd,
2569 MD_DR_OK, ep) == -1)
2570 goto rollback;
2571 nd = nd->nd_next;
2575 RB_TEST(12, "addhosts", ep)
2577 RB_PREEMPT;
2578 rb_level = 6; /* level 6 */
2580 RB_TEST(13, "addhosts", ep)
2583 /* Add the mediator information to all hosts in the set. */
2584 nd = sd->sd_nodelist;
2585 /* All nodes are guaranteed to be ALIVE */
2586 while (nd) {
2587 if (clnt_updmeds(nd->nd_nodename, sp, &sd->sd_med, ep))
2588 goto rollback;
2589 nd = nd->nd_next;
2592 RB_TEST(14, "addhosts", ep)
2595 * If a MN diskset and there are drives in the set,
2596 * set the master on the new nodes and
2597 * automatically join the new nodes into the set.
2599 if (dd != NULL) {
2600 mddb_config_t c;
2602 * Is current set STALE?
2604 (void) memset(&c, 0, sizeof (c));
2605 c.c_id = 0;
2606 c.c_setno = sp->setno;
2607 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
2608 (void) mdstealerror(ep, &c.c_mde);
2609 rval = -1;
2610 goto out;
2612 if (c.c_flags & MDDB_C_STALE) {
2613 stale_flag = MNSET_IS_STALE;
2616 /* Set master on newly added nodes */
2617 for (i = 0; i < node_c; i++) {
2618 if (clnt_mnsetmaster(node_v[i], sp,
2619 sd->sd_mn_master_nodenm,
2620 sd->sd_mn_master_nodeid, ep)) {
2621 goto rollback;
2624 /* Join newly added nodes to diskset and set OWN flag */
2625 for (i = 0; i < node_c; i++) {
2626 if (clnt_joinset(node_v[i], sp, stale_flag, ep))
2627 goto rollback;
2628 nd = sd->sd_nodelist;
2629 while (nd) {
2630 if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
2631 nd->nd_flags |= MD_MN_NODE_OWN;
2633 * Also set ADD flag since this flag
2634 * is already set in rpc.metad - it's
2635 * just not in the local copy.
2636 * Could flush local cache and call
2637 * metaget_setdesc, but this just
2638 * adds time. Since this node knows
2639 * the state of the node flags in
2640 * rpc.metad, just set the ADD
2641 * flag and save time.
2643 nd->nd_flags |= MD_MN_NODE_ADD;
2644 break;
2646 nd = nd->nd_next;
2650 /* Send new node flag list to all Owner nodes */
2651 nd = sd->sd_nodelist;
2652 while (nd) {
2653 if (!(nd->nd_flags & MD_MN_NODE_OWN)) {
2654 nd = nd->nd_next;
2655 continue;
2658 * Will effectively set OWN flag in records kept
2659 * cached in rpc.metad. The ADD flag would have
2660 * already been set by the call to clnt_addhosts.
2662 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2663 sd->sd_nodelist, MD_NR_SET, NULL, ep)) {
2664 goto rollback;
2666 nd = nd->nd_next;
2671 * Mark the set record MD_SR_OK
2673 nd = sd->sd_nodelist;
2674 /* All nodes are guaranteed to be ALIVE */
2675 while (nd) {
2676 if (clnt_upd_sr_flags(nd->nd_nodename, sp, MD_SR_OK,
2677 ep)) {
2678 goto rollback;
2680 nd = nd->nd_next;
2684 * For MN diskset:
2685 * On each newly added node, set the node record for that node
2686 * to OK. Then set all node records for the newly added
2687 * nodes on all nodes to ok.
2689 * By setting a node's own node record to ok first, even if
2690 * the node adding the hosts panics, the rest of the nodes can
2691 * determine the same node list during the choosing of the master
2692 * during reconfig. So, only nodes considered for mastership
2693 * are nodes that have both MD_MN_NODE_OK and MD_SR_OK set
2694 * on that node's rpc.metad. If all nodes have MD_SR_OK set,
2695 * but no node has its own MD_MN_NODE_OK set, then the set will
2696 * be removed during reconfig since a panic occurred during the
2697 * creation of the initial diskset.
2700 for (i = 0; i < node_c; i++) {
2701 nd = sd->sd_nodelist;
2702 /* All nodes are guaranteed to be ALIVE */
2703 while (nd) {
2704 if (strcmp(nd->nd_nodename, node_v[i]) == 0)
2705 break;
2706 nd = nd->nd_next;
2708 /* Something wrong, will pick this up in next loop */
2709 if (nd == NULL)
2710 continue;
2712 /* Only changing my local cache of node list */
2713 saved_nd_next = nd->nd_next;
2714 nd->nd_next = NULL;
2716 /* Set node record for added host to ok on that host */
2717 if (clnt_upd_nr_flags(node_v[i], sp,
2718 nd, MD_NR_OK, NULL, ep)) {
2719 nd->nd_next = saved_nd_next;
2720 goto rollback;
2722 nd->nd_next = saved_nd_next;
2725 /* Now set all node records on all nodes to be ok */
2726 nd = sd->sd_nodelist;
2727 /* All nodes are guaranteed to be ALIVE */
2728 while (nd) {
2729 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2730 sd->sd_nodelist, MD_NR_OK, NULL, ep)) {
2731 goto rollback;
2733 nd = nd->nd_next;
2736 RB_TEST(15, "addhosts", ep)
2737 out:
2739 * Notify rpc.mdcommd on all nodes of a nodelist change.
2740 * Send reinit command to mdcommd which forces it to get
2741 * fresh set description. Then send resume.
2742 * Resume on class 0 will resume all classes, so can skip
2743 * doing an explicit resume of class1 (ignore suspend1_flag).
2745 if (suspendall_flag) {
2747 * Don't know if nodelist contains the nodes being added
2748 * or not, so do reinit to nodes not being added (by skipping
2749 * any nodes in the nodelist being added) and then do
2750 * reinit to nodes being added if remote_sets_created is 1.
2752 nd = sd->sd_nodelist;
2753 /* All nodes are guaranteed to be ALIVE */
2754 while (nd) {
2755 /* Skip nodes being added - handled later */
2756 if (strinlst(nd->nd_nodename, node_c, node_v)) {
2757 nd = nd->nd_next;
2758 continue;
2760 /* Class is ignored for REINIT */
2761 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
2762 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2763 if (rval == 0)
2764 (void) mdstealerror(ep, &xep);
2765 rval = -1;
2766 mde_perror(ep, dgettext(TEXT_DOMAIN,
2767 "Unable to reinit rpc.mdcommd.\n"));
2769 nd = nd->nd_next;
2772 * Send reinit to added nodes that had a set created since
2773 * rpc.mdcommd is running on the nodes with a set.
2775 if (remote_sets_created == 1) {
2776 for (i = 0; i < node_c; i++) {
2777 if (clnt_mdcommdctl(node_v[i], COMMDCTL_REINIT,
2778 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
2779 if (rval == 0)
2780 (void) mdstealerror(ep, &xep);
2781 rval = -1;
2782 mde_perror(ep, dgettext(TEXT_DOMAIN,
2783 "Unable to reinit rpc.mdcommd.\n"));
2788 if ((suspend1_flag) || (suspendall_flag)) {
2790 * Unlock diskset by resuming messages across the diskset.
2791 * Just resume all classes so that resume is the same whether
2792 * just one class was locked or all classes were locked.
2794 * Don't know if nodelist contains the nodes being added
2795 * or not, so do resume_all to nodes not being added (by
2796 * skipping any nodes in the nodelist being added) and then do
2797 * resume_all to nodes being added if remote_sets_created is 1.
2799 nd = sd->sd_nodelist;
2800 /* All nodes are guaranteed to be ALIVE */
2801 while (nd) {
2802 /* Skip nodes being added - handled later */
2803 if (strinlst(nd->nd_nodename, node_c, node_v)) {
2804 nd = nd->nd_next;
2805 continue;
2807 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
2808 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
2809 if (rval == 0)
2810 (void) mdstealerror(ep, &xep);
2811 rval = -1;
2812 mde_perror(ep, dgettext(TEXT_DOMAIN,
2813 "Unable to resume rpc.mdcommd.\n"));
2815 nd = nd->nd_next;
2818 * Send resume to added nodes that had a set created since
2819 * rpc.mdcommd is running on the nodes with a set.
2821 if (remote_sets_created == 1) {
2822 for (i = 0; i < node_c; i++) {
2823 /* Already verified to be alive */
2824 if (clnt_mdcommdctl(node_v[i], COMMDCTL_RESUME,
2825 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS,
2826 &xep)) {
2827 if (rval == 0)
2828 (void) mdstealerror(ep, &xep);
2829 rval = -1;
2830 mde_perror(ep, dgettext(TEXT_DOMAIN,
2831 "Unable to resume rpc.mdcommd.\n"));
2835 meta_ping_mnset(sp->setno);
2837 * Start a resync thread on the newly added nodes
2838 * if set is not stale. Also start a thread to update the
2839 * abr state of all soft partitions
2841 if (stale_flag != MNSET_IS_STALE) {
2842 for (i = 0; i < node_c; i++) {
2843 if (clnt_mn_mirror_resync_all(node_v[i],
2844 sp->setno, &xep)) {
2845 if (rval == 0)
2846 (void) mdstealerror(ep, &xep);
2847 rval = -1;
2848 mde_perror(ep, dgettext(TEXT_DOMAIN,
2849 "Unable to start resync "
2850 "thread.\n"));
2852 if (clnt_mn_sp_update_abr(node_v[i],
2853 sp->setno, &xep)) {
2854 if (rval == 0)
2855 (void) mdstealerror(ep, &xep);
2856 rval = -1;
2857 mde_perror(ep, dgettext(TEXT_DOMAIN,
2858 "Unable to start sp update "
2859 "thread.\n"));
2864 cl_sk = cl_get_setkey(sp->setno, sp->setname);
2866 * Don't know if nodelist contains the nodes being added
2867 * or not, so do clnt_unlock_set to nodes not being added (by
2868 * skipping any nodes in the nodelist being added) and then do
2869 * clnt_unlock_set to nodes being added.
2871 if (lock_flag) {
2872 nd = sd->sd_nodelist;
2873 /* All nodes are guaranteed to be ALIVE */
2874 while (nd) {
2875 /* Skip hosts we get in the next loop */
2876 if (strinlst(nd->nd_nodename, node_c, node_v)) {
2877 nd = nd->nd_next;
2878 continue;
2880 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
2881 if (rval == 0)
2882 (void) mdstealerror(ep, &xep);
2883 rval = -1;
2885 nd = nd->nd_next;
2887 for (i = 0; i < node_c; i++) {
2888 /* Already verified to be alive */
2889 if (clnt_unlock_set(node_v[i], cl_sk, &xep)) {
2890 if (rval == 0)
2891 (void) mdstealerror(ep, &xep);
2892 rval = -1;
2896 cl_set_setkey(NULL);
2898 metaflushsetname(sp);
2900 /* release signals back to what they were on entry */
2901 if (procsigs(FALSE, &oldsigs, &xep) < 0)
2902 mdclrerror(&xep);
2904 return (rval);
2906 rollback:
2907 rval = -1;
2909 /* level 6 */
2910 if (rb_level > 5) {
2912 * For each node being deleted, set DEL flag and
2913 * reset OK flag on that node first.
2914 * Until a node has turned off its own
2915 * rpc.metad's NODE_OK flag, that node could be
2916 * considered for master during a reconfig.
2918 for (i = 0; i < node_c; i++) {
2919 nd = sd->sd_nodelist;
2920 /* All nodes are guaranteed to be ALIVE */
2921 while (nd) {
2922 if (strcmp(nd->nd_nodename, node_v[i]) == 0)
2923 break;
2924 nd = nd->nd_next;
2926 /* Something wrong, handle this in next loop */
2927 if (nd == NULL)
2928 continue;
2930 /* Only changing my local cache of node list */
2931 saved_nd_next = nd->nd_next;
2932 nd->nd_next = NULL;
2934 /* Set flags for del host to DEL on that host */
2935 if (clnt_upd_nr_flags(node_v[i], sp,
2936 nd, MD_NR_DEL, NULL, &xep)) {
2937 mdclrerror(&xep);
2939 nd->nd_next = saved_nd_next;
2942 for (i = 0; i < node_c; i++) {
2943 if (dd != NULL) {
2944 /* Reset master on newly added node */
2945 if (clnt_mnsetmaster(node_v[i], sp, "",
2946 MD_MN_INVALID_NID, &xep))
2947 mdclrerror(&xep);
2948 /* Withdraw set on newly added node */
2949 if (clnt_withdrawset(node_v[i], sp, &xep))
2950 mdclrerror(&xep);
2953 * Turn off owner flag in nodes to be deleted
2954 * if there are drives in the set.
2955 * Also, turn off NODE_OK and turn on NODE_DEL
2956 * for nodes to be deleted.
2957 * These flags are used to set the node
2958 * record flags in all nodes in the set.
2960 nd = sd->sd_nodelist;
2961 while (nd) {
2962 if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
2963 if (dd != NULL) {
2964 nd->nd_flags &= ~MD_MN_NODE_OWN;
2966 nd->nd_flags |= MD_MN_NODE_DEL;
2967 nd->nd_flags &= ~MD_MN_NODE_OK;
2968 break;
2970 nd = nd->nd_next;
2975 * Now, reset owner and set delete flags for the deleted
2976 * nodes on all nodes.
2978 nd = sd->sd_nodelist;
2979 while (nd) {
2980 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
2981 sd->sd_nodelist, MD_NR_SET, NULL, &xep)) {
2982 mdclrerror(&xep);
2984 nd = nd->nd_next;
2988 * On each node being deleted, set the set record
2989 * to be in DEL state.
2991 for (i = 0; i < node_c; i++) {
2992 if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, &xep)) {
2993 mdclrerror(&xep);
2998 /* level 5 */
2999 if (rb_level > 4) {
3000 nd = sd->sd_nodelist;
3001 /* All nodes are guaranteed to be ALIVE */
3002 while (nd) {
3003 if (clnt_delhosts(nd->nd_nodename, sp, node_c,
3004 node_v, &xep) == -1)
3005 mdclrerror(&xep);
3006 nd = nd->nd_next;
3011 * Notify rpc.mdcommd on all nodes of a nodelist change.
3012 * Send reinit command to mdcommd which forces it to get
3013 * fresh set description. Then send resume.
3014 * Nodelist contains all nodes (existing + added).
3016 if (suspendall_flag) {
3017 /* Send reinit */
3018 nd = sd->sd_nodelist;
3019 /* All nodes are guaranteed to be ALIVE */
3020 /* Send reinit to nodes in nodelist before addhosts call */
3021 while (nd) {
3023 * Skip nodes being added if remote sets were not
3024 * created since rpc.mdcommd may not be running
3025 * on the remote nodes.
3027 if ((remote_sets_created == 0) &&
3028 (strinlst(nd->nd_nodename, node_c, node_v))) {
3029 nd = nd->nd_next;
3030 continue;
3032 /* Class is ignored for REINIT */
3033 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
3034 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
3035 mde_perror(&xep, dgettext(TEXT_DOMAIN,
3036 "Unable to reinit rpc.mdcommd.\n"));
3037 mdclrerror(&xep);
3039 nd = nd->nd_next;
3042 /* Send resume */
3043 nd = sd->sd_nodelist;
3044 /* All nodes are guaranteed to be ALIVE */
3045 while (nd) {
3047 * Skip nodes being added if remote sets were not
3048 * created since rpc.mdcommd may not be running
3049 * on the remote nodes.
3051 if ((remote_sets_created == 0) &&
3052 (strinlst(nd->nd_nodename, node_c, node_v))) {
3053 nd = nd->nd_next;
3054 continue;
3057 * Resume all classes but class 1 so that lock is held
3058 * against meta* commands.
3059 * Send resume_all_but_1 to nodes in nodelist
3060 * before addhosts call.
3062 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
3063 sp, MD_MSG_CLASS0, MD_MSCF_DONT_RESUME_CLASS1,
3064 &xep)) {
3065 mde_perror(&xep, dgettext(TEXT_DOMAIN,
3066 "Unable to resume rpc.mdcommd.\n"));
3067 mdclrerror(&xep);
3069 nd = nd->nd_next;
3071 meta_ping_mnset(sp->setno);
3074 /* level 4 */
3075 /* Nodelist may or may not contain nodes being added. */
3076 if (rb_level > 3 && dd != NULL) {
3077 nd = sd->sd_nodelist;
3078 while (nd) {
3079 /* Skip nodes not being added */
3080 if (!strinlst(nd->nd_nodename, node_c, node_v)) {
3081 nd = nd->nd_next;
3082 continue;
3085 if (del_db_sidenms(sp, nd->nd_nodeid, &xep))
3086 mdclrerror(&xep);
3087 nd = nd->nd_next;
3091 /* level 3 */
3092 /* Nodelist may or may not contain nodes being added. */
3093 if (rb_level > 2 && dd != NULL) {
3094 nd = sd->sd_nodelist;
3095 while (nd) {
3096 /* Skip nodes not being added */
3097 if (!strinlst(nd->nd_nodename, node_c, node_v)) {
3098 nd = nd->nd_next;
3099 continue;
3102 if (del_md_sidenms(sp, nd->nd_nodeid, &xep))
3103 mdclrerror(&xep);
3104 nd = nd->nd_next;
3108 /* level 1 */
3109 if (rb_level > 0) {
3110 if (dd != NULL) {
3111 /* delete the drive records */
3112 for (i = 0; i < node_c; i++) {
3113 if (clnt_deldrvs(node_v[i], sp, dd, &xep) == -1)
3114 mdclrerror(&xep);
3118 /* delete the set record */
3119 for (i = 0; i < node_c; i++) {
3120 if (clnt_delset(node_v[i], sp, &xep) == -1)
3121 mdclrerror(&xep);
3125 /* level 0 */
3126 cl_sk = cl_get_setkey(sp->setno, sp->setname);
3127 /* Don't test lock flag since guaranteed to be set if in rollback */
3128 /* Nodelist may or may not contain nodes being added. */
3130 * Unlock diskset by resuming messages across the diskset.
3131 * Just resume all classes so that resume is the same whether
3132 * just one class was locked or all classes were locked.
3134 if ((suspend1_flag) || (suspendall_flag)) {
3135 /* All nodes are guaranteed to be ALIVE */
3136 nd = sd->sd_nodelist;
3137 while (nd) {
3139 * Skip nodes being added since remote sets
3140 * were either created and then deleted or
3141 * were never created. Either way - rpc.mdcommd
3142 * may not be running on the remote node.
3144 if (strinlst(nd->nd_nodename, node_c, node_v)) {
3145 nd = nd->nd_next;
3146 continue;
3148 if (clnt_mdcommdctl(nd->nd_nodename,
3149 COMMDCTL_RESUME, sp, MD_MSG_CLASS0,
3150 MD_MSCF_NO_FLAGS, &xep)) {
3151 mde_perror(&xep, dgettext(TEXT_DOMAIN,
3152 "Unable to resume rpc.mdcommd.\n"));
3153 mdclrerror(&xep);
3155 nd = nd->nd_next;
3157 meta_ping_mnset(sp->setno);
3159 nd = sd->sd_nodelist;
3160 /* All nodes are guaranteed to be ALIVE */
3161 while (nd) {
3162 /* Skip hosts we get in the next loop */
3163 if (strinlst(nd->nd_nodename, node_c, node_v)) {
3164 nd = nd->nd_next;
3165 continue;
3168 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
3169 mdclrerror(&xep);
3170 nd = nd->nd_next;
3173 for (i = 0; i < node_c; i++)
3174 if (clnt_unlock_set(node_v[i], cl_sk, &xep))
3175 mdclrerror(&xep);
3176 cl_set_setkey(NULL);
3178 /* release signals back to what they were on entry */
3179 if (procsigs(FALSE, &oldsigs, &xep) < 0)
3180 mdclrerror(&xep);
3182 metaflushsetname(sp);
3184 return (rval);
3188 * Add host(s) to the traditional diskset provided in sp.
3189 * - create set if non-existent.
3191 static int
3192 meta_traditional_set_addhosts(
3193 mdsetname_t *sp,
3194 int multi_node,
3195 int node_c,
3196 char **node_v,
3197 int auto_take,
3198 md_error_t *ep
3201 md_set_desc *sd;
3202 md_drive_desc *dd, *p;
3203 med_rec_t medr;
3204 med_rec_t rb_medr;
3205 int rval = 0;
3206 int bool;
3207 int nodeindex;
3208 int i;
3209 int has_set;
3210 int numsides;
3211 sigset_t oldsigs;
3212 md_setkey_t *cl_sk;
3213 int rb_level = 0;
3214 md_error_t xep = mdnullerror;
3215 int max_meds;
3217 if (nodesuniq(sp, node_c, node_v, ep))
3218 return (-1);
3220 if (validate_nodes(sp, node_c, node_v, ep))
3221 return (-1);
3223 if ((sd = metaget_setdesc(sp, ep)) == NULL) {
3224 if (! mdiserror(ep, MDE_NO_SET))
3225 return (-1);
3226 mdclrerror(ep);
3227 return (create_set(sp, multi_node, node_c, node_v, auto_take,
3228 ep));
3231 /* The auto_take behavior is inconsistent with multiple hosts. */
3232 if (auto_take || sd->sd_flags & MD_SR_AUTO_TAKE) {
3233 (void) mddserror(ep, MDE_DS_SINGLEHOST, sp->setno, NULL, NULL,
3234 sp->setname);
3235 return (-1);
3239 * We already have the set.
3242 /* Make sure we own the set */
3243 if (meta_check_ownership(sp, ep) != 0)
3244 return (-1);
3247 * Perform the required checks for new hosts
3249 for (i = 0; i < node_c; i++) {
3250 if (getnodeside(node_v[i], sd) != MD_SIDEWILD)
3251 return (mddserror(ep, MDE_DS_NODEINSET, sp->setno,
3252 node_v[i], NULL, sp->setname));
3254 /* Make sure this set name is not used on the other hosts */
3255 has_set = nodehasset(sp, node_v[i], NHS_N_EQ, ep);
3256 if (has_set < 0) {
3257 if (! mdiserror(ep, MDE_NO_SET))
3258 return (-1);
3259 /* Keep on truck'n */
3260 mdclrerror(ep);
3261 } else if (has_set)
3262 return (mddserror(ep, MDE_DS_NODEHASSET, sp->setno,
3263 node_v[i], NULL, sp->setname));
3265 if (clnt_setnumbusy(node_v[i], sp->setno, &bool, ep) == -1)
3266 return (-1);
3268 if (bool == TRUE)
3269 return (mddserror(ep, MDE_DS_SETNUMBUSY, sp->setno,
3270 node_v[i], NULL, sp->setname));
3272 if (clnt_setnameok(node_v[i], sp, &bool, ep) == -1)
3273 return (-1);
3275 if (bool == FALSE)
3276 return (mddserror(ep, MDE_DS_SETNAMEBUSY, sp->setno,
3277 node_v[i], NULL, sp->setname));
3279 if (check_setdrvs_againstnode(sp, node_v[i], ep))
3280 return (-1);
3283 /* Count the number of occupied slots */
3284 numsides = 0;
3285 for (i = 0; i < MD_MAXSIDES; i++) {
3286 /* Count occupied slots */
3287 if (sd->sd_nodes[i][0] != '\0')
3288 numsides++;
3291 /* Make sure the we have space to add the new sides */
3292 if ((numsides + node_c) > MD_MAXSIDES) {
3293 (void) mddserror(ep, MDE_DS_SIDENUMNOTAVAIL, sp->setno, NULL,
3294 NULL, sp->setname);
3295 return (-1);
3298 /* Get drive descriptors for the set */
3299 if ((dd = metaget_drivedesc(sp, MD_FULLNAME_ONLY, ep)) == NULL)
3300 if (! mdisok(ep))
3301 return (-1);
3303 /* Setup the mediator record roll-back structure */
3304 (void) memset(&rb_medr, '\0', sizeof (med_rec_t));
3305 rb_medr.med_rec_mag = MED_REC_MAGIC;
3306 rb_medr.med_rec_rev = MED_REC_REV;
3307 rb_medr.med_rec_fl = 0;
3308 rb_medr.med_rec_sn = sp->setno;
3309 (void) strcpy(rb_medr.med_rec_snm, sp->setname);
3310 for (i = 0; i < MD_MAXSIDES; i++)
3311 (void) strcpy(rb_medr.med_rec_nodes[i], sd->sd_nodes[i]);
3312 rb_medr.med_rec_meds = sd->sd_med; /* structure assigment */
3313 (void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t));
3314 rb_medr.med_rec_foff = 0;
3315 crcgen(&rb_medr, &rb_medr.med_rec_cks, sizeof (med_rec_t), NULL);
3317 if ((max_meds = get_max_meds(ep)) == 0)
3318 return (-1);
3320 /* END CHECK CODE */
3322 md_rb_sig_handling_on();
3324 /* Lock the set on current set members */
3325 for (i = 0; i < MD_MAXSIDES; i++) {
3326 /* Skip empty slots */
3327 if (sd->sd_nodes[i][0] == '\0')
3328 continue;
3330 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
3331 rval = -1;
3332 goto out;
3336 /* Lock the set on new set members */
3337 for (i = 0; i < node_c; i++) {
3338 if (clnt_lock_set(node_v[i], sp, ep)) {
3339 rval = -1;
3340 goto out;
3344 RB_TEST(1, "addhosts", ep)
3346 RB_PREEMPT;
3347 rb_level = 1; /* level 1 */
3349 RB_TEST(2, "addhosts", ep)
3352 * Add the new hosts to the existing set record on the existing hosts
3354 for (i = 0; i < MD_MAXSIDES; i++) {
3355 /* skip empty slots */
3356 if (sd->sd_nodes[i][0] == '\0')
3357 continue;
3359 if (clnt_addhosts(sd->sd_nodes[i], sp, node_c, node_v, ep))
3360 goto rollback;
3363 RB_PREEMPT;
3364 rb_level = 2; /* level 2 */
3366 RB_TEST(3, "addhosts", ep);
3368 /* Merge the new entries into the set with the existing sides */
3369 nodeindex = 0;
3370 for (i = 0; i < MD_MAXSIDES; i++) {
3371 /* Skip full slots */
3372 if (sd->sd_nodes[i][0] != '\0')
3373 continue;
3375 (void) strcpy(sd->sd_nodes[i], node_v[nodeindex++]);
3376 if (nodeindex == node_c)
3377 break;
3380 /* If we have drives */
3381 if (dd != NULL) {
3383 * For all the hosts being added, create a sidename structure
3385 for (i = 0; i < MD_MAXSIDES; i++) {
3386 /* Skip empty slots */
3387 if (sd->sd_nodes[i][0] == '\0')
3388 continue;
3390 /* Skip nodes not being added */
3391 if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3392 continue;
3394 for (p = dd; p != NULL; p = p->dd_next) {
3395 if (make_sideno_sidenm(sp, p->dd_dnp, i,
3396 ep) != 0)
3397 goto rollback;
3402 * Add the new sidename for each drive to the existing hosts
3404 for (i = 0; i < MD_MAXSIDES; i++) {
3405 /* Skip empty slots */
3406 if (sd->sd_nodes[i][0] == '\0')
3407 continue;
3409 /* Skip nodes being added */
3410 if (strinlst(sd->sd_nodes[i], node_c, node_v))
3411 continue;
3413 if (clnt_add_drv_sidenms(sd->sd_nodes[i], mynode(), sp,
3414 sd, node_c, node_v, ep)) {
3415 goto rollback;
3419 RB_TEST(4, "addhosts", ep)
3421 RB_PREEMPT;
3422 rb_level = 3; /* level 3 */
3424 RB_TEST(5, "addhosts", ep)
3426 if (add_db_sidenms(sp, ep)) {
3427 goto rollback;
3430 } else {
3431 RB_PREEMPT;
3432 rb_level = 3;
3435 RB_TEST(6, "addhosts", ep)
3437 RB_PREEMPT;
3438 rb_level = 4; /* level 4 */
3440 RB_TEST(7, "addhosts", ep)
3443 /* create the set on the new nodes, this adds the drives as well */
3444 if (create_set_on_hosts(sp, multi_node, node_c, node_v, 0, ep)) {
3445 goto rollback;
3448 RB_TEST(8, "addhosts", ep)
3450 RB_PREEMPT;
3451 rb_level = 5; /* level 5 */
3453 RB_TEST(9, "addhosts", ep)
3455 if (dd != NULL) {
3458 * Add the device entries for the new sides into the namespace.
3460 for (i = 0; i < MD_MAXSIDES; i++) {
3461 /* Skip empty slots */
3462 if (sd->sd_nodes[i][0] == '\0')
3463 continue;
3465 /* Skip nodes not being added */
3466 if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3467 continue;
3469 if (add_md_sidenms(sp, i, MD_SIDEWILD, ep))
3470 goto rollback;
3474 RB_TEST(10, "addhosts", ep)
3476 RB_PREEMPT;
3477 rb_level = 6; /* level 6 */
3479 RB_TEST(11, "addhosts", ep);
3481 if (dd != NULL) {
3483 * Mark the drives MD_DR_OK.
3485 for (i = 0; i < MD_MAXSIDES; i++) {
3486 /* Skip empty slots */
3487 if (sd->sd_nodes[i][0] == '\0')
3488 continue;
3490 if (clnt_upd_dr_flags(sd->sd_nodes[i], sp, dd,
3491 MD_DR_OK, ep) == -1) {
3492 goto rollback;
3497 RB_TEST(12, "addhosts", ep)
3499 /* Bring the mediator record up to date with the set record */
3500 medr = rb_medr; /* structure assignment */
3501 for (i = 0; i < MD_MAXSIDES; i++)
3502 (void) strcpy(medr.med_rec_nodes[i], sd->sd_nodes[i]);
3503 crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
3505 /* Inform the mediator hosts of the new node list */
3506 for (i = 0; i < max_meds; i++) {
3507 if (sd->sd_med.n_lst[i].a_cnt == 0)
3508 continue;
3510 if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp, &medr, ep))
3511 goto rollback;
3514 /* Add the mediator information to all hosts in the set */
3515 for (i = 0; i < MD_MAXSIDES; i++) {
3516 /* Skip empty slots */
3517 if (sd->sd_nodes[i][0] == '\0')
3518 continue;
3520 if (clnt_updmeds(sd->sd_nodes[i], sp, &sd->sd_med, ep))
3521 goto rollback;
3524 RB_TEST(13, "addhosts", ep)
3527 * Mark the set record MD_SR_OK
3529 for (i = 0; i < MD_MAXSIDES; i++) {
3530 /* Skip empty slots */
3531 if (sd->sd_nodes[i][0] == '\0')
3532 continue;
3534 if (clnt_upd_sr_flags(sd->sd_nodes[i], sp, MD_SR_OK, ep))
3535 goto rollback;
3538 RB_TEST(14, "addhosts", ep)
3540 out:
3541 cl_sk = cl_get_setkey(sp->setno, sp->setname);
3542 for (i = 0; i < MD_MAXSIDES; i++) {
3543 /* Skip empty slots */
3544 if (sd->sd_nodes[i][0] == '\0')
3545 continue;
3547 /* Skip hosts we get in the next loop */
3548 if (strinlst(sd->sd_nodes[i], node_c, node_v))
3549 continue;
3551 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
3552 if (rval == 0)
3553 (void) mdstealerror(ep, &xep);
3554 rval = -1;
3558 if (rval == 0) {
3559 for (i = 0; i < node_c; i++)
3560 if (clnt_unlock_set(node_v[i], cl_sk, &xep)) {
3561 if (rval == 0)
3562 (void) mdstealerror(ep, &xep);
3563 rval = -1;
3566 cl_set_setkey(NULL);
3568 metaflushsetname(sp);
3570 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
3572 return (rval);
3574 rollback:
3575 /* Make sure we are blocking all signals */
3576 if (procsigs(TRUE, &oldsigs, &xep) < 0)
3577 mdclrerror(&xep);
3579 rval = -1;
3581 /* level 6 */
3582 if (rb_level > 5) {
3583 for (i = 0; i < max_meds; i++) {
3584 if (sd->sd_med.n_lst[i].a_cnt == 0)
3585 continue;
3587 if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp,
3588 &rb_medr, &xep))
3589 mdclrerror(&xep);
3591 if (dd != NULL) {
3592 for (i = 0; i < MD_MAXSIDES; i++) {
3593 /* Skip empty slots */
3594 if (sd->sd_nodes[i][0] == '\0')
3595 continue;
3597 /* Skip nodes not being added */
3598 if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3599 continue;
3601 if (del_md_sidenms(sp, i, &xep))
3602 mdclrerror(&xep);
3607 /* level 5 */
3608 if (rb_level > 4) {
3609 if (dd != NULL) {
3610 /* delete the drive records */
3611 for (i = 0; i < node_c; i++) {
3612 if (clnt_deldrvs(node_v[i], sp, dd, &xep) == -1)
3613 mdclrerror(&xep);
3616 /* delete the set record on the 'new' hosts */
3617 for (i = 0; i < node_c; i++) {
3618 if (clnt_delset(node_v[i], sp, &xep) == -1)
3619 mdclrerror(&xep);
3623 /* level 4 */
3624 if (rb_level > 3 && dd != NULL) {
3625 for (i = 0; i < MD_MAXSIDES; i++) {
3626 /* Skip empty slots */
3627 if (sd->sd_nodes[i][0] == '\0')
3628 continue;
3630 /* Skip nodes not being added */
3631 if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3632 continue;
3634 if (del_db_sidenms(sp, i, &xep))
3635 mdclrerror(&xep);
3639 /* level 3 */
3640 if (rb_level > 2 && dd != NULL) {
3641 for (i = 0; i < MD_MAXSIDES; i++) {
3642 /* Skip empty slots */
3643 if (sd->sd_nodes[i][0] == '\0')
3644 continue;
3646 /* Skip nodes not being added */
3647 if (! strinlst(sd->sd_nodes[i], node_c, node_v))
3648 continue;
3650 if (clnt_del_drv_sidenms(sd->sd_nodes[i], sp,
3651 &xep) == -1)
3652 mdclrerror(&xep);
3656 /* level 2 */
3657 if (rb_level > 1) {
3658 for (i = 0; i < MD_MAXSIDES; i++) {
3659 /* Skip empty slots */
3660 if (sd->sd_nodes[i][0] == '\0')
3661 continue;
3663 if (clnt_delhosts(sd->sd_nodes[i], sp, node_c, node_v,
3664 &xep) == -1)
3665 mdclrerror(&xep);
3669 /* level 1 */
3670 if (rb_level > 0) {
3671 cl_sk = cl_get_setkey(sp->setno, sp->setname);
3672 for (i = 0; i < MD_MAXSIDES; i++) {
3673 /* Skip empty slots */
3674 if (sd->sd_nodes[i][0] == '\0')
3675 continue;
3677 /* Skip hosts we get in the next loop */
3678 if (strinlst(sd->sd_nodes[i], node_c, node_v))
3679 continue;
3681 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
3682 mdclrerror(&xep);
3685 for (i = 0; i < node_c; i++)
3686 if (clnt_unlock_set(node_v[i], cl_sk, &xep))
3687 mdclrerror(&xep);
3688 cl_set_setkey(NULL);
3691 /* release signals back to what they were on entry */
3692 if (procsigs(FALSE, &oldsigs, &xep) < 0)
3693 mdclrerror(&xep);
3695 metaflushsetname(sp);
3697 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
3699 return (rval);
3703 * Add host(s) to the diskset provided in sp.
3704 * - create set if non-existent.
3707 meta_set_addhosts(
3708 mdsetname_t *sp,
3709 int multi_node,
3710 int node_c,
3711 char **node_v,
3712 int auto_take,
3713 md_error_t *ep
3716 if (multi_node)
3717 return (meta_multinode_set_addhosts(sp, multi_node, node_c,
3718 node_v, auto_take, ep));
3719 else
3720 return (meta_traditional_set_addhosts(sp, multi_node, node_c,
3721 node_v, auto_take, ep));
3725 * Delete host(s) from the diskset provided in sp.
3726 * - destroy set if last host in set is removed.
3729 meta_set_deletehosts(
3730 mdsetname_t *sp,
3731 int node_c,
3732 char **node_v,
3733 int forceflg,
3734 md_error_t *ep
3737 md_set_desc *sd;
3738 md_drive_desc *dd;
3739 med_rec_t medr;
3740 med_rec_t rb_medr;
3741 int i, j;
3742 int has_set;
3743 int numsides = 0;
3744 int oha = FALSE;
3745 sigset_t oldsigs;
3746 mhd_mhiargs_t mhiargs;
3747 md_replicalist_t *rlp = NULL;
3748 md_setkey_t *cl_sk;
3749 ulong_t max_genid = 0;
3750 int rval = 0;
3751 int rb_level = 0;
3752 int max_meds = 0;
3753 md_error_t xep = mdnullerror;
3754 md_mnnode_desc *nd;
3755 md_mnnode_record *nr;
3756 int delete_master = 0;
3757 int suspendall_flag = 0, suspendall_flag_rb = 0;
3758 int suspend1_flag = 0;
3759 int lock_flag = 0;
3760 int stale_flag = 0;
3761 int *node_id_list = NULL;
3762 int remote_sets_deleted = 0;
3764 if ((sd = metaget_setdesc(sp, ep)) == NULL)
3765 return (-1);
3768 * Verify that list of nodes being deleted contains no
3769 * duplicates.
3771 if (nodesuniq(sp, node_c, node_v, ep))
3772 return (-1);
3774 /* Make sure we own the set */
3775 if (meta_check_ownership(sp, ep) != 0)
3776 return (-1);
3779 * The drive and node records are stored in the local mddbs of each
3780 * node in the diskset. Each node's rpc.metad daemon reads in the set,
3781 * drive and node records from that node's local mddb and caches them
3782 * internally. Any process needing diskset information contacts its
3783 * local rpc.metad to get this information. Since each node in the
3784 * diskset is independently reading the set information from its local
3785 * mddb, the set, drive and node records in the local mddbs must stay
3786 * in-sync, so that all nodes have a consistent view of the diskset.
3788 * For a multinode diskset, explicitly verify that all nodes in the
3789 * diskset are ALIVE (i.e. are in the API membership list) if the
3790 * forceflag is FALSE. (The case of forceflag being TRUE is handled
3791 * in the OHA check below.)
3793 * If forceflag is FALSE and a node in the diskset is not in
3794 * the membership list, then fail this operation since all nodes must
3795 * be ALIVE in order to delete the node record from their local mddb.
3796 * If a panic of this node leaves the local mddbs set, node and drive
3797 * records out-of-sync, the reconfig cycle will fix the local mddbs
3798 * and force them back into synchronization.
3800 if ((forceflg == FALSE) && (MD_MNSET_DESC(sd))) {
3801 nd = sd->sd_nodelist;
3802 while (nd) {
3803 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3804 return (mddserror(ep, MDE_DS_NOTINMEMBERLIST,
3805 sp->setno, nd->nd_nodename,
3806 NULL, sp->setname));
3808 nd = nd->nd_next;
3814 * Lock the set on current set members.
3815 * Set locking done much earlier for MN diskset than for traditional
3816 * diskset since lock_set and SUSPEND are used to protect against
3817 * other meta* commands running on the other nodes.
3819 if (MD_MNSET_DESC(sd)) {
3820 /* Make sure we are blocking all signals */
3821 if (procsigs(TRUE, &oldsigs, &xep) < 0)
3822 mdclrerror(&xep);
3824 nd = sd->sd_nodelist;
3825 while (nd) {
3826 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3827 nd = nd->nd_next;
3828 continue;
3831 if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
3832 rval = -1;
3833 goto out2;
3835 lock_flag = 1;
3836 nd = nd->nd_next;
3839 * Lock out other meta* commands by suspending
3840 * class 1 messages across the diskset.
3842 nd = sd->sd_nodelist;
3843 while (nd) {
3844 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3845 nd = nd->nd_next;
3846 continue;
3848 if (clnt_mdcommdctl(nd->nd_nodename,
3849 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
3850 MD_MSCF_NO_FLAGS, ep)) {
3851 rval = -1;
3852 goto out2;
3854 suspend1_flag = 1;
3855 nd = nd->nd_next;
3859 for (i = 0; i < node_c; i++)
3860 if (getnodeside(node_v[i], sd) == MD_SIDEWILD) {
3861 (void) mddserror(ep, MDE_DS_NODENOTINSET, sp->setno,
3862 node_v[i], NULL, sp->setname);
3863 rval = -1;
3864 goto out2;
3868 * Count the number of nodes currently in the set.
3870 if (MD_MNSET_DESC(sd)) {
3871 nd = sd->sd_nodelist;
3872 while (nd) {
3873 numsides++;
3874 nd = nd->nd_next;
3876 } else {
3877 for (i = 0; i < MD_MAXSIDES; i++)
3878 /* Count full slots */
3879 if (sd->sd_nodes[i][0] != '\0')
3880 numsides++;
3884 * OHA mode == -f -h <hostname>
3885 * OHA is One Host Administration that occurs when the forceflag (-f)
3886 * is set and at least one host in the diskset isn't responding
3887 * to RPC requests.
3889 * When in OHA mode, a node cannot delete itself from a diskset.
3890 * When in OHA mode, a node can delete a list of nodes from a diskset
3891 * even if some of the nodes in the diskset are unresponsive.
3893 * For multinode diskset, only allow OHA mode when the nodes that
3894 * aren't responding in the diskset are not in the membership list
3895 * (i.e. nodes that aren't responding are not marked ALIVE).
3896 * Nodes that aren't in the membership list will be rejoining
3897 * the diskset through a reconfig cycle and the local mddb set
3898 * and node records can be reconciled during the reconfig cycle.
3900 * If a node isn't responding, but is still in the membership list,
3901 * fail the request since the node may not be responding because
3902 * rpc.metad died and is restarting. In this case, no reconfig
3903 * cycle will be started, so there's no way to recover if
3904 * the host delete operation was allowed.
3906 * NOTE: if nodes that weren't in the membership when the OHA host
3907 * delete occurred are now the only nodes in membership list,
3908 * those nodes will see the old view of the diskset. As soon as
3909 * a node re-enters the cluster that was present in the cluster
3910 * during the host deletion, the diskset will reflect the host
3911 * deletion on all nodes presently in the cluster.
3913 if (forceflg == TRUE) {
3914 if (MD_MNSET_DESC(sd)) {
3915 nd = sd->sd_nodelist;
3916 while (nd) {
3918 * If a node isn't ALIVE (in member list),
3919 * then allow a force-able delete in OHA mode.
3921 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
3922 oha = TRUE;
3923 break;
3926 * Don't test for clnt_nullproc since already
3927 * tested the RPC connections by clnt_lock_set.
3929 nd = nd->nd_next;
3931 } else {
3932 for (i = 0; i < MD_MAXSIDES; i++) {
3933 /* Skip empty slots */
3934 if (sd->sd_nodes[i][0] == '\0')
3935 continue;
3937 if (clnt_nullproc(sd->sd_nodes[i], ep) == -1) {
3939 * If we timeout to at least one
3940 * client, then we can allow OHA mode,
3941 * otherwise, we are in normal mode.
3943 if (mdanyrpcerror(ep)) {
3944 mdclrerror(ep);
3945 if (strinlst(sd->sd_nodes[i],
3946 node_c, node_v)) {
3947 oha = TRUE;
3948 break;
3957 * Don't allow this for MN diskset since meta_set_destroy of 1 node
3958 * does NOT remove this node's node record from the other node's set
3959 * records in their local mddb. This leaves a MN diskset in a very
3960 * messed up state.
3962 if (!(MD_MNSET_DESC(sd))) {
3963 /* Destroy set */
3964 if (forceflg == TRUE && node_c == 1 &&
3965 strcmp(mynode(), node_v[0]) == 0) {
3966 /* Can return since !MN diskset so nothing to unlock */
3967 return (meta_set_destroy(sp, TRUE, ep));
3973 * In multinode diskset, can only delete self if this
3974 * is the last node in the set or if all nodes in
3975 * the set are being deleted. The traditional diskset code
3976 * allows a node to delete itself (when there are other nodes
3977 * in the diskset) when using the force flag, but that code
3978 * path doesn't have the node remove itself from
3979 * the set node list on the other nodes. Since this isn't
3980 * satisfactory for the multinode diskset, just don't
3981 * allow this operation.
3983 if (MD_MNSET_DESC(sd) && (numsides > 1) && (node_c != numsides) &&
3984 strinlst(mynode(), node_c, node_v)) {
3985 (void) mddserror(ep, MDE_DS_MNCANTDELSELF, sp->setno,
3986 mynode(), NULL, sp->setname);
3987 rval = -1;
3988 goto out2;
3992 * In multinode diskset, don't allow deletion of master node unless
3993 * this is the only node left or unless all nodes are being
3994 * deleted since there is no way to switch
3995 * master ownership (unless via a cluster reconfig cycle).
3997 delete_master = strinlst(sd->sd_mn_master_nodenm, node_c, node_v);
3998 if (MD_MNSET_DESC(sd) && (numsides > 1) && (node_c != numsides) &&
3999 delete_master) {
4000 (void) mddserror(ep, MDE_DS_CANTDELMASTER, sp->setno,
4001 sd->sd_mn_master_nodenm, NULL, sp->setname);
4002 rval = -1;
4003 goto out2;
4007 /* Deleting self w/o forceflg */
4008 if (forceflg == FALSE && numsides > 1 &&
4009 strinlst(mynode(), node_c, node_v)) {
4010 (void) mddserror(ep, MDE_DS_CANTDELSELF, sp->setno,
4011 mynode(), NULL, sp->setname);
4012 rval = -1;
4013 goto out2;
4017 * Setup the mediator record roll-back structure for a trad diskset.
4019 * For a MN diskset, the deletion of a host in the diskset
4020 * does not cause an update of the mediator record. If the
4021 * host deletion will cause the diskset to be removed (this is
4022 * the last host being removed or all hosts are being removed)
4023 * then the mediator record must have already been removed by the
4024 * user or this delete host operation will fail (a check for
4025 * this is done later in this routine).
4027 if (!(MD_MNSET_DESC(sd))) {
4028 (void) memset(&rb_medr, '\0', sizeof (med_rec_t));
4029 rb_medr.med_rec_mag = MED_REC_MAGIC;
4030 rb_medr.med_rec_rev = MED_REC_REV;
4031 rb_medr.med_rec_fl = 0;
4032 rb_medr.med_rec_sn = sp->setno;
4033 (void) strcpy(rb_medr.med_rec_snm, sp->setname);
4034 for (i = 0; i < MD_MAXSIDES; i++)
4035 (void) strcpy(rb_medr.med_rec_nodes[i],
4036 sd->sd_nodes[i]);
4037 rb_medr.med_rec_meds = sd->sd_med; /* structure assigment */
4038 (void) memset(&rb_medr.med_rec_data, '\0', sizeof (med_data_t));
4039 rb_medr.med_rec_foff = 0;
4040 crcgen(&rb_medr, &rb_medr.med_rec_cks,
4041 sizeof (med_rec_t), NULL);
4043 /* Bring the mediator record up to date with the set record */
4044 medr = rb_medr; /* structure assignment */
4046 if ((max_meds = get_max_meds(ep)) == 0) {
4047 rval = -1;
4048 goto out2;
4053 * For traditional diskset:
4054 * Check to see if all the hosts we are trying to delete the set from
4055 * have a set "setname" that is the same as ours, i.e. - same name,
4056 * same time stamp, same genid. We only do this if forceflg is not
4057 * specified or we are in OHA mode.
4059 if (!(MD_MNSET_DESC(sd)) && (forceflg == FALSE || oha == TRUE)) {
4060 int fix_node_v = FALSE;
4061 int j;
4063 for (i = 0; i < node_c; i++) {
4064 /* We skip this side */
4065 if (strcmp(mynode(), node_v[i]) == 0)
4066 continue;
4068 has_set = nodehasset(sp, node_v[i], NHS_NSTG_EQ, ep);
4070 if (has_set < 0) {
4071 char *anode[1];
4074 * Can't talk to the host only allowed in OHA
4075 * mode.
4077 if (oha == TRUE && mdanyrpcerror(ep)) {
4078 mdclrerror(ep);
4079 continue;
4083 * We got an error we do not, or are not,
4084 * prepared to handle.
4086 if (! mdiserror(ep, MDE_NO_SET) &&
4087 ! mdismddberror(ep, MDE_DB_NODB)) {
4088 rval = -1;
4089 goto out2;
4091 mdclrerror(ep);
4094 * If we got here: both hosts are up; a host in
4095 * our set record does not have the set. So we
4096 * delete the host from our set and invalidate
4097 * the node.
4099 anode[0] = Strdup(node_v[i]);
4101 rval = del_host_noset(sp, anode, ep);
4104 * If we delete a host, make sure the mediator
4105 * hosts are made aware of this.
4107 for (j = 0; j < MD_MAXSIDES; j++) {
4108 if (strcmp(medr.med_rec_nodes[j],
4109 node_v[i]) != 0)
4110 continue;
4111 (void) memset(&medr.med_rec_nodes[j],
4112 '\0', sizeof (md_node_nm_t));
4114 crcgen(&medr, &medr.med_rec_cks,
4115 sizeof (med_rec_t), NULL);
4117 rb_medr = medr; /* struct assignment */
4119 Free(anode[0]);
4121 if (rval == -1)
4122 goto out2;
4124 node_v[i][0] = '\0';
4125 fix_node_v = TRUE;
4126 continue;
4130 * If we can talk to the host, and they do not have the
4131 * exact set, then we disallow the operation.
4133 if (has_set == FALSE) {
4134 (void) mddserror(ep, MDE_DS_NODENOSET,
4135 sp->setno, node_v[i], NULL, sp->setname);
4136 rval = -1;
4137 goto out2;
4142 * Here we prune the node_v's that were invalidated above.
4144 if (fix_node_v == TRUE) {
4145 i = 0;
4146 while (i < node_c) {
4147 if (node_v[i][0] == '\0') {
4148 for (j = i; (j + 1) < node_c; j++)
4149 node_v[j] = node_v[j + 1];
4150 node_c--;
4152 i++;
4155 * If we are left with no nodes, then we have
4156 * completed the operation.
4158 if (node_c == 0) {
4160 * Inform the mediator hosts of the new node
4161 * list
4163 for (i = 0; i < max_meds; i++) {
4164 if (sd->sd_med.n_lst[i].a_cnt == 0)
4165 continue;
4167 if (clnt_med_upd_rec(
4168 &sd->sd_med.n_lst[i], sp, &medr,
4169 ep))
4170 mdclrerror(ep);
4172 rval = 0;
4173 goto out2;
4179 * For multinode diskset:
4180 * If forceflag is FALSE then check to see if all the hosts we
4181 * are trying to delete the set from have a set "setname" that
4182 * is the same as ours, i.e. - same name, same time stamp, same genid.
4183 * If forceflag is TRUE, then we don't care if the hosts being
4184 * deleted have the same set information or not since user is forcing
4185 * those hosts to be deleted.
4187 if ((MD_MNSET_DESC(sd)) && (forceflg == FALSE)) {
4188 for (i = 0; i < node_c; i++) {
4189 /* We skip this node since comparing against it */
4190 if (strcmp(mynode(), node_v[i]) == 0)
4191 continue;
4193 has_set = nodehasset(sp, node_v[i], NHS_NSTG_EQ, ep);
4195 if (has_set < 0) {
4196 rval = -1;
4197 goto out2;
4201 * If we can talk to the host, and they do not have the
4202 * exact set, then we disallow the operation.
4204 if (has_set == FALSE) {
4205 (void) mddserror(ep, MDE_DS_NODENOSET,
4206 sp->setno, node_v[i], NULL, sp->setname);
4207 rval = -1;
4208 goto out2;
4214 * For traditional diskset:
4215 * Can't allow user to delete their node (without deleting all nodes)
4216 * out of a set in OHA mode, would leave a real mess.
4217 * This action was already failed above for a MN diskset.
4219 if (!(MD_MNSET_DESC(sd)) && (oha == TRUE) &&
4220 strinlst(mynode(), node_c, node_v)) {
4221 /* Can directly return since !MN diskset; nothing to unlock */
4222 return (mddserror(ep, MDE_DS_OHACANTDELSELF, sp->setno,
4223 mynode(), NULL, sp->setname));
4227 /* Get the drive descriptors for this set */
4228 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
4229 ep)) == NULL) {
4230 if (! mdisok(ep)) {
4231 rval = -1;
4232 goto out2;
4237 * We have been asked to delete all the hosts in the set, i.e. - delete
4238 * the whole set.
4240 if (node_c == numsides) {
4242 * This is only a valid operation if all drives have been
4243 * removed first.
4246 if (dd != NULL) {
4247 (void) mddserror(ep, MDE_DS_HASDRIVES, sp->setno,
4248 NULL, NULL, sp->setname);
4249 rval = -1;
4250 goto out2;
4254 * If a mediator is currently associated with this set,
4255 * fail the deletion of the last host(s).
4257 if (sd->sd_med.n_cnt != 0) {
4258 (void) mddserror(ep, MDE_DS_HASMED, sp->setno,
4259 NULL, NULL, sp->setname);
4260 rval = -1;
4261 goto out2;
4264 if (! mdisok(ep)) {
4265 rval = -1;
4266 goto out2;
4269 rval = del_set_nodrives(sp, node_c, node_v, oha, ep);
4270 remote_sets_deleted = 1;
4271 goto out2;
4275 * Get timeout values in case we need to roll back
4277 (void) memset(&mhiargs, '\0', sizeof (mhiargs));
4278 if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) != 0) {
4279 rval = -1;
4280 goto out2;
4283 if (dd != NULL) {
4285 * We need this around for re-adding DB side names later.
4287 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) {
4288 rval = -1;
4289 goto out2;
4293 * Alloc nodeid list if drives are present in diskset.
4294 * nodeid list is used to reset mirror owners if the
4295 * owner is a deleted node.
4297 if (MD_MNSET_DESC(sd)) {
4298 node_id_list = Zalloc(sizeof (int) * node_c);
4302 /* Lock the set on current set members */
4303 if (!(MD_MNSET_DESC(sd))) {
4304 md_rb_sig_handling_on();
4305 for (i = 0; i < MD_MAXSIDES; i++) {
4306 /* Skip empty slots */
4307 if (sd->sd_nodes[i][0] == '\0')
4308 continue;
4310 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
4311 if (oha == TRUE && mdanyrpcerror(ep)) {
4312 mdclrerror(ep);
4313 continue;
4315 rval = -1;
4316 goto out2;
4318 lock_flag = 1;
4322 RB_TEST(1, "deletehosts", ep)
4324 RB_PREEMPT;
4325 rb_level = 1; /* level 1 */
4327 RB_TEST(2, "deletehosts", ep)
4329 if (MD_MNSET_DESC(sd)) {
4330 md_mnnode_desc *saved_nd_next;
4331 mddb_config_t c;
4333 if (dd != NULL) {
4335 * Notify rpc.mdcommd on all nodes of a nodelist change.
4336 * Start by suspending rpc.mdcommd (which drains it of
4337 * all messages), then change the nodelist followed
4338 * by a reinit and resume.
4340 nd = sd->sd_nodelist;
4341 while (nd) {
4342 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4343 nd = nd->nd_next;
4344 continue;
4346 if (clnt_mdcommdctl(nd->nd_nodename,
4347 COMMDCTL_SUSPEND, sp,
4348 MD_MSG_CLASS0,
4349 MD_MSCF_NO_FLAGS, ep)) {
4350 rval = -1;
4351 goto out2;
4353 suspendall_flag = 1;
4354 nd = nd->nd_next;
4357 * Is current set STALE?
4358 * Need to know this if delete host fails and node
4359 * is re-joined to diskset.
4361 (void) memset(&c, 0, sizeof (c));
4362 c.c_id = 0;
4363 c.c_setno = sp->setno;
4364 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
4365 (void) mdstealerror(ep, &c.c_mde);
4366 rval = -1;
4367 goto out2;
4369 if (c.c_flags & MDDB_C_STALE) {
4370 stale_flag = MNSET_IS_STALE;
4375 * For each node being deleted, set DEL flag and
4376 * reset OK flag on that node first.
4377 * Until a node has turned off its own
4378 * rpc.metad's NODE_OK flag, that node could be
4379 * considered for master during a reconfig.
4381 for (i = 0; i < node_c; i++) {
4383 * During OHA mode, don't issue RPCs to
4384 * non-alive nodes since there is no reason to
4385 * wait for RPC timeouts.
4387 nd = sd->sd_nodelist;
4388 while (nd) {
4389 if (strcmp(nd->nd_nodename, node_v[i]) == 0)
4390 break;
4391 nd = nd->nd_next;
4393 /* Something wrong, handle this in next loop */
4394 if (nd == NULL)
4395 continue;
4397 /* If node_id_list is alloc'd, fill in for later use */
4398 if (node_id_list)
4399 node_id_list[i] = nd->nd_nodeid;
4401 /* All nodes are guaranteed to be ALIVE unless OHA */
4402 if ((oha == TRUE) &&
4403 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4404 continue;
4407 /* Only changing my local cache of node list */
4408 saved_nd_next = nd->nd_next;
4409 nd->nd_next = NULL;
4411 /* Set flags for del host to DEL on that host */
4412 if (clnt_upd_nr_flags(node_v[i], sp,
4413 nd, MD_NR_DEL, NULL, ep)) {
4414 nd->nd_next = saved_nd_next;
4415 goto rollback;
4417 nd->nd_next = saved_nd_next;
4419 for (i = 0; i < node_c; i++) {
4421 * Turn off owner flag in nodes to be deleted
4422 * if this node has been joined.
4423 * Also, turn off NODE_OK and turn on NODE_DEL
4424 * for nodes to be deleted.
4425 * These flags are used to set the node
4426 * record flags in all nodes in the set.
4427 * Only withdraw nodes that are joined.
4429 nd = sd->sd_nodelist;
4430 while (nd) {
4432 * Don't communicate with non-ALIVE node if
4433 * in OHA - but set flags in master list so
4434 * alive nodes are updated correctly.
4436 if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
4437 if ((oha == TRUE) && (!(nd->nd_flags &
4438 MD_MN_NODE_ALIVE))) {
4439 nd->nd_flags |= MD_MN_NODE_DEL;
4440 nd->nd_flags &= ~MD_MN_NODE_OK;
4441 nd = nd->nd_next;
4442 continue;
4444 if (nd->nd_flags & MD_MN_NODE_OWN) {
4446 * Going to set locally cached
4447 * node flags to rollback join
4448 * so in case of error, the
4449 * rollback code knows which
4450 * nodes to re-join. rpc.metad
4451 * ignores the RB_JOIN flag.
4453 nd->nd_flags |=
4454 MD_MN_NODE_RB_JOIN;
4455 nd->nd_flags &= ~MD_MN_NODE_OWN;
4458 * Be careful in ordering of
4459 * following steps so that
4460 * recovery from a panic
4461 * between the steps is viable.
4462 * Only reset master info in
4463 * rpc.metad - don't reset
4464 * local cached info which will
4465 * be used to set master info
4466 * back if failure (rollback).
4468 if (clnt_withdrawset(
4469 nd->nd_nodename, sp, ep))
4470 goto rollback;
4473 * Reset master on deleted node
4475 if (clnt_mnsetmaster(node_v[i],
4476 sp, "", MD_MN_INVALID_NID,
4477 ep))
4478 goto rollback;
4481 nd->nd_flags |= MD_MN_NODE_DEL;
4482 nd->nd_flags &= ~MD_MN_NODE_OK;
4484 nd = nd->nd_next;
4489 * Now, reset owner and set delete flags for the
4490 * deleted nodes on all nodes.
4492 nd = sd->sd_nodelist;
4493 while (nd) {
4494 /* Skip non-ALIVE node if in OHA */
4495 if ((oha == TRUE) &&
4496 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4497 nd = nd->nd_next;
4498 continue;
4500 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
4501 sd->sd_nodelist, MD_NR_SET, NULL, ep)) {
4502 goto rollback;
4504 nd = nd->nd_next;
4507 * Notify rpc.mdcommd on all nodes of a nodelist change.
4508 * Send reinit command to mdcommd which forces it to get
4509 * fresh set description.
4511 if (suspendall_flag) {
4512 /* Send reinit */
4513 nd = sd->sd_nodelist;
4514 while (nd) {
4515 if ((oha == TRUE) &&
4516 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4517 nd = nd->nd_next;
4518 continue;
4520 /* Class is ignored for REINIT */
4521 if (clnt_mdcommdctl(nd->nd_nodename,
4522 COMMDCTL_REINIT, sp, NULL,
4523 MD_MSCF_NO_FLAGS, ep)) {
4524 mde_perror(ep, dgettext(TEXT_DOMAIN,
4525 "Unable to reinit rpc.mdcommd.\n"));
4526 goto rollback;
4528 nd = nd->nd_next;
4530 /* Send resume */
4531 nd = sd->sd_nodelist;
4532 while (nd) {
4533 if ((oha == TRUE) &&
4534 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4535 nd = nd->nd_next;
4536 continue;
4538 if (clnt_mdcommdctl(nd->nd_nodename,
4539 COMMDCTL_RESUME, sp, MD_MSG_CLASS0,
4540 MD_MSCF_DONT_RESUME_CLASS1, ep)) {
4541 mde_perror(ep, dgettext(TEXT_DOMAIN,
4542 "Unable to resume rpc.mdcommd.\n"));
4543 goto rollback;
4545 nd = nd->nd_next;
4547 meta_ping_mnset(sp->setno);
4553 * Mark the set record MD_SR_DEL on the hosts we are deleting
4554 * If a MN diskset and OHA mode, don't issue RPC to nodes that
4555 * are not ALIVE.
4556 * If a MN diskset and not in OHA mode, then all nodes must respond
4557 * to RPC (be alive) or this routine will return failure.
4558 * If a traditional diskset, ignore all RPC failures if in OHA mode.
4560 for (i = 0; i < node_c; i++) {
4562 RB_TEST(3, "deletehosts", ep)
4564 if ((MD_MNSET_DESC(sd)) && (oha == TRUE)) {
4566 * During OHA mode, don't issue RPCs to
4567 * non-alive nodes since there is no reason to
4568 * wait for RPC timeouts.
4570 nd = sd->sd_nodelist;
4571 while (nd) {
4572 if (strcmp(nd->nd_nodename, node_v[i]) == 0) {
4573 break;
4575 nd = nd->nd_next;
4577 if (nd == NULL) {
4578 (void) mddserror(ep, MDE_DS_NODENOTINSET,
4579 sp->setno, node_v[i], NULL, sp->setname);
4580 goto rollback;
4581 } else if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4582 /* Skip non-ALIVE node if in OHA mode */
4583 continue;
4584 } else {
4585 if (clnt_upd_sr_flags(node_v[i], sp,
4586 MD_SR_DEL, ep)) {
4587 goto rollback;
4590 } else if ((MD_MNSET_DESC(sd)) && (oha == FALSE)) {
4592 * All nodes should be alive in non-oha mode.
4594 if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
4595 goto rollback;
4597 } else {
4599 * For traditional diskset, issue the RPC and
4600 * ignore RPC failure if in OHA mode.
4602 if (clnt_upd_sr_flags(node_v[i], sp, MD_SR_DEL, ep)) {
4603 if (oha == TRUE && mdanyrpcerror(ep)) {
4604 mdclrerror(ep);
4605 continue;
4607 goto rollback;
4611 RB_TEST(4, "deletehosts", ep)
4614 RB_TEST(5, "deletehosts", ep)
4616 RB_PREEMPT;
4617 rb_level = 2; /* level 2 */
4619 RB_TEST(6, "deletehosts", ep)
4621 /* Delete the set on the hosts we are deleting */
4622 if (del_set_on_hosts(sp, sd, dd, node_c, node_v, oha, ep)) {
4623 if (node_id_list)
4624 Free(node_id_list);
4626 * Failure during del_set_on_hosts would have recreated
4627 * the diskset on the remote hosts, but for multi-owner
4628 * disksets need to set node flags properly and REINIT and
4629 * RESUME rpc.mdcommd, so just let the rollback code
4630 * do this.
4632 if (MD_MNSET_DESC(sd))
4633 goto rollback;
4634 return (-1);
4636 remote_sets_deleted = 1;
4638 RB_TEST(19, "deletehosts", ep)
4640 RB_PREEMPT;
4641 rb_level = 3; /* level 3 */
4643 RB_TEST(20, "deletehosts", ep)
4645 /* Delete the host from sets on hosts not being deleted */
4646 if (MD_MNSET_DESC(sd)) {
4647 nd = sd->sd_nodelist;
4648 /* All nodes are guaranteed to be ALIVE unless in oha mode */
4649 while (nd) {
4651 * During OHA mode, don't issue RPCs to
4652 * non-alive nodes since there is no reason to
4653 * wait for RPC timeouts.
4655 if ((oha == TRUE) &&
4656 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4657 nd = nd->nd_next;
4658 continue;
4661 /* Skip nodes being deleted */
4662 if (strinlst(nd->nd_nodename, node_c, node_v)) {
4663 nd = nd->nd_next;
4664 continue;
4666 if (clnt_delhosts(nd->nd_nodename, sp, node_c, node_v,
4667 ep) == -1) {
4668 goto rollback;
4671 RB_TEST(21, "deletehosts", ep)
4672 nd = nd->nd_next;
4674 } else {
4675 for (i = 0; i < MD_MAXSIDES; i++) {
4676 /* Skip empty slots */
4677 if (sd->sd_nodes[i][0] == '\0')
4678 continue;
4680 /* Skip nodes being deleted */
4681 if (strinlst(sd->sd_nodes[i], node_c, node_v))
4682 continue;
4684 if (clnt_delhosts(sd->sd_nodes[i], sp, node_c, node_v,
4685 ep) == -1) {
4686 if (oha == TRUE && mdanyrpcerror(ep)) {
4687 mdclrerror(ep);
4688 continue;
4690 goto rollback;
4693 RB_TEST(21, "deletehosts", ep)
4697 /* We have drives */
4698 if (dd != NULL) {
4699 RB_TEST(22, "deletehosts", ep)
4701 RB_PREEMPT;
4702 rb_level = 4; /* level 4 */
4704 RB_TEST(23, "deletehosts", ep)
4707 * Delete the old sidename for each drive on all the hosts.
4708 * If a multi-node diskset, each host only stores
4709 * the side information for itself. So, a multi-node
4710 * diskset doesn't delete the old sidename for
4711 * an old host.
4713 * If a MN diskset, reset owners of mirrors that are
4714 * owned by the deleted nodes.
4716 if (!(MD_MNSET_DESC(sd))) {
4717 for (i = 0; i < MD_MAXSIDES; i++) {
4718 /* Skip empty slots */
4719 if (sd->sd_nodes[i][0] == '\0')
4720 continue;
4722 /* Skip nodes being deleted */
4723 if (strinlst(sd->sd_nodes[i], node_c, node_v))
4724 continue;
4726 if (clnt_del_drv_sidenms(sd->sd_nodes[i], sp,
4727 ep)) {
4728 if (oha == TRUE && mdanyrpcerror(ep)) {
4729 mdclrerror(ep);
4730 continue;
4732 metaflushsetname(sp);
4733 goto rollback;
4736 RB_TEST(24, "deletehosts", ep)
4738 } else {
4739 nd = sd->sd_nodelist;
4740 /* All nodes guaranteed ALIVE unless in oha mode */
4741 while (nd) {
4743 * If mirror owner was set to a deleted node,
4744 * then each existing node resets mirror owner
4745 * to NULL.
4747 * During OHA mode, don't issue RPCs to
4748 * non-alive nodes since there is no reason to
4749 * wait for RPC timeouts.
4751 if ((oha == TRUE) &&
4752 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4753 nd = nd->nd_next;
4754 continue;
4757 /* Skip nodes being deleted */
4758 if (strinlst(nd->nd_nodename, node_c, node_v)) {
4759 nd = nd->nd_next;
4760 continue;
4764 * If mirror owner is a deleted node, reset
4765 * mirror owners to NULL. If an error occurs,
4766 * print a warning and continue. Don't fail
4767 * metaset because of mirror owner reset
4768 * problem since next node to grab mirror
4769 * will resolve this issue. Before next node
4770 * grabs mirrors, metaset will show the deleted
4771 * node as owner which is why an attempt to
4772 * reset the mirror owner is made.
4774 if (clnt_reset_mirror_owner(nd->nd_nodename, sp,
4775 node_c, &node_id_list[0], &xep) == -1) {
4776 mde_perror(&xep, dgettext(TEXT_DOMAIN,
4777 "Unable to reset mirror owner on"
4778 " node %s\n"), nd->nd_nodename);
4779 mdclrerror(&xep);
4782 RB_TEST(21, "deletehosts", ep)
4783 nd = nd->nd_next;
4788 RB_TEST(25, "deletehosts", ep)
4790 RB_PREEMPT;
4791 rb_level = 4; /* level 4 */
4793 RB_TEST(26, "deletehosts", ep)
4796 * Bring the mediator record up to date with the set record for
4797 * traditional diskset.
4799 if (!(MD_MNSET_DESC(sd))) {
4800 medr = rb_medr; /* structure assignment */
4801 for (i = 0; i < MD_MAXSIDES; i++) {
4802 if (strinlst(sd->sd_nodes[i], node_c, node_v))
4803 (void) memset(&medr.med_rec_nodes[i],
4804 '\0', sizeof (md_node_nm_t));
4805 else
4806 (void) strcpy(medr.med_rec_nodes[i],
4807 sd->sd_nodes[i]);
4809 crcgen(&medr, &medr.med_rec_cks, sizeof (med_rec_t), NULL);
4811 /* Inform the mediator hosts of the new node list */
4812 for (i = 0; i < max_meds; i++) {
4813 if (sd->sd_med.n_lst[i].a_cnt == 0)
4814 continue;
4816 if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp,
4817 &medr, ep)) {
4818 if (oha == TRUE && mdanyrpcerror(ep)) {
4819 mdclrerror(ep);
4820 continue;
4822 goto rollback;
4827 RB_TEST(27, "deletehosts", ep)
4830 * For traditional diskset:
4831 * We are deleting ourselves out of the set and we have drives to
4832 * consider; so we need to halt the set, release the drives and
4833 * reset the timeout. **** THIS IS A ONE WAY TICKET, NO ROLL BACK
4834 * IS POSSIBLE AS SOON AS THE HALT SET COMPLETES, SO THIS IS DONE
4835 * WITH ALL SIGNALS BLOCKED AND LAST ****
4837 * This situation cannot occur in a MN diskset since a node can't
4838 * delete itself unless all nodes are being deleted and a diskset
4839 * cannot contain any drives if all nodes are being deleted.
4840 * So, don't even test for this if a MN diskset.
4842 if (!(MD_MNSET_DESC(sd)) && (dd != NULL) &&
4843 strinlst(mynode(), node_c, node_v)) {
4844 /* Make sure we are blocking all signals */
4845 if (procsigs(TRUE, &oldsigs, ep) < 0) {
4846 rval = -1;
4847 goto out1;
4850 if (halt_set(sp, ep)) {
4851 rval = -1;
4852 goto out1;
4855 if (rel_own_bydd(sp, dd, FALSE, ep))
4856 rval = -1;
4858 out1:
4859 /* release signals back to what they were on entry */
4860 if (procsigs(FALSE, &oldsigs, &xep) < 0) {
4861 if (rval == 0)
4862 (void) mdstealerror(ep, &xep);
4863 rval = -1;
4867 out2:
4869 * Unlock diskset by resuming messages across the diskset.
4870 * Just resume all classes so that resume is the same whether
4871 * just one class was locked or all classes were locked.
4873 if ((suspend1_flag) || (suspendall_flag)) {
4874 /* Send resume */
4875 nd = sd->sd_nodelist;
4876 while (nd) {
4877 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4878 nd = nd->nd_next;
4879 continue;
4882 * Skip nodes being deleted if remote set
4883 * was deleted since rpc.mdcommd may no longer
4884 * be running on remote node.
4886 if ((remote_sets_deleted == 1) &&
4887 (strinlst(nd->nd_nodename, node_c, node_v))) {
4888 nd = nd->nd_next;
4889 continue;
4891 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
4892 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
4893 if (rval == 0)
4894 (void) mdstealerror(ep, &xep);
4895 rval = -1;
4896 mde_perror(ep, dgettext(TEXT_DOMAIN,
4897 "Unable to resume rpc.mdcommd.\n"));
4899 nd = nd->nd_next;
4901 meta_ping_mnset(sp->setno);
4904 cl_sk = cl_get_setkey(sp->setno, sp->setname);
4905 if (lock_flag) {
4906 if (MD_MNSET_DESC(sd)) {
4907 nd = sd->sd_nodelist;
4908 while (nd) {
4910 * During OHA mode, don't issue RPCs to
4911 * non-alive nodes since there is no reason to
4912 * wait for RPC timeouts.
4914 if ((oha == TRUE) &&
4915 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
4916 nd = nd->nd_next;
4917 continue;
4919 if (clnt_unlock_set(nd->nd_nodename,
4920 cl_sk, &xep)) {
4921 if (rval == 0)
4922 (void) mdstealerror(ep, &xep);
4923 rval = -1;
4925 nd = nd->nd_next;
4927 } else {
4928 for (i = 0; i < MD_MAXSIDES; i++) {
4929 /* Skip empty slots */
4930 if (sd->sd_nodes[i][0] == '\0')
4931 continue;
4933 if (clnt_unlock_set(sd->sd_nodes[i],
4934 cl_sk, &xep)) {
4935 if (oha == TRUE &&
4936 mdanyrpcerror(&xep)) {
4937 mdclrerror(&xep);
4938 continue;
4940 if (rval == 0)
4941 (void) mdstealerror(ep, &xep);
4942 rval = -1;
4947 cl_set_setkey(NULL);
4949 out3:
4950 metafreereplicalist(rlp);
4951 if (node_id_list)
4952 Free(node_id_list);
4954 metaflushsetname(sp);
4956 if (MD_MNSET_DESC(sd)) {
4957 /* release signals back to what they were on entry */
4958 if (procsigs(FALSE, &oldsigs, &xep) < 0)
4959 mdclrerror(&xep);
4960 } else {
4961 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
4965 return (rval);
4967 rollback:
4968 /* all signals already blocked for MN diskset */
4969 if (!(MD_MNSET_DESC(sd))) {
4970 if (procsigs(TRUE, &oldsigs, &xep) < 0)
4971 mdclrerror(&xep);
4974 rval = -1;
4976 max_genid = sd->sd_genid;
4980 * Send reinit command to rpc.mdcommd which forces it to get
4981 * fresh set description and resume all classes but class 0.
4982 * Don't send any commands to rpc.mdcommd if set on that node
4983 * has been removed.
4985 if (suspendall_flag) {
4986 /* Send reinit */
4987 nd = sd->sd_nodelist;
4988 while (nd) {
4989 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
4990 nd = nd->nd_next;
4991 continue;
4994 * If the remote set was deleted, rpc.mdcommd
4995 * may no longer be running so send nothing to it.
4997 if ((remote_sets_deleted == 1) &&
4998 (strinlst(nd->nd_nodename, node_c, node_v))) {
4999 nd = nd->nd_next;
5000 continue;
5002 /* Class is ignored for REINIT */
5003 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
5004 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
5005 mde_perror(&xep, dgettext(TEXT_DOMAIN,
5006 "Unable to reinit rpc.mdcommd.\n"));
5007 mdclrerror(&xep);
5009 nd = nd->nd_next;
5011 /* Send resume */
5012 nd = sd->sd_nodelist;
5013 while (nd) {
5014 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5015 nd = nd->nd_next;
5016 continue;
5019 * If the remote set was deleted, rpc.mdcommd
5020 * may no longer be running so send nothing to it.
5022 if ((remote_sets_deleted == 1) &&
5023 (strinlst(nd->nd_nodename, node_c, node_v))) {
5024 nd = nd->nd_next;
5025 continue;
5027 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
5028 sp, MD_MSG_CLASS0, MD_MSCF_DONT_RESUME_CLASS1,
5029 &xep)) {
5030 mde_perror(&xep, dgettext(TEXT_DOMAIN,
5031 "Unable to resume rpc.mdcommd.\n"));
5032 mdclrerror(&xep);
5034 nd = nd->nd_next;
5036 meta_ping_mnset(sp->setno);
5039 /* level 2 */
5040 if (rb_level > 1) {
5041 md_set_record *sr;
5042 md_replicalist_t *rl;
5044 recreate_set(sp, sd);
5047 * Lock out other meta* commands on nodes with the newly
5048 * re-created sets by suspending class 1 messages
5049 * across the diskset.
5051 nd = sd->sd_nodelist;
5052 while (nd) {
5053 /* Skip nodes not being deleted */
5054 if (!(strinlst(nd->nd_nodename, node_c, node_v))) {
5055 nd = nd->nd_next;
5056 continue;
5058 /* Suspend commd on nodes with re-created sets */
5059 if (clnt_mdcommdctl(nd->nd_nodename,
5060 COMMDCTL_SUSPEND, sp, MD_MSG_CLASS1,
5061 MD_MSCF_NO_FLAGS, &xep)) {
5062 mde_perror(&xep, dgettext(TEXT_DOMAIN,
5063 "Unable to suspend rpc.mdcommd.\n"));
5064 mdclrerror(&xep);
5066 nd = nd->nd_next;
5069 max_genid++;
5072 * See if we have to re-add the drives specified.
5074 for (i = 0; i < node_c; i++) {
5075 if (MD_MNSET_DESC(sd) && (oha == TRUE)) {
5077 * During OHA mode, don't issue RPCs to
5078 * non-alive nodes since there is no reason to
5079 * wait for RPC timeouts.
5081 nd = sd->sd_nodelist;
5082 while (nd) {
5083 if (strcmp(nd->nd_nodename, node_v[i])
5084 == 0) {
5085 break;
5087 nd = nd->nd_next;
5089 if (nd == 0)
5090 continue;
5091 if (!(nd->nd_flags & MD_MN_NODE_ALIVE))
5092 continue;
5095 /* Don't care if set record is MN or not */
5096 if (clnt_getset(node_v[i], sp->setname, MD_SET_BAD, &sr,
5097 &xep) == -1) {
5098 mdclrerror(&xep);
5099 continue;
5102 /* Drive already added, skip to next node */
5103 if (sr->sr_drivechain != NULL) {
5105 * Set record structure was allocated from RPC
5106 * routine getset so this structure is only of
5107 * size md_set_record even if the MN flag is
5108 * set. So, clear the flag so that the free
5109 * code doesn't attempt to free a structure
5110 * the size of md_mnset_record.
5112 sr->sr_flags &= ~MD_SR_MN;
5113 free_sr(sr);
5114 continue;
5117 if (clnt_adddrvs(node_v[i], sp, dd, sr->sr_ctime,
5118 sr->sr_genid, &xep) == -1)
5119 mdclrerror(&xep);
5121 if (clnt_upd_dr_flags(node_v[i], sp, dd, MD_DR_OK,
5122 &xep) == -1)
5123 mdclrerror(&xep);
5126 * Set record structure was allocated from RPC routine
5127 * getset so this structure is only of size
5128 * md_set_record even if the MN flag is set. So,
5129 * clear the flag so that the free code doesn't
5130 * attempt to free a structure the size of
5131 * md_mnset_record.
5133 sr->sr_flags &= ~MD_SR_MN;
5134 free_sr(sr);
5136 max_genid += 3;
5138 for (rl = rlp; rl != NULL; rl = rl->rl_next) {
5139 md_replica_t *r = rl->rl_repp;
5141 * This is not the first replica being added to the
5142 * diskset so call with ADDSIDENMS_BCAST. If this
5143 * is a traditional diskset, the bcast flag is ignored
5144 * since traditional disksets don't use the rpc.mdcommd.
5146 if (meta_db_addsidenms(sp, r->r_namep, r->r_blkno,
5147 DB_ADDSIDENMS_BCAST, &xep))
5148 mdclrerror(&xep);
5152 * Add the device names for the new sides into the namespace,
5153 * on all hosts not being deleted.
5155 if (MD_MNSET_DESC(sd)) {
5156 nd = sd->sd_nodelist;
5157 while (nd) {
5158 /* Find a node that is not being deleted */
5159 if (!strinlst(nd->nd_nodename, node_c,
5160 node_v)) {
5161 j = nd->nd_nodeid;
5162 break;
5164 nd = nd->nd_next;
5166 } else {
5167 for (j = 0; j < MD_MAXSIDES; j++) {
5168 /* Skip empty slots */
5169 if (sd->sd_nodes[j][0] == '\0')
5170 continue;
5172 /* Find a node that is not being deleted */
5173 if (!strinlst(sd->sd_nodes[j], node_c, node_v))
5174 break;
5178 if (MD_MNSET_DESC(sd)) {
5179 nd = sd->sd_nodelist;
5180 while (nd) {
5181 /* Skip nodes not being deleted */
5182 if (!strinlst(nd->nd_nodename, node_c,
5183 node_v)) {
5184 nd = nd->nd_next;
5185 continue;
5188 /* this side was just created, add the names */
5189 if (add_md_sidenms(sp, nd->nd_nodeid, j, &xep))
5190 mdclrerror(&xep);
5191 nd = nd->nd_next;
5193 } else {
5194 for (i = 0; i < MD_MAXSIDES; i++) {
5195 /* Skip empty slots */
5196 if (sd->sd_nodes[i][0] == '\0')
5197 continue;
5199 /* Skip nodes not being deleted */
5200 if (!strinlst(sd->sd_nodes[i], node_c, node_v))
5201 continue;
5203 /* this side was just created, add the names */
5204 if (add_md_sidenms(sp, i, j, &xep))
5205 mdclrerror(&xep);
5210 /* level 4 */
5211 if (rb_level > 3 && dd != NULL) {
5213 * Add the new sidename for each drive to all the hosts
5214 * Multi-node disksets only store the sidename for
5215 * that host, so there is nothing to re-add.
5217 if (!(MD_MNSET_DESC(sd))) {
5218 for (j = 0; j < MD_MAXSIDES; j++) {
5219 /* Skip empty slots */
5220 if (sd->sd_nodes[j][0] == '\0')
5221 continue;
5223 /* Skip nodes not being deleted */
5224 if (!strinlst(sd->sd_nodes[j], node_c, node_v))
5225 break;
5227 for (i = 0; i < MD_MAXSIDES; i++) {
5228 /* Skip empty slots */
5229 if (sd->sd_nodes[i][0] == '\0')
5230 continue;
5232 if (clnt_add_drv_sidenms(sd->sd_nodes[i],
5233 sd->sd_nodes[j], sp, sd, node_c, node_v,
5234 &xep))
5235 mdclrerror(&xep);
5241 /* level 5 */
5242 if ((rb_level > 4) && (!(MD_MNSET_DESC(sd)))) {
5243 /* rollback the mediator record */
5244 for (i = 0; i < max_meds; i++) {
5245 if (sd->sd_med.n_lst[i].a_cnt == 0)
5246 continue;
5248 if (clnt_med_upd_rec(&sd->sd_med.n_lst[i], sp,
5249 &rb_medr, &xep))
5250 mdclrerror(&xep);
5254 /* level 3 */
5255 if (rb_level > 2) {
5256 md_set_record *sr;
5257 md_mnset_record *mnsr;
5259 if (MD_MNSET_DESC(sd)) {
5260 nd = sd->sd_nodelist;
5262 * During OHA mode, don't issue RPCs to
5263 * non-alive nodes since there is no reason to
5264 * wait for RPC timeouts.
5266 while (nd) {
5267 if ((oha == TRUE) &&
5268 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5269 nd = nd->nd_next;
5270 continue;
5272 /* Record should be for a multi-node diskset */
5273 if (clnt_mngetset(nd->nd_nodename, sp->setname,
5274 MD_SET_BAD, &mnsr, &xep) == -1) {
5275 mdclrerror(&xep);
5276 nd = nd->nd_next;
5277 continue;
5280 has_set = 1;
5282 nr = mnsr->sr_nodechain;
5283 while (nr) {
5284 if (nd->nd_nodeid == nr->nr_nodeid) {
5285 break;
5287 nr = nr->nr_next;
5289 if (nr == NULL)
5290 has_set = 0;
5292 free_sr((struct md_set_record *)mnsr);
5293 if (has_set) {
5294 nd = nd->nd_next;
5295 continue;
5298 if (clnt_addhosts(nd->nd_nodename, sp, node_c,
5299 node_v, &xep) == -1)
5300 mdclrerror(&xep);
5302 nd = nd->nd_next;
5304 } else {
5305 for (i = 0; i < MD_MAXSIDES; i++) {
5306 /* Skip empty slots */
5307 if (sd->sd_nodes[i][0] == '\0')
5308 continue;
5310 /* Record should be for a non-multi-node set */
5311 if (clnt_getset(sd->sd_nodes[i], sp->setname,
5312 MD_SET_BAD, &sr, &xep) == -1) {
5313 mdclrerror(&xep);
5314 continue;
5318 * Set record structure was allocated from RPC
5319 * routine getset so this structure is only of
5320 * size md_set_record even if the MN flag is
5321 * set. So, clear the flag so that the free
5322 * code doesn't attempt to free a structure
5323 * the size of md_mnset_record.
5325 if (MD_MNSET_REC(sr)) {
5326 sr->sr_flags &= ~MD_SR_MN;
5327 free_sr(sr);
5328 continue;
5331 has_set = 1;
5332 for (j = 0; j < MD_MAXSIDES; j++) {
5333 /* Skip empty slots */
5334 if (sd->sd_nodes[j][0] == '\0')
5335 continue;
5337 if (sr->sr_nodes[j][0] == '\0') {
5338 has_set = 0;
5339 break;
5343 free_sr(sr);
5344 if (has_set)
5345 continue;
5347 if (clnt_addhosts(sd->sd_nodes[i], sp, node_c,
5348 node_v, &xep) == -1)
5349 mdclrerror(&xep);
5352 max_genid++;
5355 /* level 1 */
5356 if (rb_level > 0) {
5357 max_genid++;
5358 /* Sets MD_SR_OK on given nodes. */
5359 resync_genid(sp, sd, max_genid, node_c, node_v);
5362 * For MN diskset:
5363 * On each newly re-added node, set the node record for that
5364 * node to OK. Then set all node records for the newly added
5365 * nodes on all nodes to ok.
5367 * By setting a node's own node record to ok first, even if
5368 * the node re-adding the hosts panics, the rest of the nodes
5369 * can determine the same node list during the choosing of the
5370 * master during reconfig. So, only nodes considered for
5371 * mastership are nodes that have both MD_MN_NODE_OK and
5372 * MD_SR_OK set on that node's rpc.metad. If all nodes have
5373 * MD_SR_OK set, but no node has its own MD_MN_NODE_OK set,
5374 * then the set will be removed during reconfig since a panic
5375 * occurred during the re-creation of the deletion of
5376 * the initial diskset.
5378 if (MD_MNSET_DESC(sd)) {
5379 md_mnnode_desc *saved_nd_next;
5380 if (dd != NULL) {
5382 * Notify rpc.mdcommd on all nodes of a
5383 * nodelist change. Start by suspending
5384 * rpc.mdcommd (which drains it of all
5385 * messages), then change the nodelist
5386 * followed by a reinit and resume.
5388 nd = sd->sd_nodelist;
5389 while (nd) {
5390 if (!(nd->nd_flags &
5391 MD_MN_NODE_ALIVE)) {
5392 nd = nd->nd_next;
5393 continue;
5395 if (clnt_mdcommdctl(nd->nd_nodename,
5396 COMMDCTL_SUSPEND, sp,
5397 MD_MSG_CLASS0,
5398 MD_MSCF_NO_FLAGS, &xep)) {
5399 mde_perror(&xep,
5400 dgettext(TEXT_DOMAIN,
5401 "Unable to suspend "
5402 "rpc.mdcommd.\n"));
5403 mdclrerror(&xep);
5405 suspendall_flag_rb = 1;
5406 nd = nd->nd_next;
5409 for (i = 0; i < node_c; i++) {
5411 * During OHA mode, don't issue RPCs to
5412 * non-alive nodes since there is no reason to
5413 * wait for RPC timeouts.
5415 nd = sd->sd_nodelist;
5416 while (nd) {
5417 if (strcmp(nd->nd_nodename, node_v[i])
5418 == 0)
5419 break;
5420 nd = nd->nd_next;
5422 /* Something wrong, finish this in next loop */
5423 if (nd == NULL)
5424 continue;
5426 if ((oha == TRUE) &&
5427 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5428 continue;
5431 if (dd != NULL) {
5432 /* Set master on re-joining node. */
5433 if (clnt_mnsetmaster(node_v[i], sp,
5434 sd->sd_mn_master_nodenm,
5435 sd->sd_mn_master_nodeid, &xep)) {
5436 mdclrerror(&xep);
5440 * Re-join set to same state as
5441 * before - stale or non-stale.
5443 if (clnt_joinset(node_v[i], sp,
5444 stale_flag, &xep)) {
5445 mdclrerror(&xep);
5449 /* Only changing my local cache of node list */
5450 saved_nd_next = nd->nd_next;
5451 nd->nd_next = NULL;
5453 /* Set record for host to ok on that host */
5454 if (clnt_upd_nr_flags(node_v[i], sp,
5455 nd, MD_NR_OK, NULL, &xep)) {
5456 mdclrerror(&xep);
5458 nd->nd_next = saved_nd_next;
5461 /* Now set all node records on all nodes to be ok */
5462 nd = sd->sd_nodelist;
5463 while (nd) {
5465 * During OHA mode, don't issue RPCs to
5466 * non-alive nodes since there is no reason to
5467 * wait for RPC timeouts.
5469 if ((oha == TRUE) &&
5470 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5471 nd = nd->nd_next;
5472 continue;
5474 if (clnt_upd_nr_flags(nd->nd_nodename, sp,
5475 sd->sd_nodelist, MD_NR_OK, NULL, &xep)) {
5476 mdclrerror(&xep);
5478 nd = nd->nd_next;
5484 * Notify rpc.mdcommd on all nodes of a nodelist change.
5485 * Send reinit command to mdcommd which forces it to get
5486 * fresh set description.
5488 if (suspendall_flag_rb) {
5489 /* Send reinit */
5490 nd = sd->sd_nodelist;
5491 while (nd) {
5492 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5493 nd = nd->nd_next;
5494 continue;
5497 /* Class is ignored for REINIT */
5498 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_REINIT,
5499 sp, NULL, MD_MSCF_NO_FLAGS, &xep)) {
5500 mde_perror(&xep, dgettext(TEXT_DOMAIN,
5501 "Unable to reinit rpc.mdcommd.\n"));
5502 mdclrerror(&xep);
5504 nd = nd->nd_next;
5509 * Unlock diskset by resuming messages across the diskset.
5510 * Just resume all classes so that resume is the same whether
5511 * just one class was locked or all classes were locked.
5513 if ((suspend1_flag) || (suspendall_flag) || (suspendall_flag_rb)) {
5514 /* Send resume */
5515 nd = sd->sd_nodelist;
5516 while (nd) {
5517 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
5518 nd = nd->nd_next;
5519 continue;
5521 if (clnt_mdcommdctl(nd->nd_nodename, COMMDCTL_RESUME,
5522 sp, MD_MSG_CLASS0, MD_MSCF_NO_FLAGS, &xep)) {
5523 mde_perror(&xep, dgettext(TEXT_DOMAIN,
5524 "Unable to resume rpc.mdcommd.\n"));
5526 nd = nd->nd_next;
5528 meta_ping_mnset(sp->setno);
5532 * Start a resync thread on the re-added nodes
5533 * if set is not stale. Also start a thread to update the
5534 * abr state of all soft partitions
5536 if (stale_flag != MNSET_IS_STALE) {
5537 for (i = 0; i < node_c; i++) {
5539 * During OHA mode, don't issue RPCs to
5540 * non-alive nodes since there is no reason to
5541 * wait for RPC timeouts.
5543 nd = sd->sd_nodelist;
5544 while (nd) {
5545 if (strcmp(nd->nd_nodename, node_v[i])
5546 == 0)
5547 break;
5548 nd = nd->nd_next;
5550 if (nd == NULL)
5551 continue;
5553 if ((oha == TRUE) &&
5554 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5555 continue;
5558 if (dd != 0) {
5559 if (clnt_mn_mirror_resync_all(node_v[i],
5560 sp->setno, &xep)) {
5561 mde_perror(ep, dgettext(TEXT_DOMAIN,
5562 "Unable to start resync "
5563 "thread.\n"));
5565 if (clnt_mn_sp_update_abr(node_v[i],
5566 sp->setno, &xep)) {
5567 mde_perror(ep, dgettext(TEXT_DOMAIN,
5568 "Unable to start sp update "
5569 "thread.\n"));
5575 /* level 0 */
5576 cl_sk = cl_get_setkey(sp->setno, sp->setname);
5577 /* Don't test lock flag since guaranteed to be set if in rollback */
5578 if (MD_MNSET_DESC(sd)) {
5579 nd = sd->sd_nodelist;
5580 while (nd) {
5582 * During OHA mode, don't issue RPCs to
5583 * non-alive nodes since there is no reason to
5584 * wait for RPC timeouts.
5586 if ((oha == TRUE) &&
5587 (!(nd->nd_flags & MD_MN_NODE_ALIVE))) {
5588 nd = nd->nd_next;
5589 continue;
5591 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep))
5592 mdclrerror(&xep);
5593 nd = nd->nd_next;
5595 } else {
5596 for (i = 0; i < MD_MAXSIDES; i++) {
5597 /* Skip empty slots */
5598 if (sd->sd_nodes[i][0] == '\0')
5599 continue;
5601 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
5602 mdclrerror(&xep);
5605 cl_set_setkey(NULL);
5607 /* release signals back to what they were on entry */
5608 if (procsigs(FALSE, &oldsigs, &xep) < 0)
5609 mdclrerror(&xep);
5611 metafreereplicalist(rlp);
5612 if (node_id_list)
5613 Free(node_id_list);
5615 metaflushsetname(sp);
5617 if (!(MD_MNSET_DESC(sd))) {
5618 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
5621 return (rval);
5625 meta_set_auto_take(
5626 mdsetname_t *sp,
5627 int take_val,
5628 md_error_t *ep
5631 int i;
5632 md_set_desc *sd;
5633 int rval = 0;
5634 md_setkey_t *cl_sk;
5635 md_error_t xep = mdnullerror;
5636 char *hostname;
5637 md_drive_desc *dd;
5639 if ((sd = metaget_setdesc(sp, ep)) == NULL)
5640 return (-1);
5642 /* Make sure we own the set */
5643 if (meta_check_ownership(sp, ep) != 0)
5644 return (-1);
5646 hostname = mynode();
5648 /* Lock the set on our side */
5649 if (clnt_lock_set(hostname, sp, ep)) {
5650 rval = -1;
5651 goto out;
5654 if (take_val) {
5655 /* enable auto_take but only if it is not already set */
5656 if (! (sd->sd_flags & MD_SR_AUTO_TAKE)) {
5657 /* verify that we're the only host in the set */
5658 for (i = 0; i < MD_MAXSIDES; i++) {
5659 if (sd->sd_nodes[i] == NULL ||
5660 sd->sd_nodes[i][0] == '\0')
5661 continue;
5663 if (strcmp(sd->sd_nodes[i], hostname) != 0) {
5664 (void) mddserror(ep, MDE_DS_SINGLEHOST,
5665 sp->setno, NULL, NULL, sp->setname);
5666 rval = -1;
5667 goto out;
5671 if (clnt_enable_sr_flags(hostname, sp,
5672 MD_SR_AUTO_TAKE, ep))
5673 rval = -1;
5675 /* Disable SCSI reservations */
5676 if (sd->sd_flags & MD_SR_MB_DEVID)
5677 dd = metaget_drivedesc(sp, MD_BASICNAME_OK |
5678 PRINT_FAST, &xep);
5679 else
5680 dd = metaget_drivedesc(sp, MD_BASICNAME_OK,
5681 &xep);
5683 if (! mdisok(&xep))
5684 mdclrerror(&xep);
5686 if (dd != NULL) {
5687 if (rel_own_bydd(sp, dd, TRUE, &xep))
5688 mdclrerror(&xep);
5692 } else {
5693 /* disable auto_take, if set, or error */
5694 if (sd->sd_flags & MD_SR_AUTO_TAKE) {
5695 if (clnt_disable_sr_flags(hostname, sp,
5696 MD_SR_AUTO_TAKE, ep))
5697 rval = -1;
5699 /* Enable SCSI reservations */
5700 if (sd->sd_flags & MD_SR_MB_DEVID)
5701 dd = metaget_drivedesc(sp, MD_BASICNAME_OK |
5702 PRINT_FAST, &xep);
5703 else
5704 dd = metaget_drivedesc(sp, MD_BASICNAME_OK,
5705 &xep);
5707 if (! mdisok(&xep))
5708 mdclrerror(&xep);
5710 if (dd != NULL) {
5711 mhd_mhiargs_t mhiargs = defmhiargs;
5713 if (tk_own_bydd(sp, dd, &mhiargs, TRUE, &xep))
5714 mdclrerror(&xep);
5716 } else {
5717 (void) mddserror(ep, MDE_DS_AUTONOTSET, sp->setno,
5718 NULL, NULL, sp->setname);
5719 rval = -1;
5723 out:
5724 cl_sk = cl_get_setkey(sp->setno, sp->setname);
5725 if (clnt_unlock_set(hostname, cl_sk, &xep)) {
5726 if (rval == 0)
5727 (void) mdstealerror(ep, &xep);
5728 rval = -1;
5730 cl_set_setkey(NULL);
5732 return (rval);