4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #pragma ident "%Z%%M% %I% %E% SMI"
29 * Metadevice diskset interfaces
33 #include <mdmn_changelog.h>
34 #include "meta_set_prv.h"
35 #include "meta_repartition.h"
38 check_setnodes_againstdrivelist(
40 mddrivenamelist_t
*dnlp
,
49 if ((sd
= metaget_setdesc(sp
, ep
)) == NULL
)
52 if (MD_MNSET_DESC(sd
)) {
55 if (!(nd
->nd_flags
& MD_MN_NODE_ALIVE
)) {
59 for (p
= dnlp
; p
!= NULL
; p
= p
->next
)
60 if (checkdrive_onnode(sp
, p
->drivenamep
,
66 for (i
= 0; i
< MD_MAXSIDES
; i
++) {
67 /* Skip empty slots */
68 if (sd
->sd_nodes
[i
][0] == '\0')
71 for (p
= dnlp
; p
!= NULL
; p
= p
->next
)
72 if (checkdrive_onnode(sp
, p
->drivenamep
,
81 drvsuniq(mdsetname_t
*sp
, mddrivenamelist_t
*dnlp
, md_error_t
*ep
)
83 mddrivenamelist_t
*dl1
, *dl2
;
84 mddrivename_t
*dn1
, *dn2
;
86 for (dl1
= dnlp
; dl1
!= NULL
; dl1
= dl1
->next
) {
87 dn1
= dl1
->drivenamep
;
89 for (dl2
= dl1
->next
; dl2
!= NULL
; dl2
= dl2
->next
) {
90 dn2
= dl2
->drivenamep
;
91 if (strcmp(dn1
->cname
, dn2
->cname
) != 0)
94 return (mddserror(ep
, MDE_DS_DUPDRIVE
, sp
->setno
,
95 NULL
, dn1
->cname
, sp
->setname
));
101 static md_drive_desc
*
102 metaget_drivedesc_fromdrivelist(
104 mddrivenamelist_t
*dnlp
,
109 mddrivenamelist_t
*p
;
110 md_drive_desc
*dd
= NULL
;
113 if ((sd
= metaget_setdesc(sp
, ep
)) == NULL
)
116 for (p
= dnlp
; p
!= NULL
; p
= p
->next
) {
117 (void) metadrivedesc_append(&dd
, p
->drivenamep
, 0, 0,
118 sd
->sd_ctime
, sd
->sd_genid
, flags
);
125 * Exported Entry Points
129 meta_make_sidenmlist(
132 int import_flag
, /* flags partial import */
133 md_im_drive_info_t
*midp
, /* import drive information */
137 mdsidenames_t
*sn
, **sn_next
;
140 side_t sideno
= MD_SIDEWILD
;
146 * Normal (aka NOT partial import) code path.
148 if (meta_replicaslice(dnp
, &rep_slice
, ep
) != 0) {
152 dnp
->side_names_key
= MD_KEYWILD
;
154 if ((np
= metaslicename(dnp
, rep_slice
, ep
)) == NULL
)
156 bname
= Strdup(np
->bname
);
159 * When doing a partial import, we'll get the needed
160 * information from somewhere other than the system.
162 dnp
->side_names_key
= MD_KEYWILD
;
163 bname
= Strdup(midp
->mid_devname
);
165 metaflushsidenames(dnp
);
166 sn_next
= &dnp
->side_names
;
169 sn
= Zalloc(sizeof (*sn
));
171 if ((done
= meta_getnextside_devinfo(sp
, bname
, &sideno
,
172 &sn
->cname
, &sn
->dname
, &sn
->mnum
, ep
)) == -1) {
175 sn
->dname
= Strdup(midp
->mid_driver_name
);
176 sn
->mnum
= midp
->mid_mnum
;
192 /* Add to the end of the linked list */
193 assert(*sn_next
== NULL
);
203 mddrivenamelist_t
*dnlp
,
210 md_drive_desc
*dd
= NULL
, *curdd
= NULL
, *ddp
;
212 mddrivenamelist_t
*p
;
213 mhd_mhiargs_t mhiargs
;
218 ulong_t max_genid
= 0;
221 md_error_t xep
= mdnullerror
;
223 int suspendall_flag
= 0;
224 int suspend1_flag
= 0;
226 int flush_set_onerr
= 0;
227 md_replicalist_t
*rlp
= NULL
, *rl
;
229 if ((sd
= metaget_setdesc(sp
, ep
)) == NULL
)
232 /* Make sure we own the set */
233 if (meta_check_ownership(sp
, ep
) != 0)
237 * The drive and node records are stored in the local mddbs of each
238 * node in the diskset. Each node's rpc.metad daemon reads in the set,
239 * drive and node records from that node's local mddb and caches them
240 * internally. Any process needing diskset information contacts its
241 * local rpc.metad to get this information. Since each node in the
242 * diskset is independently reading the set information from its local
243 * mddb, the set, drive and node records in the local mddbs must stay
244 * in-sync, so that all nodes have a consistent view of the diskset.
246 * For a multinode diskset, explicitly verify that all nodes in the
247 * diskset are ALIVE (i.e. are in the API membership list). Otherwise,
248 * fail this operation since all nodes must be ALIVE in order to add
249 * the new drive record to their local mddb. If a panic of this node
250 * leaves the local mddbs set, node and drive records out-of-sync, the
251 * reconfig cycle will fix the local mddbs and force them back into
254 if (MD_MNSET_DESC(sd
)) {
255 nd
= sd
->sd_nodelist
;
257 if (!(nd
->nd_flags
& MD_MN_NODE_ALIVE
)) {
258 (void) mddserror(ep
, MDE_DS_NOTINMEMBERLIST
,
260 nd
->nd_nodename
, NULL
, sp
->setname
);
267 if (drvsuniq(sp
, dnlp
, ep
) == -1)
271 * Lock the set on current set members.
272 * Set locking done much earlier for MN diskset than for traditional
273 * diskset since lock_set and SUSPEND are used to protect against
274 * other meta* commands running on the other nodes.
276 if (MD_MNSET_DESC(sd
)) {
277 /* Make sure we are blocking all signals */
278 if (procsigs(TRUE
, &oldsigs
, &xep
) < 0)
281 nd
= sd
->sd_nodelist
;
282 /* All nodes are guaranteed to be ALIVE */
284 if (clnt_lock_set(nd
->nd_nodename
, sp
, ep
)) {
292 * Lock out other meta* commands by suspending
293 * class 1 messages across the diskset.
295 nd
= sd
->sd_nodelist
;
296 /* All nodes are guaranteed to be ALIVE */
298 if (clnt_mdcommdctl(nd
->nd_nodename
,
299 COMMDCTL_SUSPEND
, sp
, MD_MSG_CLASS1
,
300 MD_MSCF_NO_FLAGS
, ep
)) {
309 if (check_setnodes_againstdrivelist(sp
, dnlp
, ep
)) {
314 for (p
= dnlp
; p
!= NULL
; p
= p
->next
) {
317 if (meta_is_drive_in_anyset(p
->drivenamep
, &tmp
, FALSE
,
324 (void) mddserror(ep
, MDE_DS_DRIVEINSET
, sp
->setno
,
325 tmp
->setname
, p
->drivenamep
->cname
, sp
->setname
);
334 * This is a separate loop (from above) so that we validate all the
335 * drives handed to us before we repartition any one drive.
337 for (p
= dnlp
; p
!= NULL
; p
= p
->next
) {
338 if (meta_repartition_drive(sp
,
339 p
->drivenamep
, force_label
== TRUE
? MD_REPART_FORCE
: 0,
340 NULL
, /* Don't return the VTOC. */
346 * Create the names for the drives we are adding per side.
348 if (meta_make_sidenmlist(sp
, p
->drivenamep
, 0, NULL
,
356 * Get the list of drives descriptors that we are adding.
358 dd
= metaget_drivedesc_fromdrivelist(sp
, dnlp
, MD_DR_ADD
, ep
);
366 * Get the set timeout information.
368 (void) memset(&mhiargs
, '\0', sizeof (mhiargs
));
369 if (clnt_gtimeout(mynode(), sp
, &mhiargs
, ep
) == -1) {
375 * Get timestamp and generation id for new records
378 genid
= sd
->sd_genid
;
381 /* At this point, in case of error, set should be flushed. */
384 /* Lock the set on current set members */
385 if (!(MD_MNSET_DESC(sd
))) {
386 md_rb_sig_handling_on();
387 for (i
= 0; i
< MD_MAXSIDES
; i
++) {
388 /* Skip empty slots */
389 if (sd
->sd_nodes
[i
][0] == '\0')
392 if (clnt_lock_set(sd
->sd_nodes
[i
], sp
, ep
)) {
401 * Get drive descriptors for the drives that are currently in the set.
403 curdd
= metaget_drivedesc(sp
, MD_FULLNAME_ONLY
, ep
);
408 * If first drive being added to set, set the mastership
409 * of the multinode diskset to be this node.
410 * Only set it on this node. If all goes well
411 * and there are no errors, the mastership of this node will be set
412 * on all nodes in user space and in the kernel.
414 if ((MD_MNSET_DESC(sd
)) && (curdd
== NULL
)) {
415 if (clnt_mnsetmaster(mynode(), sp
,
416 sd
->sd_mn_mynode
->nd_nodename
,
417 sd
->sd_mn_mynode
->nd_nodeid
, ep
)) {
421 * Set this up in my local cache of the set desc so that
422 * the set descriptor won't have to be gotten again from
423 * rpc.metad. If it is flushed and gotten again, these
424 * values will be set in sr2setdesc.
426 sd
->sd_mn_master_nodeid
= sd
->sd_mn_mynode
->nd_nodeid
;
427 (void) strcpy(sd
->sd_mn_master_nodenm
,
428 sd
->sd_mn_mynode
->nd_nodename
);
429 sd
->sd_mn_am_i_master
= 1;
432 RB_TEST(1, "adddrives", ep
)
435 rb_level
= 1; /* level 1 */
437 RB_TEST(2, "adddrives", ep
)
440 * Add the drive records for the drives that we are adding to
441 * each host in the set. Marks the drive as MD_DR_ADD.
443 if (MD_MNSET_DESC(sd
)) {
444 nd
= sd
->sd_nodelist
;
445 /* All nodes are guaranteed to be ALIVE */
447 if (clnt_adddrvs(nd
->nd_nodename
, sp
, dd
, now
, genid
,
451 RB_TEST(3, "adddrives", ep
)
455 for (i
= 0; i
< MD_MAXSIDES
; i
++) {
456 /* Skip empty slots */
457 if (sd
->sd_nodes
[i
][0] == '\0')
460 if (clnt_adddrvs(sd
->sd_nodes
[i
], sp
, dd
, now
, genid
,
464 RB_TEST(3, "adddrives", ep
)
468 RB_TEST(4, "adddrives", ep
)
471 rb_level
= 2; /* level 2 */
473 RB_TEST(5, "adddrives", ep
)
476 * Take ownership of the added drives.
478 if (!(MD_MNSET_DESC(sd
)) && !MD_ATSET_DESC(sd
)) {
479 if (tk_own_bydd(sp
, dd
, &mhiargs
, TRUE
, ep
))
484 * If this is not a MN set and the state flags do not indicate the
485 * presence of devids, update the set records on all nodes.
487 if (!(sd
->sd_flags
& MD_SR_MB_DEVID
) && !(MD_MNSET_DESC(sd
))) {
488 if (meta_update_mb(sp
, dd
, ep
) == 0) {
491 /* update the sr_flags on all hosts */
492 for (i
= 0; i
< MD_MAXSIDES
; i
++) {
493 if (sd
->sd_nodes
[i
][0] == '\0')
496 if (clnt_upd_sr_flags(sd
->sd_nodes
[i
],
497 sp
, (sd
->sd_flags
| MD_SR_MB_DEVID
), ep
))
503 RB_TEST(6, "adddrives", ep
)
506 rb_level
= 3; /* level 3 */
508 RB_TEST(7, "adddrives", ep
)
511 * Balance the DB's according to the list of existing drives and the
512 * list of added drives.
514 if ((rval
= meta_db_balance(sp
, dd
, curdd
, dbsize
, ep
)) == -1)
518 * Slam a dummy master block on all the disks that we are adding
519 * that don't have replicas on them.
520 * Used by diskset import if the disksets are remotely replicated
522 if (metareplicalist(sp
, MD_BASICNAME_OK
, &rlp
, ep
) >= 0) {
523 for (ddp
= dd
; ddp
!= NULL
; ddp
= ddp
->dd_next
) {
529 drive_name
= ddp
->dd_dnp
->cname
;
531 for (rl
= rlp
; rl
!= NULL
; rl
= rl
->rl_next
) {
535 rl
->rl_repp
->r_namep
->drivenamep
->cname
;
537 if (strcmp(drive_name
, rep_name
) == 0) {
539 * Disk has a replica on it so don't
540 * add dummy master block.
547 * Drive doesn't have a replica on it so
548 * we need a dummy master block. Add it.
550 if (meta_replicaslice(ddp
->dd_dnp
, &rep_slice
,
556 if ((np
= metaslicename(ddp
->dd_dnp
, rep_slice
,
562 if ((fd
= open(np
->rname
, O_RDWR
)) >= 0) {
563 meta_mkdummymaster(sp
, fd
, 16);
570 if ((curdd
== NULL
) && (MD_MNSET_DESC(sd
))) {
572 * Notify rpc.mdcommd on all nodes of a nodelist change.
573 * Start by suspending rpc.mdcommd (which drains it of all
574 * messages), then change the nodelist followed by a reinit
577 nd
= sd
->sd_nodelist
;
578 /* All nodes are guaranteed to be ALIVE */
580 if (clnt_mdcommdctl(nd
->nd_nodename
, COMMDCTL_SUSPEND
,
581 sp
, MD_MSG_CLASS0
, MD_MSCF_NO_FLAGS
, ep
)) {
591 * If a MN diskset and this is the first disk(s) being added
592 * to set, then pre-allocate change log records here.
593 * When the other nodes are joined into the MN diskset, the
594 * USER records will just be snarfed in.
596 if ((MD_MNSET_DESC(sd
)) && (curdd
== NULL
)) {
597 if (mdmn_allocate_changelog(sp
, ep
) != 0)
602 * Mark the drives MD_DR_OK.
603 * If first drive being added to MN diskset, then set
604 * master on all nodes to be this node and then join
605 * all alive nodes (nodes in membership list) to set.
607 if (MD_MNSET_DESC(sd
)) {
608 nd
= sd
->sd_nodelist
;
609 /* All nodes are guaranteed to be ALIVE */
611 /* don't set master on this node - done earlier */
612 if ((curdd
== NULL
) && (nd
->nd_nodeid
!=
613 sd
->sd_mn_mynode
->nd_nodeid
)) {
615 * Set master on all alive nodes since
616 * all alive nodes will become joined nodes.
618 if (clnt_mnsetmaster(nd
->nd_nodename
, sp
,
619 sd
->sd_mn_mynode
->nd_nodename
,
620 sd
->sd_mn_mynode
->nd_nodeid
, ep
)) {
627 * No special flags for join set. Since
628 * all nodes are joining if 1st drive is being
629 * added to set then all nodes will be either
630 * STALE or non-STALE and each node can
631 * determine this on its own.
633 if (clnt_joinset(nd
->nd_nodename
, sp
,
637 /* Sets join node flag on all nodes in list */
638 if (clnt_upd_nr_flags(nd
->nd_nodename
, sp
,
639 sd
->sd_nodelist
, MD_NR_JOIN
, NULL
, ep
)) {
645 * Set MD_DR_OK as last thing before unlock.
646 * In case of panic on this node, recovery
647 * code can check for MD_DR_OK to determine
650 if (clnt_upd_dr_flags(nd
->nd_nodename
, sp
, dd
,
655 RB_TEST(8, "adddrives", ep
)
659 for (i
= 0; i
< MD_MAXSIDES
; i
++) {
660 /* Skip empty slots */
661 if (sd
->sd_nodes
[i
][0] == '\0')
664 if (clnt_upd_dr_flags(sd
->sd_nodes
[i
], sp
, dd
, MD_DR_OK
,
668 RB_TEST(8, "adddrives", ep
)
672 RB_TEST(9, "adddrives", ep
)
676 * Notify rpc.mdcommd on all nodes of a nodelist change.
677 * Send reinit command to mdcommd which forces it to get
678 * fresh set description.
680 if (suspendall_flag
) {
682 nd
= sd
->sd_nodelist
;
683 /* All nodes are guaranteed to be ALIVE */
685 /* Class is ignored for REINIT */
686 if (clnt_mdcommdctl(nd
->nd_nodename
, COMMDCTL_REINIT
,
687 sp
, NULL
, MD_MSCF_NO_FLAGS
, &xep
)) {
689 (void) mdstealerror(ep
, &xep
);
691 mde_perror(ep
, dgettext(TEXT_DOMAIN
,
692 "Unable to reinit rpc.mdcommd.\n"));
698 * Unlock diskset by resuming messages across the diskset.
699 * Just resume all classes so that resume is the same whether
700 * just one class was locked or all classes were locked.
702 if ((suspend1_flag
) || (suspendall_flag
)) {
703 nd
= sd
->sd_nodelist
;
704 /* All nodes are guaranteed to be ALIVE */
706 if (clnt_mdcommdctl(nd
->nd_nodename
, COMMDCTL_RESUME
,
707 sp
, MD_MSG_CLASS0
, MD_MSCF_NO_FLAGS
, &xep
)) {
709 (void) mdstealerror(ep
, &xep
);
711 mde_perror(ep
, dgettext(TEXT_DOMAIN
,
712 "Unable to resume rpc.mdcommd.\n"));
716 meta_ping_mnset(sp
->setno
);
720 cl_sk
= cl_get_setkey(sp
->setno
, sp
->setname
);
721 if (MD_MNSET_DESC(sd
)) {
722 nd
= sd
->sd_nodelist
;
723 /* All nodes are guaranteed to be ALIVE */
725 if (clnt_unlock_set(nd
->nd_nodename
,
728 (void) mdstealerror(ep
, &xep
);
734 for (i
= 0; i
< MD_MAXSIDES
; i
++) {
735 /* Skip empty slots */
736 if (sd
->sd_nodes
[i
][0] == '\0')
739 if (clnt_unlock_set(sd
->sd_nodes
[i
],
742 (void) mdstealerror(ep
, &xep
);
750 metafreedrivedesc(&dd
);
752 if (flush_set_onerr
) {
753 metaflushsetname(sp
);
754 if (!(MD_MNSET_DESC(sd
))) {
755 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
759 if (MD_MNSET_DESC(sd
)) {
760 /* release signals back to what they were on entry */
761 if (procsigs(FALSE
, &oldsigs
, &xep
) < 0)
768 /* all signals already blocked for MN diskset */
769 if (!(MD_MNSET_DESC(sd
))) {
770 /* Make sure we are blocking all signals */
771 if (procsigs(TRUE
, &oldsigs
, &xep
) < 0)
777 max_genid
= sd
->sd_genid
;
782 * Since the add drive operation is failing, need
783 * to reset config back to the way it was
784 * before the add drive operation.
785 * If a MN diskset and this is the first drive being added,
786 * then reset master on all ALIVE nodes (which is all nodes)
787 * since the master would have not been set previously.
788 * Don't reset master on this node, since this
790 * This is ok to fail since next node to add first
791 * disk to diskset will also set the master on all nodes.
793 * Also, if this is the first drive being added,
794 * need to have each node withdraw itself from the set.
796 if ((MD_MNSET_DESC(sd
)) && (curdd
== NULL
)) {
797 nd
= sd
->sd_nodelist
;
798 /* All nodes are guaranteed to be ALIVE */
801 * Be careful with ordering in case of
802 * panic between the steps and the
803 * effect on recovery during reconfig.
805 if (clnt_withdrawset(nd
->nd_nodename
, sp
, &xep
))
808 /* Sets withdraw flag on all nodes in list */
809 if (clnt_upd_nr_flags(nd
->nd_nodename
, sp
,
810 sd
->sd_nodelist
, MD_NR_WITHDRAW
,
817 sd
->sd_mn_mynode
->nd_nodeid
) {
821 /* Reset master on all of the other nodes. */
822 if (clnt_mnsetmaster(nd
->nd_nodename
, sp
,
823 "", MD_MN_INVALID_NID
, &xep
))
831 * Send resume command to mdcommd. Don't send reinit command
832 * since nodelist should not have changed.
833 * If suspendall_flag is set, then user would have been adding
834 * first drives to set. Since this failed, there is certainly
835 * no reinit message to send to rpc.mdcommd since no nodes will
836 * be joined to set at the end of this metaset command.
838 if (suspendall_flag
) {
840 nd
= sd
->sd_nodelist
;
841 /* All nodes are guaranteed to be ALIVE */
844 * Resume all classes but class 1 so that lock is held
845 * against meta* commands.
846 * To later resume class1, must issue a class0 resume.
848 if (clnt_mdcommdctl(nd
->nd_nodename
, COMMDCTL_RESUME
,
850 MD_MSCF_DONT_RESUME_CLASS1
, &xep
)) {
851 mde_perror(&xep
, dgettext(TEXT_DOMAIN
,
852 "Unable to resume rpc.mdcommd.\n"));
857 meta_ping_mnset(sp
->setno
);
865 for (ddp
= dd
; ddp
!= NULL
; ddp
= ddp
->dd_next
) {
868 if ((meta_replicaslice(ddp
->dd_dnp
,
869 &rep_slice
, &xep
) != 0) ||
870 ((np
= metaslicename(ddp
->dd_dnp
, rep_slice
,
876 (void) metanamelist_append(&nlp
, np
);
878 if (meta_db_detach(sp
, nlp
,
879 (MDFORCE_DS
| MDFORCE_SET_LOCKED
), NULL
, &xep
))
882 metafreenamelist(nlp
);
886 if (meta_db_balance(sp
, NULL
, curdd
, 0, &xep
) == -1)
889 /* Only if we are adding the first drive */
890 /* Handled MN diskset above. */
891 if ((curdd
== NULL
) && !(MD_MNSET_DESC(sd
))) {
892 if (clnt_stimeout(mynode(), sp
, &defmhiargs
,
896 /* This is needed because of a corner case */
897 if (halt_set(sp
, &xep
))
905 if (!(MD_MNSET_DESC(sd
)) && !MD_ATSET_DESC(sd
)) {
906 if (rel_own_bydd(sp
, dd
, TRUE
, &xep
))
913 if (MD_MNSET_DESC(sd
)) {
914 nd
= sd
->sd_nodelist
;
915 /* All nodes are guaranteed to be ALIVE */
917 if (clnt_deldrvs(nd
->nd_nodename
, sp
, dd
,
923 for (i
= 0; i
< MD_MAXSIDES
; i
++) {
924 /* Skip empty slots */
925 if (sd
->sd_nodes
[i
][0] == '\0')
928 if (clnt_deldrvs(sd
->sd_nodes
[i
], sp
, dd
,
934 resync_genid(sp
, sd
, max_genid
, 0, NULL
);
937 if ((suspend1_flag
) || (suspendall_flag
)) {
939 nd
= sd
->sd_nodelist
;
940 /* All nodes are guaranteed to be ALIVE */
943 * Just resume all classes so that resume is the
944 * same whether just one class was locked or all
945 * classes were locked.
947 if (clnt_mdcommdctl(nd
->nd_nodename
, COMMDCTL_RESUME
,
948 sp
, MD_MSG_CLASS0
, MD_MSCF_NO_FLAGS
, &xep
)) {
953 meta_ping_mnset(sp
->setno
);
957 cl_sk
= cl_get_setkey(sp
->setno
, sp
->setname
);
958 /* Don't test lock flag since guaranteed to be set if in rollback */
959 if (MD_MNSET_DESC(sd
)) {
961 * Since the add drive operation is failing, need
962 * to reset config back to the way it was
963 * before the add drive operation.
964 * If a MN diskset and this is the first drive being
965 * added, then reset master on this node since
966 * the master would have not been set previously.
967 * This is ok to fail since next node to add first
968 * disk to diskset will also set the master on all nodes.
971 /* Reset master on mynode */
972 if (clnt_mnsetmaster(mynode(), sp
, "",
973 MD_MN_INVALID_NID
, &xep
))
976 nd
= sd
->sd_nodelist
;
977 /* All nodes are guaranteed to be ALIVE */
979 if (clnt_unlock_set(nd
->nd_nodename
, cl_sk
, &xep
))
984 for (i
= 0; i
< MD_MAXSIDES
; i
++) {
985 /* Skip empty slots */
986 if (sd
->sd_nodes
[i
][0] == '\0')
989 if (clnt_unlock_set(sd
->sd_nodes
[i
], cl_sk
, &xep
))
995 /* release signals back to what they were on entry */
996 if (procsigs(FALSE
, &oldsigs
, &xep
) < 0)
999 metafreedrivedesc(&dd
);
1001 if (flush_set_onerr
) {
1002 metaflushsetname(sp
);
1003 if (!(MD_MNSET_DESC(sd
))) {
1004 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1012 * Add drives routine used during import of a diskset.
1015 meta_imp_set_adddrives(
1017 mddrivenamelist_t
*dnlp
,
1018 md_im_set_desc_t
*misp
,
1023 mddrivenamelist_t
*p
;
1024 md_drive_desc
*dd
= NULL
, *ddp
;
1025 int flush_set_onerr
= 0;
1028 mhd_mhiargs_t mhiargs
;
1029 md_im_replica_info_t
*mirp
;
1030 md_im_drive_info_t
*midp
;
1033 ulong_t max_genid
= 0;
1035 md_error_t xep
= mdnullerror
;
1037 if ((sd
= metaget_setdesc(sp
, ep
)) == NULL
)
1040 for (p
= dnlp
; p
!= NULL
; p
= p
->next
) {
1044 * If we have a partial diskset, meta_make_sidenmlist will
1045 * need information from midp to complete making the
1046 * side name structure.
1048 if (misp
->mis_partial
) {
1049 imp_flag
= MDDB_C_IMPORT
;
1050 for (midp
= misp
->mis_drives
; midp
!= NULL
;
1051 midp
= midp
->mid_next
) {
1052 if (midp
->mid_dnp
== p
->drivenamep
)
1056 (void) mddserror(ep
, MDE_DS_SETNOTIMP
,
1057 MD_SET_BAD
, mynode(), NULL
, sp
->setname
);
1063 * Create the names for the drives we are adding per side.
1065 if (meta_make_sidenmlist(sp
, p
->drivenamep
, imp_flag
,
1073 * Get the list of drives descriptors that we are adding.
1075 dd
= metaget_drivedesc_fromdrivelist(sp
, dnlp
, MD_DR_ADD
, ep
);
1083 * Get the set timeout information.
1085 (void) memset(&mhiargs
, '\0', sizeof (mhiargs
));
1086 if (clnt_gtimeout(mynode(), sp
, &mhiargs
, ep
) == -1) {
1092 * Get timestamp and generation id for new records
1095 genid
= sd
->sd_genid
;
1097 /* At this point, in case of error, set should be flushed. */
1098 flush_set_onerr
= 1;
1100 rb_level
= 1; /* level 1 */
1102 for (midp
= misp
->mis_drives
; midp
!= NULL
; midp
= midp
->mid_next
) {
1103 for (ddp
= dd
; ddp
!= NULL
; ddp
= ddp
->dd_next
) {
1104 if (ddp
->dd_dnp
== midp
->mid_dnp
) {
1106 ddp
->dd_dnp
->devid
=
1107 devid_str_encode(midp
->mid_devid
,
1108 midp
->mid_minor_name
);
1111 mirp
= midp
->mid_replicas
;
1113 ddp
->dd_dbsize
= mirp
->mir_length
;
1114 for (; mirp
!= NULL
;
1115 mirp
= mirp
->mir_next
) {
1119 if ((midp
->mid_available
&
1120 MD_IM_DISK_NOT_AVAILABLE
) &&
1121 (misp
->mis_flags
& MD_IM_SET_REPLICATED
)) {
1122 ddp
->dd_flags
= MD_DR_UNRSLV_REPLICATED
;
1129 * Add the drive records for the drives that we are adding to
1130 * each host in the set. Marks the drive records as MD_DR_ADD.
1131 * May also mark a drive record as MD_DR_UNRSLV_REPLICATED if
1132 * this flag was set in the dd_flags for that drive.
1134 if (clnt_imp_adddrvs(mynode(), sp
, dd
, now
, genid
, ep
) == -1)
1137 rb_level
= 2; /* level 2 */
1140 * Take ownership of the added drives.
1142 if (tk_own_bydd(sp
, dd
, &mhiargs
, TRUE
, ep
))
1146 metafreedrivedesc(&dd
);
1148 if (flush_set_onerr
) {
1149 metaflushsetname(sp
);
1155 /* Make sure we are blocking all signals */
1156 if (procsigs(TRUE
, &oldsigs
, &xep
) < 0)
1161 max_genid
= sd
->sd_genid
;
1165 if (!MD_ATSET_DESC(sd
)) {
1166 if (rel_own_bydd(sp
, dd
, TRUE
, &xep
)) {
1174 if (clnt_deldrvs(mynode(), sp
, dd
, &xep
) == -1) {
1178 resync_genid(sp
, sd
, max_genid
, 0, NULL
);
1183 /* release signals back to what they were on entry */
1184 if (procsigs(FALSE
, &oldsigs
, &xep
) < 0)
1187 metafreedrivedesc(&dd
);
1189 if (flush_set_onerr
) {
1190 metaflushsetname(sp
);
1191 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1198 meta_set_deletedrives(
1200 mddrivenamelist_t
*dnlp
,
1206 md_drive_desc
*ddp
, *dd
= NULL
, *curdd
= NULL
;
1207 md_replicalist_t
*rlp
= NULL
, *rl
;
1208 mddrivenamelist_t
*p
;
1211 mhd_mhiargs_t mhiargs
;
1215 ulong_t max_genid
= 0;
1217 md_error_t xep
= mdnullerror
;
1220 int current_drv_cnt
= 0;
1221 int suspendall_flag
= 0, suspendall_flag_rb
= 0;
1222 int suspend1_flag
= 0;
1224 bool_t stale_bool
= FALSE
;
1225 int flush_set_onerr
= 0;
1229 if ((sd
= metaget_setdesc(sp
, ep
)) == NULL
)
1232 /* Make sure we own the set */
1233 if (meta_check_ownership(sp
, ep
) != 0)
1236 if (drvsuniq(sp
, dnlp
, ep
) == -1)
1240 * Check and see if all the nodes have the set.
1242 * The drive and node records are stored in the local mddbs of each
1243 * node in the diskset. Each node's rpc.metad daemon reads in the set,
1244 * drive and node records from that node's local mddb and caches them
1245 * internally. Any process needing diskset information contacts its
1246 * local rpc.metad to get this information. Since each node in the
1247 * diskset is independently reading the set information from its local
1248 * mddb, the set, drive and node records in the local mddbs must stay
1249 * in-sync, so that all nodes have a consistent view of the diskset.
1251 * For a multinode diskset, explicitly verify that all nodes in the
1252 * diskset are ALIVE (i.e. are in the API membership list). Otherwise,
1253 * fail this operation since all nodes must be ALIVE in order to delete
1254 * a drive record from their local mddb. If a panic of this node
1255 * leaves the local mddbs set, node and drive records out-of-sync, the
1256 * reconfig cycle will fix the local mddbs and force them back into
1259 if (MD_MNSET_DESC(sd
)) {
1260 nd
= sd
->sd_nodelist
;
1262 if (!(nd
->nd_flags
& MD_MN_NODE_ALIVE
)) {
1263 (void) mddserror(ep
, MDE_DS_NOTINMEMBERLIST
,
1265 nd
->nd_nodename
, NULL
, sp
->setname
);
1271 /* Make sure we are blocking all signals */
1272 if (procsigs(TRUE
, &oldsigs
, &xep
) < 0)
1276 * Lock the set on current set members.
1277 * Set locking done much earlier for MN diskset than for
1278 * traditional diskset since lock_set and SUSPEND are used
1279 * to protect against other meta* commands running on the
1282 nd
= sd
->sd_nodelist
;
1283 /* All nodes are guaranteed to be ALIVE */
1285 if (clnt_lock_set(nd
->nd_nodename
, sp
, ep
)) {
1293 * Lock out other meta* commands by suspending
1294 * class 1 messages across the diskset.
1296 nd
= sd
->sd_nodelist
;
1297 /* All nodes are guaranteed to be ALIVE */
1299 if (clnt_mdcommdctl(nd
->nd_nodename
,
1300 COMMDCTL_SUSPEND
, sp
, MD_MSG_CLASS1
,
1301 MD_MSCF_NO_FLAGS
, ep
)) {
1309 nd
= sd
->sd_nodelist
;
1310 /* All nodes are guaranteed to be ALIVE */
1312 if (strcmp(nd
->nd_nodename
, mynode()) == 0) {
1317 has_set
= nodehasset(sp
, nd
->nd_nodename
,
1325 (void) mddserror(ep
, MDE_DS_NODENOSET
,
1326 sp
->setno
, nd
->nd_nodename
,
1334 for (i
= 0; i
< MD_MAXSIDES
; i
++) {
1335 /* Skip empty slots */
1336 if (sd
->sd_nodes
[i
][0] == '\0')
1339 if (strcmp(sd
->sd_nodes
[i
], mynode()) == 0)
1342 has_set
= nodehasset(sp
, sd
->sd_nodes
[i
], NHS_NSTG_EQ
,
1346 * Can directly return since !MN diskset;
1347 * nothing to unlock.
1354 * Can directly return since !MN diskset;
1355 * nothing to unlock.
1357 return (mddserror(ep
, MDE_DS_NODENOSET
,
1358 sp
->setno
, sd
->sd_nodes
[i
], NULL
,
1364 for (p
= dnlp
; p
!= NULL
; p
= p
->next
) {
1368 dnp
= p
->drivenamep
;
1370 if ((is_it
= meta_is_drive_in_thisset(sp
, dnp
, FALSE
, ep
))
1377 (void) mddserror(ep
, MDE_DS_DRIVENOTINSET
, sp
->setno
,
1378 NULL
, dnp
->cname
, sp
->setname
);
1383 if ((meta_check_drive_inuse(sp
, dnp
, FALSE
, ep
)) == -1) {
1390 current_drv_cnt
= deldrvcnt
;
1393 * Get drive descriptors for the drives that are currently in the set.
1395 curdd
= metaget_drivedesc(sp
, MD_BASICNAME_OK
, ep
);
1402 * Decrement the delete drive count for each drive currently in the
1405 for (ddp
= curdd
; ddp
!= NULL
; ddp
= ddp
->dd_next
)
1409 * If the count of drives we are deleting is equal to the drives in the
1410 * set, and we haven't specified forceflg, return an error
1412 if (deldrvcnt
== 0 && forceflg
== FALSE
) {
1413 (void) mderror(ep
, MDE_FORCE_DEL_ALL_DRV
, NULL
);
1419 * Get the list of drive descriptors that we are deleting.
1421 dd
= metaget_drivedesc_fromdrivelist(sp
, dnlp
, MD_DR_DEL
, ep
);
1428 * Get the set timeout information in case we have to roll back.
1430 (void) memset(&mhiargs
, '\0', sizeof (mhiargs
));
1431 if (clnt_gtimeout(mynode(), sp
, &mhiargs
, ep
) == -1) {
1436 /* At this point, in case of error, set should be flushed. */
1437 flush_set_onerr
= 1;
1439 /* END CHECK CODE */
1441 /* Lock the set on current set members */
1442 if (!(MD_MNSET_DESC(sd
))) {
1443 md_rb_sig_handling_on();
1444 for (i
= 0; i
< MD_MAXSIDES
; i
++) {
1445 /* Skip empty slots */
1446 if (sd
->sd_nodes
[i
][0] == '\0')
1449 if (clnt_lock_set(sd
->sd_nodes
[i
], sp
, ep
)) {
1457 if ((deldrvcnt
== 0) && (MD_MNSET_DESC(sd
))) {
1460 * Is current set STALE?
1462 (void) memset(&c
, 0, sizeof (c
));
1464 c
.c_setno
= sp
->setno
;
1465 if (metaioctl(MD_DB_GETDEV
, &c
, &c
.c_mde
, NULL
) != 0) {
1466 (void) mdstealerror(ep
, &c
.c_mde
);
1470 if (c
.c_flags
& MDDB_C_STALE
) {
1475 RB_TEST(1, "deletedrives", ep
)
1478 rb_level
= 1; /* level 1 */
1480 RB_TEST(2, "deletedrives", ep
)
1483 * Mark the drives MD_DR_DEL
1485 if (MD_MNSET_DESC(sd
)) {
1486 nd
= sd
->sd_nodelist
;
1487 /* All nodes are guaranteed to be ALIVE */
1489 if (clnt_upd_dr_flags(nd
->nd_nodename
, sp
, dd
,
1490 MD_DR_DEL
, ep
) == -1)
1493 RB_TEST(3, "deletedrives", ep
)
1497 for (i
= 0; i
< MD_MAXSIDES
; i
++) {
1498 /* Skip empty slots */
1499 if (sd
->sd_nodes
[i
][0] == '\0')
1502 if (clnt_upd_dr_flags(sd
->sd_nodes
[i
], sp
, dd
,
1503 MD_DR_DEL
, ep
) == -1)
1506 RB_TEST(3, "deletedrives", ep
)
1510 RB_TEST(4, "deletedrives", ep
)
1513 rb_level
= 2; /* level 2 */
1515 RB_TEST(5, "deletedrives", ep
)
1518 * Balance the DB's according to the list of existing drives and the
1519 * list of deleted drives.
1521 if (meta_db_balance(sp
, dd
, curdd
, 0, ep
) == -1)
1525 * If the drive(s) to be deleted cannot be accessed,
1526 * they haven't really been deleted yet. Check and delete now
1529 if (metareplicalist(sp
, MD_BASICNAME_OK
, &rlp
, ep
) >= 0) {
1531 for (ddp
= dd
; ddp
!= NULL
; ddp
= ddp
->dd_next
) {
1534 delete_name
= ddp
->dd_dnp
->cname
;
1536 for (rl
= rlp
; rl
!= NULL
; rl
= rl
->rl_next
) {
1540 rl
->rl_repp
->r_namep
->drivenamep
->cname
;
1542 if (strcmp(delete_name
, cur_name
) == 0) {
1543 /* put it on the delete list */
1544 np
= rl
->rl_repp
->r_namep
;
1545 (void) metanamelist_append(&nlp
, np
);
1552 if (meta_db_detach(sp
, nlp
,
1553 (MDFORCE_DS
| MDFORCE_SET_LOCKED
), NULL
,
1555 metafreenamelist(nlp
);
1558 metafreenamelist(nlp
);
1562 RB_TEST(6, "deletedrives", ep
)
1565 rb_level
= 3; /* level 3 */
1567 RB_TEST(7, "deletedrives", ep
)
1570 * Cannot suspend set until after meta_db_balance since
1571 * meta_db_balance uses META_DB_ATTACH/DETACH messages.
1573 if ((deldrvcnt
== 0) && (MD_MNSET_DESC(sd
))) {
1575 * Notify rpc.mdcommd on all nodes of a nodelist change.
1576 * Start by suspending rpc.mdcommd (which drains it of all
1577 * messages), then change the nodelist followed by a reinit
1580 nd
= sd
->sd_nodelist
;
1581 /* All nodes are guaranteed to be ALIVE */
1583 if (clnt_mdcommdctl(nd
->nd_nodename
, COMMDCTL_SUSPEND
,
1584 sp
, MD_MSG_CLASS0
, MD_MSCF_NO_FLAGS
, ep
)) {
1588 suspendall_flag
= 1;
1594 * Remove the drive records for the drives that were deleted from
1595 * each host in the set. This removes the record and dr_flags.
1597 if (MD_MNSET_DESC(sd
)) {
1598 nd
= sd
->sd_nodelist
;
1599 /* All nodes are guaranteed to be ALIVE */
1601 if (clnt_deldrvs(nd
->nd_nodename
, sp
, dd
, ep
) == -1)
1604 RB_TEST(8, "deletedrives", ep
)
1608 for (i
= 0; i
< MD_MAXSIDES
; i
++) {
1609 /* Skip empty slots */
1610 if (sd
->sd_nodes
[i
][0] == '\0')
1613 if (clnt_deldrvs(sd
->sd_nodes
[i
], sp
, dd
, ep
) == -1)
1616 RB_TEST(8, "deletedrives", ep
)
1620 RB_TEST(9, "deletedrives", ep
)
1623 rb_level
= 4; /* level 4 */
1625 RB_TEST(10, "deletedrives", ep
)
1627 if (!(MD_MNSET_DESC(sd
)) && !MD_ATSET_DESC(sd
)) {
1628 if (rel_own_bydd(sp
, dd
, TRUE
, ep
))
1632 /* If we deleted all the drives, then we need to halt the set. */
1633 if (deldrvcnt
== 0) {
1634 RB_TEST(11, "deletedrives", ep
)
1637 rb_level
= 5; /* level 5 */
1639 RB_TEST(12, "deletedrives", ep
)
1641 if (clnt_stimeout(mynode(), sp
, &defmhiargs
, ep
) == -1)
1644 RB_TEST(13, "deletedrives", ep
)
1647 rb_level
= 6; /* level 6 */
1649 RB_TEST(14, "deletedrives", ep
)
1651 /* Halt MN diskset on all nodes by having node withdraw */
1652 if (MD_MNSET_DESC(sd
)) {
1653 nd
= sd
->sd_nodelist
;
1654 /* All nodes are guaranteed to be ALIVE */
1656 /* Only withdraw nodes that are joined */
1657 if (!(nd
->nd_flags
& MD_MN_NODE_OWN
)) {
1662 * Going to set locally cached node flags to
1663 * rollback join so in case of error, the
1664 * rollback code knows which nodes to re-join.
1666 nd
->nd_flags
|= MD_MN_NODE_RB_JOIN
;
1669 * Be careful in ordering of following steps
1670 * so that recovery from a panic between
1671 * the steps is viable.
1672 * Only reset master info in rpc.metad -
1673 * don't reset local cached information
1674 * which will be used to set master information
1675 * back in case of failure (rollback).
1677 if (clnt_withdrawset(nd
->nd_nodename
, sp
, ep
))
1679 /* Sets withdraw flag on all nodes in list */
1680 if (clnt_upd_nr_flags(nd
->nd_nodename
, sp
,
1681 sd
->sd_nodelist
, MD_NR_WITHDRAW
,
1685 if (clnt_mnsetmaster(nd
->nd_nodename
, sp
,
1686 "", MD_MN_INVALID_NID
, ep
)) {
1692 if (halt_set(sp
, ep
))
1696 RB_TEST(15, "deletedrives", ep
)
1699 RB_TEST(16, "deletedrives", ep
)
1703 * Notify rpc.mdcommd on all nodes of a nodelist change.
1704 * Send reinit command to mdcommd which forces it to get
1705 * fresh set description.
1707 if (suspendall_flag
) {
1709 nd
= sd
->sd_nodelist
;
1710 /* All nodes are guaranteed to be ALIVE */
1712 /* Class is ignored for REINIT */
1713 if (clnt_mdcommdctl(nd
->nd_nodename
, COMMDCTL_REINIT
,
1714 sp
, NULL
, MD_MSCF_NO_FLAGS
, &xep
)) {
1716 (void) mdstealerror(ep
, &xep
);
1718 mde_perror(ep
, dgettext(TEXT_DOMAIN
,
1719 "Unable to reinit rpc.mdcommd.\n"));
1726 * Just resume all classes so that resume is the same whether
1727 * just one class was locked or all classes were locked.
1729 if ((suspend1_flag
) || (suspendall_flag
)) {
1731 nd
= sd
->sd_nodelist
;
1732 /* All nodes are guaranteed to be ALIVE */
1734 if (clnt_mdcommdctl(nd
->nd_nodename
, COMMDCTL_RESUME
,
1735 sp
, MD_MSG_CLASS0
, MD_MSCF_NO_FLAGS
, &xep
)) {
1737 (void) mdstealerror(ep
, &xep
);
1739 mde_perror(ep
, dgettext(TEXT_DOMAIN
,
1740 "Unable to resume rpc.mdcommd.\n"));
1744 meta_ping_mnset(sp
->setno
);
1747 cl_sk
= cl_get_setkey(sp
->setno
, sp
->setname
);
1748 if (MD_MNSET_DESC(sd
)) {
1749 nd
= sd
->sd_nodelist
;
1750 /* All nodes are guaranteed to be ALIVE */
1752 if (clnt_unlock_set(nd
->nd_nodename
,
1755 (void) mdstealerror(ep
, &xep
);
1761 for (i
= 0; i
< MD_MAXSIDES
; i
++) {
1762 /* Skip empty slots */
1763 if (sd
->sd_nodes
[i
][0] == '\0')
1766 if (clnt_unlock_set(sd
->sd_nodes
[i
],
1769 (void) mdstealerror(ep
, &xep
);
1774 cl_set_setkey(NULL
);
1777 metafreedrivedesc(&dd
);
1779 if (flush_set_onerr
) {
1780 metaflushsetname(sp
);
1781 if (!(MD_MNSET_DESC(sd
))) {
1782 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1786 if (MD_MNSET_DESC(sd
)) {
1787 /* release signals back to what they were on entry */
1788 if (procsigs(FALSE
, &oldsigs
, &xep
) < 0)
1795 /* all signals already blocked for MN diskset */
1796 if (!(MD_MNSET_DESC(sd
))) {
1797 /* Make sure we are blocking all signals */
1798 if (procsigs(TRUE
, &oldsigs
, &xep
) < 0)
1804 max_genid
= sd
->sd_genid
;
1806 /* Set the master on all nodes first thing */
1808 if (MD_MNSET_DESC(sd
)) {
1809 nd
= sd
->sd_nodelist
;
1810 /* All nodes are guaranteed to be ALIVE */
1812 if (!(nd
->nd_flags
& MD_MN_NODE_RB_JOIN
)) {
1816 * Set master on all re-joining nodes to be
1817 * my cached view of master.
1819 if (clnt_mnsetmaster(nd
->nd_nodename
, sp
,
1820 sd
->sd_mn_master_nodenm
,
1821 sd
->sd_mn_master_nodeid
, &xep
)) {
1831 md_mnset_record
*mnsr
;
1832 md_drive_record
*dr
;
1836 * See if we have to re-add the drives specified.
1838 if (MD_MNSET_DESC(sd
)) {
1839 nd
= sd
->sd_nodelist
;
1840 /* All nodes are guaranteed to be ALIVE */
1843 * Must get current set record from each
1844 * node to see what else must be done
1846 * Record should be for a multi-node diskset.
1848 if (clnt_mngetset(nd
->nd_nodename
, sp
->setname
,
1849 MD_SET_BAD
, &mnsr
, &xep
) == -1) {
1856 * If all drives are already there, skip
1860 dr
= mnsr
->sr_drivechain
;
1865 if (sr_drive_cnt
== current_drv_cnt
) {
1866 free_sr((md_set_record
*)mnsr
);
1871 /* Readd all drives */
1872 if (clnt_adddrvs(nd
->nd_nodename
, sp
, dd
,
1873 mnsr
->sr_ctime
, mnsr
->sr_genid
, &xep
) == -1)
1876 free_sr((struct md_set_record
*)mnsr
);
1880 for (i
= 0; i
< MD_MAXSIDES
; i
++) {
1881 /* Skip empty slots */
1882 if (sd
->sd_nodes
[i
][0] == '\0')
1885 /* Record should be for a non-multi-node set */
1886 if (clnt_getset(sd
->sd_nodes
[i
], sp
->setname
,
1887 MD_SET_BAD
, &sr
, &xep
) == -1) {
1893 * Set record structure was allocated from RPC
1894 * routine getset so this structure is only of
1895 * size md_set_record even if the MN flag is
1896 * set. So, clear the flag so that the free
1897 * code doesn't attempt to free a structure
1898 * the size of md_mnset_record.
1900 if (MD_MNSET_REC(sr
)) {
1901 sr
->sr_flags
&= ~MD_SR_MN
;
1906 /* Drive already added, skip to next node */
1907 if (sr
->sr_drivechain
!= NULL
) {
1912 if (clnt_adddrvs(sd
->sd_nodes
[i
], sp
, dd
,
1913 sr
->sr_ctime
, sr
->sr_genid
, &xep
) == -1)
1923 * Notify rpc.mdcommd on all nodes of a nodelist change.
1924 * At this point in time, don't know which nodes are joined
1925 * to the set. So, send a reinit command to mdcommd
1926 * which forces it to get fresh set description. Then send resume.
1928 * Later, this code will use rpc.mdcommd messages to reattach disks
1929 * and then rpc.mdcommd may be suspended again, rest of the nodes
1930 * joined, rpc.mdcommd reinited and then resumed.
1932 if (suspendall_flag
) {
1934 nd
= sd
->sd_nodelist
;
1935 /* All nodes are guaranteed to be ALIVE */
1937 /* Class is ignored for REINIT */
1938 if (clnt_mdcommdctl(nd
->nd_nodename
, COMMDCTL_REINIT
,
1939 sp
, NULL
, MD_MSCF_NO_FLAGS
, &xep
)) {
1940 mde_perror(&xep
, dgettext(TEXT_DOMAIN
,
1941 "Unable to reinit rpc.mdcommd.\n"));
1948 nd
= sd
->sd_nodelist
;
1949 /* All nodes are guaranteed to be ALIVE */
1952 * Resume all classes but class 1 so that lock is held
1953 * against meta* commands.
1954 * To later resume class1, must issue a class0 resume.
1956 if (clnt_mdcommdctl(nd
->nd_nodename
, COMMDCTL_RESUME
,
1958 MD_MSCF_DONT_RESUME_CLASS1
, &xep
)) {
1959 mde_perror(&xep
, dgettext(TEXT_DOMAIN
,
1960 "Unable to resume rpc.mdcommd.\n"));
1965 meta_ping_mnset(sp
->setno
);
1973 for (ddp
= dd
; ddp
!= NULL
; ddp
= ddp
->dd_next
) {
1976 if ((meta_replicaslice(ddp
->dd_dnp
,
1977 &rep_slice
, &xep
) != 0) ||
1978 ((np
= metaslicename(ddp
->dd_dnp
, rep_slice
,
1984 (void) metanamelist_append(&nlp
, np
);
1986 if (meta_db_attach(sp
, nlp
,
1987 (MDCHK_DRVINSET
| MDCHK_SET_LOCKED
),
1988 &sd
->sd_ctime
, ddp
->dd_dbcnt
, ddp
->dd_dbsize
,
1992 metafreenamelist(nlp
);
1995 if (meta_db_balance(sp
, NULL
, curdd
, 0, &xep
) == -1)
2001 if (!(MD_MNSET_DESC(sd
)) && !MD_ATSET_DESC(sd
)) {
2002 if (tk_own_bydd(sp
, dd
, &mhiargs
, TRUE
, &xep
))
2009 if (clnt_stimeout(mynode(), sp
, &mhiargs
, &xep
) == -1)
2014 * If at least one node needs to be rejoined to MN diskset,
2015 * then suspend commd again.
2017 if (MD_MNSET_DESC(sd
)) {
2018 nd
= sd
->sd_nodelist
;
2019 /* All nodes are guaranteed to be ALIVE */
2021 if (!(nd
->nd_flags
& MD_MN_NODE_RB_JOIN
)) {
2029 * Found node that will be rejoined so
2030 * notify rpc.mdcommd on all nodes of a nodelist change.
2031 * Start by suspending rpc.mdcommd (which drains it of
2032 * all messages), then change the nodelist followed by
2033 * a reinit and resume.
2035 nd
= sd
->sd_nodelist
;
2036 /* All nodes are guaranteed to be ALIVE */
2038 if (clnt_mdcommdctl(nd
->nd_nodename
,
2039 COMMDCTL_SUSPEND
, sp
, MD_MSG_CLASS0
,
2040 MD_MSCF_NO_FLAGS
, &xep
)) {
2043 suspendall_flag_rb
= 1;
2053 if (MD_MNSET_DESC(sd
)) {
2056 nd
= sd
->sd_nodelist
;
2057 /* All nodes are guaranteed to be ALIVE */
2059 /* Only rejoin nodes that were joined before */
2060 if (!(nd
->nd_flags
& MD_MN_NODE_RB_JOIN
)) {
2065 * Rejoin nodes to same state as before -
2066 * either STALE or non-STALE.
2068 if (stale_bool
== TRUE
)
2069 join_flags
= MNSET_IS_STALE
;
2070 if (clnt_joinset(nd
->nd_nodename
, sp
,
2073 /* Sets OWN flag on all nodes in list */
2074 if (clnt_upd_nr_flags(nd
->nd_nodename
, sp
,
2075 sd
->sd_nodelist
, MD_NR_JOIN
, NULL
, &xep
)) {
2081 if (setup_db_bydd(sp
, dd
, TRUE
, &xep
) == -1)
2084 /* No special flag for traditional diskset */
2085 if (snarf_set(sp
, NULL
, &xep
))
2093 * Mark the drives as OK.
2095 if (MD_MNSET_DESC(sd
)) {
2096 nd
= sd
->sd_nodelist
;
2097 /* All nodes are guaranteed to be ALIVE */
2100 * Must be last action before unlock.
2101 * In case of panic, recovery code checks
2102 * for MD_DR_OK to know that drive
2103 * and possible master are fully added back.
2105 if (clnt_upd_dr_flags(nd
->nd_nodename
, sp
, dd
,
2106 MD_DR_OK
, &xep
) == -1)
2111 for (i
= 0; i
< MD_MAXSIDES
; i
++) {
2112 /* Skip empty slots */
2113 if (sd
->sd_nodes
[i
][0] == '\0')
2116 if (clnt_upd_dr_flags(sd
->sd_nodes
[i
], sp
, dd
,
2117 MD_DR_OK
, &xep
) == -1)
2123 resync_genid(sp
, sd
, max_genid
, 0, NULL
);
2126 * Notify rpc.mdcommd on all nodes of a nodelist change.
2127 * Send a reinit command to mdcommd which forces it to get
2128 * fresh set description.
2130 if (suspendall_flag_rb
) {
2132 nd
= sd
->sd_nodelist
;
2133 /* All nodes are guaranteed to be ALIVE */
2135 /* Class is ignored for REINIT */
2136 if (clnt_mdcommdctl(nd
->nd_nodename
, COMMDCTL_REINIT
,
2137 sp
, NULL
, MD_MSCF_NO_FLAGS
, &xep
)) {
2138 mde_perror(&xep
, dgettext(TEXT_DOMAIN
,
2139 "Unable to reinit rpc.mdcommd.\n"));
2147 * Just resume all classes so that resume is the same whether
2148 * just one class was locked or all classes were locked.
2150 if ((suspend1_flag
) || (suspendall_flag_rb
) || (suspendall_flag
)) {
2152 nd
= sd
->sd_nodelist
;
2153 /* All nodes are guaranteed to be ALIVE */
2155 if (clnt_mdcommdctl(nd
->nd_nodename
, COMMDCTL_RESUME
,
2156 sp
, MD_MSG_CLASS0
, MD_MSCF_NO_FLAGS
, &xep
)) {
2157 mde_perror(&xep
, dgettext(TEXT_DOMAIN
,
2158 "Unable to resume rpc.mdcommd.\n"));
2163 meta_ping_mnset(sp
->setno
);
2168 cl_sk
= cl_get_setkey(sp
->setno
, sp
->setname
);
2169 /* Don't test lock flag since guaranteed to be set if in rollback */
2170 if (MD_MNSET_DESC(sd
)) {
2171 nd
= sd
->sd_nodelist
;
2172 /* All nodes are guaranteed to be ALIVE */
2174 if (clnt_unlock_set(nd
->nd_nodename
, cl_sk
, &xep
))
2179 for (i
= 0; i
< MD_MAXSIDES
; i
++) {
2180 /* Skip empty slots */
2181 if (sd
->sd_nodes
[i
][0] == '\0')
2184 if (clnt_unlock_set(sd
->sd_nodes
[i
], cl_sk
, &xep
))
2188 cl_set_setkey(NULL
);
2190 /* release signals back to what they were on entry */
2191 if (procsigs(FALSE
, &oldsigs
, &xep
) < 0)
2194 metafreedrivedesc(&dd
);
2196 if (flush_set_onerr
) {
2197 metaflushsetname(sp
);
2198 if (!(MD_MNSET_DESC(sd
))) {
2199 md_rb_sig_handling_off(md_got_sig(), md_which_sig());