4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #pragma ident "%Z%%M% %I% %E% SMI"
29 * Metadevice diskset interfaces
32 #include "meta_set_prv.h"
34 #include <sys/lvm/md_mddb.h>
35 #include <sys/cladm.h>
37 #include <sys/lvm/md_convert.h>
41 * Exported Entry Points
51 time_t mystamp
, otherstamp
;
53 mdname_t
*np
, *remote_np
;
54 mddrivename_t
*remote_dnp
;
59 mhd_mhiargs_t mhiargs
;
64 (void) memset(&mhiargs
, '\0', sizeof (mhiargs
));
66 if ((sd
= metaget_setdesc(sp
, ep
)) == NULL
)
69 if (meta_is_drive_in_thisset(sp
, dnp
, FALSE
, ep
)) {
75 if (clnt_gtimeout(mynode(), sp
, &mhiargs
, ep
) != 0)
77 if (!(MD_MNSET_DESC(sd
)) && !MD_ATSET_DESC(sd
)) {
78 if (rel_own_bydd(sp
, &dd
, TRUE
, ep
))
82 if ((np
= metaslicename(dnp
, MD_SLICE0
, ep
)) == NULL
) {
88 * First try and operate assuming the other side
89 * is running a SVM version that supports device id
90 * in disksets i.e. is running SVM RPC version 2.
92 * If this call fails due to the other side running
93 * a SVM version that does not support device id
94 * in disksets i.e. is running SVM RPC version 1, we
95 * fall back to the old behaviour.
97 if (dnp
->devid
!= NULL
) {
99 md_dev64_t dev
= NODEV64
;
102 * If the disk is connected to the remote node then the
103 * only thing we can be certain of is that the disk will
104 * have the same devid on that node, it may not have the
105 * same minor number nor the same ctd name. But if it
106 * does have the same ctd name then use it. In most cases
107 * there will only be a single entry returned but if the
108 * system has multi-path disks with MPXIO turned off there
109 * will be multiple entries. Attempting to choose the same
110 * name will give the user as consistent a view across the
113 ret
= clnt_devinfo_by_devid(node
, sp
, dnp
->devid
, &dev
,
114 np
->rname
, &rname
, NULL
, ep
);
117 * If the return value was ENOTSUP, we know the
118 * other side is not running a SVM version that
119 * supports device id in disksets. We fall back
120 * to the previous behaviour in that case.
122 if (ret
== ENOTSUP
) {
125 } else if (ret
== -1) {
131 * If the device does not exist on the remote node then
132 * the returned dev should indicate this (NODEV64) but
133 * we also check to make sure the returned name is not
134 * empty to make sure that the namespace does not get
135 * created with a NULL/empty entry (should not be possible
136 * but being paranoid).
138 if (dev
== NODEV64
|| rname
== (char *)NULL
||
139 strcmp(rname
, "") == 0) {
140 rval
= mddserror(ep
, MDE_DS_DRIVENOTCOMMON
, sp
->setno
,
141 node
, dnp
->cname
, sp
->setname
);
146 * The rname returned from the remote node may be different
147 * to the rname on this node, therefore we need to build up
148 * a dnp for this new rname.
150 if (strcmp(np
->rname
, rname
) != 0) {
151 /* different rname */
152 remote_np
= metaname_fast(&sp
, rname
,
154 if (remote_np
!= NULL
) {
155 remote_dnp
= remote_np
->drivenamep
;
166 ret
= setdevstamp(dnp
, &mystamp
, ep
);
168 * Check if the disk in question is an EFI disk.
175 if ((np
= metaslicename(dnp
, MD_SLICE0
, ep
)) == NULL
) {
182 * For EFI disks, we compare the device
183 * id for the disks in question.
185 ddi_devid_t thisdevid
, otherdevid
;
186 char *encoded_otherdevid
= NULL
;
187 char *encoded_thisdevid
= NULL
;
189 if (clnt_devinfo(node
, sp
, dnp
, &otherdev
, NULL
, ep
)
194 if (np
->dev
!= otherdev
) {
195 rval
= mddserror(ep
, MDE_DS_DRIVENOTCOMMON
,
196 sp
->setno
, node
, dnp
->cname
, sp
->setname
);
200 if (clnt_devid(node
, sp
, dnp
, &encoded_otherdevid
,
205 if (encoded_otherdevid
== NULL
) {
209 if (devid_str_decode(encoded_otherdevid
, &otherdevid
,
212 * If we are here, it means that dnp->devid
213 * is NULL. This will typically happen if
214 * we are dealing with SunCluster DID devices.
216 * We want to explicitly get the device id
219 encoded_thisdevid
= meta_get_devid(dnp
->rname
);
220 ret
= devid_str_decode(encoded_thisdevid
,
223 ret
= devid_compare(thisdevid
,
225 devid_free(thisdevid
);
227 devid_free(otherdevid
);
228 if (encoded_thisdevid
)
229 Free(encoded_thisdevid
);
232 Free(encoded_otherdevid
);
234 rval
= mddserror(ep
, MDE_DS_DRIVENOTCOMMON
,
235 sp
->setno
, node
, dnp
->cname
, sp
->setname
);
240 * For VTOC disks, we compare the dev_t and
241 * timestamp for the disks in question.
243 if (clnt_devinfo(node
, sp
, dnp
, &otherdev
,
244 &otherstamp
, ep
) == -1) {
248 if ((mystamp
!= otherstamp
) || (np
->dev
!= otherdev
)) {
249 rval
= mddserror(ep
, MDE_DS_DRIVENOTCOMMON
,
250 sp
->setno
, node
, dnp
->cname
, sp
->setname
);
257 if (clnt_drvused(node
, sp
, remote_dnp
, ep
) == -1)
262 if (!(MD_MNSET_DESC(sd
)) && !MD_ATSET_DESC(sd
)) {
263 if (tk_own_bydd(sp
, &dd
, &mhiargs
, TRUE
, ep
))
271 getnodeside(char *node
, md_set_desc
*sd
)
277 if (MD_MNSET_DESC(sd
)) {
278 nd
= sd
->sd_nodelist
;
280 if (strcmp(nd
->nd_nodename
, node
) == 0) {
281 return (nd
->nd_nodeid
);
285 return (MD_SIDEWILD
);
289 /* If regular diskset */
290 for (sideno
= 0; sideno
< MD_MAXSIDES
; sideno
++) {
291 if (sd
->sd_nodes
[sideno
] == NULL
||
292 sd
->sd_nodes
[sideno
][0] == '\0')
295 if (strcmp(sd
->sd_nodes
[sideno
], node
) == 0) {
301 * If the first loop fails we may be in a situation where this host
302 * is configured as part of a cluster yet not running in the cluster
303 * mode. If so, the names stored in sd->sd_nodes[] are going to be
304 * nodeid's instead of hostnames. See if we can find a match that way.
306 if (_cladm(CL_CONFIG
, CL_NODEID
, &nid
) == 0) {
307 for (sideno
= 0; sideno
< MD_MAXSIDES
; sideno
++) {
308 if (sd
->sd_nodes
[sideno
] == NULL
||
309 sd
->sd_nodes
[sideno
][0] == '\0')
311 if (atoi(sd
->sd_nodes
[sideno
]) == nid
)
316 return (MD_SIDEWILD
);
320 halt_set(mdsetname_t
*sp
, md_error_t
*ep
)
324 (void) memset(&c
, 0, sizeof (c
));
325 c
.c_setno
= sp
->setno
;
326 if ((c
.c_sideno
= getmyside(sp
, ep
)) == MD_SIDEWILD
)
329 if (s_ownset(sp
->setno
, ep
) == MD_SETOWNER_YES
) {
330 /* Don't need device id information from this ioctl */
331 c
.c_locator
.l_devid
= (uint64_t)0;
332 c
.c_locator
.l_devid_flags
= 0;
333 /* Kill any resyncs that are running on mirrors in this set */
334 meta_mirror_resync_kill(sp
);
335 if (metaioctl(MD_RELEASE_SET
, &c
, &c
.c_mde
, NULL
) != 0)
336 return (mdstealerror(ep
, &c
.c_mde
));
343 metadrivedesc_append(
348 md_timeval32_t timestamp
,
355 /* run to end of list */
356 for (/* void */; (*dd
!= NULL
); dd
= &(*dd
)->dd_next
)
359 /* allocate new list element */
360 p
= *dd
= Zalloc(sizeof (*p
));
364 p
->dd_dbsize
= dbsize
;
365 p
->dd_ctime
= timestamp
;
383 if ((sd
= metaget_setdesc(sp
, ep
)) == NULL
)
386 /* Don't care if set record is MN or not */
387 if (clnt_getset(node
, sp
->setname
, MD_SET_BAD
, &sr
, ep
))
396 /* Looking for name only match */
397 if ((match_flag
& NHS_N_EQ
) == NHS_N_EQ
) {
402 if (sd
->sd_setno
!= sr
->sr_setno
)
405 /* Looking for name and setno match */
406 if ((match_flag
& NHS_NS_EQ
) == NHS_NS_EQ
) {
411 if (sd
->sd_ctime
.tv_sec
!= sr
->sr_ctime
.tv_sec
||
412 sd
->sd_ctime
.tv_usec
!= sr
->sr_ctime
.tv_usec
)
415 /* Looking for name, setno, and timestamp match */
416 if ((match_flag
& NHS_NST_EQ
) == NHS_NST_EQ
) {
421 if (sd
->sd_genid
!= sr
->sr_genid
) {
422 if (sd
->sd_genid
< sr
->sr_genid
) {
424 * Looking for name, setno, and timestamp match where the genid in
425 * the other host's set record is greater than the genid in our set descriptor.
427 if ((match_flag
& NHS_NST_EQ_G_GT
) == NHS_NST_EQ_G_GT
) {
435 /* Looking for name, setno, timestamp, and genid match */
436 if ((match_flag
& NHS_NSTG_EQ
) == NHS_NSTG_EQ
)
441 * Set record structure was allocated from RPC routine getset
442 * so this structure is only of size md_set_record even if
443 * the MN flag is set. So, clear the flag so that the free
444 * code doesn't attempt to free a structure the size of
447 sr
->sr_flags
&= ~MD_SR_MN
;
454 nodesuniq(mdsetname_t
*sp
, int cnt
, char **strings
, md_error_t
*ep
)
457 for (i
= 0; i
< cnt
; i
++)
458 for (j
= i
+ 1; j
< cnt
; j
++)
459 if (strcmp(strings
[i
], strings
[j
]) == 0)
460 return (mddserror(ep
, MDE_DS_DUPHOST
,
461 sp
->setno
, strings
[i
], NULL
, sp
->setname
));
466 own_set(mdsetname_t
*sp
, char **owner_of_set
, int forceflg
, md_error_t
*ep
)
472 if (metaislocalset(sp
)) {
473 if (owner_of_set
!= NULL
)
474 *owner_of_set
= Strdup(mynode());
475 return (MD_SETOWNER_YES
);
478 if ((sd
= metaget_setdesc(sp
, ep
)) == NULL
)
481 if (clnt_ownset(mynode(), sp
, &am_i_owner
, ep
) == -1)
484 if (MD_MNSET_DESC(sd
)) {
485 if (am_i_owner
== TRUE
)
486 return (MD_SETOWNER_YES
);
488 return (MD_SETOWNER_NO
);
491 if (forceflg
== TRUE
) {
492 if (am_i_owner
== TRUE
) {
493 if (owner_of_set
!= NULL
)
494 *owner_of_set
= Strdup(mynode());
495 return (MD_SETOWNER_YES
);
498 if (owner_of_set
!= NULL
)
499 *owner_of_set
= NULL
;
500 return (MD_SETOWNER_NONE
);
503 if (am_i_owner
== TRUE
) {
504 if (owner_of_set
!= NULL
)
505 *owner_of_set
= Strdup(mynode());
506 return (MD_SETOWNER_YES
);
510 for (i
= 0; i
< MD_MAXSIDES
; i
++) {
512 * Skip empty slots, and my own slot.
514 if (sd
->sd_nodes
[i
][0] == '\0' ||
515 strcmp(sd
->sd_nodes
[i
], mynode()) == 0)
518 if (clnt_ownset(sd
->sd_nodes
[i
], sp
, &am_i_owner
, ep
) == -1)
521 if (am_i_owner
== TRUE
) {
522 if (owner_of_set
!= NULL
)
523 *owner_of_set
= Strdup(sd
->sd_nodes
[i
]);
524 return (MD_SETOWNER_NO
);
528 /* If we get here, the set currently has no owner. */
529 if (owner_of_set
!= NULL
)
530 *owner_of_set
= NULL
;
531 return (MD_SETOWNER_NONE
);
544 ulong_t cur_genid
[MD_MAXSIDES
];
546 md_error_t xep
= mdnullerror
;
548 md_mnset_record
*mnsr
;
550 if (node_c
> 0 && node_v
&& *node_v
) {
552 * Mark the set record MD_SR_OK.
554 for (i
= 0; i
< node_c
; i
++)
555 if (clnt_upd_sr_flags(node_v
[i
], sp
, MD_SR_OK
, &xep
))
560 if (MD_MNSET_DESC(sd
)) {
561 nd
= sd
->sd_nodelist
;
563 if (!(nd
->nd_flags
& MD_MN_NODE_ALIVE
)) {
567 /* Will only return a multi-node diskset record */
568 if (clnt_mngetset(nd
->nd_nodename
, sp
->setname
,
569 MD_SET_BAD
, &mnsr
, &xep
) == -1) {
574 for (j
= mnsr
->sr_genid
; j
< max_genid
; j
++) {
575 if (clnt_upd_sr_flags(nd
->nd_nodename
, sp
,
579 free_sr((struct md_set_record
*)mnsr
);
586 * Get current genid for each node.
588 for (i
= 0; i
< MD_MAXSIDES
; i
++) {
591 /* Skip empty slots */
592 if (sd
->sd_nodes
[i
][0] == '\0')
595 /* Should be a non-multinode diskset */
596 if (clnt_getset(sd
->sd_nodes
[i
], sp
->setname
,
597 MD_SET_BAD
, &sr
, &xep
) == -1) {
602 if (MD_MNSET_REC(sr
)) {
604 * Set record structure was allocated from RPC routine
605 * getset so this structure is only of size
606 * md_set_record even if the MN flag is set. So,
607 * clear the flag so that the free code doesn't
608 * attempt to free a structure the size of
611 sr
->sr_flags
&= ~MD_SR_MN
;
616 cur_genid
[i
] = sr
->sr_genid
;
622 * Mark the set record MD_SR_OK
624 for (i
= 0; i
< MD_MAXSIDES
; i
++) {
625 /* Skip empty slots */
626 if (sd
->sd_nodes
[i
][0] == '\0')
629 for (j
= cur_genid
[i
]; j
< max_genid
; j
++)
630 if (clnt_upd_sr_flags(sd
->sd_nodes
[i
], sp
, MD_SR_OK
,
638 setup_db_bydd(mdsetname_t
*sp
, md_drive_desc
*dd
, int force
, md_error_t
*ep
)
641 struct mddb_config c
;
645 ddi_devid_t devidp
, new_devidp
;
646 char *minor_name
= NULL
;
648 char *devid_str
= NULL
;
649 sdssc_version_t version
;
650 int need_to_free_devidp
= 0;
652 if ((sd
= metaget_setdesc(sp
, ep
)) == NULL
)
654 (void) memset(&c
, 0, sizeof (c
));
656 c
.c_setno
= sp
->setno
;
657 (void) strcpy(c
.c_setname
, sp
->setname
);
658 if ((c
.c_sideno
= getmyside(sp
, ep
)) == MD_SIDEWILD
)
661 c
.c_timestamp
= sd
->sd_ctime
;
663 if (setup_med_cfg(sp
, &c
, force
, ep
))
666 for (p
= dd
; p
!= NULL
; p
= p
->dd_next
) {
670 mdsidenames_t
*sn
= NULL
;
672 if (p
->dd_dbcnt
== 0)
679 for (sn
= dnp
->side_names
; sn
!= NULL
; sn
= sn
->next
) {
680 if (sn
->sideno
== c
.c_sideno
)
685 * The disk has no side name information
690 if ((meta_replicaslice(dnp
, &rep_slice
, ep
) != 0) ||
691 ((np
= metaslicename(dnp
, rep_slice
, ep
))
697 if (np
->dev
== NODEV64
)
700 c
.c_locator
.l_dev
= meta_cmpldev(np
->dev
);
701 c
.c_locator
.l_mnum
= meta_getminor(np
->dev
);
703 if (!MD_MNSET_DESC(sd
)) {
705 * minor_name will be NULL if dnp->devid == NULL
706 * - see metagetvtoc()
708 if (np
->minor_name
!= NULL
) {
709 minor_name
= Strdup(np
->minor_name
);
713 if ((cinfo
= metagetcinfo(np
, ep
)) == NULL
) {
718 (void) strncpy(c
.c_locator
.l_driver
, cinfo
->dname
,
719 sizeof (c
.c_locator
.l_driver
));
721 c
.c_locator
.l_dev
= NODEV32
;
722 c
.c_locator
.l_mnum
= sn
->mnum
;
723 (void) strncpy(c
.c_locator
.l_driver
, sn
->dname
,
724 sizeof (c
.c_locator
.l_driver
));
726 if (!MD_MNSET_DESC(sd
)) {
727 if (dnp
->devid
!= NULL
) {
728 minor_name
= meta_getdidminorbykey(
729 MD_LOCAL_SET
, sn
->sideno
+ SKEW
,
730 dnp
->side_names_key
, ep
);
736 * If the device does not have a devid or is a multinode
737 * diskset or we are in a SunCluster 3.x environment then
740 if ((dnp
->devid
== NULL
) || MD_MNSET_DESC(sd
) ||
741 ((sdssc_version(&version
) == SDSSC_OKAY
) &&
742 (version
.major
>= 3))) {
748 * The devid associated with the dnp does not have
749 * a minor name and so we must add it in.
751 size_t len
= strlen(dnp
->devid
) +
752 strlen(minor_name
) + 2;
753 devid_str
= (char *)Malloc(len
);
754 (void) snprintf(devid_str
, len
, "%s/%s", dnp
->devid
,
756 (void) devid_str_decode(devid_str
, &devidp
, NULL
);
757 need_to_free_devidp
= 1;
759 /* If need to fix LB then setup old_devid info */
760 if (p
->dd_flags
& MD_DR_FIX_LB_NM_DID
) {
761 sz
= devid_sizeof(devidp
);
762 c
.c_locator
.l_old_devid_sz
= sz
;
763 c
.c_locator
.l_old_devid
= (uintptr_t)malloc(sz
);
764 (void) memcpy((void *)(uintptr_t)
765 c
.c_locator
.l_old_devid
,
768 new_devidp
= replicated_list_lookup(
769 devid_sizeof((ddi_devid_t
)devidp
),
770 (void *)(uintptr_t)devidp
);
772 need_to_free_devidp
= 0;
776 sz
= devid_sizeof(devidp
);
777 c
.c_locator
.l_devid
= (uintptr_t)malloc(sz
);
778 c
.c_locator
.l_devid_sz
= sz
;
779 (void) memcpy((void *)(uintptr_t)
782 if (need_to_free_devidp
) {
784 need_to_free_devidp
= 0;
786 if (minor_name
== NULL
) {
789 Free((void *)(uintptr_t)c
.c_locator
.l_devid
);
790 if (c
.c_locator
.l_old_devid_sz
) {
792 (uintptr_t)c
.c_locator
.l_old_devid
);
793 c
.c_locator
.l_old_devid_sz
= 0;
794 c
.c_locator
.l_old_devid
=
799 (void) strcpy(c
.c_locator
.l_minor_name
,
801 c
.c_locator
.l_devid_flags
= MDDB_DEVID_VALID
|
802 MDDB_DEVID_SPACE
| MDDB_DEVID_SZ
;
805 * Don't need device id information from
808 c
.c_locator
.l_devid
= (uint64_t)0;
809 c
.c_locator
.l_devid_flags
= 0;
813 for (i
= 0; i
< p
->dd_dbcnt
; i
++) {
814 c
.c_locator
.l_flags
= 0;
815 c
.c_locator
.l_blkno
= 16 + i
* p
->dd_dbsize
;
817 if (metaioctl(MD_DB_USEDEV
, &c
, &c
.c_mde
, NULL
) != 0) {
821 (uintptr_t)c
.c_locator
.l_devid
);
822 if (c
.c_locator
.l_old_devid_sz
) {
823 Free((void *)(uintptr_t)
824 c
.c_locator
.l_old_devid
);
825 c
.c_locator
.l_old_devid_sz
= 0;
826 c
.c_locator
.l_old_devid
=
831 return (mdstealerror(ep
, &c
.c_mde
));
836 Free((void *)(uintptr_t)c
.c_locator
.l_devid
);
837 if (c
.c_locator
.l_old_devid_sz
) {
839 (uintptr_t)c
.c_locator
.l_old_devid
);
840 c
.c_locator
.l_old_devid_sz
= 0;
841 c
.c_locator
.l_old_devid
= (uintptr_t)NULL
;
852 snarf_set(mdsetname_t
*sp
, bool_t stale_bool
, md_error_t
*ep
)
856 (void) memset(&c
, '\0', sizeof (c
));
858 c
.c_setno
= sp
->setno
;
859 if ((c
.c_sideno
= getmyside(sp
, ep
)) == MD_SIDEWILD
)
862 /* Don't need device id information from this ioctl */
863 c
.c_locator
.l_devid
= (uint64_t)0;
864 c
.c_locator
.l_devid_flags
= 0;
865 if (stale_bool
== TRUE
) {
866 c
.c_flags
= MDDB_C_STALE
;
868 if (metaioctl(MD_GRAB_SET
, &c
, &c
.c_mde
, NULL
) != 0)
869 return (mdstealerror(ep
, &c
.c_mde
));
871 if (c
.c_flags
& MDDB_C_STALE
)
872 return (mdmddberror(ep
, MDE_DB_STALE
, (minor_t
)NODEV64
,
873 sp
->setno
, 0, NULL
));