4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #pragma ident "%Z%%M% %I% %E% SMI"
29 * initialize metadevices
35 #include <sys/lvm/md_mirror.h>
37 #include "meta_set_prv.h"
40 * try to initialize devices
49 * generate a command of the form "metainit -s setname [-n] [-f] ....."
51 * If -n option is *not* set, send the metainit command *with -n set* to
52 * all nodes first. Do this with MD_MSGF_STOP_ON_ERROR set.
53 * That means if it fails on one node, it'll return immediately,
54 * reporting the error.
55 * By doing so, we have a dryrun first that has to succeed on every node
56 * before we start the command for real.
57 * This saves us from backing out a metainit command that succeeded on
58 * some nodes but failed on one.
78 newargv
= calloc(argc
+5, sizeof (char *));
79 newargv
[0] = "metainit";
81 newargv
[2] = (*spp
)->setname
;
82 newargv
[3] = "-n"; /* always do "-n" first */
84 if ((options
& MDCMD_DOIT
) == 0) {
87 if ((options
& MDCMD_FORCE
) != 0) {
88 newargv
[newargc
] = "-f";
91 for (i
= 0; i
< argc
; i
++, newargc
++)
92 newargv
[newargc
] = argv
[i
];
93 ret
= meta_mn_send_command(*spp
, newargc
, newargv
,
94 flags
| MD_DRYRUN
| MD_NOLOG
, context
, ep
);
96 if ((dryrun_only
== 0) && (ret
== 0)) {
98 * Do it for real now. Remove "-n" from the arguments and
99 * MD_DRYRUN from the flags. If we fail this time the master
100 * must panic as the mddbs may be inconsistent.
102 newargv
[3] = ""; /* this was "-n" before */
103 ret
= meta_mn_send_command(*spp
, newargc
, newargv
,
104 flags
| MD_RETRY_BUSY
| MD_PANIC_WHEN_INCONSISTENT
,
118 bool_t called_thru_rpc
,
127 /* for all matching entries, which haven't already been done */
128 for (line
= 0; (line
< tabp
->nlines
); ++line
) {
129 md_tab_line_t
*linep
= &tabp
->lines
[line
];
130 char *uname
= linep
->argv
[0];
132 /* see if already done */
133 if (linep
->flags
!= DO_AGAIN
)
136 /* clear the metadev/hsp caches between inits */
137 metaflushmetanames();
140 if ((called_thru_rpc
== FALSE
) &&
141 meta_is_mn_name(spp
, uname
, ep
)) {
143 * MN set, send command to all nodes
144 * Note that if sp is NULL, meta_is_mn_name() derives
145 * sp from linep->argv which is the metadevice arg
147 ret
= mn_send_command(spp
, linep
->argc
, linep
->argv
,
148 options
, flags
, linep
->context
, ep
);
152 cname
= meta_name_getname(spp
, uname
, META_DEVICE
, ep
);
158 ret
= meta_init_name(spp
, linep
->argc
,
159 linep
->argv
, cname
, options
, ep
);
163 if (!(flags
& MD_IGNORE_STDERR
)) {
164 mderrorextra(ep
, linep
->context
);
173 linep
->flags
= IS_DONE
;
185 * initialize all devices in set
191 bool_t called_thru_rpc
,
195 md_tab_t
*tabp
= NULL
;
202 * Only take the lock if this is not a MN set
203 * We can only enter this code for a MN set if we are the initiator
204 * and in this case, we don't want to take locks.
206 if (meta_is_mn_set((*spp
), ep
) == 0) {
208 if (meta_lock(*spp
, TRUE
, ep
)) {
214 /* check for ownership */
215 if (meta_check_ownership(*spp
, ep
) != 0) {
221 /* lock is held across init_entries */
222 options
|= MDCMD_NOLOCK
;
225 /* get md.tab, preen entries */
226 if ((tabp
= meta_tab_parse(NULL
, ep
)) == NULL
) {
232 setlen
= strlen((*spp
)->setname
);
233 for (more
= 0; (more
< tabp
->nlines
); ++more
) {
234 md_tab_line_t
*linep
= &tabp
->lines
[more
];
235 char *cname
= linep
->cname
;
239 /* better have args */
240 assert((linep
->argc
> 0) && (linep
->argv
[0] != NULL
));
242 /* only do metadevices and hotspare pools in set */
243 if (linep
->type
& TAB_MD_HSP
) {
244 if ((p
= strrchr(cname
, '/')) == NULL
) {
249 if ((len
== setlen
) &&
250 (strncmp(cname
, (*spp
)->setname
, len
) == 0)) {
251 linep
->flags
= DO_AGAIN
;
253 linep
->flags
= DONT_DO
;
257 linep
->flags
= DONT_DO
;
263 /* while more devices get made */
265 done
= init_entries(spp
, tabp
, options
,
266 MD_IGNORE_STDERR
|MD_RETRY_BUSY
, called_thru_rpc
, ep
);
269 /* now do it and report errors */
270 if (init_entries(spp
, tabp
, options
, MD_RETRY_BUSY
,
271 called_thru_rpc
, ep
) >= 0)
272 eval
= 0; /* success */
275 /* cleanup, return success */
282 * initialize named device or hotspare pool
294 md_tab_t
*tabp
= NULL
;
295 md_tab_line_t
*linep
= NULL
;
298 char *uname
= argv
[0];
302 /* get md.tab entries */
303 if ((tabp
= meta_tab_parse(NULL
, ep
)) == NULL
) {
304 if (! mdissyserror(ep
, ENOENT
))
309 if ((linep
= meta_tab_find(*spp
, tabp
, uname
, TAB_MD_HSP
))
316 if ((called_thru_rpc
== FALSE
) &&
317 meta_is_mn_name(spp
, uname
, ep
)) {
319 * MN set, send command to all nodes
321 ret
= mn_send_command(spp
, argc
, argv
, options
,
322 MD_DISP_STDERR
, NO_CONTEXT_STRING
, ep
);
326 cname
= meta_name_getname(spp
, uname
, META_DEVICE
, ep
);
331 /* check for ownership */
332 if (meta_check_ownership(*spp
, ep
) != 0) {
337 ret
= meta_init_name(spp
, argc
, argv
, cname
, options
, ep
);
343 mderrorextra(ep
, linep
->context
);
346 rval
= 0; /* success */
348 /* cleanup, return error */
356 * print usage message
365 (void) fprintf(stderr
, gettext("\
366 usage: %s [-s setname] [-n] [-f] concat/stripe numstripes\n\
367 width component... [-i interlace]\n\
368 [width component... [-i interlace]] [-h hotspare_pool]\n\
369 %s [-s setname] [-n] [-f] mirror -m submirror...\n\
370 [read_options] [write_options] [pass_num]\n\
371 %s [-s setname] [-n] [-f] RAID -r component...\n\
372 [-i interlace] [-h hotspare_pool]\n\
373 [-k] [-o original_column_count]\n\
374 %s [-s setname] [-n] [-f] hotspare_pool [hotspare...]\n\
375 %s [-s setname] [-n] [-f] softpart -p [-A alignment]\n\
376 [-e] device size|all\n\
377 %s [-s setname] [-n] [-f] md.tab_entry\n\
378 %s [-s setname] [-n] [-f] -a\n\
379 %s -r\n"), myname
, myname
, myname
, myname
, myname
, myname
, myname
,
386 * If we fail during the attempt to take the auto-take disksets
387 * we need to tell the kernel to cleanup the in-core set struct
388 * so that we have a chance to take the set again later.
391 auto_take_cleanup(mdsetname_t
*sp
, side_t sideno
)
395 (void) memset(&c
, 0, sizeof (c
));
396 c
.c_setno
= sp
->setno
;
399 if (metaioctl(MD_RELEASE_SET
, &c
, &c
.c_mde
, NULL
) != 0) {
400 mde_perror(&c
.c_mde
, "auto_take_cleanup");
408 * This is a clean auto-take set, so do the work to take it.
409 * This is a streamlined version of the code in meta_set_take. We avoid the
410 * need for talking to the rpc.metad since that can't run this early during the
411 * boot. We don't need to talk to the metad for this diskset since we're the
412 * only host in the set.
415 take_set(md_set_record
*sr
)
419 md_error_t error
= mdnullerror
;
420 md_replicalist_t
*rlp
= NULL
;
421 md_replicalist_t
*rl
;
427 * Several of the functions we call take a sp param so
428 * construct one from the set record.
430 sn
.setname
= sr
->sr_setname
;
431 sn
.setno
= sr
->sr_setno
;
432 sn
.setdesc
= sr2setdesc(sr
);
433 sn
.lockfd
= MD_NO_LOCK
;
435 if (sr
->sr_flags
& MD_SR_MB_DEVID
)
436 dd
= metaget_drivedesc(&sn
, MD_BASICNAME_OK
| PRINT_FAST
,
439 dd
= metaget_drivedesc(&sn
, MD_BASICNAME_OK
, &error
);
442 mde_perror(&error
, "");
448 * Skip call to tk_own_bydd. This talks to rpc.metamhd (which we can't
449 * do yet) and is not needed for auto-take disksets since we are not
450 * doing SCSI reservations on these drives.
453 if (setup_db_bydd(&sn
, dd
, 0, &error
) != 0) {
454 if (! mdismddberror(&error
, MDE_DB_ACCOK
) &&
455 ! mdismddberror(&error
, MDE_DB_TAGDATA
)) {
457 * Skip call to rel_own_bydd since that really just
458 * calls rpc.metamhd which we don't need to do,
459 * so there really isn't anything to rollback here.
461 mde_perror(&error
, "");
468 if ((sideno
= getmyside(&sn
, &error
)) == MD_SIDEWILD
) {
469 mde_perror(&error
, "");
473 if (snarf_set(&sn
, FALSE
, &error
) != 0) {
474 if (mdismddberror(&error
, MDE_DB_STALE
) ||
475 mdismddberror(&error
, MDE_DB_TAGDATA
) ||
476 ! mdismddberror(&error
, MDE_DB_NODB
) &&
477 ! mdismddberror(&error
, MDE_DB_NOTOWNER
)) {
480 * Normally MDE_DB_STALE or MDE_DB_TAGDATA
481 * would still keep the set but in this case we don't
482 * want to do that. This will probably result in the
483 * boot going in to single-user since we won't have the
484 * set so any attempted mounts using the set's metadevices
485 * will fail. However, that is a "good thing" so the
486 * sysadmin can fix the set. Normally they would see
487 * all of these problems when they ran the take and be
488 * able to immediately fix the problem.
490 mde_perror(&error
, "");
491 auto_take_cleanup(&sn
, sideno
);
497 * Call metareplicalist and upd_dr_dbinfo.
498 * Most of that code is only needed to synchronize amongst the multiple
499 * hosts in a set, which is not applicable in our case. But we do a
500 * subset here to handle the case when the user had been
501 * adding/deleting/balancing mddbs when this node panic'd. We are
502 * synchronizing the ondisk mddbs to the list of drive records stored
505 if (metareplicalist(&sn
, (MD_BASICNAME_OK
| PRINT_FAST
), &rlp
, &error
)
508 mde_perror(&error
, "");
509 auto_take_cleanup(&sn
, sideno
);
514 * The following code is equivalent to upd_dr_dbinfo for synchronizing
515 * the local host only. That function is normally run through the
516 * metad with a local and daemon side but we'll do all of the work
520 /* find the smallest existing replica */
521 for (rl
= rlp
; rl
!= NULL
; rl
= rl
->rl_next
) {
525 nblks
= ((nblks
== 0) ? r
->r_nblk
: min(r
->r_nblk
, nblks
));
531 for (dr
= sr
->sr_drivechain
; dr
; dr
= dr
->dr_next
) {
534 md_replicalist_t
*rl
;
537 * The cname style for dnp and replica list will be same since
538 * both use the same flags MD_BASICNAME_OK|PRINT_FAST which
539 * will always provide the cached value.
541 if ((dnp
= metadrivename_withdrkey(&sn
, sideno
, dr
->dr_key
,
542 MD_BASICNAME_OK
| PRINT_FAST
, &error
)) == NULL
) {
543 mde_perror(&error
, "");
544 metafreereplicalist(rlp
);
545 auto_take_cleanup(&sn
, sideno
);
550 /* see how many replicas are on this drive */
551 for (rl
= rlp
; rl
!= NULL
; rl
= rl
->rl_next
) {
552 if (strcmp(rl
->rl_repp
->r_namep
->drivenamep
->cname
, dnp
->cname
)
557 /* Adjust the fields in the copy */
558 dr
->dr_dbcnt
= dbcnt
;
559 dr
->dr_dbsize
= dbcnt
> 0 ? nblks
: 0;
563 * If the set doesn't have the MD_SR_MB_DEVID bit set, i.e.
564 * the drives in the set don't have the device id information,
565 * then stick it in if possible.
567 * If updating the master block fails for whatever reason, it's
568 * okay. It just means the disk(s) in the diskset won't be self
571 if (!(sr
->sr_flags
& MD_SR_MB_DEVID
)) {
572 if (meta_update_mb(&sn
, dd
, &error
) == 0) {
573 sr
->sr_flags
|= MD_SR_MB_DEVID
;
578 commitset(sr
, FALSE
, &error
);
580 metafreereplicalist(rlp
);
583 * This finishes up the logical equivalent of meta_set_take.
585 if (meta_resync_all(&sn
, MD_DEF_RESYNC_BUF_SIZE
, &error
) != 0) {
586 mde_perror(&error
, "");
592 * Take the disksets that are marked to be taken at boot time.
599 md_error_t error
= mdnullerror
;
602 if ((max_sets
= get_max_sets(&error
)) == 0)
605 if (!mdisok(&error
)) {
606 mde_perror(&error
, "");
610 /* set up so auto-take errors also go to syslog */
611 openlog("metainit", LOG_ODELAY
, LOG_USER
);
617 * For each possible set number (skip set 0 which is the unnamed local
618 * set), see if we really have a diskset. If so, check if auto-take
621 * In order to take the set it must have drives and it must not be
622 * stuck in mid-add. The sr_validate routine within rpc.metad will
623 * delete sets that are in mid-add when it runs.
625 for (i
= 1; i
< max_sets
; i
++) {
628 if ((sr
= metad_getsetbynum(i
, &error
)) == NULL
) {
633 if (sr
->sr_flags
& MD_SR_AUTO_TAKE
&& !(sr
->sr_flags
& MD_SR_ADD
)) {
636 int host_mismatch
= 0;
640 /* check for host renames or multiple hosts in set */
641 for (j
= 0; j
< MD_MAXSIDES
; j
++) {
642 /* Skip empty slots */
643 if (sr
->sr_nodes
[j
][0] == '\0')
647 if (strcmp(sr
->sr_nodes
[j
], hostname
) != 0)
651 /* paranoid check that we're the only host in the set */
654 "diskset %s: auto-take enabled and multiple hosts in set\n"),
660 /* The host was renamed, repair the set. */
661 for (j
= 0; j
< MD_MAXSIDES
; j
++) {
662 /* Skip empty slots */
663 if (sr
->sr_nodes
[j
][0] == '\0')
666 (void) strncpy(sr
->sr_nodes
[j
], hostname
,
667 sizeof (sr
->sr_nodes
[j
]));
668 commitset(sr
, FALSE
, &error
);
669 if (!mdisok(&error
)) {
670 mde_perror(&error
, "");
674 "new hostname %s, update auto-take diskset %s\n"),
675 hostname
, sr
->sr_setname
);
681 /* set must have at least one drive to be taken */
682 for (dr
= sr
->sr_drivechain
; dr
!= NULL
; dr
= dr
->dr_next
) {
683 /* ignore drives in mid-add */
684 if (!(dr
->dr_flags
& MD_DR_ADD
)) {
694 "diskset %s: auto-take enabled but set has no drives\n"),
701 * mainline. crack command line arguments.
709 char *sname
= MD_LOCAL_NAME
;
710 mdsetname_t
*sp
= NULL
;
716 mdcmdopts_t options
= (MDCMD_DOIT
| MDCMD_PRINT
);
718 md_error_t status
= mdnullerror
;
719 md_error_t
*ep
= &status
;
721 md_error_t dummystatus
= mdnullerror
;
722 md_error_t
*dummyep
= &dummystatus
;
725 bool_t called_thru_rpc
= FALSE
;
730 * Get the locale set up before calling any other routines
731 * with messages to output. Just in case we're not in a build
732 * environment, make sure that TEXT_DOMAIN gets set to
735 #if !defined(TEXT_DOMAIN)
736 #define TEXT_DOMAIN "SYS_TEST"
738 (void) setlocale(LC_ALL
, "");
739 (void) textdomain(TEXT_DOMAIN
);
740 if ((cp
= strstr(argv
[0], ".rpc_call")) != NULL
) {
741 *cp
= '\0'; /* cut off ".rpc_call" */
742 called_thru_rpc
= TRUE
;
744 if (sdssc_bind_library() == SDSSC_OKAY
)
745 if (sdssc_cmd_proxy(argc
, argv
, SDSSC_PROXY_PRIMARY
,
746 &error
) == SDSSC_PROXY_DONE
)
751 if (md_init(argc
, argv
, 0, 1, ep
) != 0 ||
752 meta_check_root(ep
) != 0) {
760 while ((c
= getopt(argc
, argv
, "afhnrs:?")) != -1) {
773 /* all devices in md.tab */
778 options
|= MDCMD_ALLOPTION
;
780 /* check for validity, but don't really init */
782 options
&= ~MDCMD_DOIT
;
792 /* mounted and swapped components are OK */
794 options
|= MDCMD_FORCE
;
807 /* sname is MD_LOCAL_NAME if not specified on the command line */
808 if ((sp
= metasetname(sname
, ep
)) == NULL
) {
819 } else if (argc
> 0) {
824 /* setup database locations */
825 if (meta_setup_db_locations(ep
) != 0) {
827 if (mdismddberror(ep
, MDE_DB_STALE
))
829 if (! mdiserror(ep
, MDE_MDDB_CKSUM
)) /* relatively benign */
832 if (todo
== INIT
) { /* load and take auto-take sets */
836 * During the boot sequence we need to update the mediator
837 * records, however this depends upon the rpc.metamedd
838 * running. So, in order to not introduce a delay in the
839 * boot time, fork a new process to do this work in the
843 if (pid
== (pid_t
)-1) {
845 * We could not fork a child process to update mediator
846 * information on this node. There is no need to panic.
847 * We shall simply return 1.
849 mde_perror(ep
, "Could not fork a child process to"
850 " update mediator record");
852 } else if (pid
== (pid_t
)0) {
854 if (meta_mediator_info_from_file(NULL
, 0, ep
) == 1) {
856 * No need to print any error messages.
857 * All the error messages are printed in the
858 * library routine itself.
868 } else if (todo
== ALL
) { /* initialize all devices in md.tab */
869 eval
= init_all(&sp
, options
, called_thru_rpc
, ep
);
870 } else { /* initialize the named device */
872 if (init_name(&sp
, argc
, argv
, options
, called_thru_rpc
,
875 * If we're dealing with MN metadevices and we are
876 * directly called, then the appropriate error message
877 * has already been displayed. So just exit.
879 if (meta_is_mn_set(sp
, dummyep
) && (!called_thru_rpc
)) {
890 /* update md.cf, return success */
891 if (meta_update_md_cf(sp
, ep
) != 0) {