4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
28 * Md - is the meta-disk driver. It sits below the UFS file system
29 * but above the 'real' disk drivers, xy, id, sd etc.
31 * To the UFS software, md looks like a normal driver, since it has
32 * the normal kinds of entries in the bdevsw and cdevsw arrays. So
33 * UFS accesses md in the usual ways. In particular, the strategy
34 * routine, mdstrategy(), gets called by fbiwrite(), ufs_getapage(),
37 * Md maintains an array of minor devices (meta-partitions). Each
38 * meta partition stands for a matrix of real partitions, in rows
39 * which are not necessarily of equal length. Md maintains a table,
40 * with one entry for each meta-partition, which lists the rows and
41 * columns of actual partitions, and the job of the strategy routine
42 * is to translate from the meta-partition device and block numbers
43 * known to UFS into the actual partitions' device and block numbers.
45 * See below, in mdstrategy(), mdreal(), and mddone() for details of
50 * Driver for Virtual Disk.
54 #include <sys/sysmacros.h>
57 #include <sys/errno.h>
58 #include <sys/param.h>
59 #include <sys/systm.h>
64 #include <sys/cmn_err.h>
66 #include <sys/sunddi.h>
67 #include <sys/debug.h>
68 #include <sys/utsname.h>
69 #include <sys/lvm/mdvar.h>
70 #include <sys/lvm/md_names.h>
71 #include <sys/lvm/md_mddb.h>
72 #include <sys/lvm/md_sp.h>
73 #include <sys/types.h>
75 #include <sys/cladm.h>
76 #include <sys/priv_names.h>
77 #include <sys/modhash.h>
79 int md_init_debug
= 0; /* module binding debug */
82 * Tunable to turn off the failfast behavior.
84 int md_ff_disable
= 0;
87 * dynamically allocated list of non FF driver names - needs to
88 * be freed when md is detached.
90 char **non_ff_drivers
= NULL
;
92 md_krwlock_t md_unit_array_rw
; /* protects all unit arrays */
93 md_krwlock_t nm_lock
; /* protects all the name spaces */
95 md_resync_t md_cpr_resync
;
97 extern char svm_bootpath
[];
98 #define SVM_PSEUDO_STR "/pseudo/md@0:"
100 #define VERSION_LENGTH 6
101 #define VERSION "1.0"
104 * Keep track of possible 'orphan' entries in the name space
106 int *md_nm_snarfed
= NULL
;
109 * Global tunable giving the percentage of free space left in replica during
110 * conversion of non-devid style replica to devid style replica.
112 int md_conv_perc
= MDDB_DEVID_CONV_PERC
;
115 /* debug code to verify framework exclusion guarantees */
117 kmutex_t md_in_mx
; /* used to md global stuff */
120 #define IN_ATTACH 0x04
121 #define IN_DETACH 0x08
123 #define MD_SET_IN(x) { \
124 mutex_enter(&md_in_mx); \
126 debug_enter("MD_SET_IN exclusion lost"); \
128 debug_enter("MD_SET_IN already set"); \
130 mutex_exit(&md_in_mx); \
133 #define MD_CLR_IN(x) { \
134 mutex_enter(&md_in_mx); \
136 debug_enter("MD_CLR_IN exclusion lost"); \
138 debug_enter("MD_CLR_IN already clr"); \
140 mutex_exit(&md_in_mx); \
146 hrtime_t savetime1
, savetime2
;
150 * list things protected by md_mx even if they aren't
153 kmutex_t md_mx
; /* used to md global stuff */
154 kcondvar_t md_cv
; /* md_status events */
155 int md_status
= 0; /* global status for the meta-driver */
156 int md_num_daemons
= 0;
157 int md_ioctl_cnt
= 0;
158 int md_mtioctl_cnt
= 0; /* multithreaded ioctl cnt */
159 uint_t md_mdelay
= 10; /* variable so can be patched */
161 int (*mdv_strategy_tstpnt
)(buf_t
*, int, void*);
163 major_t md_major
, md_major_targ
;
165 unit_t md_nunits
= MD_MAXUNITS
;
166 set_t md_nsets
= MD_MAXSETS
;
168 char *md_med_trans_lst
= NULL
;
169 md_set_t md_set
[MD_MAXSETS
];
170 md_set_io_t md_set_io
[MD_MAXSETS
];
172 md_krwlock_t hsp_rwlp
; /* protects hot_spare_interface */
173 md_krwlock_t ni_rwlp
; /* protects notify_interface */
174 md_ops_t
**md_ops
= NULL
;
175 ddi_modhandle_t
*md_mods
= NULL
;
176 md_ops_t
*md_opslist
;
178 md_event_queue_t
*md_event_queue
= NULL
;
181 int md_keep_repl_state
;
182 int md_devid_destroy
;
184 /* for sending messages thru a door to userland */
185 door_handle_t mdmn_door_handle
= NULL
;
186 int mdmn_door_did
= -1;
188 dev_info_t
*md_devinfo
= NULL
;
190 md_mn_nodeid_t md_mn_mynode_id
= ~0u; /* My node id (for multi-node sets) */
192 static uint_t md_ocnt
[OTYPCNT
];
194 static int mdinfo(dev_info_t
*, ddi_info_cmd_t
, void *, void **);
195 static int mdattach(dev_info_t
*, ddi_attach_cmd_t
);
196 static int mddetach(dev_info_t
*, ddi_detach_cmd_t
);
197 static int mdopen(dev_t
*, int, int, cred_t
*);
198 static int mdclose(dev_t
, int, int, cred_t
*);
199 static int mddump(dev_t
, caddr_t
, daddr_t
, int);
200 static int mdread(dev_t
, struct uio
*, cred_t
*);
201 static int mdwrite(dev_t
, struct uio
*, cred_t
*);
202 static int mdaread(dev_t
, struct aio_req
*, cred_t
*);
203 static int mdawrite(dev_t
, struct aio_req
*, cred_t
*);
204 static int mdioctl(dev_t
, int, intptr_t, int, cred_t
*, int *);
205 static int mdprop_op(dev_t
, dev_info_t
*,
206 ddi_prop_op_t
, int, char *, caddr_t
, int *);
208 static struct cb_ops md_cb_ops
= {
211 mdstrategy
, /* strategy */
212 /* print routine -- none yet */
213 (int(*)(dev_t
, char *))nulldev
,
219 (int(*)(dev_t
, devmap_cookie_t
, offset_t
, size_t, size_t *,
222 (int(*)(dev_t
, off_t
, int))nodev
,
224 (int(*)(dev_t
, off_t
, struct as
*, caddr_t
*, off_t
, unsigned,
225 unsigned, unsigned, cred_t
*))nodev
,
227 mdprop_op
, /* prop_op */
229 (D_64BIT
|D_MP
|D_NEW
), /* driver compatibility flag */
230 CB_REV
, /* cb_ops version */
232 mdawrite
, /* awrite */
235 static struct dev_ops md_devops
= {
236 DEVO_REV
, /* dev_ops version */
237 0, /* device reference count */
238 mdinfo
, /* info routine */
239 nulldev
, /* identify routine */
240 nulldev
, /* probe - not defined */
241 mdattach
, /* attach routine */
242 mddetach
, /* detach routine */
243 nodev
, /* reset - not defined */
244 &md_cb_ops
, /* driver operations */
245 NULL
, /* bus operations */
246 nodev
, /* power management */
247 ddi_quiesce_not_needed
, /* quiesce */
251 * loadable module wrapper
253 #include <sys/modctl.h>
255 static struct modldrv modldrv
= {
256 &mod_driverops
, /* type of module -- a pseudodriver */
257 "Solaris Volume Manager base module", /* name of the module */
258 &md_devops
, /* driver ops */
261 static struct modlinkage modlinkage
= {
269 extern void med_init(void);
270 extern void med_fini(void);
271 extern void md_devid_cleanup(set_t
, uint_t
);
274 extern struct nm_next_hdr
*get_first_record(set_t
, int, int);
276 int md_maxphys
= 0; /* maximum io size in bytes */
277 #define MD_MAXBCOUNT (1024 * 1024)
278 unsigned md_maxbcount
= 0; /* maximum physio size in bytes */
281 * Some md ioctls trigger io framework device tree operations. An
282 * example is an md ioctl that calls md_resolve_bydevid(), which uses the
283 * io framework to resolve a devid. Such operations result in acquiring
284 * io framework locks (like ndi_devi_enter() of "/") while holding
285 * driver locks (like md_unit_writerlock()).
287 * The prop_op(9E) entry point is called from the devinfo driver with
288 * an active ndi_devi_enter of "/". To avoid deadlock, md's prop_op
289 * implementation must avoid taking a lock that is held per above md
290 * ioctl description: i.e. mdprop_op(9E) can't call md_unit_readerlock()
291 * without risking deadlock.
293 * To service "size" requests without risking deadlock, we maintain a
294 * "mnum->nblocks" sizemap (protected by a short-term global mutex).
296 static kmutex_t md_nblocks_mutex
;
297 static mod_hash_t
*md_nblocksmap
; /* mnum -> nblocks */
298 int md_nblocksmap_size
= 512;
301 * Maintain "mnum->nblocks" sizemap for mdprop_op use:
303 * Create: any code that establishes a unit's un_total_blocks needs the
304 * following type of call to establish nblocks for mdprop_op():
305 * md_nblocks_set(mnum, un->c.un_total_blocks);
306 * NOTE: locate via cscope md_create_minor_node/md_create_unit_incore
307 * ...or "MD_UNIT..*="
309 * Change: any code that changes a unit's un_total_blocks needs the
310 * following type of call to sync nblocks for mdprop_op():
311 * md_nblocks_set(mnum, un->c.un_total_blocks);
312 * NOTE: locate via cscope for "un_total_blocks[ \t]*="
314 * Destroy: any code that deletes a unit needs the following type of call
315 * to sync nblocks for mdprop_op():
316 * md_nblocks_set(mnum, -1ULL);
317 * NOTE: locate via cscope md_remove_minor_node/md_destroy_unit_incore
318 * ...or "MD_UNIT..*="
/*
 * md_nblocks_set: maintain the mnum->nblocks sizemap (md_nblocksmap).
 *
 *	mnum	- minor number, used (cast through intptr_t) as the hash key
 *	nblocks	- value to record for mnum; -1ULL is the sentinel that
 *		  selects the mod_hash_destroy() call for the key
 *
 * Both hash calls are made with md_nblocks_mutex held, and their return
 * values are deliberately discarded with (void) casts.
 *
 * NOTE(review): the keyword separating the destroy call from the replace
 * call is not visible in this view; presumably the replace is the else
 * arm of the -1ULL test -- confirm against the full file.
 */
321 md_nblocks_set(minor_t mnum
, uint64_t nblocks
)
323 mutex_enter(&md_nblocks_mutex
);
/* sentinel check: -1ULL requests removal of the mnum entry */
324 if (nblocks
== -1ULL)
325 (void) mod_hash_destroy(md_nblocksmap
,
326 (mod_hash_key_t
)(intptr_t)mnum
);
/* store nblocks as the hash value for mnum */
328 (void) mod_hash_replace(md_nblocksmap
,
329 (mod_hash_key_t
)(intptr_t)mnum
,
330 (mod_hash_val_t
)(intptr_t)nblocks
);
331 mutex_exit(&md_nblocks_mutex
);
334 /* get the size of a mnum from "mnum->nblocks" sizemap */
/*
 * md_nblocks_get: look up the recorded size for a minor number.
 *
 *	mnum	- minor number, used (cast through intptr_t) as the hash key
 *
 * The lookup runs with md_nblocks_mutex held.  When mod_hash_find()
 * returns 0 the stored value is cast back to uint64_t and returned after
 * the mutex is dropped.  On any other result the mutex is dropped as well.
 *
 * NOTE(review): the value returned when the key is not found is not
 * visible in this view -- confirm against the full file.
 */
336 md_nblocks_get(minor_t mnum
)
340 mutex_enter(&md_nblocks_mutex
);
341 if (mod_hash_find(md_nblocksmap
,
342 (mod_hash_key_t
)(intptr_t)mnum
, &hv
) == 0) {
/* hit: release the mutex before returning the stored value */
343 mutex_exit(&md_nblocks_mutex
);
344 return ((uint64_t)(intptr_t)hv
);
346 mutex_exit(&md_nblocks_mutex
);
350 /* allocate/free dynamic space associated with driver globals */
/*
 * md_global_alloc_free: set up or tear down the driver-wide locks.
 *
 *	alloc	- selects the phase.  NOTE(review): the test on alloc is
 *		  not visible in this view; callers in this file pass 1
 *		  with the comment "allocate" and 0 with the comment
 *		  "free", so presumably non-zero initialises and zero
 *		  destroys -- confirm.
 *
 * Initialisation phase (as visible here):
 *	- md_cv, md_mx
 *	- the rwlocks embedded in md_unit_array_rw, nm_lock, ni_rwlp and
 *	  hsp_rwlp (the last two use RW_DRIVER, the others RW_DEFAULT)
 *	- md_cpr_resync.md_resync_mutex and md_nblocks_mutex
 *	- for every set index below MD_MAXSETS: md_set[s].s_dbmx,
 *	  md_set_io[s].md_io_mx and md_set_io[s].md_io_cv
 *
 * Destroy phase: the visible calls undo the above in reverse order,
 * per-set objects first, then the global locks.
 */
352 md_global_alloc_free(int alloc
)
357 /* initialize driver global locks */
358 cv_init(&md_cv
, NULL
, CV_DEFAULT
, NULL
);
359 mutex_init(&md_mx
, NULL
, MUTEX_DEFAULT
, NULL
);
360 rw_init(&md_unit_array_rw
.lock
, NULL
, RW_DEFAULT
, NULL
);
361 rw_init(&nm_lock
.lock
, NULL
, RW_DEFAULT
, NULL
);
362 rw_init(&ni_rwlp
.lock
, NULL
, RW_DRIVER
, NULL
);
363 rw_init(&hsp_rwlp
.lock
, NULL
, RW_DRIVER
, NULL
);
364 mutex_init(&md_cpr_resync
.md_resync_mutex
, NULL
,
365 MUTEX_DEFAULT
, NULL
);
366 mutex_init(&md_nblocks_mutex
, NULL
, MUTEX_DEFAULT
, NULL
);
368 /* initialize per set driver global locks */
369 for (s
= 0; s
< MD_MAXSETS
; s
++) {
370 /* initialize per set driver globals locks */
371 mutex_init(&md_set
[s
].s_dbmx
,
372 NULL
, MUTEX_DEFAULT
, NULL
);
373 mutex_init(&md_set_io
[s
].md_io_mx
,
374 NULL
, MUTEX_DEFAULT
, NULL
);
375 cv_init(&md_set_io
[s
].md_io_cv
,
376 NULL
, CV_DEFAULT
, NULL
);
379 /* destroy per set driver global locks */
380 for (s
= 0; s
< MD_MAXSETS
; s
++) {
381 cv_destroy(&md_set_io
[s
].md_io_cv
);
382 mutex_destroy(&md_set_io
[s
].md_io_mx
);
383 mutex_destroy(&md_set
[s
].s_dbmx
);
386 /* destroy driver global locks */
387 mutex_destroy(&md_nblocks_mutex
);
388 mutex_destroy(&md_cpr_resync
.md_resync_mutex
);
389 rw_destroy(&hsp_rwlp
.lock
);
390 rw_destroy(&ni_rwlp
.lock
);
391 rw_destroy(&nm_lock
.lock
);
392 rw_destroy(&md_unit_array_rw
.lock
);
393 mutex_destroy(&md_mx
);
406 /* allocate dynamic space associated with driver globals */
407 md_global_alloc_free(1);
409 /* initialize driver globals */
410 md_major
= ddi_name_to_major("md");
411 md_hz
= drv_usectohz(NUM_USEC_IN_SEC
);
413 /* initialize tunable globals */
414 if (md_maxphys
== 0) /* maximum io size in bytes */
415 md_maxphys
= maxphys
;
416 if (md_maxbcount
== 0) /* maximum physio size in bytes */
417 md_maxbcount
= MD_MAXBCOUNT
;
419 /* initialize per set driver globals */
420 for (s
= 0; s
< MD_MAXSETS
; s
++)
421 md_set_io
[s
].io_state
= MD_SET_ACTIVE
;
424 * NOTE: the framework does not currently guarantee exclusion
425 * between _init and attach after calling mod_install.
428 if ((err
= mod_install(&modlinkage
))) {
430 md_global_alloc_free(0); /* free dynamic space */
442 * NOTE: the framework currently does not guarantee exclusion
443 * with attach until after mod_remove returns 0.
445 if ((err
= mod_remove(&modlinkage
)))
449 md_global_alloc_free(0); /* free dynamic space */
/*
 * _info: loadable-module information entry point.
 *
 *	modinfop - caller-supplied modinfo structure, passed through
 *
 * Forwards to mod_info() with this driver's modlinkage and returns
 * whatever mod_info() returns.
 */
455 _info(struct modinfo
*modinfop
)
457 return (mod_info(&modlinkage
, modinfop
));
/*
 * mdattach: attach(9E) entry point for the md pseudo driver.
 *
 *	dip	- devinfo node being attached
 *	cmd	- attach command; anything other than DDI_ATTACH fails
 *
 * Returns DDI_SUCCESS or DDI_FAILURE.  Every visible return is preceded
 * by MD_CLR_IN(IN_ATTACH), matching the MD_SET_IN(IN_ATTACH) at entry.
 *
 * Visible steps, in order:
 *	- reset md_keep_repl_state and md_devid_destroy to 0
 *	- fail if cmd is not DDI_ATTACH or md_devinfo is already non-NULL
 *	- md_start_daemons(TRUE); on failure mddb_unload() and fail
 *	- clear MD_GBL_HALTED
 *	- read the md_init_debug, md_ff_disable and md_nmedh properties;
 *	  md_nmedh falls back to MED_DEF_HOSTS when it is <= 0 or above
 *	  MED_MAX_HOSTS
 *	- read md_med_trans_lst, falling back to md_strdup("tcp")
 *	- allocate md_ops and md_mods (MD_NOPS entries each)
 *	- when md_xlate_ver matches VERSION, load the md_xlate tuple table
 *	  and build md_major_tuple_table from md_targ_nm_table
 *	- read md_keep_repl_state and md_devid_destroy via ddi_getprop()
 *	- create the "admin" minor node and the DDI_KERNEL_IOCTL and
 *	  "ddi-abrwrite-supported" properties
 *	- reset md_nunits/md_nsets and allocate md_set[0].s_un and s_ui
 *	  when they are NULL
 *	- compare svm_bootpath against SVM_PSEUDO_STR and, per the
 *	  in-function commentary, create the root metadevice minor node
 *	- create the md_nblocksmap id hash
 *
 * The trailing failure path calls mddetach(dip, DDI_DETACH) and warns if
 * that does not return DDI_SUCCESS.
 *
 * NOTE(review): when md_create_minor_node(0, mnum) fails, md_set[0].s_un
 * and md_set[0].s_ui are passed to kmem_free() and no visible line resets
 * them to NULL.  mddetach() frees md_set[s].s_un/s_ui whenever they are
 * non-NULL.  If the elided lines jump to the failure path that calls
 * mddetach(), those arrays would be freed twice -- confirm against the
 * full file.
 *
 * NOTE(review): the md_xlate_ver lookup sets len to sizeof (char) * 5
 * while ver is declared with VERSION_LENGTH elements -- confirm the
 * smaller length is intentional.
 */
462 mdattach(dev_info_t
*dip
, ddi_attach_cmd_t cmd
)
467 char ver
[VERSION_LENGTH
];
468 char **maj_str_array
;
471 MD_SET_IN(IN_ATTACH
);
473 md_keep_repl_state
= 0;
474 md_devid_destroy
= 0;
476 if (cmd
!= DDI_ATTACH
) {
477 MD_CLR_IN(IN_ATTACH
);
478 return (DDI_FAILURE
);
481 if (md_devinfo
!= NULL
) {
482 MD_CLR_IN(IN_ATTACH
);
483 return (DDI_FAILURE
);
488 if (md_start_daemons(TRUE
)) {
489 MD_CLR_IN(IN_ATTACH
);
490 mddb_unload(); /* undo mddb_init() allocations */
491 return (DDI_FAILURE
);
494 /* clear the halted state */
495 md_clr_status(MD_GBL_HALTED
);
497 /* see if the diagnostic switch is on */
498 if (ddi_prop_get_int(DDI_DEV_T_ANY
, dip
,
499 DDI_PROP_DONTPASS
, "md_init_debug", 0))
502 /* see if the failfast disable switch is on */
503 if (ddi_prop_get_int(DDI_DEV_T_ANY
, dip
,
504 DDI_PROP_DONTPASS
, "md_ff_disable", 0))
507 /* try and get the md_nmedh property */
508 md_nmedh
= ddi_prop_get_int(DDI_DEV_T_ANY
, dip
,
509 DDI_PROP_DONTPASS
, "md_nmedh", MED_DEF_HOSTS
);
510 if ((md_nmedh
<= 0) || (md_nmedh
> MED_MAX_HOSTS
))
511 md_nmedh
= MED_DEF_HOSTS
;
513 /* try and get the md_med_trans_lst property */
515 if (ddi_prop_op(DDI_DEV_T_ANY
, dip
, PROP_LEN
,
516 0, "md_med_trans_lst", NULL
, &len
) != DDI_PROP_SUCCESS
||
518 md_med_trans_lst
= md_strdup("tcp");
520 md_med_trans_lst
= kmem_zalloc((size_t)len
, KM_SLEEP
);
521 if (ddi_prop_op(DDI_DEV_T_ANY
, dip
, PROP_LEN_AND_VAL_BUF
,
522 0, "md_med_trans_lst", md_med_trans_lst
, &len
) !=
524 kmem_free(md_med_trans_lst
, (size_t)len
);
525 md_med_trans_lst
= md_strdup("tcp");
530 * Must initialize the internal data structures before the
531 * any possible calls to 'goto attach_failure' as _fini
532 * routine references them.
536 md_ops
= (md_ops_t
**)kmem_zalloc(
537 sizeof (md_ops_t
*) * MD_NOPS
, KM_SLEEP
);
538 md_mods
= (ddi_modhandle_t
*)kmem_zalloc(
539 sizeof (ddi_modhandle_t
) * MD_NOPS
, KM_SLEEP
);
541 /* try and get the md_xlate property */
542 /* Should we only do this if upgrade? */
543 len
= sizeof (char) * 5;
544 if (ddi_prop_op(DDI_DEV_T_ANY
, dip
, PROP_LEN_AND_VAL_BUF
,
545 0, "md_xlate_ver", ver
, &len
) == DDI_PROP_SUCCESS
) {
546 if (strcmp(ver
, VERSION
) == 0) {
548 if (ddi_prop_op(DDI_DEV_T_ANY
, dip
,
549 PROP_LEN_AND_VAL_ALLOC
, 0, "md_xlate",
550 (caddr_t
)&md_tuple_table
, &len
) !=
554 "md_xlate ddi_prop_op failed");
558 len
/(2 * ((int)sizeof (dev32_t
)));
562 /* Get target's name to major table */
563 if (ddi_prop_lookup_string_array(DDI_DEV_T_ANY
,
564 dip
, DDI_PROP_DONTPASS
,
565 "md_targ_nm_table", &maj_str_array
,
566 &md_majortab_len
) != DDI_PROP_SUCCESS
) {
569 cmn_err(CE_WARN
, "md_targ_nm_table "
570 "ddi_prop_lookup_string_array "
575 md_major_tuple_table
=
576 (struct md_xlate_major_table
*)
577 kmem_zalloc(md_majortab_len
*
578 sizeof (struct md_xlate_major_table
), KM_SLEEP
);
580 for (i
= 0; i
< md_majortab_len
; i
++) {
581 /* Getting major name */
582 str
= strchr(maj_str_array
[i
], ' ');
586 md_major_tuple_table
[i
].drv_name
=
587 md_strdup(maj_str_array
[i
]);
589 /* Simplified atoi to get major number */
591 md_major_tuple_table
[i
].targ_maj
= 0;
592 while ((*str2
>= '0') && (*str2
<= '9')) {
593 md_major_tuple_table
[i
].targ_maj
*= 10;
594 md_major_tuple_table
[i
].targ_maj
+=
599 ddi_prop_free((void *)maj_str_array
);
602 cmn_err(CE_WARN
, "md_xlate_ver is incorrect");
608 * Check for properties:
609 * md_keep_repl_state and md_devid_destroy
610 * and set globals if these exist.
612 md_keep_repl_state
= ddi_getprop(DDI_DEV_T_ANY
, dip
,
613 0, "md_keep_repl_state", 0);
615 md_devid_destroy
= ddi_getprop(DDI_DEV_T_ANY
, dip
,
616 0, "md_devid_destroy", 0);
619 md_major_targ
= md_targ_name_to_major("md");
623 /* allocate admin device node */
624 if (ddi_create_priv_minor_node(dip
, "admin", S_IFCHR
,
625 MD_ADM_MINOR
, DDI_PSEUDO
, 0, NULL
, PRIV_SYS_CONFIG
, 0640))
628 if (ddi_prop_create(DDI_DEV_T_NONE
, dip
, DDI_PROP_CANSLEEP
,
629 DDI_KERNEL_IOCTL
, NULL
, 0) != DDI_SUCCESS
)
632 if (ddi_prop_update_int(DDI_DEV_T_NONE
, dip
,
633 "ddi-abrwrite-supported", 1) != DDI_SUCCESS
)
636 /* these could have been cleared by a detach */
637 md_nunits
= MD_MAXUNITS
;
638 md_nsets
= MD_MAXSETS
;
640 sz
= sizeof (void *) * MD_MAXUNITS
;
641 if (md_set
[0].s_un
== NULL
)
642 md_set
[0].s_un
= kmem_zalloc(sz
, KM_SLEEP
);
643 if (md_set
[0].s_ui
== NULL
)
644 md_set
[0].s_ui
= kmem_zalloc(sz
, KM_SLEEP
);
649 * Only allocate device node for root mirror metadevice.
650 * Don't pre-allocate unnecessary device nodes (thus slowing down a
651 * boot when we attach).
652 * We can't read the mddbs in attach. The mddbs will be read
653 * by metainit during the boot process when it is doing the
654 * auto-take processing and any other minor nodes will be
655 * allocated at that point.
657 * There are two scenarios to be aware of here:
658 * 1) when we are booting from a mirrored root we need the root
659 * metadevice to exist very early (during vfs_mountroot processing)
660 * 2) we need all of the nodes to be created so that any mnttab entries
661 * will succeed (handled by metainit reading the mddb during boot).
663 if (strncmp(SVM_PSEUDO_STR
, svm_bootpath
, sizeof (SVM_PSEUDO_STR
) - 1)
669 * The svm_bootpath string looks something like
670 * /pseudo/md@0:0,150,blk where 150 is the minor number
671 * in this example so we need to set the pointer p onto
672 * the first digit of the minor number and convert it
675 for (p
= svm_bootpath
+ sizeof (SVM_PSEUDO_STR
) + 1;
676 *p
>= '0' && *p
<= '9'; p
++) {
681 if (md_create_minor_node(0, mnum
)) {
682 kmem_free(md_set
[0].s_un
, sz
);
683 kmem_free(md_set
[0].s_ui
, sz
);
688 /* create the hash to store the meta device sizes */
689 md_nblocksmap
= mod_hash_create_idhash("md_nblocksmap",
690 md_nblocksmap_size
, mod_hash_null_valdtor
);
692 MD_CLR_IN(IN_ATTACH
);
693 return (DDI_SUCCESS
);
697 * Use our own detach routine to toss any stuff we allocated above.
698 * NOTE: detach will call md_halt to free the mddb_init allocations.
700 MD_CLR_IN(IN_ATTACH
);
701 if (mddetach(dip
, DDI_DETACH
) != DDI_SUCCESS
)
702 cmn_err(CE_WARN
, "detach from attach failed");
703 return (DDI_FAILURE
);
/*
 * mddetach: detach(9E) entry point for the md pseudo driver.
 *
 *	dip	- devinfo node being detached
 *	cmd	- detach command; anything other than DDI_DETACH fails
 *
 * Returns DDI_SUCCESS or DDI_FAILURE.  Every visible return is preceded
 * by MD_CLR_IN(IN_DETACH), matching the MD_SET_IN(IN_DETACH) at entry.
 *
 * Visible steps, in order:
 *	- if MD_GBL_HALTED is not set and check_active_locators() returns
 *	  0, call md_halt(MD_NO_GBL_LOCKS_HELD); a non-zero result logs a
 *	  notice and fails the detach
 *	- fail if MD_GBL_HALTED is still not set
 *	- for each set below md_nsets, free the s_un and s_ui arrays
 *	  (md_nunits pointers each) and reset them to NULL
 *	- free every string in non_ff_drivers, then the NULL-terminated
 *	  array itself
 *	- free md_med_trans_lst, md_mods and md_ops
 *	- remove all properties and minor nodes of dip
 *	- destroy the md_nblocksmap id hash
 *
 * NOTE(review): md_med_trans_lst is freed here with a size of
 * strlen() + 1, while mdattach() can allocate it with the property
 * length reported by ddi_prop_op() -- confirm the two sizes always agree.
 *
 * NOTE(review): no visible line resets md_mods or md_ops to NULL after
 * the kmem_free() calls -- confirm the elided lines do so.
 */
708 mddetach(dev_info_t
*dip
, ddi_detach_cmd_t cmd
)
710 extern int check_active_locators();
715 MD_SET_IN(IN_DETACH
);
718 if (cmd
!= DDI_DETACH
) {
719 MD_CLR_IN(IN_DETACH
);
720 return (DDI_FAILURE
);
724 * if we have not already halted yet we have no active config
725 * then automatically initiate a halt so we can detach.
727 if (!(md_get_status() & MD_GBL_HALTED
)) {
728 if (check_active_locators() == 0) {
730 * NOTE: a successful md_halt will have done the
731 * mddb_unload to free allocations done in mddb_init
733 if (md_halt(MD_NO_GBL_LOCKS_HELD
)) {
734 cmn_err(CE_NOTE
, "md:detach: "
735 "Could not halt Solaris Volume Manager");
736 MD_CLR_IN(IN_DETACH
);
737 return (DDI_FAILURE
);
741 /* fail detach if we have not halted */
742 if (!(md_get_status() & MD_GBL_HALTED
)) {
743 MD_CLR_IN(IN_DETACH
);
744 return (DDI_FAILURE
);
748 /* must be in halted state, this will be cleared on next attach */
749 ASSERT(md_get_status() & MD_GBL_HALTED
);
751 /* cleanup attach allocations and initializations */
754 sz
= sizeof (void *) * md_nunits
;
755 for (s
= 0; s
< md_nsets
; s
++) {
756 if (md_set
[s
].s_un
!= NULL
) {
757 kmem_free(md_set
[s
].s_un
, sz
);
758 md_set
[s
].s_un
= NULL
;
761 if (md_set
[s
].s_ui
!= NULL
) {
762 kmem_free(md_set
[s
].s_ui
, sz
);
763 md_set
[s
].s_ui
= NULL
;
770 if (non_ff_drivers
!= NULL
) {
773 for (i
= 0; non_ff_drivers
[i
] != NULL
; i
++)
774 kmem_free(non_ff_drivers
[i
],
775 strlen(non_ff_drivers
[i
]) + 1);
777 /* free i+1 entries because there is a null entry at list end */
778 kmem_free(non_ff_drivers
, (i
+ 1) * sizeof (char *));
779 non_ff_drivers
= NULL
;
782 if (md_med_trans_lst
!= NULL
) {
783 kmem_free(md_med_trans_lst
, strlen(md_med_trans_lst
) + 1);
784 md_med_trans_lst
= NULL
;
787 if (md_mods
!= NULL
) {
788 kmem_free(md_mods
, sizeof (ddi_modhandle_t
) * MD_NOPS
);
792 if (md_ops
!= NULL
) {
793 kmem_free(md_ops
, sizeof (md_ops_t
*) * MD_NOPS
);
798 len
= md_tuple_length
* (2 * ((int)sizeof (dev32_t
)));
805 * Undo what we did in mdattach, freeing resources
806 * and removing things we installed. The system
807 * framework guarantees we are not active with this devinfo
808 * node in any other entry points at this time.
810 ddi_prop_remove_all(dip
);
811 ddi_remove_minor_node(dip
, NULL
);
815 mod_hash_destroy_idhash(md_nblocksmap
);
819 MD_CLR_IN(IN_DETACH
);
820 return (DDI_SUCCESS
);
825 * Given the device number return the devinfo pointer
826 * given to md via mdattach()
/*
 * mdinfo: getinfo(9E) entry point.
 *
 *	dip	- devinfo node (no use of it is visible in this view)
 *	infocmd	- request type; DDI_INFO_DEVT2DEVINFO and
 *		  DDI_INFO_DEVT2INSTANCE cases are present
 *	arg	- request argument (no use of it is visible in this view)
 *	result	- out parameter; for DDI_INFO_DEVT2DEVINFO it receives
 *		  the global md_devinfo pointer
 *
 * The local error code starts out as DDI_FAILURE.
 *
 * NOTE(review): the switch statement, the DDI_INFO_DEVT2INSTANCE body
 * and the final return are not visible in this view.
 */
830 mdinfo(dev_info_t
*dip
, ddi_info_cmd_t infocmd
, void *arg
, void **result
)
832 int error
= DDI_FAILURE
;
835 case DDI_INFO_DEVT2DEVINFO
:
837 *result
= (void *)md_devinfo
;
842 case DDI_INFO_DEVT2INSTANCE
:
851 * property operation routine. return the number of blocks for the partition
852 * in question or forward the request to the property facilities.
/*
 * Parameter list and body of the prop_op(9E) entry point.
 *
 * NOTE(review): the line naming the function is not visible in this
 * view; the parameter types match the mdprop_op() prototype declared
 * earlier in the file -- confirm.
 *
 * The request is handed to ddi_prop_op_nblocks() unchanged, with the
 * block count taken from md_nblocks_get() for the minor number of dev.
 * That lookup uses only the mnum->nblocks sizemap, so no unit lock is
 * taken here.
 */
856 dev_t dev
, /* device number associated with device */
857 dev_info_t
*dip
, /* device info struct for this device */
858 ddi_prop_op_t prop_op
, /* property operator */
859 int mod_flags
, /* property flags */
860 char *name
, /* name of property */
861 caddr_t valuep
, /* where to put property value */
862 int *lengthp
) /* put length of property here */
864 return (ddi_prop_op_nblocks(dev
, dip
, prop_op
, mod_flags
,
865 name
, valuep
, lengthp
, md_nblocks_get(getminor(dev
))));
/*
 * snarf_user_data: walk the MDDB_USER records of a set and tag them.
 *
 *	setno	- set whose records are walked, starting from
 *		  mddb_makerecid(setno, 0)
 *
 * For each record returned by mddb_getnextrec():
 *	- records already carrying MD_PRV_GOTIT are tested first
 *	- the record status is fetched; MDDB_STALE is tested next
 *	- MDDB_NODATA records are marked MD_PRV_PENDDEL
 *	- anything left is asserted to be MDDB_OK and marked MD_PRV_GOTIT
 *
 * NOTE(review): the statements taken for the MD_PRV_GOTIT and MDDB_STALE
 * tests are not visible in this view; presumably they skip the record --
 * confirm.
 */
869 snarf_user_data(set_t setno
)
872 mddb_recstatus_t status
;
874 recid
= mddb_makerecid(setno
, 0);
875 while ((recid
= mddb_getnextrec(recid
, MDDB_USER
, 0)) > 0) {
876 if (mddb_getrecprivate(recid
) & MD_PRV_GOTIT
)
879 status
= mddb_getrecstatus(recid
);
880 if (status
== MDDB_STALE
)
883 if (status
== MDDB_NODATA
) {
884 mddb_setrecprivate(recid
, MD_PRV_PENDDEL
);
888 ASSERT(status
== MDDB_OK
);
890 mddb_setrecprivate(recid
, MD_PRV_GOTIT
);
/*
 * md_print_block_usage: warn about replicas too small for a conversion.
 *
 *	s	- in-core set whose replicas are examined
 *	blks	- number of additional blocks the caller needs
 *
 * max_blk_needed is computed as
 *	s->s_totalblkcnt - s->s_freeblkcnt + blks
 * and a CE_WARN summary is printed showing s->s_totalblkcnt and
 * (blks - s->s_freeblkcnt).
 *
 * Each locator below lbp->lb_loccnt is then visited.  Locators flagged
 * MDDB_F_DELETED are tested first.  The mb_blkcnt values along the
 * s->s_mbiarray[li] chain are added into ib.  When ib is smaller than
 * max_blk_needed, a "prefix/suffix" device name is built in a temporary
 * kmem_alloc() buffer of prefixlen + suffixlen + 2 bytes, reported with
 * the driver name, minor number, block number and the shortfall
 * (max_blk_needed - ib), and the buffer is freed again.
 *
 * NOTE(review): where ib is reset between locators is not visible in
 * this view -- confirm it is cleared per locator.
 *
 * NOTE(review): the format strings use %d, but blks and max_blk_needed
 * are declared uint_t, so the shortfall arguments are unsigned;
 * cmn_err(9F) documents %u for unsigned values -- confirm.
 *
 * NOTE(review): (blks - s->s_freeblkcnt) wraps if blks is smaller than
 * s->s_freeblkcnt; the visible caller only reaches this routine when its
 * block requirement exceeds s_freeblkcnt -- confirm for other callers.
 */
895 md_print_block_usage(mddb_set_t
*s
, uint_t blks
)
900 uint_t max_blk_needed
;
902 mddb_sidelocator_t
*slp
;
912 max_blk_needed
= s
->s_totalblkcnt
- s
->s_freeblkcnt
+ blks
;
914 cmn_err(CE_WARN
, "Blocks in Metadevice State Database: %d\n"
915 " Additional Blocks Needed: %d\n\n"
916 " Increase size of following replicas for\n"
917 " device relocatability by deleting listed\n"
918 " replica and re-adding replica with\n"
919 " increased size (see metadb(1M)):\n"
920 " Replica Increase By",
921 s
->s_totalblkcnt
, (blks
- s
->s_freeblkcnt
));
925 for (li
= 0; li
< lbp
->lb_loccnt
; li
++) {
926 if (lbp
->lb_locators
[li
].l_flags
& MDDB_F_DELETED
)
929 for (mbip
= s
->s_mbiarray
[li
]; mbip
!= NULL
;
930 mbip
= mbip
->mbi_next
) {
931 ib
+= (uint_t
)mbip
->mbi_mddb_mb
.mb_blkcnt
;
935 if (ib
< max_blk_needed
) {
936 slp
= &lbp
->lb_sidelocators
[s
->s_sideno
][li
];
937 drv_index
= slp
->l_drvnm_index
;
938 mddb_locatorblock2splitname(s
->s_lnp
, li
, s
->s_sideno
,
940 prefixlen
= SPN_PREFIX(&sn
).pre_len
;
941 suffixlen
= SPN_SUFFIX(&sn
).suf_len
;
942 alloc_sz
= (int)(prefixlen
+ suffixlen
+ 2);
943 name
= (char *)kmem_alloc(alloc_sz
, KM_SLEEP
);
944 (void) strncpy(name
, SPN_PREFIX(&sn
).pre_data
,
946 name
[prefixlen
] = '/';
947 suffix
= name
+ (prefixlen
+ 1);
948 (void) strncpy(suffix
, SPN_SUFFIX(&sn
).suf_data
,
950 name
[prefixlen
+ suffixlen
+ 1] = '\0';
952 " %s (%s:%d:%d) %d blocks",
953 name
, lbp
->lb_drvnm
[drv_index
].dn_data
,
954 slp
->l_mnum
, lbp
->lb_locators
[li
].l_blkno
,
955 (max_blk_needed
- ib
));
956 kmem_free(name
, alloc_sz
);
962 * md_create_minor_node:
963 * Create the minor device for the given set and un_self_id.
967 * mnum - selfID of unit
972 * Returns 0 for success, 1 for failure.
/*
 * md_create_minor_node: create the block and raw minor nodes of a unit.
 *
 *	setno	- set number; must be below MD_MAXSETS
 *	mnum	- minor number; MD_MIN2UNIT(mnum) must be below MD_MAXUNITS
 *
 * Two nodes are created under md_devinfo, both with minor number
 * MD_MKMIN(setno, mnum) and node type DDI_PSEUDO:
 *	"<setno>,<unit>,blk"	as S_IFBLK
 *	"<setno>,<unit>,raw"	as S_IFCHR
 * Each name is formatted with snprintf() using a limit of 20 bytes.
 *
 * NOTE(review): the declaration of name and the return statements are
 * not visible in this view; confirm the buffer really holds 20 bytes and
 * that a failure creating the raw node does not leave the blk node behind
 * unexpectedly.
 */
978 md_create_minor_node(set_t setno
, minor_t mnum
)
982 /* Check for valid arguments */
983 if (setno
>= MD_MAXSETS
|| MD_MIN2UNIT(mnum
) >= MD_MAXUNITS
)
/* block device node */
986 (void) snprintf(name
, 20, "%u,%u,blk",
987 (unsigned)setno
, (unsigned)MD_MIN2UNIT(mnum
));
989 if (ddi_create_minor_node(md_devinfo
, name
, S_IFBLK
,
990 MD_MKMIN(setno
, mnum
), DDI_PSEUDO
, 0))
/* character (raw) device node, same minor number */
993 (void) snprintf(name
, 20, "%u,%u,raw",
994 (unsigned)setno
, (unsigned)MD_MIN2UNIT(mnum
));
996 if (ddi_create_minor_node(md_devinfo
, name
, S_IFCHR
,
997 MD_MKMIN(setno
, mnum
), DDI_PSEUDO
, 0))
1004 * For a given key check if it is an orphaned record.
1005 * The following conditions are used to determine an orphan.
1006 * 1. The device associated with that key is not a metadevice.
1007 * 2. If DEVID_STYLE then the physical device does not have a device Id
1008 * associated with it.
1010 * If a key does not have an entry in the devid namespace it could be
1011 * a device that does not support device ids. Hence the record is not
/*
 * md_verify_orphaned_record: decide whether a namespace key is an orphan.
 *
 *	setno	- set the key belongs to
 *	key	- namespace key being examined
 *
 * The set's in-core database handle is taken from md_set[setno].s_db and
 * the first record of the non-shared devid namespace is fetched with
 * get_first_record(setno, 1, NM_DEVID | NM_NOTSHARED).
 *
 * When the locator block carries MDDB_DEVID_STYLE, the device number is
 * resolved with md_getdevnum(..., MD_NOTRUST_DEVT).  A result of NODEV64,
 * or a device whose major number equals md_major (a metadevice), is
 * tested first; the key is then looked up in the devid namespace with
 * lookup_entry(..., NM_DEVID).
 *
 * NOTE(review): the values returned from each test are not visible in
 * this view, so the exact orphan/non-orphan encoding must be confirmed
 * against the full file.
 */
1016 md_verify_orphaned_record(set_t setno
, mdkey_t key
)
1018 md_dev64_t odev
; /* orphaned dev */
1021 struct nm_next_hdr
*did_nh
= NULL
;
1023 s
= (mddb_set_t
*)md_set
[setno
].s_db
;
1024 if ((did_nh
= get_first_record(setno
, 1, (NM_DEVID
| NM_NOTSHARED
)))
1028 * If devid style is set then get the dev_t using MD_NOTRUST_DEVT
1030 if (s
->s_lbp
->lb_flags
& MDDB_DEVID_STYLE
) {
1031 odev
= md_getdevnum(setno
, side
, key
, MD_NOTRUST_DEVT
);
1032 if ((odev
== NODEV64
) || (md_getmajor(odev
) == md_major
))
1034 if (lookup_entry(did_nh
, setno
, side
, key
, odev
, NM_DEVID
) ==
/*
 * md_snarf_db_set: load ("snarf") the configuration of one set from its
 * state database.
 *
 *	setno	- set to load
 *	ep	- error structure filled in through mdsyserror() and
 *		  mdmderror() on the visible failure paths
 *
 * The routine is bracketed by md_haltsnarf_enter(setno) and
 * md_haltsnarf_exit(setno).  Visible steps, in order:
 *	- under md_mx, test whether the set already has MD_SET_SNARFED
 *	- start the daemons when MD_GBL_DAEMONS_LIVE is not set; failure
 *	  reports ENXIO
 *	- load the devid namespace, then the primary namespace; if the
 *	  latter fails the devid namespace is unloaded again
 *	- decide whether a non-devid replica should be converted to devid
 *	  style, taking md_keep_repl_state, md_devid_destroy, cluster
 *	  configuration and multi-node sets into account
 *	- size the conversion with mddb_lb_did_convert(s, 0, ...) plus
 *	  md_nm_did_chkspace(), padded by md_conv_perc percent when that
 *	  tunable is within 1..100; convert when the result fits in
 *	  s->s_freeblkcnt, otherwise warn and call md_print_block_usage()
 *	- walk all records: flag the set MD_SET_STALE for stale records,
 *	  mark MDDB_NODATA records MD_PRV_PENDDEL, and load the submodule
 *	  named by md_getshared_name() for module record types
 *	- snarf_user_data()
 *	- allocate md_nm_snarfed with one int per key below r_next_key
 *	- call every ops->md_snarf(MD_SNARF_DOIT, setno); the visible
 *	  error path halts and unloads the set
 *	- recompute md_set[setno].s_un_avail and s_un_next over
 *	  MD_MAXUNITS slots
 *	- set MD_SET_SNARFED, commit records flagged MD_PRV_COMMIT, then
 *	  delete records flagged MD_PRV_DELETE; a failure of either marks
 *	  the set MD_SET_STALE
 *	- call every ops->md_snarf(MD_SNARF_CLEANUP, setno)
 *	- when the set is not stale, remove orphaned namespace entries
 *	  whose md_nm_snarfed slot is 0 and which
 *	  md_verify_orphaned_record() confirms
 *	- free md_nm_snarfed and run md_devid_cleanup()
 *
 * NOTE(review): md_nm_snarfed is a single file-scope pointer while this
 * routine runs per set; size and nh are assigned only when
 * get_first_record() returns non-NULL and are reused later under a
 * md_nm_snarfed != NULL guard -- confirm nothing else can set the
 * pointer in between.
 *
 * NOTE(review): return values and several branch keywords are not
 * visible in this view.
 */
1042 md_snarf_db_set(set_t setno
, md_error_t
*ep
)
1048 mddb_recstatus_t status
;
1053 struct nm_next_hdr
*nh
;
1054 mdkey_t key
= MD_KEYWILD
;
1060 int un_next_set
= 0;
1062 md_haltsnarf_enter(setno
);
1064 mutex_enter(&md_mx
);
1065 if (md_set
[setno
].s_status
& MD_SET_SNARFED
) {
1067 md_haltsnarf_exit(setno
);
1072 if (! (md_get_status() & MD_GBL_DAEMONS_LIVE
)) {
1073 if (md_start_daemons(TRUE
)) {
1075 (void) mdsyserror(ep
, ENXIO
);
1083 * Load the devid name space if it exists
1085 (void) md_load_namespace(setno
, NULL
, NM_DEVID
);
1086 if (!md_load_namespace(setno
, ep
, 0L)) {
1088 * Unload the devid namespace
1090 (void) md_unload_namespace(setno
, NM_DEVID
);
1096 * If replica is in non-devid state, convert if:
1097 * - not in probe during upgrade (md_keep_repl_state = 0)
1098 * - enough space available in replica
1100 * - not a multi-node diskset
1101 * - clustering is not present (for non-local set)
1103 s
= (mddb_set_t
*)md_set
[setno
].s_db
;
1105 if (!(s
->s_lbp
->lb_flags
& MDDB_DEVID_STYLE
) && !md_keep_repl_state
)
1107 if (cluster_bootflags
& CLUSTER_CONFIGURED
)
1108 if (setno
!= MD_LOCAL_SET
)
1110 if (MD_MNSET_SETNO(setno
))
1112 if ((md_devid_destroy
== 1) && (md_keep_repl_state
== 1))
1116 * if we weren't devid style before and md_keep_repl_state=1
1117 * we need to stay non-devid
1119 if ((md_keep_repl_state
== 1) &&
1120 ((s
->s_lbp
->lb_flags
& MDDB_DEVID_STYLE
) == 0))
1124 * Determine number of free blocks needed to convert
1125 * entire replica to device id format - locator blocks
1129 if (mddb_lb_did_convert(s
, 0, &cvt_blks
) != 0) {
1131 (void) mdsyserror(ep
, EIO
);
1136 cvt_blks
+= md_nm_did_chkspace(setno
);
1138 /* add MDDB_DEVID_CONV_PERC% */
1139 if ((md_conv_perc
> 0) && (md_conv_perc
<= 100)) {
1140 cvt_blks
= cvt_blks
* (100 + md_conv_perc
) / 100;
1143 if (cvt_blks
<= s
->s_freeblkcnt
) {
1144 if (mddb_lb_did_convert(s
, 1, &cvt_blks
) != 0) {
1146 (void) mdsyserror(ep
, EIO
);
1153 * Print message that replica can't be converted for
1154 * lack of space. No failure - just continue to
1155 * run without device ids.
1158 "Unable to add Solaris Volume Manager device "
1159 "relocation data.\n"
1160 " To use device relocation feature:\n"
1161 " - Increase size of listed replicas\n"
1163 md_print_block_usage(s
, cvt_blks
);
1165 "Loading set without device relocation data.\n"
1166 " Solaris Volume Manager disk movement "
1167 "not tracked in local set.");
1172 * go through and load any modules referenced in
1175 recid
= mddb_makerecid(setno
, 0);
1176 while ((recid
= mddb_getnextrec(recid
, MDDB_ALL
, 0)) > 0) {
1177 status
= mddb_getrecstatus(recid
);
1178 if (status
== MDDB_STALE
) {
1179 if (! (md_get_setstatus(setno
) & MD_SET_STALE
)) {
1180 md_set_setstatus(setno
, MD_SET_STALE
);
1182 "md: state database is stale");
1184 } else if (status
== MDDB_NODATA
) {
1185 mddb_setrecprivate(recid
, MD_PRV_PENDDEL
);
1188 drvrid
= mddb_getrectype1(recid
);
1189 if (drvrid
< MDDB_FIRST_MODID
)
1191 if (md_loadsubmod(setno
, md_getshared_name(setno
, drvrid
),
1193 cmn_err(CE_NOTE
, "md: could not load misc/%s",
1194 md_getshared_name(setno
, drvrid
));
1201 snarf_user_data(setno
);
1204 * Initialize the md_nm_snarfed array
1205 * this array is indexed by the key and
1206 * is set by md_getdevnum during the snarf time
1208 if ((nh
= get_first_record(setno
, 0, NM_NOTSHARED
)) != NULL
) {
1209 size
= (int)((((struct nm_rec_hdr
*)nh
->nmn_record
)->
1210 r_next_key
) * (sizeof (int)));
1211 md_nm_snarfed
= (int *)kmem_zalloc(size
, KM_SLEEP
);
1215 * go through and snarf until nothing gets added
1219 for (ops
= md_opslist
; ops
!= NULL
; ops
= ops
->md_next
) {
1220 if (ops
->md_snarf
!= NULL
) {
1221 retval
= ops
->md_snarf(MD_SNARF_DOIT
, setno
);
1224 /* Don't know the failed unit */
1225 (void) mdmderror(ep
, MDE_RR_ALLOC_ERROR
,
1227 (void) md_halt_set(setno
, MD_HALT_ALL
);
1228 (void) mddb_unload_set(setno
);
1229 md_haltsnarf_exit(setno
);
1239 * Set the first available slot and availability
1241 md_set
[setno
].s_un_avail
= 0;
1242 for (un
= 0; un
< MD_MAXUNITS
; un
++) {
1243 if (md_set
[setno
].s_un
[un
] != NULL
) {
1247 md_set
[setno
].s_un_next
= un
;
1250 md_set
[setno
].s_un_avail
++;
1254 md_set_setstatus(setno
, MD_SET_SNARFED
);
1256 recid
= mddb_makerecid(setno
, 0);
1257 while ((recid
= mddb_getnextrec(recid
, MDDB_ALL
, 0)) > 0) {
1258 privat
= mddb_getrecprivate(recid
);
1259 if (privat
& MD_PRV_COMMIT
) {
1260 if (mddb_commitrec(recid
)) {
1261 if (!(md_get_setstatus(setno
) & MD_SET_STALE
)) {
1262 md_set_setstatus(setno
, MD_SET_STALE
);
1264 "md: state database is stale");
1267 mddb_setrecprivate(recid
, MD_PRV_GOTIT
);
1271 /* Deletes must happen after all the commits */
1272 recid
= mddb_makerecid(setno
, 0);
1273 while ((recid
= mddb_getnextrec(recid
, MDDB_ALL
, 0)) > 0) {
1274 privat
= mddb_getrecprivate(recid
);
1275 if (privat
& MD_PRV_DELETE
) {
1276 if (mddb_deleterec(recid
)) {
1277 if (!(md_get_setstatus(setno
) & MD_SET_STALE
)) {
1278 md_set_setstatus(setno
, MD_SET_STALE
);
1280 "md: state database is stale");
1282 mddb_setrecprivate(recid
, MD_PRV_GOTIT
);
1284 recid
= mddb_makerecid(setno
, 0);
1289 * go through and clean up records until nothing gets cleaned up.
1293 for (ops
= md_opslist
; ops
!= NULL
; ops
= ops
->md_next
)
1294 if (ops
->md_snarf
!= NULL
)
1295 i
+= ops
->md_snarf(MD_SNARF_CLEANUP
, setno
);
1298 if (md_nm_snarfed
!= NULL
&&
1299 !(md_get_setstatus(setno
) & MD_SET_STALE
)) {
1301 * go thru and cleanup the namespace and the device id
1305 key
< ((struct nm_rec_hdr
*)nh
->nmn_record
)->r_next_key
;
1308 * Is the entry an 'orphan'?
1310 if (lookup_entry(nh
, setno
, side
, key
, NODEV64
, 0L) !=
1313 * If the value is not set then apparently
1314 * it is not part of the current configuration,
1315 * remove it this can happen when system panic
1316 * between the primary name space update and
1317 * the device id name space update
1319 if (md_nm_snarfed
[key
] == 0) {
1320 if (md_verify_orphaned_record(setno
,
1322 (void) remove_entry(nh
,
1329 if (md_nm_snarfed
!= NULL
) {
1331 * Done and free the memory
1333 kmem_free(md_nm_snarfed
, size
);
1334 md_nm_snarfed
= NULL
;
1337 if (s
->s_lbp
->lb_flags
& MDDB_DEVID_STYLE
&&
1338 !(md_get_setstatus(setno
) & MD_SET_STALE
)) {
1340 * if the destroy flag has been set and
1341 * the MD_SET_DIDCLUP bit is not set in
1342 * the set's status field, cleanup the
1343 * entire device id namespace
1345 if (md_devid_destroy
&&
1346 !(md_get_setstatus(setno
) & MD_SET_DIDCLUP
)) {
1347 (void) md_devid_cleanup(setno
, 1);
1348 md_set_setstatus(setno
, MD_SET_DIDCLUP
);
1350 (void) md_devid_cleanup(setno
, 0);
1354 * clear single threading on snarf, return success or error
1357 md_haltsnarf_exit(setno
);
1362 get_minfo(struct dk_minfo
*info
, minor_t mnum
)
1367 info
->dki_capacity
= 0;
1368 info
->dki_lbsize
= 0;
1369 info
->dki_media_type
= 0;
1371 if ((ui
= MDI_UNIT(mnum
)) == NULL
) {
1374 un
= (md_unit_t
*)md_unit_readerlock(ui
);
1375 info
->dki_capacity
= un
->c
.un_total_blocks
;
1376 md_unit_readerexit(ui
);
1377 info
->dki_lbsize
= DEV_BSIZE
;
1378 info
->dki_media_type
= DK_UNKNOWN
;
1383 get_info(struct dk_cinfo
*info
, minor_t mnum
)
1386 * Controller Information
1388 info
->dki_ctype
= DKC_MD
;
1389 info
->dki_cnum
= ddi_get_instance(ddi_get_parent(md_devinfo
));
1390 (void) strcpy(info
->dki_cname
,
1391 ddi_get_name(ddi_get_parent(md_devinfo
)));
1395 info
->dki_unit
= mnum
;
1396 info
->dki_slave
= 0;
1397 (void) strcpy(info
->dki_dname
, ddi_driver_name(md_devinfo
));
1398 info
->dki_flags
= 0;
1399 info
->dki_partition
= 0;
1400 info
->dki_maxtransfer
= (ushort_t
)(md_maxphys
/ DEV_BSIZE
);
1403 * We can't get from here to there yet
1406 info
->dki_space
= 0;
1422 mutex_enter(&md_mx
);
1424 /* check type and flags */
1425 if ((otyp
!= OTYP_CHR
) && (otyp
!= OTYP_LYR
)) {
1429 if (((flag
& FEXCL
) && (md_status
& MD_GBL_OPEN
)) ||
1430 (md_status
& MD_GBL_EXCL
)) {
1435 /* count and flag open */
1437 md_status
|= MD_GBL_OPEN
;
1439 md_status
|= MD_GBL_EXCL
;
1441 /* unlock, return success */
1457 minor_t mnum
= getminor(*dev
);
1458 unit_t unit
= MD_MIN2UNIT(mnum
);
1459 set_t setno
= MD_MIN2SET(mnum
);
1460 mdi_unit_t
*ui
= NULL
;
1464 /* dispatch admin device opens */
1465 if (mnum
== MD_ADM_MINOR
)
1466 return (mdadminopen(flag
, otyp
));
1468 /* lock, check status */
1469 rw_enter(&md_unit_array_rw
.lock
, RW_READER
);
1472 if (md_get_status() & MD_GBL_HALTED
) {
1478 if ((setno
>= md_nsets
) || (unit
>= md_nunits
)) {
1483 /* make sure we're snarfed */
1484 if ((md_get_setstatus(MD_LOCAL_SET
) & MD_SET_SNARFED
) == 0) {
1485 if (md_snarf_db_set(MD_LOCAL_SET
, NULL
) != 0) {
1490 if ((md_get_setstatus(setno
) & MD_SET_SNARFED
) == 0) {
1496 if ((ui
= MDI_UNIT(mnum
)) == NULL
) {
1502 * The softpart open routine may do an I/O during the open, in
1503 * which case the open routine will set the OPENINPROGRESS flag
1504 * and drop all locks during the I/O. If this thread sees
1505 * the OPENINPROGRESS flag set, it should wait until the flag
1506 * is reset before calling the driver's open routine. It must
1507 * also revalidate the world after it grabs the unit_array lock
1508 * since the set may have been released or the metadevice cleared
1511 if (MD_MNSET_SETNO(setno
)) {
1512 mutex_enter(&ui
->ui_mx
);
1513 if (ui
->ui_lock
& MD_UL_OPENINPROGRESS
) {
1514 rw_exit(&md_unit_array_rw
.lock
);
1515 cv_wait(&ui
->ui_cv
, &ui
->ui_mx
);
1516 rw_enter(&md_unit_array_rw
.lock
, RW_READER
);
1517 mutex_exit(&ui
->ui_mx
);
1520 mutex_exit(&ui
->ui_mx
);
1523 /* Test if device is openable */
1524 if ((ui
->ui_tstate
& MD_NOTOPENABLE
) != 0) {
1529 /* don't allow opens w/WRITE flag if stale */
1530 if ((flag
& FWRITE
) && (md_get_setstatus(setno
) & MD_SET_STALE
)) {
1535 /* don't allow writes to subdevices */
1536 parent
= md_get_parent(md_expldev(*dev
));
1537 if ((flag
& FWRITE
) && MD_HAS_PARENT(parent
)) {
1542 /* open underlying driver */
1543 if (md_ops
[ui
->ui_opsindex
]->md_open
!= NULL
) {
1544 if ((err
= (*md_ops
[ui
->ui_opsindex
]->md_open
)
1545 (dev
, flag
, otyp
, cred_p
, 0)) != 0)
1549 /* or do it ourselves */
1552 (void) md_unit_openclose_enter(ui
);
1553 err
= md_unit_incopen(mnum
, flag
, otyp
);
1554 md_unit_openclose_exit(ui
);
1559 /* unlock, return status */
1561 rw_exit(&md_unit_array_rw
.lock
);
1566 * close admin device
1576 mutex_enter(&md_mx
);
1578 /* check type and flags */
1579 if ((otyp
< 0) || (otyp
>= OTYPCNT
)) {
1582 } else if (md_ocnt
[otyp
] == 0) {
1587 /* count and flag closed */
1588 if (otyp
== OTYP_LYR
)
1592 md_status
&= ~MD_GBL_OPEN
;
1593 for (i
= 0; (i
< OTYPCNT
); ++i
)
1594 if (md_ocnt
[i
] != 0)
1595 md_status
|= MD_GBL_OPEN
;
1596 if (! (md_status
& MD_GBL_OPEN
))
1597 md_status
&= ~MD_GBL_EXCL
;
1599 /* unlock, return success */
1615 minor_t mnum
= getminor(dev
);
1616 set_t setno
= MD_MIN2SET(mnum
);
1617 unit_t unit
= MD_MIN2UNIT(mnum
);
1618 mdi_unit_t
*ui
= NULL
;
1621 /* dispatch admin device closes */
1622 if (mnum
== MD_ADM_MINOR
)
1623 return (mdadminclose(otyp
));
1626 if ((setno
>= md_nsets
) || (unit
>= md_nunits
) ||
1627 ((ui
= MDI_UNIT(mnum
)) == NULL
)) {
1632 /* close underlying driver */
1633 if (md_ops
[ui
->ui_opsindex
]->md_close
!= NULL
) {
1634 if ((err
= (*md_ops
[ui
->ui_opsindex
]->md_close
)
1635 (dev
, flag
, otyp
, cred_p
, 0)) != 0)
1639 /* or do it ourselves */
1642 (void) md_unit_openclose_enter(ui
);
1643 err
= md_unit_decopen(mnum
, otyp
);
1644 md_unit_openclose_exit(ui
);
1649 /* return success */
1656 * This routine performs raw read operations. It is called from the
1657 * device switch at normal priority.
1659 * The main catch is that the *uio struct which is passed to us may
1660 * specify a read which spans two buffers, which would be contiguous
1661 * on a single partition, but not on a striped partition. This will
1662 * be handled by mdstrategy.
1666 mdread(dev_t dev
, struct uio
*uio
, cred_t
*credp
)
1672 if (((mnum
= getminor(dev
)) == MD_ADM_MINOR
) ||
1673 (MD_MIN2SET(mnum
) >= md_nsets
) ||
1674 (MD_MIN2UNIT(mnum
) >= md_nunits
) ||
1675 ((ui
= MDI_UNIT(mnum
)) == NULL
))
1678 if (md_ops
[ui
->ui_opsindex
]->md_read
!= NULL
)
1679 return ((*md_ops
[ui
->ui_opsindex
]->md_read
)
1682 if ((error
= md_chk_uio(uio
)) != 0)
1685 return (physio(mdstrategy
, NULL
, dev
, B_READ
, md_minphys
, uio
));
1689 * This routine performs async raw read operations. It is called from the
1690 * device switch at normal priority.
1692 * The main catch is that the *aio struct which is passed to us may
1693 * specify a read which spans two buffers, which would be contiguous
1694 * on a single partition, but not on a striped partition. This will
1695 * be handled by mdstrategy.
1699 mdaread(dev_t dev
, struct aio_req
*aio
, cred_t
*credp
)
1706 if (((mnum
= getminor(dev
)) == MD_ADM_MINOR
) ||
1707 (MD_MIN2SET(mnum
) >= md_nsets
) ||
1708 (MD_MIN2UNIT(mnum
) >= md_nunits
) ||
1709 ((ui
= MDI_UNIT(mnum
)) == NULL
))
1712 if (md_ops
[ui
->ui_opsindex
]->md_aread
!= NULL
)
1713 return ((*md_ops
[ui
->ui_opsindex
]->md_aread
)
1716 if ((error
= md_chk_uio(aio
->aio_uio
)) != 0)
1719 return (aphysio(mdstrategy
, anocancel
, dev
, B_READ
, md_minphys
, aio
));
1723 * This routine performs raw write operations. It is called from the
1724 * device switch at normal priority.
1726 * The main catch is that the *uio struct which is passed to us may
1727 * specify a write which spans two buffers, which would be contiguous
1728 * on a single partition, but not on a striped partition. This is
1729 * handled by mdstrategy.
1734 mdwrite(dev_t dev
, struct uio
*uio
, cred_t
*credp
)
1740 if (((mnum
= getminor(dev
)) == MD_ADM_MINOR
) ||
1741 (MD_MIN2SET(mnum
) >= md_nsets
) ||
1742 (MD_MIN2UNIT(mnum
) >= md_nunits
) ||
1743 ((ui
= MDI_UNIT(mnum
)) == NULL
))
1746 if (md_ops
[ui
->ui_opsindex
]->md_write
!= NULL
)
1747 return ((*md_ops
[ui
->ui_opsindex
]->md_write
)
1750 if ((error
= md_chk_uio(uio
)) != 0)
1753 return (physio(mdstrategy
, NULL
, dev
, B_WRITE
, md_minphys
, uio
));
1757 * This routine performs async raw write operations. It is called from the
1758 * device switch at normal priority.
1760 * The main catch is that the *aio struct which is passed to us may
1761 * specify a write which spans two buffers, which would be contiguous
1762 * on a single partition, but not on a striped partition. This is
1763 * handled by mdstrategy.
1768 mdawrite(dev_t dev
, struct aio_req
*aio
, cred_t
*credp
)
1775 if (((mnum
= getminor(dev
)) == MD_ADM_MINOR
) ||
1776 (MD_MIN2SET(mnum
) >= md_nsets
) ||
1777 (MD_MIN2UNIT(mnum
) >= md_nunits
) ||
1778 ((ui
= MDI_UNIT(mnum
)) == NULL
))
1781 if (md_ops
[ui
->ui_opsindex
]->md_awrite
!= NULL
)
1782 return ((*md_ops
[ui
->ui_opsindex
]->md_awrite
)
1785 if ((error
= md_chk_uio(aio
->aio_uio
)) != 0)
1788 return (aphysio(mdstrategy
, anocancel
, dev
, B_WRITE
, md_minphys
, aio
));
1792 mdstrategy(struct buf
*bp
)
1797 ASSERT((bp
->b_flags
& B_DONE
) == 0);
1800 md_clr_status(MD_GBL_DAEMONS_LIVE
);
1802 if (((mnum
= getminor(bp
->b_edev
)) == MD_ADM_MINOR
) ||
1803 (MD_MIN2SET(mnum
) >= md_nsets
) ||
1804 (MD_MIN2UNIT(mnum
) >= md_nunits
) ||
1805 ((ui
= MDI_UNIT(mnum
)) == NULL
)) {
1806 bp
->b_flags
|= B_ERROR
;
1807 bp
->b_error
= ENXIO
;
1808 bp
->b_resid
= bp
->b_bcount
;
1813 bp
->b_flags
&= ~(B_ERROR
| B_DONE
);
1814 if (md_ops
[ui
->ui_opsindex
]->md_strategy
!= NULL
) {
1815 (*md_ops
[ui
->ui_opsindex
]->md_strategy
) (bp
, 0, NULL
);
1817 (void) errdone(ui
, bp
, ENXIO
);
1823 * Return true if the ioctl is allowed to be multithreaded.
1824 * All the ioctls with MN are sent only from the message handlers through
1825 * rpc.mdcommd, which (via its own locking mechanism) takes care that no two
1826 * ioctls for the same metadevice are issued at the same time.
1827 * So we are safe here.
1828 * The other ioctls do not mess with any metadevice structures and therefore
1829 * are harmless too, if called multiple times at the same time.
1832 is_mt_ioctl(int cmd
) {
1835 case MD_IOCGUNIQMSGID
:
1836 case MD_IOCGVERSION
:
1838 case MD_MN_SET_MM_OWNER
:
1839 case MD_MN_SET_STATE
:
1840 case MD_MN_SUSPEND_WRITES
:
1841 case MD_MN_ALLOCATE_HOTSPARE
:
1842 case MD_MN_SET_SETFLAGS
:
1843 case MD_MN_GET_SETFLAGS
:
1844 case MD_MN_MDDB_OPTRECFIX
:
1845 case MD_MN_MDDB_PARSE
:
1846 case MD_MN_MDDB_BLOCK
:
1847 case MD_MN_DB_USERREQ
:
1848 case MD_IOC_SPSTATUS
:
1849 case MD_MN_COMMD_ERR
:
1850 case MD_MN_SET_COMMD_RUNNING
:
1853 case MD_MN_POKE_HOTSPARES
:
1854 case MD_MN_RR_DIRTY
:
1855 case MD_MN_RR_CLEAN
:
1856 case MD_MN_IOC_SPUPDATEWM
:
1864 * This routine implements the ioctl calls for the Virtual Disk System.
1865 * It is called from the device switch at normal priority.
1869 mdioctl(dev_t dev
, int cmd
, intptr_t data
, int mode
, cred_t
*cred_p
,
1872 minor_t mnum
= getminor(dev
);
1878 * For multinode disksets a number of ioctls are allowed to be
1880 * A fundamental assumption made in this implementation is that
1881 * ioctls either do not interact with other md structures or the
1882 * ioctl to the admin device can only occur if the metadevice
1883 * device is open. i.e. avoid a race between metaclear and the
1884 * progress of a multithreaded ioctl.
1887 if (!is_mt_ioctl(cmd
) && md_ioctl_lock_enter() == EINTR
) {
1892 * initialize lock tracker
1896 /* Flag to indicate that MD_GBL_IOCTL_LOCK is not acquired */
1898 if (is_mt_ioctl(cmd
)) {
1899 /* increment the md_mtioctl_cnt */
1900 mutex_enter(&md_mx
);
1903 lock
.l_flags
|= MD_MT_IOCTL
;
1907 * this has been added to prevent notification from re-snarfing
1908 * so metaunload will work. It may interfere with other modules
1911 if (md_get_status() & (MD_GBL_HALTED
| MD_GBL_DAEMONS_DIE
))
1912 return (IOLOCK_RETURN(ENXIO
, &lock
));
1915 * admin device ioctls
1917 if (mnum
== MD_ADM_MINOR
) {
1918 err
= md_admin_ioctl(md_expldev(dev
), cmd
, (void *) data
,
1925 else if ((MD_MIN2SET(mnum
) >= md_nsets
) ||
1926 (MD_MIN2UNIT(mnum
) >= md_nunits
) ||
1927 (md_set
[MD_MIN2SET(mnum
)].s_ui
== NULL
) ||
1928 ((ui
= MDI_UNIT(mnum
)) == NULL
)) {
1930 } else if (md_ops
[ui
->ui_opsindex
]->md_ioctl
== NULL
) {
1933 err
= (*md_ops
[ui
->ui_opsindex
]->md_ioctl
)
1934 (dev
, cmd
, (void *) data
, mode
, &lock
);
1938 * drop any locks we grabbed
1940 return (IOLOCK_RETURN_IOCTLEND(err
, &lock
));
1944 mddump(dev_t dev
, caddr_t addr
, daddr_t blkno
, int nblk
)
1950 if ((mnum
= getminor(dev
)) == MD_ADM_MINOR
)
1953 setno
= MD_MIN2SET(mnum
);
1955 if ((setno
>= md_nsets
) || (MD_MIN2UNIT(mnum
) >= md_nunits
) ||
1956 ((ui
= MDI_UNIT(mnum
)) == NULL
))
1960 if ((md_get_setstatus(setno
) & MD_SET_SNARFED
) == 0)
1963 if (md_ops
[ui
->ui_opsindex
]->md_dump
!= NULL
)
1964 return ((*md_ops
[ui
->ui_opsindex
]->md_dump
)
1965 (dev
, addr
, blkno
, nblk
));
1971 * Metadevice unit number dispatcher
1972 * When this routine is called it will scan the
1973 * incore unit array and return the next available
1974 * slot, and hence the unit number, to the caller
1976 * Return MD_UNITBAD if there is nothing available
1979 md_get_nextunit(set_t setno
)
1984 * If nothing available
1986 if (md_set
[setno
].s_un_avail
== 0) {
1987 return (MD_UNITBAD
);
1990 mutex_enter(&md_mx
);
1991 start
= un
= md_set
[setno
].s_un_next
;
1993 /* LINTED: E_CONSTANT_CONDITION */
1995 if (md_set
[setno
].s_un
[un
] == NULL
) {
1997 * Advance the starting index for the next
1998 * md_get_nextunit call
2000 if (un
== MD_MAXUNITS
- 1) {
2001 md_set
[setno
].s_un_next
= 0;
2003 md_set
[setno
].s_un_next
= un
+ 1;
2008 un
= ((un
== MD_MAXUNITS
- 1) ? 0 : un
+ 1);