4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
27 * LDoms virtual disk client (vdc) device driver
29 * This driver runs on a guest logical domain and communicates with the virtual
30 * disk server (vds) driver running on the service domain which is exporting
31 * virtualized "disks" to the guest logical domain.
33 * The driver can be divided into four sections:
35 * 1) generic device driver housekeeping
36 * _init, _fini, attach, detach, ops structures, etc.
38 * 2) communication channel setup
39 * Setup the communications link over the LDC channel that vdc uses to
40 * talk to the vDisk server. Initialise the descriptor ring which
41 * allows the LDC clients to transfer data via memory mappings.
43 * 3) Support exported to upper layers (filesystems, etc)
44 * The upper layers call into vdc via strategy(9E) and DKIO(7I)
45 * ioctl calls. vdc either copies the data to be written into the
46 * descriptor ring or maps the buffer that will receive the data read by
47 * the vDisk server into the descriptor ring. It then sends a message to the
48 * vDisk server requesting it to complete the operation.
50 * 4) Handling responses from vDisk server.
51 * The vDisk server will ACK some or all of the messages vdc sends to it
52 * (this is configured during the handshake). Upon receipt of an ACK
53 * vdc will check the descriptor ring and signal to the upper layer
54 * code waiting on the IO.
57 #include <sys/atomic.h>
62 #include <sys/efi_partition.h>
63 #include <sys/fcntl.h>
65 #include <sys/kstat.h>
66 #include <sys/mach_descrip.h>
67 #include <sys/modctl.h>
71 #include <sys/random.h>
74 #include <sys/sunddi.h>
75 #include <sys/types.h>
76 #include <sys/promif.h>
79 #include <sys/archsystm.h>
80 #include <sys/sysmacros.h>
83 #include <sys/dktp/fdisk.h>
84 #include <sys/dktp/dadkio.h>
85 #include <sys/fs/dv_node.h>
87 #include <sys/scsi/generic/sense.h>
88 #include <sys/scsi/impl/uscsi.h>
89 #include <sys/scsi/impl/services.h>
90 #include <sys/scsi/targets/sddef.h>
92 #include <sys/ldoms.h>
94 #include <sys/vio_common.h>
95 #include <sys/vio_mailbox.h>
96 #include <sys/vio_util.h>
97 #include <sys/vdsk_common.h>
98 #include <sys/vdsk_mailbox.h>
101 #define VD_OLDVTOC_LIMIT 0x7fffffff
104 * function prototypes
107 /* standard driver functions */
108 static int vdc_open(dev_t
*dev
, int flag
, int otyp
, cred_t
*cred
);
109 static int vdc_close(dev_t dev
, int flag
, int otyp
, cred_t
*cred
);
110 static int vdc_strategy(struct buf
*buf
);
111 static int vdc_print(dev_t dev
, char *str
);
112 static int vdc_dump(dev_t dev
, caddr_t addr
, daddr_t blkno
, int nblk
);
113 static int vdc_read(dev_t dev
, struct uio
*uio
, cred_t
*cred
);
114 static int vdc_write(dev_t dev
, struct uio
*uio
, cred_t
*cred
);
115 static int vdc_ioctl(dev_t dev
, int cmd
, intptr_t arg
, int mode
,
116 cred_t
*credp
, int *rvalp
);
117 static int vdc_aread(dev_t dev
, struct aio_req
*aio
, cred_t
*cred
);
118 static int vdc_awrite(dev_t dev
, struct aio_req
*aio
, cred_t
*cred
);
120 static int vdc_getinfo(dev_info_t
*dip
, ddi_info_cmd_t cmd
,
121 void *arg
, void **resultp
);
122 static int vdc_attach(dev_info_t
*dip
, ddi_attach_cmd_t cmd
);
123 static int vdc_detach(dev_info_t
*dip
, ddi_detach_cmd_t cmd
);
124 static int vdc_prop_op(dev_t dev
, dev_info_t
*dip
, ddi_prop_op_t prop_op
,
125 int mod_flags
, char *name
, caddr_t valuep
, int *lengthp
);
128 static void vdc_min(struct buf
*bufp
);
129 static int vdc_send(vdc_t
*vdc
, caddr_t pkt
, size_t *msglen
);
130 static int vdc_do_ldc_init(vdc_t
*vdc
, vdc_server_t
*srvr
);
131 static int vdc_start_ldc_connection(vdc_t
*vdc
);
132 static int vdc_create_device_nodes(vdc_t
*vdc
);
133 static int vdc_create_device_nodes_efi(vdc_t
*vdc
);
134 static int vdc_create_device_nodes_vtoc(vdc_t
*vdc
);
135 static void vdc_create_io_kstats(vdc_t
*vdc
);
136 static void vdc_create_err_kstats(vdc_t
*vdc
);
137 static void vdc_set_err_kstats(vdc_t
*vdc
);
138 static int vdc_get_md_node(dev_info_t
*dip
, md_t
**mdpp
,
139 mde_cookie_t
*vd_nodep
);
140 static int vdc_init_ports(vdc_t
*vdc
, md_t
*mdp
, mde_cookie_t vd_nodep
);
141 static void vdc_fini_ports(vdc_t
*vdc
);
142 static void vdc_switch_server(vdc_t
*vdcp
);
143 static int vdc_do_ldc_up(vdc_t
*vdc
);
144 static void vdc_terminate_ldc(vdc_t
*vdc
, vdc_server_t
*srvr
);
145 static int vdc_init_descriptor_ring(vdc_t
*vdc
);
146 static void vdc_destroy_descriptor_ring(vdc_t
*vdc
);
147 static int vdc_setup_devid(vdc_t
*vdc
);
148 static void vdc_store_label_efi(vdc_t
*, efi_gpt_t
*, efi_gpe_t
*);
149 static void vdc_store_label_vtoc(vdc_t
*, struct dk_geom
*,
151 static void vdc_store_label_unk(vdc_t
*vdc
);
152 static boolean_t
vdc_is_opened(vdc_t
*vdc
);
153 static void vdc_update_size(vdc_t
*vdc
, size_t, size_t, size_t);
154 static int vdc_update_vio_bsize(vdc_t
*vdc
, uint32_t);
156 /* handshake with vds */
157 static int vdc_init_ver_negotiation(vdc_t
*vdc
, vio_ver_t ver
);
158 static int vdc_ver_negotiation(vdc_t
*vdcp
);
159 static int vdc_init_attr_negotiation(vdc_t
*vdc
);
160 static int vdc_attr_negotiation(vdc_t
*vdcp
);
161 static int vdc_init_dring_negotiate(vdc_t
*vdc
);
162 static int vdc_dring_negotiation(vdc_t
*vdcp
);
163 static int vdc_send_rdx(vdc_t
*vdcp
);
164 static int vdc_rdx_exchange(vdc_t
*vdcp
);
165 static boolean_t
vdc_is_supported_version(vio_ver_msg_t
*ver_msg
);
167 /* processing incoming messages from vDisk server */
168 static void vdc_process_msg_thread(vdc_t
*vdc
);
169 static int vdc_recv(vdc_t
*vdc
, vio_msg_t
*msgp
, size_t *nbytesp
);
171 static uint_t
vdc_handle_cb(uint64_t event
, caddr_t arg
);
172 static int vdc_process_data_msg(vdc_t
*vdc
, vio_msg_t
*msg
);
173 static int vdc_handle_ver_msg(vdc_t
*vdc
, vio_ver_msg_t
*ver_msg
);
174 static int vdc_handle_attr_msg(vdc_t
*vdc
, vd_attr_msg_t
*attr_msg
);
175 static int vdc_handle_dring_reg_msg(vdc_t
*vdc
, vio_dring_reg_msg_t
*msg
);
176 static int vdc_send_request(vdc_t
*vdcp
, int operation
,
177 caddr_t addr
, size_t nbytes
, int slice
, diskaddr_t offset
,
178 buf_t
*bufp
, vio_desc_direction_t dir
, int flags
);
179 static int vdc_map_to_shared_dring(vdc_t
*vdcp
, int idx
);
180 static int vdc_populate_descriptor(vdc_t
*vdcp
, int operation
,
181 caddr_t addr
, size_t nbytes
, int slice
, diskaddr_t offset
,
182 buf_t
*bufp
, vio_desc_direction_t dir
, int flags
);
183 static int vdc_do_sync_op(vdc_t
*vdcp
, int operation
, caddr_t addr
,
184 size_t nbytes
, int slice
, diskaddr_t offset
,
185 vio_desc_direction_t dir
, boolean_t
);
186 static int vdc_do_op(vdc_t
*vdc
, int op
, caddr_t addr
, size_t nbytes
,
187 int slice
, diskaddr_t offset
, struct buf
*bufp
,
188 vio_desc_direction_t dir
, int flags
);
190 static int vdc_wait_for_response(vdc_t
*vdcp
, vio_msg_t
*msgp
);
191 static int vdc_drain_response(vdc_t
*vdcp
, struct buf
*buf
);
192 static int vdc_depopulate_descriptor(vdc_t
*vdc
, uint_t idx
);
193 static int vdc_populate_mem_hdl(vdc_t
*vdcp
, vdc_local_desc_t
*ldep
);
194 static int vdc_verify_seq_num(vdc_t
*vdc
, vio_dring_msg_t
*dring_msg
);
197 static int vd_process_ioctl(dev_t dev
, int cmd
, caddr_t arg
, int mode
,
199 static int vd_process_efi_ioctl(void *vdisk
, int cmd
, uintptr_t arg
);
200 static void vdc_create_fake_geometry(vdc_t
*vdc
);
201 static int vdc_validate_geometry(vdc_t
*vdc
);
202 static void vdc_validate(vdc_t
*vdc
);
203 static void vdc_validate_task(void *arg
);
204 static int vdc_null_copy_func(vdc_t
*vdc
, void *from
, void *to
,
206 static int vdc_get_wce_convert(vdc_t
*vdc
, void *from
, void *to
,
208 static int vdc_set_wce_convert(vdc_t
*vdc
, void *from
, void *to
,
210 static int vdc_get_vtoc_convert(vdc_t
*vdc
, void *from
, void *to
,
212 static int vdc_set_vtoc_convert(vdc_t
*vdc
, void *from
, void *to
,
214 static int vdc_get_extvtoc_convert(vdc_t
*vdc
, void *from
, void *to
,
216 static int vdc_set_extvtoc_convert(vdc_t
*vdc
, void *from
, void *to
,
218 static int vdc_get_geom_convert(vdc_t
*vdc
, void *from
, void *to
,
220 static int vdc_set_geom_convert(vdc_t
*vdc
, void *from
, void *to
,
222 static int vdc_get_efi_convert(vdc_t
*vdc
, void *from
, void *to
,
224 static int vdc_set_efi_convert(vdc_t
*vdc
, void *from
, void *to
,
227 static void vdc_ownership_update(vdc_t
*vdc
, int ownership_flags
);
228 static int vdc_access_set(vdc_t
*vdc
, uint64_t flags
);
229 static vdc_io_t
*vdc_eio_queue(vdc_t
*vdc
, int index
);
230 static void vdc_eio_unqueue(vdc_t
*vdc
, clock_t deadline
,
231 boolean_t complete_io
);
232 static int vdc_eio_check(vdc_t
*vdc
, int flags
);
233 static void vdc_eio_thread(void *arg
);
240 * Number of handshake retries with the current server before switching to
241 * a different server. These retries are done so that we stick with the same
242 * server if vdc receives a LDC reset event during the initiation of the
243 * handshake. This can happen if vdc resets the LDC channel and then immediately
244 * retries a connection before it has received the LDC reset event.
246 * If there is only one server then we "switch" to the same server. We also
247 * switch if the handshake has reached the attribute negotiate step whatever
248 * the number of handshake retries might be.
250 static uint_t vdc_hshake_retries
= VDC_HSHAKE_RETRIES
;
253 * If the handshake done during the attach fails then the two following
254 * variables will also be used to control the number of retries for the
255 * next handshakes. In that case, when a handshake is done after the
256 * attach (i.e. the vdc lifecycle is VDC_ONLINE_PENDING) then the handshake
257 * will be retried until we have done an attribute negotiation with each
258 * server, with a specified minimum total number of negotiations (the value
259 * of the vdc_hattr_min_initial or vdc_hattr_min variable).
261 * This prevents new I/Os on a newly used vdisk from blocking forever if the
262 * attribute negotiations cannot be done, and limits the amount of time
263 * before I/Os will fail. Basically, attribute negotiations will fail when
264 * the service is up but the backend does not exist. In that case, vds will
265 * typically retry accessing the backend for 50 seconds. So I/Os will fail
266 * after the following amount of time:
268 * 50 seconds x max(number of servers, vdc->hattr_min)
270 * Once the handshake done during the attach has failed, the next
271 * handshake will use vdc_hattr_min_initial. This handshake will correspond to
272 * the very first I/O to the device. If this handshake also fails then
273 * vdc_hattr_min will be used for subsequent handshakes. We typically allow
274 * more retries for the first handshake (VDC_HATTR_MIN_INITIAL = 3) to give more
275 * time for the backend to become available (50s x VDC_HATTR_MIN_INITIAL = 150s)
276 * in case this is a critical vdisk (e.g. vdisk access during boot). Then we use
277 * a smaller value (VDC_HATTR_MIN = 1) to avoid waiting too long for each I/O.
279 static uint_t vdc_hattr_min_initial
= VDC_HATTR_MIN_INITIAL
;
280 static uint_t vdc_hattr_min
= VDC_HATTR_MIN
;
283 * Tunable variables to control how long vdc waits before timing out on
286 static int vdc_timeout
= 0; /* units: seconds */
287 static int vdc_ldcup_timeout
= 1; /* units: seconds */
/*
 * Lower and upper bounds for the LDC delay. The vdc_*_timeout_ldc values are
 * passed to drv_usectohz() in vdc_do_attach() and the results are stored in
 * the corresponding vdc_hz_*_ldc_delay variables.
 */
289 static uint64_t vdc_hz_min_ldc_delay
;
290 static uint64_t vdc_min_timeout_ldc
= 1 * MILLISEC
;
291 static uint64_t vdc_hz_max_ldc_delay
;
292 static uint64_t vdc_max_timeout_ldc
= 100 * MILLISEC
;
294 static uint64_t vdc_ldc_read_init_delay
= 1 * MILLISEC
;
295 static uint64_t vdc_ldc_read_max_delay
= 100 * MILLISEC
;
297 /* values for dumping - need to run in a tighter loop */
298 static uint64_t vdc_usec_timeout_dump
= 100 * MILLISEC
; /* 0.1s units: usec */
299 static int vdc_dump_retries
= 100;
301 static uint16_t vdc_scsi_timeout
= 60; /* 60s units: seconds */
303 static uint64_t vdc_ownership_delay
= 6 * MICROSEC
; /* 6s units: usec */
305 /* Count of the number of vdc instances attached */
306 static volatile uint32_t vdc_instance_count
= 0;
308 /* Tunable to log all SCSI errors */
309 static boolean_t vdc_scsi_log_error
= B_FALSE
;
311 /* Soft state pointer */
312 static void *vdc_state
;
315 * Controlling the verbosity of the error/debug messages
317 * vdc_msglevel - controls level of messages
318 * vdc_matchinst - 64-bit variable where each bit corresponds
319 * to the vdc instance the vdc_msglevel applies.
321 int vdc_msglevel
= 0x0;
322 uint64_t vdc_matchinst
= 0ull;
325 * Supported vDisk protocol version pairs.
327 * The first array entry is the latest and preferred version.
329 static const vio_ver_t vdc_version
[] = {{1, 1}};
331 static struct cb_ops vdc_cb_ops
= {
332 vdc_open
, /* cb_open */
333 vdc_close
, /* cb_close */
334 vdc_strategy
, /* cb_strategy */
335 vdc_print
, /* cb_print */
336 vdc_dump
, /* cb_dump */
337 vdc_read
, /* cb_read */
338 vdc_write
, /* cb_write */
339 vdc_ioctl
, /* cb_ioctl */
340 nodev
, /* cb_devmap */
342 nodev
, /* cb_segmap */
343 nochpoll
, /* cb_chpoll */
344 vdc_prop_op
, /* cb_prop_op */
346 D_MP
| D_64BIT
, /* cb_flag */
348 vdc_aread
, /* cb_aread */
349 vdc_awrite
/* cb_awrite */
352 static struct dev_ops vdc_ops
= {
353 DEVO_REV
, /* devo_rev */
355 vdc_getinfo
, /* devo_getinfo */
356 nulldev
, /* devo_identify */
357 nulldev
, /* devo_probe */
358 vdc_attach
, /* devo_attach */
359 vdc_detach
, /* devo_detach */
360 nodev
, /* devo_reset */
361 &vdc_cb_ops
, /* devo_cb_ops */
362 NULL
, /* devo_bus_ops */
363 nulldev
, /* devo_power */
364 ddi_quiesce_not_needed
, /* devo_quiesce */
367 static struct modldrv modldrv
= {
369 "virtual disk client",
373 static struct modlinkage modlinkage
= {
379 /* -------------------------------------------------------------------------- */
382 * Device Driver housekeeping and setup
390 if ((status
= ddi_soft_state_init(&vdc_state
, sizeof (vdc_t
), 1)) != 0)
392 if ((status
= mod_install(&modlinkage
)) != 0)
393 ddi_soft_state_fini(&vdc_state
);
398 _info(struct modinfo
*modinfop
)
400 return (mod_info(&modlinkage
, modinfop
));
408 if ((status
= mod_remove(&modlinkage
)) != 0)
410 ddi_soft_state_fini(&vdc_state
);
415 vdc_getinfo(dev_info_t
*dip
, ddi_info_cmd_t cmd
, void *arg
, void **resultp
)
417 _NOTE(ARGUNUSED(dip
))
419 int instance
= VDCUNIT((dev_t
)arg
);
423 case DDI_INFO_DEVT2DEVINFO
:
424 if ((vdc
= ddi_get_soft_state(vdc_state
, instance
)) == NULL
) {
426 return (DDI_FAILURE
);
429 return (DDI_SUCCESS
);
430 case DDI_INFO_DEVT2INSTANCE
:
431 *resultp
= (void *)(uintptr_t)instance
;
432 return (DDI_SUCCESS
);
435 return (DDI_FAILURE
);
440 vdc_detach(dev_info_t
*dip
, ddi_detach_cmd_t cmd
)
442 kt_did_t eio_tid
, ownership_tid
;
450 /* the real work happens below */
453 /* nothing to do for this non-device */
454 return (DDI_SUCCESS
);
456 return (DDI_FAILURE
);
459 ASSERT(cmd
== DDI_DETACH
);
460 instance
= ddi_get_instance(dip
);
461 DMSGX(1, "[%d] Entered\n", instance
);
463 if ((vdc
= ddi_get_soft_state(vdc_state
, instance
)) == NULL
) {
464 cmn_err(CE_NOTE
, "[%d] Couldn't get state structure", instance
);
465 return (DDI_FAILURE
);
468 if (vdc_is_opened(vdc
)) {
469 DMSG(vdc
, 0, "[%d] Cannot detach: device is open", instance
);
470 return (DDI_FAILURE
);
473 if (vdc
->dkio_flush_pending
) {
475 "[%d] Cannot detach: %d outstanding DKIO flushes\n",
476 instance
, vdc
->dkio_flush_pending
);
477 return (DDI_FAILURE
);
480 if (vdc
->validate_pending
) {
482 "[%d] Cannot detach: %d outstanding validate request\n",
483 instance
, vdc
->validate_pending
);
484 return (DDI_FAILURE
);
487 DMSG(vdc
, 0, "[%d] proceeding...\n", instance
);
489 /* If we took ownership, release ownership */
490 mutex_enter(&vdc
->ownership_lock
);
491 if (vdc
->ownership
& VDC_OWNERSHIP_GRANTED
) {
492 rv
= vdc_access_set(vdc
, VD_ACCESS_SET_CLEAR
);
494 vdc_ownership_update(vdc
, VDC_OWNERSHIP_NONE
);
497 mutex_exit(&vdc
->ownership_lock
);
499 /* mark instance as detaching */
500 mutex_enter(&vdc
->lock
);
501 vdc
->lifecycle
= VDC_LC_DETACHING
;
502 mutex_exit(&vdc
->lock
);
505 * Try and disable callbacks to prevent another handshake. We have to
506 * disable callbacks for all servers.
508 for (srvr
= vdc
->server_list
; srvr
!= NULL
; srvr
= srvr
->next
) {
509 rv
= ldc_set_cb_mode(srvr
->ldc_handle
, LDC_CB_DISABLE
);
510 DMSG(vdc
, 0, "callback disabled (ldc=%lu, rv=%d)\n",
514 if (vdc
->initialized
& VDC_THREAD
) {
515 mutex_enter(&vdc
->read_lock
);
516 if ((vdc
->read_state
== VDC_READ_WAITING
) ||
517 (vdc
->read_state
== VDC_READ_RESET
)) {
518 vdc
->read_state
= VDC_READ_RESET
;
519 cv_signal(&vdc
->read_cv
);
522 mutex_exit(&vdc
->read_lock
);
524 /* wake up any thread waiting for connection to come online */
525 mutex_enter(&vdc
->lock
);
526 if (vdc
->state
== VDC_STATE_INIT_WAITING
) {
528 "[%d] write reset - move to resetting state...\n",
530 vdc
->state
= VDC_STATE_RESETTING
;
531 cv_signal(&vdc
->initwait_cv
);
532 } else if (vdc
->state
== VDC_STATE_FAILED
) {
533 vdc
->io_pending
= B_TRUE
;
534 cv_signal(&vdc
->io_pending_cv
);
536 mutex_exit(&vdc
->lock
);
538 /* now wait until state transitions to VDC_STATE_DETACH */
539 thread_join(vdc
->msg_proc_thr
->t_did
);
540 ASSERT(vdc
->state
== VDC_STATE_DETACH
);
541 DMSG(vdc
, 0, "[%d] Reset thread exit and join ..\n",
545 mutex_enter(&vdc
->lock
);
547 if (vdc
->initialized
& VDC_DRING
)
548 vdc_destroy_descriptor_ring(vdc
);
552 if (vdc
->eio_thread
) {
553 eio_tid
= vdc
->eio_thread
->t_did
;
554 vdc
->failfast_interval
= 0;
555 ASSERT(vdc
->num_servers
== 0);
556 cv_signal(&vdc
->eio_cv
);
561 if (vdc
->ownership
& VDC_OWNERSHIP_WANTED
) {
562 ownership_tid
= vdc
->ownership_thread
->t_did
;
563 vdc
->ownership
= VDC_OWNERSHIP_NONE
;
564 cv_signal(&vdc
->ownership_cv
);
569 mutex_exit(&vdc
->lock
);
572 thread_join(eio_tid
);
574 if (ownership_tid
!= 0)
575 thread_join(ownership_tid
);
577 if (vdc
->initialized
& VDC_MINOR
)
578 ddi_remove_minor_node(dip
, NULL
);
581 kstat_delete(vdc
->io_stats
);
582 vdc
->io_stats
= NULL
;
585 if (vdc
->err_stats
) {
586 kstat_delete(vdc
->err_stats
);
587 vdc
->err_stats
= NULL
;
590 if (vdc
->initialized
& VDC_LOCKS
) {
591 mutex_destroy(&vdc
->lock
);
592 mutex_destroy(&vdc
->read_lock
);
593 mutex_destroy(&vdc
->ownership_lock
);
594 cv_destroy(&vdc
->initwait_cv
);
595 cv_destroy(&vdc
->dring_free_cv
);
596 cv_destroy(&vdc
->membind_cv
);
597 cv_destroy(&vdc
->sync_blocked_cv
);
598 cv_destroy(&vdc
->read_cv
);
599 cv_destroy(&vdc
->running_cv
);
600 cv_destroy(&vdc
->io_pending_cv
);
601 cv_destroy(&vdc
->ownership_cv
);
602 cv_destroy(&vdc
->eio_cv
);
606 kmem_free(vdc
->minfo
, sizeof (struct dk_minfo
));
609 kmem_free(vdc
->cinfo
, sizeof (struct dk_cinfo
));
612 kmem_free(vdc
->vtoc
, sizeof (struct extvtoc
));
615 kmem_free(vdc
->geom
, sizeof (struct dk_geom
));
618 ddi_devid_unregister(dip
);
619 ddi_devid_free(vdc
->devid
);
622 if (vdc
->initialized
& VDC_SOFT_STATE
)
623 ddi_soft_state_free(vdc_state
, instance
);
625 DMSG(vdc
, 0, "[%d] End %p\n", instance
, (void *)vdc
);
627 return (DDI_SUCCESS
);
632 vdc_do_attach(dev_info_t
*dip
)
638 mde_cookie_t vd_node
;
642 instance
= ddi_get_instance(dip
);
643 if (ddi_soft_state_zalloc(vdc_state
, instance
) != DDI_SUCCESS
) {
644 cmn_err(CE_NOTE
, "[%d] Couldn't alloc state structure",
646 return (DDI_FAILURE
);
649 if ((vdc
= ddi_get_soft_state(vdc_state
, instance
)) == NULL
) {
650 cmn_err(CE_NOTE
, "[%d] Couldn't get state structure", instance
);
651 return (DDI_FAILURE
);
655 * We assign the value to initialized in this case to zero out the
656 * variable and then set bits in it to indicate what has been done
658 vdc
->initialized
= VDC_SOFT_STATE
;
660 vdc_hz_min_ldc_delay
= drv_usectohz(vdc_min_timeout_ldc
);
661 vdc_hz_max_ldc_delay
= drv_usectohz(vdc_max_timeout_ldc
);
664 vdc
->instance
= instance
;
665 vdc
->vdisk_type
= VD_DISK_TYPE_UNK
;
666 vdc
->vdisk_label
= VD_DISK_LABEL_UNK
;
667 vdc
->state
= VDC_STATE_INIT
;
668 vdc
->lifecycle
= VDC_LC_ATTACHING
;
670 vdc
->vdisk_bsize
= DEV_BSIZE
;
673 vdc
->max_xfer_sz
= maxphys
/ vdc
->vdisk_bsize
;
676 * We assume, for now, that the vDisk server will export 'read'
677 * operations to us at a minimum (this is needed because of checks
678 * in vdc for supported operations early in the handshake process).
679 * The vDisk server will return ENOTSUP if this is not the case.
680 * The value will be overwritten during the attribute exchange with
681 * the bitmask of operations exported by server.
683 vdc
->operations
= VD_OP_MASK_READ
;
690 mutex_init(&vdc
->lock
, NULL
, MUTEX_DRIVER
, NULL
);
691 cv_init(&vdc
->initwait_cv
, NULL
, CV_DRIVER
, NULL
);
692 cv_init(&vdc
->dring_free_cv
, NULL
, CV_DRIVER
, NULL
);
693 cv_init(&vdc
->membind_cv
, NULL
, CV_DRIVER
, NULL
);
694 cv_init(&vdc
->running_cv
, NULL
, CV_DRIVER
, NULL
);
695 cv_init(&vdc
->io_pending_cv
, NULL
, CV_DRIVER
, NULL
);
697 vdc
->io_pending
= B_FALSE
;
698 vdc
->threads_pending
= 0;
699 vdc
->sync_op_blocked
= B_FALSE
;
700 cv_init(&vdc
->sync_blocked_cv
, NULL
, CV_DRIVER
, NULL
);
702 mutex_init(&vdc
->ownership_lock
, NULL
, MUTEX_DRIVER
, NULL
);
703 cv_init(&vdc
->ownership_cv
, NULL
, CV_DRIVER
, NULL
);
704 cv_init(&vdc
->eio_cv
, NULL
, CV_DRIVER
, NULL
);
706 /* init blocking msg read functionality */
707 mutex_init(&vdc
->read_lock
, NULL
, MUTEX_DRIVER
, NULL
);
708 cv_init(&vdc
->read_cv
, NULL
, CV_DRIVER
, NULL
);
709 vdc
->read_state
= VDC_READ_IDLE
;
711 vdc
->initialized
|= VDC_LOCKS
;
713 /* get device and port MD node for this disk instance */
714 if (vdc_get_md_node(dip
, &mdp
, &vd_node
) != 0) {
715 cmn_err(CE_NOTE
, "[%d] Could not get machine description node",
717 return (DDI_FAILURE
);
720 if (vdc_init_ports(vdc
, mdp
, vd_node
) != 0) {
721 cmn_err(CE_NOTE
, "[%d] Error initialising ports", instance
);
722 return (DDI_FAILURE
);
725 (void) md_fini_handle(mdp
);
727 /* Create the kstats for saving the I/O statistics used by iostat(1M) */
728 vdc_create_io_kstats(vdc
);
729 vdc_create_err_kstats(vdc
);
731 /* Initialize remaining structures before starting the msg thread */
732 vdc
->vdisk_label
= VD_DISK_LABEL_UNK
;
733 vdc
->vtoc
= kmem_zalloc(sizeof (struct extvtoc
), KM_SLEEP
);
734 vdc
->geom
= kmem_zalloc(sizeof (struct dk_geom
), KM_SLEEP
);
735 vdc
->minfo
= kmem_zalloc(sizeof (struct dk_minfo
), KM_SLEEP
);
737 /* initialize the thread responsible for managing state with server */
738 vdc
->msg_proc_thr
= thread_create(NULL
, 0, vdc_process_msg_thread
,
739 vdc
, 0, &p0
, TS_RUN
, minclsyspri
);
740 if (vdc
->msg_proc_thr
== NULL
) {
741 cmn_err(CE_NOTE
, "[%d] Failed to create msg processing thread",
743 return (DDI_FAILURE
);
747 * If there are multiple servers then start the eio thread.
749 if (vdc
->num_servers
> 1) {
750 vdc
->eio_thread
= thread_create(NULL
, 0, vdc_eio_thread
, vdc
, 0,
751 &p0
, TS_RUN
, v
.v_maxsyspri
- 2);
752 if (vdc
->eio_thread
== NULL
) {
753 cmn_err(CE_NOTE
, "[%d] Failed to create error "
754 "I/O thread", instance
);
755 return (DDI_FAILURE
);
759 vdc
->initialized
|= VDC_THREAD
;
761 atomic_inc_32(&vdc_instance_count
);
764 * Check the disk label. This will send requests and do the handshake.
765 * We don't really care about the disk label now. What we really need is
766 * the handshake to be done so that we know the type of the disk (slice
767 * or full disk) and the appropriate device nodes can be created.
770 mutex_enter(&vdc
->lock
);
771 (void) vdc_validate_geometry(vdc
);
772 mutex_exit(&vdc
->lock
);
775 * Now that we have the device info we can create the device nodes
777 status
= vdc_create_device_nodes(vdc
);
779 DMSG(vdc
, 0, "[%d] Failed to create device nodes",
785 * Fill in the fields of the error statistics kstat that were not
786 * available when creating the kstat
788 vdc_set_err_kstats(vdc
);
790 ASSERT(vdc
->lifecycle
== VDC_LC_ONLINE
||
791 vdc
->lifecycle
== VDC_LC_ONLINE_PENDING
);
792 DMSG(vdc
, 0, "[%d] Attach tasks successful\n", instance
);
795 DMSG(vdc
, 0, "[%d] Attach completed\n", instance
);
800 vdc_attach(dev_info_t
*dip
, ddi_attach_cmd_t cmd
)
806 if ((status
= vdc_do_attach(dip
)) != 0)
807 (void) vdc_detach(dip
, DDI_DETACH
);
810 /* nothing to do for this non-device */
811 return (DDI_SUCCESS
);
813 return (DDI_FAILURE
);
818 vdc_do_ldc_init(vdc_t
*vdc
, vdc_server_t
*srvr
)
821 ldc_status_t ldc_state
;
825 ASSERT(srvr
!= NULL
);
827 ldc_attr
.devclass
= LDC_DEV_BLK
;
828 ldc_attr
.instance
= vdc
->instance
;
829 ldc_attr
.mode
= LDC_MODE_UNRELIABLE
; /* unreliable transport */
830 ldc_attr
.mtu
= VD_LDC_MTU
;
832 if ((srvr
->state
& VDC_LDC_INIT
) == 0) {
833 status
= ldc_init(srvr
->ldc_id
, &ldc_attr
,
836 DMSG(vdc
, 0, "[%d] ldc_init(chan %ld) returned %d",
837 vdc
->instance
, srvr
->ldc_id
, status
);
840 srvr
->state
|= VDC_LDC_INIT
;
842 status
= ldc_status(srvr
->ldc_handle
, &ldc_state
);
844 DMSG(vdc
, 0, "[%d] Cannot discover LDC status [err=%d]",
845 vdc
->instance
, status
);
848 srvr
->ldc_state
= ldc_state
;
850 if ((srvr
->state
& VDC_LDC_CB
) == 0) {
851 status
= ldc_reg_callback(srvr
->ldc_handle
, vdc_handle_cb
,
854 DMSG(vdc
, 0, "[%d] LDC callback reg. failed (%d)",
855 vdc
->instance
, status
);
858 srvr
->state
|= VDC_LDC_CB
;
862 * At this stage we have initialised LDC, we will now try and open
865 if (srvr
->ldc_state
== LDC_INIT
) {
866 status
= ldc_open(srvr
->ldc_handle
);
868 DMSG(vdc
, 0, "[%d] ldc_open(chan %ld) returned %d",
869 vdc
->instance
, srvr
->ldc_id
, status
);
872 srvr
->state
|= VDC_LDC_OPEN
;
877 vdc_terminate_ldc(vdc
, srvr
);
884 vdc_start_ldc_connection(vdc_t
*vdc
)
890 ASSERT(MUTEX_HELD(&vdc
->lock
));
892 status
= vdc_do_ldc_up(vdc
);
894 DMSG(vdc
, 0, "[%d] Finished bringing up LDC\n", vdc
->instance
);
900 vdc_stop_ldc_connection(vdc_t
*vdcp
)
904 ASSERT(vdcp
!= NULL
);
906 ASSERT(MUTEX_HELD(&vdcp
->lock
));
908 DMSG(vdcp
, 0, ": Resetting connection to vDisk server : state %d\n",
911 status
= ldc_down(vdcp
->curr_server
->ldc_handle
);
912 DMSG(vdcp
, 0, "ldc_down() = %d\n", status
);
914 vdcp
->initialized
&= ~VDC_HANDSHAKE
;
915 DMSG(vdcp
, 0, "initialized=%x\n", vdcp
->initialized
);
921 vdc_create_io_kstats(vdc_t
*vdc
)
923 if (vdc
->io_stats
!= NULL
) {
924 DMSG(vdc
, 0, "[%d] I/O kstat already exists\n", vdc
->instance
);
928 vdc
->io_stats
= kstat_create(VDC_DRIVER_NAME
, vdc
->instance
, NULL
,
929 "disk", KSTAT_TYPE_IO
, 1, KSTAT_FLAG_PERSISTENT
);
930 if (vdc
->io_stats
!= NULL
) {
931 vdc
->io_stats
->ks_lock
= &vdc
->lock
;
932 kstat_install(vdc
->io_stats
);
934 cmn_err(CE_NOTE
, "[%d] Failed to create kstat: I/O statistics"
935 " will not be gathered", vdc
->instance
);
940 vdc_create_err_kstats(vdc_t
*vdc
)
943 char kstatmodule_err
[KSTAT_STRLEN
];
944 char kstatname
[KSTAT_STRLEN
];
945 int ndata
= (sizeof (vd_err_stats_t
) / sizeof (kstat_named_t
));
946 int instance
= vdc
->instance
;
948 if (vdc
->err_stats
!= NULL
) {
949 DMSG(vdc
, 0, "[%d] ERR kstat already exists\n", vdc
->instance
);
953 (void) snprintf(kstatmodule_err
, sizeof (kstatmodule_err
),
954 "%serr", VDC_DRIVER_NAME
);
955 (void) snprintf(kstatname
, sizeof (kstatname
),
956 "%s%d,err", VDC_DRIVER_NAME
, instance
);
958 vdc
->err_stats
= kstat_create(kstatmodule_err
, instance
, kstatname
,
959 "device_error", KSTAT_TYPE_NAMED
, ndata
, KSTAT_FLAG_PERSISTENT
);
961 if (vdc
->err_stats
== NULL
) {
962 cmn_err(CE_NOTE
, "[%d] Failed to create kstat: Error statistics"
963 " will not be gathered", instance
);
967 stp
= (vd_err_stats_t
*)vdc
->err_stats
->ks_data
;
968 kstat_named_init(&stp
->vd_softerrs
, "Soft Errors",
970 kstat_named_init(&stp
->vd_transerrs
, "Transport Errors",
972 kstat_named_init(&stp
->vd_protoerrs
, "Protocol Errors",
974 kstat_named_init(&stp
->vd_vid
, "Vendor",
976 kstat_named_init(&stp
->vd_pid
, "Product",
978 kstat_named_init(&stp
->vd_capacity
, "Size",
979 KSTAT_DATA_ULONGLONG
);
981 vdc
->err_stats
->ks_update
= nulldev
;
983 kstat_install(vdc
->err_stats
);
987 vdc_set_err_kstats(vdc_t
*vdc
)
991 if (vdc
->err_stats
== NULL
)
994 mutex_enter(&vdc
->lock
);
996 stp
= (vd_err_stats_t
*)vdc
->err_stats
->ks_data
;
999 stp
->vd_capacity
.value
.ui64
= vdc
->vdisk_size
* vdc
->vdisk_bsize
;
1000 (void) strcpy(stp
->vd_vid
.value
.c
, "SUN");
1001 (void) strcpy(stp
->vd_pid
.value
.c
, "VDSK");
1003 mutex_exit(&vdc
->lock
);
1007 vdc_create_device_nodes_efi(vdc_t
*vdc
)
1009 ddi_remove_minor_node(vdc
->dip
, "h");
1010 ddi_remove_minor_node(vdc
->dip
, "h,raw");
1012 if (ddi_create_minor_node(vdc
->dip
, "wd", S_IFBLK
,
1013 VD_MAKE_DEV(vdc
->instance
, VD_EFI_WD_SLICE
),
1014 DDI_NT_BLOCK
, 0) != DDI_SUCCESS
) {
1015 cmn_err(CE_NOTE
, "[%d] Couldn't add block node 'wd'",
1020 /* if any device node is created we set this flag */
1021 vdc
->initialized
|= VDC_MINOR
;
1023 if (ddi_create_minor_node(vdc
->dip
, "wd,raw", S_IFCHR
,
1024 VD_MAKE_DEV(vdc
->instance
, VD_EFI_WD_SLICE
),
1025 DDI_NT_BLOCK
, 0) != DDI_SUCCESS
) {
1026 cmn_err(CE_NOTE
, "[%d] Couldn't add block node 'wd,raw'",
1035 vdc_create_device_nodes_vtoc(vdc_t
*vdc
)
1037 ddi_remove_minor_node(vdc
->dip
, "wd");
1038 ddi_remove_minor_node(vdc
->dip
, "wd,raw");
1040 if (ddi_create_minor_node(vdc
->dip
, "h", S_IFBLK
,
1041 VD_MAKE_DEV(vdc
->instance
, VD_EFI_WD_SLICE
),
1042 DDI_NT_BLOCK
, 0) != DDI_SUCCESS
) {
1043 cmn_err(CE_NOTE
, "[%d] Couldn't add block node 'h'",
1048 /* if any device node is created we set this flag */
1049 vdc
->initialized
|= VDC_MINOR
;
1051 if (ddi_create_minor_node(vdc
->dip
, "h,raw", S_IFCHR
,
1052 VD_MAKE_DEV(vdc
->instance
, VD_EFI_WD_SLICE
),
1053 DDI_NT_BLOCK
, 0) != DDI_SUCCESS
) {
1054 cmn_err(CE_NOTE
, "[%d] Couldn't add block node 'h,raw'",
1064 * vdc_create_device_nodes
1067 * This function creates the block and character device nodes under
1068 * /devices. It is called as part of the attach(9E) of the instance
1069 * during the handshake with vds after vds has sent the attributes
1072 * If the device is of type VD_DISK_TYPE_SLICE then the minor node
1073 * of 2 is used in keeping with the Solaris convention that slice 2
1074 * refers to a whole disk. Slices start at 'a'
1077 * vdc - soft state pointer
1081 * EIO - Failed to create node
1084 vdc_create_device_nodes(vdc_t
*vdc
)
1086 char name
[sizeof ("s,raw")];
1087 dev_info_t
*dip
= NULL
;
1088 int instance
, status
;
1092 ASSERT(vdc
!= NULL
);
1094 instance
= vdc
->instance
;
1097 switch (vdc
->vdisk_type
) {
1098 case VD_DISK_TYPE_DISK
:
1099 case VD_DISK_TYPE_UNK
:
1100 num_slices
= V_NUMPAR
;
1102 case VD_DISK_TYPE_SLICE
:
1110 * Minor nodes are different for EFI disks: EFI disks do not have
1111 * a minor node 'h' for the minor number corresponding to slice
1112 * VD_EFI_WD_SLICE (slice 7); instead they have a minor node 'wd'
1113 * representing the whole disk.
1115 for (i
= 0; i
< num_slices
; i
++) {
1117 if (i
== VD_EFI_WD_SLICE
) {
1118 if (vdc
->vdisk_label
== VD_DISK_LABEL_EFI
)
1119 status
= vdc_create_device_nodes_efi(vdc
);
1121 status
= vdc_create_device_nodes_vtoc(vdc
);
1127 (void) snprintf(name
, sizeof (name
), "%c", 'a' + i
);
1128 if (ddi_create_minor_node(dip
, name
, S_IFBLK
,
1129 VD_MAKE_DEV(instance
, i
), DDI_NT_BLOCK
, 0) != DDI_SUCCESS
) {
1130 cmn_err(CE_NOTE
, "[%d] Couldn't add block node '%s'",
1135 /* if any device node is created we set this flag */
1136 vdc
->initialized
|= VDC_MINOR
;
1138 (void) snprintf(name
, sizeof (name
), "%c%s", 'a' + i
, ",raw");
1140 if (ddi_create_minor_node(dip
, name
, S_IFCHR
,
1141 VD_MAKE_DEV(instance
, i
), DDI_NT_BLOCK
, 0) != DDI_SUCCESS
) {
1142 cmn_err(CE_NOTE
, "[%d] Couldn't add raw node '%s'",
1152 * Driver prop_op(9e) entry point function. Return the number of blocks for
1153 * the partition in question or forward the request to the property facilities.
1156 vdc_prop_op(dev_t dev
, dev_info_t
*dip
, ddi_prop_op_t prop_op
, int mod_flags
,
1157 char *name
, caddr_t valuep
, int *lengthp
)
1159 int instance
= ddi_get_instance(dip
);
1164 vdc
= ddi_get_soft_state(vdc_state
, instance
);
1166 if (dev
== DDI_DEV_T_ANY
|| vdc
== NULL
) {
1167 return (ddi_prop_op(dev
, dip
, prop_op
, mod_flags
,
1168 name
, valuep
, lengthp
));
1171 mutex_enter(&vdc
->lock
);
1172 (void) vdc_validate_geometry(vdc
);
1173 if (vdc
->vdisk_label
== VD_DISK_LABEL_UNK
) {
1174 mutex_exit(&vdc
->lock
);
1175 return (ddi_prop_op(dev
, dip
, prop_op
, mod_flags
,
1176 name
, valuep
, lengthp
));
1178 nblocks
= vdc
->slice
[VDCPART(dev
)].nblocks
;
1179 blksize
= vdc
->vdisk_bsize
;
1180 mutex_exit(&vdc
->lock
);
1182 return (ddi_prop_op_nblocks_blksize(dev
, dip
, prop_op
, mod_flags
,
1183 name
, valuep
, lengthp
, nblocks
, blksize
));
1191 * This function checks if any slice of a given virtual disk is
1195 * vdc - soft state pointer
1198 * B_TRUE - at least one slice is opened.
1199 * B_FALSE - no slice is opened.
1202 vdc_is_opened(vdc_t
*vdc
)
1206 /* check if there's any layered open */
1207 for (i
= 0; i
< V_NUMPAR
; i
++) {
1208 if (vdc
->open_lyr
[i
] > 0)
1212 /* check if there is any other kind of open */
1213 for (i
= 0; i
< OTYPCNT
; i
++) {
1214 if (vdc
->open
[i
] != 0)
1222 vdc_mark_opened(vdc_t
*vdc
, int slice
, int flag
, int otyp
)
1227 ASSERT(otyp
< OTYPCNT
);
1228 ASSERT(slice
< V_NUMPAR
);
1229 ASSERT(MUTEX_HELD(&vdc
->lock
));
1231 slicemask
= 1 << slice
;
1234 * If we have a single-slice disk which was unavailable during the
1235 * attach then a device was created for each 8 slices. Now that
1236 * the type is known, we prevent opening any slice other than 0
1237 * even if a device still exists.
1239 if (vdc
->vdisk_type
== VD_DISK_TYPE_SLICE
&& slice
!= 0)
1242 /* check if slice is already exclusively opened */
1243 if (vdc
->open_excl
& slicemask
)
1246 /* if open exclusive, check if slice is already opened */
1248 if (vdc
->open_lyr
[slice
] > 0)
1250 for (i
= 0; i
< OTYPCNT
; i
++) {
1251 if (vdc
->open
[i
] & slicemask
)
1254 vdc
->open_excl
|= slicemask
;
1257 /* mark slice as opened */
1258 if (otyp
== OTYP_LYR
) {
1259 vdc
->open_lyr
[slice
]++;
1261 vdc
->open
[otyp
] |= slicemask
;
1268 vdc_mark_closed(vdc_t
*vdc
, int slice
, int flag
, int otyp
)
1272 ASSERT(otyp
< OTYPCNT
);
1273 ASSERT(slice
< V_NUMPAR
);
1274 ASSERT(MUTEX_HELD(&vdc
->lock
));
1276 slicemask
= 1 << slice
;
1278 if (otyp
== OTYP_LYR
) {
1279 ASSERT(vdc
->open_lyr
[slice
] > 0);
1280 vdc
->open_lyr
[slice
]--;
1282 vdc
->open
[otyp
] &= ~slicemask
;
1286 vdc
->open_excl
&= ~slicemask
;
1290 vdc_open(dev_t
*dev
, int flag
, int otyp
, cred_t
*cred
)
1292 _NOTE(ARGUNUSED(cred
))
1294 int instance
, nodelay
;
1295 int slice
, status
= 0;
1298 ASSERT(dev
!= NULL
);
1299 instance
= VDCUNIT(*dev
);
1301 if (otyp
>= OTYPCNT
)
1304 if ((vdc
= ddi_get_soft_state(vdc_state
, instance
)) == NULL
) {
1305 cmn_err(CE_NOTE
, "[%d] Couldn't get state structure", instance
);
1309 DMSG(vdc
, 0, "minor = %d flag = %x, otyp = %x\n",
1310 getminor(*dev
), flag
, otyp
);
1312 slice
= VDCPART(*dev
);
1314 nodelay
= flag
& (FNDELAY
| FNONBLOCK
);
1316 if ((flag
& FWRITE
) && (!nodelay
) &&
1317 !(VD_OP_SUPPORTED(vdc
->operations
, VD_OP_BWRITE
))) {
1321 mutex_enter(&vdc
->lock
);
1323 status
= vdc_mark_opened(vdc
, slice
, flag
, otyp
);
1326 mutex_exit(&vdc
->lock
);
1331 * If the disk type is unknown then we have to wait for the
1332 * handshake to complete because we don't know if the slice
1333 * device we are opening effectively exists.
1335 if (vdc
->vdisk_type
!= VD_DISK_TYPE_UNK
&& nodelay
) {
1337 /* don't resubmit a validate request if there's already one */
1338 if (vdc
->validate_pending
> 0) {
1339 mutex_exit(&vdc
->lock
);
1343 /* call vdc_validate() asynchronously to avoid blocking */
1344 if (taskq_dispatch(system_taskq
, vdc_validate_task
,
1345 (void *)vdc
, TQ_NOSLEEP
) == NULL
) {
1346 vdc_mark_closed(vdc
, slice
, flag
, otyp
);
1347 mutex_exit(&vdc
->lock
);
1351 vdc
->validate_pending
++;
1352 mutex_exit(&vdc
->lock
);
1356 mutex_exit(&vdc
->lock
);
1360 mutex_enter(&vdc
->lock
);
1362 if (vdc
->vdisk_type
== VD_DISK_TYPE_UNK
||
1363 (vdc
->vdisk_type
== VD_DISK_TYPE_SLICE
&& slice
!= 0) ||
1364 (!nodelay
&& (vdc
->vdisk_label
== VD_DISK_LABEL_UNK
||
1365 vdc
->slice
[slice
].nblocks
== 0))) {
1366 vdc_mark_closed(vdc
, slice
, flag
, otyp
);
1370 mutex_exit(&vdc
->lock
);
1376 vdc_close(dev_t dev
, int flag
, int otyp
, cred_t
*cred
)
1378 _NOTE(ARGUNUSED(cred
))
1385 instance
= VDCUNIT(dev
);
1387 if (otyp
>= OTYPCNT
)
1390 if ((vdc
= ddi_get_soft_state(vdc_state
, instance
)) == NULL
) {
1391 cmn_err(CE_NOTE
, "[%d] Couldn't get state structure", instance
);
1395 DMSG(vdc
, 0, "[%d] flag = %x, otyp = %x\n", instance
, flag
, otyp
);
1397 slice
= VDCPART(dev
);
1400 * Attempt to flush the W$ on a close operation. If this is
1401 * not a supported IOCTL command or the backing device is read-only
1402 * do not fail the close operation.
1404 rv
= vd_process_ioctl(dev
, DKIOCFLUSHWRITECACHE
, NULL
, FKIOCTL
, &rval
);
1406 if (rv
!= 0 && rv
!= ENOTSUP
&& rv
!= ENOTTY
&& rv
!= EROFS
) {
1407 DMSG(vdc
, 0, "[%d] flush failed with error %d on close\n",
1412 mutex_enter(&vdc
->lock
);
1413 vdc_mark_closed(vdc
, slice
, flag
, otyp
);
1414 mutex_exit(&vdc
->lock
);
1420 vdc_ioctl(dev_t dev
, int cmd
, intptr_t arg
, int mode
, cred_t
*credp
, int *rvalp
)
1422 _NOTE(ARGUNUSED(credp
))
1424 return (vd_process_ioctl(dev
, cmd
, (caddr_t
)arg
, mode
, rvalp
));
1428 vdc_print(dev_t dev
, char *str
)
1430 cmn_err(CE_NOTE
, "vdc%d: %s", VDCUNIT(dev
), str
);
1435 vdc_dump(dev_t dev
, caddr_t addr
, daddr_t blkno
, int nblk
)
1438 size_t nbytes
= nblk
* DEV_BSIZE
;
1439 int instance
= VDCUNIT(dev
);
1441 diskaddr_t vio_blkno
;
1443 if ((vdc
= ddi_get_soft_state(vdc_state
, instance
)) == NULL
) {
1444 cmn_err(CE_NOTE
, "[%d] Couldn't get state structure", instance
);
1448 DMSG(vdc
, 2, "[%d] dump %ld bytes at block 0x%lx : addr=0x%p\n",
1449 instance
, nbytes
, blkno
, (void *)addr
);
1451 /* convert logical block to vio block */
1452 if ((blkno
& vdc
->vio_bmask
) != 0) {
1453 DMSG(vdc
, 0, "Misaligned block number (%lu)\n", blkno
);
1456 vio_blkno
= blkno
>> vdc
->vio_bshift
;
1459 * If we are panicking, we need the state to be "running" so that we
1460 * can submit I/Os, but we don't want to check for any backend error.
1462 flags
= (ddi_in_panic())? VDC_OP_STATE_RUNNING
: VDC_OP_NORMAL
;
1464 rv
= vdc_do_op(vdc
, VD_OP_BWRITE
, addr
, nbytes
, VDCPART(dev
),
1465 vio_blkno
, NULL
, VIO_write_dir
, flags
);
1468 DMSG(vdc
, 0, "Failed to do a disk dump (err=%d)\n", rv
);
1472 DMSG(vdc
, 0, "[%d] End\n", instance
);
1477 /* -------------------------------------------------------------------------- */
1480 * Disk access routines
1488 * 0: As per strategy(9E), the strategy() function must return 0
1489 * [ bioerror(9f) sets b_flags to the proper error code ]
1492 vdc_strategy(struct buf
*buf
)
1494 diskaddr_t vio_blkno
;
1496 int instance
= VDCUNIT(buf
->b_edev
);
1497 int op
= (buf
->b_flags
& B_READ
) ? VD_OP_BREAD
: VD_OP_BWRITE
;
1500 if ((vdc
= ddi_get_soft_state(vdc_state
, instance
)) == NULL
) {
1501 cmn_err(CE_NOTE
, "[%d] Couldn't get state structure", instance
);
1502 bioerror(buf
, ENXIO
);
1507 DMSG(vdc
, 2, "[%d] %s %ld bytes at block %llx : b_addr=0x%p\n",
1508 instance
, (buf
->b_flags
& B_READ
) ? "Read" : "Write",
1509 buf
->b_bcount
, buf
->b_lblkno
, (void *)buf
->b_un
.b_addr
);
1513 if ((long)buf
->b_private
== VD_SLICE_NONE
) {
1514 /* I/O using an absolute disk offset */
1515 slice
= VD_SLICE_NONE
;
1517 slice
= VDCPART(buf
->b_edev
);
1521 * In the buf structure, b_lblkno represents a logical block number
1522 * using a block size of 512 bytes. For the VIO request, this block
1523 * number has to be converted to be represented with the block size
1524 * used by the VIO protocol.
1526 if ((buf
->b_lblkno
& vdc
->vio_bmask
) != 0) {
1527 bioerror(buf
, EINVAL
);
1531 vio_blkno
= buf
->b_lblkno
>> vdc
->vio_bshift
;
1533 /* submit the I/O, any error will be reported in the buf structure */
1534 (void) vdc_do_op(vdc
, op
, (caddr_t
)buf
->b_un
.b_addr
,
1535 buf
->b_bcount
, slice
, vio_blkno
,
1536 buf
, (op
== VD_OP_BREAD
) ? VIO_read_dir
: VIO_write_dir
,
1547 * Routine to limit the size of a data transfer. Used in
1548 * conjunction with physio(9F).
1551 * bp - pointer to the indicated buf(9S) struct.
1555 vdc_min(struct buf
*bufp
)
1558 int instance
= VDCUNIT(bufp
->b_edev
);
1560 vdc
= ddi_get_soft_state(vdc_state
, instance
);
1561 VERIFY(vdc
!= NULL
);
1563 if (bufp
->b_bcount
> (vdc
->max_xfer_sz
* vdc
->vdisk_bsize
)) {
1564 bufp
->b_bcount
= vdc
->max_xfer_sz
* vdc
->vdisk_bsize
;
1569 vdc_read(dev_t dev
, struct uio
*uio
, cred_t
*cred
)
1571 _NOTE(ARGUNUSED(cred
))
1573 DMSGX(1, "[%d] Entered", VDCUNIT(dev
));
1574 return (physio(vdc_strategy
, NULL
, dev
, B_READ
, vdc_min
, uio
));
1578 vdc_write(dev_t dev
, struct uio
*uio
, cred_t
*cred
)
1580 _NOTE(ARGUNUSED(cred
))
1582 DMSGX(1, "[%d] Entered", VDCUNIT(dev
));
1583 return (physio(vdc_strategy
, NULL
, dev
, B_WRITE
, vdc_min
, uio
));
1587 vdc_aread(dev_t dev
, struct aio_req
*aio
, cred_t
*cred
)
1589 _NOTE(ARGUNUSED(cred
))
1591 DMSGX(1, "[%d] Entered", VDCUNIT(dev
));
1592 return (aphysio(vdc_strategy
, anocancel
, dev
, B_READ
, vdc_min
, aio
));
1596 vdc_awrite(dev_t dev
, struct aio_req
*aio
, cred_t
*cred
)
1598 _NOTE(ARGUNUSED(cred
))
1600 DMSGX(1, "[%d] Entered", VDCUNIT(dev
));
1601 return (aphysio(vdc_strategy
, anocancel
, dev
, B_WRITE
, vdc_min
, aio
));
1605 /* -------------------------------------------------------------------------- */
1614 * vdc_init_ver_negotiation()
1619 * vdc - soft state pointer for this instance of the device driver.
1625 vdc_init_ver_negotiation(vdc_t
*vdc
, vio_ver_t ver
)
1628 size_t msglen
= sizeof (pkt
);
1631 ASSERT(vdc
!= NULL
);
1632 ASSERT(mutex_owned(&vdc
->lock
));
1634 DMSG(vdc
, 0, "[%d] Entered.\n", vdc
->instance
);
1637 * set the Session ID to a unique value
1638 * (the lower 32 bits of the clock tick)
1640 vdc
->session_id
= ((uint32_t)gettick() & 0xffffffff);
1641 DMSG(vdc
, 0, "[%d] Set SID to 0x%lx\n", vdc
->instance
, vdc
->session_id
);
1643 pkt
.tag
.vio_msgtype
= VIO_TYPE_CTRL
;
1644 pkt
.tag
.vio_subtype
= VIO_SUBTYPE_INFO
;
1645 pkt
.tag
.vio_subtype_env
= VIO_VER_INFO
;
1646 pkt
.tag
.vio_sid
= vdc
->session_id
;
1647 pkt
.dev_class
= VDEV_DISK
;
1648 pkt
.ver_major
= ver
.major
;
1649 pkt
.ver_minor
= ver
.minor
;
1651 status
= vdc_send(vdc
, (caddr_t
)&pkt
, &msglen
);
1652 DMSG(vdc
, 0, "[%d] Ver info sent (status = %d)\n",
1653 vdc
->instance
, status
);
1654 if ((status
!= 0) || (msglen
!= sizeof (vio_ver_msg_t
))) {
1655 DMSG(vdc
, 0, "[%d] Failed to send Ver negotiation info: "
1656 "id(%lx) rv(%d) size(%ld)", vdc
->instance
,
1657 vdc
->curr_server
->ldc_handle
, status
, msglen
);
1658 if (msglen
!= sizeof (vio_ver_msg_t
))
1667 * vdc_ver_negotiation()
1672 * vdcp - soft state pointer for this instance of the device driver.
1678 vdc_ver_negotiation(vdc_t
*vdcp
)
1683 if (status
= vdc_init_ver_negotiation(vdcp
, vdc_version
[0]))
1686 /* release lock and wait for response */
1687 mutex_exit(&vdcp
->lock
);
1688 status
= vdc_wait_for_response(vdcp
, &vio_msg
);
1689 mutex_enter(&vdcp
->lock
);
1692 "[%d] Failed waiting for Ver negotiation response, rv(%d)",
1693 vdcp
->instance
, status
);
1697 /* check type and sub_type ... */
1698 if (vio_msg
.tag
.vio_msgtype
!= VIO_TYPE_CTRL
||
1699 vio_msg
.tag
.vio_subtype
== VIO_SUBTYPE_INFO
) {
1700 DMSG(vdcp
, 0, "[%d] Invalid ver negotiation response\n",
1705 return (vdc_handle_ver_msg(vdcp
, (vio_ver_msg_t
*)&vio_msg
));
1710 * vdc_init_attr_negotiation()
1715 * vdc - soft state pointer for this instance of the device driver.
1721 vdc_init_attr_negotiation(vdc_t
*vdc
)
1724 size_t msglen
= sizeof (pkt
);
1727 ASSERT(vdc
!= NULL
);
1728 ASSERT(mutex_owned(&vdc
->lock
));
1730 DMSG(vdc
, 0, "[%d] entered\n", vdc
->instance
);
1733 pkt
.tag
.vio_msgtype
= VIO_TYPE_CTRL
;
1734 pkt
.tag
.vio_subtype
= VIO_SUBTYPE_INFO
;
1735 pkt
.tag
.vio_subtype_env
= VIO_ATTR_INFO
;
1736 pkt
.tag
.vio_sid
= vdc
->session_id
;
1737 /* fill in payload */
1738 pkt
.max_xfer_sz
= vdc
->max_xfer_sz
;
1739 pkt
.vdisk_block_size
= vdc
->vdisk_bsize
;
1740 pkt
.xfer_mode
= VIO_DRING_MODE_V1_0
;
1741 pkt
.operations
= 0; /* server will set bits of valid operations */
1742 pkt
.vdisk_type
= 0; /* server will set to valid device type */
1743 pkt
.vdisk_media
= 0; /* server will set to valid media type */
1744 pkt
.vdisk_size
= 0; /* server will set to valid size */
1746 status
= vdc_send(vdc
, (caddr_t
)&pkt
, &msglen
);
1747 DMSG(vdc
, 0, "Attr info sent (status = %d)\n", status
);
1749 if ((status
!= 0) || (msglen
!= sizeof (vd_attr_msg_t
))) {
1750 DMSG(vdc
, 0, "[%d] Failed to send Attr negotiation info: "
1751 "id(%lx) rv(%d) size(%ld)", vdc
->instance
,
1752 vdc
->curr_server
->ldc_handle
, status
, msglen
);
1753 if (msglen
!= sizeof (vd_attr_msg_t
))
1762 * vdc_attr_negotiation()
1767 * vdc - soft state pointer for this instance of the device driver.
1773 vdc_attr_negotiation(vdc_t
*vdcp
)
1778 if (status
= vdc_init_attr_negotiation(vdcp
))
1781 /* release lock and wait for response */
1782 mutex_exit(&vdcp
->lock
);
1783 status
= vdc_wait_for_response(vdcp
, &vio_msg
);
1784 mutex_enter(&vdcp
->lock
);
1787 "[%d] Failed waiting for Attr negotiation response, rv(%d)",
1788 vdcp
->instance
, status
);
1792 /* check type and sub_type ... */
1793 if (vio_msg
.tag
.vio_msgtype
!= VIO_TYPE_CTRL
||
1794 vio_msg
.tag
.vio_subtype
== VIO_SUBTYPE_INFO
) {
1795 DMSG(vdcp
, 0, "[%d] Invalid attr negotiation response\n",
1800 return (vdc_handle_attr_msg(vdcp
, (vd_attr_msg_t
*)&vio_msg
));
1806 * vdc_init_dring_negotiate()
1811 * vdc - soft state pointer for this instance of the device driver.
1817 vdc_init_dring_negotiate(vdc_t
*vdc
)
1819 vio_dring_reg_msg_t pkt
;
1820 size_t msglen
= sizeof (pkt
);
1825 ASSERT(vdc
!= NULL
);
1826 ASSERT(mutex_owned(&vdc
->lock
));
1828 for (retry
= 0; retry
< nretries
; retry
++) {
1829 status
= vdc_init_descriptor_ring(vdc
);
1830 if (status
!= EAGAIN
)
1832 drv_usecwait(vdc_min_timeout_ldc
);
1836 DMSG(vdc
, 0, "[%d] Failed to init DRing (status = %d)\n",
1837 vdc
->instance
, status
);
1841 DMSG(vdc
, 0, "[%d] Init of descriptor ring completed (status = %d)\n",
1842 vdc
->instance
, status
);
1845 pkt
.tag
.vio_msgtype
= VIO_TYPE_CTRL
;
1846 pkt
.tag
.vio_subtype
= VIO_SUBTYPE_INFO
;
1847 pkt
.tag
.vio_subtype_env
= VIO_DRING_REG
;
1848 pkt
.tag
.vio_sid
= vdc
->session_id
;
1849 /* fill in payload */
1850 pkt
.dring_ident
= 0;
1851 pkt
.num_descriptors
= vdc
->dring_len
;
1852 pkt
.descriptor_size
= vdc
->dring_entry_size
;
1853 pkt
.options
= (VIO_TX_DRING
| VIO_RX_DRING
);
1854 pkt
.ncookies
= vdc
->dring_cookie_count
;
1855 pkt
.cookie
[0] = vdc
->dring_cookie
[0]; /* for now just one cookie */
1857 status
= vdc_send(vdc
, (caddr_t
)&pkt
, &msglen
);
1859 DMSG(vdc
, 0, "[%d] Failed to register DRing (err = %d)",
1860 vdc
->instance
, status
);
1869 * vdc_dring_negotiation()
1874 * vdc - soft state pointer for this instance of the device driver.
1880 vdc_dring_negotiation(vdc_t
*vdcp
)
1885 if (status
= vdc_init_dring_negotiate(vdcp
))
1888 /* release lock and wait for response */
1889 mutex_exit(&vdcp
->lock
);
1890 status
= vdc_wait_for_response(vdcp
, &vio_msg
);
1891 mutex_enter(&vdcp
->lock
);
1894 "[%d] Failed waiting for Dring negotiation response,"
1895 " rv(%d)", vdcp
->instance
, status
);
1899 /* check type and sub_type ... */
1900 if (vio_msg
.tag
.vio_msgtype
!= VIO_TYPE_CTRL
||
1901 vio_msg
.tag
.vio_subtype
== VIO_SUBTYPE_INFO
) {
1902 DMSG(vdcp
, 0, "[%d] Invalid Dring negotiation response\n",
1907 return (vdc_handle_dring_reg_msg(vdcp
,
1908 (vio_dring_reg_msg_t
*)&vio_msg
));
1919 * vdc - soft state pointer for this instance of the device driver.
1925 vdc_send_rdx(vdc_t
*vdcp
)
1928 size_t msglen
= sizeof (vio_msg_t
);
1932 * Send an RDX message to vds to indicate we are ready
1935 msg
.tag
.vio_msgtype
= VIO_TYPE_CTRL
;
1936 msg
.tag
.vio_subtype
= VIO_SUBTYPE_INFO
;
1937 msg
.tag
.vio_subtype_env
= VIO_RDX
;
1938 msg
.tag
.vio_sid
= vdcp
->session_id
;
1939 status
= vdc_send(vdcp
, (caddr_t
)&msg
, &msglen
);
1941 DMSG(vdcp
, 0, "[%d] Failed to send RDX message (%d)",
1942 vdcp
->instance
, status
);
1955 * vdc - soft state pointer for this instance of the device driver.
1956 * msgp - received msg
1962 vdc_handle_rdx(vdc_t
*vdcp
, vio_rdx_msg_t
*msgp
)
1964 _NOTE(ARGUNUSED(vdcp
))
1965 _NOTE(ARGUNUSED(msgp
))
1967 ASSERT(msgp
->tag
.vio_msgtype
== VIO_TYPE_CTRL
);
1968 ASSERT(msgp
->tag
.vio_subtype
== VIO_SUBTYPE_ACK
);
1969 ASSERT(msgp
->tag
.vio_subtype_env
== VIO_RDX
);
1971 DMSG(vdcp
, 1, "[%d] Got an RDX msg", vdcp
->instance
);
1978 * vdc_rdx_exchange()
1983 * vdc - soft state pointer for this instance of the device driver.
1989 vdc_rdx_exchange(vdc_t
*vdcp
)
1994 if (status
= vdc_send_rdx(vdcp
))
1997 /* release lock and wait for response */
1998 mutex_exit(&vdcp
->lock
);
1999 status
= vdc_wait_for_response(vdcp
, &vio_msg
);
2000 mutex_enter(&vdcp
->lock
);
2002 DMSG(vdcp
, 0, "[%d] Failed waiting for RDX response, rv(%d)",
2003 vdcp
->instance
, status
);
2007 /* check type and sub_type ... */
2008 if (vio_msg
.tag
.vio_msgtype
!= VIO_TYPE_CTRL
||
2009 vio_msg
.tag
.vio_subtype
!= VIO_SUBTYPE_ACK
) {
2010 DMSG(vdcp
, 0, "[%d] Invalid RDX response\n", vdcp
->instance
);
2014 return (vdc_handle_rdx(vdcp
, (vio_rdx_msg_t
*)&vio_msg
));
2018 /* -------------------------------------------------------------------------- */
2021 * LDC helper routines
2025 vdc_recv(vdc_t
*vdc
, vio_msg_t
*msgp
, size_t *nbytesp
)
2028 uint64_t delay_time
;
2032 * Until we get a blocking ldc read we have to retry until the entire
2033 * LDC message has arrived before ldc_read() will return that message.
2034 * If ldc_read() succeed but returns a zero length message then that
2035 * means that the LDC queue is empty and we have to wait for a
2036 * notification from the LDC callback which will set the read_state to
2037 * VDC_READ_PENDING. Note we also bail out if the channel is reset or
2040 delay_time
= vdc_ldc_read_init_delay
;
2046 * vdc->curr_server is protected by vdc->lock but to avoid
2047 * contentions we don't take the lock here. We can do this
2048 * safely because vdc_recv() is only called from thread
2049 * process_msg_thread() which is also the only thread that
2050 * can change vdc->curr_server.
2052 status
= ldc_read(vdc
->curr_server
->ldc_handle
,
2053 (caddr_t
)msgp
, &len
);
2055 if (status
== EAGAIN
) {
2057 if (delay_time
>= vdc_ldc_read_max_delay
)
2058 delay_time
= vdc_ldc_read_max_delay
;
2064 DMSG(vdc
, 0, "ldc_read returned %d\n", status
);
2073 mutex_enter(&vdc
->read_lock
);
2075 while (vdc
->read_state
!= VDC_READ_PENDING
) {
2077 /* detect if the connection has been reset */
2078 if (vdc
->read_state
== VDC_READ_RESET
) {
2079 mutex_exit(&vdc
->read_lock
);
2080 return (ECONNRESET
);
2083 vdc
->read_state
= VDC_READ_WAITING
;
2084 cv_wait(&vdc
->read_cv
, &vdc
->read_lock
);
2087 vdc
->read_state
= VDC_READ_IDLE
;
2088 mutex_exit(&vdc
->read_lock
);
2090 delay_time
= vdc_ldc_read_init_delay
;
2100 vdc_decode_tag(vdc_t
*vdcp
, vio_msg_t
*msg
)
2102 char *ms
, *ss
, *ses
;
2103 switch (msg
->tag
.vio_msgtype
) {
2104 #define Q(_s) case _s : ms = #_s; break;
2109 default: ms
= "unknown"; break;
2112 switch (msg
->tag
.vio_subtype
) {
2113 #define Q(_s) case _s : ss = #_s; break;
2118 default: ss
= "unknown"; break;
2121 switch (msg
->tag
.vio_subtype_env
) {
2122 #define Q(_s) case _s : ses = #_s; break;
2132 default: ses
= "unknown"; break;
2135 DMSG(vdcp
, 3, "(%x/%x/%x) message : (%s/%s/%s)\n",
2136 msg
->tag
.vio_msgtype
, msg
->tag
.vio_subtype
,
2137 msg
->tag
.vio_subtype_env
, ms
, ss
, ses
);
2146 * The function encapsulates the call to write a message using LDC.
2147 * If LDC indicates that the call failed due to the queue being full,
2148 * we retry the ldc_write(), otherwise we return the error returned by LDC.
2151 * ldc_handle - LDC handle for the channel this instance of vdc uses
2152 * pkt - address of LDC message to be sent
2153 * msglen - the size of the message being sent. When the function
2154 * returns, this contains the number of bytes written.
2158 * EINVAL - pkt or msglen were NULL
2159 * ECONNRESET - The connection was not up.
2160 * EWOULDBLOCK - LDC queue is full
2161 * xxx - other error codes returned by ldc_write
2164 vdc_send(vdc_t
*vdc
, caddr_t pkt
, size_t *msglen
)
2168 clock_t delay_ticks
;
2170 ASSERT(vdc
!= NULL
);
2171 ASSERT(mutex_owned(&vdc
->lock
));
2172 ASSERT(msglen
!= NULL
);
2173 ASSERT(*msglen
!= 0);
2176 vdc_decode_tag(vdc
, (vio_msg_t
*)(uintptr_t)pkt
);
2179 * Wait indefinitely to send if channel
2180 * is busy, but bail out if we succeed or
2181 * if the channel closes or is reset.
2183 delay_ticks
= vdc_hz_min_ldc_delay
;
2186 status
= ldc_write(vdc
->curr_server
->ldc_handle
, pkt
, &size
);
2187 if (status
== EWOULDBLOCK
) {
2189 /* geometric backoff */
2191 if (delay_ticks
> vdc_hz_max_ldc_delay
)
2192 delay_ticks
= vdc_hz_max_ldc_delay
;
2194 } while (status
== EWOULDBLOCK
);
2196 /* if LDC had serious issues --- reset vdc state */
2197 if (status
== EIO
|| status
== ECONNRESET
) {
2198 /* LDC had serious issues --- reset vdc state */
2199 mutex_enter(&vdc
->read_lock
);
2200 if ((vdc
->read_state
== VDC_READ_WAITING
) ||
2201 (vdc
->read_state
== VDC_READ_RESET
))
2202 cv_signal(&vdc
->read_cv
);
2203 vdc
->read_state
= VDC_READ_RESET
;
2204 mutex_exit(&vdc
->read_lock
);
2206 /* wake up any waiters in the reset thread */
2207 if (vdc
->state
== VDC_STATE_INIT_WAITING
) {
2208 DMSG(vdc
, 0, "[%d] write reset - "
2209 "vdc is resetting ..\n", vdc
->instance
);
2210 vdc
->state
= VDC_STATE_RESETTING
;
2211 cv_signal(&vdc
->initwait_cv
);
2214 return (ECONNRESET
);
2217 /* return the last size written */
2228 * Get the MD, the device node for the given disk instance. The
2229 * caller is responsible for cleaning up the reference to the
2230 * returned MD (mdpp) by calling md_fini_handle().
2233 * dip - dev info pointer for this instance of the device driver.
2234 * mdpp - the returned MD.
2235 * vd_nodep - the returned device node.
2239 * ENOENT - Expected node or property did not exist.
2240 * ENXIO - Unexpected error communicating with MD framework
2243 vdc_get_md_node(dev_info_t
*dip
, md_t
**mdpp
, mde_cookie_t
*vd_nodep
)
2245 int status
= ENOENT
;
2246 char *node_name
= NULL
;
2250 mde_cookie_t rootnode
;
2251 mde_cookie_t
*listp
= NULL
;
2252 boolean_t found_inst
= B_FALSE
;
2257 int instance
= ddi_get_instance(dip
);
2260 * Get the OBP instance number for comparison with the MD instance
2262 * The "cfg-handle" property of a vdc node in an MD contains the MD's
2263 * notion of "instance", or unique identifier, for that node; OBP
2264 * stores the value of the "cfg-handle" MD property as the value of
2265 * the "reg" property on the node in the device tree it builds from
2266 * the MD and passes to Solaris. Thus, we look up the devinfo node's
2267 * "reg" property value to uniquely identify this device instance.
2268 * If the "reg" property cannot be found, the device tree state is
2269 * presumably so broken that there is no point in continuing.
2271 if (!ddi_prop_exists(DDI_DEV_T_ANY
, dip
, DDI_PROP_DONTPASS
, OBP_REG
)) {
2272 cmn_err(CE_WARN
, "'%s' property does not exist", OBP_REG
);
2275 obp_inst
= ddi_prop_get_int(DDI_DEV_T_ANY
, dip
, DDI_PROP_DONTPASS
,
2277 DMSGX(1, "[%d] OBP inst=%d\n", instance
, obp_inst
);
2280 * We now walk the MD nodes to find the node for this vdisk.
2282 if ((mdp
= md_get_handle()) == NULL
) {
2283 cmn_err(CE_WARN
, "unable to init machine description");
2287 num_nodes
= md_node_count(mdp
);
2288 ASSERT(num_nodes
> 0);
2290 listsz
= num_nodes
* sizeof (mde_cookie_t
);
2292 /* allocate memory for nodes */
2293 listp
= kmem_zalloc(listsz
, KM_SLEEP
);
2295 rootnode
= md_root_node(mdp
);
2296 ASSERT(rootnode
!= MDE_INVAL_ELEM_COOKIE
);
2299 * Search for all the virtual devices, we will then check to see which
2300 * ones are disk nodes.
2302 num_vdevs
= md_scan_dag(mdp
, rootnode
,
2303 md_find_name(mdp
, VDC_MD_VDEV_NAME
),
2304 md_find_name(mdp
, "fwd"), listp
);
2306 if (num_vdevs
<= 0) {
2307 cmn_err(CE_NOTE
, "No '%s' node found", VDC_MD_VDEV_NAME
);
2312 DMSGX(1, "[%d] num_vdevs=%d\n", instance
, num_vdevs
);
2313 for (idx
= 0; idx
< num_vdevs
; idx
++) {
2314 status
= md_get_prop_str(mdp
, listp
[idx
], "name", &node_name
);
2315 if ((status
!= 0) || (node_name
== NULL
)) {
2316 cmn_err(CE_NOTE
, "Unable to get name of node type '%s'"
2317 ": err %d", VDC_MD_VDEV_NAME
, status
);
2321 DMSGX(1, "[%d] Found node '%s'\n", instance
, node_name
);
2322 if (strcmp(VDC_MD_DISK_NAME
, node_name
) == 0) {
2323 status
= md_get_prop_val(mdp
, listp
[idx
],
2324 VDC_MD_CFG_HDL
, &md_inst
);
2325 DMSGX(1, "[%d] vdc inst in MD=%lx\n",
2327 if ((status
== 0) && (md_inst
== obp_inst
)) {
2328 found_inst
= B_TRUE
;
2335 DMSGX(0, "Unable to find correct '%s' node", VDC_MD_DISK_NAME
);
2339 DMSGX(0, "[%d] MD inst=%lx\n", instance
, md_inst
);
2341 *vd_nodep
= listp
[idx
];
2344 kmem_free(listp
, listsz
);
2353 * Initialize all the ports for this vdisk instance.
2356 * vdc - soft state pointer for this instance of the device driver.
2358 * vd_nodep - device md node.
2362 * ENOENT - Expected node or property did not exist.
2365 vdc_init_ports(vdc_t
*vdc
, md_t
*mdp
, mde_cookie_t vd_nodep
)
2373 mde_cookie_t vd_port
;
2374 mde_cookie_t
*chanp
= NULL
;
2375 mde_cookie_t
*portp
= NULL
;
2377 vdc_server_t
*prev_srvr
= NULL
;
2380 * We now walk the MD nodes to find the port nodes for this vdisk.
2382 num_nodes
= md_node_count(mdp
);
2383 ASSERT(num_nodes
> 0);
2385 listsz
= num_nodes
* sizeof (mde_cookie_t
);
2387 /* allocate memory for nodes */
2388 portp
= kmem_zalloc(listsz
, KM_SLEEP
);
2389 chanp
= kmem_zalloc(listsz
, KM_SLEEP
);
2391 num_vports
= md_scan_dag(mdp
, vd_nodep
,
2392 md_find_name(mdp
, VDC_MD_PORT_NAME
),
2393 md_find_name(mdp
, "fwd"), portp
);
2394 if (num_vports
== 0) {
2395 DMSGX(0, "Found no '%s' node for '%s' port\n",
2396 VDC_MD_PORT_NAME
, VDC_MD_VDEV_NAME
);
2401 DMSGX(1, "Found %d '%s' node(s) for '%s' port\n",
2402 num_vports
, VDC_MD_PORT_NAME
, VDC_MD_VDEV_NAME
);
2404 vdc
->num_servers
= 0;
2405 for (idx
= 0; idx
< num_vports
; idx
++) {
2407 /* initialize this port */
2408 vd_port
= portp
[idx
];
2409 srvr
= kmem_zalloc(sizeof (vdc_server_t
), KM_SLEEP
);
2411 srvr
->svc_state
= VDC_SERVICE_OFFLINE
;
2412 srvr
->log_state
= VDC_SERVICE_NONE
;
2415 if (md_get_prop_val(mdp
, vd_port
, VDC_MD_ID
, &srvr
->id
) != 0) {
2416 cmn_err(CE_NOTE
, "vDisk port '%s' property not found",
2418 kmem_free(srvr
, sizeof (vdc_server_t
));
2422 /* set the connection timeout */
2423 if (md_get_prop_val(mdp
, vd_port
, VDC_MD_TIMEOUT
,
2424 &srvr
->ctimeout
) != 0) {
2428 /* get the ldc id */
2429 num_chans
= md_scan_dag(mdp
, vd_port
,
2430 md_find_name(mdp
, VDC_MD_CHAN_NAME
),
2431 md_find_name(mdp
, "fwd"), chanp
);
2433 /* expecting at least one channel */
2434 if (num_chans
<= 0) {
2435 cmn_err(CE_NOTE
, "No '%s' node for '%s' port",
2436 VDC_MD_CHAN_NAME
, VDC_MD_VDEV_NAME
);
2437 kmem_free(srvr
, sizeof (vdc_server_t
));
2439 } else if (num_chans
!= 1) {
2440 DMSGX(0, "Expected 1 '%s' node for '%s' port, "
2441 "found %d\n", VDC_MD_CHAN_NAME
, VDC_MD_VDEV_NAME
,
2446 * We use the first channel found (index 0), irrespective of how
2447 * many are there in total.
2449 if (md_get_prop_val(mdp
, chanp
[0], VDC_MD_ID
,
2450 &srvr
->ldc_id
) != 0) {
2451 cmn_err(CE_NOTE
, "Channel '%s' property not found",
2453 kmem_free(srvr
, sizeof (vdc_server_t
));
2458 * now initialise LDC channel which will be used to
2459 * communicate with this server
2461 if (vdc_do_ldc_init(vdc
, srvr
) != 0) {
2462 kmem_free(srvr
, sizeof (vdc_server_t
));
2466 /* add server to list */
2468 prev_srvr
->next
= srvr
;
2470 vdc
->server_list
= srvr
;
2474 /* inc numbers of servers */
2478 /* pick first server as current server */
2479 if (vdc
->server_list
!= NULL
) {
2480 vdc
->curr_server
= vdc
->server_list
;
2487 kmem_free(chanp
, listsz
);
2488 kmem_free(portp
, listsz
);
2498 * Bring the channel for the current server up.
2501 * vdc - soft state pointer for this instance of the device driver.
2505 * EINVAL - Driver is detaching / LDC error
2506 * ECONNREFUSED - Other end is not listening
2509 vdc_do_ldc_up(vdc_t
*vdc
)
2512 ldc_status_t ldc_state
;
2514 ASSERT(MUTEX_HELD(&vdc
->lock
));
2516 DMSG(vdc
, 0, "[%d] Bringing up channel %lx\n",
2517 vdc
->instance
, vdc
->curr_server
->ldc_id
);
2519 if (vdc
->lifecycle
== VDC_LC_DETACHING
)
2522 if ((status
= ldc_up(vdc
->curr_server
->ldc_handle
)) != 0) {
2524 case ECONNREFUSED
: /* listener not ready at other end */
2525 DMSG(vdc
, 0, "[%d] ldc_up(%lx,...) return %d\n",
2526 vdc
->instance
, vdc
->curr_server
->ldc_id
, status
);
2530 DMSG(vdc
, 0, "[%d] Failed to bring up LDC: "
2531 "channel=%ld, err=%d", vdc
->instance
,
2532 vdc
->curr_server
->ldc_id
, status
);
2537 if (ldc_status(vdc
->curr_server
->ldc_handle
, &ldc_state
) == 0) {
2538 vdc
->curr_server
->ldc_state
= ldc_state
;
2539 if (ldc_state
== LDC_UP
) {
2540 DMSG(vdc
, 0, "[%d] LDC channel already up\n",
2543 vdc
->seq_num_reply
= 0;
2552 * vdc_terminate_ldc()
2557 * vdc - soft state pointer for this instance of the device driver.
2558 * srvr - vdc per-server info structure
2564 vdc_terminate_ldc(vdc_t
*vdc
, vdc_server_t
*srvr
)
2566 int instance
= ddi_get_instance(vdc
->dip
);
2568 if (srvr
->state
& VDC_LDC_OPEN
) {
2569 DMSG(vdc
, 0, "[%d] ldc_close()\n", instance
);
2570 (void) ldc_close(srvr
->ldc_handle
);
2572 if (srvr
->state
& VDC_LDC_CB
) {
2573 DMSG(vdc
, 0, "[%d] ldc_unreg_callback()\n", instance
);
2574 (void) ldc_unreg_callback(srvr
->ldc_handle
);
2576 if (srvr
->state
& VDC_LDC_INIT
) {
2577 DMSG(vdc
, 0, "[%d] ldc_fini()\n", instance
);
2578 (void) ldc_fini(srvr
->ldc_handle
);
2579 srvr
->ldc_handle
= NULL
;
2582 srvr
->state
&= ~(VDC_LDC_INIT
| VDC_LDC_CB
| VDC_LDC_OPEN
);
2590 * Finalize all ports by closing the channel associated with each
2591 * port and also freeing the server structure.
2594 * vdc - soft state pointer for this instance of the device driver.
2600 vdc_fini_ports(vdc_t
*vdc
)
2602 int instance
= ddi_get_instance(vdc
->dip
);
2603 vdc_server_t
*srvr
, *prev_srvr
;
2605 ASSERT(vdc
!= NULL
);
2606 ASSERT(mutex_owned(&vdc
->lock
));
2608 DMSG(vdc
, 0, "[%d] initialized=%x\n", instance
, vdc
->initialized
);
2610 srvr
= vdc
->server_list
;
2614 vdc_terminate_ldc(vdc
, srvr
);
2621 kmem_free(prev_srvr
, sizeof (vdc_server_t
));
2624 vdc
->server_list
= NULL
;
2625 vdc
->num_servers
= 0;
2628 /* -------------------------------------------------------------------------- */
2631 * Descriptor Ring helper routines
2636 * vdc_init_descriptor_ring()
2641 * vdc - soft state pointer for this instance of the device driver.
2647 vdc_init_descriptor_ring(vdc_t
*vdc
)
2649 vd_dring_entry_t
*dep
= NULL
; /* DRing Entry pointer */
2653 DMSG(vdc
, 0, "[%d] initialized=%x\n", vdc
->instance
, vdc
->initialized
);
2655 ASSERT(vdc
!= NULL
);
2656 ASSERT(mutex_owned(&vdc
->lock
));
2658 /* ensure we have enough room to store max sized block */
2659 ASSERT(maxphys
<= VD_MAX_BLOCK_SIZE
);
2661 if ((vdc
->initialized
& VDC_DRING_INIT
) == 0) {
2662 DMSG(vdc
, 0, "[%d] ldc_mem_dring_create\n", vdc
->instance
);
2664 * Calculate the maximum block size we can transmit using one
2665 * Descriptor Ring entry from the attributes returned by the
2666 * vDisk server. This is subject to a minimum of 'maxphys'
2667 * as we do not have the capability to split requests over
2668 * multiple DRing entries.
2670 if ((vdc
->max_xfer_sz
* vdc
->vdisk_bsize
) < maxphys
) {
2671 DMSG(vdc
, 0, "[%d] using minimum DRing size\n",
2673 vdc
->dring_max_cookies
= maxphys
/ PAGESIZE
;
2675 vdc
->dring_max_cookies
=
2676 (vdc
->max_xfer_sz
* vdc
->vdisk_bsize
) / PAGESIZE
;
2678 vdc
->dring_entry_size
= (sizeof (vd_dring_entry_t
) +
2679 (sizeof (ldc_mem_cookie_t
) *
2680 (vdc
->dring_max_cookies
- 1)));
2681 vdc
->dring_len
= VD_DRING_LEN
;
2683 status
= ldc_mem_dring_create(vdc
->dring_len
,
2684 vdc
->dring_entry_size
, &vdc
->dring_hdl
);
2685 if ((vdc
->dring_hdl
== NULL
) || (status
!= 0)) {
2686 DMSG(vdc
, 0, "[%d] Descriptor ring creation failed",
2690 vdc
->initialized
|= VDC_DRING_INIT
;
2693 if ((vdc
->initialized
& VDC_DRING_BOUND
) == 0) {
2694 DMSG(vdc
, 0, "[%d] ldc_mem_dring_bind\n", vdc
->instance
);
2696 kmem_zalloc(sizeof (ldc_mem_cookie_t
), KM_SLEEP
);
2698 status
= ldc_mem_dring_bind(vdc
->curr_server
->ldc_handle
,
2700 LDC_SHADOW_MAP
|LDC_DIRECT_MAP
, LDC_MEM_RW
,
2701 &vdc
->dring_cookie
[0],
2702 &vdc
->dring_cookie_count
);
2704 DMSG(vdc
, 0, "[%d] Failed to bind descriptor ring "
2705 "(%lx) to channel (%lx) status=%d\n",
2706 vdc
->instance
, vdc
->dring_hdl
,
2707 vdc
->curr_server
->ldc_handle
, status
);
2710 ASSERT(vdc
->dring_cookie_count
== 1);
2711 vdc
->initialized
|= VDC_DRING_BOUND
;
2714 status
= ldc_mem_dring_info(vdc
->dring_hdl
, &vdc
->dring_mem_info
);
2717 "[%d] Failed to get info for descriptor ring (%lx)\n",
2718 vdc
->instance
, vdc
->dring_hdl
);
2722 if ((vdc
->initialized
& VDC_DRING_LOCAL
) == 0) {
2723 DMSG(vdc
, 0, "[%d] local dring\n", vdc
->instance
);
2725 /* Allocate the local copy of this dring */
2727 kmem_zalloc(vdc
->dring_len
* sizeof (vdc_local_desc_t
),
2729 vdc
->initialized
|= VDC_DRING_LOCAL
;
2733 * Mark all DRing entries as free and initialize the private
2734 * descriptor's memory handles. If any entry is initialized,
2735 * we need to free it later so we set the bit in 'initialized'
2738 vdc
->initialized
|= VDC_DRING_ENTRY
;
2739 for (i
= 0; i
< vdc
->dring_len
; i
++) {
2740 dep
= VDC_GET_DRING_ENTRY_PTR(vdc
, i
);
2741 dep
->hdr
.dstate
= VIO_DESC_FREE
;
2743 status
= ldc_mem_alloc_handle(vdc
->curr_server
->ldc_handle
,
2744 &vdc
->local_dring
[i
].desc_mhdl
);
2746 DMSG(vdc
, 0, "![%d] Failed to alloc mem handle for"
2747 " descriptor %d", vdc
->instance
, i
);
2750 vdc
->local_dring
[i
].is_free
= B_TRUE
;
2751 vdc
->local_dring
[i
].dep
= dep
;
2754 /* Initialize the starting index */
2755 vdc
->dring_curr_idx
= VDC_DRING_FIRST_ENTRY
;
2762 * vdc_destroy_descriptor_ring()
2767 * vdc - soft state pointer for this instance of the device driver.
2773 vdc_destroy_descriptor_ring(vdc_t
*vdc
)
2775 vdc_local_desc_t
*ldep
= NULL
; /* Local Dring Entry Pointer */
2776 ldc_mem_handle_t mhdl
= NULL
;
2777 ldc_mem_info_t minfo
;
2781 ASSERT(vdc
!= NULL
);
2782 ASSERT(mutex_owned(&vdc
->lock
));
2784 DMSG(vdc
, 0, "[%d] Entered\n", vdc
->instance
);
2786 if (vdc
->initialized
& VDC_DRING_ENTRY
) {
2788 "[%d] Removing Local DRing entries\n", vdc
->instance
);
2789 for (i
= 0; i
< vdc
->dring_len
; i
++) {
2790 ldep
= &vdc
->local_dring
[i
];
2791 mhdl
= ldep
->desc_mhdl
;
2796 if ((status
= ldc_mem_info(mhdl
, &minfo
)) != 0) {
2798 "ldc_mem_info returned an error: %d\n",
2802 * This must mean that the mem handle
2803 * is not valid. Clear it out so that
2804 * no one tries to use it.
2806 ldep
->desc_mhdl
= NULL
;
2810 if (minfo
.status
== LDC_BOUND
) {
2811 (void) ldc_mem_unbind_handle(mhdl
);
2814 (void) ldc_mem_free_handle(mhdl
);
2816 ldep
->desc_mhdl
= NULL
;
2818 vdc
->initialized
&= ~VDC_DRING_ENTRY
;
2821 if (vdc
->initialized
& VDC_DRING_LOCAL
) {
2822 DMSG(vdc
, 0, "[%d] Freeing Local DRing\n", vdc
->instance
);
2823 kmem_free(vdc
->local_dring
,
2824 vdc
->dring_len
* sizeof (vdc_local_desc_t
));
2825 vdc
->initialized
&= ~VDC_DRING_LOCAL
;
2828 if (vdc
->initialized
& VDC_DRING_BOUND
) {
2829 DMSG(vdc
, 0, "[%d] Unbinding DRing\n", vdc
->instance
);
2830 status
= ldc_mem_dring_unbind(vdc
->dring_hdl
);
2832 vdc
->initialized
&= ~VDC_DRING_BOUND
;
2834 DMSG(vdc
, 0, "[%d] Error %d unbinding DRing %lx",
2835 vdc
->instance
, status
, vdc
->dring_hdl
);
2837 kmem_free(vdc
->dring_cookie
, sizeof (ldc_mem_cookie_t
));
2840 if (vdc
->initialized
& VDC_DRING_INIT
) {
2841 DMSG(vdc
, 0, "[%d] Destroying DRing\n", vdc
->instance
);
2842 status
= ldc_mem_dring_destroy(vdc
->dring_hdl
);
2844 vdc
->dring_hdl
= NULL
;
2845 bzero(&vdc
->dring_mem_info
, sizeof (ldc_mem_info_t
));
2846 vdc
->initialized
&= ~VDC_DRING_INIT
;
2848 DMSG(vdc
, 0, "[%d] Error %d destroying DRing (%lx)",
2849 vdc
->instance
, status
, vdc
->dring_hdl
);
2856 * vdc_map_to_shared_dring()
2859 * Copy contents of the local descriptor to the shared
2860 * memory descriptor.
2863 * vdcp - soft state pointer for this instance of the device driver.
2864 * idx - descriptor ring index
2870 vdc_map_to_shared_dring(vdc_t
*vdcp
, int idx
)
2872 vdc_local_desc_t
*ldep
;
2873 vd_dring_entry_t
*dep
;
2876 ldep
= &(vdcp
->local_dring
[idx
]);
2878 /* for now leave in the old pop_mem_hdl stuff */
2879 if (ldep
->nbytes
> 0) {
2880 rv
= vdc_populate_mem_hdl(vdcp
, ldep
);
2882 DMSG(vdcp
, 0, "[%d] Cannot populate mem handle\n",
2889 * fill in the data details into the DRing
2892 ASSERT(dep
!= NULL
);
2894 dep
->payload
.req_id
= VDC_GET_NEXT_REQ_ID(vdcp
);
2895 dep
->payload
.operation
= ldep
->operation
;
2896 dep
->payload
.addr
= ldep
->offset
;
2897 dep
->payload
.nbytes
= ldep
->nbytes
;
2898 dep
->payload
.status
= (uint32_t)-1; /* vds will set valid value */
2899 dep
->payload
.slice
= ldep
->slice
;
2900 dep
->hdr
.dstate
= VIO_DESC_READY
;
2901 dep
->hdr
.ack
= 1; /* request an ACK for every message */
2911 * This routine writes the data to be transmitted to vds into the
2912 * descriptor, notifies vds that the ring has been updated and
2913 * then waits for the request to be processed.
2916 * vdcp - the soft state pointer
2917 * operation - operation we want vds to perform (VD_OP_XXX)
2918 * addr - address of data buf to be read/written.
2919 * nbytes - number of bytes to read/write
2920 * slice - the disk slice this request is for
2921 * offset - relative disk offset
2922 * bufp - buf of operation
2923 * dir - direction of operation (READ/WRITE/BOTH)
2930 vdc_send_request(vdc_t
*vdcp
, int operation
, caddr_t addr
,
2931 size_t nbytes
, int slice
, diskaddr_t offset
, buf_t
*bufp
,
2932 vio_desc_direction_t dir
, int flags
)
2936 ASSERT(vdcp
!= NULL
);
2937 ASSERT(slice
== VD_SLICE_NONE
|| slice
< V_NUMPAR
);
2939 mutex_enter(&vdcp
->lock
);
2942 * If this is a block read/write operation we update the I/O statistics
2943 * to indicate that the request is being put on the waitq to be
2944 * serviced. Operations which are resubmitted are already in the waitq.
2946 * We do it here (a common routine for both synchronous and strategy
2947 * calls) for performance reasons - we are already holding vdc->lock
2948 * so there is no extra locking overhead. We would have to explicitly
2949 * grab the 'lock' mutex to update the stats if we were to do this
2950 * higher up the stack in vdc_strategy() et al.
2952 if (((operation
== VD_OP_BREAD
) || (operation
== VD_OP_BWRITE
)) &&
2953 !(flags
& VDC_OP_RESUBMIT
)) {
2954 DTRACE_IO1(start
, buf_t
*, bufp
);
2955 VD_KSTAT_WAITQ_ENTER(vdcp
);
2959 * If the request does not expect the state to be VDC_STATE_RUNNING
2960 * then we just try to populate the descriptor ring once.
2962 if (!(flags
& VDC_OP_STATE_RUNNING
)) {
2963 rv
= vdc_populate_descriptor(vdcp
, operation
, addr
,
2964 nbytes
, slice
, offset
, bufp
, dir
, flags
);
2969 while (vdcp
->state
!= VDC_STATE_RUNNING
) {
2971 /* return error if detaching */
2972 if (vdcp
->state
== VDC_STATE_DETACH
) {
2978 * If we are panicking and the disk is not ready then
2979 * we can't send any request because we can't complete
2980 * the handshake now.
2982 if (ddi_in_panic()) {
2988 * If the state is faulted, notify that a new I/O is
2989 * being submitted to force the system to check if any
2990 * server has recovered.
2992 if (vdcp
->state
== VDC_STATE_FAILED
) {
2993 vdcp
->io_pending
= B_TRUE
;
2994 cv_signal(&vdcp
->io_pending_cv
);
2997 cv_wait(&vdcp
->running_cv
, &vdcp
->lock
);
2999 /* if service is still faulted then fail the request */
3000 if (vdcp
->state
== VDC_STATE_FAILED
) {
3006 } while (vdc_populate_descriptor(vdcp
, operation
, addr
,
3007 nbytes
, slice
, offset
, bufp
, dir
, flags
& ~VDC_OP_RESUBMIT
));
3011 * If this is a block read/write we update the I/O statistics kstat
3012 * to indicate that this request has been placed on the queue for
3013 * processing (i.e sent to the vDisk server) - iostat(1M) will
3014 * report the time waiting for the vDisk server under the %b column
3016 * In the case of an error we take it off the wait queue only if
3017 * the I/O was not resubmitted.
3019 if ((operation
== VD_OP_BREAD
) || (operation
== VD_OP_BWRITE
)) {
3021 VD_KSTAT_WAITQ_TO_RUNQ(vdcp
);
3022 DTRACE_PROBE1(send
, buf_t
*, bufp
);
3024 VD_UPDATE_ERR_STATS(vdcp
, vd_transerrs
);
3025 if (!(flags
& VDC_OP_RESUBMIT
)) {
3026 VD_KSTAT_WAITQ_EXIT(vdcp
);
3027 DTRACE_IO1(done
, buf_t
*, bufp
);
3032 mutex_exit(&vdcp
->lock
);
3040 * vdc_populate_descriptor
3043 * This routine writes the data to be transmitted to vds into the
3044 * descriptor, notifies vds that the ring has been updated and
3045 * then waits for the request to be processed.
3048 * vdcp - the soft state pointer
3049 * operation - operation we want vds to perform (VD_OP_XXX)
3050 * addr - address of data buf to be read/written.
3051 * nbytes - number of bytes to read/write
3052 * slice - the disk slice this request is for
3053 * offset - relative disk offset
3054 * bufp - buf of operation
3055 * dir - direction of operation (READ/WRITE/BOTH)
3064 vdc_populate_descriptor(vdc_t
*vdcp
, int operation
, caddr_t addr
,
3065 size_t nbytes
, int slice
, diskaddr_t offset
,
3066 buf_t
*bufp
, vio_desc_direction_t dir
, int flags
)
3068 vdc_local_desc_t
*local_dep
= NULL
; /* Local Dring Pointer */
3069 int idx
; /* Index of DRing entry used */
3071 vio_dring_msg_t dmsg
;
3075 ASSERT(MUTEX_HELD(&vdcp
->lock
));
3076 vdcp
->threads_pending
++;
3078 DMSG(vdcp
, 2, ": dring_curr_idx = %d\n", vdcp
->dring_curr_idx
);
3080 if (flags
& VDC_OP_DRING_RESERVED
) {
3081 /* use D-Ring reserved entry */
3082 idx
= VDC_DRING_FIRST_RESV
;
3083 local_dep
= &(vdcp
->local_dring
[idx
]);
3085 /* Get next available D-Ring entry */
3086 idx
= vdcp
->dring_curr_idx
;
3087 local_dep
= &(vdcp
->local_dring
[idx
]);
3089 if (!local_dep
->is_free
) {
3090 DMSG(vdcp
, 2, "[%d]: dring full - waiting for space\n",
3092 cv_wait(&vdcp
->dring_free_cv
, &vdcp
->lock
);
3093 if (vdcp
->state
== VDC_STATE_RUNNING
||
3094 vdcp
->state
== VDC_STATE_HANDLE_PENDING
) {
3097 vdcp
->threads_pending
--;
3098 return (ECONNRESET
);
3102 if (next_idx
>= vdcp
->dring_len
)
3103 next_idx
= VDC_DRING_FIRST_ENTRY
;
3104 vdcp
->dring_curr_idx
= next_idx
;
3107 ASSERT(local_dep
->is_free
);
3109 local_dep
->operation
= operation
;
3110 local_dep
->addr
= addr
;
3111 local_dep
->nbytes
= nbytes
;
3112 local_dep
->slice
= slice
;
3113 local_dep
->offset
= offset
;
3114 local_dep
->buf
= bufp
;
3115 local_dep
->dir
= dir
;
3116 local_dep
->flags
= flags
;
3118 local_dep
->is_free
= B_FALSE
;
3120 rv
= vdc_map_to_shared_dring(vdcp
, idx
);
3122 if (flags
& VDC_OP_DRING_RESERVED
) {
3123 DMSG(vdcp
, 0, "[%d]: cannot bind memory - error\n",
3126 * We can't wait if we are using reserved slot.
3127 * Free the descriptor and return.
3129 local_dep
->is_free
= B_TRUE
;
3130 vdcp
->threads_pending
--;
3133 DMSG(vdcp
, 0, "[%d]: cannot bind memory - waiting ..\n",
3135 /* free the descriptor */
3136 local_dep
->is_free
= B_TRUE
;
3137 vdcp
->dring_curr_idx
= idx
;
3138 cv_wait(&vdcp
->membind_cv
, &vdcp
->lock
);
3139 if (vdcp
->state
== VDC_STATE_RUNNING
||
3140 vdcp
->state
== VDC_STATE_HANDLE_PENDING
) {
3143 vdcp
->threads_pending
--;
3144 return (ECONNRESET
);
3148 * Send a msg with the DRing details to vds
3150 VIO_INIT_DRING_DATA_TAG(dmsg
);
3151 VDC_INIT_DRING_DATA_MSG_IDS(dmsg
, vdcp
);
3152 dmsg
.dring_ident
= vdcp
->dring_ident
;
3153 dmsg
.start_idx
= idx
;
3157 DTRACE_PROBE2(populate
, int, vdcp
->instance
,
3158 vdc_local_desc_t
*, local_dep
);
3159 DMSG(vdcp
, 2, "ident=0x%lx, st=%u, end=%u, seq=%ld\n",
3160 vdcp
->dring_ident
, dmsg
.start_idx
, dmsg
.end_idx
, dmsg
.seq_num
);
3163 * note we're still holding the lock here to
3164 * make sure the message goes out in order !!!...
3166 msglen
= sizeof (dmsg
);
3167 rv
= vdc_send(vdcp
, (caddr_t
)&dmsg
, &msglen
);
3171 * vdc_send initiates the reset on failure.
3172 * Since the transaction has already been put
3173 * on the local dring, it will automatically get
3174 * retried when the channel is reset. Given that,
3175 * it is ok to just return success even though the
3182 DMSG(vdcp
, 1, "sent via LDC: rv=%d\n", rv
);
3186 DMSG(vdcp
, 0, "unexpected error, rv=%d\n", rv
);
3191 vdcp
->threads_pending
--;
3200 * Wrapper around vdc_send_request(). Each request is associated with a
3201 * buf structure. If a buf structure is provided (bufp != NULL) then the
3202 * request will be submitted with that buf, and the caller can wait for
3203 * completion of the request with biowait(). If a buf structure is not
3204 * provided (bufp == NULL) then a buf structure is created and the function
3205 * waits for the completion of the request.
3207 * If the flag VDC_OP_STATE_RUNNING is set then vdc_send_request() will
3208 * submit the request only when the vdisk is in state VDC_STATE_RUNNING.
3209 * If the vdisk is not in that state then vdc_send_request() will
3210 * wait for that state to be reached. After the request is submitted, the
3211 * reply will be processed asynchronously by the vdc_process_msg_thread()
3214 * If the flag VDC_OP_STATE_RUNNING is not set then vdc_send_request()
3215 * submits the request whatever the state of the vdisk is. Then vdc_do_op()
3216 * will wait for a reply message, process the reply and complete the
3220 * vdc - the soft state pointer
3221 * op - operation we want vds to perform (VD_OP_XXX)
3222 * addr - address of data buf to be read/written.
3223 * nbytes - number of bytes to read/write
3224 * slice - the disk slice this request is for
3225 * offset - relative disk offset
3226 * bufp - buf structure associated with the request (can be NULL).
3227 * dir - direction of operation (READ/WRITE/BOTH)
3228 * flags - flags for the request.
3231 * 0 - the request has been successfully submitted and completed.
3232 * != 0 - the request has failed. In that case, if a buf structure
3233 * was provided (bufp != NULL) then the B_ERROR flag is set
3234 * and the b_error field of the buf structure is set to EIO.
3237 vdc_do_op(vdc_t
*vdc
, int op
, caddr_t addr
, size_t nbytes
, int slice
,
3238 diskaddr_t offset
, struct buf
*bufp
, vio_desc_direction_t dir
, int flags
)
3246 * We use buf just as a convenient way to get a notification
3247 * that the request is completed, so we initialize buf to the
3251 buf
.b_bcount
= nbytes
;
3252 buf
.b_flags
= B_BUSY
;
3256 rv
= vdc_send_request(vdc
, op
, addr
, nbytes
, slice
, offset
, bufp
,
3263 * If the request should be done in VDC_STATE_RUNNING state then the
3264 * reply will be received and processed by vdc_process_msg_thread()
3265 * and we just have to handle the panic case. Otherwise we have to
3266 * wait for the reply message and process it.
3268 if (flags
& VDC_OP_STATE_RUNNING
) {
3270 if (ddi_in_panic()) {
3271 rv
= vdc_drain_response(vdc
, bufp
);
3276 /* wait for the response message */
3277 rv
= vdc_wait_for_response(vdc
, &vio_msg
);
3280 rv
= vdc_process_data_msg(vdc
, &vio_msg
);
3284 * If this is a block read/write we update the I/O
3285 * statistics kstat to take it off the run queue.
3286 * If it is a resubmit then it needs to stay in
3287 * the waitq, and it will be removed when the
3288 * I/O is eventually completed or cancelled.
3290 mutex_enter(&vdc
->lock
);
3291 if (op
== VD_OP_BREAD
|| op
== VD_OP_BWRITE
) {
3292 if (flags
& VDC_OP_RESUBMIT
) {
3293 VD_KSTAT_RUNQ_BACK_TO_WAITQ(vdc
);
3295 VD_KSTAT_RUNQ_EXIT(vdc
);
3296 DTRACE_IO1(done
, buf_t
*, bufp
);
3299 mutex_exit(&vdc
->lock
);
3311 } else if (rv
!= 0) {
3312 bioerror(bufp
, EIO
);
3324 * Wrapper around vdc_do_op that serializes requests.
3327 * vdcp - the soft state pointer
3328 * operation - operation we want vds to perform (VD_OP_XXX)
3329 * addr - address of data buf to be read/written.
3330 * nbytes - number of bytes to read/write
3331 * slice - the disk slice this request is for
3332 * offset - relative disk offset
3333 * dir - direction of operation (READ/WRITE/BOTH)
3334 * rconflict - check for reservation conflict in case of failure
3336 * rconflict should be set to B_TRUE by most callers. Callers invoking the
3337 * VD_OP_SCSICMD operation can set rconflict to B_FALSE if they check the
3338 * result of a successful operation with vdc_scsi_status().
3348 vdc_do_sync_op(vdc_t
*vdcp
, int operation
, caddr_t addr
, size_t nbytes
,
3349 int slice
, diskaddr_t offset
, vio_desc_direction_t dir
, boolean_t rconflict
)
3352 int flags
= VDC_OP_NORMAL
;
3355 * Grab the lock, if blocked wait until the server
3356 * response causes us to wake up again.
3358 mutex_enter(&vdcp
->lock
);
3359 vdcp
->sync_op_cnt
++;
3360 while (vdcp
->sync_op_blocked
&& vdcp
->state
!= VDC_STATE_DETACH
) {
3361 if (ddi_in_panic()) {
3362 /* don't block if we are panicking */
3363 vdcp
->sync_op_cnt
--;
3364 mutex_exit(&vdcp
->lock
);
3367 cv_wait(&vdcp
->sync_blocked_cv
, &vdcp
->lock
);
3371 if (vdcp
->state
== VDC_STATE_DETACH
) {
3372 cv_broadcast(&vdcp
->sync_blocked_cv
);
3373 vdcp
->sync_op_cnt
--;
3374 mutex_exit(&vdcp
->lock
);
3378 /* now block any other thread entering after us */
3379 vdcp
->sync_op_blocked
= B_TRUE
;
3381 mutex_exit(&vdcp
->lock
);
3384 flags
&= ~VDC_OP_ERRCHK_CONFLICT
;
3386 status
= vdc_do_op(vdcp
, operation
, addr
, nbytes
, slice
, offset
,
3389 mutex_enter(&vdcp
->lock
);
3391 DMSG(vdcp
, 2, ": operation returned %d\n", status
);
3393 if (vdcp
->state
== VDC_STATE_DETACH
) {
3397 vdcp
->sync_op_blocked
= B_FALSE
;
3398 vdcp
->sync_op_cnt
--;
3400 /* signal the next waiting thread */
3401 cv_signal(&vdcp
->sync_blocked_cv
);
3403 mutex_exit(&vdcp
->lock
);
3411 * vdc_drain_response()
3414 * When a guest is panicking, the completion of requests needs to be
3415 * handled differently because interrupts are disabled and vdc
3416 * will not get messages. We have to poll for the messages instead.
3418 * Note: since we are panicking we don't implement the io:::done
3419 * DTrace probe or update the I/O statistics kstats.
3422 * vdc - soft state pointer for this instance of the device driver.
3423 * buf - if buf is NULL then we drain all responses, otherwise we
3424 * poll until we receive an ACK/NACK for the specific I/O
3428 * 0 - Success. If we were expecting a response to a particular
3429 * request then this means that a response has been received.
3432 vdc_drain_response(vdc_t
*vdc
, struct buf
*buf
)
3434 int rv
, idx
, retries
;
3436 vdc_local_desc_t
*ldep
= NULL
; /* Local Dring Entry Pointer */
3437 vio_dring_msg_t dmsg
;
3441 mutex_enter(&vdc
->lock
);
3445 msglen
= sizeof (dmsg
);
3446 rv
= ldc_read(vdc
->curr_server
->ldc_handle
, (caddr_t
)&dmsg
,
3454 * if there are no packets wait and check again
3456 if ((rv
== 0) && (msglen
== 0)) {
3457 if (retries
++ > vdc_dump_retries
) {
3462 drv_usecwait(vdc_usec_timeout_dump
);
3467 * Ignore all messages that are not ACKs/NACKs to
3470 if ((dmsg
.tag
.vio_msgtype
!= VIO_TYPE_DATA
) ||
3471 (dmsg
.tag
.vio_subtype_env
!= VIO_DRING_DATA
)) {
3472 DMSG(vdc
, 0, "discard pkt: type=%d sub=%d env=%d\n",
3473 dmsg
.tag
.vio_msgtype
,
3474 dmsg
.tag
.vio_subtype
,
3475 dmsg
.tag
.vio_subtype_env
);
3480 * Record if the packet was ACK'ed or not. If the packet was not
3481 * ACK'ed then we will just mark the request as failed; we don't
3482 * want to reset the connection at this point.
3484 switch (dmsg
.tag
.vio_subtype
) {
3485 case VIO_SUBTYPE_ACK
:
3488 case VIO_SUBTYPE_NACK
:
3495 idx
= dmsg
.start_idx
;
3496 if (idx
>= vdc
->dring_len
) {
3497 DMSG(vdc
, 0, "[%d] Bogus ack data : start %d\n",
3498 vdc
->instance
, idx
);
3501 ldep
= &vdc
->local_dring
[idx
];
3502 if (ldep
->dep
->hdr
.dstate
!= VIO_DESC_DONE
) {
3503 DMSG(vdc
, 0, "[%d] Entry @ %d - state !DONE %d\n",
3504 vdc
->instance
, idx
, ldep
->dep
->hdr
.dstate
);
3509 ASSERT(mbuf
!= NULL
);
3510 mbuf
->b_resid
= mbuf
->b_bcount
- ldep
->dep
->payload
.nbytes
;
3511 bioerror(mbuf
, ack
? ldep
->dep
->payload
.status
: EIO
);
3514 rv
= vdc_depopulate_descriptor(vdc
, idx
);
3515 if (buf
!= NULL
&& buf
== mbuf
) {
3520 /* if this is the last descriptor - break out of loop */
3521 if ((idx
+ 1) % vdc
->dring_len
== vdc
->dring_curr_idx
) {
3523 * If we were expecting a response for a particular
3524 * request then we return with an error otherwise we
3525 * have successfully completed the drain.
3527 rv
= (buf
!= NULL
)? ESRCH
: 0;
3533 mutex_exit(&vdc
->lock
);
3534 DMSG(vdc
, 0, "End idx=%d\n", idx
);
3542 * vdc_depopulate_descriptor()
3547 * vdc - soft state pointer for this instance of the device driver.
3548 * idx - Index of the Descriptor Ring entry being modified
3554 vdc_depopulate_descriptor(vdc_t
*vdc
, uint_t idx
)
3556 vd_dring_entry_t
*dep
= NULL
; /* Dring Entry Pointer */
3557 vdc_local_desc_t
*ldep
= NULL
; /* Local Dring Entry Pointer */
3561 ASSERT(vdc
!= NULL
);
3562 ASSERT(idx
< vdc
->dring_len
);
3563 ldep
= &vdc
->local_dring
[idx
];
3564 ASSERT(ldep
!= NULL
);
3565 ASSERT(MUTEX_HELD(&vdc
->lock
));
3567 DTRACE_PROBE2(depopulate
, int, vdc
->instance
, vdc_local_desc_t
*, ldep
);
3568 DMSG(vdc
, 2, ": idx = %d\n", idx
);
3571 ASSERT(dep
!= NULL
);
3572 ASSERT((dep
->hdr
.dstate
== VIO_DESC_DONE
) ||
3573 (dep
->payload
.status
== ECANCELED
));
3575 VDC_MARK_DRING_ENTRY_FREE(vdc
, idx
);
3577 ldep
->is_free
= B_TRUE
;
3578 status
= dep
->payload
.status
;
3579 DMSG(vdc
, 2, ": is_free = %d : status = %d\n", ldep
->is_free
, status
);
3582 * If no buffers were used to transfer information to the server when
3583 * populating the descriptor then no memory handles need to be unbound
3584 * and we can return now.
3586 if (ldep
->nbytes
== 0) {
3587 cv_signal(&vdc
->dring_free_cv
);
3592 * If the upper layer passed in a misaligned address we copied the
3593 * data into an aligned buffer before sending it to LDC - we now
3594 * copy it back to the original buffer.
3596 if (ldep
->align_addr
) {
3597 ASSERT(ldep
->addr
!= NULL
);
3599 if (dep
->payload
.nbytes
> 0)
3600 bcopy(ldep
->align_addr
, ldep
->addr
,
3601 dep
->payload
.nbytes
);
3602 kmem_free(ldep
->align_addr
,
3603 sizeof (caddr_t
) * P2ROUNDUP(ldep
->nbytes
, 8));
3604 ldep
->align_addr
= NULL
;
3607 rv
= ldc_mem_unbind_handle(ldep
->desc_mhdl
);
3609 DMSG(vdc
, 0, "?[%d] unbind mhdl 0x%lx @ idx %d failed (%d)",
3610 vdc
->instance
, ldep
->desc_mhdl
, idx
, rv
);
3612 * The error returned by the vDisk server is more informative
3613 * and thus has a higher priority but if it isn't set we ensure
3614 * that this function returns an error.
3620 cv_signal(&vdc
->membind_cv
);
3621 cv_signal(&vdc
->dring_free_cv
);
3628 * vdc_populate_mem_hdl()
3633 * vdc - soft state pointer for this instance of the device driver.
3634 * idx - Index of the Descriptor Ring entry being modified
3635 * addr - virtual address being mapped in
3636 * nbytes - number of bytes in 'addr'
3637 * operation - the vDisk operation being performed (VD_OP_xxx)
/*
 * Bind the request buffer to the entry's LDC memory handle (desc_mhdl)
 * and store the resulting cookies in the descriptor payload.  A buffer
 * whose address is not 8-byte aligned is replaced by a freshly allocated
 * bounce buffer (ldep->align_addr); its contents are copied in unless the
 * mapping permission is LDC_MEM_W.  The bounce buffer is released again
 * on each failure path below.
 */
3643 vdc_populate_mem_hdl(vdc_t
*vdcp
, vdc_local_desc_t
*ldep
)
3645 vd_dring_entry_t
*dep
= NULL
;
3646 ldc_mem_handle_t mhdl
;
3649 uint8_t perm
= LDC_MEM_RW
;
3654 ASSERT(vdcp
!= NULL
);
3657 mhdl
= ldep
->desc_mhdl
;
3659 switch (ldep
->dir
) {
3673 ASSERT(0); /* catch bad programming in vdc */
3677 * LDC expects any addresses passed in to be 8-byte aligned. We need
3678 * to copy the contents of any misaligned buffers to a newly allocated
3679 * buffer and bind it instead (and copy the contents back to the
3680 * original buffer passed in when depopulating the descriptor)
3683 nbytes
= ldep
->nbytes
;
3684 if (((uint64_t)vaddr
& 0x7) != 0) {
/*
 * NOTE(review): the sizeof (caddr_t) multiplier below over-allocates,
 * but every kmem_free() of align_addr (here and in
 * vdc_depopulate_descriptor()) must pass exactly this size, so the
 * expression is deliberately left unchanged.
 */
3685 ASSERT(ldep
->align_addr
== NULL
);
3687 kmem_alloc(sizeof (caddr_t
) *
3688 P2ROUNDUP(nbytes
, 8), KM_SLEEP
);
3689 DMSG(vdcp
, 0, "[%d] Misaligned address %p reallocating "
3690 "(buf=%p nb=%ld op=%d)\n",
3691 vdcp
->instance
, (void *)vaddr
, (void *)ldep
->align_addr
,
3692 nbytes
, ldep
->operation
);
3693 if (perm
!= LDC_MEM_W
)
3694 bcopy(vaddr
, ldep
->align_addr
, nbytes
);
3695 vaddr
= ldep
->align_addr
;
3698 maptype
= LDC_IO_MAP
|LDC_SHADOW_MAP
;
3699 rv
= ldc_mem_bind_handle(mhdl
, vaddr
, P2ROUNDUP(nbytes
, 8),
3700 maptype
, perm
, &dep
->payload
.cookie
[0], &dep
->payload
.ncookies
);
3701 DMSG(vdcp
, 2, "[%d] bound mem handle; ncookies=%d\n",
3702 vdcp
->instance
, dep
->payload
.ncookies
);
3704 DMSG(vdcp
, 0, "[%d] Failed to bind LDC memory handle "
3705 "(mhdl=%p, buf=%p, err=%d)\n",
3706 vdcp
->instance
, (void *)mhdl
, (void *)vaddr
, rv
);
3707 if (ldep
->align_addr
) {
3708 kmem_free(ldep
->align_addr
,
3709 sizeof (caddr_t
) * P2ROUNDUP(nbytes
, 8));
3710 ldep
->align_addr
= NULL
;
3716 * Get the other cookies (if any).
3718 for (i
= 1; i
< dep
->payload
.ncookies
; i
++) {
3719 rv
= ldc_mem_nextcookie(mhdl
, &dep
->payload
.cookie
[i
]);
3721 (void) ldc_mem_unbind_handle(mhdl
);
3722 DMSG(vdcp
, 0, "?[%d] Failed to get next cookie "
3723 "(mhdl=%lx cnum=%d), err=%d",
3724 vdcp
->instance
, mhdl
, i
, rv
);
3725 if (ldep
->align_addr
) {
/*
 * Free with the same size that was passed to kmem_alloc() above.
 * Using the unrounded ldep->nbytes here handed kmem_free() the wrong
 * size whenever nbytes was not a multiple of 8.
 */
3726 kmem_free(ldep
->align_addr
,
3727 sizeof (caddr_t
) * P2ROUNDUP(nbytes
, 8));
3728 ldep
->align_addr
= NULL
;
3738 * Interrupt handlers for messages from LDC
3748 * event - Type of event (LDC_EVT_xxx) that triggered the callback
3749 * arg - soft state pointer for this instance of the device driver.
3755 vdc_handle_cb(uint64_t event
, caddr_t arg
)
3757 ldc_status_t ldc_state
;
3759 vdc_server_t
*srvr
= (vdc_server_t
*)(void *)arg
;
3760 vdc_t
*vdc
= srvr
->vdcp
;
3762 ASSERT(vdc
!= NULL
);
3764 DMSG(vdc
, 1, "evt=%lx seqID=%ld\n", event
, vdc
->seq_num
);
3766 /* If callback is not for the current server, ignore it */
3767 mutex_enter(&vdc
->lock
);
3769 if (vdc
->curr_server
!= srvr
) {
3770 DMSG(vdc
, 0, "[%d] Ignoring event 0x%lx for port@%ld\n",
3771 vdc
->instance
, event
, srvr
->id
);
3772 mutex_exit(&vdc
->lock
);
3773 return (LDC_SUCCESS
);
3777 * Depending on the type of event that triggered this callback,
3778 * we modify the handshake state or read the data.
3780 * NOTE: not done as a switch() as event could be triggered by
3781 * a state change and a read request. Also the ordering of the
3782 * check for the event types is deliberate.
3784 if (event
& LDC_EVT_UP
) {
3785 DMSG(vdc
, 0, "[%d] Received LDC_EVT_UP\n", vdc
->instance
);
3788 rv
= ldc_status(srvr
->ldc_handle
, &ldc_state
);
3790 DMSG(vdc
, 0, "[%d] Couldn't get LDC status %d",
3792 mutex_exit(&vdc
->lock
);
3793 return (LDC_SUCCESS
);
3795 if (srvr
->ldc_state
!= LDC_UP
&&
3796 ldc_state
== LDC_UP
) {
3798 * Reset the transaction sequence numbers when
3799 * LDC comes up. We then kick off the handshake
3800 * negotiation with the vDisk server.
3803 vdc
->seq_num_reply
= 0;
3804 vdc
->io_pending
= B_TRUE
;
3805 srvr
->ldc_state
= ldc_state
;
3806 cv_signal(&vdc
->initwait_cv
);
3807 cv_signal(&vdc
->io_pending_cv
);
3811 if (event
& LDC_EVT_READ
) {
3812 DMSG(vdc
, 1, "[%d] Received LDC_EVT_READ\n", vdc
->instance
);
3813 mutex_enter(&vdc
->read_lock
);
3814 cv_signal(&vdc
->read_cv
);
3815 vdc
->read_state
= VDC_READ_PENDING
;
3816 mutex_exit(&vdc
->read_lock
);
3817 mutex_exit(&vdc
->lock
);
3819 /* that's all we have to do - no need to handle DOWN/RESET */
3820 return (LDC_SUCCESS
);
3823 if (event
& (LDC_EVT_RESET
|LDC_EVT_DOWN
)) {
3825 DMSG(vdc
, 0, "[%d] Received LDC RESET event\n", vdc
->instance
);
3828 * Need to wake up any readers so they will
3829 * detect that a reset has occurred.
3831 mutex_enter(&vdc
->read_lock
);
3832 if ((vdc
->read_state
== VDC_READ_WAITING
) ||
3833 (vdc
->read_state
== VDC_READ_RESET
))
3834 cv_signal(&vdc
->read_cv
);
3835 vdc
->read_state
= VDC_READ_RESET
;
3836 mutex_exit(&vdc
->read_lock
);
3838 /* wake up any threads waiting for connection to come up */
3839 if (vdc
->state
== VDC_STATE_INIT_WAITING
) {
3840 vdc
->state
= VDC_STATE_RESETTING
;
3841 cv_signal(&vdc
->initwait_cv
);
3842 } else if (vdc
->state
== VDC_STATE_FAILED
) {
3843 vdc
->io_pending
= B_TRUE
;
3844 cv_signal(&vdc
->io_pending_cv
);
3849 mutex_exit(&vdc
->lock
);
3851 if (event
& ~(LDC_EVT_UP
| LDC_EVT_RESET
| LDC_EVT_DOWN
| LDC_EVT_READ
))
3852 DMSG(vdc
, 0, "![%d] Unexpected LDC event (%lx) received",
3853 vdc
->instance
, event
);
3855 return (LDC_SUCCESS
);
3860 * vdc_wait_for_response()
3863 * Block waiting for a response from the server. If there is
3864 * no data, the thread blocks on the read_cv that is signalled
3865 * by the callback when an EVT_READ occurs.
3868 * vdcp - soft state pointer for this instance of the device driver.
3874 vdc_wait_for_response(vdc_t
*vdcp
, vio_msg_t
*msgp
)
3876 size_t nbytes
= sizeof (*msgp
);
3879 ASSERT(vdcp
!= NULL
);
3881 DMSG(vdcp
, 1, "[%d] Entered\n", vdcp
->instance
);
3883 status
= vdc_recv(vdcp
, msgp
, &nbytes
);
3884 DMSG(vdcp
, 3, "vdc_read() done.. status=0x%x size=0x%x\n",
3885 status
, (int)nbytes
);
3887 DMSG(vdcp
, 0, "?[%d] Error %d reading LDC msg\n",
3888 vdcp
->instance
, status
);
3892 if (nbytes
< sizeof (vio_msg_tag_t
)) {
3893 DMSG(vdcp
, 0, "?[%d] Expect %lu bytes; recv'd %lu\n",
3894 vdcp
->instance
, sizeof (vio_msg_tag_t
), nbytes
);
3898 DMSG(vdcp
, 2, "[%d] (%x/%x/%x)\n", vdcp
->instance
,
3899 msgp
->tag
.vio_msgtype
,
3900 msgp
->tag
.vio_subtype
,
3901 msgp
->tag
.vio_subtype_env
);
3904 * Verify the Session ID of the message
3906 * Every message after the Version has been negotiated should
3907 * have the correct session ID set.
3909 if ((msgp
->tag
.vio_sid
!= vdcp
->session_id
) &&
3910 (msgp
->tag
.vio_subtype_env
!= VIO_VER_INFO
)) {
3911 DMSG(vdcp
, 0, "[%d] Invalid SID: received 0x%x, "
3912 "expected 0x%lx [seq num %lx @ %d]",
3913 vdcp
->instance
, msgp
->tag
.vio_sid
,
3915 ((vio_dring_msg_t
*)msgp
)->seq_num
,
3916 ((vio_dring_msg_t
*)msgp
)->start_idx
);
3925 * vdc_resubmit_backup_dring()
3928 * Resubmit each descriptor in the backed up dring to
3929 * vDisk server. The Dring was backed up during connection
3933 * vdcp - soft state pointer for this instance of the device driver.
/*
 * Replay the requests saved in the backup copy of the local descriptor ring.
 *
 * The walk starts at local_dring_backup_tail, visits local_dring_backup_len
 * entries and wraps the index when it reaches the end of the array.  Every
 * entry that is not marked free is handed to vdc_do_op() with the operation,
 * address, size, slice, offset, buf, direction and flags recorded in the
 * entry (VDC_OP_STATE_RUNNING is cleared from the flags first).  An entry
 * is marked free once it has been resubmitted so that it is not replayed
 * again if the same backup is walked a second time.  When the walk is over
 * the backup array is released with kmem_free() and the pointer is reset
 * to NULL.
 *
 * The caller must not hold vdcp->lock and the instance must be in
 * VDC_STATE_HANDLE_PENDING; both conditions are asserted.  A NULL backup
 * pointer means the pending requests have already been processed.
 *
 * NOTE(review): the statements that follow a failed vdc_do_op() and the
 * value returned to the caller are not visible in this view - confirm
 * against the complete function before relying on them.
 */
3939 vdc_resubmit_backup_dring(vdc_t
*vdcp
)
3946 vdc_local_desc_t
*curr_ldep
;
3948 ASSERT(MUTEX_NOT_HELD(&vdcp
->lock
));
3949 ASSERT(vdcp
->state
== VDC_STATE_HANDLE_PENDING
);
3951 if (vdcp
->local_dring_backup
== NULL
) {
3952 /* the pending requests have already been processed */
3956 DMSG(vdcp
, 1, "restoring pending dring entries (len=%d, tail=%d)\n",
3957 vdcp
->local_dring_backup_len
, vdcp
->local_dring_backup_tail
);
3960 * Walk the backup copy of the local descriptor ring and
3961 * resubmit all the outstanding transactions.
/* the walk begins at the tail index saved with the backup */
3963 b_idx
= vdcp
->local_dring_backup_tail
;
3964 for (count
= 0; count
< vdcp
->local_dring_backup_len
; count
++) {
3966 curr_ldep
= &(vdcp
->local_dring_backup
[b_idx
]);
3968 /* only resubmit outstanding transactions */
3969 if (!curr_ldep
->is_free
) {
3971 DMSG(vdcp
, 1, "resubmitting entry idx=%x\n", b_idx
);
/* reissue the request with the parameters saved in the backup entry */
3973 rv
= vdc_do_op(vdcp
, curr_ldep
->operation
,
3974 curr_ldep
->addr
, curr_ldep
->nbytes
,
3975 curr_ldep
->slice
, curr_ldep
->offset
,
3976 curr_ldep
->buf
, curr_ldep
->dir
,
3977 (curr_ldep
->flags
& ~VDC_OP_STATE_RUNNING
) |
3981 DMSG(vdcp
, 1, "[%d] resubmit entry %d failed\n",
3982 vdcp
->instance
, b_idx
);
3987 * Mark this entry as free so that we will not resubmit
3988 * this "done" request again, if we were to use the same
3989 * backup_dring again in future. This could happen when
3990 * a reset happens while processing the backup_dring.
3992 curr_ldep
->is_free
= B_TRUE
;
3996 /* get the next element to submit */
3997 if (++b_idx
>= vdcp
->local_dring_backup_len
)
4001 /* all done - now clear up pending dring copy */
/* size in bytes of the whole backup array, as required by kmem_free() */
4002 dring_size
= vdcp
->local_dring_backup_len
*
4003 sizeof (vdcp
->local_dring_backup
[0]);
4005 (void) kmem_free(vdcp
->local_dring_backup
, dring_size
);
4007 vdcp
->local_dring_backup
= NULL
;
4010 DTRACE_PROBE2(processed
, int, processed
, vdc_t
*, vdcp
);
4017 * vdc_cancel_backup_dring
4020 * Cancel each descriptor in the backed up dring to vDisk server.
4021 * The Dring was backed up during connection reset.
4024 * vdcp - soft state pointer for this instance of the device driver.
/*
 * Fail every request still recorded in the backup copy of the local
 * descriptor ring.
 *
 * The walk starts at local_dring_backup_tail and visits
 * local_dring_backup_len entries.  For each entry that is not marked free,
 * the associated buf has b_resid set to b_bcount (nothing was transferred)
 * and is flagged with EIO through bioerror().  For block reads and writes
 * (VD_OP_BREAD / VD_OP_BWRITE) the soft error counter is also bumped, the
 * request leaves the kstat wait queue and the DTrace io done probe fires.
 * Finally the backup array is freed and the pointer is reset to NULL.
 *
 * The caller must hold vdcp->lock and the instance must be in
 * VDC_STATE_FAILED; both conditions are asserted.  A NULL backup pointer
 * means the pending requests have already been processed.
 *
 * NOTE(review): the assignment of bufp and whatever completes the buf after
 * bioerror() are not visible in this view - confirm against the complete
 * function.
 */
4030 vdc_cancel_backup_dring(vdc_t
*vdcp
)
4032 vdc_local_desc_t
*ldep
;
4039 ASSERT(MUTEX_HELD(&vdcp
->lock
));
4040 ASSERT(vdcp
->state
== VDC_STATE_FAILED
);
4042 if (vdcp
->local_dring_backup
== NULL
) {
4043 /* the pending requests have already been processed */
4047 DMSG(vdcp
, 1, "cancelling pending dring entries (len=%d, tail=%d)\n",
4048 vdcp
->local_dring_backup_len
, vdcp
->local_dring_backup_tail
);
4051 * Walk the backup copy of the local descriptor ring and
4052 * cancel all the outstanding transactions.
/* the walk begins at the tail index saved with the backup */
4054 b_idx
= vdcp
->local_dring_backup_tail
;
4055 for (count
= 0; count
< vdcp
->local_dring_backup_len
; count
++) {
4057 ldep
= &(vdcp
->local_dring_backup
[b_idx
]);
4059 /* only cancel outstanding transactions */
4060 if (!ldep
->is_free
) {
4062 DMSG(vdcp
, 1, "cancelling entry idx=%x\n", b_idx
);
4066 * All requests have already been cleared from the
4067 * local descriptor ring and the LDC channel has been
4068 * reset so we will never get any reply for these
4069 * requests. Now we just have to notify threads waiting
4070 * for replies that the request has failed.
4073 ASSERT(bufp
!= NULL
);
/* report that no byte of the request was transferred */
4074 bufp
->b_resid
= bufp
->b_bcount
;
/* only block reads and writes are accounted in the I/O kstats */
4075 if (ldep
->operation
== VD_OP_BREAD
||
4076 ldep
->operation
== VD_OP_BWRITE
) {
4077 VD_UPDATE_ERR_STATS(vdcp
, vd_softerrs
);
4078 VD_KSTAT_WAITQ_EXIT(vdcp
);
4079 DTRACE_IO1(done
, buf_t
*, bufp
);
4081 bioerror(bufp
, EIO
);
4085 /* get the next element to cancel */
4086 if (++b_idx
>= vdcp
->local_dring_backup_len
)
4090 /* all done - now clear up pending dring copy */
4091 dring_size
= vdcp
->local_dring_backup_len
*
4092 sizeof (vdcp
->local_dring_backup
[0]);
4094 (void) kmem_free(vdcp
->local_dring_backup
, dring_size
);
4096 vdcp
->local_dring_backup
= NULL
;
4098 DTRACE_PROBE2(cancelled
, int, cancelled
, vdc_t
*, vdcp
);
4103 * vdc_connection_timeout
4106 * This function is invoked if the timeout set to establish the connection
4107 * with vds expires. This will happen if we spend too much time in the
4108 * VDC_STATE_INIT_WAITING, VDC_STATE_NEGOTIATE or VDC_STATE_HANDLE_PENDING
4112 * arg - argument of the timeout function actually a soft state
4113 * pointer for the instance of the device driver.
/*
 * Timeout handler armed by the message processing thread through
 * timeout().  It only records, under vdcp->lock, that the connection
 * timeout has expired by setting ctimeout_reached to B_TRUE; acting on
 * the flag is left to the code that reads it.
 *
 * arg is the soft state pointer (vdc_t *) that was passed to timeout().
 */
4119 vdc_connection_timeout(void *arg
)
4121 vdc_t
*vdcp
= (vdc_t
*)arg
;
4123 mutex_enter(&vdcp
->lock
);
4125 vdcp
->ctimeout_reached
= B_TRUE
;
4127 mutex_exit(&vdcp
->lock
);
4132 * vdc_backup_local_dring()
4135 * Backup the current dring in the event of a reset. The Dring
4136 * transactions will be resubmitted to the server when the
4137 * connection is restored.
4140 * vdcp - soft state pointer for this instance of the device driver.
/*
 * Save a copy of the local descriptor ring so that its requests can be
 * dealt with after the connection has been reset.
 *
 * If a backup already exists it is kept as is (only a debug message is
 * logged).  Otherwise, provided the local ring has been initialized
 * (VDC_DRING_LOCAL set in vdcp->initialized), a buffer of dring_len
 * entries is allocated with kmem_alloc(KM_SLEEP), the local ring is copied
 * into it with bcopy() and dring_curr_idx / dring_len are recorded as the
 * backup tail and length.  The backup is then walked once and, for every
 * non-free block read or write, VD_KSTAT_RUNQ_BACK_TO_WAITQ() moves the
 * request back from the run queue to the wait queue in the I/O statistics.
 *
 * The caller must hold vdcp->lock and the instance must be in
 * VDC_STATE_RESETTING; both conditions are asserted.
 *
 * NOTE(review): the early exits taken when a backup already exists or when
 * the local ring is not initialized are not visible in this view.
 */
4146 vdc_backup_local_dring(vdc_t
*vdcp
)
4148 int b_idx
, count
, dring_size
;
4149 vdc_local_desc_t
*curr_ldep
;
4151 ASSERT(MUTEX_HELD(&vdcp
->lock
));
4152 ASSERT(vdcp
->state
== VDC_STATE_RESETTING
);
4155 * If the backup dring is stil around, it means
4156 * that the last restore did not complete. However,
4157 * since we never got back into the running state,
4158 * the backup copy we have is still valid.
4160 if (vdcp
->local_dring_backup
!= NULL
) {
4161 DMSG(vdcp
, 1, "reusing local descriptor ring backup "
4162 "(len=%d, tail=%d)\n", vdcp
->local_dring_backup_len
,
4163 vdcp
->local_dring_backup_tail
);
4168 * The backup dring can be NULL and the local dring may not be
4169 * initialized. This can happen if we had a reset while establishing
4170 * a new connection but after the connection has timed out. In that
4171 * case the backup dring is NULL because the requests have been
4172 * cancelled and the request occured before the local dring is
4175 if (!(vdcp
->initialized
& VDC_DRING_LOCAL
))
4178 DMSG(vdcp
, 1, "backing up the local descriptor ring (len=%d, "
4179 "tail=%d)\n", vdcp
->dring_len
, vdcp
->dring_curr_idx
);
/* the backup has the same number of entries as the live local ring */
4181 dring_size
= vdcp
->dring_len
* sizeof (vdcp
->local_dring
[0]);
4183 vdcp
->local_dring_backup
= kmem_alloc(dring_size
, KM_SLEEP
);
4184 bcopy(vdcp
->local_dring
, vdcp
->local_dring_backup
, dring_size
);
/* remember where the ring was and how long it was for the later walks */
4186 vdcp
->local_dring_backup_tail
= vdcp
->dring_curr_idx
;
4187 vdcp
->local_dring_backup_len
= vdcp
->dring_len
;
4190 * At this point, pending read or write I/Os are recorded in the
4191 * runq. We update the I/O statistics to indicate that they are now
4192 * back in the waitq.
4194 b_idx
= vdcp
->local_dring_backup_tail
;
4195 for (count
= 0; count
< vdcp
->local_dring_backup_len
; count
++) {
4197 curr_ldep
= &(vdcp
->local_dring_backup
[b_idx
]);
4199 if (!curr_ldep
->is_free
&&
4200 (curr_ldep
->operation
== VD_OP_BREAD
||
4201 curr_ldep
->operation
== VD_OP_BWRITE
)) {
4202 VD_KSTAT_RUNQ_BACK_TO_WAITQ(vdcp
);
4205 /* get the next element */
4206 if (++b_idx
>= vdcp
->local_dring_backup_len
)
/*
 * Make the next entry of the server list the current vDisk server.
 *
 * The successor is curr_server->next, or the head of vdcp->server_list
 * when the current server is the last one, so the list is used as a ring.
 * The LDC channel of the server being left is brought down with
 * ldc_down() before vdcp->curr_server is updated, and the switch is logged
 * together with the port and ldc ids of the new server.
 *
 * The caller must hold vdcp->lock (asserted).  With a single server
 * (num_servers == 1) there is nothing to switch to.
 *
 * NOTE(review): the handling of the ldc_down() return value is only
 * partly visible in this view (a failure message is logged).
 */
4213 vdc_switch_server(vdc_t
*vdcp
)
4216 vdc_server_t
*curr_server
, *new_server
;
4218 ASSERT(MUTEX_HELD(&vdcp
->lock
));
4220 /* if there is only one server return back */
4221 if (vdcp
->num_servers
== 1) {
4225 /* Get current and next server */
4226 curr_server
= vdcp
->curr_server
;
/* wrap around to the head of the list after the last server */
4228 (curr_server
->next
) ? curr_server
->next
: vdcp
->server_list
;
4229 ASSERT(curr_server
!= new_server
);
4231 /* bring current server's channel down */
4232 rv
= ldc_down(curr_server
->ldc_handle
);
4234 DMSG(vdcp
, 0, "[%d] Cannot bring channel down, port %ld\n",
4235 vdcp
->instance
, curr_server
->id
);
4239 /* switch the server */
4240 vdcp
->curr_server
= new_server
;
4242 DMSG(vdcp
, 0, "[%d] Switched to next vdisk server, port@%ld, ldc@%ld\n",
4243 vdcp
->instance
, vdcp
->curr_server
->id
, vdcp
->curr_server
->ldc_id
);
/*
 * Log the service state of the current server with cmn_err(CE_CONT).
 *
 * The state to report is curr_server->svc_state; it is compared with
 * curr_server->log_state, which is updated to the reported state at the
 * end of the function, so log_state tracks the last state that was logged.
 * One message exists for each of the OFFLINE, CONNECTED, ONLINE, FAILED
 * and FAULTED service states; all but the offline message also carry the
 * ldc id and port id of the current server.
 *
 * The caller must own vdcp->lock (asserted).
 *
 * NOTE(review): the statement executed when log_state already equals
 * svc_state is not visible in this view - presumably an early return so
 * that each state change is logged once; confirm.
 */
4247 vdc_print_svc_status(vdc_t
*vdcp
)
4250 uint64_t ldc_id
, port_id
;
4251 vdc_service_state_t svc_state
;
4253 ASSERT(mutex_owned(&vdcp
->lock
));
4255 svc_state
= vdcp
->curr_server
->svc_state
;
/* compare with the state that was logged last */
4257 if (vdcp
->curr_server
->log_state
== svc_state
)
4260 instance
= vdcp
->instance
;
4261 ldc_id
= vdcp
->curr_server
->ldc_id
;
4262 port_id
= vdcp
->curr_server
->id
;
4264 switch (svc_state
) {
4266 case VDC_SERVICE_OFFLINE
:
4267 cmn_err(CE_CONT
, "?vdisk@%d is offline\n", instance
);
4270 case VDC_SERVICE_CONNECTED
:
4271 cmn_err(CE_CONT
, "?vdisk@%d is connected using ldc@%ld,%ld\n",
4272 instance
, ldc_id
, port_id
);
4275 case VDC_SERVICE_ONLINE
:
4276 cmn_err(CE_CONT
, "?vdisk@%d is online using ldc@%ld,%ld\n",
4277 instance
, ldc_id
, port_id
);
4280 case VDC_SERVICE_FAILED
:
4281 cmn_err(CE_CONT
, "?vdisk@%d access to service failed "
4282 "using ldc@%ld,%ld\n", instance
, ldc_id
, port_id
);
4285 case VDC_SERVICE_FAULTED
:
4286 cmn_err(CE_CONT
, "?vdisk@%d access to backend failed "
4287 "using ldc@%ld,%ld\n", instance
, ldc_id
, port_id
);
/* remember what was logged for the next comparison */
4295 vdcp
->curr_server
->log_state
= svc_state
;
4300 * vdc_handshake_retry
4303 * This function indicates if the handshake should be retried or not.
4304 * This depends on the lifecycle of the driver:
4306 * VDC_LC_ATTACHING: the handshake is retried until we have tried
4307 * a handshake with each server. We don't care how far each handshake
4308 * went, the goal is just to try the handshake. We want to minimize the
4309 * time spent doing the attach because this is locking the device
4312 * VDC_LC_ONLINE_PENDING: the handshake is retried while we haven't done
4313 * consecutive attribute negotiations with each server, and we haven't
4314 * reached a minimum total of consecutive negotiations (hattr_min). The
4315 * number of attribution negotiations determines the time spent before
4316 * failing pending I/Os if the handshake is not successful.
4318 * VDC_LC_ONLINE: the handshake is always retried, until we have a
4319 * successful handshake with a server.
4321 * VDC_LC_DETACHING: N/A
4324 * hshake_cnt - number of handshake attempts
4325 * hattr_cnt - number of attribute negotiation attempts
4328 * B_TRUE - handshake should be retried
4329 * B_FALSE - handshake should not be retried
/*
 * Decide whether another handshake should be attempted.
 *
 * The counters passed in are first stored in the current server entry
 * (hshake_cnt, hattr_cnt) and the per-server hattr_total is either reset
 * to zero or increased by hattr_cnt.  The decision then depends on
 * vdcp->lifecycle, which must not be VDC_LC_DETACHING (asserted):
 *
 *   VDC_LC_ONLINE          checked first, before any per-server test
 *   VDC_LC_ATTACHING       the server list is scanned for an entry whose
 *                          hshake_cnt is still zero
 *   VDC_LC_ONLINE_PENDING  the server list is scanned for an entry whose
 *                          hattr_cnt is still zero while the hattr_total
 *                          values are summed; the final answer is whether
 *                          that sum is below vdcp->hattr_min
 *
 * hshake_cnt is the number of handshake attempts and hattr_cnt the number
 * of attribute negotiation attempts made with the current server.
 *
 * NOTE(review): the return statements of the ONLINE and ATTACHING cases
 * and of the zero-count tests inside the loops are not visible in this
 * view; the summary above follows the comments kept in the body.
 */
4332 vdc_handshake_retry(vdc_t
*vdcp
, int hshake_cnt
, int hattr_cnt
)
4334 int hattr_total
= 0;
4337 ASSERT(vdcp
->lifecycle
!= VDC_LC_DETACHING
);
4339 /* update handshake counters */
4340 vdcp
->curr_server
->hshake_cnt
= hshake_cnt
;
4341 vdcp
->curr_server
->hattr_cnt
= hattr_cnt
;
4344 * If no attribute negotiation was done then we reset the total
4345 * number otherwise we cumulate the number.
4348 vdcp
->curr_server
->hattr_total
= 0;
4350 vdcp
->curr_server
->hattr_total
+= hattr_cnt
;
4353 * If we are online (i.e. at least one handshake was successfully
4354 * completed) then we always retry the handshake.
4356 if (vdcp
->lifecycle
== VDC_LC_ONLINE
)
4360 * If we are attaching then we retry the handshake only if we haven't
4361 * tried with all servers.
4363 if (vdcp
->lifecycle
== VDC_LC_ATTACHING
) {
4365 for (srvr
= vdcp
->server_list
; srvr
!= NULL
;
4366 srvr
= srvr
->next
) {
4367 if (srvr
->hshake_cnt
== 0) {
4376 * Here we are in the case where we haven't completed any handshake
4379 ASSERT(vdcp
->lifecycle
== VDC_LC_ONLINE_PENDING
);
4382 * We retry the handshake if we haven't done an attribute negotiation
4383 * with each server. This is to handle the case where one service domain
4386 for (srvr
= vdcp
->server_list
; srvr
!= NULL
; srvr
= srvr
->next
) {
4387 if (srvr
->hattr_cnt
== 0) {
4390 hattr_total
+= srvr
->hattr_total
;
4394 * We retry the handshake if we haven't reached the minimum number of
4395 * attribute negotiation.
4397 return (hattr_total
< vdcp
->hattr_min
);
4400 /* -------------------------------------------------------------------------- */
4403 * The following functions process the incoming messages from vds
4408 * vdc_process_msg_thread()
4412 * Main VDC message processing thread. Each vDisk instance
4413 * consists of a copy of this thread. This thread triggers
4414 * all the handshakes and data exchange with the server. It
4415 * also handles all channel resets
4418 * vdcp - soft state pointer for this instance of the device driver.
/*
 * Per-instance connection thread.  It takes vdcp->lock, expects the
 * lifecycle to be VDC_LC_ATTACHING on entry (asserted) and then drives
 * vdcp->state through the following states:
 *
 * VDC_STATE_INIT
 *	Arms the connection timeout with timeout(vdc_connection_timeout)
 *	when a timeout value is configured (vdc_timeout, or else the
 *	ctimeout of the current server) and none is armed yet.  Goes to
 *	DETACH when the lifecycle is VDC_LC_DETACHING and to FAILED when
 *	ctimeout_reached is set.  Once hshake_cnt reaches
 *	vdc_hshake_retries or an attribute negotiation has been done,
 *	vdc_handshake_retry() decides between FAILED and
 *	vdc_switch_server().  Then vdc_start_ldc_connection() is called
 *	and any status other than EINVAL leads to INIT_WAITING.
 *
 * VDC_STATE_INIT_WAITING
 *	Goes to NEGOTIATE when the channel of the current server is
 *	LDC_UP.  Otherwise waits on initwait_cv with cv_timedwait() for
 *	vdc_ldcup_timeout seconds; if the wait ends while the state and the
 *	channel are unchanged, the state goes back to INIT.
 *
 * VDC_STATE_NEGOTIATE
 *	Runs vdc_ver_negotiation(), vdc_attr_negotiation(),
 *	vdc_dring_negotiation() and vdc_rdx_exchange().  Success leads to
 *	HANDLE_PENDING; a failure leads to RESETTING with self_reset set
 *	and the service state of the server set to VDC_SERVICE_FAILED.
 *
 * VDC_STATE_HANDLE_PENDING
 *	Marks the server VDC_SERVICE_CONNECTED and drops the lock.  With
 *	several servers, a failing vdc_eio_check() marks the server
 *	faulted and leads to FAULTED.  Otherwise the connection timeout is
 *	cancelled, vdc_setup_devid() and vdc_resubmit_backup_dring() are
 *	called and the state becomes RUNNING, or RESETTING with self_reset
 *	set.
 *
 * VDC_STATE_FAULTED
 *	Scans the server list: the state becomes FAILED or RESETTING
 *	depending on whether every server is VDC_SERVICE_FAULTED.
 *
 * VDC_STATE_FAILED
 *	Logs the failure, picks hattr_min (vdc_hattr_min_initial when
 *	leaving VDC_LC_ATTACHING for VDC_LC_ONLINE_PENDING, vdc_hattr_min
 *	otherwise), cancels the timeout, wakes running_cv waiters, cancels
 *	the backup ring with vdc_cancel_backup_dring() and sleeps on
 *	io_pending_cv until io_pending is set.  Then every server is marked
 *	offline with its handshake counters cleared and the state becomes
 *	RESETTING.
 *
 * VDC_STATE_RUNNING
 *	Goes to DETACH when detaching.  Otherwise sets the lifecycle to
 *	VDC_LC_ONLINE, wakes running_cv, eio_cv and ownership_cv waiters as
 *	applicable, marks the server VDC_SERVICE_ONLINE, drops the lock and
 *	processes replies with vdc_wait_for_response() and
 *	vdc_process_data_msg().  When that ends, every server is marked
 *	offline and the state becomes RESETTING with self_reset set.
 *
 * VDC_STATE_RESETTING
 *	Calls vdc_stop_ldc_connection() when self_reset is set, waits for
 *	threads_pending to drop to zero, resets read_state and io_pending,
 *	calls vdc_eio_unqueue(), vdc_backup_local_dring() and
 *	vdc_destroy_descriptor_ring() and goes back to INIT.
 *
 * VDC_STATE_DETACH
 *	Cancels the timeout with the lock dropped, wakes running_cv
 *	waiters, waits for sync_op_cnt to drop to zero and leaves.
 *
 * NOTE(review): many conditions, break statements and the loop around the
 * switch are not visible in this view; the summary above is limited to
 * what the visible statements and the comments in the body show.
 */
4424 vdc_process_msg_thread(vdc_t
*vdcp
)
4426 boolean_t failure_msg
= B_FALSE
;
4429 timeout_id_t tmid
= 0;
4430 clock_t ldcup_timeout
= 0;
4432 vdc_service_state_t svc_state
;
4436 mutex_enter(&vdcp
->lock
);
4438 ASSERT(vdcp
->lifecycle
== VDC_LC_ATTACHING
);
4442 #define Q(_s) (vdcp->state == _s) ? #_s :
4443 DMSG(vdcp
, 3, "state = %d (%s)\n", vdcp
->state
,
4445 Q(VDC_STATE_INIT_WAITING
)
4446 Q(VDC_STATE_NEGOTIATE
)
4447 Q(VDC_STATE_HANDLE_PENDING
)
4448 Q(VDC_STATE_FAULTED
)
4450 Q(VDC_STATE_RUNNING
)
4451 Q(VDC_STATE_RESETTING
)
4456 switch (vdcp
->state
) {
4457 case VDC_STATE_INIT
:
4460 * If requested, start a timeout to check if the
4461 * connection with vds is established in the
4462 * specified delay. If the timeout expires, we
4463 * will cancel any pending request.
4465 * If some reset have occurred while establishing
4466 * the connection, we already have a timeout armed
4467 * and in that case we don't need to arm a new one.
4469 * The same rule applies when there are multiple vds'.
4470 * If either a connection cannot be established or
4471 * the handshake times out, the connection thread will
4472 * try another server. The 'ctimeout' will report
4473 * back an error after it expires irrespective of
4474 * whether the vdisk is trying to connect to just
4475 * one or multiple servers.
4477 ctimeout
= (vdc_timeout
!= 0)?
4478 vdc_timeout
: vdcp
->curr_server
->ctimeout
;
4480 if (ctimeout
!= 0 && tmid
== 0) {
4481 tmid
= timeout(vdc_connection_timeout
, vdcp
,
4482 ctimeout
* drv_usectohz(MICROSEC
));
4485 /* Switch to STATE_DETACH if drv is detaching */
4486 if (vdcp
->lifecycle
== VDC_LC_DETACHING
) {
4487 vdcp
->state
= VDC_STATE_DETACH
;
4491 /* Check if the timeout has been reached */
4492 if (vdcp
->ctimeout_reached
) {
4495 vdcp
->state
= VDC_STATE_FAILED
;
4500 * Switch to another server when we reach the limit of
4501 * the number of handshake per server or if we have done
4502 * an attribute negotiation.
4504 if (hshake_cnt
>= vdc_hshake_retries
|| hattr_cnt
> 0) {
4506 if (!vdc_handshake_retry(vdcp
, hshake_cnt
,
4508 DMSG(vdcp
, 0, "[%d] too many "
4509 "handshakes", vdcp
->instance
);
4510 vdcp
->state
= VDC_STATE_FAILED
;
4514 vdc_switch_server(vdcp
);
4522 /* Bring up connection with vds via LDC */
4523 status
= vdc_start_ldc_connection(vdcp
);
4524 if (status
!= EINVAL
) {
4525 vdcp
->state
= VDC_STATE_INIT_WAITING
;
4527 vdcp
->curr_server
->svc_state
=
4529 vdc_print_svc_status(vdcp
);
4533 case VDC_STATE_INIT_WAITING
:
4535 /* if channel is UP, start negotiation */
4536 if (vdcp
->curr_server
->ldc_state
== LDC_UP
) {
4537 vdcp
->state
= VDC_STATE_NEGOTIATE
;
4542 * Wait for LDC_UP. If it times out and we have multiple
4543 * servers then we will retry using a different server.
4545 ldcup_timeout
= ddi_get_lbolt() + (vdc_ldcup_timeout
*
4546 drv_usectohz(MICROSEC
));
4547 status
= cv_timedwait(&vdcp
->initwait_cv
, &vdcp
->lock
,
4550 vdcp
->state
== VDC_STATE_INIT_WAITING
&&
4551 vdcp
->curr_server
->ldc_state
!= LDC_UP
) {
4552 /* timed out & still waiting */
4553 vdcp
->curr_server
->svc_state
=
4555 vdc_print_svc_status(vdcp
);
4556 vdcp
->state
= VDC_STATE_INIT
;
4560 if (vdcp
->state
!= VDC_STATE_INIT_WAITING
) {
4562 "state moved to %d out from under us...\n",
4567 case VDC_STATE_NEGOTIATE
:
4568 switch (status
= vdc_ver_negotiation(vdcp
)) {
4572 DMSG(vdcp
, 0, "ver negotiate failed (%d)..\n",
4579 switch (status
= vdc_attr_negotiation(vdcp
)) {
4583 DMSG(vdcp
, 0, "attr negotiate failed (%d)..\n",
4588 switch (status
= vdc_dring_negotiation(vdcp
)) {
4592 DMSG(vdcp
, 0, "dring negotiate failed (%d)..\n",
4597 switch (status
= vdc_rdx_exchange(vdcp
)) {
4599 vdcp
->state
= VDC_STATE_HANDLE_PENDING
;
4602 DMSG(vdcp
, 0, "RDX xchg failed ..(%d)\n",
4607 DMSG(vdcp
, 0, "negotiation failed: resetting (%d)\n",
4609 vdcp
->state
= VDC_STATE_RESETTING
;
4610 vdcp
->self_reset
= B_TRUE
;
4611 vdcp
->curr_server
->svc_state
= VDC_SERVICE_FAILED
;
4612 vdc_print_svc_status(vdcp
);
4614 DMSG(vdcp
, 0, "negotiation complete (state=0x%x)...\n",
4618 case VDC_STATE_HANDLE_PENDING
:
4620 DMSG(vdcp
, 0, "[%d] connection to service domain is up",
4622 vdcp
->curr_server
->svc_state
= VDC_SERVICE_CONNECTED
;
4624 mutex_exit(&vdcp
->lock
);
4627 * If we have multiple servers, check that the backend
4628 * is effectively available before resubmitting any IO.
4630 if (vdcp
->num_servers
> 1 &&
4631 vdc_eio_check(vdcp
, 0) != 0) {
4632 mutex_enter(&vdcp
->lock
);
4633 vdcp
->curr_server
->svc_state
=
4634 VDC_SERVICE_FAULTED
;
4635 vdcp
->state
= VDC_STATE_FAULTED
;
4640 (void) untimeout(tmid
);
4642 vdcp
->ctimeout_reached
= B_FALSE
;
4648 (void) vdc_setup_devid(vdcp
);
4650 status
= vdc_resubmit_backup_dring(vdcp
);
4652 mutex_enter(&vdcp
->lock
);
4655 vdcp
->state
= VDC_STATE_RESETTING
;
4656 vdcp
->self_reset
= B_TRUE
;
4657 vdcp
->curr_server
->svc_state
=
4659 vdc_print_svc_status(vdcp
);
4661 vdcp
->state
= VDC_STATE_RUNNING
;
4665 case VDC_STATE_FAULTED
:
4667 * Server is faulted because the backend is unavailable.
4668 * If all servers are faulted then we mark the service
4669 * as failed, otherwise we reset to switch to another
4672 vdc_print_svc_status(vdcp
);
4674 /* check if all servers are faulted */
4675 for (srvr
= vdcp
->server_list
; srvr
!= NULL
;
4676 srvr
= srvr
->next
) {
4677 svc_state
= srvr
->svc_state
;
4678 if (svc_state
!= VDC_SERVICE_FAULTED
)
4683 vdcp
->state
= VDC_STATE_RESETTING
;
4684 vdcp
->self_reset
= B_TRUE
;
4686 vdcp
->state
= VDC_STATE_FAILED
;
4690 case VDC_STATE_FAILED
:
4692 * We reach this state when we are unable to access the
4693 * backend from any server, either because of a maximum
4694 * connection retries or timeout, or because the backend
4697 * Then we cancel the backup DRing so that errors get
4698 * reported and we wait for a new I/O before attempting
4699 * another connection.
4702 cmn_err(CE_NOTE
, "vdisk@%d disk access failed",
4704 failure_msg
= B_TRUE
;
4706 if (vdcp
->lifecycle
== VDC_LC_ATTACHING
) {
4707 vdcp
->lifecycle
= VDC_LC_ONLINE_PENDING
;
4708 vdcp
->hattr_min
= vdc_hattr_min_initial
;
4710 vdcp
->hattr_min
= vdc_hattr_min
;
4713 /* cancel any timeout */
4715 (void) untimeout(tmid
);
4719 /* cancel pending I/Os */
4720 cv_broadcast(&vdcp
->running_cv
);
4721 vdc_cancel_backup_dring(vdcp
);
4723 /* wait for new I/O */
4724 while (!vdcp
->io_pending
)
4725 cv_wait(&vdcp
->io_pending_cv
, &vdcp
->lock
);
4728 * There's a new IO pending. Try to re-establish a
4729 * connection. Mark all services as offline, so that
4730 * we don't stop again before having retried all
4733 for (srvr
= vdcp
->server_list
; srvr
!= NULL
;
4734 srvr
= srvr
->next
) {
4735 srvr
->svc_state
= VDC_SERVICE_OFFLINE
;
4736 srvr
->hshake_cnt
= 0;
4737 srvr
->hattr_cnt
= 0;
4738 srvr
->hattr_total
= 0;
4741 /* reset variables */
4744 vdcp
->ctimeout_reached
= B_FALSE
;
4746 vdcp
->state
= VDC_STATE_RESETTING
;
4747 vdcp
->self_reset
= B_TRUE
;
4750 /* enter running state */
4751 case VDC_STATE_RUNNING
:
4753 if (vdcp
->lifecycle
== VDC_LC_DETACHING
) {
4754 vdcp
->state
= VDC_STATE_DETACH
;
4758 vdcp
->lifecycle
= VDC_LC_ONLINE
;
4761 cmn_err(CE_NOTE
, "vdisk@%d disk access "
4762 "recovered", vdcp
->instance
);
4763 failure_msg
= B_FALSE
;
4767 * Signal anyone waiting for the connection
4770 cv_broadcast(&vdcp
->running_cv
);
4772 /* backend has to be checked after reset */
4773 if (vdcp
->failfast_interval
!= 0 ||
4774 vdcp
->num_servers
> 1)
4775 cv_signal(&vdcp
->eio_cv
);
4777 /* ownership is lost during reset */
4778 if (vdcp
->ownership
& VDC_OWNERSHIP_WANTED
)
4779 vdcp
->ownership
|= VDC_OWNERSHIP_RESET
;
4780 cv_signal(&vdcp
->ownership_cv
);
4782 vdcp
->curr_server
->svc_state
= VDC_SERVICE_ONLINE
;
4783 vdc_print_svc_status(vdcp
);
/* replies are received and processed without holding the lock */
4785 mutex_exit(&vdcp
->lock
);
4789 status
= vdc_wait_for_response(vdcp
, &msg
);
4792 DMSG(vdcp
, 1, "[%d] new pkt(s) available\n",
4794 status
= vdc_process_data_msg(vdcp
, &msg
);
4796 DMSG(vdcp
, 1, "[%d] process_data_msg "
4797 "returned err=%d\n", vdcp
->instance
,
4804 mutex_enter(&vdcp
->lock
);
4806 /* all servers are now offline */
4807 for (srvr
= vdcp
->server_list
; srvr
!= NULL
;
4808 srvr
= srvr
->next
) {
4809 srvr
->svc_state
= VDC_SERVICE_OFFLINE
;
4810 srvr
->log_state
= VDC_SERVICE_NONE
;
4811 srvr
->hshake_cnt
= 0;
4812 srvr
->hattr_cnt
= 0;
4813 srvr
->hattr_total
= 0;
4819 vdc_print_svc_status(vdcp
);
4821 vdcp
->state
= VDC_STATE_RESETTING
;
4822 vdcp
->self_reset
= B_TRUE
;
4825 case VDC_STATE_RESETTING
:
4827 * When we reach this state, we either come from the
4828 * VDC_STATE_RUNNING state and we can have pending
4829 * request but no timeout is armed; or we come from
4830 * the VDC_STATE_INIT_WAITING, VDC_NEGOTIATE or
4831 * VDC_HANDLE_PENDING state and there is no pending
4832 * request or pending requests have already been copied
4833 * into the backup dring. So we can safely keep the
4834 * connection timeout armed while we are in this state.
4837 DMSG(vdcp
, 0, "Initiating channel reset "
4838 "(pending = %d)\n", (int)vdcp
->threads_pending
);
4840 if (vdcp
->self_reset
) {
4842 "[%d] calling stop_ldc_connection.\n",
4844 status
= vdc_stop_ldc_connection(vdcp
);
4845 vdcp
->self_reset
= B_FALSE
;
4849 * Wait for all threads currently waiting
4850 * for a free dring entry to use.
/* the lock is dropped around delay() so that the waiters can run */
4852 while (vdcp
->threads_pending
) {
4853 cv_broadcast(&vdcp
->membind_cv
);
4854 cv_broadcast(&vdcp
->dring_free_cv
);
4855 mutex_exit(&vdcp
->lock
);
4856 /* give the waiters enough time to wake up */
4857 delay(vdc_hz_min_ldc_delay
);
4858 mutex_enter(&vdcp
->lock
);
4861 ASSERT(vdcp
->threads_pending
== 0);
4863 /* Sanity check that no thread is receiving */
4864 ASSERT(vdcp
->read_state
!= VDC_READ_WAITING
);
4866 vdcp
->read_state
= VDC_READ_IDLE
;
4867 vdcp
->io_pending
= B_FALSE
;
4870 * Cleanup any pending eio. These I/Os are going to
4873 vdc_eio_unqueue(vdcp
, 0, B_FALSE
);
/* the ring is copied to the backup before it is destroyed */
4875 vdc_backup_local_dring(vdcp
);
4877 /* cleanup the old d-ring */
4878 vdc_destroy_descriptor_ring(vdcp
);
4880 /* go and start again */
4881 vdcp
->state
= VDC_STATE_INIT
;
4885 case VDC_STATE_DETACH
:
4886 DMSG(vdcp
, 0, "[%d] Reset thread exit cleanup ..\n",
4889 /* cancel any pending timeout */
/* untimeout() is called with the lock dropped */
4890 mutex_exit(&vdcp
->lock
);
4892 (void) untimeout(tmid
);
4895 mutex_enter(&vdcp
->lock
);
4898 * Signal anyone waiting for connection
4901 cv_broadcast(&vdcp
->running_cv
);
4903 while (vdcp
->sync_op_cnt
> 0) {
4904 cv_broadcast(&vdcp
->sync_blocked_cv
);
4905 mutex_exit(&vdcp
->lock
);
4906 /* give the waiters enough time to wake up */
4907 delay(vdc_hz_min_ldc_delay
);
4908 mutex_enter(&vdcp
->lock
);
4911 mutex_exit(&vdcp
->lock
);
4913 DMSG(vdcp
, 0, "[%d] Msg processing thread exiting ..\n",
4924 * vdc_process_data_msg()
4927 * This function is called by the message processing thread each time
4928 * a message with a msgtype of VIO_TYPE_DATA is received. It will either
4929 * be an ACK or NACK from vds[1] which vdc handles as follows.
4930 * ACK - wake up the waiting thread
4931 * NACK - resend any messages necessary
4933 * [1] Although the message format allows it, vds should not send a
4934 * VIO_SUBTYPE_INFO message to vdc asking it to read data; if for
4935 * some bizarre reason it does, vdc will reset the connection.
4938 * vdcp - soft state pointer for this instance of the device driver.
4939 * msg - the LDC message sent by vds
4943 * > 0 - error value returned by LDC
/*
 * Handle one VIO_TYPE_DATA message received from the vDisk server
 * (the message type is asserted).  The message is viewed as a
 * vio_dring_msg_t and processed with vdcp->lock held:
 *
 *  1. start_idx and end_idx are checked against dring_len; a message with
 *     indices out of range is counted as a soft error and rejected.
 *  2. vdc_verify_seq_num() tells whether the message must be processed
 *     (VDC_SEQ_NUM_TODO), has already been processed (VDC_SEQ_NUM_SKIP)
 *     or carries an invalid sequence number (VDC_SEQ_NUM_INVALID, counted
 *     as a soft error).
 *  3. A NACK is counted as a soft error and dumped with
 *     VDC_DUMP_DRING_MSG(); an INFO message is counted as a protocol
 *     error.
 *  4. Otherwise the local descriptor at start_idx is examined (start and
 *     end are asserted to be equal).  When its state is VIO_DESC_DONE the
 *     payload status is applied to the buf with bioerror() and b_resid is
 *     set to b_bcount minus the number of bytes reported in the payload.
 *     Depending on the flags of the descriptor, the number of servers and
 *     failfast_interval, the entry is either queued with vdc_eio_queue()
 *     for a deferred error check, or accounted in the I/O statistics (for
 *     block reads and writes) and released with
 *     vdc_depopulate_descriptor().
 *
 * The lock is released on every visible exit path.
 *
 * NOTE(review): the values returned on each path, the assignment of bufp
 * and the first operand of the deferred-check condition are not visible
 * in this view.
 */
4946 vdc_process_data_msg(vdc_t
*vdcp
, vio_msg_t
*msg
)
4949 vio_dring_msg_t
*dring_msg
;
4950 vdc_local_desc_t
*ldep
= NULL
;
4955 dring_msg
= (vio_dring_msg_t
*)msg
;
4957 ASSERT(msg
->tag
.vio_msgtype
== VIO_TYPE_DATA
);
4958 ASSERT(vdcp
!= NULL
);
4960 mutex_enter(&vdcp
->lock
);
4963 * Check to see if the message has bogus data
4965 idx
= start
= dring_msg
->start_idx
;
4966 end
= dring_msg
->end_idx
;
4967 if ((start
>= vdcp
->dring_len
) ||
4968 (end
>= vdcp
->dring_len
) || (end
< -1)) {
4970 * Update the I/O statistics to indicate that an error ocurred.
4971 * No need to update the wait/run queues as no specific read or
4972 * write request is being completed in response to this 'msg'.
4974 VD_UPDATE_ERR_STATS(vdcp
, vd_softerrs
);
4975 DMSG(vdcp
, 0, "[%d] Bogus ACK data : start %d, end %d\n",
4976 vdcp
->instance
, start
, end
);
4977 mutex_exit(&vdcp
->lock
);
4982 * Verify that the sequence number is what vdc expects.
4984 switch (vdc_verify_seq_num(vdcp
, dring_msg
)) {
4985 case VDC_SEQ_NUM_TODO
:
4986 break; /* keep processing this message */
4987 case VDC_SEQ_NUM_SKIP
:
4988 mutex_exit(&vdcp
->lock
);
4990 case VDC_SEQ_NUM_INVALID
:
4992 * Update the I/O statistics to indicate that an error ocurred.
4993 * No need to update the wait/run queues as no specific read or
4994 * write request is being completed in response to this 'msg'.
4996 VD_UPDATE_ERR_STATS(vdcp
, vd_softerrs
);
4997 DMSG(vdcp
, 0, "[%d] invalid seqno\n", vdcp
->instance
);
4998 mutex_exit(&vdcp
->lock
);
5002 if (msg
->tag
.vio_subtype
== VIO_SUBTYPE_NACK
) {
5004 * Update the I/O statistics to indicate that an error ocurred.
5005 * No need to update the wait/run queues, this will be done by
5006 * the thread calling this function.
5008 VD_UPDATE_ERR_STATS(vdcp
, vd_softerrs
);
5009 VDC_DUMP_DRING_MSG(dring_msg
);
5010 DMSG(vdcp
, 0, "[%d] DATA NACK\n", vdcp
->instance
);
5011 mutex_exit(&vdcp
->lock
);
5014 } else if (msg
->tag
.vio_subtype
== VIO_SUBTYPE_INFO
) {
5016 * Update the I/O statistics to indicate that an error occurred.
5017 * No need to update the wait/run queues as no specific read or
5018 * write request is being completed in response to this 'msg'.
5020 VD_UPDATE_ERR_STATS(vdcp
, vd_protoerrs
);
5021 mutex_exit(&vdcp
->lock
);
5025 DMSG(vdcp
, 1, ": start %d end %d\n", start
, end
);
/* a message is expected to acknowledge a single descriptor */
5026 ASSERT(start
== end
);
5028 ldep
= &vdcp
->local_dring
[idx
];
5030 DMSG(vdcp
, 1, ": state 0x%x\n", ldep
->dep
->hdr
.dstate
);
5032 if (ldep
->dep
->hdr
.dstate
== VIO_DESC_DONE
) {
5035 status
= ldep
->dep
->payload
.status
;
5038 ASSERT(bufp
!= NULL
);
/* residual = bytes requested minus bytes reported by the server */
5040 bufp
->b_resid
= bufp
->b_bcount
- ldep
->dep
->payload
.nbytes
;
5041 bioerror(bufp
, status
);
5044 DMSG(vdcp
, 1, "I/O status=%d\n", status
);
5048 "I/O complete req=%ld bytes resp=%ld bytes\n",
5049 bufp
->b_bcount
, ldep
->dep
->payload
.nbytes
);
5052 * If the request has failed and we have multiple servers or
5053 * failfast is enabled then we will have to defer the completion
5054 * of the request until we have checked that the vdisk backend
5055 * is effectively available (if multiple server) or that there
5056 * is no reservation conflict (if failfast).
5059 ((vdcp
->num_servers
> 1 &&
5060 (ldep
->flags
& VDC_OP_ERRCHK_BACKEND
)) ||
5061 (vdcp
->failfast_interval
!= 0 &&
5062 (ldep
->flags
& VDC_OP_ERRCHK_CONFLICT
)))) {
5064 * The I/O has failed and we need to check the error.
5066 (void) vdc_eio_queue(vdcp
, idx
);
5068 op
= ldep
->operation
;
/* only block reads and writes are accounted in the I/O kstats */
5069 if (op
== VD_OP_BREAD
|| op
== VD_OP_BWRITE
) {
5071 VD_UPDATE_IO_STATS(vdcp
, op
,
5072 ldep
->dep
->payload
.nbytes
);
5074 VD_UPDATE_ERR_STATS(vdcp
, vd_softerrs
);
5076 VD_KSTAT_RUNQ_EXIT(vdcp
);
5077 DTRACE_IO1(done
, buf_t
*, bufp
);
5079 (void) vdc_depopulate_descriptor(vdcp
, idx
);
5084 /* let the arrival signal propogate */
5085 mutex_exit(&vdcp
->lock
);
5087 /* probe gives the count of how many entries were processed */
5088 DTRACE_PROBE2(processed
, int, 1, vdc_t
*, vdcp
);
5096 * vdc_handle_ver_msg()
5101 * vdc - soft state pointer for this instance of the device driver.
5102 * ver_msg - LDC message sent by vDisk server
/*
 * Handle a version negotiation message received from the vDisk server.
 *
 * The message must have a subtype envelope of VIO_VER_INFO and a device
 * class of VDEV_DISK_SERVER; both are checked before the subtype is
 * examined:
 *
 *   ACK   if vdc_is_supported_version() accepts the version carried by
 *         the message, its major and minor numbers are stored in vdc->ver.
 *   NACK  vdc_is_supported_version() is called to load the next supported
 *         version into ver_msg.  If a version is left (ver_major > 0) the
 *         same message is turned back into an INFO message with a device
 *         class of VDEV_DISK and resent with vdc_send(); the length
 *         reported by vdc_send() is compared with the message size.
 *         Otherwise the absence of a common version is logged.
 *   INFO  the server starting the handshake; per the comment in the body
 *         only vdc is expected to start it.
 *
 * The caller must own vdc->lock (asserted).
 *
 * NOTE(review): the status values returned on each path are not visible
 * in this view.
 */
5108 vdc_handle_ver_msg(vdc_t
*vdc
, vio_ver_msg_t
*ver_msg
)
5112 ASSERT(vdc
!= NULL
);
5113 ASSERT(mutex_owned(&vdc
->lock
));
5115 if (ver_msg
->tag
.vio_subtype_env
!= VIO_VER_INFO
) {
5119 if (ver_msg
->dev_class
!= VDEV_DISK_SERVER
) {
5123 switch (ver_msg
->tag
.vio_subtype
) {
5124 case VIO_SUBTYPE_ACK
:
5126 * We check to see if the version returned is indeed supported
5127 * (The server may have also adjusted the minor number downwards
5128 * and if so 'ver_msg' will contain the actual version agreed)
5130 if (vdc_is_supported_version(ver_msg
)) {
5131 vdc
->ver
.major
= ver_msg
->ver_major
;
5132 vdc
->ver
.minor
= ver_msg
->ver_minor
;
5133 ASSERT(vdc
->ver
.major
> 0);
5139 case VIO_SUBTYPE_NACK
:
5141 * call vdc_is_supported_version() which will return the next
5142 * supported version (if any) in 'ver_msg'
5144 (void) vdc_is_supported_version(ver_msg
);
5145 if (ver_msg
->ver_major
> 0) {
5146 size_t len
= sizeof (*ver_msg
);
5148 ASSERT(vdc
->ver
.major
> 0);
5150 /* reset the necessary fields and resend */
5151 ver_msg
->tag
.vio_subtype
= VIO_SUBTYPE_INFO
;
5152 ver_msg
->dev_class
= VDEV_DISK
;
5154 status
= vdc_send(vdc
, (caddr_t
)ver_msg
, &len
);
5155 DMSG(vdc
, 0, "[%d] Resend VER info (LDC status = %d)\n",
5156 vdc
->instance
, status
);
/* vdc_send() updates len; a different length is treated specially */
5157 if (len
!= sizeof (*ver_msg
))
5160 DMSG(vdc
, 0, "[%d] No common version with vDisk server",
5166 case VIO_SUBTYPE_INFO
:
5168 * Handle the case where vds starts handshake
5169 * (for now only vdc is the instigator)
5184 * vdc_handle_attr_msg()
5189 * vdc - soft state pointer for this instance of the device driver.
5190 * attr_msg - LDC message sent by vDisk server
/*
 * Handle an attribute negotiation message received from the vDisk server.
 *
 * The message must have a subtype envelope of VIO_ATTR_INFO.  For an ACK
 * the attributes are verified and recorded in this order:
 *
 *  - a vdisk_size or a max_xfer_sz of zero is reported as invalid;
 *  - a vdisk_size of VD_SIZE_UNKNOWN is logged and replaced by zero;
 *  - a non-zero vdisk_block_size is passed to vdc_update_vio_bsize(),
 *    whose failure is reported as an invalid block size;
 *  - vdc_update_size() is called with the disk, block and transfer sizes,
 *    then vdisk_type and operations are copied into the soft state;
 *    vdisk_media is copied only when vio_ver_is_supported(vdc->ver, 1, 1)
 *    holds and is set to zero otherwise;
 *  - xfer_mode must be VIO_DRING_MODE_V1_0, vdisk_size must not exceed
 *    INT64_MAX, operations must be non-zero and vdisk_type must not exceed
 *    VD_DISK_TYPE_DISK, otherwise the attributes are reported as invalid;
 *  - vdc_create_fake_geometry() is called;
 *  - if the disk type was VD_DISK_TYPE_UNK before, minor nodes exist
 *    (VDC_MINOR set in vdc->initialized) and the disk is now known to be
 *    VD_DISK_TYPE_SLICE, the minor nodes are removed and recreated with
 *    vdc_create_device_nodes().
 *
 * NACK and INFO messages are handled by the last two cases of the switch.
 *
 * The caller must own vdc->lock (asserted).
 *
 * NOTE(review): the status values assigned or returned on each path are
 * not visible in this view.
 */
5196 vdc_handle_attr_msg(vdc_t
*vdc
, vd_attr_msg_t
*attr_msg
)
5199 vd_disk_type_t old_type
;
5201 ASSERT(vdc
!= NULL
);
5202 ASSERT(mutex_owned(&vdc
->lock
));
5204 if (attr_msg
->tag
.vio_subtype_env
!= VIO_ATTR_INFO
) {
5208 switch (attr_msg
->tag
.vio_subtype
) {
5209 case VIO_SUBTYPE_ACK
:
5211 * We now verify the attributes sent by vds.
5213 if (attr_msg
->vdisk_size
== 0) {
5214 DMSG(vdc
, 0, "[%d] Invalid disk size from vds",
5220 if (attr_msg
->max_xfer_sz
== 0) {
5221 DMSG(vdc
, 0, "[%d] Invalid transfer size from vds",
5227 if (attr_msg
->vdisk_size
== VD_SIZE_UNKNOWN
) {
5228 DMSG(vdc
, 0, "[%d] Unknown disk size from vds",
5230 attr_msg
->vdisk_size
= 0;
5233 /* update the VIO block size */
5234 if (attr_msg
->vdisk_block_size
> 0 &&
5235 vdc_update_vio_bsize(vdc
,
5236 attr_msg
->vdisk_block_size
) != 0) {
5237 DMSG(vdc
, 0, "[%d] Invalid block size (%u) from vds",
5238 vdc
->instance
, attr_msg
->vdisk_block_size
);
5243 /* update disk, block and transfer sizes */
/* the previous disk type is needed below to detect a type change */
5244 old_type
= vdc
->vdisk_type
;
5245 vdc_update_size(vdc
, attr_msg
->vdisk_size
,
5246 attr_msg
->vdisk_block_size
, attr_msg
->max_xfer_sz
);
5247 vdc
->vdisk_type
= attr_msg
->vdisk_type
;
5248 vdc
->operations
= attr_msg
->operations
;
/* the media type is only taken from the message for version 1.1 */
5249 if (vio_ver_is_supported(vdc
->ver
, 1, 1))
5250 vdc
->vdisk_media
= attr_msg
->vdisk_media
;
5252 vdc
->vdisk_media
= 0;
5254 DMSG(vdc
, 0, "[%d] max_xfer_sz: sent %lx acked %lx\n",
5255 vdc
->instance
, vdc
->max_xfer_sz
, attr_msg
->max_xfer_sz
);
5256 DMSG(vdc
, 0, "[%d] vdisk_block_size: sent %lx acked %x\n",
5257 vdc
->instance
, vdc
->vdisk_bsize
,
5258 attr_msg
->vdisk_block_size
);
5260 if ((attr_msg
->xfer_mode
!= VIO_DRING_MODE_V1_0
) ||
5261 (attr_msg
->vdisk_size
> INT64_MAX
) ||
5262 (attr_msg
->operations
== 0) ||
5263 (attr_msg
->vdisk_type
> VD_DISK_TYPE_DISK
)) {
5264 DMSG(vdc
, 0, "[%d] Invalid attributes from vds",
5271 * Now that we have received all attributes we can create a
5272 * fake geometry for the disk.
5274 vdc_create_fake_geometry(vdc
);
5277 * If the disk type was previously unknown and device nodes
5278 * were created then the driver would have created 8 device
5279 * nodes. If we now find out that this is a single-slice disk
5280 * then we need to re-create the appropriate device nodes.
5282 if (old_type
== VD_DISK_TYPE_UNK
&&
5283 (vdc
->initialized
& VDC_MINOR
) &&
5284 vdc
->vdisk_type
== VD_DISK_TYPE_SLICE
) {
5285 ddi_remove_minor_node(vdc
->dip
, NULL
);
5286 (void) devfs_clean(ddi_get_parent(vdc
->dip
),
5287 NULL
, DV_CLEAN_FORCE
);
5288 if (vdc_create_device_nodes(vdc
) != 0) {
5289 DMSG(vdc
, 0, "![%d] Failed to update "
5290 "device nodes", vdc
->instance
);
5296 case VIO_SUBTYPE_NACK
:
5298 * vds could not handle the attributes we sent so we
5304 case VIO_SUBTYPE_INFO
:
5306 * Handle the case where vds starts the handshake
5307 * (for now; vdc is the only supported instigatior)
5322 * vdc_handle_dring_reg_msg()
5327 * vdc - soft state pointer for this instance of the driver.
5328 * dring_msg - LDC message sent by vDisk server
/*
 * Handle a descriptor ring registration message received from the vDisk
 * server.
 *
 * The message must have a subtype envelope of VIO_DRING_REG.  For an ACK
 * the dring_ident chosen by the server is saved in vdc->dring_ident and
 * logged.  A NACK means the server could not register the descriptor
 * ring, which is logged.  The INFO case covers the server starting the
 * handshake; per the comment in the body only vdc is expected to start it.
 *
 * The caller must own vdc->lock (asserted).
 *
 * NOTE(review): the status values assigned or returned on each path are
 * not visible in this view.
 */
5334 vdc_handle_dring_reg_msg(vdc_t
*vdc
, vio_dring_reg_msg_t
*dring_msg
)
5338 ASSERT(vdc
!= NULL
);
5339 ASSERT(mutex_owned(&vdc
->lock
));
5341 if (dring_msg
->tag
.vio_subtype_env
!= VIO_DRING_REG
) {
5345 switch (dring_msg
->tag
.vio_subtype
) {
5346 case VIO_SUBTYPE_ACK
:
5347 /* save the received dring_ident */
5348 vdc
->dring_ident
= dring_msg
->dring_ident
;
5349 DMSG(vdc
, 0, "[%d] Received dring ident=0x%lx\n",
5350 vdc
->instance
, vdc
->dring_ident
);
5353 case VIO_SUBTYPE_NACK
:
5355 * vds could not handle the DRing info we sent so we
5358 DMSG(vdc
, 0, "[%d] server could not register DRing\n",
5363 case VIO_SUBTYPE_INFO
:
5365 * Handle the case where vds starts handshake
5366 * (for now only vdc is the instigatior)
5379 * vdc_verify_seq_num()
5382 * This function verifies that the sequence number sent back by the vDisk
5383 * server with the latest message is what is expected (i.e. it is greater
5384 * than the last seq num sent by the vDisk server and less than or equal
5385 * to the last seq num generated by vdc).
5387 * It then checks the request ID to see if any requests need processing
5391 * vdc - soft state pointer for this instance of the driver.
5392 * dring_msg - pointer to the LDC message sent by vds
5395 * VDC_SEQ_NUM_TODO - Message needs to be processed
5396 * VDC_SEQ_NUM_SKIP - Message has already been processed
5397 * VDC_SEQ_NUM_INVALID - The seq numbers are so out of sync,
5398 * vdc cannot deal with them
5401 vdc_verify_seq_num(vdc_t
*vdc
, vio_dring_msg_t
*dring_msg
)
5403 ASSERT(vdc
!= NULL
);
5404 ASSERT(dring_msg
!= NULL
);
5405 ASSERT(mutex_owned(&vdc
->lock
));
5408 * Check to see if the messages were responded to in the correct
5411 if ((dring_msg
->seq_num
<= vdc
->seq_num_reply
) ||
5412 (dring_msg
->seq_num
> vdc
->seq_num
)) {
5413 DMSG(vdc
, 0, "?[%d] Bogus sequence_number %lu: "
5414 "%lu > expected <= %lu (last proc req %lu sent %lu)\n",
5415 vdc
->instance
, dring_msg
->seq_num
,
5416 vdc
->seq_num_reply
, vdc
->seq_num
,
5417 vdc
->req_id_proc
, vdc
->req_id
);
5418 return (VDC_SEQ_NUM_INVALID
);
5420 vdc
->seq_num_reply
= dring_msg
->seq_num
;
5422 if (vdc
->req_id_proc
< vdc
->req_id
)
5423 return (VDC_SEQ_NUM_TODO
);
5425 return (VDC_SEQ_NUM_SKIP
);
5431 * vdc_is_supported_version()
5434 * This routine checks if the major/minor version numbers specified in
5435 * 'ver_msg' are supported. If not it finds the next version that is
5436 * in the supported version list 'vdc_version[]' and sets the fields in
5437 * 'ver_msg' to those values
5440 * ver_msg - LDC message sent by vDisk server
5444 * B_FALSE - Version not supported
5447 vdc_is_supported_version(vio_ver_msg_t
*ver_msg
)
5449 int vdc_num_versions
= sizeof (vdc_version
) / sizeof (vdc_version
[0]);
5451 for (int i
= 0; i
< vdc_num_versions
; i
++) {
5452 ASSERT(vdc_version
[i
].major
> 0);
5454 (vdc_version
[i
].major
< vdc_version
[i
-1].major
));
5457 * If the major versions match, adjust the minor version, if
5458 * necessary, down to the highest value supported by this
5459 * client. The server should support all minor versions lower
5460 * than the value it sent
5462 if (ver_msg
->ver_major
== vdc_version
[i
].major
) {
5463 if (ver_msg
->ver_minor
> vdc_version
[i
].minor
) {
5465 "Adjusting minor version from %u to %u",
5466 ver_msg
->ver_minor
, vdc_version
[i
].minor
);
5467 ver_msg
->ver_minor
= vdc_version
[i
].minor
;
5473 * If the message contains a higher major version number, set
5474 * the message's major/minor versions to the current values
5475 * and return false, so this message will get resent with
5476 * these values, and the server will potentially try again
5477 * with the same or a lower version
5479 if (ver_msg
->ver_major
> vdc_version
[i
].major
) {
5480 ver_msg
->ver_major
= vdc_version
[i
].major
;
5481 ver_msg
->ver_minor
= vdc_version
[i
].minor
;
5482 DMSGX(0, "Suggesting major/minor (0x%x/0x%x)\n",
5483 ver_msg
->ver_major
, ver_msg
->ver_minor
);
5489 * Otherwise, the message's major version is less than the
5490 * current major version, so continue the loop to the next
5491 * (lower) supported version
5496 * No common version was found; "ground" the version pair in the
5497 * message to terminate negotiation
5499 ver_msg
->ver_major
= 0;
5500 ver_msg
->ver_minor
= 0;
5504 /* -------------------------------------------------------------------------- */
5510 typedef struct vdc_dk_arg
{
5511 struct dk_callback dkc
;
5519 * vdc_dkio_flush_cb()
5522 * This routine is a callback for DKIOCFLUSHWRITECACHE which can be called
5526 * arg - a pointer to a vdc_dk_arg_t structure.
5529 vdc_dkio_flush_cb(void *arg
)
5531 struct vdc_dk_arg
*dk_arg
= (struct vdc_dk_arg
*)arg
;
5532 struct dk_callback
*dkc
= NULL
;
5536 if (dk_arg
== NULL
) {
5537 cmn_err(CE_NOTE
, "?[Unk] DKIOCFLUSHWRITECACHE arg is NULL\n");
5542 ASSERT(vdc
!= NULL
);
5544 rv
= vdc_do_sync_op(vdc
, VD_OP_FLUSH
, NULL
, 0,
5545 VDCPART(dk_arg
->dev
), 0, VIO_both_dir
, B_TRUE
);
5547 DMSG(vdc
, 0, "[%d] DKIOCFLUSHWRITECACHE failed %d : model %x\n",
5549 ddi_model_convert_from(dk_arg
->mode
& FMODELS
));
5553 * Trigger the call back to notify the caller the the ioctl call has
5556 if ((dk_arg
->mode
& FKIOCTL
) &&
5558 (dkc
->dkc_callback
!= NULL
)) {
5559 ASSERT(dkc
->dkc_cookie
!= NULL
);
5560 (*dkc
->dkc_callback
)(dkc
->dkc_cookie
, rv
);
5563 /* Indicate that one less DKIO write flush is outstanding */
5564 mutex_enter(&vdc
->lock
);
5565 vdc
->dkio_flush_pending
--;
5566 ASSERT(vdc
->dkio_flush_pending
>= 0);
5567 mutex_exit(&vdc
->lock
);
5569 /* free the mem that was allocated when the callback was dispatched */
5570 kmem_free(arg
, sizeof (vdc_dk_arg_t
));
5578 * This function implements the DKIOCGAPART ioctl.
5581 * vdc - soft state pointer
5582 * arg - a pointer to a dk_map[NDKMAP] or dk_map32[NDKMAP] structure
5583 * flag - ioctl flags
5586 vdc_dkio_gapart(vdc_t
*vdc
, caddr_t arg
, int flag
)
5588 struct dk_geom
*geom
;
5589 struct extvtoc
*vtoc
;
5591 struct dk_map map
[NDKMAP
];
5592 struct dk_map32 map32
[NDKMAP
];
5596 mutex_enter(&vdc
->lock
);
5598 if ((rv
= vdc_validate_geometry(vdc
)) != 0) {
5599 mutex_exit(&vdc
->lock
);
5603 if (vdc
->vdisk_size
> VD_OLDVTOC_LIMIT
) {
5604 mutex_exit(&vdc
->lock
);
5611 if (ddi_model_convert_from(flag
& FMODELS
) == DDI_MODEL_ILP32
) {
5613 for (i
= 0; i
< vtoc
->v_nparts
; i
++) {
5614 data
.map32
[i
].dkl_cylno
= vtoc
->v_part
[i
].p_start
/
5615 (geom
->dkg_nhead
* geom
->dkg_nsect
);
5616 data
.map32
[i
].dkl_nblk
= vtoc
->v_part
[i
].p_size
;
5618 size
= NDKMAP
* sizeof (struct dk_map32
);
5622 for (i
= 0; i
< vtoc
->v_nparts
; i
++) {
5623 data
.map
[i
].dkl_cylno
= vtoc
->v_part
[i
].p_start
/
5624 (geom
->dkg_nhead
* geom
->dkg_nsect
);
5625 data
.map
[i
].dkl_nblk
= vtoc
->v_part
[i
].p_size
;
5627 size
= NDKMAP
* sizeof (struct dk_map
);
5631 mutex_exit(&vdc
->lock
);
5633 if (ddi_copyout(&data
, arg
, size
, flag
) != 0)
5641 * vdc_dkio_partition()
5644 * This function implements the DKIOCPARTITION ioctl.
5647 * vdc - soft state pointer
5648 * arg - a pointer to a struct partition64 structure
5649 * flag - ioctl flags
5652 vdc_dkio_partition(vdc_t
*vdc
, caddr_t arg
, int flag
)
5654 struct partition64 p64
;
5661 if (ddi_copyin(arg
, &p64
, sizeof (struct partition64
), flag
)) {
5665 VDC_EFI_DEV_SET(edev
, vdc
, vd_process_efi_ioctl
);
5667 if ((rv
= vd_efi_alloc_and_read(&edev
, &gpt
, &gpe
)) != 0) {
5671 partno
= p64
.p_partno
;
5673 if (partno
>= gpt
->efi_gpt_NumberOfPartitionEntries
) {
5674 vd_efi_free(&edev
, gpt
, gpe
);
5678 bcopy(&gpe
[partno
].efi_gpe_PartitionTypeGUID
, &p64
.p_type
,
5679 sizeof (struct uuid
));
5680 p64
.p_start
= gpe
[partno
].efi_gpe_StartingLBA
;
5681 p64
.p_size
= gpe
[partno
].efi_gpe_EndingLBA
- p64
.p_start
+ 1;
5683 if (ddi_copyout(&p64
, arg
, sizeof (struct partition64
), flag
)) {
5684 vd_efi_free(&edev
, gpt
, gpe
);
5688 vd_efi_free(&edev
, gpt
, gpe
);
5694 * vdc_dioctl_rwcmd()
5697 * This function implements the DIOCTL_RWCMD ioctl. This ioctl is used
5698 * for DKC_DIRECT disks to read or write at an absolute disk offset.
5702 * arg - a pointer to a dadkio_rwcmd or dadkio_rwcmd32 structure
5703 * flag - ioctl flags
5706 vdc_dioctl_rwcmd(vdc_t
*vdc
, caddr_t arg
, int flag
)
5708 struct dadkio_rwcmd32 rwcmd32
;
5709 struct dadkio_rwcmd rwcmd
;
5715 if (ddi_model_convert_from(flag
& FMODELS
) == DDI_MODEL_ILP32
) {
5716 if (ddi_copyin((caddr_t
)arg
, (caddr_t
)&rwcmd32
,
5717 sizeof (struct dadkio_rwcmd32
), flag
)) {
5720 rwcmd
.cmd
= rwcmd32
.cmd
;
5721 rwcmd
.flags
= rwcmd32
.flags
;
5722 rwcmd
.blkaddr
= (daddr_t
)rwcmd32
.blkaddr
;
5723 rwcmd
.buflen
= rwcmd32
.buflen
;
5724 rwcmd
.bufaddr
= (caddr_t
)(uintptr_t)rwcmd32
.bufaddr
;
5726 if (ddi_copyin((caddr_t
)arg
, (caddr_t
)&rwcmd
,
5727 sizeof (struct dadkio_rwcmd
), flag
)) {
5732 switch (rwcmd
.cmd
) {
5733 case DADKIO_RWCMD_READ
:
5736 case DADKIO_RWCMD_WRITE
:
5743 bzero((caddr_t
)&aiov
, sizeof (struct iovec
));
5744 aiov
.iov_base
= rwcmd
.bufaddr
;
5745 aiov
.iov_len
= rwcmd
.buflen
;
5747 bzero((caddr_t
)&auio
, sizeof (struct uio
));
5748 auio
.uio_iov
= &aiov
;
5749 auio
.uio_iovcnt
= 1;
5750 auio
.uio_loffset
= rwcmd
.blkaddr
* vdc
->vdisk_bsize
;
5751 auio
.uio_resid
= rwcmd
.buflen
;
5752 auio
.uio_segflg
= flag
& FKIOCTL
? UIO_SYSSPACE
: UIO_USERSPACE
;
5754 buf
= kmem_alloc(sizeof (buf_t
), KM_SLEEP
);
5757 * We use the private field of buf to specify that this is an
5758 * I/O using an absolute offset.
5760 buf
->b_private
= (void *)VD_SLICE_NONE
;
5762 status
= physio(vdc_strategy
, buf
, VD_MAKE_DEV(vdc
->instance
, 0),
5763 rw
, vdc_min
, &auio
);
5766 kmem_free(buf
, sizeof (buf_t
));
5772 * Allocate a buffer for a VD_OP_SCSICMD operation. The size of the allocated
5773 * buffer is returned in alloc_len.
5776 vdc_scsi_alloc(int cdb_len
, int sense_len
, int datain_len
, int dataout_len
,
5780 int vd_scsi_len
= VD_SCSI_SIZE
;
5782 vd_scsi_len
+= P2ROUNDUP(cdb_len
, sizeof (uint64_t));
5783 vd_scsi_len
+= P2ROUNDUP(sense_len
, sizeof (uint64_t));
5784 vd_scsi_len
+= P2ROUNDUP(datain_len
, sizeof (uint64_t));
5785 vd_scsi_len
+= P2ROUNDUP(dataout_len
, sizeof (uint64_t));
5787 ASSERT(vd_scsi_len
% sizeof (uint64_t) == 0);
5789 vd_scsi
= kmem_zalloc(vd_scsi_len
, KM_SLEEP
);
5791 vd_scsi
->cdb_len
= cdb_len
;
5792 vd_scsi
->sense_len
= sense_len
;
5793 vd_scsi
->datain_len
= datain_len
;
5794 vd_scsi
->dataout_len
= dataout_len
;
5796 *alloc_len
= vd_scsi_len
;
5802 * Convert the status of a SCSI command to a Solaris return code.
5805 * vd_scsi - The SCSI operation buffer.
5806 * log_error - indicate if an error message should be logged.
5808 * Note that our SCSI error messages are rather primitive for the moment
5809 * and could be improved by decoding some data like the SCSI command and
5813 * 0 - Status is good.
5814 * EACCES - Status reports a reservation conflict.
5815 * ENOTSUP - Status reports a check condition and sense key
5816 * reports an illegal request.
5817 * EIO - Any other status.
5820 vdc_scsi_status(vdc_t
*vdc
, vd_scsi_t
*vd_scsi
, boolean_t log_error
)
5823 char path_str
[MAXPATHLEN
];
5824 char panic_str
[VDC_RESV_CONFLICT_FMT_LEN
+ MAXPATHLEN
];
5825 union scsi_cdb
*cdb
;
5826 struct scsi_extended_sense
*sense
;
5828 if (vd_scsi
->cmd_status
== STATUS_GOOD
)
5832 /* when the tunable vdc_scsi_log_error is true we log all errors */
5833 if (vdc_scsi_log_error
)
5837 cmn_err(CE_WARN
, "%s (vdc%d):\tError for Command: 0x%x)\n",
5838 ddi_pathname(vdc
->dip
, path_str
), vdc
->instance
,
5839 GETCMD(VD_SCSI_DATA_CDB(vd_scsi
)));
5842 /* default returned value */
5845 switch (vd_scsi
->cmd_status
) {
5848 case STATUS_TERMINATED
:
5850 cmn_err(CE_CONT
, "\tCheck Condition Error\n");
5852 /* check sense buffer */
5853 if (vd_scsi
->sense_len
== 0 ||
5854 vd_scsi
->sense_status
!= STATUS_GOOD
) {
5856 cmn_err(CE_CONT
, "\tNo Sense Data Available\n");
5860 sense
= VD_SCSI_DATA_SENSE(vd_scsi
);
5863 cmn_err(CE_CONT
, "\tSense Key: 0x%x\n"
5864 "\tASC: 0x%x, ASCQ: 0x%x\n",
5865 scsi_sense_key((uint8_t *)sense
),
5866 scsi_sense_asc((uint8_t *)sense
),
5867 scsi_sense_ascq((uint8_t *)sense
));
5870 if (scsi_sense_key((uint8_t *)sense
) == KEY_ILLEGAL_REQUEST
)
5876 cmn_err(CE_NOTE
, "\tDevice Busy\n");
5879 case STATUS_RESERVATION_CONFLICT
:
5881 * If the command was PERSISTENT_RESERVATION_[IN|OUT] then
5882 * reservation conflict could be due to various reasons like
5883 * incorrect keys, not registered or not reserved etc. So,
5884 * we should not panic in that case.
5886 cdb
= VD_SCSI_DATA_CDB(vd_scsi
);
5887 if (vdc
->failfast_interval
!= 0 &&
5888 cdb
->scc_cmd
!= SCMD_PERSISTENT_RESERVE_IN
&&
5889 cdb
->scc_cmd
!= SCMD_PERSISTENT_RESERVE_OUT
) {
5890 /* failfast is enabled so we have to panic */
5891 (void) snprintf(panic_str
, sizeof (panic_str
),
5892 VDC_RESV_CONFLICT_FMT_STR
"%s",
5893 ddi_pathname(vdc
->dip
, path_str
));
5897 cmn_err(CE_NOTE
, "\tReservation Conflict\n");
5903 cmn_err(CE_NOTE
, "\tQueue Full\n");
5907 case STATUS_INTERMEDIATE
:
5909 case STATUS_INTERMEDIATE_MET
:
5910 case STATUS_ACA_ACTIVE
:
5913 "\tUnexpected SCSI status received: 0x%x\n",
5914 vd_scsi
->cmd_status
);
5920 "\tInvalid SCSI status received: 0x%x\n",
5921 vd_scsi
->cmd_status
);
5929 * Implemented the USCSICMD uscsi(7I) ioctl. This ioctl is converted to
5930 * a VD_OP_SCSICMD operation which is sent to the vdisk server. If a SCSI
5931 * reset is requested (i.e. a flag USCSI_RESET* is set) then the ioctl is
5932 * converted to a VD_OP_RESET operation.
5935 vdc_uscsi_cmd(vdc_t
*vdc
, caddr_t arg
, int mode
)
5937 struct uscsi_cmd uscsi
;
5938 struct uscsi_cmd32 uscsi32
;
5941 union scsi_cdb
*cdb
;
5942 struct scsi_extended_sense
*sense
;
5943 char *datain
, *dataout
;
5944 size_t cdb_len
, datain_len
, dataout_len
, sense_len
;
5947 if (ddi_model_convert_from(mode
& FMODELS
) == DDI_MODEL_ILP32
) {
5948 if (ddi_copyin(arg
, &uscsi32
, sizeof (struct uscsi_cmd32
),
5951 uscsi_cmd32touscsi_cmd((&uscsi32
), (&uscsi
));
5953 if (ddi_copyin(arg
, &uscsi
, sizeof (struct uscsi_cmd
),
5958 /* a uscsi reset is converted to a VD_OP_RESET operation */
5959 if (uscsi
.uscsi_flags
& (USCSI_RESET
| USCSI_RESET_LUN
|
5961 rv
= vdc_do_sync_op(vdc
, VD_OP_RESET
, NULL
, 0, 0, 0,
5962 VIO_both_dir
, B_TRUE
);
5966 /* cdb buffer length */
5967 cdb_len
= uscsi
.uscsi_cdblen
;
5969 /* data in and out buffers length */
5970 if (uscsi
.uscsi_flags
& USCSI_READ
) {
5971 datain_len
= uscsi
.uscsi_buflen
;
5975 dataout_len
= uscsi
.uscsi_buflen
;
5978 /* sense buffer length */
5979 if (uscsi
.uscsi_flags
& USCSI_RQENABLE
)
5980 sense_len
= uscsi
.uscsi_rqlen
;
5984 /* allocate buffer for the VD_SCSICMD_OP operation */
5985 vd_scsi
= vdc_scsi_alloc(cdb_len
, sense_len
, datain_len
, dataout_len
,
5989 * The documentation of USCSI_ISOLATE and USCSI_DIAGNOSE is very vague,
5990 * but basically they prevent a SCSI command from being retried in case
5993 if ((uscsi
.uscsi_flags
& USCSI_ISOLATE
) ||
5994 (uscsi
.uscsi_flags
& USCSI_DIAGNOSE
))
5995 vd_scsi
->options
|= VD_SCSI_OPT_NORETRY
;
5997 /* set task attribute */
5998 if (uscsi
.uscsi_flags
& USCSI_NOTAG
) {
5999 vd_scsi
->task_attribute
= 0;
6001 if (uscsi
.uscsi_flags
& USCSI_HEAD
)
6002 vd_scsi
->task_attribute
= VD_SCSI_TASK_ACA
;
6003 else if (uscsi
.uscsi_flags
& USCSI_HTAG
)
6004 vd_scsi
->task_attribute
= VD_SCSI_TASK_HQUEUE
;
6005 else if (uscsi
.uscsi_flags
& USCSI_OTAG
)
6006 vd_scsi
->task_attribute
= VD_SCSI_TASK_ORDERED
;
6008 vd_scsi
->task_attribute
= 0;
6012 vd_scsi
->timeout
= uscsi
.uscsi_timeout
;
6014 /* copy-in cdb data */
6015 cdb
= VD_SCSI_DATA_CDB(vd_scsi
);
6016 if (ddi_copyin(uscsi
.uscsi_cdb
, cdb
, cdb_len
, mode
) != 0) {
6021 /* keep a pointer to the sense buffer */
6022 sense
= VD_SCSI_DATA_SENSE(vd_scsi
);
6024 /* keep a pointer to the data-in buffer */
6025 datain
= (char *)VD_SCSI_DATA_IN(vd_scsi
);
6027 /* copy-in request data to the data-out buffer */
6028 dataout
= (char *)VD_SCSI_DATA_OUT(vd_scsi
);
6029 if (!(uscsi
.uscsi_flags
& USCSI_READ
)) {
6030 if (ddi_copyin(uscsi
.uscsi_bufaddr
, dataout
, dataout_len
,
6037 /* submit the request */
6038 rv
= vdc_do_sync_op(vdc
, VD_OP_SCSICMD
, (caddr_t
)vd_scsi
, vd_scsi_len
,
6039 0, 0, VIO_both_dir
, B_FALSE
);
6044 /* update scsi status */
6045 uscsi
.uscsi_status
= vd_scsi
->cmd_status
;
6047 /* update sense data */
6048 if ((uscsi
.uscsi_flags
& USCSI_RQENABLE
) &&
6049 (uscsi
.uscsi_status
== STATUS_CHECK
||
6050 uscsi
.uscsi_status
== STATUS_TERMINATED
)) {
6052 uscsi
.uscsi_rqstatus
= vd_scsi
->sense_status
;
6054 if (uscsi
.uscsi_rqstatus
== STATUS_GOOD
) {
6055 uscsi
.uscsi_rqresid
= uscsi
.uscsi_rqlen
-
6057 if (ddi_copyout(sense
, uscsi
.uscsi_rqbuf
,
6058 vd_scsi
->sense_len
, mode
) != 0) {
6065 /* update request data */
6066 if (uscsi
.uscsi_status
== STATUS_GOOD
) {
6067 if (uscsi
.uscsi_flags
& USCSI_READ
) {
6068 uscsi
.uscsi_resid
= uscsi
.uscsi_buflen
-
6069 vd_scsi
->datain_len
;
6070 if (ddi_copyout(datain
, uscsi
.uscsi_bufaddr
,
6071 vd_scsi
->datain_len
, mode
) != 0) {
6076 uscsi
.uscsi_resid
= uscsi
.uscsi_buflen
-
6077 vd_scsi
->dataout_len
;
6081 /* copy-out result */
6082 if (ddi_model_convert_from(mode
& FMODELS
) == DDI_MODEL_ILP32
) {
6083 uscsi_cmdtouscsi_cmd32((&uscsi
), (&uscsi32
));
6084 if (ddi_copyout(&uscsi32
, arg
, sizeof (struct uscsi_cmd32
),
6090 if (ddi_copyout(&uscsi
, arg
, sizeof (struct uscsi_cmd
),
6097 /* get the return code from the SCSI command status */
6098 rv
= vdc_scsi_status(vdc
, vd_scsi
,
6099 !(uscsi
.uscsi_flags
& USCSI_SILENT
));
6102 kmem_free(vd_scsi
, vd_scsi_len
);
6107 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT IN command.
6110 * cmd - SCSI PERSISTENT IN command
6111 * len - length of the SCSI input buffer
6112 * vd_scsi_len - return the length of the allocated buffer
6115 * a pointer to the allocated VD_OP_SCSICMD buffer.
6118 vdc_scsi_alloc_persistent_in(uchar_t cmd
, int len
, int *vd_scsi_len
)
6120 int cdb_len
, sense_len
, datain_len
, dataout_len
;
6122 union scsi_cdb
*cdb
;
6124 cdb_len
= CDB_GROUP1
;
6125 sense_len
= sizeof (struct scsi_extended_sense
);
6129 vd_scsi
= vdc_scsi_alloc(cdb_len
, sense_len
, datain_len
, dataout_len
,
6132 cdb
= VD_SCSI_DATA_CDB(vd_scsi
);
6135 cdb
->scc_cmd
= SCMD_PERSISTENT_RESERVE_IN
;
6136 cdb
->cdb_opaque
[1] = cmd
;
6137 FORMG1COUNT(cdb
, datain_len
);
6139 vd_scsi
->timeout
= vdc_scsi_timeout
;
6145 * Create a VD_OP_SCSICMD buffer for a SCSI PERSISTENT OUT command.
6148 * cmd - SCSI PERSISTENT OUT command
6149 * len - length of the SCSI output buffer
6150 * vd_scsi_len - return the length of the allocated buffer
6153 * a pointer to the allocated VD_OP_SCSICMD buffer.
6156 vdc_scsi_alloc_persistent_out(uchar_t cmd
, int len
, int *vd_scsi_len
)
6158 int cdb_len
, sense_len
, datain_len
, dataout_len
;
6160 union scsi_cdb
*cdb
;
6162 cdb_len
= CDB_GROUP1
;
6163 sense_len
= sizeof (struct scsi_extended_sense
);
6167 vd_scsi
= vdc_scsi_alloc(cdb_len
, sense_len
, datain_len
, dataout_len
,
6170 cdb
= VD_SCSI_DATA_CDB(vd_scsi
);
6173 cdb
->scc_cmd
= SCMD_PERSISTENT_RESERVE_OUT
;
6174 cdb
->cdb_opaque
[1] = cmd
;
6175 FORMG1COUNT(cdb
, dataout_len
);
6177 vd_scsi
->timeout
= vdc_scsi_timeout
;
6183 * Implement the MHIOCGRP_INKEYS mhd(7i) ioctl. The ioctl is converted
6184 * to a SCSI PERSISTENT IN READ KEYS command which is sent to the vdisk
6185 * server with a VD_OP_SCSICMD operation.
6188 vdc_mhd_inkeys(vdc_t
*vdc
, caddr_t arg
, int mode
)
6191 mhioc_inkeys_t inkeys
;
6192 mhioc_key_list_t klist
;
6193 struct mhioc_inkeys32 inkeys32
;
6194 struct mhioc_key_list32 klist32
;
6195 sd_prin_readkeys_t
*scsi_keys
;
6198 int listsize
, listlen
, rv
;
6200 /* copyin arguments */
6201 if (ddi_model_convert_from(mode
& FMODELS
) == DDI_MODEL_ILP32
) {
6202 rv
= ddi_copyin(arg
, &inkeys32
, sizeof (inkeys32
), mode
);
6206 rv
= ddi_copyin((caddr_t
)(uintptr_t)inkeys32
.li
, &klist32
,
6207 sizeof (klist32
), mode
);
6211 listsize
= klist32
.listsize
;
6213 rv
= ddi_copyin(arg
, &inkeys
, sizeof (inkeys
), mode
);
6217 rv
= ddi_copyin(inkeys
.li
, &klist
, sizeof (klist
), mode
);
6221 listsize
= klist
.listsize
;
6224 /* build SCSI VD_OP request */
6225 vd_scsi
= vdc_scsi_alloc_persistent_in(SD_READ_KEYS
,
6226 sizeof (sd_prin_readkeys_t
) - sizeof (caddr_t
) +
6227 (sizeof (mhioc_resv_key_t
) * listsize
), &vd_scsi_len
);
6229 scsi_keys
= (sd_prin_readkeys_t
*)VD_SCSI_DATA_IN(vd_scsi
);
6231 /* submit the request */
6232 rv
= vdc_do_sync_op(vdc
, VD_OP_SCSICMD
, (caddr_t
)vd_scsi
, vd_scsi_len
,
6233 0, 0, VIO_both_dir
, B_FALSE
);
6238 listlen
= scsi_keys
->len
/ MHIOC_RESV_KEY_SIZE
;
6240 if (ddi_model_convert_from(mode
& FMODELS
) == DDI_MODEL_ILP32
) {
6241 inkeys32
.generation
= scsi_keys
->generation
;
6242 rv
= ddi_copyout(&inkeys32
, arg
, sizeof (inkeys32
), mode
);
6248 klist32
.listlen
= listlen
;
6249 rv
= ddi_copyout(&klist32
, (caddr_t
)(uintptr_t)inkeys32
.li
,
6250 sizeof (klist32
), mode
);
6256 user_keys
= (caddr_t
)(uintptr_t)klist32
.list
;
6258 inkeys
.generation
= scsi_keys
->generation
;
6259 rv
= ddi_copyout(&inkeys
, arg
, sizeof (inkeys
), mode
);
6265 klist
.listlen
= listlen
;
6266 rv
= ddi_copyout(&klist
, inkeys
.li
, sizeof (klist
), mode
);
6272 user_keys
= klist
.list
;
6276 if (listlen
> 0 && listsize
> 0) {
6277 if (listsize
< listlen
)
6279 rv
= ddi_copyout(&scsi_keys
->keylist
, user_keys
,
6280 listlen
* MHIOC_RESV_KEY_SIZE
, mode
);
6286 rv
= vdc_scsi_status(vdc
, vd_scsi
, B_FALSE
);
6289 kmem_free(vd_scsi
, vd_scsi_len
);
6295 * Implement the MHIOCGRP_INRESV mhd(7i) ioctl. The ioctl is converted
6296 * to a SCSI PERSISTENT IN READ RESERVATION command which is sent to
6297 * the vdisk server with a VD_OP_SCSICMD operation.
6300 vdc_mhd_inresv(vdc_t
*vdc
, caddr_t arg
, int mode
)
6303 mhioc_inresvs_t inresv
;
6304 mhioc_resv_desc_list_t rlist
;
6305 struct mhioc_inresvs32 inresv32
;
6306 struct mhioc_resv_desc_list32 rlist32
;
6307 mhioc_resv_desc_t mhd_resv
;
6308 sd_prin_readresv_t
*scsi_resv
;
6309 sd_readresv_desc_t
*resv
;
6310 mhioc_resv_desc_t
*user_resv
;
6312 int listsize
, listlen
, i
, rv
;
6314 /* copyin arguments */
6315 if (ddi_model_convert_from(mode
& FMODELS
) == DDI_MODEL_ILP32
) {
6316 rv
= ddi_copyin(arg
, &inresv32
, sizeof (inresv32
), mode
);
6320 rv
= ddi_copyin((caddr_t
)(uintptr_t)inresv32
.li
, &rlist32
,
6321 sizeof (rlist32
), mode
);
6325 listsize
= rlist32
.listsize
;
6327 rv
= ddi_copyin(arg
, &inresv
, sizeof (inresv
), mode
);
6331 rv
= ddi_copyin(inresv
.li
, &rlist
, sizeof (rlist
), mode
);
6335 listsize
= rlist
.listsize
;
6338 /* build SCSI VD_OP request */
6339 vd_scsi
= vdc_scsi_alloc_persistent_in(SD_READ_RESV
,
6340 sizeof (sd_prin_readresv_t
) - sizeof (caddr_t
) +
6341 (SCSI3_RESV_DESC_LEN
* listsize
), &vd_scsi_len
);
6343 scsi_resv
= (sd_prin_readresv_t
*)VD_SCSI_DATA_IN(vd_scsi
);
6345 /* submit the request */
6346 rv
= vdc_do_sync_op(vdc
, VD_OP_SCSICMD
, (caddr_t
)vd_scsi
, vd_scsi_len
,
6347 0, 0, VIO_both_dir
, B_FALSE
);
6352 listlen
= scsi_resv
->len
/ SCSI3_RESV_DESC_LEN
;
6354 if (ddi_model_convert_from(mode
& FMODELS
) == DDI_MODEL_ILP32
) {
6355 inresv32
.generation
= scsi_resv
->generation
;
6356 rv
= ddi_copyout(&inresv32
, arg
, sizeof (inresv32
), mode
);
6362 rlist32
.listlen
= listlen
;
6363 rv
= ddi_copyout(&rlist32
, (caddr_t
)(uintptr_t)inresv32
.li
,
6364 sizeof (rlist32
), mode
);
6370 user_resv
= (mhioc_resv_desc_t
*)(uintptr_t)rlist32
.list
;
6372 inresv
.generation
= scsi_resv
->generation
;
6373 rv
= ddi_copyout(&inresv
, arg
, sizeof (inresv
), mode
);
6379 rlist
.listlen
= listlen
;
6380 rv
= ddi_copyout(&rlist
, inresv
.li
, sizeof (rlist
), mode
);
6386 user_resv
= rlist
.list
;
6389 /* copy out reservations */
6390 if (listsize
> 0 && listlen
> 0) {
6391 if (listsize
< listlen
)
6393 resv
= (sd_readresv_desc_t
*)&scsi_resv
->readresv_desc
;
6395 for (i
= 0; i
< listlen
; i
++) {
6396 mhd_resv
.type
= resv
->type
;
6397 mhd_resv
.scope
= resv
->scope
;
6398 mhd_resv
.scope_specific_addr
=
6399 BE_32(resv
->scope_specific_addr
);
6400 bcopy(&resv
->resvkey
, &mhd_resv
.key
,
6401 MHIOC_RESV_KEY_SIZE
);
6403 rv
= ddi_copyout(&mhd_resv
, user_resv
,
6404 sizeof (mhd_resv
), mode
);
6415 rv
= vdc_scsi_status(vdc
, vd_scsi
, B_FALSE
);
6418 kmem_free(vd_scsi
, vd_scsi_len
);
6423 * Implement the MHIOCGRP_REGISTER mhd(7i) ioctl. The ioctl is converted
6424 * to a SCSI PERSISTENT OUT REGISTER command which is sent to the vdisk
6425 * server with a VD_OP_SCSICMD operation.
6428 vdc_mhd_register(vdc_t
*vdc
, caddr_t arg
, int mode
)
6431 sd_prout_t
*scsi_prout
;
6432 mhioc_register_t mhd_reg
;
6433 int vd_scsi_len
, rv
;
6435 /* copyin arguments */
6436 rv
= ddi_copyin(arg
, &mhd_reg
, sizeof (mhd_reg
), mode
);
6440 /* build SCSI VD_OP request */
6441 vd_scsi
= vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTER
,
6442 sizeof (sd_prout_t
), &vd_scsi_len
);
6444 /* set parameters */
6445 scsi_prout
= (sd_prout_t
*)VD_SCSI_DATA_OUT(vd_scsi
);
6446 bcopy(mhd_reg
.oldkey
.key
, scsi_prout
->res_key
, MHIOC_RESV_KEY_SIZE
);
6447 bcopy(mhd_reg
.newkey
.key
, scsi_prout
->service_key
, MHIOC_RESV_KEY_SIZE
);
6448 scsi_prout
->aptpl
= (uchar_t
)mhd_reg
.aptpl
;
6450 /* submit the request */
6451 rv
= vdc_do_sync_op(vdc
, VD_OP_SCSICMD
, (caddr_t
)vd_scsi
, vd_scsi_len
,
6452 0, 0, VIO_both_dir
, B_FALSE
);
6455 rv
= vdc_scsi_status(vdc
, vd_scsi
, B_FALSE
);
6457 kmem_free(vd_scsi
, vd_scsi_len
);
6462 * Implement the MHIOCGRP_RESERVE mhd(7i) ioctl. The ioctl is converted
6463 * to a SCSI PERSISTENT OUT RESERVE command which is sent to the vdisk
6464 * server with a VD_OP_SCSICMD operation.
6467 vdc_mhd_reserve(vdc_t
*vdc
, caddr_t arg
, int mode
)
6469 union scsi_cdb
*cdb
;
6471 sd_prout_t
*scsi_prout
;
6472 mhioc_resv_desc_t mhd_resv
;
6473 int vd_scsi_len
, rv
;
6475 /* copyin arguments */
6476 rv
= ddi_copyin(arg
, &mhd_resv
, sizeof (mhd_resv
), mode
);
6480 /* build SCSI VD_OP request */
6481 vd_scsi
= vdc_scsi_alloc_persistent_out(SD_SCSI3_RESERVE
,
6482 sizeof (sd_prout_t
), &vd_scsi_len
);
6484 /* set parameters */
6485 cdb
= VD_SCSI_DATA_CDB(vd_scsi
);
6486 scsi_prout
= (sd_prout_t
*)VD_SCSI_DATA_OUT(vd_scsi
);
6487 bcopy(mhd_resv
.key
.key
, scsi_prout
->res_key
, MHIOC_RESV_KEY_SIZE
);
6488 scsi_prout
->scope_address
= mhd_resv
.scope_specific_addr
;
6489 cdb
->cdb_opaque
[2] = mhd_resv
.type
;
6491 /* submit the request */
6492 rv
= vdc_do_sync_op(vdc
, VD_OP_SCSICMD
, (caddr_t
)vd_scsi
, vd_scsi_len
,
6493 0, 0, VIO_both_dir
, B_FALSE
);
6496 rv
= vdc_scsi_status(vdc
, vd_scsi
, B_FALSE
);
6498 kmem_free(vd_scsi
, vd_scsi_len
);
6503 * Implement the MHIOCGRP_PREEMPTANDABORT mhd(7i) ioctl. The ioctl is
6504 * converted to a SCSI PERSISTENT OUT PREEMPT AND ABORT command which
6505 * is sent to the vdisk server with a VD_OP_SCSICMD operation.
6508 vdc_mhd_preemptabort(vdc_t
*vdc
, caddr_t arg
, int mode
)
6510 union scsi_cdb
*cdb
;
6512 sd_prout_t
*scsi_prout
;
6513 mhioc_preemptandabort_t mhd_preempt
;
6514 int vd_scsi_len
, rv
;
6516 /* copyin arguments */
6517 rv
= ddi_copyin(arg
, &mhd_preempt
, sizeof (mhd_preempt
), mode
);
6521 /* build SCSI VD_OP request */
6522 vd_scsi
= vdc_scsi_alloc_persistent_out(SD_SCSI3_PREEMPTANDABORT
,
6523 sizeof (sd_prout_t
), &vd_scsi_len
);
6525 /* set parameters */
6526 vd_scsi
->task_attribute
= VD_SCSI_TASK_ACA
;
6527 cdb
= VD_SCSI_DATA_CDB(vd_scsi
);
6528 scsi_prout
= (sd_prout_t
*)VD_SCSI_DATA_OUT(vd_scsi
);
6529 bcopy(mhd_preempt
.resvdesc
.key
.key
, scsi_prout
->res_key
,
6530 MHIOC_RESV_KEY_SIZE
);
6531 bcopy(mhd_preempt
.victim_key
.key
, scsi_prout
->service_key
,
6532 MHIOC_RESV_KEY_SIZE
);
6533 scsi_prout
->scope_address
= mhd_preempt
.resvdesc
.scope_specific_addr
;
6534 cdb
->cdb_opaque
[2] = mhd_preempt
.resvdesc
.type
;
6536 /* submit the request */
6537 rv
= vdc_do_sync_op(vdc
, VD_OP_SCSICMD
, (caddr_t
)vd_scsi
, vd_scsi_len
,
6538 0, 0, VIO_both_dir
, B_FALSE
);
6541 rv
= vdc_scsi_status(vdc
, vd_scsi
, B_FALSE
);
6543 kmem_free(vd_scsi
, vd_scsi_len
);
6548 * Implement the MHIOCGRP_REGISTERANDIGNOREKEY mhd(7i) ioctl. The ioctl
6549 * is converted to a SCSI PERSISTENT OUT REGISTER AND IGNORE EXISTING KEY
6550 * command which is sent to the vdisk server with a VD_OP_SCSICMD operation.
6553 vdc_mhd_registerignore(vdc_t
*vdc
, caddr_t arg
, int mode
)
6556 sd_prout_t
*scsi_prout
;
6557 mhioc_registerandignorekey_t mhd_regi
;
6558 int vd_scsi_len
, rv
;
6560 /* copyin arguments */
6561 rv
= ddi_copyin(arg
, &mhd_regi
, sizeof (mhd_regi
), mode
);
6565 /* build SCSI VD_OP request */
6566 vd_scsi
= vdc_scsi_alloc_persistent_out(SD_SCSI3_REGISTERANDIGNOREKEY
,
6567 sizeof (sd_prout_t
), &vd_scsi_len
);
6569 /* set parameters */
6570 scsi_prout
= (sd_prout_t
*)VD_SCSI_DATA_OUT(vd_scsi
);
6571 bcopy(mhd_regi
.newkey
.key
, scsi_prout
->service_key
,
6572 MHIOC_RESV_KEY_SIZE
);
6573 scsi_prout
->aptpl
= (uchar_t
)mhd_regi
.aptpl
;
6575 /* submit the request */
6576 rv
= vdc_do_sync_op(vdc
, VD_OP_SCSICMD
, (caddr_t
)vd_scsi
, vd_scsi_len
,
6577 0, 0, VIO_both_dir
, B_FALSE
);
6580 rv
= vdc_scsi_status(vdc
, vd_scsi
, B_FALSE
);
6582 kmem_free(vd_scsi
, vd_scsi_len
);
6587 * This function is used to send a (simple) SCSI command and check errors.
6590 vdc_eio_scsi_cmd(vdc_t
*vdc
, uchar_t scmd
, int flags
)
6592 int cdb_len
, sense_len
, vd_scsi_len
;
6594 union scsi_cdb
*cdb
;
6597 ASSERT(scmd
== SCMD_TEST_UNIT_READY
|| scmd
== SCMD_WRITE_G1
);
6599 if (scmd
== SCMD_WRITE_G1
)
6600 cdb_len
= CDB_GROUP1
;
6602 cdb_len
= CDB_GROUP0
;
6604 sense_len
= sizeof (struct scsi_extended_sense
);
6606 vd_scsi
= vdc_scsi_alloc(cdb_len
, sense_len
, 0, 0, &vd_scsi_len
);
6609 cdb
= VD_SCSI_DATA_CDB(vd_scsi
);
6610 cdb
->scc_cmd
= scmd
;
6612 vd_scsi
->timeout
= vdc_scsi_timeout
;
6615 * Submit the request. Note the operation should not request that any
6616 * error is checked because this function is precisely called when
6619 ASSERT((flags
& VDC_OP_ERRCHK
) == 0);
6621 rv
= vdc_do_op(vdc
, VD_OP_SCSICMD
, (caddr_t
)vd_scsi
, vd_scsi_len
,
6622 0, 0, NULL
, VIO_both_dir
, flags
);
6625 rv
= vdc_scsi_status(vdc
, vd_scsi
, B_FALSE
);
6627 kmem_free(vd_scsi
, vd_scsi_len
);
6632 * This function is used to check if a SCSI backend is accessible. It will
6633 * also detect reservation conflict if failfast is enabled, and panic the
6634 * system in that case.
6637 * 0 - disk is accessible
6638 * != 0 - disk is inaccessible or unable to check if disk is accessible
6641 vdc_eio_scsi_check(vdc_t
*vdc
, int flags
)
6647 * Send a TEST UNIT READY command. The command will panic
6648 * the system if it fails with a reservation conflict and
6649 * failfast is enabled. If there is a reservation conflict
6650 * and failfast is not enabled then the function will return
6651 * EACCES. In that case, there's no problem with accessing
6652 * the backend, it is just reserved.
6654 rv
= vdc_eio_scsi_cmd(vdc
, SCMD_TEST_UNIT_READY
, flags
);
6655 if (rv
!= 0 && rv
!= EACCES
)
6658 /* we don't need to do more checking if failfast is not enabled */
6659 if (vdc
->failfast_interval
== 0)
6663 * With SPC-3 compliant devices TEST UNIT READY will succeed on
6664 * a reserved device, so we also do a WRITE(10) of zero byte in
6665 * order to provoke a Reservation Conflict status on those newer
6668 if (vdc_eio_scsi_cmd(vdc
, SCMD_WRITE_G1
, flags
) != 0)
6675 * This function is used to check if a backend is effectively accessible.
6678 * 0 - disk is accessible
6679 * != 0 - disk is inaccessible or unable to check if disk is accessible
6682 vdc_eio_check(vdc_t
*vdc
, int flags
)
6688 ASSERT((flags
& VDC_OP_ERRCHK
) == 0);
6690 flags
|= VDC_OP_DRING_RESERVED
;
6692 if (VD_OP_SUPPORTED(vdc
->operations
, VD_OP_SCSICMD
))
6693 return (vdc_eio_scsi_check(vdc
, flags
));
6695 ASSERT(vdc
->failfast_interval
== 0);
6698 * If the backend does not support SCSI operations then we simply
6699 * check if the backend is accessible by reading some data blocks.
6700 * We first try to read a random block, to try to avoid getting
6701 * a block that might have been cached on the service domain. Then
6702 * we try the last block, and finally the first block.
6704 * We return success as soon as we are able to read any block.
6706 buffer
= kmem_alloc(vdc
->vdisk_bsize
, KM_SLEEP
);
6708 if (vdc
->vdisk_size
> 0) {
6710 /* try a random block */
6711 (void) random_get_pseudo_bytes((uint8_t *)&blkno
,
6712 sizeof (diskaddr_t
));
6713 blkno
= blkno
% vdc
->vdisk_size
;
6714 rv
= vdc_do_op(vdc
, VD_OP_BREAD
, (caddr_t
)buffer
,
6715 vdc
->vdisk_bsize
, VD_SLICE_NONE
, blkno
, NULL
,
6716 VIO_read_dir
, flags
);
6721 /* try the last block */
6722 blkno
= vdc
->vdisk_size
- 1;
6723 rv
= vdc_do_op(vdc
, VD_OP_BREAD
, (caddr_t
)buffer
,
6724 vdc
->vdisk_bsize
, VD_SLICE_NONE
, blkno
, NULL
,
6725 VIO_read_dir
, flags
);
6733 rv
= vdc_do_op(vdc
, VD_OP_BREAD
, (caddr_t
)buffer
, vdc
->vdisk_bsize
,
6734 VD_SLICE_NONE
, blkno
, NULL
, VIO_read_dir
, flags
);
6737 kmem_free(buffer
, vdc
->vdisk_bsize
);
6742 * Add a pending I/O to the eio queue. An I/O is added to this queue
6743 * when it has failed and failfast is enabled or the vdisk has multiple
6744 * servers. It will then be handled by the eio thread (vdc_eio_thread).
6745 * The eio queue is ordered starting with the most recent I/O added.
6748 vdc_eio_queue(vdc_t
*vdc
, int index
)
6752 ASSERT(MUTEX_HELD(&vdc
->lock
));
6754 vio
= kmem_alloc(sizeof (vdc_io_t
), KM_SLEEP
);
6755 vio
->vio_next
= vdc
->eio_queue
;
6756 vio
->vio_index
= index
;
6757 vio
->vio_qtime
= ddi_get_lbolt();
6759 vdc
->eio_queue
= vio
;
6761 /* notify the eio thread that a new I/O is queued */
6762 cv_signal(&vdc
->eio_cv
);
6768 * Remove I/Os added before the indicated deadline from the eio queue. A
6769 * deadline of 0 means that all I/Os have to be unqueued. The complete_io
6770 * boolean specifies if unqueued I/Os should be marked as completed or not.
6773 vdc_eio_unqueue(vdc_t
*vdc
, clock_t deadline
, boolean_t complete_io
)
6776 vdc_io_t
*vio
, *vio_tmp
;
6779 ASSERT(MUTEX_HELD(&vdc
->lock
));
6782 vio
= vdc
->eio_queue
;
6784 if (deadline
!= 0) {
6786 * Skip any io queued after the deadline. The eio queue is
6787 * ordered starting with the last I/O added to the queue.
6789 while (vio
!= NULL
&& vio
->vio_qtime
> deadline
) {
6791 vio
= vio
->vio_next
;
6796 /* nothing to unqueue */
6799 /* update the queue */
6800 if (vio_tmp
== NULL
)
6801 vdc
->eio_queue
= NULL
;
6803 vio_tmp
->vio_next
= NULL
;
6806 * Free and complete unqueued I/Os if this was requested. All I/Os
6807 * have a block I/O data transfer structure (buf) and they are
6808 * completed by calling biodone().
6810 while (vio
!= NULL
) {
6811 vio_tmp
= vio
->vio_next
;
6814 index
= vio
->vio_index
;
6815 op
= vdc
->local_dring
[index
].operation
;
6816 buf
= vdc
->local_dring
[index
].buf
;
6817 (void) vdc_depopulate_descriptor(vdc
, index
);
6818 ASSERT(buf
->b_flags
& B_ERROR
);
6819 if (op
== VD_OP_BREAD
|| op
== VD_OP_BWRITE
) {
6820 VD_UPDATE_ERR_STATS(vdc
, vd_softerrs
);
6821 VD_KSTAT_RUNQ_EXIT(vdc
);
6822 DTRACE_IO1(done
, buf_t
*, buf
);
6827 kmem_free(vio
, sizeof (vdc_io_t
));
6833 * Error I/O Thread. There is one eio thread for each virtual disk that
6834 * has multiple servers or for which failfast is enabled. Failfast can only
6835 * be enabled for vdisk supporting SCSI commands.
6837 * While failfast is enabled, the eio thread sends a TEST UNIT READY
6838 * and a zero size WRITE(10) SCSI commands on a regular basis to check that
6839 * we still have access to the disk. If a command fails with a RESERVATION
6840 * CONFLICT error then the system will immediately panic.
6842 * The eio thread is also woken up when an I/O has failed. It then checks
6843 * the access to the disk to ensure that the I/O failure was not due to a
6844 * reservation conflict or to the backend being inaccessible.
6848 vdc_eio_thread(void *arg
)
6851 vdc_t
*vdc
= (vdc_t
*)arg
;
6852 clock_t starttime
, timeout
= drv_usectohz(vdc
->failfast_interval
);
6854 mutex_enter(&vdc
->lock
);
6856 while (vdc
->failfast_interval
!= 0 || vdc
->num_servers
> 1) {
6858 * Wait if there is nothing in the eio queue or if the state
6859 * is not VDC_STATE_RUNNING.
6861 if (vdc
->eio_queue
== NULL
|| vdc
->state
!= VDC_STATE_RUNNING
) {
6862 if (vdc
->failfast_interval
!= 0) {
6863 timeout
= ddi_get_lbolt() +
6864 drv_usectohz(vdc
->failfast_interval
);
6865 (void) cv_timedwait(&vdc
->eio_cv
, &vdc
->lock
,
6868 ASSERT(vdc
->num_servers
> 1);
6869 (void) cv_wait(&vdc
->eio_cv
, &vdc
->lock
);
6872 if (vdc
->state
!= VDC_STATE_RUNNING
)
6876 mutex_exit(&vdc
->lock
);
6878 starttime
= ddi_get_lbolt();
6881 status
= vdc_eio_check(vdc
, VDC_OP_STATE_RUNNING
);
6883 mutex_enter(&vdc
->lock
);
6885 * We have dropped the lock to check the backend so we have
6886 * to check that the eio thread is still enabled.
6888 if (vdc
->failfast_interval
== 0 && vdc
->num_servers
<= 1)
6892 * If the eio queue is empty or we are not in running state
6893 * anymore then there is nothing to do.
6895 if (vdc
->state
!= VDC_STATE_RUNNING
|| vdc
->eio_queue
== NULL
)
6900 * The backend access has been successfully checked,
6901 * we can complete any I/O queued before the last check.
6903 vdc_eio_unqueue(vdc
, starttime
, B_TRUE
);
6905 } else if (vdc
->num_servers
> 1) {
6907 * The backend is inaccessible for a disk with multiple
6908 * servers. So we force a reset to switch to another
6909 * server. The reset will also clear the eio queue and
6910 * resubmit all pending I/Os.
6912 mutex_enter(&vdc
->read_lock
);
6913 vdc
->read_state
= VDC_READ_RESET
;
6914 cv_signal(&vdc
->read_cv
);
6915 mutex_exit(&vdc
->read_lock
);
6918 * There is only one path and the backend is not
6919 * accessible, so I/Os are actually failing because
6920 * of that. So we can complete I/O queued before the
6923 vdc_eio_unqueue(vdc
, starttime
, B_TRUE
);
6928 * The thread is being stopped so we can complete any queued I/O.
6930 vdc_eio_unqueue(vdc
, 0, B_TRUE
);
6931 vdc
->eio_thread
= NULL
;
6932 mutex_exit(&vdc
->lock
);
6937 * Implement the MHIOCENFAILFAST mhd(7i) ioctl.
6940 vdc_failfast(vdc_t
*vdc
, caddr_t arg
, int mode
)
6942 unsigned int mh_time
;
6944 if (ddi_copyin((void *)arg
, &mh_time
, sizeof (int), mode
))
6947 mutex_enter(&vdc
->lock
);
6948 if (mh_time
!= 0 && vdc
->eio_thread
== NULL
) {
6949 vdc
->eio_thread
= thread_create(NULL
, 0,
6950 vdc_eio_thread
, vdc
, 0, &p0
, TS_RUN
,
6954 vdc
->failfast_interval
= ((long)mh_time
) * MILLISEC
;
6955 cv_signal(&vdc
->eio_cv
);
6956 mutex_exit(&vdc
->lock
);
6962 * Implement the MHIOCTKOWN and MHIOCRELEASE mhd(7i) ioctls. These ioctls are
6963 * converted to VD_OP_SET_ACCESS operations.
6966 vdc_access_set(vdc_t
*vdc
, uint64_t flags
)
6970 /* submit ownership command request */
6971 rv
= vdc_do_sync_op(vdc
, VD_OP_SET_ACCESS
, (caddr_t
)&flags
,
6972 sizeof (uint64_t), 0, 0, VIO_both_dir
, B_TRUE
);
6978 * Implement the MHIOCSTATUS mhd(7i) ioctl. This ioctl is converted to a
6979 * VD_OP_GET_ACCESS operation.
6982 vdc_access_get(vdc_t
*vdc
, uint64_t *status
)
6986 /* submit ownership command request */
6987 rv
= vdc_do_sync_op(vdc
, VD_OP_GET_ACCESS
, (caddr_t
)status
,
6988 sizeof (uint64_t), 0, 0, VIO_both_dir
, B_TRUE
);
6994 * Disk Ownership Thread.
6996 * When we have taken the ownership of a disk, this thread waits to be
6997 * notified when the LDC channel is reset so that it can recover the
7000 * Note that the thread handling the LDC reset (vdc_process_msg_thread())
7001 * can not be used to do the ownership recovery because it has to be
7002 * running to handle the reply message to the ownership operation.
7005 vdc_ownership_thread(void *arg
)
7007 vdc_t
*vdc
= (vdc_t
*)arg
;
7011 mutex_enter(&vdc
->ownership_lock
);
7012 mutex_enter(&vdc
->lock
);
7014 while (vdc
->ownership
& VDC_OWNERSHIP_WANTED
) {
7016 if ((vdc
->ownership
& VDC_OWNERSHIP_RESET
) ||
7017 !(vdc
->ownership
& VDC_OWNERSHIP_GRANTED
)) {
7019 * There was a reset so the ownership has been lost,
7020 * try to recover. We do this without using the preempt
7021 * option so that we don't steal the ownership from
7022 * someone who has preempted us.
7024 DMSG(vdc
, 0, "[%d] Ownership lost, recovering",
7027 vdc
->ownership
&= ~(VDC_OWNERSHIP_RESET
|
7028 VDC_OWNERSHIP_GRANTED
);
7030 mutex_exit(&vdc
->lock
);
7032 status
= vdc_access_set(vdc
, VD_ACCESS_SET_EXCLUSIVE
|
7033 VD_ACCESS_SET_PRESERVE
);
7035 mutex_enter(&vdc
->lock
);
7038 DMSG(vdc
, 0, "[%d] Ownership recovered",
7040 vdc
->ownership
|= VDC_OWNERSHIP_GRANTED
;
7042 DMSG(vdc
, 0, "[%d] Fail to recover ownership",
7049 * If we have the ownership then we just wait for an event
7050 * to happen (LDC reset), otherwise we will retry to recover
7053 if (vdc
->ownership
& VDC_OWNERSHIP_GRANTED
)
7056 timeout
= drv_usectohz(vdc_ownership_delay
);
7058 /* Release the ownership_lock and wait on the vdc lock */
7059 mutex_exit(&vdc
->ownership_lock
);
7062 (void) cv_wait(&vdc
->ownership_cv
, &vdc
->lock
);
7064 (void) cv_reltimedwait(&vdc
->ownership_cv
, &vdc
->lock
,
7065 timeout
, TR_CLOCK_TICK
);
7067 mutex_exit(&vdc
->lock
);
7069 mutex_enter(&vdc
->ownership_lock
);
7070 mutex_enter(&vdc
->lock
);
7073 vdc
->ownership_thread
= NULL
;
7074 mutex_exit(&vdc
->lock
);
7075 mutex_exit(&vdc
->ownership_lock
);
7081 vdc_ownership_update(vdc_t
*vdc
, int ownership_flags
)
7083 ASSERT(MUTEX_HELD(&vdc
->ownership_lock
));
7085 mutex_enter(&vdc
->lock
);
7086 vdc
->ownership
= ownership_flags
;
7087 if ((vdc
->ownership
& VDC_OWNERSHIP_WANTED
) &&
7088 vdc
->ownership_thread
== NULL
) {
7089 /* start ownership thread */
7090 vdc
->ownership_thread
= thread_create(NULL
, 0,
7091 vdc_ownership_thread
, vdc
, 0, &p0
, TS_RUN
,
7094 /* notify the ownership thread */
7095 cv_signal(&vdc
->ownership_cv
);
7097 mutex_exit(&vdc
->lock
);
7101 * Get the size and the block size of a virtual disk from the vdisk server.
7104 vdc_get_capacity(vdc_t
*vdc
, size_t *dsk_size
, size_t *blk_size
)
7108 vd_capacity_t
*vd_cap
;
7110 ASSERT(MUTEX_NOT_HELD(&vdc
->lock
));
7112 alloc_len
= P2ROUNDUP(sizeof (vd_capacity_t
), sizeof (uint64_t));
7114 vd_cap
= kmem_zalloc(alloc_len
, KM_SLEEP
);
7116 rv
= vdc_do_sync_op(vdc
, VD_OP_GET_CAPACITY
, (caddr_t
)vd_cap
, alloc_len
,
7117 0, 0, VIO_both_dir
, B_TRUE
);
7119 *dsk_size
= vd_cap
->vdisk_size
;
7120 *blk_size
= vd_cap
->vdisk_block_size
;
7122 kmem_free(vd_cap
, alloc_len
);
7127 * Check the disk capacity. Disk size information is updated if size has
7130 * Return 0 if the disk capacity is available, or non-zero if it is not.
7133 vdc_check_capacity(vdc_t
*vdc
)
7135 size_t dsk_size
, blk_size
;
7139 * If the vdisk does not support the VD_OP_GET_CAPACITY operation
7140 * then the disk capacity has been retrieved during the handshake
7141 * and there's nothing more to do here.
7143 if (!VD_OP_SUPPORTED(vdc
->operations
, VD_OP_GET_CAPACITY
))
7146 if ((rv
= vdc_get_capacity(vdc
, &dsk_size
, &blk_size
)) != 0)
7149 if (dsk_size
== VD_SIZE_UNKNOWN
|| dsk_size
== 0 || blk_size
== 0)
7152 mutex_enter(&vdc
->lock
);
7154 * First try to update the VIO block size (which is the same as the
7155 * vdisk block size). If this returns an error then that means that
7156 * we can not use that block size so basically the vdisk is unusable
7157 * and we return an error.
7159 rv
= vdc_update_vio_bsize(vdc
, blk_size
);
7161 vdc_update_size(vdc
, dsk_size
, blk_size
, vdc
->max_xfer_sz
);
7163 mutex_exit(&vdc
->lock
);
7169 * This structure is used in the DKIO(7I) array below.
7171 typedef struct vdc_dk_ioctl
{
7172 uint8_t op
; /* VD_OP_XXX value */
7173 int cmd
; /* Solaris ioctl operation number */
7174 size_t nbytes
; /* size of structure to be copied */
7176 /* function to convert between vDisk and Solaris structure formats */
7177 int (*convert
)(vdc_t
*vdc
, void *vd_buf
, void *ioctl_arg
,
7182 * Subset of DKIO(7I) operations currently supported
7184 static vdc_dk_ioctl_t dk_ioctl
[] = {
7185 {VD_OP_FLUSH
, DKIOCFLUSHWRITECACHE
, 0,
7186 vdc_null_copy_func
},
7187 {VD_OP_GET_WCE
, DKIOCGETWCE
, sizeof (int),
7188 vdc_get_wce_convert
},
7189 {VD_OP_SET_WCE
, DKIOCSETWCE
, sizeof (int),
7190 vdc_set_wce_convert
},
7191 {VD_OP_GET_VTOC
, DKIOCGVTOC
, sizeof (vd_vtoc_t
),
7192 vdc_get_vtoc_convert
},
7193 {VD_OP_SET_VTOC
, DKIOCSVTOC
, sizeof (vd_vtoc_t
),
7194 vdc_set_vtoc_convert
},
7195 {VD_OP_GET_VTOC
, DKIOCGEXTVTOC
, sizeof (vd_vtoc_t
),
7196 vdc_get_extvtoc_convert
},
7197 {VD_OP_SET_VTOC
, DKIOCSEXTVTOC
, sizeof (vd_vtoc_t
),
7198 vdc_set_extvtoc_convert
},
7199 {VD_OP_GET_DISKGEOM
, DKIOCGGEOM
, sizeof (vd_geom_t
),
7200 vdc_get_geom_convert
},
7201 {VD_OP_GET_DISKGEOM
, DKIOCG_PHYGEOM
, sizeof (vd_geom_t
),
7202 vdc_get_geom_convert
},
7203 {VD_OP_GET_DISKGEOM
, DKIOCG_VIRTGEOM
, sizeof (vd_geom_t
),
7204 vdc_get_geom_convert
},
7205 {VD_OP_SET_DISKGEOM
, DKIOCSGEOM
, sizeof (vd_geom_t
),
7206 vdc_set_geom_convert
},
7207 {VD_OP_GET_EFI
, DKIOCGETEFI
, 0,
7208 vdc_get_efi_convert
},
7209 {VD_OP_SET_EFI
, DKIOCSETEFI
, 0,
7210 vdc_set_efi_convert
},
7212 /* DIOCTL_RWCMD is converted to a read or a write */
7213 {0, DIOCTL_RWCMD
, sizeof (struct dadkio_rwcmd
), NULL
},
7215 /* mhd(7I) non-shared multihost disks ioctls */
7216 {0, MHIOCTKOWN
, 0, vdc_null_copy_func
},
7217 {0, MHIOCRELEASE
, 0, vdc_null_copy_func
},
7218 {0, MHIOCSTATUS
, 0, vdc_null_copy_func
},
7219 {0, MHIOCQRESERVE
, 0, vdc_null_copy_func
},
7221 /* mhd(7I) shared multihost disks ioctls */
7222 {0, MHIOCGRP_INKEYS
, 0, vdc_null_copy_func
},
7223 {0, MHIOCGRP_INRESV
, 0, vdc_null_copy_func
},
7224 {0, MHIOCGRP_REGISTER
, 0, vdc_null_copy_func
},
7225 {0, MHIOCGRP_RESERVE
, 0, vdc_null_copy_func
},
7226 {0, MHIOCGRP_PREEMPTANDABORT
, 0, vdc_null_copy_func
},
7227 {0, MHIOCGRP_REGISTERANDIGNOREKEY
, 0, vdc_null_copy_func
},
7229 /* mhd(7I) failfast ioctl */
7230 {0, MHIOCENFAILFAST
, 0, vdc_null_copy_func
},
7233 * These particular ioctls are not sent to the server - vdc fakes up
7234 * the necessary info.
7236 {0, DKIOCINFO
, sizeof (struct dk_cinfo
), vdc_null_copy_func
},
7237 {0, DKIOCGMEDIAINFO
, sizeof (struct dk_minfo
), vdc_null_copy_func
},
7238 {0, USCSICMD
, sizeof (struct uscsi_cmd
), vdc_null_copy_func
},
7239 {0, DKIOCPARTITION
, 0, vdc_null_copy_func
},
7240 {0, DKIOCGAPART
, 0, vdc_null_copy_func
},
7241 {0, DKIOCREMOVABLE
, 0, vdc_null_copy_func
},
7242 {0, CDROMREADOFFSET
, 0, vdc_null_copy_func
}
7246 * This function handles ioctl requests from the vd_efi_alloc_and_read()
7247 * function and forward them to the vdisk.
7250 vd_process_efi_ioctl(void *vdisk
, int cmd
, uintptr_t arg
)
7252 vdc_t
*vdc
= (vdc_t
*)vdisk
;
7256 dev
= makedevice(ddi_driver_major(vdc
->dip
),
7257 VD_MAKE_DEV(vdc
->instance
, 0));
7259 return (vd_process_ioctl(dev
, cmd
, (caddr_t
)arg
, FKIOCTL
, &rval
));
7264 * vd_process_ioctl()
7267 * This routine processes disk specific ioctl calls
7270 * dev - the device number
7271 * cmd - the operation [dkio(7I)] to be processed
7272 * arg - pointer to user provided structure
7273 * (contains data to be set or reference parameter for get)
7274 * mode - bit flag, indicating open settings, 32/64 bit type, etc
7275 * rvalp - pointer to return value for calling process.
7285 vd_process_ioctl(dev_t dev
, int cmd
, caddr_t arg
, int mode
, int *rvalp
)
7287 int instance
= VDCUNIT(dev
);
7290 int idx
= 0; /* index into dk_ioctl[] */
7291 size_t len
= 0; /* #bytes to send to vds */
7292 size_t alloc_len
= 0; /* #bytes to allocate mem for */
7293 caddr_t mem_p
= NULL
;
7294 size_t nioctls
= (sizeof (dk_ioctl
)) / (sizeof (dk_ioctl
[0]));
7295 vdc_dk_ioctl_t
*iop
;
7297 vdc
= ddi_get_soft_state(vdc_state
, instance
);
7299 cmn_err(CE_NOTE
, "![%d] Could not get soft state structure",
7304 DMSG(vdc
, 0, "[%d] Processing ioctl(%x) for dev %lx : model %x\n",
7305 instance
, cmd
, dev
, ddi_model_convert_from(mode
& FMODELS
));
7307 if (rvalp
!= NULL
) {
7308 /* the return value of the ioctl is 0 by default */
7313 * Validate the ioctl operation to be performed.
7315 * If we have looped through the array without finding a match then we
7316 * don't support this ioctl.
7318 for (idx
= 0; idx
< nioctls
; idx
++) {
7319 if (cmd
== dk_ioctl
[idx
].cmd
)
7323 if (idx
>= nioctls
) {
7324 DMSG(vdc
, 0, "[%d] Unsupported ioctl (0x%x)\n",
7325 vdc
->instance
, cmd
);
7329 iop
= &(dk_ioctl
[idx
]);
7331 if (cmd
== DKIOCGETEFI
|| cmd
== DKIOCSETEFI
) {
7332 /* size is not fixed for EFI ioctls, it depends on ioctl arg */
7335 rv
= ddi_copyin(arg
, &dk_efi
, sizeof (dk_efi_t
), mode
);
7339 len
= sizeof (vd_efi_t
) - 1 + dk_efi
.dki_length
;
7344 /* check if the ioctl is applicable */
7346 case CDROMREADOFFSET
:
7347 case DKIOCREMOVABLE
:
7355 case MHIOCGRP_INKEYS
:
7356 case MHIOCGRP_INRESV
:
7357 case MHIOCGRP_REGISTER
:
7358 case MHIOCGRP_RESERVE
:
7359 case MHIOCGRP_PREEMPTANDABORT
:
7360 case MHIOCGRP_REGISTERANDIGNOREKEY
:
7361 case MHIOCENFAILFAST
:
7362 if (vdc
->cinfo
== NULL
)
7364 if (vdc
->cinfo
->dki_ctype
!= DKC_SCSI_CCS
)
7369 if (vdc
->cinfo
== NULL
)
7371 if (vdc
->cinfo
->dki_ctype
!= DKC_DIRECT
)
7376 if (vdc
->cinfo
== NULL
)
7380 case DKIOCGMEDIAINFO
:
7381 if (vdc
->minfo
== NULL
)
7383 if (vdc_check_capacity(vdc
) != 0)
7384 /* disk capacity is not available */
7390 * Deal with ioctls which require a processing different than
7391 * converting ioctl arguments and sending a corresponding
7398 return (vdc_uscsi_cmd(vdc
, arg
, mode
));
7403 mutex_enter(&vdc
->ownership_lock
);
7405 * We have to set VDC_OWNERSHIP_WANTED now so that the ownership
7406 * can be flagged with VDC_OWNERSHIP_RESET if the LDC is reset
7407 * while we are processing the ioctl.
7409 vdc_ownership_update(vdc
, VDC_OWNERSHIP_WANTED
);
7411 rv
= vdc_access_set(vdc
, VD_ACCESS_SET_EXCLUSIVE
|
7412 VD_ACCESS_SET_PREEMPT
| VD_ACCESS_SET_PRESERVE
);
7414 vdc_ownership_update(vdc
, VDC_OWNERSHIP_WANTED
|
7415 VDC_OWNERSHIP_GRANTED
);
7417 vdc_ownership_update(vdc
, VDC_OWNERSHIP_NONE
);
7419 mutex_exit(&vdc
->ownership_lock
);
7425 mutex_enter(&vdc
->ownership_lock
);
7426 rv
= vdc_access_set(vdc
, VD_ACCESS_SET_CLEAR
);
7428 vdc_ownership_update(vdc
, VDC_OWNERSHIP_NONE
);
7430 mutex_exit(&vdc
->ownership_lock
);
7438 rv
= vdc_access_get(vdc
, &status
);
7439 if (rv
== 0 && rvalp
!= NULL
)
7440 *rvalp
= (status
& VD_ACCESS_ALLOWED
)? 0 : 1;
7446 rv
= vdc_access_set(vdc
, VD_ACCESS_SET_EXCLUSIVE
);
7450 case MHIOCGRP_INKEYS
:
7452 return (vdc_mhd_inkeys(vdc
, arg
, mode
));
7455 case MHIOCGRP_INRESV
:
7457 return (vdc_mhd_inresv(vdc
, arg
, mode
));
7460 case MHIOCGRP_REGISTER
:
7462 return (vdc_mhd_register(vdc
, arg
, mode
));
7465 case MHIOCGRP_RESERVE
:
7467 return (vdc_mhd_reserve(vdc
, arg
, mode
));
7470 case MHIOCGRP_PREEMPTANDABORT
:
7472 return (vdc_mhd_preemptabort(vdc
, arg
, mode
));
7475 case MHIOCGRP_REGISTERANDIGNOREKEY
:
7477 return (vdc_mhd_registerignore(vdc
, arg
, mode
));
7480 case MHIOCENFAILFAST
:
7482 rv
= vdc_failfast(vdc
, arg
, mode
);
7488 return (vdc_dioctl_rwcmd(vdc
, arg
, mode
));
7493 return (vdc_dkio_gapart(vdc
, arg
, mode
));
7496 case DKIOCPARTITION
:
7498 return (vdc_dkio_partition(vdc
, arg
, mode
));
7503 struct dk_cinfo cinfo
;
7505 bcopy(vdc
->cinfo
, &cinfo
, sizeof (struct dk_cinfo
));
7506 cinfo
.dki_partition
= VDCPART(dev
);
7508 rv
= ddi_copyout(&cinfo
, (void *)arg
,
7509 sizeof (struct dk_cinfo
), mode
);
7516 case DKIOCGMEDIAINFO
:
7518 ASSERT(vdc
->vdisk_size
!= 0);
7519 ASSERT(vdc
->minfo
->dki_capacity
!= 0);
7520 rv
= ddi_copyout(vdc
->minfo
, (void *)arg
,
7521 sizeof (struct dk_minfo
), mode
);
7528 case DKIOCFLUSHWRITECACHE
:
7530 struct dk_callback
*dkc
=
7531 (struct dk_callback
*)(uintptr_t)arg
;
7532 vdc_dk_arg_t
*dkarg
= NULL
;
7534 DMSG(vdc
, 1, "[%d] Flush W$: mode %x\n",
7538 * If arg is NULL, then there is no callback function
7539 * registered and the call operates synchronously; we
7540 * break and continue with the rest of the function and
7541 * wait for vds to return (i.e. after the request to
7542 * vds returns successfully, all writes completed prior
7543 * to the ioctl will have been flushed from the disk
7544 * write cache to persistent media).
7546 * If a callback function is registered, we dispatch
7547 * the request on a task queue and return immediately.
7548 * The callback will deal with informing the calling
7549 * thread that the flush request is completed.
7555 * the asynchronous callback is only supported if
7556 * invoked from within the kernel
7558 if ((mode
& FKIOCTL
) == 0)
7561 dkarg
= kmem_zalloc(sizeof (vdc_dk_arg_t
), KM_SLEEP
);
7565 bcopy(dkc
, &dkarg
->dkc
, sizeof (*dkc
));
7567 mutex_enter(&vdc
->lock
);
7568 vdc
->dkio_flush_pending
++;
7570 mutex_exit(&vdc
->lock
);
7572 /* put the request on a task queue */
7573 rv
= taskq_dispatch(system_taskq
, vdc_dkio_flush_cb
,
7574 (void *)dkarg
, DDI_SLEEP
);
7576 /* clean up if dispatch fails */
7577 mutex_enter(&vdc
->lock
);
7578 vdc
->dkio_flush_pending
--;
7579 mutex_exit(&vdc
->lock
);
7580 kmem_free(dkarg
, sizeof (vdc_dk_arg_t
));
7583 return (rv
== NULL
? ENOMEM
: 0);
7587 /* catch programming error in vdc - should be a VD_OP_XXX ioctl */
7588 ASSERT(iop
->op
!= 0);
7590 /* check if the vDisk server handles the operation for this vDisk */
7591 if (VD_OP_SUPPORTED(vdc
->operations
, iop
->op
) == B_FALSE
) {
7592 DMSG(vdc
, 0, "[%d] Unsupported VD_OP operation (0x%x)\n",
7593 vdc
->instance
, iop
->op
);
7597 /* LDC requires that the memory being mapped is 8-byte aligned */
7598 alloc_len
= P2ROUNDUP(len
, sizeof (uint64_t));
7599 DMSG(vdc
, 1, "[%d] struct size %ld alloc %ld\n",
7600 instance
, len
, alloc_len
);
7603 mem_p
= kmem_zalloc(alloc_len
, KM_SLEEP
);
7606 * Call the conversion function for this ioctl which, if necessary,
7607 * converts from the Solaris format to the format ARC'ed
7608 * as part of the vDisk protocol (FWARC 2006/195)
7610 ASSERT(iop
->convert
!= NULL
);
7611 rv
= (iop
->convert
)(vdc
, arg
, mem_p
, mode
, VD_COPYIN
);
7613 DMSG(vdc
, 0, "[%d] convert func returned %d for ioctl 0x%x\n",
7616 kmem_free(mem_p
, alloc_len
);
7621 * send request to vds to service the ioctl.
7623 rv
= vdc_do_sync_op(vdc
, iop
->op
, mem_p
, alloc_len
,
7624 VDCPART(dev
), 0, VIO_both_dir
, B_TRUE
);
7628 * This is not necessarily an error. The ioctl could
7629 * be returning a value such as ENOTTY to indicate
7630 * that the ioctl is not applicable.
7632 DMSG(vdc
, 0, "[%d] vds returned %d for ioctl 0x%x\n",
7635 kmem_free(mem_p
, alloc_len
);
7641 * Call the conversion function (if it exists) for this ioctl
7642 * which converts from the format ARC'ed as part of the vDisk
7643 * protocol (FWARC 2006/195) back to a format understood by
7644 * the rest of Solaris.
7646 rv
= (iop
->convert
)(vdc
, mem_p
, arg
, mode
, VD_COPYOUT
);
7648 DMSG(vdc
, 0, "[%d] convert func returned %d for ioctl 0x%x\n",
7651 kmem_free(mem_p
, alloc_len
);
7656 kmem_free(mem_p
, alloc_len
);
7665 * This is an empty conversion function used by ioctl calls which
7666 * do not need to convert the data being passed in/out to userland
7669 vdc_null_copy_func(vdc_t
*vdc
, void *from
, void *to
, int mode
, int dir
)
7671 _NOTE(ARGUNUSED(vdc
))
7672 _NOTE(ARGUNUSED(from
))
7673 _NOTE(ARGUNUSED(to
))
7674 _NOTE(ARGUNUSED(mode
))
7675 _NOTE(ARGUNUSED(dir
))
7681 vdc_get_wce_convert(vdc_t
*vdc
, void *from
, void *to
,
7684 _NOTE(ARGUNUSED(vdc
))
7686 if (dir
== VD_COPYIN
)
7687 return (0); /* nothing to do */
7689 if (ddi_copyout(from
, to
, sizeof (int), mode
) != 0)
7696 vdc_set_wce_convert(vdc_t
*vdc
, void *from
, void *to
,
7699 _NOTE(ARGUNUSED(vdc
))
7701 if (dir
== VD_COPYOUT
)
7702 return (0); /* nothing to do */
7704 if (ddi_copyin(from
, to
, sizeof (int), mode
) != 0)
7712 * vdc_get_vtoc_convert()
7715 * This routine performs the necessary conversions from the DKIOCGVTOC
7716 * Solaris structure to the format defined in FWARC 2006/195.
7718 * In the struct vtoc definition, the timestamp field is marked as not
7719 * supported so it is not part of vDisk protocol (FWARC 2006/195).
7720 * However SVM uses that field to check it can write into the VTOC,
7721 * so we fake up the info of that field.
7724 * vdc - the vDisk client
7725 * from - the buffer containing the data to be copied from
7726 * to - the buffer to be copied to
7727 * mode - flags passed to ioctl() call
7728 * dir - the "direction" of the copy - VD_COPYIN or VD_COPYOUT
7732 * ENXIO - incorrect buffer passed in.
7733 * EFAULT - ddi_copyout routine encountered an error.
7736 vdc_get_vtoc_convert(vdc_t
*vdc
, void *from
, void *to
, int mode
, int dir
)
7740 struct vtoc32 vtoc32
;
7741 struct extvtoc evtoc
;
7744 if (dir
!= VD_COPYOUT
)
7745 return (0); /* nothing to do */
7747 if ((from
== NULL
) || (to
== NULL
))
7750 if (vdc
->vdisk_size
> VD_OLDVTOC_LIMIT
)
7753 VD_VTOC2VTOC((vd_vtoc_t
*)from
, &evtoc
);
7755 /* fake the VTOC timestamp field */
7756 for (i
= 0; i
< V_NUMPAR
; i
++) {
7757 evtoc
.timestamp
[i
] = vdc
->vtoc
->timestamp
[i
];
7760 if (ddi_model_convert_from(mode
& FMODELS
) == DDI_MODEL_ILP32
) {
7761 /* LINTED E_ASSIGN_NARROW_CONV */
7762 extvtoctovtoc32(evtoc
, vtoc32
);
7763 rv
= ddi_copyout(&vtoc32
, to
, sizeof (vtoc32
), mode
);
7767 extvtoctovtoc(evtoc
, vtoc
);
7768 rv
= ddi_copyout(&vtoc
, to
, sizeof (vtoc
), mode
);
7778 * vdc_set_vtoc_convert()
7781 * This routine performs the necessary conversions from the DKIOCSVTOC
7782 * Solaris structure to the format defined in FWARC 2006/195.
7785 * vdc - the vDisk client
7786 * from - Buffer with data
7787 * to - Buffer where data is to be copied to
7788 * mode - flags passed to ioctl
7789 * dir - direction of copy (in or out)
7793 * ENXIO - Invalid buffer passed in
7794 * EFAULT - ddi_copyin of data failed
7797 vdc_set_vtoc_convert(vdc_t
*vdc
, void *from
, void *to
, int mode
, int dir
)
7801 struct vtoc32 vtoc32
;
7802 struct extvtoc evtoc
;
7805 if ((from
== NULL
) || (to
== NULL
))
7808 if (vdc
->vdisk_size
> VD_OLDVTOC_LIMIT
)
7811 uvtoc
= (dir
== VD_COPYIN
)? from
: to
;
7813 if (ddi_model_convert_from(mode
& FMODELS
) == DDI_MODEL_ILP32
) {
7814 rv
= ddi_copyin(uvtoc
, &vtoc32
, sizeof (vtoc32
), mode
);
7817 vtoc32toextvtoc(vtoc32
, evtoc
);
7819 rv
= ddi_copyin(uvtoc
, &vtoc
, sizeof (vtoc
), mode
);
7822 vtoctoextvtoc(vtoc
, evtoc
);
7825 if (dir
== VD_COPYOUT
) {
7827 * The disk label may have changed. Revalidate the disk
7828 * geometry. This will also update the device nodes.
7833 * We also need to keep track of the timestamp fields.
7835 for (i
= 0; i
< V_NUMPAR
; i
++) {
7836 vdc
->vtoc
->timestamp
[i
] = evtoc
.timestamp
[i
];
7840 VTOC2VD_VTOC(&evtoc
, (vd_vtoc_t
*)to
);
7847 vdc_get_extvtoc_convert(vdc_t
*vdc
, void *from
, void *to
, int mode
, int dir
)
7850 struct extvtoc evtoc
;
7852 if (dir
!= VD_COPYOUT
)
7853 return (0); /* nothing to do */
7855 if ((from
== NULL
) || (to
== NULL
))
7858 VD_VTOC2VTOC((vd_vtoc_t
*)from
, &evtoc
);
7860 /* fake the VTOC timestamp field */
7861 for (i
= 0; i
< V_NUMPAR
; i
++) {
7862 evtoc
.timestamp
[i
] = vdc
->vtoc
->timestamp
[i
];
7865 rv
= ddi_copyout(&evtoc
, to
, sizeof (struct extvtoc
), mode
);
7873 vdc_set_extvtoc_convert(vdc_t
*vdc
, void *from
, void *to
, int mode
, int dir
)
7876 struct extvtoc evtoc
;
7879 if ((from
== NULL
) || (to
== NULL
))
7882 uvtoc
= (dir
== VD_COPYIN
)? from
: to
;
7884 rv
= ddi_copyin(uvtoc
, &evtoc
, sizeof (struct extvtoc
), mode
);
7888 if (dir
== VD_COPYOUT
) {
7890 * The disk label may have changed. Revalidate the disk
7891 * geometry. This will also update the device nodes.
7896 * We also need to keep track of the timestamp fields.
7898 for (i
= 0; i
< V_NUMPAR
; i
++) {
7899 vdc
->vtoc
->timestamp
[i
] = evtoc
.timestamp
[i
];
7903 VTOC2VD_VTOC(&evtoc
, (vd_vtoc_t
*)to
);
7911 * vdc_get_geom_convert()
7914 * This routine performs the necessary conversions from the DKIOCGGEOM,
7915 * DKIOCG_PHYGEOM and DKIOCG_VIRTGEOM Solaris structures to the format
7916 * defined in FWARC 2006/195
7919 * vdc - the vDisk client
7920 * from - Buffer with data
7921 * to - Buffer where data is to be copied to
7922 * mode - flags passed to ioctl
7923 * dir - direction of copy (in or out)
7927 * ENXIO - Invalid buffer passed in
7928 * EFAULT - ddi_copyout of data failed
7931 vdc_get_geom_convert(vdc_t
*vdc
, void *from
, void *to
, int mode
, int dir
)
7933 _NOTE(ARGUNUSED(vdc
))
7935 struct dk_geom geom
;
7936 int copy_len
= sizeof (struct dk_geom
);
7939 if (dir
!= VD_COPYOUT
)
7940 return (0); /* nothing to do */
7942 if ((from
== NULL
) || (to
== NULL
))
7945 VD_GEOM2DK_GEOM((vd_geom_t
*)from
, &geom
);
7946 rv
= ddi_copyout(&geom
, to
, copy_len
, mode
);
7955 * vdc_set_geom_convert()
7958 * This routine performs the necessary conversions from the DKIOCSGEOM
7959 * Solaris structure to the format defined in FWARC 2006/195.
7962 * vdc - the vDisk client
7963 * from - Buffer with data
7964 * to - Buffer where data is to be copied to
7965 * mode - flags passed to ioctl
7966 * dir - direction of copy (in or out)
7970 * ENXIO - Invalid buffer passed in
7971 * EFAULT - ddi_copyin of data failed
7974 vdc_set_geom_convert(vdc_t
*vdc
, void *from
, void *to
, int mode
, int dir
)
7976 _NOTE(ARGUNUSED(vdc
))
7979 void *tmp_mem
= NULL
;
7980 int copy_len
= sizeof (struct dk_geom
);
7983 if (dir
!= VD_COPYIN
)
7984 return (0); /* nothing to do */
7986 if ((from
== NULL
) || (to
== NULL
))
7989 tmp_mem
= kmem_alloc(copy_len
, KM_SLEEP
);
7991 rv
= ddi_copyin(from
, tmp_mem
, copy_len
, mode
);
7993 kmem_free(tmp_mem
, copy_len
);
7996 DK_GEOM2VD_GEOM((struct dk_geom
*)tmp_mem
, &vdgeom
);
7997 bcopy(&vdgeom
, to
, sizeof (vdgeom
));
7998 kmem_free(tmp_mem
, copy_len
);
8004 vdc_get_efi_convert(vdc_t
*vdc
, void *from
, void *to
, int mode
, int dir
)
8006 _NOTE(ARGUNUSED(vdc
))
8013 if ((from
== NULL
) || (to
== NULL
))
8016 if (dir
== VD_COPYIN
) {
8018 vd_efi
= (vd_efi_t
*)to
;
8020 rv
= ddi_copyin(from
, &dk_efi
, sizeof (dk_efi_t
), mode
);
8024 vd_efi
->lba
= dk_efi
.dki_lba
;
8025 vd_efi
->length
= dk_efi
.dki_length
;
8026 bzero(vd_efi
->data
, vd_efi
->length
);
8030 rv
= ddi_copyin(to
, &dk_efi
, sizeof (dk_efi_t
), mode
);
8034 uaddr
= dk_efi
.dki_data
;
8036 dk_efi
.dki_data
= kmem_alloc(dk_efi
.dki_length
, KM_SLEEP
);
8038 VD_EFI2DK_EFI((vd_efi_t
*)from
, &dk_efi
);
8040 rv
= ddi_copyout(dk_efi
.dki_data
, uaddr
, dk_efi
.dki_length
,
8045 kmem_free(dk_efi
.dki_data
, dk_efi
.dki_length
);
8052 vdc_set_efi_convert(vdc_t
*vdc
, void *from
, void *to
, int mode
, int dir
)
8054 _NOTE(ARGUNUSED(vdc
))
8059 if (dir
== VD_COPYOUT
) {
8061 * The disk label may have changed. Revalidate the disk
8062 * geometry. This will also update the device nodes.
8068 if ((from
== NULL
) || (to
== NULL
))
8071 if (ddi_copyin(from
, &dk_efi
, sizeof (dk_efi_t
), mode
) != 0)
8074 uaddr
= dk_efi
.dki_data
;
8076 dk_efi
.dki_data
= kmem_alloc(dk_efi
.dki_length
, KM_SLEEP
);
8078 if (ddi_copyin(uaddr
, dk_efi
.dki_data
, dk_efi
.dki_length
, mode
) != 0)
8081 DK_EFI2VD_EFI(&dk_efi
, (vd_efi_t
*)to
);
8083 kmem_free(dk_efi
.dki_data
, dk_efi
.dki_length
);
8089 /* -------------------------------------------------------------------------- */
8093 * vdc_create_fake_geometry()
8096 * This routine fakes up the disk info needed for some DKIO ioctls such
8097 * as DKIOCINFO and DKIOCGMEDIAINFO [just like lofi(7D) and ramdisk(7D) do]
8099 * Note: This function must not be called until the vDisk attributes have
8100 * been exchanged as part of the handshake with the vDisk server.
8103 * vdc - soft state pointer for this instance of the device driver.
8109 vdc_create_fake_geometry(vdc_t
*vdc
)
8111 ASSERT(vdc
!= NULL
);
8112 ASSERT(vdc
->max_xfer_sz
!= 0);
8117 if (vdc
->cinfo
== NULL
)
8118 vdc
->cinfo
= kmem_zalloc(sizeof (struct dk_cinfo
), KM_SLEEP
);
8120 (void) strcpy(vdc
->cinfo
->dki_cname
, VDC_DRIVER_NAME
);
8121 (void) strcpy(vdc
->cinfo
->dki_dname
, VDC_DRIVER_NAME
);
8122 /* max_xfer_sz is #blocks so we don't need to divide by vdisk_bsize */
8123 vdc
->cinfo
->dki_maxtransfer
= vdc
->max_xfer_sz
;
8126 * We set the controller type to DKC_SCSI_CCS only if the VD_OP_SCSICMD
8127 * operation is supported, otherwise the controller type is DKC_DIRECT.
8128 * Version 1.0 does not support the VD_OP_SCSICMD operation, so the
8129 * controller type is always DKC_DIRECT in that case.
8131 * If the virtual disk is backed by a physical CD/DVD device or
8132 * an ISO image, modify the controller type to indicate this
8134 switch (vdc
->vdisk_media
) {
8137 vdc
->cinfo
->dki_ctype
= DKC_CDROM
;
8139 case VD_MEDIA_FIXED
:
8140 if (VD_OP_SUPPORTED(vdc
->operations
, VD_OP_SCSICMD
))
8141 vdc
->cinfo
->dki_ctype
= DKC_SCSI_CCS
;
8143 vdc
->cinfo
->dki_ctype
= DKC_DIRECT
;
8146 /* in the case of v1.0 we default to a fixed disk */
8147 vdc
->cinfo
->dki_ctype
= DKC_DIRECT
;
8150 vdc
->cinfo
->dki_flags
= DKI_FMTVOL
;
8151 vdc
->cinfo
->dki_cnum
= 0;
8152 vdc
->cinfo
->dki_addr
= 0;
8153 vdc
->cinfo
->dki_space
= 0;
8154 vdc
->cinfo
->dki_prio
= 0;
8155 vdc
->cinfo
->dki_vec
= 0;
8156 vdc
->cinfo
->dki_unit
= vdc
->instance
;
8157 vdc
->cinfo
->dki_slave
= 0;
8159 * The partition number will be created on the fly depending on the
8160 * actual slice (i.e. minor node) that is used to request the data.
8162 vdc
->cinfo
->dki_partition
= 0;
8165 * DKIOCGMEDIAINFO support
8167 if (vdc
->minfo
== NULL
)
8168 vdc
->minfo
= kmem_zalloc(sizeof (struct dk_minfo
), KM_SLEEP
);
8170 if (vio_ver_is_supported(vdc
->ver
, 1, 1)) {
8171 vdc
->minfo
->dki_media_type
=
8172 VD_MEDIATYPE2DK_MEDIATYPE(vdc
->vdisk_media
);
8174 vdc
->minfo
->dki_media_type
= DK_FIXED_DISK
;
8177 vdc
->minfo
->dki_capacity
= vdc
->vdisk_size
;
8178 vdc
->minfo
->dki_lbsize
= vdc
->vdisk_bsize
;
8182 vdc_lbl2cksum(struct dk_label
*label
)
8187 count
= (sizeof (struct dk_label
)) / (sizeof (short)) - 1;
8188 sp
= (ushort_t
*)label
;
8198 vdc_update_size(vdc_t
*vdc
, size_t dsk_size
, size_t blk_size
, size_t xfr_size
)
8200 vd_err_stats_t
*stp
;
8202 ASSERT(MUTEX_HELD(&vdc
->lock
));
8203 ASSERT(xfr_size
!= 0);
8206 * If the disk size is unknown or sizes are unchanged then don't
8209 if (dsk_size
== VD_SIZE_UNKNOWN
|| dsk_size
== 0 ||
8210 (blk_size
== vdc
->vdisk_bsize
&& dsk_size
== vdc
->vdisk_size
&&
8211 xfr_size
== vdc
->max_xfer_sz
))
8215 * We don't know at compile time what the vDisk server will think
8216 * are good values but we apply a large (arbitrary) upper bound to
8217 * prevent memory exhaustion in vdc if it was allocating a DRing
8218 * based of huge values sent by the server. We probably will never
8219 * exceed this except if the message was garbage.
8221 if ((xfr_size
* blk_size
) > (PAGESIZE
* DEV_BSIZE
)) {
8222 DMSG(vdc
, 0, "[%d] vds block transfer size too big;"
8223 " using max supported by vdc", vdc
->instance
);
8224 xfr_size
= maxphys
/ blk_size
;
8227 vdc
->max_xfer_sz
= xfr_size
;
8228 vdc
->vdisk_bsize
= blk_size
;
8229 vdc
->vdisk_size
= dsk_size
;
8231 stp
= (vd_err_stats_t
*)vdc
->err_stats
->ks_data
;
8232 stp
->vd_capacity
.value
.ui64
= dsk_size
* blk_size
;
8234 vdc
->minfo
->dki_capacity
= dsk_size
;
8235 vdc
->minfo
->dki_lbsize
= (uint_t
)blk_size
;
8239 * Update information about the VIO block size. The VIO block size is the
8240 * same as the vdisk block size which is stored in vdc->vdisk_bsize so we
8241 * do not store that information again.
8243 * However, buf structures will always use a logical block size of 512 bytes
8244 * (DEV_BSIZE) and we will need to convert logical block numbers to VIO block
8245 * numbers for each read or write operation using vdc_strategy(). To speed up
8246 * this conversion, we expect the VIO block size to be a power of 2 and a
8247 * multiple 512 bytes (DEV_BSIZE), and we cache some useful information.
8249 * The function return EINVAL if the new VIO block size (blk_size) is not a
8250 * power of 2 or not a multiple of 512 bytes, otherwise it returns 0.
8253 vdc_update_vio_bsize(vdc_t
*vdc
, uint32_t blk_size
)
8259 vdc
->vio_bshift
= 0;
8261 ASSERT(blk_size
> 0);
8263 if ((blk_size
% DEV_BSIZE
) != 0)
8266 ratio
= blk_size
/ DEV_BSIZE
;
8268 for (n
= ratio
; n
> 1; n
>>= 1) {
8269 if ((n
& 0x1) != 0) {
8270 /* blk_size is not a power of 2 */
8276 vdc
->vio_bshift
= nshift
;
8277 vdc
->vio_bmask
= ratio
- 1;
8284 * vdc_validate_geometry
8287 * This routine discovers the label and geometry of the disk. It stores
8288 * the disk label and related information in the vdc structure. If it
8289 * fails to validate the geometry or to discover the disk label then
8290 * the label is marked as unknown (VD_DISK_LABEL_UNK).
8293 * vdc - soft state pointer for this instance of the device driver.
8297 * EINVAL - unknown disk label.
8298 * ENOTSUP - geometry not applicable (EFI label).
8299 * EIO - error accessing the disk.
8302 vdc_validate_geometry(vdc_t
*vdc
)
8306 struct dk_label
*label
;
8307 struct dk_geom geom
;
8308 struct extvtoc vtoc
;
8313 ASSERT(vdc
!= NULL
);
8314 ASSERT(vdc
->vtoc
!= NULL
&& vdc
->geom
!= NULL
);
8315 ASSERT(MUTEX_HELD(&vdc
->lock
));
8317 mutex_exit(&vdc
->lock
);
8319 * Check the disk capacity in case it has changed. If that fails then
8320 * we proceed and we will be using the disk size we currently have.
8322 (void) vdc_check_capacity(vdc
);
8323 dev
= makedevice(ddi_driver_major(vdc
->dip
),
8324 VD_MAKE_DEV(vdc
->instance
, 0));
8326 rv
= vd_process_ioctl(dev
, DKIOCGGEOM
, (caddr_t
)&geom
, FKIOCTL
, &rval
);
8328 rv
= vd_process_ioctl(dev
, DKIOCGEXTVTOC
, (caddr_t
)&vtoc
,
8331 if (rv
== ENOTSUP
) {
8333 * If the device does not support VTOC then we try
8334 * to read an EFI label.
8336 * We need to know the block size and the disk size to
8337 * be able to read an EFI label.
8339 if (vdc
->vdisk_size
== 0) {
8340 mutex_enter(&vdc
->lock
);
8341 vdc_store_label_unk(vdc
);
8345 VDC_EFI_DEV_SET(edev
, vdc
, vd_process_efi_ioctl
);
8347 rv
= vd_efi_alloc_and_read(&edev
, &gpt
, &gpe
);
8350 DMSG(vdc
, 0, "[%d] Failed to get EFI (err=%d)",
8352 mutex_enter(&vdc
->lock
);
8353 vdc_store_label_unk(vdc
);
8357 mutex_enter(&vdc
->lock
);
8358 vdc_store_label_efi(vdc
, gpt
, gpe
);
8359 vd_efi_free(&edev
, gpt
, gpe
);
8364 DMSG(vdc
, 0, "[%d] Failed to get VTOC (err=%d)",
8366 mutex_enter(&vdc
->lock
);
8367 vdc_store_label_unk(vdc
);
8373 /* check that geometry and vtoc are valid */
8374 if (geom
.dkg_nhead
== 0 || geom
.dkg_nsect
== 0 ||
8375 vtoc
.v_sanity
!= VTOC_SANE
) {
8376 mutex_enter(&vdc
->lock
);
8377 vdc_store_label_unk(vdc
);
8382 * We have a disk and a valid VTOC. However this does not mean
8383 * that the disk currently have a VTOC label. The returned VTOC may
8384 * be a default VTOC to be used for configuring the disk (this is
8385 * what is done for disk image). So we read the label from the
8386 * beginning of the disk to ensure we really have a VTOC label.
8388 * FUTURE: This could be the default way for reading the VTOC
8389 * from the disk as opposed to sending the VD_OP_GET_VTOC
8390 * to the server. This will be the default if vdc is implemented
8395 * Single slice disk does not support read using an absolute disk
8396 * offset so we just rely on the DKIOCGVTOC ioctl in that case.
8398 if (vdc
->vdisk_type
== VD_DISK_TYPE_SLICE
) {
8399 mutex_enter(&vdc
->lock
);
8400 if (vtoc
.v_nparts
!= 1) {
8401 vdc_store_label_unk(vdc
);
8404 vdc_store_label_vtoc(vdc
, &geom
, &vtoc
);
8408 if (vtoc
.v_nparts
!= V_NUMPAR
) {
8409 mutex_enter(&vdc
->lock
);
8410 vdc_store_label_unk(vdc
);
8415 * Most CD/DVDs do not have a disk label and the label is
8416 * generated by the disk driver. So the on-disk label check
8417 * below may fail and we return now to avoid this problem.
8419 if (vdc
->vdisk_media
== VD_MEDIA_CD
||
8420 vdc
->vdisk_media
== VD_MEDIA_DVD
) {
8421 mutex_enter(&vdc
->lock
);
8422 vdc_store_label_vtoc(vdc
, &geom
, &vtoc
);
8427 * Read disk label from start of disk
8429 label
= kmem_alloc(vdc
->vdisk_bsize
, KM_SLEEP
);
8431 rv
= vdc_do_op(vdc
, VD_OP_BREAD
, (caddr_t
)label
, vdc
->vdisk_bsize
,
8432 VD_SLICE_NONE
, 0, NULL
, VIO_read_dir
, VDC_OP_NORMAL
);
8434 if (rv
!= 0 || label
->dkl_magic
!= DKL_MAGIC
||
8435 label
->dkl_cksum
!= vdc_lbl2cksum(label
)) {
8436 DMSG(vdc
, 1, "[%d] Got VTOC with invalid label\n",
8438 kmem_free(label
, vdc
->vdisk_bsize
);
8439 mutex_enter(&vdc
->lock
);
8440 vdc_store_label_unk(vdc
);
8444 kmem_free(label
, vdc
->vdisk_bsize
);
8445 mutex_enter(&vdc
->lock
);
8446 vdc_store_label_vtoc(vdc
, &geom
, &vtoc
);
8455 * This routine discovers the label of the disk and create the
8456 * appropriate device nodes if the label has changed.
8459 * vdc - soft state pointer for this instance of the device driver.
8465 vdc_validate(vdc_t
*vdc
)
8467 vd_disk_label_t old_label
;
8468 vd_slice_t old_slice
[V_NUMPAR
];
8471 ASSERT(!MUTEX_HELD(&vdc
->lock
));
8473 mutex_enter(&vdc
->lock
);
8475 /* save the current label and vtoc */
8476 old_label
= vdc
->vdisk_label
;
8477 bcopy(vdc
->slice
, &old_slice
, sizeof (vd_slice_t
) * V_NUMPAR
);
8479 /* check the geometry */
8480 (void) vdc_validate_geometry(vdc
);
8482 /* if the disk label has changed, update device nodes */
8483 if (vdc
->vdisk_type
== VD_DISK_TYPE_DISK
&&
8484 vdc
->vdisk_label
!= old_label
) {
8486 if (vdc
->vdisk_label
== VD_DISK_LABEL_EFI
)
8487 rv
= vdc_create_device_nodes_efi(vdc
);
8489 rv
= vdc_create_device_nodes_vtoc(vdc
);
8492 DMSG(vdc
, 0, "![%d] Failed to update device nodes",
8497 mutex_exit(&vdc
->lock
);
8501 vdc_validate_task(void *arg
)
8503 vdc_t
*vdc
= (vdc_t
*)arg
;
8507 mutex_enter(&vdc
->lock
);
8508 ASSERT(vdc
->validate_pending
> 0);
8509 vdc
->validate_pending
--;
8510 mutex_exit(&vdc
->lock
);
8518 * This routine discovers the devid of a vDisk. It requests the devid of
8519 * the underlying device from the vDisk server, builds an encapsulated
8520 * devid based on the retrieved devid and registers that new devid to
8524 * vdc - soft state pointer for this instance of the device driver.
8527 * 0 - A devid was succesfully registered for the vDisk
8530 vdc_setup_devid(vdc_t
*vdc
)
8533 vd_devid_t
*vd_devid
;
8534 size_t bufsize
, bufid_len
;
8535 ddi_devid_t vdisk_devid
;
8539 * At first sight, we don't know the size of the devid that the
8540 * server will return but this size will be encoded into the
8541 * reply. So we do a first request using a default size then we
8542 * check if this size was large enough. If not then we do a second
8543 * request with the correct size returned by the server. Note that
8544 * ldc requires size to be 8-byte aligned.
8546 bufsize
= P2ROUNDUP(VD_DEVID_SIZE(VD_DEVID_DEFAULT_LEN
),
8548 vd_devid
= kmem_zalloc(bufsize
, KM_SLEEP
);
8549 bufid_len
= bufsize
- sizeof (vd_efi_t
) - 1;
8551 rv
= vdc_do_op(vdc
, VD_OP_GET_DEVID
, (caddr_t
)vd_devid
,
8552 bufsize
, 0, 0, NULL
, VIO_both_dir
, 0);
8554 DMSG(vdc
, 2, "do_op returned %d\n", rv
);
8557 kmem_free(vd_devid
, bufsize
);
8561 if (vd_devid
->length
> bufid_len
) {
8563 * The returned devid is larger than the buffer used. Try again
8564 * with a buffer with the right size.
8566 kmem_free(vd_devid
, bufsize
);
8567 bufsize
= P2ROUNDUP(VD_DEVID_SIZE(vd_devid
->length
),
8569 vd_devid
= kmem_zalloc(bufsize
, KM_SLEEP
);
8570 bufid_len
= bufsize
- sizeof (vd_efi_t
) - 1;
8572 rv
= vdc_do_sync_op(vdc
, VD_OP_GET_DEVID
, (caddr_t
)vd_devid
,
8573 bufsize
, 0, 0, VIO_both_dir
, B_TRUE
);
8576 kmem_free(vd_devid
, bufsize
);
8582 * The virtual disk should have the same device id as the one associated
8583 * with the physical disk it is mapped on, otherwise sharing a disk
8584 * between a LDom and a non-LDom may not work (for example for a shared
8587 * The DDI framework does not allow creating a device id with any
8588 * type so we first create a device id of type DEVID_ENCAP and then
8589 * we restore the orignal type of the physical device.
8592 DMSG(vdc
, 2, ": devid length = %d\n", vd_devid
->length
);
8594 /* build an encapsulated devid based on the returned devid */
8595 if (ddi_devid_init(vdc
->dip
, DEVID_ENCAP
, vd_devid
->length
,
8596 vd_devid
->id
, &vdisk_devid
) != DDI_SUCCESS
) {
8597 DMSG(vdc
, 1, "[%d] Fail to created devid\n", vdc
->instance
);
8598 kmem_free(vd_devid
, bufsize
);
8602 DEVID_FORMTYPE((impl_devid_t
*)vdisk_devid
, vd_devid
->type
);
8604 ASSERT(ddi_devid_valid(vdisk_devid
) == DDI_SUCCESS
);
8606 kmem_free(vd_devid
, bufsize
);
8608 if (vdc
->devid
!= NULL
) {
8609 /* check that the devid hasn't changed */
8610 if (ddi_devid_compare(vdisk_devid
, vdc
->devid
) == 0) {
8611 ddi_devid_free(vdisk_devid
);
8615 cmn_err(CE_WARN
, "vdisk@%d backend devid has changed",
8618 devid_str
= ddi_devid_str_encode(vdc
->devid
, NULL
);
8620 cmn_err(CE_CONT
, "vdisk@%d backend initial devid: %s",
8622 (devid_str
)? devid_str
: "<encoding error>");
8625 ddi_devid_str_free(devid_str
);
8627 devid_str
= ddi_devid_str_encode(vdisk_devid
, NULL
);
8629 cmn_err(CE_CONT
, "vdisk@%d backend current devid: %s",
8631 (devid_str
)? devid_str
: "<encoding error>");
8634 ddi_devid_str_free(devid_str
);
8636 ddi_devid_free(vdisk_devid
);
8640 if (ddi_devid_register(vdc
->dip
, vdisk_devid
) != DDI_SUCCESS
) {
8641 DMSG(vdc
, 1, "[%d] Fail to register devid\n", vdc
->instance
);
8642 ddi_devid_free(vdisk_devid
);
8646 vdc
->devid
= vdisk_devid
;
8652 vdc_store_label_efi(vdc_t
*vdc
, efi_gpt_t
*gpt
, efi_gpe_t
*gpe
)
8656 ASSERT(MUTEX_HELD(&vdc
->lock
));
8658 vdc
->vdisk_label
= VD_DISK_LABEL_EFI
;
8659 bzero(vdc
->vtoc
, sizeof (struct extvtoc
));
8660 bzero(vdc
->geom
, sizeof (struct dk_geom
));
8661 bzero(vdc
->slice
, sizeof (vd_slice_t
) * V_NUMPAR
);
8663 nparts
= gpt
->efi_gpt_NumberOfPartitionEntries
;
8665 for (i
= 0; i
< nparts
&& i
< VD_EFI_WD_SLICE
; i
++) {
8667 if (gpe
[i
].efi_gpe_StartingLBA
== 0 &&
8668 gpe
[i
].efi_gpe_EndingLBA
== 0) {
8672 vdc
->slice
[i
].start
= gpe
[i
].efi_gpe_StartingLBA
;
8673 vdc
->slice
[i
].nblocks
= gpe
[i
].efi_gpe_EndingLBA
-
8674 gpe
[i
].efi_gpe_StartingLBA
+ 1;
8677 ASSERT(vdc
->vdisk_size
!= 0);
8678 vdc
->slice
[VD_EFI_WD_SLICE
].start
= 0;
8679 vdc
->slice
[VD_EFI_WD_SLICE
].nblocks
= vdc
->vdisk_size
;
8684 vdc_store_label_vtoc(vdc_t
*vdc
, struct dk_geom
*geom
, struct extvtoc
*vtoc
)
8688 ASSERT(MUTEX_HELD(&vdc
->lock
));
8689 ASSERT(vdc
->vdisk_bsize
== vtoc
->v_sectorsz
);
8691 vdc
->vdisk_label
= VD_DISK_LABEL_VTOC
;
8692 bcopy(vtoc
, vdc
->vtoc
, sizeof (struct extvtoc
));
8693 bcopy(geom
, vdc
->geom
, sizeof (struct dk_geom
));
8694 bzero(vdc
->slice
, sizeof (vd_slice_t
) * V_NUMPAR
);
8696 for (i
= 0; i
< vtoc
->v_nparts
; i
++) {
8697 vdc
->slice
[i
].start
= vtoc
->v_part
[i
].p_start
;
8698 vdc
->slice
[i
].nblocks
= vtoc
->v_part
[i
].p_size
;
8703 vdc_store_label_unk(vdc_t
*vdc
)
8705 ASSERT(MUTEX_HELD(&vdc
->lock
));
8707 vdc
->vdisk_label
= VD_DISK_LABEL_UNK
;
8708 bzero(vdc
->vtoc
, sizeof (struct extvtoc
));
8709 bzero(vdc
->geom
, sizeof (struct dk_geom
));
8710 bzero(vdc
->slice
, sizeof (vd_slice_t
) * V_NUMPAR
);