6514 AS_* lock macros simplification
illumos-gate: usr/src/uts/common/xen/io/xpvtap.c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
28 #include <sys/errno.h>
29 #include <sys/types.h>
30 #include <sys/conf.h>
31 #include <sys/kmem.h>
32 #include <sys/ddi.h>
33 #include <sys/stat.h>
34 #include <sys/sunddi.h>
35 #include <sys/file.h>
36 #include <sys/open.h>
37 #include <sys/modctl.h>
38 #include <sys/ddi_impldefs.h>
39 #include <sys/sysmacros.h>
40 #include <sys/ddidevmap.h>
41 #include <sys/policy.h>
43 #include <sys/vmsystm.h>
44 #include <vm/hat_i86.h>
45 #include <vm/hat_pte.h>
46 #include <vm/seg_kmem.h>
47 #include <vm/seg_mf.h>
49 #include <xen/io/blkif_impl.h>
50 #include <xen/io/blk_common.h>
51 #include <xen/io/xpvtap.h>
54 static int xpvtap_open(dev_t *devp, int flag, int otyp, cred_t *cred);
55 static int xpvtap_close(dev_t devp, int flag, int otyp, cred_t *cred);
56 static int xpvtap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
57 cred_t *cred, int *rval);
58 static int xpvtap_devmap(dev_t dev, devmap_cookie_t dhp, offset_t off,
59 size_t len, size_t *maplen, uint_t model);
60 static int xpvtap_segmap(dev_t dev, off_t off, struct as *asp, caddr_t *addrp,
61 off_t len, unsigned int prot, unsigned int maxprot, unsigned int flags,
62 cred_t *cred_p);
63 static int xpvtap_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
64 struct pollhead **phpp);
66 static struct cb_ops xpvtap_cb_ops = {
67 xpvtap_open, /* cb_open */
68 xpvtap_close, /* cb_close */
69 nodev, /* cb_strategy */
70 nodev, /* cb_print */
71 nodev, /* cb_dump */
72 nodev, /* cb_read */
73 nodev, /* cb_write */
74 xpvtap_ioctl, /* cb_ioctl */
75 xpvtap_devmap, /* cb_devmap */
76 nodev, /* cb_mmap */
77 xpvtap_segmap, /* cb_segmap */
78 xpvtap_chpoll, /* cb_chpoll */
79 ddi_prop_op, /* cb_prop_op */
80 NULL, /* cb_stream */
81 D_NEW | D_MP | D_64BIT | D_DEVMAP, /* cb_flag */
82 CB_REV
85 static int xpvtap_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg,
86 void **result);
87 static int xpvtap_attach(dev_info_t *devi, ddi_attach_cmd_t cmd);
88 static int xpvtap_detach(dev_info_t *devi, ddi_detach_cmd_t cmd);
90 static struct dev_ops xpvtap_dev_ops = {
91 DEVO_REV, /* devo_rev */
92 0, /* devo_refcnt */
93 xpvtap_getinfo, /* devo_getinfo */
94 nulldev, /* devo_identify */
95 nulldev, /* devo_probe */
96 xpvtap_attach, /* devo_attach */
97 xpvtap_detach, /* devo_detach */
98 nodev, /* devo_reset */
99 &xpvtap_cb_ops, /* devo_cb_ops */
100 NULL, /* devo_bus_ops */
101 NULL /* power */
105 static struct modldrv xpvtap_modldrv = {
106 &mod_driverops, /* Type of module. This one is a driver */
107 "xpvtap driver", /* Name of the module. */
108 &xpvtap_dev_ops, /* driver ops */
111 static struct modlinkage xpvtap_modlinkage = {
112 MODREV_1,
113 (void *) &xpvtap_modldrv,
114 NULL
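/*
 * Soft state anchor for the driver: one xpvtap_state_t is allocated per
 * instance (ddi_soft_state_init() in _init(), ddi_soft_state_zalloc() in
 * xpvtap_drv_init()) and looked up by minor/instance number everywhere else.
 */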
118 void *xpvtap_statep;
121 static xpvtap_state_t *xpvtap_drv_init(int instance);
122 static void xpvtap_drv_fini(xpvtap_state_t *state);
123 static uint_t xpvtap_intr(caddr_t arg);
125 typedef void (*xpvtap_rs_cleanup_t)(xpvtap_state_t *state, uint_t rs);
126 static void xpvtap_rs_init(uint_t min_val, uint_t max_val,
127 xpvtap_rs_hdl_t *handle);
128 static void xpvtap_rs_fini(xpvtap_rs_hdl_t *handle);
129 static int xpvtap_rs_alloc(xpvtap_rs_hdl_t handle, uint_t *rs);
130 static void xpvtap_rs_free(xpvtap_rs_hdl_t handle, uint_t rs);
131 static void xpvtap_rs_flush(xpvtap_rs_hdl_t handle,
132 xpvtap_rs_cleanup_t callback, void *arg);
134 static int xpvtap_segmf_register(xpvtap_state_t *state);
135 static void xpvtap_segmf_unregister(struct as *as, void *arg, uint_t event);
137 static int xpvtap_user_init(xpvtap_state_t *state);
138 static void xpvtap_user_fini(xpvtap_state_t *state);
139 static int xpvtap_user_ring_init(xpvtap_state_t *state);
140 static void xpvtap_user_ring_fini(xpvtap_state_t *state);
141 static int xpvtap_user_thread_init(xpvtap_state_t *state);
142 static void xpvtap_user_thread_fini(xpvtap_state_t *state);
143 static void xpvtap_user_thread_start(caddr_t arg);
144 static void xpvtap_user_thread_stop(xpvtap_state_t *state);
145 static void xpvtap_user_thread(void *arg);
147 static void xpvtap_user_app_stop(caddr_t arg);
149 static int xpvtap_user_request_map(xpvtap_state_t *state, blkif_request_t *req,
150 uint_t *uid);
151 static int xpvtap_user_request_push(xpvtap_state_t *state,
152 blkif_request_t *req, uint_t uid);
153 static int xpvtap_user_response_get(xpvtap_state_t *state,
154 blkif_response_t *resp, uint_t *uid);
155 static void xpvtap_user_request_unmap(xpvtap_state_t *state, uint_t uid);
159 * _init()
162 _init(void)
164 int e;
166 e = ddi_soft_state_init(&xpvtap_statep, sizeof (xpvtap_state_t), 1);
167 if (e != 0) {
168 return (e);
171 e = mod_install(&xpvtap_modlinkage);
172 if (e != 0) {
173 ddi_soft_state_fini(&xpvtap_statep);
174 return (e);
177 return (0);
182 * _info()
185 _info(struct modinfo *modinfop)
187 return (mod_info(&xpvtap_modlinkage, modinfop));
192 * _fini()
195 _fini(void)
197 int e;
199 e = mod_remove(&xpvtap_modlinkage);
200 if (e != 0) {
201 return (e);
204 ddi_soft_state_fini(&xpvtap_statep);
206 return (0);
211 * xpvtap_attach()
213 static int
214 xpvtap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
216 blk_ringinit_args_t args;
217 xpvtap_state_t *state;
218 int instance;
219 int e;
222 switch (cmd) {
223 case DDI_ATTACH:
224 break;
226 case DDI_RESUME:
227 return (DDI_SUCCESS);
229 default:
230 return (DDI_FAILURE);
233 /* initialize our state info */
234 instance = ddi_get_instance(dip);
235 state = xpvtap_drv_init(instance);
236 if (state == NULL) {
237 return (DDI_FAILURE);
239 state->bt_dip = dip;
241 /* Initialize the guest ring */
242 args.ar_dip = state->bt_dip;
243 args.ar_intr = xpvtap_intr;
244 args.ar_intr_arg = (caddr_t)state;
245 args.ar_ringup = xpvtap_user_thread_start;
246 args.ar_ringup_arg = (caddr_t)state;
247 args.ar_ringdown = xpvtap_user_app_stop;
248 args.ar_ringdown_arg = (caddr_t)state;
249 e = blk_ring_init(&args, &state->bt_guest_ring);
250 if (e != DDI_SUCCESS) {
251 goto attachfail_ringinit;
254 /* create the minor node (for ioctl/mmap) */
255 e = ddi_create_minor_node(dip, "xpvtap", S_IFCHR, instance,
256 DDI_PSEUDO, 0);
257 if (e != DDI_SUCCESS) {
258 goto attachfail_minor_node;
261 /* Report that driver was loaded */
262 ddi_report_dev(dip);
264 return (DDI_SUCCESS);
266 attachfail_minor_node:
267 blk_ring_fini(&state->bt_guest_ring);
268 attachfail_ringinit:
269 xpvtap_drv_fini(state);
270 return (DDI_FAILURE);
275 * xpvtap_detach()
277 static int
278 xpvtap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
280 xpvtap_state_t *state;
281 int instance;
284 instance = ddi_get_instance(dip);
285 state = ddi_get_soft_state(xpvtap_statep, instance);
286 if (state == NULL) {
287 return (DDI_FAILURE);
290 switch (cmd) {
291 case DDI_DETACH:
292 break;
294 case DDI_SUSPEND:
295 default:
296 return (DDI_FAILURE);
299 xpvtap_user_thread_stop(state);
300 blk_ring_fini(&state->bt_guest_ring);
301 xpvtap_drv_fini(state);
302 ddi_remove_minor_node(dip, NULL);
304 return (DDI_SUCCESS);
309 * xpvtap_getinfo()
311 /*ARGSUSED*/
312 static int
313 xpvtap_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
315 xpvtap_state_t *state;
316 int instance;
317 dev_t dev;
318 int e;
321 dev = (dev_t)arg;
322 instance = getminor(dev);
324 switch (cmd) {
325 case DDI_INFO_DEVT2DEVINFO:
326 state = ddi_get_soft_state(xpvtap_statep, instance);
327 if (state == NULL) {
328 return (DDI_FAILURE);
330 *result = (void *)state->bt_dip;
331 e = DDI_SUCCESS;
332 break;
334 case DDI_INFO_DEVT2INSTANCE:
335 *result = (void *)(uintptr_t)instance;
336 e = DDI_SUCCESS;
337 break;
339 default:
340 e = DDI_FAILURE;
341 break;
344 return (e);
349 * xpvtap_open()
351 /*ARGSUSED*/
352 static int
353 xpvtap_open(dev_t *devp, int flag, int otyp, cred_t *cred)
355 xpvtap_state_t *state;
356 int instance;
359 if (secpolicy_xvm_control(cred)) {
360 return (EPERM);
363 instance = getminor(*devp);
364 state = ddi_get_soft_state(xpvtap_statep, instance);
365 if (state == NULL) {
366 return (ENXIO);
369 /* we should only be opened once */
370 mutex_enter(&state->bt_open.bo_mutex);
371 if (state->bt_open.bo_opened) {
372 mutex_exit(&state->bt_open.bo_mutex);
373 return (EBUSY);
375 state->bt_open.bo_opened = B_TRUE;
376 mutex_exit(&state->bt_open.bo_mutex);
379	 * save the app's address space. We need it for mapping/unmapping grefs,
380	 * since we will be doing that in a separate kernel thread.
382 state->bt_map.um_as = curproc->p_as;
384 return (0);
389 * xpvtap_close()
391 /*ARGSUSED*/
392 static int
393 xpvtap_close(dev_t devp, int flag, int otyp, cred_t *cred)
395 xpvtap_state_t *state;
396 int instance;
399 instance = getminor(devp);
400 state = ddi_get_soft_state(xpvtap_statep, instance);
401 if (state == NULL) {
402 return (ENXIO);
406 * wake thread so it can cleanup and wait for it to exit so we can
407 * be sure it's not in the middle of processing a request/response.
409 mutex_enter(&state->bt_thread.ut_mutex);
410 state->bt_thread.ut_wake = B_TRUE;
411 state->bt_thread.ut_exit = B_TRUE;
412 cv_signal(&state->bt_thread.ut_wake_cv);
413 if (!state->bt_thread.ut_exit_done) {
414 cv_wait(&state->bt_thread.ut_exit_done_cv,
415 &state->bt_thread.ut_mutex);
417 ASSERT(state->bt_thread.ut_exit_done);
418 mutex_exit(&state->bt_thread.ut_mutex);
420 state->bt_map.um_as = NULL;
421 state->bt_map.um_guest_pages = NULL;
424 * when the ring is brought down, a userland hotplug script is run
425 * which tries to bring the userland app down. We'll wait for a bit
426 * for the user app to exit. Notify the thread waiting that the app
427 * has closed the driver.
429 mutex_enter(&state->bt_open.bo_mutex);
430 ASSERT(state->bt_open.bo_opened);
431 state->bt_open.bo_opened = B_FALSE;
432 cv_signal(&state->bt_open.bo_exit_cv);
433 mutex_exit(&state->bt_open.bo_mutex);
435 return (0);
440 * xpvtap_ioctl()
442 /*ARGSUSED*/
443 static int
444 xpvtap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred,
445 int *rval)
447 xpvtap_state_t *state;
448 int instance;
451 if (secpolicy_xvm_control(cred)) {
452 return (EPERM);
455 instance = getminor(dev);
456 if (instance == -1) {
457 return (EBADF);
460 state = ddi_get_soft_state(xpvtap_statep, instance);
461 if (state == NULL) {
462 return (EBADF);
465 switch (cmd) {
466 case XPVTAP_IOCTL_RESP_PUSH:
468 * wake thread, thread handles guest requests and user app
469 * responses.
471 mutex_enter(&state->bt_thread.ut_mutex);
472 state->bt_thread.ut_wake = B_TRUE;
473 cv_signal(&state->bt_thread.ut_wake_cv);
474 mutex_exit(&state->bt_thread.ut_mutex);
475 break;
477 default:
478 cmn_err(CE_WARN, "ioctl(%d) not supported\n", cmd);
479 return (ENXIO);
482 return (0);
487 * xpvtap_segmap()
489 /*ARGSUSED*/
490 static int
491 xpvtap_segmap(dev_t dev, off_t off, struct as *asp, caddr_t *addrp,
492 off_t len, unsigned int prot, unsigned int maxprot, unsigned int flags,
493 cred_t *cred_p)
495 struct segmf_crargs a;
496 xpvtap_state_t *state;
497 int instance;
498 int e;
501 if (secpolicy_xvm_control(cred_p)) {
502 return (EPERM);
505 instance = getminor(dev);
506 state = ddi_get_soft_state(xpvtap_statep, instance);
507 if (state == NULL) {
508 return (EBADF);
511 /* the user app should be doing a MAP_SHARED mapping */
512 if ((flags & MAP_TYPE) != MAP_SHARED) {
513 return (EINVAL);
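	/*
	 * Layout of the app's mapping, summarizing the checks below: segmap
	 * offset 0 is the shared user ring page, handed off to devmap_setup()
	 * and mapped in xpvtap_devmap(); segmap offset PAGESIZE is the gref
	 * buffer of XPVTAP_GREF_BUFSIZE bytes, whose backing store is managed
	 * by segmf.
	 */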
517 * if this is the user ring (offset = 0), devmap it (which ends up in
518 * xpvtap_devmap). devmap will alloc and map the ring into the
519 * app's VA space.
521 if (off == 0) {
522 e = devmap_setup(dev, (offset_t)off, asp, addrp, (size_t)len,
523 prot, maxprot, flags, cred_p);
524 return (e);
527 /* this should be the mmap for the gref pages (offset = PAGESIZE) */
528 if (off != PAGESIZE) {
529 return (EINVAL);
532 /* make sure we get the size we're expecting */
533 if (len != XPVTAP_GREF_BUFSIZE) {
534 return (EINVAL);
538 * reserve user app VA space for the gref pages and use segmf to
539 * manage the backing store for the physical memory. segmf will
540 * map in/out the grefs and fault them in/out.
542 ASSERT(asp == state->bt_map.um_as);
543 as_rangelock(asp);
544 if ((flags & MAP_FIXED) == 0) {
545 map_addr(addrp, len, 0, 0, flags);
546 if (*addrp == NULL) {
547 as_rangeunlock(asp);
548 return (ENOMEM);
550 } else {
551 /* User specified address */
552 (void) as_unmap(asp, *addrp, len);
554 a.dev = dev;
555 a.prot = (uchar_t)prot;
556 a.maxprot = (uchar_t)maxprot;
557 e = as_map(asp, *addrp, len, segmf_create, &a);
558 if (e != 0) {
559 as_rangeunlock(asp);
560 return (e);
562 as_rangeunlock(asp);
565 * Stash user base address, and compute address where the request
566 * array will end up.
568 state->bt_map.um_guest_pages = (caddr_t)*addrp;
569 state->bt_map.um_guest_size = (size_t)len;
571 /* register an as callback so we can cleanup when the app goes away */
572 e = as_add_callback(asp, xpvtap_segmf_unregister, state,
573 AS_UNMAP_EVENT, *addrp, len, KM_SLEEP);
574 if (e != 0) {
575 (void) as_unmap(asp, *addrp, len);
576 return (EINVAL);
579 /* wake thread to see if there are requests already queued up */
580 mutex_enter(&state->bt_thread.ut_mutex);
581 state->bt_thread.ut_wake = B_TRUE;
582 cv_signal(&state->bt_thread.ut_wake_cv);
583 mutex_exit(&state->bt_thread.ut_mutex);
585 return (0);
590 * xpvtap_devmap()
592 /*ARGSUSED*/
593 static int
594 xpvtap_devmap(dev_t dev, devmap_cookie_t dhp, offset_t off, size_t len,
595 size_t *maplen, uint_t model)
597 xpvtap_user_ring_t *usring;
598 xpvtap_state_t *state;
599 int instance;
600 int e;
603 instance = getminor(dev);
604 state = ddi_get_soft_state(xpvtap_statep, instance);
605 if (state == NULL) {
606 return (EBADF);
609 /* we should only get here if the offset was == 0 */
610 if (off != 0) {
611 return (EINVAL);
614 /* we should only be mapping in one page */
615 if (len != PAGESIZE) {
616 return (EINVAL);
620 * we already allocated the user ring during driver attach, all we
621 * need to do is map it into the user app's VA.
623 usring = &state->bt_user_ring;
624 e = devmap_umem_setup(dhp, state->bt_dip, NULL, usring->ur_cookie, 0,
625 PAGESIZE, PROT_ALL, DEVMAP_DEFAULTS, NULL);
626 if (e < 0) {
627 return (e);
630	/* return the size to complete the devmap */
631 *maplen = PAGESIZE;
633 return (0);
638 * xpvtap_chpoll()
640 static int
641 xpvtap_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
642 struct pollhead **phpp)
644 xpvtap_user_ring_t *usring;
645 xpvtap_state_t *state;
646 int instance;
649 instance = getminor(dev);
650 if (instance == -1) {
651 return (EBADF);
653 state = ddi_get_soft_state(xpvtap_statep, instance);
654 if (state == NULL) {
655 return (EBADF);
658 if (((events & (POLLIN | POLLRDNORM)) == 0) && !anyyet) {
659 *reventsp = 0;
660 return (EINVAL);
664	 * if we pushed requests on the user ring since the last poll, wake up
665	 * the user app
667 usring = &state->bt_user_ring;
668 if (usring->ur_prod_polled != usring->ur_ring.req_prod_pvt) {
671	 * XXX - is this faster here or in xpvtap_user_request_push()?
672	 * Preliminary data says here. Because of fewer membars, or because
673	 * the user thread will spin on poll requests before getting to
674	 * responses?
676 RING_PUSH_REQUESTS(&usring->ur_ring);
678 usring->ur_prod_polled = usring->ur_ring.sring->req_prod;
679 *reventsp = POLLIN | POLLRDNORM;
681 /* no new requests */
682 } else {
683 *reventsp = 0;
684 if (!anyyet) {
685 *phpp = &state->bt_pollhead;
689 return (0);
694 * xpvtap_drv_init()
696 static xpvtap_state_t *
697 xpvtap_drv_init(int instance)
699 xpvtap_state_t *state;
700 int e;
703 e = ddi_soft_state_zalloc(xpvtap_statep, instance);
704 if (e != DDI_SUCCESS) {
705 return (NULL);
707 state = ddi_get_soft_state(xpvtap_statep, instance);
708 if (state == NULL) {
709 goto drvinitfail_get_soft_state;
712 state->bt_instance = instance;
713 mutex_init(&state->bt_open.bo_mutex, NULL, MUTEX_DRIVER, NULL);
714 cv_init(&state->bt_open.bo_exit_cv, NULL, CV_DRIVER, NULL);
715 state->bt_open.bo_opened = B_FALSE;
716 state->bt_map.um_registered = B_FALSE;
718 /* initialize user ring, thread, mapping state */
719 e = xpvtap_user_init(state);
720 if (e != DDI_SUCCESS) {
721 goto drvinitfail_userinit;
724 return (state);
726 drvinitfail_userinit:
727 cv_destroy(&state->bt_open.bo_exit_cv);
728 mutex_destroy(&state->bt_open.bo_mutex);
729 drvinitfail_get_soft_state:
730 (void) ddi_soft_state_free(xpvtap_statep, instance);
731 return (NULL);
736 * xpvtap_drv_fini()
738 static void
739 xpvtap_drv_fini(xpvtap_state_t *state)
741 xpvtap_user_fini(state);
742 cv_destroy(&state->bt_open.bo_exit_cv);
743 mutex_destroy(&state->bt_open.bo_mutex);
744 (void) ddi_soft_state_free(xpvtap_statep, state->bt_instance);
749 * xpvtap_intr()
750 * this routine will be called when we have a request on the guest ring.
752 static uint_t
753 xpvtap_intr(caddr_t arg)
755 xpvtap_state_t *state;
758 state = (xpvtap_state_t *)arg;
760 /* wake thread, thread handles guest requests and user app responses */
761 mutex_enter(&state->bt_thread.ut_mutex);
762 state->bt_thread.ut_wake = B_TRUE;
763 cv_signal(&state->bt_thread.ut_wake_cv);
764 mutex_exit(&state->bt_thread.ut_mutex);
766 return (DDI_INTR_CLAIMED);
771 * xpvtap_segmf_register()
773 static int
774 xpvtap_segmf_register(xpvtap_state_t *state)
776 struct seg *seg;
777 uint64_t pte_ma;
778 struct as *as;
779 caddr_t uaddr;
780 uint_t pgcnt;
781 int i;
784 as = state->bt_map.um_as;
785 pgcnt = btopr(state->bt_map.um_guest_size);
786 uaddr = state->bt_map.um_guest_pages;
788 if (pgcnt == 0) {
789 return (DDI_FAILURE);
792 AS_LOCK_ENTER(as, RW_READER);
794 seg = as_findseg(as, state->bt_map.um_guest_pages, 0);
795 if ((seg == NULL) || ((uaddr + state->bt_map.um_guest_size) >
796 (seg->s_base + seg->s_size))) {
797 AS_LOCK_EXIT(as);
798 return (DDI_FAILURE);
802 * lock down the htables so the HAT can't steal them. Register the
803 * PTE MA's for each gref page with seg_mf so we can do user space
804 * gref mappings.
806 for (i = 0; i < pgcnt; i++) {
807 hat_prepare_mapping(as->a_hat, uaddr, &pte_ma);
808 hat_devload(as->a_hat, uaddr, PAGESIZE, (pfn_t)0,
809 PROT_READ | PROT_WRITE | PROT_USER | HAT_UNORDERED_OK,
810 HAT_LOAD_NOCONSIST | HAT_LOAD_LOCK);
811 hat_release_mapping(as->a_hat, uaddr);
812 segmf_add_gref_pte(seg, uaddr, pte_ma);
813 uaddr += PAGESIZE;
816 state->bt_map.um_registered = B_TRUE;
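	/*
	 * um_registered is consulted in xpvtap_user_request_map() (register
	 * the PTEs with segmf on first use) and in xpvtap_segmf_unregister()
	 * (skip the teardown if the PTEs were never registered).
	 */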
818 AS_LOCK_EXIT(as);
820 return (DDI_SUCCESS);
825 * xpvtap_segmf_unregister()
826 * as_callback routine
828 /*ARGSUSED*/
829 static void
830 xpvtap_segmf_unregister(struct as *as, void *arg, uint_t event)
832 xpvtap_state_t *state;
833 caddr_t uaddr;
834 uint_t pgcnt;
835 int i;
838 state = (xpvtap_state_t *)arg;
839 if (!state->bt_map.um_registered) {
840 /* remove the callback (which is this routine) */
841 (void) as_delete_callback(as, arg);
842 return;
845 pgcnt = btopr(state->bt_map.um_guest_size);
846 uaddr = state->bt_map.um_guest_pages;
848 /* unmap any outstanding req's grefs */
849 xpvtap_rs_flush(state->bt_map.um_rs, xpvtap_user_request_unmap, state);
851 /* Unlock the gref pages */
852 for (i = 0; i < pgcnt; i++) {
853 AS_LOCK_ENTER(as, RW_WRITER);
854 hat_prepare_mapping(as->a_hat, uaddr, NULL);
855 hat_unload(as->a_hat, uaddr, PAGESIZE, HAT_UNLOAD_UNLOCK);
856 hat_release_mapping(as->a_hat, uaddr);
857 AS_LOCK_EXIT(as);
858 uaddr += PAGESIZE;
861 /* remove the callback (which is this routine) */
862 (void) as_delete_callback(as, arg);
864 state->bt_map.um_registered = B_FALSE;
869 * xpvtap_user_init()
871 static int
872 xpvtap_user_init(xpvtap_state_t *state)
874 xpvtap_user_map_t *map;
875 int e;
878 map = &state->bt_map;
880 /* Setup the ring between the driver and user app */
881 e = xpvtap_user_ring_init(state);
882 if (e != DDI_SUCCESS) {
883 return (DDI_FAILURE);
887 * the user ring can handle BLKIF_RING_SIZE outstanding requests. This
888 * is the same number of requests as the guest ring. Initialize the
889 * state we use to track request IDs to the user app. These IDs will
890	 * also identify which group of gref pages corresponds to the
891 * request.
893 xpvtap_rs_init(0, (BLKIF_RING_SIZE - 1), &map->um_rs);
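	/*
	 * Each ID allocated from um_rs doubles as the index into
	 * um_outstanding_reqs[] and, via XPVTAP_GREF_REQADDR(), selects the
	 * group of gref pages used for that request.
	 */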
896	 * allocate the space to store a copy of each outstanding request. We
897 * will need to reference the ID and the number of segments when we
898 * get the response from the user app.
900 map->um_outstanding_reqs = kmem_zalloc(
901 sizeof (*map->um_outstanding_reqs) * BLKIF_RING_SIZE,
902 KM_SLEEP);
905 * initialize the thread we use to process guest requests and user
906 * responses.
908 e = xpvtap_user_thread_init(state);
909 if (e != DDI_SUCCESS) {
910 goto userinitfail_user_thread_init;
913 return (DDI_SUCCESS);
915 userinitfail_user_thread_init:
916 xpvtap_rs_fini(&map->um_rs);
917 kmem_free(map->um_outstanding_reqs,
918 sizeof (*map->um_outstanding_reqs) * BLKIF_RING_SIZE);
919 xpvtap_user_ring_fini(state);
920 return (DDI_FAILURE);
925 * xpvtap_user_ring_init()
927 static int
928 xpvtap_user_ring_init(xpvtap_state_t *state)
930 xpvtap_user_ring_t *usring;
933 usring = &state->bt_user_ring;
935	/* allocate and initialize the page for the shared user ring */
936 usring->ur_sring = (blkif_sring_t *)ddi_umem_alloc(PAGESIZE,
937 DDI_UMEM_SLEEP, &usring->ur_cookie);
938 SHARED_RING_INIT(usring->ur_sring);
939 FRONT_RING_INIT(&usring->ur_ring, usring->ur_sring, PAGESIZE);
940 usring->ur_prod_polled = 0;
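	/*
	 * ur_prod_polled records the producer index we had already pushed to
	 * the user app at the last poll; xpvtap_chpoll() compares it against
	 * req_prod_pvt to decide whether to report POLLIN.
	 */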
942 return (DDI_SUCCESS);
947 * xpvtap_user_thread_init()
949 static int
950 xpvtap_user_thread_init(xpvtap_state_t *state)
952 xpvtap_user_thread_t *thread;
953 char taskqname[32];
956 thread = &state->bt_thread;
958 mutex_init(&thread->ut_mutex, NULL, MUTEX_DRIVER, NULL);
959 cv_init(&thread->ut_wake_cv, NULL, CV_DRIVER, NULL);
960 cv_init(&thread->ut_exit_done_cv, NULL, CV_DRIVER, NULL);
961 thread->ut_wake = B_FALSE;
962 thread->ut_exit = B_FALSE;
963 thread->ut_exit_done = B_TRUE;
965 /* create but don't start the user thread */
966	(void) sprintf(taskqname, "xpvtap_%d", state->bt_instance);
967 thread->ut_taskq = ddi_taskq_create(state->bt_dip, taskqname, 1,
968 TASKQ_DEFAULTPRI, 0);
969 if (thread->ut_taskq == NULL) {
970 goto userinitthrfail_taskq_create;
973 return (DDI_SUCCESS);
975 userinitthrfail_taskq_dispatch:
976 ddi_taskq_destroy(thread->ut_taskq);
977 userinitthrfail_taskq_create:
978 cv_destroy(&thread->ut_exit_done_cv);
979 cv_destroy(&thread->ut_wake_cv);
980 mutex_destroy(&thread->ut_mutex);
982 return (DDI_FAILURE);
987 * xpvtap_user_thread_start()
989 static void
990 xpvtap_user_thread_start(caddr_t arg)
992 xpvtap_user_thread_t *thread;
993 xpvtap_state_t *state;
994 int e;
997 state = (xpvtap_state_t *)arg;
998 thread = &state->bt_thread;
1000 /* start the user thread */
1001 thread->ut_exit_done = B_FALSE;
1002 e = ddi_taskq_dispatch(thread->ut_taskq, xpvtap_user_thread, state,
1003 DDI_SLEEP);
1004 if (e != DDI_SUCCESS) {
1005 thread->ut_exit_done = B_TRUE;
1006 cmn_err(CE_WARN, "Unable to start user thread\n");
1012 * xpvtap_user_thread_stop()
1014 static void
1015 xpvtap_user_thread_stop(xpvtap_state_t *state)
1017 /* wake thread so it can exit */
1018 mutex_enter(&state->bt_thread.ut_mutex);
1019 state->bt_thread.ut_wake = B_TRUE;
1020 state->bt_thread.ut_exit = B_TRUE;
1021 cv_signal(&state->bt_thread.ut_wake_cv);
1022 if (!state->bt_thread.ut_exit_done) {
1023 cv_wait(&state->bt_thread.ut_exit_done_cv,
1024 &state->bt_thread.ut_mutex);
1026 mutex_exit(&state->bt_thread.ut_mutex);
1027 ASSERT(state->bt_thread.ut_exit_done);
1032 * xpvtap_user_fini()
1034 static void
1035 xpvtap_user_fini(xpvtap_state_t *state)
1037 xpvtap_user_map_t *map;
1040 map = &state->bt_map;
1042 xpvtap_user_thread_fini(state);
1043 xpvtap_rs_fini(&map->um_rs);
1044 kmem_free(map->um_outstanding_reqs,
1045 sizeof (*map->um_outstanding_reqs) * BLKIF_RING_SIZE);
1046 xpvtap_user_ring_fini(state);
1051 * xpvtap_user_ring_fini()
1053 static void
1054 xpvtap_user_ring_fini(xpvtap_state_t *state)
1056 ddi_umem_free(state->bt_user_ring.ur_cookie);
1061 * xpvtap_user_thread_fini()
1063 static void
1064 xpvtap_user_thread_fini(xpvtap_state_t *state)
1066 ddi_taskq_destroy(state->bt_thread.ut_taskq);
1067 cv_destroy(&state->bt_thread.ut_exit_done_cv);
1068 cv_destroy(&state->bt_thread.ut_wake_cv);
1069 mutex_destroy(&state->bt_thread.ut_mutex);
1074 * xpvtap_user_thread()
1076 static void
1077 xpvtap_user_thread(void *arg)
1079 xpvtap_user_thread_t *thread;
1080 blkif_response_t resp;
1081 xpvtap_state_t *state;
1082 blkif_request_t req;
1083 boolean_t b;
1084 uint_t uid;
1085 int e;
1088 state = (xpvtap_state_t *)arg;
1089 thread = &state->bt_thread;
1091 xpvtap_thread_start:
1092 /* See if we are supposed to exit */
1093 mutex_enter(&thread->ut_mutex);
1094 if (thread->ut_exit) {
1095 thread->ut_exit_done = B_TRUE;
1096 cv_signal(&state->bt_thread.ut_exit_done_cv);
1097 mutex_exit(&thread->ut_mutex);
1098 return;
1102 * if we aren't supposed to be awake, wait until someone wakes us.
1103 * when we wake up, check for a kill or someone telling us to exit.
1105 if (!thread->ut_wake) {
1106 e = cv_wait_sig(&thread->ut_wake_cv, &thread->ut_mutex);
1107 if ((e == 0) || (thread->ut_exit)) {
1108 thread->ut_exit = B_TRUE;
1109 mutex_exit(&thread->ut_mutex);
1110 goto xpvtap_thread_start;
1114 /* if someone didn't wake us, go back to the start of the thread */
1115 if (!thread->ut_wake) {
1116 mutex_exit(&thread->ut_mutex);
1117 goto xpvtap_thread_start;
1120 /* we are awake */
1121 thread->ut_wake = B_FALSE;
1122 mutex_exit(&thread->ut_mutex);
1124 /* process requests from the guest */
1125 do {
1127 * check for requests from the guest. if we don't have any,
1128 * break out of the loop.
1130 e = blk_ring_request_get(state->bt_guest_ring, &req);
1131 if (e == B_FALSE) {
1132 break;
1135 /* we got a request, map the grefs into the user app's VA */
1136 e = xpvtap_user_request_map(state, &req, &uid);
1137 if (e != DDI_SUCCESS) {
1139 * If we couldn't map the request (e.g. user app hasn't
1140 * opened the device yet), requeue it and try again
1141 * later
1143 blk_ring_request_requeue(state->bt_guest_ring);
1144 break;
1147 /* push the request to the user app */
1148 e = xpvtap_user_request_push(state, &req, uid);
1149 if (e != DDI_SUCCESS) {
1150 resp.id = req.id;
1151 resp.operation = req.operation;
1152 resp.status = BLKIF_RSP_ERROR;
1153 blk_ring_response_put(state->bt_guest_ring, &resp);
1155 } while (!thread->ut_exit);
1157	/* process responses from the user app */
1158 do {
1160 * check for responses from the user app. if we don't have any,
1161 * break out of the loop.
1163 b = xpvtap_user_response_get(state, &resp, &uid);
1164 if (b != B_TRUE) {
1165 break;
1169 * if we got a response, unmap the grefs from the matching
1170 * request.
1172 xpvtap_user_request_unmap(state, uid);
1174 /* push the response to the guest */
1175 blk_ring_response_put(state->bt_guest_ring, &resp);
1176 } while (!thread->ut_exit);
1178 goto xpvtap_thread_start;
1183 * xpvtap_user_request_map()
1185 static int
1186 xpvtap_user_request_map(xpvtap_state_t *state, blkif_request_t *req,
1187 uint_t *uid)
1189 grant_ref_t gref[BLKIF_MAX_SEGMENTS_PER_REQUEST];
1190 struct seg *seg;
1191 struct as *as;
1192 domid_t domid;
1193 caddr_t uaddr;
1194 uint_t flags;
1195 int i;
1196 int e;
1199 domid = xvdi_get_oeid(state->bt_dip);
1201 as = state->bt_map.um_as;
1202 if ((as == NULL) || (state->bt_map.um_guest_pages == NULL)) {
1203 return (DDI_FAILURE);
1206 /* has to happen after segmap returns */
1207 if (!state->bt_map.um_registered) {
1208 /* register the pte's with segmf */
1209 e = xpvtap_segmf_register(state);
1210 if (e != DDI_SUCCESS) {
1211 return (DDI_FAILURE);
1215 /* alloc an ID for the user ring */
1216 e = xpvtap_rs_alloc(state->bt_map.um_rs, uid);
1217 if (e != DDI_SUCCESS) {
1218 return (DDI_FAILURE);
1221 /* if we don't have any segments to map, we're done */
1222 if ((req->operation == BLKIF_OP_WRITE_BARRIER) ||
1223 (req->operation == BLKIF_OP_FLUSH_DISKCACHE) ||
1224 (req->nr_segments == 0)) {
1225 return (DDI_SUCCESS);
1228	/* get the app's gref address */
1229 uaddr = XPVTAP_GREF_REQADDR(state->bt_map.um_guest_pages, *uid);
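	/*
	 * XPVTAP_GREF_REQADDR() (defined in xpvtap.h) presumably returns the
	 * start of the per-request group of gref pages within um_guest_pages,
	 * i.e. room for up to BLKIF_MAX_SEGMENTS_PER_REQUEST pages per ID.
	 */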
1231 AS_LOCK_ENTER(as, RW_READER);
1232 seg = as_findseg(as, state->bt_map.um_guest_pages, 0);
1233 if ((seg == NULL) || ((uaddr + mmu_ptob(req->nr_segments)) >
1234 (seg->s_base + seg->s_size))) {
1235 AS_LOCK_EXIT(as);
1236 return (DDI_FAILURE);
1239 /* if we are reading from disk, we are writing into memory */
1240 flags = 0;
1241 if (req->operation == BLKIF_OP_READ) {
1242 flags |= SEGMF_GREF_WR;
1245 /* Load the grefs into seg_mf */
1246 for (i = 0; i < req->nr_segments; i++) {
1247 gref[i] = req->seg[i].gref;
1249 (void) segmf_add_grefs(seg, uaddr, flags, gref, req->nr_segments,
1250 domid);
1252 AS_LOCK_EXIT(as);
1254 return (DDI_SUCCESS);
1259 * xpvtap_user_request_push()
1261 static int
1262 xpvtap_user_request_push(xpvtap_state_t *state, blkif_request_t *req,
1263 uint_t uid)
1265 blkif_request_t *outstanding_req;
1266 blkif_front_ring_t *uring;
1267 blkif_request_t *target;
1268 xpvtap_user_map_t *map;
1271 uring = &state->bt_user_ring.ur_ring;
1272 map = &state->bt_map;
1274 target = RING_GET_REQUEST(uring, uring->req_prod_pvt);
1277	 * Save the request from the frontend. It is used for ID mapping and
1278	 * for the unmap on response/cleanup.
1280 outstanding_req = &map->um_outstanding_reqs[uid];
1281 bcopy(req, outstanding_req, sizeof (*outstanding_req));
1283 /* put the request on the user ring */
1284 bcopy(req, target, sizeof (*req));
1285 target->id = (uint64_t)uid;
1286 uring->req_prod_pvt++;
1288 pollwakeup(&state->bt_pollhead, POLLIN | POLLRDNORM);
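	/*
	 * Note that the private producer index is pushed out to the shared
	 * user ring in xpvtap_chpoll() (RING_PUSH_REQUESTS), not here; see the
	 * XXX comment there.
	 */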
1290 return (DDI_SUCCESS);
1294 static void
1295 xpvtap_user_request_unmap(xpvtap_state_t *state, uint_t uid)
1297 blkif_request_t *req;
1298 struct seg *seg;
1299 struct as *as;
1300 caddr_t uaddr;
1301 int e;
1304 as = state->bt_map.um_as;
1305 if (as == NULL) {
1306 return;
1309 /* get a copy of the original request */
1310 req = &state->bt_map.um_outstanding_reqs[uid];
1312 /* unmap the grefs for this request */
1313 if ((req->operation != BLKIF_OP_WRITE_BARRIER) &&
1314 (req->operation != BLKIF_OP_FLUSH_DISKCACHE) &&
1315 (req->nr_segments != 0)) {
1316 uaddr = XPVTAP_GREF_REQADDR(state->bt_map.um_guest_pages, uid);
1317 AS_LOCK_ENTER(as, RW_READER);
1318 seg = as_findseg(as, state->bt_map.um_guest_pages, 0);
1319 if ((seg == NULL) || ((uaddr + mmu_ptob(req->nr_segments)) >
1320 (seg->s_base + seg->s_size))) {
1321 AS_LOCK_EXIT(as);
1322 xpvtap_rs_free(state->bt_map.um_rs, uid);
1323 return;
1326 e = segmf_release_grefs(seg, uaddr, req->nr_segments);
1327 if (e != 0) {
1328 cmn_err(CE_WARN, "unable to release grefs");
1331 AS_LOCK_EXIT(as);
1334 /* free up the user ring id */
1335 xpvtap_rs_free(state->bt_map.um_rs, uid);
1339 static int
1340 xpvtap_user_response_get(xpvtap_state_t *state, blkif_response_t *resp,
1341 uint_t *uid)
1343 blkif_front_ring_t *uring;
1344 blkif_response_t *target;
1347 uring = &state->bt_user_ring.ur_ring;
1349 if (!RING_HAS_UNCONSUMED_RESPONSES(uring)) {
1350 return (B_FALSE);
1353 target = NULL;
1354 target = RING_GET_RESPONSE(uring, uring->rsp_cons);
1355 if (target == NULL) {
1356 return (B_FALSE);
1359 /* copy out the user app response */
1360 bcopy(target, resp, sizeof (*resp));
1361 uring->rsp_cons++;
1363	/* restore the guest's id from the original request */
1364 *uid = (uint_t)resp->id;
1365 resp->id = state->bt_map.um_outstanding_reqs[*uid].id;
1367 return (B_TRUE);
1372 * xpvtap_user_app_stop()
1374 static void xpvtap_user_app_stop(caddr_t arg)
1376 xpvtap_state_t *state;
1377 clock_t rc;
1379 state = (xpvtap_state_t *)arg;
1382 * Give the app 10 secs to exit. If it doesn't exit, it's not a serious
1383 * problem, we just won't auto-detach the driver.
1385 mutex_enter(&state->bt_open.bo_mutex);
1386 if (state->bt_open.bo_opened) {
1387 rc = cv_reltimedwait(&state->bt_open.bo_exit_cv,
1388 &state->bt_open.bo_mutex, drv_usectohz(10000000),
1389 TR_CLOCK_TICK);
1390 if (rc <= 0) {
1391 cmn_err(CE_NOTE, "!user process still has driver open, "
1392 "deferring detach\n");
1395 mutex_exit(&state->bt_open.bo_mutex);
1400 * xpvtap_rs_init()
1401 * Initialize the resource structure. init() returns a handle to be used
1402 * for the rest of the resource functions. This code is written assuming
1403 * that min_val will be close to 0. Therefore, we will allocate the free
1404 * buffer only taking max_val into account.
1406 static void
1407 xpvtap_rs_init(uint_t min_val, uint_t max_val, xpvtap_rs_hdl_t *handle)
1409 xpvtap_rs_t *rstruct;
1410 uint_t array_size;
1411 uint_t index;
1414 ASSERT(handle != NULL);
1415 ASSERT(min_val < max_val);
1417 /* alloc space for resource structure */
1418 rstruct = kmem_alloc(sizeof (xpvtap_rs_t), KM_SLEEP);
1421	 * Test to see if the max value is a multiple of 64. If so, we don't
1422	 * need to allocate an extra 64-bit word. Allocate space for the free
1423	 * bitmap (8 bytes per uint64_t).
1425 if ((max_val & 0x3F) == 0) {
1426 rstruct->rs_free_size = (max_val >> 6) * 8;
1427 } else {
1428 rstruct->rs_free_size = ((max_val >> 6) + 1) * 8;
1430 rstruct->rs_free = kmem_alloc(rstruct->rs_free_size, KM_SLEEP);
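	/*
	 * Worked example (a sketch): for the call in xpvtap_user_init(),
	 * min_val = 0 and max_val = BLKIF_RING_SIZE - 1. If BLKIF_RING_SIZE
	 * is 32, then (31 & 0x3F) != 0, so rs_free_size =
	 * ((31 >> 6) + 1) * 8 = 8 bytes, i.e. a single uint64_t bitmap
	 * covering IDs 0 through 31.
	 */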
1432 /* Initialize resource structure */
1433 rstruct->rs_min = min_val;
1434 rstruct->rs_last = min_val;
1435 rstruct->rs_max = max_val;
1436 mutex_init(&rstruct->rs_mutex, NULL, MUTEX_DRIVER, NULL);
1437 rstruct->rs_flushing = B_FALSE;
1439 /* Mark all resources as free */
1440 array_size = rstruct->rs_free_size >> 3;
1441 for (index = 0; index < array_size; index++) {
1442 rstruct->rs_free[index] = (uint64_t)0xFFFFFFFFFFFFFFFF;
1445 /* setup handle which is returned from this function */
1446 *handle = rstruct;
1451 * xpvtap_rs_fini()
1452 * Frees up the space allocated in init(). Notice that a pointer to the
1453 * handle is used for the parameter. fini() will set the handle to NULL
1454 * before returning.
1456 static void
1457 xpvtap_rs_fini(xpvtap_rs_hdl_t *handle)
1459 xpvtap_rs_t *rstruct;
1462 ASSERT(handle != NULL);
1464 rstruct = (xpvtap_rs_t *)*handle;
1466 mutex_destroy(&rstruct->rs_mutex);
1467 kmem_free(rstruct->rs_free, rstruct->rs_free_size);
1468 kmem_free(rstruct, sizeof (xpvtap_rs_t));
1470 /* set handle to null. This helps catch bugs. */
1471 *handle = NULL;
1476 * xpvtap_rs_alloc()
1477 * alloc a resource. If alloc fails, we are out of resources.
1479 static int
1480 xpvtap_rs_alloc(xpvtap_rs_hdl_t handle, uint_t *resource)
1482 xpvtap_rs_t *rstruct;
1483 uint_t array_idx;
1484 uint64_t free;
1485 uint_t index;
1486 uint_t last;
1487 uint_t min;
1488 uint_t max;
1491 ASSERT(handle != NULL);
1492 ASSERT(resource != NULL);
1494 rstruct = (xpvtap_rs_t *)handle;
1496 mutex_enter(&rstruct->rs_mutex);
1497 min = rstruct->rs_min;
1498 max = rstruct->rs_max;
1501 * Find a free resource. This will return out of the loop once it finds
1502 * a free resource. There are a total of 'max'-'min'+1 resources.
1503 * Performs a round robin allocation.
1505 for (index = min; index <= max; index++) {
1507 array_idx = rstruct->rs_last >> 6;
1508 free = rstruct->rs_free[array_idx];
1509 last = rstruct->rs_last & 0x3F;
1511 /* if the next resource to check is free */
1512 if ((free & ((uint64_t)1 << last)) != 0) {
1513 /* we are using this resource */
1514 *resource = rstruct->rs_last;
1516 /* take it out of the free list */
1517 rstruct->rs_free[array_idx] &= ~((uint64_t)1 << last);
1520 * increment the last count so we start checking the
1521 * next resource on the next alloc(). Note the rollover
1522 * at 'max'+1.
1524 rstruct->rs_last++;
1525 if (rstruct->rs_last > max) {
1526 rstruct->rs_last = rstruct->rs_min;
1529 /* unlock the resource structure */
1530 mutex_exit(&rstruct->rs_mutex);
1532 return (DDI_SUCCESS);
1536 * This resource is not free, lets go to the next one. Note the
1537 * rollover at 'max'.
1539 rstruct->rs_last++;
1540 if (rstruct->rs_last > max) {
1541 rstruct->rs_last = rstruct->rs_min;
1545 mutex_exit(&rstruct->rs_mutex);
1547 return (DDI_FAILURE);
1552 * xpvtap_rs_free()
1553 * Free the previously alloc'd resource. Once a resource has been free'd,
1554 * it can be used again when alloc is called.
1556 static void
1557 xpvtap_rs_free(xpvtap_rs_hdl_t handle, uint_t resource)
1559 xpvtap_rs_t *rstruct;
1560 uint_t array_idx;
1561 uint_t offset;
1564 ASSERT(handle != NULL);
1566 rstruct = (xpvtap_rs_t *)handle;
1567 ASSERT(resource >= rstruct->rs_min);
1568 ASSERT(resource <= rstruct->rs_max);
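	/*
	 * When called from the xpvtap_rs_flush() callback path (rs_flushing
	 * is set), the flush routine already holds rs_mutex, so don't try to
	 * take it again here.
	 */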
1570 if (!rstruct->rs_flushing) {
1571 mutex_enter(&rstruct->rs_mutex);
1574 /* Put the resource back in the free list */
1575 array_idx = resource >> 6;
1576 offset = resource & 0x3F;
1577 rstruct->rs_free[array_idx] |= ((uint64_t)1 << offset);
1579 if (!rstruct->rs_flushing) {
1580 mutex_exit(&rstruct->rs_mutex);
1586 * xpvtap_rs_flush()
1588 static void
1589 xpvtap_rs_flush(xpvtap_rs_hdl_t handle, xpvtap_rs_cleanup_t callback,
1590 void *arg)
1592 xpvtap_rs_t *rstruct;
1593 uint_t array_idx;
1594 uint64_t free;
1595 uint_t index;
1596 uint_t last;
1597 uint_t min;
1598 uint_t max;
1601 ASSERT(handle != NULL);
1603 rstruct = (xpvtap_rs_t *)handle;
1605 mutex_enter(&rstruct->rs_mutex);
1606 min = rstruct->rs_min;
1607 max = rstruct->rs_max;
1609 rstruct->rs_flushing = B_TRUE;
1612 * for all resources not free, call the callback routine to clean it
1613 * up.
1615 for (index = min; index <= max; index++) {
1617 array_idx = rstruct->rs_last >> 6;
1618 free = rstruct->rs_free[array_idx];
1619 last = rstruct->rs_last & 0x3F;
1621 /* if the next resource to check is not free */
1622 if ((free & ((uint64_t)1 << last)) == 0) {
1623 /* call the callback to cleanup */
1624 (*callback)(arg, rstruct->rs_last);
1626 /* put it back in the free list */
1627 rstruct->rs_free[array_idx] |= ((uint64_t)1 << last);
1630 /* go to the next one. Note the rollover at 'max' */
1631 rstruct->rs_last++;
1632 if (rstruct->rs_last > max) {
1633 rstruct->rs_last = rstruct->rs_min;
1637 mutex_exit(&rstruct->rs_mutex);