/******************************************************************************
 *
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  drivers/block/xen-blkfront.c
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Copyright (c) 2005, Christopher Clark
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/delay.h>
#include <linux/freezer.h>

#include <xen/events.h>

#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include "common.h"

#define WRITE_BARRIER	(REQ_WRITE | REQ_FLUSH | REQ_FUA)

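/*
 * A Xen write-barrier request is issued as an ordinary write with REQ_FLUSH
 * and REQ_FUA set: the drive's write cache is flushed before the write, and
 * the payload itself must reach stable storage before completion.
 */
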
/*
 * These are rather arbitrary. They are fairly large because adjacent requests
 * pulled from a communication ring are quite likely to end up being part of
 * the same scatter/gather request at the disc.
 *
 * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
 *
 * This will increase the chances of being able to write whole tracks.
 * 64 should be enough to keep us competitive with Linux.
 */
static int blkif_reqs = 64;
module_param_named(reqs, blkif_reqs, int, 0);
MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");

/* Run-time switchable: /sys/module/blkback/parameters/ */
static int log_stats = 0;
static int debug_lvl = 0;
module_param(log_stats, int, 0644);
module_param(debug_lvl, int, 0644);

/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each buffer_head that completes decrements
 * the pendcnt towards zero. When it hits zero, the specified domain has a
 * response queued for it, with the saved 'id' passed back.
 */
typedef struct {
	blkif_t		*blkif;
	u64		id;
	int		nr_pages;
	atomic_t	pendcnt;
	unsigned short	operation;
	int		status;
	struct list_head free_list;
} pending_req_t;

#define BLKBACK_INVALID_HANDLE (~0)

struct xen_blkbk {
	pending_req_t		*pending_reqs;
	/* List of all 'pending_req' available */
	struct list_head	pending_free;
	/* And its spinlock. */
	spinlock_t		pending_free_lock;
	wait_queue_head_t	pending_free_wq;
	/* The list of all pages that are available. */
	struct page		**pending_pages;
	/* And the grant handles that are available. */
	grant_handle_t		*pending_grant_handles;
};

static struct xen_blkbk *blkbk;

/*
 * Little helpful macro to figure out the index and virtual address of the
 * pending_pages[..]. For each 'pending_req' we have up to
 * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through
 * 10 and would index in the pending_pages[..].
 */
static inline int vaddr_pagenr(pending_req_t *req, int seg)
{
	return (req - blkbk->pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
}

#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]

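/*
 * Example: with BLKIF_MAX_SEGMENTS_PER_REQUEST == 11, segment 3 of the
 * pending_req at index 2 in pending_reqs[] lands at
 * pending_pages[2 * 11 + 3], i.e. pending_pages[25].
 */
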
static inline unsigned long vaddr(pending_req_t *req, int seg)
{
	unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg));
	return (unsigned long)pfn_to_kaddr(pfn);
}

#define pending_handle(_req, _seg) \
	(blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)])

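/*
 * pending_page() and pending_handle() share the same index calculation, so
 * the mapped page and its grant handle for a given segment always live in
 * the same slot of their respective arrays.
 */
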
static int do_block_io_op(blkif_t *blkif);
static void dispatch_rw_block_io(blkif_t *blkif,
				 struct blkif_request *req,
				 pending_req_t *pending_req);
static void make_response(blkif_t *blkif, u64 id,
			  unsigned short op, int st);

/*
 * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
 */
static pending_req_t *alloc_req(void)
{
	pending_req_t *req = NULL;
	unsigned long flags;

	spin_lock_irqsave(&blkbk->pending_free_lock, flags);
	if (!list_empty(&blkbk->pending_free)) {
		req = list_entry(blkbk->pending_free.next, pending_req_t,
				 free_list);
		list_del(&req->free_list);
	}
	spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
	return req;
}

/*
 * Return the 'pending_req' structure back to the freepool. We also
 * wake up the thread if it was waiting for a free page.
 */
static void free_req(pending_req_t *req)
{
	unsigned long flags;
	int was_empty;

	spin_lock_irqsave(&blkbk->pending_free_lock, flags);
	was_empty = list_empty(&blkbk->pending_free);
	list_add(&req->free_list, &blkbk->pending_free);
	spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
	if (was_empty)
		wake_up(&blkbk->pending_free_wq);
}

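/*
 * Note that free_req() only issues the wake_up() when the free list was
 * empty, since that is the only case in which blkif_schedule() can actually
 * be sleeping on pending_free_wq waiting for a request to come back.
 */
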
/*
 * Give back a reference count on the underlying storage.
 * It is OK to call this function multiple times, as it resets the plug
 * to NULL when it is done on the first call.
 */
static void unplug_queue(blkif_t *blkif)
{
	if (blkif->plug == NULL)
		return;
	if (blkif->plug->unplug_fn)
		blkif->plug->unplug_fn(blkif->plug);
	blk_put_queue(blkif->plug);
	blkif->plug = NULL;
}

/*
 * Take a reference count on the underlying storage.
 * It is OK to call this multiple times as we check to make sure
 * not to double reference. We also drop the old reference when the
 * bdev corresponds to another queue.
 */
static void plug_queue(blkif_t *blkif, struct block_device *bdev)
{
	struct request_queue *q = bdev_get_queue(bdev);

	if (q == blkif->plug)
		return;
	unplug_queue(blkif);
	blk_get_queue(q);
	blkif->plug = q;
}

/*
 * Unmap the grant references, and also remove the M2P over-rides
 * used in the 'pending_req'.
 */
static void fast_flush_area(pending_req_t *req)
{
	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	unsigned int i, invcount = 0;
	grant_handle_t handle;
	int ret;

	for (i = 0; i < req->nr_pages; i++) {
		handle = pending_handle(req, i);
		if (handle == BLKBACK_INVALID_HANDLE)
			continue;
		gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
				    GNTMAP_host_map, handle);
		pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
		invcount++;
	}

	ret = HYPERVISOR_grant_table_op(
		GNTTABOP_unmap_grant_ref, unmap, invcount);
	BUG_ON(ret);
	/* Note, we use invcount, not nr_pages, so we can't index
	 * using vaddr(req, i). */
	for (i = 0; i < invcount; i++) {
		ret = m2p_remove_override(
			virt_to_page(unmap[i].host_addr), false);
		if (ret) {
			printk(KERN_ALERT "Failed to remove M2P override for "
			       "%lx\n", (unsigned long)unmap[i].host_addr);
			continue;
		}
	}
}

/******************************************************************
 * SCHEDULER FUNCTIONS
 */

static void print_stats(blkif_t *blkif)
{
	printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | br %4d\n",
	       current->comm, blkif->st_oo_req,
	       blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req);
	blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
	blkif->st_rd_req = 0;
	blkif->st_wr_req = 0;
	blkif->st_oo_req = 0;
}

int blkif_schedule(void *arg)
{
	blkif_t *blkif = arg;
	struct vbd *vbd = &blkif->vbd;

	if (debug_lvl)
		printk(KERN_DEBUG "%s: started\n", current->comm);

	while (!kthread_should_stop()) {
		if (try_to_freeze())
			continue;
		if (unlikely(vbd->size != vbd_size(vbd)))
			vbd_resize(blkif);

		wait_event_interruptible(
			blkif->wq,
			blkif->waiting_reqs || kthread_should_stop());
		wait_event_interruptible(
			blkbk->pending_free_wq,
			!list_empty(&blkbk->pending_free) ||
			kthread_should_stop());

		blkif->waiting_reqs = 0;
		smp_mb(); /* clear flag *before* checking for work */
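
		/*
		 * Because waiting_reqs is cleared before the ring is drained,
		 * a notification that arrives while do_block_io_op() runs
		 * simply sets the flag again and the loop goes around once
		 * more instead of sleeping with work pending.
		 */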
		if (do_block_io_op(blkif))
			blkif->waiting_reqs = 1;
		unplug_queue(blkif);

		if (log_stats && time_after(jiffies, blkif->st_print))
			print_stats(blkif);
	}

	if (log_stats)
		print_stats(blkif);
	if (debug_lvl)
		printk(KERN_DEBUG "%s: exiting\n", current->comm);

	blkif->xenblkd = NULL;

	return 0;
}

/*
 * Completion callback on the bio's. Called via bio->bi_end_io().
 */
static void __end_block_io_op(pending_req_t *pending_req, int error)
{
	/* An error fails the entire request. */
	if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
	    (error == -EOPNOTSUPP)) {
		DPRINTK("blkback: write barrier op failed, not supported\n");
		blkback_barrier(XBT_NIL, pending_req->blkif->be, 0);
		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
	} else if (error) {
		DPRINTK("Buffer not up-to-date at end of operation, "
			"error=%d\n", error);
		pending_req->status = BLKIF_RSP_ERROR;
	}

	/* If all of the bio's have completed it is time to unmap
	 * the grant references associated with 'request' and provide
	 * the proper response on the ring. */
	if (atomic_dec_and_test(&pending_req->pendcnt)) {
		fast_flush_area(pending_req);
		make_response(pending_req->blkif, pending_req->id,
			      pending_req->operation, pending_req->status);
		blkif_put(pending_req->blkif);
		free_req(pending_req);
	}
}

static void end_block_io_op(struct bio *bio, int error)
{
	__end_block_io_op(bio->bi_private, error);
	bio_put(bio);
}

/*
 * Notification from the guest OS.
 */
static void blkif_notify_work(blkif_t *blkif)
{
	blkif->waiting_reqs = 1;
	wake_up(&blkif->wq);
}

irqreturn_t blkif_be_int(int irq, void *dev_id)
{
	blkif_notify_work(dev_id);
	return IRQ_HANDLED;
}
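
/*
 * The interrupt handler above only marks that work is pending and wakes the
 * per-device xenblkd kernel thread; the actual request processing happens in
 * process context via blkif_schedule() -> do_block_io_op().
 */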

/*
 * Function to copy from the ring buffer the 'struct blkif_request' (which has
 * the sectors we want, number of them, grant references, etc.), and transmute
 * it to the block API to hand it over to the proper block disk.
 */
static int do_block_io_op(blkif_t *blkif)
{
	union blkif_back_rings *blk_rings = &blkif->blk_rings;
	struct blkif_request req;
	pending_req_t *pending_req;
	RING_IDX rc, rp;
	int more_to_do = 0;

	rc = blk_rings->common.req_cons;
	rp = blk_rings->common.sring->req_prod;
	rmb(); /* Ensure we see queued requests up to 'rp'. */

	while (rc != rp) {

		if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
			break;

		if (kthread_should_stop()) {
			more_to_do = 1;
			break;
		}

		pending_req = alloc_req();
		if (NULL == pending_req) {
			blkif->st_oo_req++;
			more_to_do = 1;
			break;
		}

		switch (blkif->blk_protocol) {
		case BLKIF_PROTOCOL_NATIVE:
			memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
			break;
		case BLKIF_PROTOCOL_X86_32:
			blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
			break;
		case BLKIF_PROTOCOL_X86_64:
			blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
			break;
		default:
			BUG();
		}
		blk_rings->common.req_cons = ++rc; /* before make_response() */
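
		/*
		 * Advancing req_cons before the request is processed matters
		 * because make_response() compares rsp_prod_pvt against
		 * req_cons to decide whether the frontend still needs to be
		 * kicked about outstanding work.
		 */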

		/* Apply all sanity checks to /private copy/ of request. */
		switch (req.operation) {
		case BLKIF_OP_READ:
			blkif->st_rd_req++;
			dispatch_rw_block_io(blkif, &req, pending_req);
			break;
		case BLKIF_OP_WRITE_BARRIER:
			blkif->st_br_req++;
			/* fall through */
		case BLKIF_OP_WRITE:
			blkif->st_wr_req++;
			dispatch_rw_block_io(blkif, &req, pending_req);
			break;
		default:
			/* A good sign something is wrong: sleep for a while to
			 * avoid excessive CPU consumption by a bad guest. */
			msleep(1);
			DPRINTK("error: unknown block io operation [%d]\n",
				req.operation);
			make_response(blkif, req.id, req.operation,
				      BLKIF_RSP_ERROR);
			free_req(pending_req);
			break;
		}

		/* Yield point for this unbounded loop. */
		cond_resched();
	}

	return more_to_do;
}

/*
 * Transmute the 'struct blkif_request' to a proper 'struct bio' and call
 * 'submit_bio' to pass it to the underlying storage.
 */
static void dispatch_rw_block_io(blkif_t *blkif,
				 struct blkif_request *req,
				 pending_req_t *pending_req)
{
	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct phys_req preq;
	struct {
		unsigned long buf; unsigned int nsec;
	} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	unsigned int nseg;
	struct bio *bio = NULL;
	int ret = 0;
	int operation;
	int i;

	switch (req->operation) {
	case BLKIF_OP_READ:
		operation = READ;
		break;
	case BLKIF_OP_WRITE:
		operation = WRITE;
		break;
	case BLKIF_OP_WRITE_BARRIER:
		operation = WRITE_BARRIER;
		break;
	default:
		operation = 0; /* make gcc happy */
		BUG();
	}

	/* Check that the number of segments is sane. */
	nseg = req->nr_segments;
	if (unlikely(nseg == 0 && operation != WRITE_BARRIER) ||
	    unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
		DPRINTK("Bad number of segments in request (%d)\n", nseg);
		/* Haven't submitted any bio's yet. */
		goto fail_response;
	}

	preq.dev           = req->handle;
	preq.sector_number = req->u.rw.sector_number;
	preq.nr_sects      = 0;

	pending_req->blkif     = blkif;
	pending_req->id        = req->id;
	pending_req->operation = req->operation;
	pending_req->status    = BLKIF_RSP_OKAY;
	pending_req->nr_pages  = nseg;

	/*
	 * Fill out preq.nr_sects with the proper number of sectors, and set up
	 * map[..] with the PFN of the page in our domain together with the
	 * corresponding grant reference for each page.
	 */
	for (i = 0; i < nseg; i++) {
		uint32_t flags;

		seg[i].nsec = req->u.rw.seg[i].last_sect -
			req->u.rw.seg[i].first_sect + 1;
		if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
		    (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect))
			goto fail_response;
		preq.nr_sects += seg[i].nsec;
		flags = GNTMAP_host_map;
		if (operation != READ)
			flags |= GNTMAP_readonly;
		gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
				  req->u.rw.seg[i].gref, blkif->domid);
	}

	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
	BUG_ON(ret);

	/*
	 * Now swizzle the MFN in our domain with the MFN from the other domain
	 * so that when we access vaddr(pending_req, i) it has the contents of
	 * the page from the other domain.
	 */
	for (i = 0; i < nseg; i++) {
		if (unlikely(map[i].status != 0)) {
			DPRINTK("invalid buffer -- could not remap it\n");
			map[i].handle = BLKBACK_INVALID_HANDLE;
			ret |= 1;
		}

		pending_handle(pending_req, i) = map[i].handle;

		if (ret)
			continue;

		ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr),
				       blkbk->pending_page(pending_req, i), false);
		if (ret) {
			printk(KERN_ALERT "Failed to install M2P override for"
			       " %lx (ret: %d)\n",
			       (unsigned long)map[i].dev_bus_addr, ret);
			/* We could switch over to GNTTABOP_copy */
			continue;
		}

		seg[i].buf = map[i].dev_bus_addr |
			(req->u.rw.seg[i].first_sect << 9);
	}
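
	/*
	 * Note: the low bits of each seg[i].buf carry the byte offset of the
	 * first sector within its page (first_sect << 9); the bio-building
	 * loop below recovers that offset with 'seg[i].buf & ~PAGE_MASK'.
	 */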

	/*
	 * If we have failed at this point, we need to undo the M2P override,
	 * set gnttab_set_unmap_op on all of the grant references and perform
	 * the hypercall to unmap the grants - that is all done in
	 * fast_flush_area.
	 */
	if (ret)
		goto fail_flush;

	if (vbd_translate(&preq, blkif, operation) != 0) {
		DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n",
			operation == READ ? "read" : "write",
			preq.sector_number,
			preq.sector_number + preq.nr_sects, preq.dev);
		goto fail_flush;
	}

	/* Get a reference count for the disk queue and start sending I/O */
	plug_queue(blkif, preq.bdev);

	/* We set it to one so that the last submit_bio does not have to call
	 * atomic_inc. */
	atomic_set(&pending_req->pendcnt, 1);

	for (i = 0; i < nseg; i++) {
		if (((int)preq.sector_number | (int)seg[i].nsec) &
		    ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
			DPRINTK("Misaligned I/O request from domain %d",
				blkif->domid);
			goto fail_put_bio;
		}

		while ((bio == NULL) ||
		       (bio_add_page(bio,
				     blkbk->pending_page(pending_req, i),
				     seg[i].nsec << 9,
				     seg[i].buf & ~PAGE_MASK) == 0)) {
			if (bio) {
				atomic_inc(&pending_req->pendcnt);
				submit_bio(operation, bio);
			}

			bio = bio_alloc(GFP_KERNEL, nseg - i);
			if (unlikely(bio == NULL))
				goto fail_put_bio;

			bio->bi_bdev    = preq.bdev;
			bio->bi_private = pending_req;
			bio->bi_end_io  = end_block_io_op;
			bio->bi_sector  = preq.sector_number;
		}

		preq.sector_number += seg[i].nsec;
	}

	/* This will be hit if the operation was a barrier. */
	if (!bio) {
		BUG_ON(operation != WRITE_BARRIER);
		bio = bio_alloc(GFP_KERNEL, 0);
		if (unlikely(bio == NULL))
			goto fail_put_bio;

		bio->bi_bdev    = preq.bdev;
		bio->bi_private = pending_req;
		bio->bi_end_io  = end_block_io_op;
		bio->bi_sector  = -1;
	}

	submit_bio(operation, bio);

	if (operation == READ)
		blkif->st_rd_sect += preq.nr_sects;
	else if (operation == WRITE || operation == WRITE_BARRIER)
		blkif->st_wr_sect += preq.nr_sects;

	return;

 fail_flush:
	fast_flush_area(pending_req);
 fail_response:
	make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
	free_req(pending_req);
	msleep(1); /* back off a bit */
	return;

 fail_put_bio:
	__end_block_io_op(pending_req, -EINVAL);
	if (bio)
		bio_put(bio);
	msleep(1); /* back off a bit */
	return;
}

/*
 * Put a response on the ring on how the operation fared.
 */
static void make_response(blkif_t *blkif, u64 id,
			  unsigned short op, int st)
{
	struct blkif_response resp;
	unsigned long flags;
	union blkif_back_rings *blk_rings = &blkif->blk_rings;
	int more_to_do = 0;
	int notify;

	resp.id        = id;
	resp.operation = op;
	resp.status    = st;

	spin_lock_irqsave(&blkif->blk_ring_lock, flags);
	/* Place on the response ring for the relevant domain. */
	switch (blkif->blk_protocol) {
	case BLKIF_PROTOCOL_NATIVE:
		memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
		       &resp, sizeof(resp));
		break;
	case BLKIF_PROTOCOL_X86_32:
		memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
		       &resp, sizeof(resp));
		break;
	case BLKIF_PROTOCOL_X86_64:
		memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
		       &resp, sizeof(resp));
		break;
	default:
		BUG();
	}
	blk_rings->common.rsp_prod_pvt++;
	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
	if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) {
		/*
		 * Tail check for pending requests. Allows frontend to avoid
		 * notifications if requests are already in flight (lower
		 * overheads and promotes batching).
		 */
		RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
	} else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) {
		more_to_do = 1;
	}
	spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);

	if (more_to_do)
		blkif_notify_work(blkif);
	if (notify)
		notify_remote_via_irq(blkif->irq);
}

static int __init blkif_init(void)
{
	int i, mmap_pages;
	int rc = 0;

	if (!xen_pv_domain())
		return -ENODEV;

	blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL);
	if (!blkbk) {
		printk(KERN_ALERT "%s: out of memory!\n", __func__);
		return -ENOMEM;
	}

	mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;

	blkbk->pending_reqs          = kmalloc(sizeof(blkbk->pending_reqs[0]) *
					       blkif_reqs, GFP_KERNEL);
	blkbk->pending_grant_handles = kzalloc(sizeof(blkbk->pending_grant_handles[0]) *
					       mmap_pages, GFP_KERNEL);
	blkbk->pending_pages         = kzalloc(sizeof(blkbk->pending_pages[0]) *
					       mmap_pages, GFP_KERNEL);
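
	/*
	 * Note the sizing: pending_reqs has one entry per outstanding request
	 * (blkif_reqs), while pending_grant_handles and pending_pages carry
	 * one entry per possible segment, i.e. mmap_pages ==
	 * blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST.
	 */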

	if (!blkbk->pending_reqs || !blkbk->pending_grant_handles ||
	    !blkbk->pending_pages) {
		rc = -ENOMEM;
		goto out_of_memory;
	}

	for (i = 0; i < mmap_pages; i++) {
		blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
		blkbk->pending_pages[i] = alloc_page(GFP_KERNEL);
		if (blkbk->pending_pages[i] == NULL) {
			rc = -ENOMEM;
			goto out_of_memory;
		}
	}

	rc = blkif_interface_init();
	if (rc)
		goto failed_init;

	memset(blkbk->pending_reqs, 0,
	       blkif_reqs * sizeof(blkbk->pending_reqs[0]));

	INIT_LIST_HEAD(&blkbk->pending_free);
	spin_lock_init(&blkbk->pending_free_lock);
	init_waitqueue_head(&blkbk->pending_free_wq);

	for (i = 0; i < blkif_reqs; i++)
		list_add_tail(&blkbk->pending_reqs[i].free_list,
			      &blkbk->pending_free);

	rc = blkif_xenbus_init();
	if (rc)
		goto failed_init;

	return 0;

 out_of_memory:
	printk(KERN_ERR "%s: out of memory\n", __func__);
 failed_init:
	kfree(blkbk->pending_reqs);
	kfree(blkbk->pending_grant_handles);
	if (blkbk->pending_pages) {
		for (i = 0; i < mmap_pages; i++) {
			if (blkbk->pending_pages[i])
				__free_page(blkbk->pending_pages[i]);
		}
	}
	kfree(blkbk->pending_pages);
	kfree(blkbk);
	blkbk = NULL;
	return rc;
}

module_init(blkif_init);

MODULE_LICENSE("Dual BSD/GPL");