drivers/block/lguest_blk.c

   1 /*D:400
   2  * The Guest block driver
   3  *
   4  * This is a simple block driver, which appears as /dev/lgba, lgbb, lgbc etc.
   5  * The mechanism is simple: we place the information about the request in the
   6  * device page, then use SEND_DMA (containing the data for a write, or an empty
   7  * "ping" DMA for a read).
   8  :*/
   9 /* Copyright 2006 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
  10  *
  11  * This program is free software; you can redistribute it and/or modify
  12  * it under the terms of the GNU General Public License as published by
  13  * the Free Software Foundation; either version 2 of the License, or
  14  * (at your option) any later version.
  15  *
  16  * This program is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19  * GNU General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU General Public License
  22  * along with this program; if not, write to the Free Software
  23  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  24  */
  25 //#define DEBUG
  26 #include <linux/init.h>
  27 #include <linux/types.h>
  28 #include <linux/blkdev.h>
  29 #include <linux/interrupt.h>
  30 #include <linux/lguest_bus.h>
  31
  32 static char next_block_index = 'a';
  33
  34 /*D:420 Here is the structure which holds all the information we need about
  35  * each Guest block device.
  36  *
  37  * I'm sure at this stage, you're wondering "hey, where was the adventure I was
  38  * promised?" and thinking "Rusty sucks, I shall say nasty things about him on
  39  * my blog".  I think Real adventures have boring bits, too, and you're in the
  40  * middle of one.  But it gets better.  Just not quite yet. */
  41 struct blockdev
  42 {
  43         /* The block queue infrastructure wants a spinlock: it is held while it
  44          * calls our block request function.  We grab it in our interrupt
  45          * handler so the responses don't mess with new requests. */
  46         spinlock_t lock;
  47
  48         /* The disk structure registered with kernel. */
  49         struct gendisk *disk;
  50
  51         /* The major device number for this disk, and the interrupt.  We only
  52          * really keep them here for completeness; we'd need them if we
  53          * supported device unplugging. */
  54         int major;
  55         int irq;
  56
  57         /* The physical address of this device's memory page */
  58         unsigned long phys_addr;
  59         /* The mapped memory page for convenient acces. */
  60         struct lguest_block_page *lb_page;
  61
  62         /* We only have a single request outstanding at a time: this is it. */
  63         struct lguest_dma dma;
  64         struct request *req;
  65 };
  66
  67 /*D:495 We originally used end_request() throughout the driver, but it turns
  68  * out that end_request() is deprecated, and doesn't actually end the request
  69  * (which seems like a good reason to deprecate it!).  It simply ends the first
  70  * bio.  So if we had 3 bios in a "struct request" we would do all 3,
  71  * end_request(), do 2, end_request(), do 1 and end_request(): twice as much
  72  * work as we needed to do.
  73  *
  74  * This reinforced to me that I do not understand the block layer.
  75  *
  76  * Nonetheless, Jens Axboe gave me this nice helper to end all chunks of a
  77  * request.  This improved disk speed by 130%. */
  78 static void end_entire_request(struct request *req, int uptodate)
  79 {
  80         if (end_that_request_first(req, uptodate, req->hard_nr_sectors))
  81                 BUG();
  82         add_disk_randomness(req->rq_disk);
  83         blkdev_dequeue_request(req);
  84         end_that_request_last(req, uptodate);
  85 }
  86
  87 /* I'm told there are only two stories in the world worth telling: love and
  88  * hate.  So there used to be a love scene here like this:
  89  *
  90  *  Launcher:   We could make beautiful I/O together, you and I.
  91  *  Guest:      My, that's a big disk!
  92  *
  93  * Unfortunately, it was just too raunchy for our otherwise-gentle tale. */
  94
  95 /*D:490 This is the interrupt handler, called when a block read or write has
  96  * been completed for us. */
  97 static irqreturn_t lgb_irq(int irq, void *_bd)
  98 {
  99         /* We handed our "struct blockdev" as the argument to request_irq(), so
 100          * it is passed through to us here.  This tells us which device we're
 101          * dealing with in case we have more than one. */
 102         struct blockdev *bd = _bd;
 103         unsigned long flags;
 104
 105         /* We weren't doing anything?  Strange, but could happen if we shared
 106          * interrupts (we don't!). */
 107         if (!bd->req) {
 108                 pr_debug("No work!\n");
 109                 return IRQ_NONE;
 110         }
 111
 112         /* Not done yet?  That's equally strange. */
 113         if (!bd->lb_page->result) {
 114                 pr_debug("No result!\n");
 115                 return IRQ_NONE;
 116         }
 117
 118         /* We have to grab the lock before ending the request. */
 119         spin_lock_irqsave(&bd->lock, flags);
 120         /* "result" is 1 for success, 2 for failure: end_entire_request() wants
 121          * to know whether this succeeded or not. */
 122         end_entire_request(bd->req, bd->lb_page->result == 1);
 123         /* Clear out request, it's done. */
 124         bd->req = NULL;
 125         /* Reset incoming DMA for next time. */
 126         bd->dma.used_len = 0;
 127         /* Ready for more reads or writes */
 128         blk_start_queue(bd->disk->queue);
 129         spin_unlock_irqrestore(&bd->lock, flags);
 130
 131         /* The interrupt was for us, we dealt with it. */
 132         return IRQ_HANDLED;
 133 }
 134
 135 /*D:480 The block layer's "struct request" contains a number of "struct bio"s,
 136  * each of which contains "struct bio_vec"s, each of which contains a page, an
 137  * offset and a length.
 138  *
 139  * Fortunately there are iterators to help us walk through the "struct
 140  * request".  Even more fortunately, there were plenty of places to steal the
 141  * code from.  We pack the "struct request" into our "struct lguest_dma" and
 142  * return the total length. */
 143 static unsigned int req_to_dma(struct request *req, struct lguest_dma *dma)
 144 {
 145         unsigned int i = 0, idx, len = 0;
 146         struct bio *bio;
 147
 148         rq_for_each_bio(bio, req) {
 149                 struct bio_vec *bvec;
 150                 bio_for_each_segment(bvec, bio, idx) {
 151                         /* We told the block layer not to give us too many. */
 152                         BUG_ON(i == LGUEST_MAX_DMA_SECTIONS);
 153                         /* If we had a zero-length segment, it would look like
 154                          * the end of the data referred to by the "struct
 155                          * lguest_dma", so make sure that doesn't happen. */
 156                         BUG_ON(!bvec->bv_len);
 157                         /* Convert page & offset to a physical address */
 158                         dma->addr[i] = page_to_phys(bvec->bv_page)
 159                                 + bvec->bv_offset;
 160                         dma->len[i] = bvec->bv_len;
 161                         len += bvec->bv_len;
 162                         i++;
 163                 }
 164         }
 165         /* If the array isn't full, we mark the end with a 0 length */
 166         if (i < LGUEST_MAX_DMA_SECTIONS)
 167                 dma->len[i] = 0;
 168         return len;
 169 }
 170
 171 /* This creates an empty DMA, useful for prodding the Host without sending data
 172  * (ie. when we want to do a read) */
 173 static void empty_dma(struct lguest_dma *dma)
 174 {
 175         dma->len[0] = 0;
 176 }
 177
 178 /*D:470 Setting up a request is fairly easy: */
 179 static void setup_req(struct blockdev *bd,
 180                       int type, struct request *req, struct lguest_dma *dma)
 181 {
 182         /* The type is 1 (write) or 0 (read). */
 183         bd->lb_page->type = type;
 184         /* The sector on disk where the read or write starts. */
 185         bd->lb_page->sector = req->sector;
 186         /* The result is initialized to 0 (unfinished). */
 187         bd->lb_page->result = 0;
 188         /* The current request (so we can end it in the interrupt handler). */
 189         bd->req = req;
 190         /* The number of bytes: returned as a side-effect of req_to_dma(),
 191          * which packs the block layer's "struct request" into our "struct
 192          * lguest_dma" */
 193         bd->lb_page->bytes = req_to_dma(req, dma);
 194 }
 195
 196 /*D:450 Write is pretty straightforward: we pack the request into a "struct
 197  * lguest_dma", then use SEND_DMA to send the request. */
 198 static void do_write(struct blockdev *bd, struct request *req)
 199 {
 200         struct lguest_dma send;
 201
 202         pr_debug("lgb: WRITE sector %li\n", (long)req->sector);
 203         setup_req(bd, 1, req, &send);
 204
 205         lguest_send_dma(bd->phys_addr, &send);
 206 }
 207
 208 /* Read is similar to write, except we pack the request into our receive
 209  * "struct lguest_dma" and send through an empty DMA just to tell the Host that
 210  * there's a request pending. */
 211 static void do_read(struct blockdev *bd, struct request *req)
 212 {
 213         struct lguest_dma ping;
 214
 215         pr_debug("lgb: READ sector %li\n", (long)req->sector);
 216         setup_req(bd, 0, req, &bd->dma);
 217
 218         empty_dma(&ping);
 219         lguest_send_dma(bd->phys_addr, &ping);
 220 }
 221
  222 /*D:440 This is where requests come in: we get handed the request queue and are
 223  * expected to pull a "struct request" off it until we've finished them or
 224  * we're waiting for a reply: */
 225 static void do_lgb_request(struct request_queue *q)
 226 {
 227         struct blockdev *bd;
 228         struct request *req;
 229
 230 again:
 231         /* This sometimes returns NULL even on the very first time around.  I
 232          * wonder if it's something to do with letting elves handle the request
 233          * queue... */
 234         req = elv_next_request(q);
 235         if (!req)
 236                 return;
 237
 238         /* We attached the struct blockdev to the disk: get it back */
 239         bd = req->rq_disk->private_data;
 240         /* Sometimes we get repeated requests after blk_stop_queue(), but we
 241          * can only handle one at a time. */
 242         if (bd->req)
 243                 return;
 244
 245         /* We only do reads and writes: no tricky business! */
 246         if (!blk_fs_request(req)) {
 247                 pr_debug("Got non-command 0x%08x\n", req->cmd_type);
 248                 req->errors++;
 249                 end_entire_request(req, 0);
 250                 goto again;
 251         }
 252
 253         if (rq_data_dir(req) == WRITE)
 254                 do_write(bd, req);
 255         else
 256                 do_read(bd, req);
 257
 258         /* We've put out the request, so stop any more coming in until we get
 259          * an interrupt, which takes us to lgb_irq() to re-enable the queue. */
 260         blk_stop_queue(q);
 261 }
 262
 263 /*D:430 This is the "struct block_device_operations" we attach to the disk at
 264  * the end of lguestblk_probe().  It doesn't seem to want much. */
 265 static struct block_device_operations lguestblk_fops = {
 266         .owner = THIS_MODULE,
 267 };
 268
 269 /*D:425 Setting up a disk device seems to involve a lot of code.  I'm not sure
 270  * quite why.  I do know that the IDE code sent two or three of the maintainers
 271  * insane, perhaps this is the fringe of the same disease?
 272  *
 273  * As in the console code, the probe function gets handed the generic
 274  * lguest_device from lguest_bus.c: */
 275 static int lguestblk_probe(struct lguest_device *lgdev)
 276 {
 277         struct blockdev *bd;
 278         int err;
 279         int irqflags = IRQF_SHARED;
 280
 281         /* First we allocate our own "struct blockdev" and initialize the easy
 282          * fields. */
 283         bd = kmalloc(sizeof(*bd), GFP_KERNEL);
 284         if (!bd)
 285                 return -ENOMEM;
 286
 287         spin_lock_init(&bd->lock);
 288         bd->irq = lgdev_irq(lgdev);
 289         bd->req = NULL;
 290         bd->dma.used_len = 0;
 291         bd->dma.len[0] = 0;
 292         /* The descriptor in the lguest_devices array provided by the Host
 293          * gives the Guest the physical page number of the device's page. */
 294         bd->phys_addr = (lguest_devices[lgdev->index].pfn << PAGE_SHIFT);
 295
 296         /* We use lguest_map() to get a pointer to the device page */
 297         bd->lb_page = lguest_map(bd->phys_addr, 1);
 298         if (!bd->lb_page) {
 299                 err = -ENOMEM;
 300                 goto out_free_bd;
 301         }
 302
 303         /* We need a major device number: 0 means "assign one dynamically". */
 304         bd->major = register_blkdev(0, "lguestblk");
 305         if (bd->major < 0) {
 306                 err = bd->major;
 307                 goto out_unmap;
 308         }
 309
 310         /* This allocates a "struct gendisk" where we pack all the information
 311          * about the disk which the rest of Linux sees.  The argument is the
 312          * number of minor devices desired: we need one minor for the main
 313          * disk, and one for each partition.  Of course, we can't possibly know
 314          * how many partitions are on the disk (add_disk does that).
 315          */
 316         bd->disk = alloc_disk(16);
 317         if (!bd->disk) {
 318                 err = -ENOMEM;
 319                 goto out_unregister_blkdev;
 320         }
 321
 322         /* Every disk needs a queue for requests to come in: we set up the
 323          * queue with a callback function (the core of our driver) and the lock
 324          * to use. */
 325         bd->disk->queue = blk_init_queue(do_lgb_request, &bd->lock);
 326         if (!bd->disk->queue) {
 327                 err = -ENOMEM;
 328                 goto out_put_disk;
 329         }
 330
 331         /* We can only handle a certain number of pointers in our SEND_DMA
 332          * call, so we set that with blk_queue_max_hw_segments().  This is not
 333          * to be confused with blk_queue_max_phys_segments() of course!  I
 334          * know, who could possibly confuse the two?
 335          *
 336          * Well, it's simple to tell them apart: this one seems to work and the
 337          * other one didn't. */
 338         blk_queue_max_hw_segments(bd->disk->queue, LGUEST_MAX_DMA_SECTIONS);
 339
 340         /* Due to technical limitations of our Host (and simple coding) we
 341          * can't have a single buffer which crosses a page boundary.  Tell it
 342          * here.  This means that our maximum request size is 16
 343          * (LGUEST_MAX_DMA_SECTIONS) pages. */
 344         blk_queue_segment_boundary(bd->disk->queue, PAGE_SIZE-1);
 345
 346         /* We name our disk: this becomes the device name when udev does its
 347          * magic thing and creates the device node, such as /dev/lgba.
 348          * next_block_index is a global which starts at 'a'.  Unfortunately
 349          * this simple increment logic means that the 27th disk will be called
 350          * "/dev/lgb{".  In that case, I recommend having at least 29 disks, so
 351          * your /dev directory will be balanced. */
 352         sprintf(bd->disk->disk_name, "lgb%c", next_block_index++);
 353
 354         /* We look to the device descriptor again to see if this device's
 355          * interrupts are expected to be random.  If they are, we tell the irq
 356          * subsystem.  At the moment this bit is always set. */
 357         if (lguest_devices[lgdev->index].features & LGUEST_DEVICE_F_RANDOMNESS)
 358                 irqflags |= IRQF_SAMPLE_RANDOM;
 359
 360         /* Now we have the name and irqflags, we can request the interrupt; we
 361          * give it the "struct blockdev" we have set up to pass to lgb_irq()
 362          * when there is an interrupt. */
 363         err = request_irq(bd->irq, lgb_irq, irqflags, bd->disk->disk_name, bd);
 364         if (err)
 365                 goto out_cleanup_queue;
 366
 367         /* We bind our one-entry DMA pool to the key for this block device so
 368          * the Host can reply to our requests.  The key is equal to the
 369          * physical address of the device's page, which is conveniently
 370          * unique. */
 371         err = lguest_bind_dma(bd->phys_addr, &bd->dma, 1, bd->irq);
 372         if (err)
 373                 goto out_free_irq;
 374
 375         /* We finish our disk initialization and add the disk to the system. */
 376         bd->disk->major = bd->major;
 377         bd->disk->first_minor = 0;
 378         bd->disk->private_data = bd;
 379         bd->disk->fops = &lguestblk_fops;
 380         /* This is initialized to the disk size by the Launcher. */
 381         set_capacity(bd->disk, bd->lb_page->num_sectors);
 382         add_disk(bd->disk);
 383
 384         printk(KERN_INFO "%s: device %i at major %d\n",
 385                bd->disk->disk_name, lgdev->index, bd->major);
 386
 387         /* We don't need to keep the "struct blockdev" around, but if we ever
 388          * implemented device removal, we'd need this. */
 389         lgdev->private = bd;
 390         return 0;
 391
 392 out_free_irq:
 393         free_irq(bd->irq, bd);
 394 out_cleanup_queue:
 395         blk_cleanup_queue(bd->disk->queue);
 396 out_put_disk:
 397         put_disk(bd->disk);
 398 out_unregister_blkdev:
 399         unregister_blkdev(bd->major, "lguestblk");
 400 out_unmap:
 401         lguest_unmap(bd->lb_page);
 402 out_free_bd:
 403         kfree(bd);
 404         return err;
 405 }
 406
 407 /*D:410 The boilerplate code for registering the lguest block driver is just
 408  * like the console: */
 409 static struct lguest_driver lguestblk_drv = {
 410         .name = "lguestblk",
 411         .owner = THIS_MODULE,
 412         .device_type = LGUEST_DEVICE_T_BLOCK,
 413         .probe = lguestblk_probe,
 414 };
 415
 416 static __init int lguestblk_init(void)
 417 {
 418         return register_lguest_driver(&lguestblk_drv);
 419 }
 420 module_init(lguestblk_init);
 421
 422 MODULE_DESCRIPTION("Lguest block driver");
 423 MODULE_LICENSE("GPL");