drivers/md/dm.c

/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 *
 * This file is released under the GPL.
 */

#include "dm.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/blk.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/slab.h>

static const char *_name = DM_NAME;
#define MAX_DEVICES 1024

static int major = 0;
static int _major = 0;

struct dm_io {
        struct mapped_device *md;
        int error;
        struct bio *bio;
        atomic_t io_count;
};
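
/*
 * One dm_io is allocated per original bio submitted to the device; io_count
 * tracks how many clones of that bio are still in flight, and dec_pending()
 * completes the original bio once it drops to zero.
 */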

struct deferred_io {
        struct bio *bio;
        struct deferred_io *next;
};

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO 0
#define DMF_SUSPENDED 1
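
/*
 * DMF_BLOCK_IO is set first, by dm_suspend(), so that any new io gets
 * deferred instead of mapped; DMF_SUSPENDED is only set once the in-flight
 * io has drained, at which point the table may be swapped.
 */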

struct mapped_device {
        struct rw_semaphore lock;
        atomic_t holders;

        unsigned long flags;

        request_queue_t queue;
        struct gendisk *disk;

        /*
         * A list of ios that arrived while we were suspended.
         */
        atomic_t pending;
        wait_queue_head_t wait;
        struct deferred_io *deferred;

        /*
         * The current mapping.
         */
        struct dm_table *map;

        /*
         * io objects are allocated from here.
         */
        mempool_t *io_pool;
};

#define MIN_IOS 256
static kmem_cache_t *_io_cache;

static __init int local_init(void)
{
        int r;

        /* allocate a slab for the dm_ios */
        _io_cache = kmem_cache_create("dm io",
                                      sizeof(struct dm_io), 0, 0, NULL, NULL);
        if (!_io_cache)
                return -ENOMEM;

        _major = major;
        r = register_blkdev(_major, _name);
        if (r < 0) {
                kmem_cache_destroy(_io_cache);
                return r;
        }

        if (!_major)
                _major = r;

        return 0;
}

static void local_exit(void)
{
        kmem_cache_destroy(_io_cache);

        if (unregister_blkdev(_major, _name) < 0)
                DMERR("unregister_blkdev failed");

        _major = 0;

        DMINFO("cleaned up");
}

/*
 * We have a lot of init/exit functions, so it seems easier to
 * store them in an array.  The disposable macro 'xx'
 * expands a prefix into a pair of function names.
 */
static struct {
        int (*init) (void);
        void (*exit) (void);
} _inits[] = {
#define xx(n) {n ## _init, n ## _exit},
        xx(local)
        xx(dm_target)
        xx(dm_linear)
        xx(dm_stripe)
        xx(dm_interface)
#undef xx
};

static int __init dm_init(void)
{
        const int count = ARRAY_SIZE(_inits);

        int r, i;

        for (i = 0; i < count; i++) {
                r = _inits[i].init();
                if (r)
                        goto bad;
        }

        return 0;

      bad:
        while (i--)
                _inits[i].exit();

        return r;
}

static void __exit dm_exit(void)
{
        int i = ARRAY_SIZE(_inits);

        while (i--)
                _inits[i].exit();
}

/*
 * Block device functions
 */
static int dm_blk_open(struct inode *inode, struct file *file)
{
        struct mapped_device *md;

        md = inode->i_bdev->bd_disk->private_data;
        dm_get(md);
        return 0;
}

static int dm_blk_close(struct inode *inode, struct file *file)
{
        struct mapped_device *md;

        md = inode->i_bdev->bd_disk->private_data;
        dm_put(md);
        return 0;
}

static inline struct dm_io *alloc_io(struct mapped_device *md)
{
        return mempool_alloc(md->io_pool, GFP_NOIO);
}

static inline void free_io(struct mapped_device *md, struct dm_io *io)
{
        mempool_free(io, md->io_pool);
}

static inline struct deferred_io *alloc_deferred(void)
{
        return kmalloc(sizeof(struct deferred_io), GFP_NOIO);
}

static inline void free_deferred(struct deferred_io *di)
{
        kfree(di);
}

/*
 * Add the bio to the list of deferred io.
 */
static int queue_io(struct mapped_device *md, struct bio *bio)
{
        struct deferred_io *di;

        di = alloc_deferred();
        if (!di)
                return -ENOMEM;

        down_write(&md->lock);

        if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
                up_write(&md->lock);
                free_deferred(di);
                return 1;
        }

        di->bio = bio;
        di->next = md->deferred;
        md->deferred = di;

        up_write(&md->lock);
        return 0;               /* deferred successfully */
}
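
/*
 * Note the return convention above: 0 means the bio was queued for later,
 * 1 means the device stopped blocking io in the meantime (so the caller
 * should retry the normal path), and a negative value is an allocation
 * error.  dm_request() relies on this when it loops on DMF_BLOCK_IO.
 */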

/*-----------------------------------------------------------------
 * CRUD START:
 * A more elegant solution is in the works that uses the queue
 * merge function, unfortunately there are a couple of changes to
 * the block layer that I want to make for this.  So in the
 * interests of getting something for people to use I give
 * you this clearly demarcated crap.
 *---------------------------------------------------------------*/
static inline sector_t to_sector(unsigned int bytes)
{
        return bytes >> SECTOR_SHIFT;
}

static inline unsigned int to_bytes(sector_t sector)
{
        return sector << SECTOR_SHIFT;
}

/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */
static inline void dec_pending(struct dm_io *io, int error)
{
        static spinlock_t _uptodate_lock = SPIN_LOCK_UNLOCKED;
        unsigned long flags;

        if (error) {
                spin_lock_irqsave(&_uptodate_lock, flags);
                io->error = error;
                spin_unlock_irqrestore(&_uptodate_lock, flags);
        }

        if (atomic_dec_and_test(&io->io_count)) {
                if (atomic_dec_and_test(&io->md->pending))
                        /* nudge anyone waiting on suspend queue */
                        wake_up(&io->md->wait);

                bio_endio(io->bio, io->bio->bi_size, io->error);
                free_io(io->md, io);
        }
}
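
/*
 * The shared _uptodate_lock above only serialises writes to io->error when
 * several clones of the same bio fail concurrently; whatever error was
 * recorded last is what bio_endio() reports for the original bio.
 */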

static int clone_endio(struct bio *bio, unsigned int done, int error)
{
        struct dm_io *io = bio->bi_private;

        if (bio->bi_size)
                return 1;

        dec_pending(io, error);
        bio_put(bio);
        return 0;
}

static sector_t max_io_len(struct mapped_device *md,
                           sector_t sector, struct dm_target *ti)
{
        sector_t offset = sector - ti->begin;
        sector_t len = ti->len - offset;

        /* FIXME: obey io_restrictions ! */

        /*
         * Does the target need to split even further ?
         */
        if (ti->split_io) {
                sector_t boundary;
                boundary = dm_round_up(offset + 1, ti->split_io) - offset;

                if (len > boundary)
                        len = boundary;
        }

        return len;
}

static void __map_bio(struct dm_target *ti, struct bio *clone, struct dm_io *io)
{
        int r;

        /*
         * Sanity checks.
         */
        BUG_ON(!clone->bi_size);

        clone->bi_end_io = clone_endio;
        clone->bi_private = io;

        /*
         * Map the clone.  If r == 0 we don't need to do
         * anything, the target has assumed ownership of
         * this io.
         */
        atomic_inc(&io->io_count);
        r = ti->type->map(ti, clone);
        if (r > 0)
                /* the bio has been remapped so dispatch it */
                generic_make_request(clone);

        else if (r < 0)
                /* error the io and bail out */
                dec_pending(io, -EIO);
}

struct clone_info {
        struct mapped_device *md;
        struct bio *bio;
        struct dm_io *io;
        sector_t sector;
        sector_t sector_count;
        unsigned short idx;
};

/*
 * Creates a little bio that just does part of a bvec.
 */
static struct bio *split_bvec(struct bio *bio, sector_t sector,
                              unsigned short idx, unsigned int offset,
                              unsigned int len)
{
        struct bio *clone;
        struct bio_vec *bv = bio->bi_io_vec + idx;

        clone = bio_alloc(GFP_NOIO, 1);
        memcpy(clone->bi_io_vec, bv, sizeof(*bv));

        clone->bi_sector = sector;
        clone->bi_bdev = bio->bi_bdev;
        clone->bi_rw = bio->bi_rw;
        clone->bi_vcnt = 1;
        clone->bi_size = to_bytes(len);
        clone->bi_io_vec->bv_offset = offset;
        clone->bi_io_vec->bv_len = clone->bi_size;

        return clone;
}

/*
 * Creates a bio that consists of a range of complete bvecs.
 */
static struct bio *clone_bio(struct bio *bio, sector_t sector,
                             unsigned short idx, unsigned short bv_count,
                             unsigned int len)
{
        struct bio *clone;

        clone = bio_clone(bio, GFP_NOIO);
        clone->bi_sector = sector;
        clone->bi_idx = idx;
        clone->bi_vcnt = idx + bv_count;
        clone->bi_size = to_bytes(len);

        return clone;
}
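
/*
 * __clone_and_map() below handles one chunk of the original bio per call:
 * either the whole remainder fits inside the current target, or a run of
 * complete bvecs does, or a single bvec straddles a target boundary and
 * has to be split in two with split_bvec().
 */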

static void __clone_and_map(struct clone_info *ci)
{
        struct bio *clone, *bio = ci->bio;
        struct dm_target *ti = dm_table_find_target(ci->md->map, ci->sector);
        sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);

        if (ci->sector_count <= max) {
                /*
                 * Optimise for the simple case where we can do all of
                 * the remaining io with a single clone.
                 */
                clone = clone_bio(bio, ci->sector, ci->idx,
                                  bio->bi_vcnt - ci->idx, ci->sector_count);
                __map_bio(ti, clone, ci->io);
                ci->sector_count = 0;

        } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
                /*
                 * There are some bvecs that don't span targets.
                 * Do as many of these as possible.
                 */
                int i;
                sector_t remaining = max;
                sector_t bv_len;

                for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
                        bv_len = to_sector(bio->bi_io_vec[i].bv_len);

                        if (bv_len > remaining)
                                break;

                        remaining -= bv_len;
                        len += bv_len;
                }

                clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len);
                __map_bio(ti, clone, ci->io);

                ci->sector += len;
                ci->sector_count -= len;
                ci->idx = i;

        } else {
                /*
                 * Create two copy bios to deal with io that has
                 * been split across a target.
                 */
                struct bio_vec *bv = bio->bi_io_vec + ci->idx;

                clone = split_bvec(bio, ci->sector, ci->idx,
                                   bv->bv_offset, max);
                __map_bio(ti, clone, ci->io);

                ci->sector += max;
                ci->sector_count -= max;
                ti = dm_table_find_target(ci->md->map, ci->sector);

                len = to_sector(bv->bv_len) - max;
                clone = split_bvec(bio, ci->sector, ci->idx,
                                   bv->bv_offset + to_bytes(max), len);
                __map_bio(ti, clone, ci->io);

                ci->sector += len;
                ci->sector_count -= len;
                ci->idx++;
        }
}

/*
 * Split the bio into several clones.
 */
static void __split_bio(struct mapped_device *md, struct bio *bio)
{
        struct clone_info ci;

        ci.md = md;
        ci.bio = bio;
        ci.io = alloc_io(md);
        ci.io->error = 0;
        atomic_set(&ci.io->io_count, 1);
        ci.io->bio = bio;
        ci.io->md = md;
        ci.sector = bio->bi_sector;
        ci.sector_count = bio_sectors(bio);
        ci.idx = bio->bi_idx;

        atomic_inc(&md->pending);
        while (ci.sector_count)
                __clone_and_map(&ci);

        /* drop the extra reference count */
        dec_pending(ci.io, 0);
}
/*-----------------------------------------------------------------
 * CRUD END
 *---------------------------------------------------------------*/

/*
 * The request function that just remaps the bio built up by
 * dm_merge_bvec.
 */
static int dm_request(request_queue_t *q, struct bio *bio)
{
        int r;
        struct mapped_device *md = q->queuedata;

        down_read(&md->lock);

        /*
         * If we're suspended we have to queue
         * this io for later.
         */
        while (test_bit(DMF_BLOCK_IO, &md->flags)) {
                up_read(&md->lock);

                if (bio_rw(bio) == READA) {
                        bio_io_error(bio, bio->bi_size);
                        return 0;
                }

                r = queue_io(md, bio);
                if (r < 0) {
                        bio_io_error(bio, bio->bi_size);
                        return 0;

                } else if (r == 0)
                        return 0;       /* deferred successfully */

                /*
                 * We're in a while loop, because someone could suspend
                 * before we get to the following read lock.
                 */
                down_read(&md->lock);
        }

        __split_bio(md, bio);
        up_read(&md->lock);
        return 0;
}

/*-----------------------------------------------------------------
 * A bitset is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
static spinlock_t _minor_lock = SPIN_LOCK_UNLOCKED;
static unsigned long _minor_bits[MAX_DEVICES / BITS_PER_LONG];

static void free_minor(int minor)
{
        spin_lock(&_minor_lock);
        clear_bit(minor, _minor_bits);
        spin_unlock(&_minor_lock);
}

/*
 * See if the device with a specific minor # is free.
 */
static int specific_minor(int minor)
{
        int r = -EBUSY;

        if (minor >= MAX_DEVICES) {
                DMWARN("request for a mapped_device beyond MAX_DEVICES (%d)",
                       MAX_DEVICES);
                return -EINVAL;
        }

        spin_lock(&_minor_lock);
        if (!test_and_set_bit(minor, _minor_bits))
                r = minor;
        spin_unlock(&_minor_lock);

        return r;
}

static int next_free_minor(void)
{
        int minor, r = -EBUSY;

        spin_lock(&_minor_lock);
        minor = find_first_zero_bit(_minor_bits, MAX_DEVICES);
        if (minor != MAX_DEVICES) {
                set_bit(minor, _minor_bits);
                r = minor;
        }
        spin_unlock(&_minor_lock);

        return r;
}

/*
 * Allocate and initialise a blank device with a given minor.
 */
static struct mapped_device *alloc_dev(int minor)
{
        struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);

        if (!md) {
                DMWARN("unable to allocate device, out of memory.");
                return NULL;
        }

        /* get a minor number for the dev */
        minor = (minor < 0) ? next_free_minor() : specific_minor(minor);
        if (minor < 0) {
                kfree(md);
                return NULL;
        }

        DMWARN("allocating minor %d.", minor);
        memset(md, 0, sizeof(*md));
        init_rwsem(&md->lock);
        atomic_set(&md->holders, 1);

        md->queue.queuedata = md;
        blk_queue_make_request(&md->queue, dm_request);

        md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
                                     mempool_free_slab, _io_cache);
        if (!md->io_pool) {
                /* the gendisk hasn't been allocated yet, so release
                 * the minor directly */
                free_minor(minor);
                kfree(md);
                return NULL;
        }

        md->disk = alloc_disk(1);
        if (!md->disk) {
                mempool_destroy(md->io_pool);
                free_minor(minor);
                kfree(md);
                return NULL;
        }

        md->disk->major = _major;
        md->disk->first_minor = minor;
        md->disk->fops = &dm_blk_dops;
        md->disk->queue = &md->queue;
        md->disk->private_data = md;
        sprintf(md->disk->disk_name, "dm-%d", minor);
        add_disk(md->disk);

        atomic_set(&md->pending, 0);
        init_waitqueue_head(&md->wait);
        return md;
}

static void free_dev(struct mapped_device *md)
{
        free_minor(md->disk->first_minor);
        mempool_destroy(md->io_pool);
        del_gendisk(md->disk);
        put_disk(md->disk);
        kfree(md);
}

/*
 * Bind a table to the device.
 */
static int __bind(struct mapped_device *md, struct dm_table *t)
{
        request_queue_t *q = &md->queue;
        sector_t size;
        md->map = t;

        size = dm_table_get_size(t);
        set_capacity(md->disk, size);
        if (size == 0)
                return 0;

        dm_table_get(t);
        dm_table_set_restrictions(t, q);
        return 0;
}

static void __unbind(struct mapped_device *md)
{
        dm_table_put(md->map);
        md->map = NULL;
        set_capacity(md->disk, 0);
}
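
/*
 * __bind() publishes the new table in md->map, sets the disk capacity and,
 * for a non-empty table, takes a reference with dm_table_get(); __unbind()
 * drops the reference with dm_table_put() and resets the capacity to zero.
 */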

/*
 * Constructor for a new device.
 */
int dm_create(int minor, struct dm_table *table, struct mapped_device **result)
{
        int r;
        struct mapped_device *md;

        md = alloc_dev(minor);
        if (!md)
                return -ENXIO;

        r = __bind(md, table);
        if (r) {
                free_dev(md);
                return r;
        }

        *result = md;
        return 0;
}

void dm_get(struct mapped_device *md)
{
        atomic_inc(&md->holders);
}

void dm_put(struct mapped_device *md)
{
        if (atomic_dec_and_test(&md->holders)) {
                DMWARN("destroying md");
                __unbind(md);
                free_dev(md);
        }
}

/*
 * Requeue the deferred bios by calling generic_make_request.
 */
static void flush_deferred_io(struct deferred_io *c)
{
        struct deferred_io *n;

        while (c) {
                n = c->next;
                generic_make_request(c->bio);
                free_deferred(c);
                c = n;
        }
}

/*
 * Swap in a new table (destroying the old one).
 */
int dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
        int r;

        down_write(&md->lock);

        /* device must be suspended */
        if (!test_bit(DMF_SUSPENDED, &md->flags)) {
                up_write(&md->lock);
                return -EPERM;
        }

        __unbind(md);
        r = __bind(md, table);

        up_write(&md->lock);
        return r;
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_swap_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
int dm_suspend(struct mapped_device *md)
{
        DECLARE_WAITQUEUE(wait, current);

        down_write(&md->lock);

        /*
         * First we set the BLOCK_IO flag so no more ios will be
         * mapped.
         */
        if (test_bit(DMF_BLOCK_IO, &md->flags)) {
                up_write(&md->lock);
                return -EINVAL;
        }

        set_bit(DMF_BLOCK_IO, &md->flags);
        add_wait_queue(&md->wait, &wait);
        up_write(&md->lock);

        /*
         * Then we wait for the already mapped ios to
         * complete.
         */
        blk_run_queues();
        while (1) {
                set_current_state(TASK_INTERRUPTIBLE);

                if (!atomic_read(&md->pending))
                        break;

                yield();
        }

        current->state = TASK_RUNNING;

        down_write(&md->lock);
        remove_wait_queue(&md->wait, &wait);
        set_bit(DMF_SUSPENDED, &md->flags);
        up_write(&md->lock);

        return 0;
}

int dm_resume(struct mapped_device *md)
{
        struct deferred_io *def;

        down_write(&md->lock);
        if (!test_bit(DMF_SUSPENDED, &md->flags) ||
            !dm_table_get_size(md->map)) {
                up_write(&md->lock);
                return -EINVAL;
        }

        clear_bit(DMF_SUSPENDED, &md->flags);
        clear_bit(DMF_BLOCK_IO, &md->flags);
        def = md->deferred;
        md->deferred = NULL;
        up_write(&md->lock);

        flush_deferred_io(def);
        blk_run_queues();

        return 0;
}
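
/*
 * Typical life cycle as seen from this file (presumably driven by the
 * dm-ioctl interface registered via xx(dm_interface) above):
 *
 *      dm_create(minor, table, &md)   - bind an initial table
 *      dm_suspend(md)                 - block new io and drain what is mapped
 *      dm_swap_table(md, new_table)   - replace the mapping
 *      dm_resume(md)                  - replay the deferred io
 *      dm_put(md)                     - drop the last holder, destroying md
 */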

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
        return md->disk;
}

struct dm_table *dm_get_table(struct mapped_device *md)
{
        struct dm_table *t;

        down_read(&md->lock);
        t = md->map;
        dm_table_get(t);
        up_read(&md->lock);

        return t;
}

int dm_suspended(struct mapped_device *md)
{
        return test_bit(DMF_SUSPENDED, &md->flags);
}

struct block_device_operations dm_blk_dops = {
        .open = dm_blk_open,
        .release = dm_blk_close,
        .owner = THIS_MODULE
};

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

MODULE_PARM(major, "i");
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <thornber@sistina.com>");
MODULE_LICENSE("GPL");