/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 *
 * This file is released under the GPL.
 */
#include "dm.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/blk.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/slab.h>
static const char *_name = DM_NAME;

#define MAX_DEVICES 1024

static int major = 0;
static int _major = 0;
/*
 * One of these is allocated per bio.
 */
struct dm_io {
	struct mapped_device *md;
	int error;
	struct bio *bio;
	atomic_t io_count;
};

/*
 * One of these is allocated per deferred bio.
 */
struct deferred_io {
	struct bio *bio;
	struct deferred_io *next;
};
/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO 0
#define DMF_SUSPENDED 1
struct mapped_device {
	struct rw_semaphore lock;
	atomic_t holders;

	unsigned long flags;

	request_queue_t queue;
	struct gendisk *disk;

	/*
	 * A list of ios that arrived while we were suspended.
	 */
	atomic_t pending;
	wait_queue_head_t wait;
	struct deferred_io *deferred;

	/*
	 * The current mapping.
	 */
	struct dm_table *map;

	/*
	 * io objects are allocated from here.
	 */
	mempool_t *io_pool;
};

#define MIN_IOS 256
static kmem_cache_t *_io_cache;
static __init int local_init(void)
{
	int r;

	/* allocate a slab for the dm_ios */
	_io_cache = kmem_cache_create("dm io",
				      sizeof(struct dm_io), 0, 0, NULL, NULL);
	if (!_io_cache)
		return -ENOMEM;

	_major = major;
	r = register_blkdev(_major, _name);
	if (r < 0) {
		kmem_cache_destroy(_io_cache);
		return r;
	}

	if (!_major)
		_major = r;

	return 0;
}
static void local_exit(void)
{
	kmem_cache_destroy(_io_cache);

	if (unregister_blkdev(_major, _name) < 0)
		DMERR("devfs_unregister_blkdev failed");

	DMINFO("cleaned up");
}
/*
 * We have a lot of init/exit functions, so it seems easier to
 * store them in an array.  The disposable macro 'xx'
 * expands a prefix into a pair of function names.
 */
#define xx(n) {n ## _init, n ## _exit},
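/*
 * The _inits table itself is elided from this excerpt.  This is a
 * minimal sketch of its presumed shape, inferred from the xx macro
 * above and the _inits[i].init() / _inits[i].exit() uses below; the
 * full entry list is an assumption, and only local_init/local_exit
 * actually appear in this excerpt.
 */
static struct {
	int (*init) (void);
	void (*exit) (void);
} _inits[] = {
	xx(local)
};
#undef xx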
static int __init dm_init(void)
{
	const int count = ARRAY_SIZE(_inits);
	int r, i;

	for (i = 0; i < count; i++) {
		r = _inits[i].init();
		if (r)
			goto bad;
	}

	return 0;

bad:
	while (i--)
		_inits[i].exit();

	return r;
}
static void __exit dm_exit(void)
{
	int i = ARRAY_SIZE(_inits);

	while (i--)
		_inits[i].exit();
}
/*-----------------------------------------------------------------
 * Block device functions
 *---------------------------------------------------------------*/
static int dm_blk_open(struct inode *inode, struct file *file)
{
	struct mapped_device *md;

	md = inode->i_bdev->bd_disk->private_data;
	dm_get(md);
	return 0;
}
static int dm_blk_close(struct inode *inode, struct file *file)
{
	struct mapped_device *md;

	md = inode->i_bdev->bd_disk->private_data;
	dm_put(md);
	return 0;
}
static inline struct dm_io *alloc_io(struct mapped_device *md)
{
	return mempool_alloc(md->io_pool, GFP_NOIO);
}
static inline void free_io(struct mapped_device *md, struct dm_io *io)
{
	mempool_free(io, md->io_pool);
}
static inline struct deferred_io *alloc_deferred(void)
{
	return kmalloc(sizeof(struct deferred_io), GFP_NOIO);
}
static inline void free_deferred(struct deferred_io *di)
{
	kfree(di);
}
/*
 * Add the bio to the list of deferred io.
 */
static int queue_io(struct mapped_device *md, struct bio *bio)
{
	struct deferred_io *di;

	di = alloc_deferred();
	if (!di)
		return -ENOMEM;

	down_write(&md->lock);

	if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
		up_write(&md->lock);
		free_deferred(di);
		return 1;
	}

	di->bio = bio;
	di->next = md->deferred;
	md->deferred = di;

	up_write(&md->lock);
	return 0;		/* deferred successfully */
}
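/*
 * Note on the return convention as reconstructed above: -ENOMEM means
 * the deferred_io could not be allocated, 0 means the bio was deferred,
 * and 1 means the device was no longer blocked, so the caller should
 * retry the mapping itself; dm_request() below relies on this.
 */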
/*-----------------------------------------------------------------
 * A more elegant solution is in the works that uses the queue
 * merge fn, unfortunately there are a couple of changes to
 * the block layer that I want to make for this.  So in the
 * interests of getting something for people to use I give
 * you this clearly demarcated crap.
 *---------------------------------------------------------------*/
static inline sector_t to_sector(unsigned int bytes)
{
	return bytes >> SECTOR_SHIFT;
}
static inline unsigned int to_bytes(sector_t sector)
{
	return sector << SECTOR_SHIFT;
}
/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */
static inline void dec_pending(struct dm_io *io, int error)
{
	static spinlock_t _uptodate_lock = SPIN_LOCK_UNLOCKED;
	unsigned long flags;

	if (error) {
		spin_lock_irqsave(&_uptodate_lock, flags);
		io->error = error;
		spin_unlock_irqrestore(&_uptodate_lock, flags);
	}

	if (atomic_dec_and_test(&io->io_count)) {
		if (atomic_dec_and_test(&io->md->pending))
			/* nudge anyone waiting on suspend queue */
			wake_up(&io->md->wait);

		bio_endio(io->bio, io->bio->bi_size, io->error);
		free_io(io->md, io);
	}
}
static int clone_endio(struct bio *bio, unsigned int done, int error)
{
	struct dm_io *io = bio->bi_private;

	if (bio->bi_size)
		return 1;

	dec_pending(io, error);
	bio_put(bio);
	return 0;
}
static sector_t max_io_len(struct mapped_device *md,
			   sector_t sector, struct dm_target *ti)
{
	sector_t offset = sector - ti->begin;
	sector_t len = ti->len - offset;

	/* FIXME: obey io_restrictions ! */

	/*
	 * Does the target need to split even further ?
	 */
	if (ti->split_io) {
		sector_t boundary;
		boundary = dm_round_up(offset + 1, ti->split_io) - offset;

		if (len > boundary)
			len = boundary;
	}

	return len;
}
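/*
 * Worked example of the boundary arithmetic above, with illustrative
 * numbers: with ti->split_io = 8 sectors and offset = 5,
 * dm_round_up(6, 8) = 8, so boundary = 3, exactly the sectors left
 * before the next split_io boundary, which then caps the io length.
 */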
static void __map_bio(struct dm_target *ti, struct bio *clone, struct dm_io *io)
{
	int r;

	/*
	 * Sanity checks.
	 */
	BUG_ON(!clone->bi_size);

	clone->bi_end_io = clone_endio;
	clone->bi_private = io;

	/*
	 * Map the clone.  If r == 0 we don't need to do
	 * anything, the target has assumed ownership of
	 * this io.
	 */
	atomic_inc(&io->io_count);
	r = ti->type->map(ti, clone);
	if (r > 0)
		/* the bio has been remapped so dispatch it */
		generic_make_request(clone);

	else if (r < 0) {
		/* error the io and bail out */
		dec_pending(io, -EIO);
		bio_put(clone);
	}
}
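/*
 * Summary of the map return convention implied above: r == 0 means the
 * target has taken ownership of the clone, r > 0 means the clone was
 * remapped in place and must be dispatched here, and r < 0 errors the
 * original io.
 */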
struct clone_info {
	struct mapped_device *md;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	sector_t sector_count;
	unsigned short idx;
};
/*
 * Creates a little bio that just does part of a bvec.
 */
static struct bio *split_bvec(struct bio *bio, sector_t sector,
			      unsigned short idx, unsigned int offset,
			      sector_t len)
{
	struct bio *clone;
	struct bio_vec *bv = bio->bi_io_vec + idx;

	clone = bio_alloc(GFP_NOIO, 1);
	memcpy(clone->bi_io_vec, bv, sizeof(*bv));

	clone->bi_sector = sector;
	clone->bi_bdev = bio->bi_bdev;
	clone->bi_rw = bio->bi_rw;
	clone->bi_vcnt = 1;
	clone->bi_size = to_bytes(len);
	clone->bi_io_vec->bv_offset = offset;
	clone->bi_io_vec->bv_len = clone->bi_size;

	return clone;
}
/*
 * Creates a bio that consists of a range of complete bvecs.
 */
static struct bio *clone_bio(struct bio *bio, sector_t sector,
			     unsigned short idx, unsigned short bv_count,
			     sector_t len)
{
	struct bio *clone;

	clone = bio_clone(bio, GFP_NOIO);
	clone->bi_sector = sector;
	clone->bi_idx = idx;
	clone->bi_vcnt = idx + bv_count;
	clone->bi_size = to_bytes(len);

	return clone;
}
static void __clone_and_map(struct clone_info *ci)
{
	struct bio *clone, *bio = ci->bio;
	struct dm_target *ti = dm_table_find_target(ci->md->map, ci->sector);
	sector_t len = 0, max = max_io_len(ci->md, ci->sector, ti);

	if (ci->sector_count <= max) {
		/*
		 * Optimise for the simple case where we can do all of
		 * the remaining io with a single clone.
		 */
		clone = clone_bio(bio, ci->sector, ci->idx,
				  bio->bi_vcnt - ci->idx, ci->sector_count);
		__map_bio(ti, clone, ci->io);
		ci->sector_count = 0;

	} else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
		/*
		 * There are some bvecs that don't span targets.
		 * Do as many of these as possible.
		 */
		int i;
		sector_t remaining = max;
		sector_t bv_len;

		for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
			bv_len = to_sector(bio->bi_io_vec[i].bv_len);

			if (bv_len > remaining)
				break;

			remaining -= bv_len;
			len += bv_len;
		}

		clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len);
		__map_bio(ti, clone, ci->io);

		ci->sector += len;
		ci->sector_count -= len;
		ci->idx = i;

	} else {
		/*
		 * Create two copy bios to deal with io that has
		 * been split across a target.
		 */
		struct bio_vec *bv = bio->bi_io_vec + ci->idx;

		clone = split_bvec(bio, ci->sector, ci->idx,
				   bv->bv_offset, max);
		__map_bio(ti, clone, ci->io);

		ci->sector += max;
		ci->sector_count -= max;
		ti = dm_table_find_target(ci->md->map, ci->sector);

		len = to_sector(bv->bv_len) - max;
		clone = split_bvec(bio, ci->sector, ci->idx,
				   bv->bv_offset + to_bytes(max), len);
		__map_bio(ti, clone, ci->io);

		ci->sector += len;
		ci->sector_count -= len;
		ci->idx++;
	}
}
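/*
 * The three branches above handle, in order: the remaining io fits
 * entirely within the current target (one clone); whole bvecs fit but
 * not the whole bio (clone a run of complete bvecs); and a single bvec
 * straddles a target boundary (split it into two little bios).
 */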
/*
 * Split the bio into several clones.
 */
static void __split_bio(struct mapped_device *md, struct bio *bio)
{
	struct clone_info ci;

	ci.md = md;
	ci.bio = bio;
	ci.io = alloc_io(md);
	ci.io->error = 0;
	atomic_set(&ci.io->io_count, 1);
	ci.io->bio = bio;
	ci.io->md = md;
	ci.sector = bio->bi_sector;
	ci.sector_count = bio_sectors(bio);
	ci.idx = bio->bi_idx;

	atomic_inc(&md->pending);
	while (ci.sector_count)
		__clone_and_map(&ci);

	/* drop the extra reference count */
	dec_pending(ci.io, 0);
}
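/*
 * Note: io_count is initialised to 1 above so the io cannot complete
 * while clones are still being issued; the final dec_pending() drops
 * that extra reference once every clone has been mapped.
 */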
/*-----------------------------------------------------------------
 *---------------------------------------------------------------*/
/*
 * The request function that just remaps the bio built up by
 * the block layer.
 */
static int dm_request(request_queue_t *q, struct bio *bio)
{
	int r;
	struct mapped_device *md = q->queuedata;

	down_read(&md->lock);

	/*
	 * If we're suspended we have to queue
	 * this io for later.
	 */
	while (test_bit(DMF_BLOCK_IO, &md->flags)) {
		up_read(&md->lock);

		if (bio_rw(bio) == READA) {
			bio_io_error(bio, bio->bi_size);
			return 0;
		}

		r = queue_io(md, bio);
		if (r < 0) {
			bio_io_error(bio, bio->bi_size);
			return 0;

		} else if (r == 0)
			return 0;	/* deferred successfully */

		/*
		 * We're in a while loop, because someone could suspend
		 * before we get to the following read lock.
		 */
		down_read(&md->lock);
	}

	__split_bio(md, bio);
	up_read(&md->lock);
	return 0;
}
/*-----------------------------------------------------------------
 * A bitset is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
static spinlock_t _minor_lock = SPIN_LOCK_UNLOCKED;
static unsigned long _minor_bits[MAX_DEVICES / BITS_PER_LONG];

static void free_minor(int minor)
{
	spin_lock(&_minor_lock);
	clear_bit(minor, _minor_bits);
	spin_unlock(&_minor_lock);
}
/*
 * See if the device with a specific minor # is free.
 */
static int specific_minor(int minor)
{
	int r = -EBUSY;

	if (minor >= MAX_DEVICES) {
		DMWARN("request for a mapped_device beyond MAX_DEVICES (%d)",
		       MAX_DEVICES);
		return -EINVAL;
	}

	spin_lock(&_minor_lock);
	if (!test_and_set_bit(minor, _minor_bits))
		r = minor;
	spin_unlock(&_minor_lock);

	return r;
}
static int next_free_minor(void)
{
	int minor, r = -EBUSY;

	spin_lock(&_minor_lock);
	minor = find_first_zero_bit(_minor_bits, MAX_DEVICES);
	if (minor != MAX_DEVICES) {
		set_bit(minor, _minor_bits);
		r = minor;
	}
	spin_unlock(&_minor_lock);

	return r;
}
/*
 * Allocate and initialise a blank device with a given minor.
 */
static struct mapped_device *alloc_dev(int minor)
{
	struct mapped_device *md = kmalloc(sizeof(*md), GFP_KERNEL);

	if (!md) {
		DMWARN("unable to allocate device, out of memory.");
		return NULL;
	}

	/* get a minor number for the dev */
	minor = (minor < 0) ? next_free_minor() : specific_minor(minor);
	if (minor < 0) {
		kfree(md);
		return NULL;
	}

	DMWARN("allocating minor %d.", minor);
	memset(md, 0, sizeof(*md));
	init_rwsem(&md->lock);
	atomic_set(&md->holders, 1);

	md->queue.queuedata = md;
	blk_queue_make_request(&md->queue, dm_request);

	md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
				     mempool_free_slab, _io_cache);
	if (!md->io_pool) {
		free_minor(minor);
		kfree(md);
		return NULL;
	}

	md->disk = alloc_disk(1);
	if (!md->disk) {
		mempool_destroy(md->io_pool);
		free_minor(minor);
		kfree(md);
		return NULL;
	}

	md->disk->major = _major;
	md->disk->first_minor = minor;
	md->disk->fops = &dm_blk_dops;
	md->disk->queue = &md->queue;
	md->disk->private_data = md;
	sprintf(md->disk->disk_name, "dm-%d", minor);
	add_disk(md->disk);

	atomic_set(&md->pending, 0);
	init_waitqueue_head(&md->wait);
	return md;
}
static void free_dev(struct mapped_device *md)
{
	free_minor(md->disk->first_minor);
	mempool_destroy(md->io_pool);
	del_gendisk(md->disk);
	put_disk(md->disk);
	kfree(md);
}
/*
 * Bind a table to the device.
 */
static int __bind(struct mapped_device *md, struct dm_table *t)
{
	request_queue_t *q = &md->queue;
	sector_t size;

	md->map = t;
	size = dm_table_get_size(t);
	set_capacity(md->disk, size);
	if (size == 0)
		return 0;

	dm_table_get(t);
	dm_table_set_restrictions(t, q);
	return 0;
}

static void __unbind(struct mapped_device *md)
{
	dm_table_put(md->map);
	md->map = NULL;
	set_capacity(md->disk, 0);
}
/*
 * Constructor for a new device.
 */
int dm_create(int minor, struct dm_table *table, struct mapped_device **result)
{
	int r;
	struct mapped_device *md;

	md = alloc_dev(minor);
	if (!md)
		return -ENXIO;

	r = __bind(md, table);
	if (r) {
		free_dev(md);
		return r;
	}

	*result = md;
	return 0;
}
void dm_get(struct mapped_device *md)
{
	atomic_inc(&md->holders);
}

void dm_put(struct mapped_device *md)
{
	if (atomic_dec_and_test(&md->holders)) {
		DMWARN("destroying md");
		__unbind(md);
		free_dev(md);
	}
}
/*
 * Requeue the deferred bios by calling generic_make_request.
 */
static void flush_deferred_io(struct deferred_io *c)
{
	struct deferred_io *n;

	while (c) {
		n = c->next;
		generic_make_request(c->bio);
		free_deferred(c);
		c = n;
	}
}
/*
 * Swap in a new table (destroying the old one).
 */
int dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	int r;

	down_write(&md->lock);

	/* device must be suspended */
	if (!test_bit(DMF_SUSPENDED, &md->flags)) {
		up_write(&md->lock);
		return -EPERM;
	}

	__unbind(md);
	r = __bind(md, table);
	up_write(&md->lock);

	return r;
}
/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
int dm_suspend(struct mapped_device *md)
{
	DECLARE_WAITQUEUE(wait, current);

	down_write(&md->lock);

	/*
	 * First we set the BLOCK_IO flag so no more ios will be
	 * mapped.
	 */
	if (test_bit(DMF_BLOCK_IO, &md->flags)) {
		up_write(&md->lock);
		return -EINVAL;
	}

	set_bit(DMF_BLOCK_IO, &md->flags);
	add_wait_queue(&md->wait, &wait);
	up_write(&md->lock);

	/*
	 * Then we wait for the already mapped ios to
	 * complete.
	 */
	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);

		if (!atomic_read(&md->pending))
			break;

		schedule();
	}

	current->state = TASK_RUNNING;

	down_write(&md->lock);
	remove_wait_queue(&md->wait, &wait);
	set_bit(DMF_SUSPENDED, &md->flags);
	up_write(&md->lock);

	return 0;
}
int dm_resume(struct mapped_device *md)
{
	struct deferred_io *def;

	down_write(&md->lock);
	if (!test_bit(DMF_SUSPENDED, &md->flags) ||
	    !dm_table_get_size(md->map)) {
		up_write(&md->lock);
		return -EINVAL;
	}

	clear_bit(DMF_SUSPENDED, &md->flags);
	clear_bit(DMF_BLOCK_IO, &md->flags);
	def = md->deferred;
	md->deferred = NULL;
	up_write(&md->lock);

	flush_deferred_io(def);
	return 0;
}
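/*
 * Illustrative sketch, not part of the original driver: the intended
 * caller sequence for replacing a live table, pieced together from the
 * dm_suspend/dm_swap_table/dm_resume contracts above.  The function
 * name and its error handling are assumptions for illustration only.
 */
static int __attribute__((unused))
example_replace_table(struct mapped_device *md, struct dm_table *new_table)
{
	int r;

	/* block further io and wait for in-flight bios to drain */
	r = dm_suspend(md);
	if (r)
		return r;

	/* requires DMF_SUSPENDED, otherwise returns -EPERM */
	r = dm_swap_table(md, new_table);
	if (r)
		return r;

	/* clear the flags and replay any deferred bios */
	return dm_resume(md);
}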
/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}

struct dm_table *dm_get_table(struct mapped_device *md)
{
	struct dm_table *t;

	down_read(&md->lock);
	t = md->map;
	dm_table_get(t);
	up_read(&md->lock);

	return t;
}

int dm_suspended(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}
struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.owner = THIS_MODULE
};
module_init(dm_init);
module_exit(dm_exit);

MODULE_PARM(major, "i");
MODULE_PARM_DESC(major, "The major number of the device mapper");
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <thornber@sistina.com>");
MODULE_LICENSE("GPL");