/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2001  Andrea Arcangeli <andrea@suse.de> SuSE
 */

#include <linux/config.h>
#include <linux/init.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/major.h>
#include <linux/devfs_fs_kernel.h>
#include <linux/smp_lock.h>
#include <linux/highmem.h>
#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/mount.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <asm/uaccess.h>

static sector_t max_block(struct block_device *bdev)
{
	sector_t retval = ~((sector_t)0);
	loff_t sz = i_size_read(bdev->bd_inode);

	if (sz) {
		unsigned int size = block_size(bdev);
		unsigned int sizebits = blksize_bits(size);
		retval = (sz >> sizebits);
	}
	return retval;
}

/* Kill _all_ buffers, dirty or not.. */
static void kill_bdev(struct block_device *bdev)
{
	invalidate_bdev(bdev, 1);
	truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
}

int set_blocksize(struct block_device *bdev, int size)
{
	int oldsize;

	/* Size must be a power of two, and between 512 and PAGE_SIZE */
	if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
		return -EINVAL;

	/* Size cannot be smaller than the size supported by the device */
	if (size < bdev_hardsect_size(bdev))
		return -EINVAL;

	/* No change? Then we're done. */
	oldsize = bdev->bd_block_size;
	if (oldsize == size)
		return 0;

	/* Ok, we're actually changing the blocksize.. */
	sync_blockdev(bdev);
	bdev->bd_block_size = size;
	bdev->bd_inode->i_blkbits = blksize_bits(size);
	kill_bdev(bdev);
	return 0;
}

int sb_set_blocksize(struct super_block *sb, int size)
{
	int bits;

	if (set_blocksize(sb->s_bdev, size) < 0)
		return 0;
	sb->s_blocksize = size;
	/* Compute log2(size): start at 2^9 = 512 and count the remaining shifts */
	for (bits = 9, size >>= 9; size >>= 1; bits++)
		;
	sb->s_blocksize_bits = bits;
	return sb->s_blocksize;
}
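
/*
 * Worked example for the loop above: with size = 4096, "size >>= 9"
 * leaves 8, and three further right-shifts (8 -> 4 -> 2 -> 1) bump bits
 * from 9 to 12, i.e. s_blocksize_bits = log2(4096) = 12.
 */
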
int sb_min_blocksize(struct super_block *sb, int size)
{
	int minsize = bdev_hardsect_size(sb->s_bdev);
	if (size < minsize)
		size = minsize;
	return sb_set_blocksize(sb, size);
}
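
/*
 * Hypothetical usage sketch: a filesystem's fill_super() might call
 *
 *	blocksize = sb_min_blocksize(sb, BLOCK_SIZE);
 *	if (!blocksize)
 *		return -EINVAL;
 *
 * to get the smallest usable block size that is still no smaller than
 * the hardware sector size before reading its on-disk superblock.
 */
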
static int
blkdev_get_block(struct inode *inode, sector_t iblock,
		struct buffer_head *bh, int create)
{
	if (iblock >= max_block(inode->i_bdev))
		return -EIO;

	bh->b_bdev = inode->i_bdev;
	bh->b_blocknr = iblock;
	set_buffer_mapped(bh);
	return 0;
}

static int
blkdev_get_blocks(struct inode *inode, sector_t iblock,
		unsigned long max_blocks, struct buffer_head *bh, int create)
{
	if ((iblock + max_blocks) > max_block(inode->i_bdev))
		return -EIO;

	bh->b_bdev = inode->i_bdev;
	bh->b_blocknr = iblock;
	bh->b_size = max_blocks << inode->i_blkbits;
	set_buffer_mapped(bh);
	return 0;
}
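
/*
 * For a block device the get_block mapping is the identity: logical
 * block N of the inode is physical block N of the device. The two
 * helpers above therefore only range-check against max_block() and mark
 * the buffer mapped; no allocation or lookup is ever needed.
 */
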
static ssize_t
blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
		loff_t offset, unsigned long nr_segs)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_dentry->d_inode->i_mapping->host;

	return blockdev_direct_IO(rw, iocb, inode, inode->i_bdev, iov, offset,
				nr_segs, blkdev_get_blocks);
}

static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, blkdev_get_block, wbc);
}

static int blkdev_readpage(struct file * file, struct page * page)
{
	return block_read_full_page(page, blkdev_get_block);
}

static int blkdev_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, blkdev_get_block);
}

static int blkdev_commit_write(struct file *file, struct page *page, unsigned from, unsigned to)
{
	return block_commit_write(page, from, to);
}

/*
 * private llseek:
 * for a block special file file->f_dentry->d_inode->i_size is zero
 * so we compute the size by hand (just as in block_read/write above)
 */
static loff_t block_llseek(struct file *file, loff_t offset, int origin)
{
	struct inode *bd_inode;
	loff_t size;
	loff_t retval;

	bd_inode = file->f_dentry->d_inode->i_bdev->bd_inode;
	down(&bd_inode->i_sem);
	size = i_size_read(bd_inode);

	switch (origin) {
		case 2:
			offset += size;
			break;
		case 1:
			offset += file->f_pos;
	}
	retval = -EINVAL;
	if (offset >= 0 && offset <= size) {
		if (offset != file->f_pos) {
			file->f_pos = offset;
		}
		retval = offset;
	}
	up(&bd_inode->i_sem);
	return retval;
}

/*
 * Filp may be NULL when we are called by an msync of a vma
 * since the vma has no handle.
 */
static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
{
	struct inode * inode = dentry->d_inode;

	return sync_blockdev(inode->i_bdev);
}

static struct super_block *bd_get_sb(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
{
	return get_sb_pseudo(fs_type, "bdev:", NULL, 0x62646576);
}

static struct file_system_type bd_type = {
	.name		= "bdev",
	.get_sb		= bd_get_sb,
	.kill_sb	= kill_anon_super,
};

static struct vfsmount *bd_mnt;
struct super_block *blockdev_superblock;

/*
 * bdev cache handling - shamelessly stolen from inode.c
 * We use smaller hashtable, though.
 */

#define HASH_BITS	6
#define HASH_SIZE	(1UL << HASH_BITS)
#define HASH_MASK	(HASH_SIZE-1)
static struct list_head bdev_hashtable[HASH_SIZE];
static spinlock_t bdev_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
static kmem_cache_t * bdev_cachep;

#define alloc_bdev() \
	((struct block_device *) kmem_cache_alloc(bdev_cachep, SLAB_KERNEL))
#define destroy_bdev(bdev) kmem_cache_free(bdev_cachep, (bdev))

static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
{
	struct block_device * bdev = (struct block_device *) foo;

	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
	    SLAB_CTOR_CONSTRUCTOR)
	{
		memset(bdev, 0, sizeof(*bdev));
		sema_init(&bdev->bd_sem, 1);
		INIT_LIST_HEAD(&bdev->bd_inodes);
	}
}
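
/*
 * The flag check above makes the initialization run only when the slab
 * allocator is genuinely constructing a new object (SLAB_CTOR_CONSTRUCTOR
 * set, SLAB_CTOR_VERIFY clear), not during debug verification passes.
 */
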
void __init bdev_cache_init(void)
{
	int i, err;
	struct list_head *head = bdev_hashtable;

	for (i = 0; i < HASH_SIZE; i++)
		INIT_LIST_HEAD(head++);

	bdev_cachep = kmem_cache_create("bdev_cache",
					sizeof(struct block_device),
					0, SLAB_HWCACHE_ALIGN, init_once,
					NULL);
	if (!bdev_cachep)
		panic("Cannot create bdev_cache SLAB cache");
	err = register_filesystem(&bd_type);
	if (err)
		panic("Cannot register bdev pseudo-fs");
	bd_mnt = kern_mount(&bd_type);
	err = PTR_ERR(bd_mnt);
	if (IS_ERR(bd_mnt))
		panic("Cannot create bdev pseudo-fs");
	blockdev_superblock = bd_mnt->mnt_sb;	/* For writeback */
}

/*
 * Most likely _very_ bad one - but then it's hardly critical for small
 * /dev and can be fixed when somebody will need really large one.
 */
static inline unsigned long hash(dev_t dev)
{
	unsigned long tmp = dev;
	tmp = tmp + (tmp >> HASH_BITS) + (tmp >> HASH_BITS*2);
	return tmp & HASH_MASK;
}
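
/*
 * Illustrative example (with HASH_BITS = 6 as defined above):
 * dev = 0x341 gives 0x341 + (0x341 >> 6) + (0x341 >> 12)
 *     = 0x341 + 0xD + 0x0 = 0x34E, and 0x34E & HASH_MASK = 0xE.
 * Cheap bit-folding rather than a real mixing function.
 */
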
static struct block_device *bdfind(dev_t dev, struct list_head *head)
{
	struct list_head *p;
	struct block_device *bdev;
	list_for_each(p, head) {
		bdev = list_entry(p, struct block_device, bd_hash);
		if (bdev->bd_dev != dev)
			continue;
		atomic_inc(&bdev->bd_count);
		return bdev;
	}
	return NULL;
}

struct block_device *bdget(dev_t dev)
{
	struct list_head * head = bdev_hashtable + hash(dev);
	struct block_device *bdev, *new_bdev;
	spin_lock(&bdev_lock);
	bdev = bdfind(dev, head);
	spin_unlock(&bdev_lock);
	if (bdev)
		return bdev;
	new_bdev = alloc_bdev();
	if (new_bdev) {
		struct inode *inode = new_inode(bd_mnt->mnt_sb);
		if (inode) {
			kdev_t kdev = to_kdev_t(dev);

			atomic_set(&new_bdev->bd_count,1);
			new_bdev->bd_dev = dev;
			new_bdev->bd_contains = NULL;
			new_bdev->bd_inode = inode;
			new_bdev->bd_block_size = (1 << inode->i_blkbits);
			new_bdev->bd_part_count = 0;
			new_bdev->bd_invalidated = 0;
			inode->i_mode = S_IFBLK;
			inode->i_rdev = kdev;
			inode->i_bdev = new_bdev;
			inode->i_data.a_ops = &def_blk_aops;
			inode->i_data.gfp_mask = GFP_USER;
			inode->i_data.backing_dev_info = &default_backing_dev_info;
			spin_lock(&bdev_lock);
			bdev = bdfind(dev, head);
			if (!bdev) {
				list_add(&new_bdev->bd_hash, head);
				spin_unlock(&bdev_lock);
				return new_bdev;
			}
			spin_unlock(&bdev_lock);
			iput(new_bdev->bd_inode);
		}
		destroy_bdev(new_bdev);
	}
	return bdev;
}
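
/*
 * Note the unlock/alloc/relock dance in bdget(): the new bdev and its
 * inode are allocated without bdev_lock held, so bdfind() must run a
 * second time before the list_add() in case another thread hashed the
 * same device number in the meantime; the loser frees its copy.
 */
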
long nr_blockdev_pages(void)
{
	long ret = 0;
	int i;

	spin_lock(&bdev_lock);
	for (i = 0; i < ARRAY_SIZE(bdev_hashtable); i++) {
		struct list_head *head = &bdev_hashtable[i];
		struct list_head *lh;

		list_for_each(lh, head) {
			struct block_device *bdev;

			bdev = list_entry(lh, struct block_device, bd_hash);
			ret += bdev->bd_inode->i_mapping->nrpages;
		}
	}
	spin_unlock(&bdev_lock);
	return ret;
}

static inline void __bd_forget(struct inode *inode)
{
	list_del_init(&inode->i_devices);
	inode->i_bdev = NULL;
	inode->i_mapping = &inode->i_data;
}

void bdput(struct block_device *bdev)
{
	if (atomic_dec_and_lock(&bdev->bd_count, &bdev_lock)) {
		struct list_head *p;
		if (bdev->bd_openers)
			BUG();
		list_del(&bdev->bd_hash);
		while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
			__bd_forget(list_entry(p, struct inode, i_devices));
		}
		spin_unlock(&bdev_lock);
		iput(bdev->bd_inode);
		destroy_bdev(bdev);
	}
}
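
/*
 * atomic_dec_and_lock() takes bdev_lock only when the reference count
 * actually reaches zero, so the common-case bdput() is lock-free. The
 * final put unhashes the bdev, forgets every inode still pointing at
 * it, and frees the object.
 */
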
int bd_acquire(struct inode *inode)
{
	struct block_device *bdev;
	spin_lock(&bdev_lock);
	if (inode->i_bdev) {
		atomic_inc(&inode->i_bdev->bd_count);
		spin_unlock(&bdev_lock);
		return 0;
	}
	spin_unlock(&bdev_lock);
	bdev = bdget(kdev_t_to_nr(inode->i_rdev));
	if (!bdev)
		return -ENOMEM;
	spin_lock(&bdev_lock);
	if (!inode->i_bdev) {
		inode->i_bdev = bdev;
		inode->i_mapping = bdev->bd_inode->i_mapping;
		list_add(&inode->i_devices, &bdev->bd_inodes);
	} else if (inode->i_bdev != bdev)
		BUG();
	spin_unlock(&bdev_lock);
	return 0;
}

/* Call when you free inode */

void bd_forget(struct inode *inode)
{
	spin_lock(&bdev_lock);
	if (inode->i_bdev)
		__bd_forget(inode);
	spin_unlock(&bdev_lock);
}

int bd_claim(struct block_device *bdev, void *holder)
{
	int res = -EBUSY;
	spin_lock(&bdev_lock);
	if (!bdev->bd_holder || bdev->bd_holder == holder) {
		bdev->bd_holder = holder;
		bdev->bd_holders++;
		res = 0;
	}
	spin_unlock(&bdev_lock);
	return res;
}

void bd_release(struct block_device *bdev)
{
	spin_lock(&bdev_lock);
	if (!--bdev->bd_holders)
		bdev->bd_holder = NULL;
	spin_unlock(&bdev_lock);
}
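
/*
 * bd_claim()/bd_release() form a simple exclusion protocol: the first
 * claim records an opaque cookie in bd_holder, later claims succeed only
 * with the same cookie, and bd_holders counts the nesting. A hypothetical
 * caller would pair them like:
 *
 *	if (bd_claim(bdev, my_cookie) == 0) {
 *		... exclusive use of the device ...
 *		bd_release(bdev);
 *	}
 */
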
/*
 * Tries to open block device by device number.  Use it ONLY if you
 * really do not have anything better - i.e. when you are behind a
 * truly sucky interface and all you are given is a device number.  _Never_
 * to be used for internal purposes.  If you ever need it - reconsider
 * your API.
 */
struct block_device *open_by_devnum(dev_t dev, unsigned mode, int kind)
{
	struct block_device *bdev = bdget(dev);
	int err = -ENOMEM;
	int flags = mode & FMODE_WRITE ? O_RDWR : O_RDONLY;
	if (bdev)
		err = blkdev_get(bdev, mode, flags, kind);
	return err ? ERR_PTR(err) : bdev;
}

/*
 * This routine checks whether a removable media has been changed,
 * and invalidates all buffer-cache-entries in that case. This
 * is a relatively slow routine, so we have to try to minimize using
 * it. Thus it is called only upon a 'mount' or 'open'. This
 * is the best way of combining speed and utility, I think.
 * People changing diskettes in the middle of an operation deserve
 * to lose :-)
 */
int check_disk_change(struct block_device *bdev)
{
	struct gendisk *disk = bdev->bd_disk;
	struct block_device_operations * bdops = disk->fops;

	if (!bdops->media_changed)
		return 0;
	if (!bdops->media_changed(bdev->bd_disk))
		return 0;

	if (__invalidate_device(bdev, 0))
		printk("VFS: busy inodes on changed media.\n");

	if (bdops->revalidate_disk)
		bdops->revalidate_disk(bdev->bd_disk);
	if (bdev->bd_disk->minors > 1)
		bdev->bd_invalidated = 1;
	return 1;
}

static void bd_set_size(struct block_device *bdev, loff_t size)
{
	unsigned bsize = bdev_hardsect_size(bdev);

	i_size_write(bdev->bd_inode, size);
	while (bsize < PAGE_CACHE_SIZE) {
		if (size & bsize)
			break;
		bsize <<= 1;
	}
	bdev->bd_block_size = bsize;
	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
}
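
/*
 * bd_set_size() picks the largest power-of-two block size (capped at
 * PAGE_CACHE_SIZE) that still divides the device size evenly. Worked
 * example: a 1 GiB device with 512-byte sectors doubles bsize
 * 512 -> 1024 -> 2048 -> 4096 and stops at PAGE_CACHE_SIZE on 4K-page
 * machines, while a device of 1 GiB + 512 bytes stays at 512.
 */
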
static int do_open(struct block_device *bdev, struct inode *inode, struct file *file)
{
	struct module *owner = NULL;
	struct gendisk *disk;
	int ret = -ENXIO;
	int part;

	lock_kernel();
	disk = get_gendisk(bdev->bd_dev, &part);
	if (!disk) {
		unlock_kernel();
		bdput(bdev);
		return ret;
	}
	owner = disk->fops->owner;

	down(&bdev->bd_sem);
	if (!bdev->bd_openers) {
		bdev->bd_disk = disk;
		bdev->bd_contains = bdev;
		if (!part) {
			struct backing_dev_info *bdi;
			if (disk->fops->open) {
				ret = disk->fops->open(inode, file);
				if (ret)
					goto out_first;
			}
			if (!bdev->bd_openers) {
				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
				bdi = blk_get_backing_dev_info(bdev);
				if (bdi == NULL)
					bdi = &default_backing_dev_info;
				bdev->bd_inode->i_data.backing_dev_info = bdi;
			}
			if (bdev->bd_invalidated)
				rescan_partitions(disk, bdev);
		} else {
			struct hd_struct *p;
			struct block_device *whole;
			whole = bdget_disk(disk, 0);
			ret = -ENOMEM;
			if (!whole)
				goto out_first;
			ret = blkdev_get(whole, file->f_mode, file->f_flags, BDEV_RAW);
			if (ret)
				goto out_first;
			bdev->bd_contains = whole;
			down(&whole->bd_sem);
			whole->bd_part_count++;
			p = disk->part[part - 1];
			bdev->bd_inode->i_data.backing_dev_info =
			   whole->bd_inode->i_data.backing_dev_info;
			if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) {
				whole->bd_part_count--;
				up(&whole->bd_sem);
				ret = -ENXIO;
				goto out_first;
			}
			bdev->bd_offset = p->start_sect;
			bd_set_size(bdev, (loff_t) p->nr_sects << 9);
			up(&whole->bd_sem);
		}
	} else {
		put_disk(disk);
		module_put(owner);
		if (bdev->bd_contains == bdev) {
			if (bdev->bd_disk->fops->open) {
				ret = bdev->bd_disk->fops->open(inode, file);
				if (ret)
					goto out;
			}
			if (bdev->bd_invalidated)
				rescan_partitions(bdev->bd_disk, bdev);
		} else {
			down(&bdev->bd_contains->bd_sem);
			bdev->bd_contains->bd_part_count++;
			up(&bdev->bd_contains->bd_sem);
		}
	}
	bdev->bd_openers++;
	up(&bdev->bd_sem);
	unlock_kernel();
	return 0;

out_first:
	bdev->bd_disk = NULL;
	bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
	if (bdev != bdev->bd_contains)
		blkdev_put(bdev->bd_contains, BDEV_RAW);
	bdev->bd_contains = NULL;
	put_disk(disk);
	module_put(owner);
out:
	up(&bdev->bd_sem);
	unlock_kernel();
	if (ret)
		bdput(bdev);
	return ret;
}
int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags, int kind)
{
	/*
	 * This crockload is due to bad choice of ->open() type.
	 * It will go away.
	 * For now, block device ->open() routine must _not_
	 * examine anything in 'inode' argument except ->i_rdev.
	 */
	struct file fake_file = {};
	struct dentry fake_dentry = {};
	fake_file.f_mode = mode;
	fake_file.f_flags = flags;
	fake_file.f_dentry = &fake_dentry;
	fake_dentry.d_inode = bdev->bd_inode;

	return do_open(bdev, bdev->bd_inode, &fake_file);
}

int blkdev_open(struct inode * inode, struct file * filp)
{
	struct block_device *bdev;

	/*
	 * Preserve backwards compatibility and allow large file access
	 * even if userspace doesn't ask for it explicitly. Some mkfs
	 * binary needs it. We might want to drop this workaround
	 * during an unstable branch.
	 */
	filp->f_flags |= O_LARGEFILE;

	bd_acquire(inode);
	bdev = inode->i_bdev;

	return do_open(bdev, inode, filp);
}

int blkdev_put(struct block_device *bdev, int kind)
{
	int ret = 0;
	struct inode *bd_inode = bdev->bd_inode;
	struct gendisk *disk = bdev->bd_disk;

	down(&bdev->bd_sem);
	lock_kernel();
	if (!--bdev->bd_openers) {
		switch (kind) {
		case BDEV_FILE:
		case BDEV_FS:
			sync_blockdev(bd_inode->i_bdev);
			break;
		}
		kill_bdev(bdev);
	}
	if (bdev->bd_contains == bdev) {
		if (disk->fops->release)
			ret = disk->fops->release(bd_inode, NULL);
	} else {
		down(&bdev->bd_contains->bd_sem);
		bdev->bd_contains->bd_part_count--;
		up(&bdev->bd_contains->bd_sem);
	}
	if (!bdev->bd_openers) {
		struct module *owner = disk->fops->owner;

		put_disk(disk);
		module_put(owner);

		bdev->bd_disk = NULL;
		bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
		if (bdev != bdev->bd_contains) {
			blkdev_put(bdev->bd_contains, BDEV_RAW);
		}
		bdev->bd_contains = NULL;
	}
	unlock_kernel();
	up(&bdev->bd_sem);
	bdput(bdev);
	return ret;
}

int blkdev_close(struct inode * inode, struct file * filp)
{
	return blkdev_put(inode->i_bdev, BDEV_FILE);
}

static ssize_t blkdev_file_write(struct file *file, const char __user *buf,
				   size_t count, loff_t *ppos)
{
	struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count };

	return generic_file_write_nolock(file, &local_iov, 1, ppos);
}

static ssize_t blkdev_file_aio_write(struct kiocb *iocb, const char __user *buf,
				   size_t count, loff_t pos)
{
	struct iovec local_iov = { .iov_base = (void __user *)buf, .iov_len = count };

	return generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
}

struct address_space_operations def_blk_aops = {
	.readpage	= blkdev_readpage,
	.writepage	= blkdev_writepage,
	.sync_page	= block_sync_page,
	.prepare_write	= blkdev_prepare_write,
	.commit_write	= blkdev_commit_write,
	.writepages	= generic_writepages,
	.direct_IO	= blkdev_direct_IO,
};

struct file_operations def_blk_fops = {
	.open		= blkdev_open,
	.release	= blkdev_close,
	.llseek		= block_llseek,
	.read		= generic_file_read,
	.write		= blkdev_file_write,
	.aio_read	= generic_file_aio_read,
	.aio_write	= blkdev_file_aio_write,
	.mmap		= generic_file_mmap,
	.fsync		= block_fsync,
	.ioctl		= blkdev_ioctl,
	.readv		= generic_file_readv,
	.writev		= generic_file_writev,
	.sendfile	= generic_file_sendfile,
};

int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
{
	int res;
	mm_segment_t old_fs = get_fs();
	set_fs(KERNEL_DS);
	res = blkdev_ioctl(bdev->bd_inode, NULL, cmd, arg);
	set_fs(old_fs);
	return res;
}
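
/*
 * The get_fs()/set_fs(KERNEL_DS) bracket lets blkdev_ioctl() accept an
 * argument that points into kernel space instead of user space. A
 * hypothetical in-kernel caller forcing a partition-table re-read:
 *
 *	ioctl_by_bdev(bdev, BLKRRPART, 0);
 */
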
/**
 * lookup_bdev  - lookup a struct block_device by name
 *
 * @path:	special file representing the block device
 *
 * Get a reference to the blockdevice at @path in the current
 * namespace if possible and return it.  Return ERR_PTR(error)
 * otherwise.
 */
struct block_device *lookup_bdev(const char *path)
{
	struct block_device *bdev;
	struct inode *inode;
	struct nameidata nd;
	int error;

	if (!path || !*path)
		return ERR_PTR(-EINVAL);

	error = path_lookup(path, LOOKUP_FOLLOW, &nd);
	if (error)
		return ERR_PTR(error);

	inode = nd.dentry->d_inode;
	error = -ENOTBLK;
	if (!S_ISBLK(inode->i_mode))
		goto fail;
	error = -EACCES;
	if (nd.mnt->mnt_flags & MNT_NODEV)
		goto fail;
	error = bd_acquire(inode);
	if (error)
		goto fail;
	bdev = inode->i_bdev;

out:
	path_release(&nd);
	return bdev;
fail:
	bdev = ERR_PTR(error);
	goto out;
}

/**
 * open_bdev_excl  -  open a block device by name and set it up for use
 *
 * @path:	special file representing the block device
 * @flags:	%MS_RDONLY for opening read-only
 * @kind:	usage (same as the 4th parameter to blkdev_get)
 * @holder:	owner for exclusion
 *
 * Open the blockdevice described by the special file at @path, claim it
 * for the @holder and properly set it up for @kind usage.
 */
struct block_device *open_bdev_excl(const char *path, int flags,
				    int kind, void *holder)
{
	struct block_device *bdev;
	mode_t mode = FMODE_READ;
	int error = 0;

	bdev = lookup_bdev(path);
	if (IS_ERR(bdev))
		return bdev;

	if (!(flags & MS_RDONLY))
		mode |= FMODE_WRITE;
	error = blkdev_get(bdev, mode, 0, kind);
	if (error)
		return ERR_PTR(error);
	error = -EACCES;
	if (!(flags & MS_RDONLY) && bdev_read_only(bdev))
		goto blkdev_put;
	error = bd_claim(bdev, holder);
	if (error)
		goto blkdev_put;

	return bdev;

blkdev_put:
	blkdev_put(bdev, BDEV_FS);
	return ERR_PTR(error);
}

/**
 * close_bdev_excl  -  release a blockdevice opened by open_bdev_excl()
 *
 * @bdev:	blockdevice to close
 * @kind:	usage (same as the 4th parameter to blkdev_get)
 *
 * This is the counterpart to open_bdev_excl().
 */
void close_bdev_excl(struct block_device *bdev, int kind)
{
	bd_release(bdev);
	blkdev_put(bdev, kind);
}
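
/*
 * Hypothetical usage sketch for the open_bdev_excl()/close_bdev_excl()
 * pair (path and holder cookie are illustrative only):
 *
 *	struct block_device *bdev;
 *
 *	bdev = open_bdev_excl("/dev/hda1", MS_RDONLY, BDEV_FS, my_cookie);
 *	if (IS_ERR(bdev))
 *		return PTR_ERR(bdev);
 *	... read from the device ...
 *	close_bdev_excl(bdev, BDEV_FS);
 */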