4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
8 #include <linux/config.h>
9 #include <linux/init.h>
11 #include <linux/fcntl.h>
12 #include <linux/slab.h>
13 #include <linux/kmod.h>
14 #include <linux/major.h>
15 #include <linux/devfs_fs_kernel.h>
16 #include <linux/smp_lock.h>
17 #include <linux/highmem.h>
18 #include <linux/blkdev.h>
19 #include <linux/module.h>
20 #include <linux/blkpg.h>
21 #include <linux/buffer_head.h>
22 #include <linux/mpage.h>
23 #include <linux/mount.h>
24 #include <linux/uio.h>
25 #include <linux/namei.h>
26 #include <asm/uaccess.h>
/*
 * max_block - size of a block device counted in blocks.
 * @bdev: device to measure
 *
 * Visible here: retval starts as the all-ones sector_t value, sz is read
 * from bdev->bd_inode->i_size, and retval is overwritten with sz shifted
 * right by blksize_bits(block_size(bdev)).
 *
 * NOTE(review): extraction residue.  Each statement carries its original
 * line number, tokens are split one group per line and braces are gone.
 * Embedded lines 30, 34 and 38-40 are absent, so the condition guarding
 * the shift and the return statement cannot be read from this copy.
 */
29 static sector_t
max_block(struct block_device
*bdev
)
31 sector_t retval
= ~((sector_t
)0);
32 loff_t sz
= bdev
->bd_inode
->i_size
;
35 unsigned int size
= block_size(bdev
);
36 unsigned int sizebits
= blksize_bits(size
);
37 retval
= (sz
>> sizebits
);
42 /* Kill _all_ buffers, dirty or not.. */
43 static void kill_bdev(struct block_device
*bdev
)
45 invalidate_bdev(bdev
, 1);
46 truncate_inode_pages(bdev
->bd_inode
->i_mapping
, 0);
/*
 * set_blocksize - change the block size recorded for a block device.
 * @bdev: device whose bd_block_size and inode i_blkbits are updated
 * @size: requested block size
 *
 * Visible here: two tests on @size (the first against 512, PAGE_SIZE and
 * the power-of-two mask, the second against bdev_hardsect_size()), a read
 * of the current bd_block_size into oldsize, and the stores of the new
 * size and of blksize_bits(size).
 *
 * NOTE(review): extraction residue.  Each statement carries its original
 * line number, tokens are split one group per line and braces are gone.
 * Embedded lines 50-52, 55-56, 59-60, 62-64, 66 and 69-72 are absent, so
 * the outcome of a failed test, the declaration of oldsize, any call made
 * around the stores and the return value cannot be read from this copy.
 */
49 int set_blocksize(struct block_device
*bdev
, int size
)
53 /* Size must be a power of two, and between 512 and PAGE_SIZE */
54 if (size
> PAGE_SIZE
|| size
< 512 || (size
& (size
-1)))
57 /* Size cannot be smaller than the size supported by the device */
58 if (size
< bdev_hardsect_size(bdev
))
61 oldsize
= bdev
->bd_block_size
;
65 /* Ok, we're actually changing the blocksize.. */
67 bdev
->bd_block_size
= size
;
68 bdev
->bd_inode
->i_blkbits
= blksize_bits(size
);
/*
 * sb_set_blocksize - set the block size of a superblock and its device.
 * @sb:   superblock to update
 * @size: requested block size
 *
 * Visible here: set_blocksize() is called on sb->s_bdev and its result is
 * tested for being negative; @size is stored in sb->s_blocksize; a loop
 * starts bits at 9 with size shifted right by 9, keeps going while a
 * further right shift by 1 leaves size non-zero, and increments bits each
 * round; bits is stored in sb->s_blocksize_bits and sb->s_blocksize is
 * returned.
 *
 * NOTE(review): extraction residue.  Embedded lines 74-75, 77, 80 and 83
 * are absent, so the declaration of bits, the action taken when
 * set_blocksize() fails and the loop body cannot be read from this copy.
 */
73 int sb_set_blocksize(struct super_block
*sb
, int size
)
76 if (set_blocksize(sb
->s_bdev
, size
) < 0)
78 sb
->s_blocksize
= size
;
79 for (bits
= 9, size
>>= 9; size
>>= 1; bits
++)
81 sb
->s_blocksize_bits
= bits
;
82 return sb
->s_blocksize
;
/*
 * sb_min_blocksize - set a superblock block size with regard to the
 * hardware sector size of its device.
 * @sb:   superblock to update
 * @size: requested block size
 *
 * Visible here: minsize is read from bdev_hardsect_size(sb->s_bdev) and
 * the function returns sb_set_blocksize(sb, size).
 *
 * NOTE(review): extraction residue.  Embedded lines 86, 88-89 and 91 are
 * absent, so how minsize is applied to @size cannot be read from this
 * copy.
 */
85 int sb_min_blocksize(struct super_block
*sb
, int size
)
87 int minsize
= bdev_hardsect_size(sb
->s_bdev
);
90 return sb_set_blocksize(sb
, size
);
/*
 * blkdev_get_block - map one block of the device inode onto a buffer head.
 * @inode:  device inode; its i_bdev is used
 * @iblock: block number
 * @bh:     buffer head to fill in
 * @create: not referenced in the visible lines
 *
 * Visible here: @iblock is compared (>=) with max_block(inode->i_bdev);
 * bh->b_bdev is set to inode->i_bdev, bh->b_blocknr to @iblock, and the
 * buffer is marked mapped.
 *
 * NOTE(review): extraction residue.  Embedded lines 93, 96, 98-99 and
 * 103-104 are absent, so the return type, the result of a failed range
 * test and the final return value cannot be read from this copy.
 */
94 blkdev_get_block(struct inode
*inode
, sector_t iblock
,
95 struct buffer_head
*bh
, int create
)
97 if (iblock
>= max_block(inode
->i_bdev
))
100 bh
->b_bdev
= inode
->i_bdev
;
101 bh
->b_blocknr
= iblock
;
102 set_buffer_mapped(bh
);
/*
 * blkdev_get_blocks - map a run of blocks of the device inode onto one
 * buffer head.
 * @inode:      device inode; its i_bdev and i_blkbits are used
 * @iblock:     first block number
 * @max_blocks: number of blocks in the run
 * @bh:         buffer head to fill in
 * @create:     not referenced in the visible lines
 *
 * Visible here: (@iblock + @max_blocks) is compared (>) with
 * max_block(inode->i_bdev); bh->b_bdev, bh->b_blocknr and bh->b_size
 * (@max_blocks shifted left by inode->i_blkbits) are filled in and the
 * buffer is marked mapped.
 *
 * NOTE(review): extraction residue.  Embedded lines 106, 109, 111-112 and
 * 117-118 are absent, so the return type, the result of a failed range
 * test and the final return value cannot be read from this copy.
 */
107 blkdev_get_blocks(struct inode
*inode
, sector_t iblock
,
108 unsigned long max_blocks
, struct buffer_head
*bh
, int create
)
110 if ((iblock
+ max_blocks
) > max_block(inode
->i_bdev
))
113 bh
->b_bdev
= inode
->i_bdev
;
114 bh
->b_blocknr
= iblock
;
115 bh
->b_size
= max_blocks
<< inode
->i_blkbits
;
116 set_buffer_mapped(bh
);
/*
 * blkdev_direct_IO - direct I/O entry point for block device files.
 * @rw, @iocb, @iov, @offset, @nr_segs: passed through unchanged
 *
 * The inode is taken from iocb->ki_filp via
 * f_dentry->d_inode->i_mapping->host, and the call is forwarded to
 * blockdev_direct_IO() with inode->i_bdev and blkdev_get_blocks as the
 * block-mapping callback.  The result of that call is returned.
 *
 * NOTE(review): extraction residue.  Embedded line 120, which held the
 * storage class and return type, is absent along with the braces.
 */
121 blkdev_direct_IO(int rw
, struct kiocb
*iocb
, const struct iovec
*iov
,
122 loff_t offset
, unsigned long nr_segs
)
124 struct file
*file
= iocb
->ki_filp
;
125 struct inode
*inode
= file
->f_dentry
->d_inode
->i_mapping
->host
;
127 return blockdev_direct_IO(rw
, iocb
, inode
, inode
->i_bdev
, iov
, offset
,
128 nr_segs
, blkdev_get_blocks
);
131 static int blkdev_writepage(struct page
*page
, struct writeback_control
*wbc
)
133 return block_write_full_page(page
, blkdev_get_block
, wbc
);
136 static int blkdev_readpage(struct file
* file
, struct page
* page
)
138 return block_read_full_page(page
, blkdev_get_block
);
141 static int blkdev_prepare_write(struct file
*file
, struct page
*page
, unsigned from
, unsigned to
)
143 return block_prepare_write(page
, from
, to
, blkdev_get_block
);
/*
 * blkdev_commit_write - ->commit_write for the block device mapping
 * @file: unused
 * @page: page that was written
 * @from: start of the range, passed through
 * @to:   end of the range, passed through
 *
 * Forwards to block_commit_write() and returns its result.
 */
static int blkdev_commit_write(struct file *file, struct page *page,
			       unsigned from, unsigned to)
{
	return block_commit_write(page, from, to);
}
/*
 * For a block special file, file->f_dentry->d_inode->i_size is zero,
 * so we compute the size by hand (just as in block_read/write above).
 */
/*
 * block_llseek - ->llseek for block device files.
 * @file:   open file; f_pos is read and may be updated
 * @offset: requested offset
 * @origin: not referenced in the visible lines
 *
 * Visible here: size is read from
 * file->f_dentry->d_inode->i_bdev->bd_inode->i_size; one statement adds
 * file->f_pos to @offset; when the offset lies in [0, size] and differs
 * from file->f_pos it is stored in file->f_pos.
 *
 * NOTE(review): extraction residue.  Embedded lines 157-158, 160-168,
 * 170-171 and 175-181 are absent, so how @origin selects the adjustment
 * and what the function returns cannot be read from this copy.
 */
156 static loff_t
block_llseek(struct file
*file
, loff_t offset
, int origin
)
159 loff_t size
= file
->f_dentry
->d_inode
->i_bdev
->bd_inode
->i_size
;
169 offset
+= file
->f_pos
;
172 if (offset
>= 0 && offset
<= size
) {
173 if (offset
!= file
->f_pos
) {
174 file
->f_pos
= offset
;
183 * Filp may be NULL when we are called by an msync of a vma
184 * since the vma has no handle.
187 static int block_fsync(struct file
*filp
, struct dentry
*dentry
, int datasync
)
189 struct inode
* inode
= dentry
->d_inode
;
191 return sync_blockdev(inode
->i_bdev
);
198 static struct super_block
*bd_get_sb(struct file_system_type
*fs_type
,
199 int flags
, const char *dev_name
, void *data
)
201 return get_sb_pseudo(fs_type
, "bdev:", NULL
, 0x62646576);
/*
 * bd_type - filesystem type object that bdev_cache_init() passes to
 * register_filesystem() and kern_mount().
 *
 * NOTE(review): extraction residue.  Only the .kill_sb member survives;
 * embedded lines 205, 206 and 208 (the other initializers and the closing
 * brace) are absent from this copy.
 */
204 static struct file_system_type bd_type
= {
207 .kill_sb
= kill_anon_super
,
/* Mount of the bdev pseudo filesystem; assigned in bdev_cache_init(). */
static struct vfsmount *bd_mnt;
/* Superblock of that mount (bd_mnt->mnt_sb); non-static, for writeback. */
struct super_block *blockdev_superblock;
214 * bdev cache handling - shamelessly stolen from inode.c
215 * We use smaller hashtable, though.
219 #define HASH_SIZE (1UL << HASH_BITS)
220 #define HASH_MASK (HASH_SIZE-1)
221 static struct list_head bdev_hashtable
[HASH_SIZE
];
222 static spinlock_t bdev_lock __cacheline_aligned_in_smp
= SPIN_LOCK_UNLOCKED
;
223 static kmem_cache_t
* bdev_cachep
;
225 #define alloc_bdev() \
226 ((struct block_device *) kmem_cache_alloc(bdev_cachep, SLAB_KERNEL))
227 #define destroy_bdev(bdev) kmem_cache_free(bdev_cachep, (bdev))
229 static void init_once(void * foo
, kmem_cache_t
* cachep
, unsigned long flags
)
231 struct block_device
* bdev
= (struct block_device
*) foo
;
233 if ((flags
& (SLAB_CTOR_VERIFY
|SLAB_CTOR_CONSTRUCTOR
)) ==
234 SLAB_CTOR_CONSTRUCTOR
)
236 memset(bdev
, 0, sizeof(*bdev
));
237 sema_init(&bdev
->bd_sem
, 1);
238 INIT_LIST_HEAD(&bdev
->bd_inodes
);
/*
 * bdev_cache_init - boot-time setup of the block device cache.
 *
 * Visible here, in order: a list head pointer starting at bdev_hashtable
 * and an INIT_LIST_HEAD() on it; creation of the "bdev_cache" slab cache
 * sized for struct block_device with SLAB_HWCACHE_ALIGN and init_once as
 * constructor; registration of bd_type; kern_mount() of bd_type into
 * bd_mnt; and the store of bd_mnt->mnt_sb into blockdev_superblock.
 * Three panic() calls name the failing step.
 *
 * NOTE(review): extraction residue.  Embedded lines 243-244, 246-248,
 * 250-253, 257-258, 261, 265 and 268 are absent, so the loop over the
 * hash buckets, the remaining kmem_cache_create() arguments and the
 * conditions guarding each panic() cannot be read from this copy.
 */
242 void __init
bdev_cache_init(void)
245 struct list_head
*head
= bdev_hashtable
;
249 INIT_LIST_HEAD(head
);
254 bdev_cachep
= kmem_cache_create("bdev_cache",
255 sizeof(struct block_device
),
256 0, SLAB_HWCACHE_ALIGN
, init_once
,
259 panic("Cannot create bdev_cache SLAB cache");
260 err
= register_filesystem(&bd_type
);
262 panic("Cannot register bdev pseudo-fs");
263 bd_mnt
= kern_mount(&bd_type
);
264 err
= PTR_ERR(bd_mnt
);
266 panic("Cannot create bdev pseudo-fs");
267 blockdev_superblock
= bd_mnt
->mnt_sb
; /* For writeback */
271 * Most likely _very_ bad one - but then it's hardly critical for small
272 * /dev and can be fixed when somebody will need really large one.
274 static inline unsigned long hash(dev_t dev
)
276 unsigned long tmp
= dev
;
277 tmp
= tmp
+ (tmp
>> HASH_BITS
) + (tmp
>> HASH_BITS
*2);
278 return tmp
& HASH_MASK
;
/*
 * bdfind - search one hash chain for a block device.
 * @dev:  device number to look for
 * @head: hash chain to walk
 *
 * Visible here: the chain is walked with list_for_each(), each entry is
 * converted with list_entry(..., bd_hash), entries are tested with
 * bd_dev != dev, and bd_count is incremented atomically.
 *
 * NOTE(review): extraction residue.  Embedded lines 282-283, 288 and
 * 290-293 are absent, so the declaration of p, the statement under the
 * mismatch test and both return statements cannot be read from this copy.
 * Both visible callers hold bdev_lock around the call.
 */
281 static struct block_device
*bdfind(dev_t dev
, struct list_head
*head
)
284 struct block_device
*bdev
;
285 list_for_each(p
, head
) {
286 bdev
= list_entry(p
, struct block_device
, bd_hash
);
287 if (bdev
->bd_dev
!= dev
)
289 atomic_inc(&bdev
->bd_count
);
/*
 * bdget - look up, or create, the block_device for a device number.
 * @dev: device number
 *
 * Visible here, in order:
 *  - the hash chain is bdev_hashtable + hash(dev) and is searched with
 *    bdfind() under bdev_lock;
 *  - a new object comes from alloc_bdev() and a new inode from
 *    new_inode(bd_mnt->mnt_sb); the object gets bd_count 1, bd_dev,
 *    bd_contains NULL, bd_inode, bd_block_size (1 << i_blkbits) and zeroed
 *    bd_part_count / bd_invalidated; the inode gets S_IFBLK mode, i_rdev,
 *    i_bdev, def_blk_aops, GFP_USER and default_backing_dev_info;
 *  - bdev_lock is taken again and the chain searched a second time;
 *  - one path adds new_bdev to the chain and unlocks; another unlocks,
 *    drops the new inode with iput() and frees new_bdev.
 *
 * NOTE(review): extraction residue.  Embedded lines 296, 302-303, 305,
 * 307, 309, 325, 328-329, 332 and 334-336 are absent, so the tests that
 * pick between these paths, the allocation failure handling and every
 * return statement cannot be read from this copy.
 */
295 struct block_device
*bdget(dev_t dev
)
297 struct list_head
* head
= bdev_hashtable
+ hash(dev
);
298 struct block_device
*bdev
, *new_bdev
;
299 spin_lock(&bdev_lock
);
300 bdev
= bdfind(dev
, head
);
301 spin_unlock(&bdev_lock
);
304 new_bdev
= alloc_bdev();
306 struct inode
*inode
= new_inode(bd_mnt
->mnt_sb
);
308 kdev_t kdev
= to_kdev_t(dev
);
310 atomic_set(&new_bdev
->bd_count
,1);
311 new_bdev
->bd_dev
= dev
;
312 new_bdev
->bd_contains
= NULL
;
313 new_bdev
->bd_inode
= inode
;
314 new_bdev
->bd_block_size
= (1 << inode
->i_blkbits
);
315 new_bdev
->bd_part_count
= 0;
316 new_bdev
->bd_invalidated
= 0;
317 inode
->i_mode
= S_IFBLK
;
318 inode
->i_rdev
= kdev
;
319 inode
->i_bdev
= new_bdev
;
320 inode
->i_data
.a_ops
= &def_blk_aops
;
321 inode
->i_data
.gfp_mask
= GFP_USER
;
322 inode
->i_data
.backing_dev_info
= &default_backing_dev_info
;
323 spin_lock(&bdev_lock
);
324 bdev
= bdfind(dev
, head
);
326 list_add(&new_bdev
->bd_hash
, head
);
327 spin_unlock(&bdev_lock
);
330 spin_unlock(&bdev_lock
);
331 iput(new_bdev
->bd_inode
);
333 destroy_bdev(new_bdev
);
/*
 * nr_blockdev_pages - total page cache pages held by block devices.
 *
 * Visible here: under bdev_lock, every bucket of bdev_hashtable is walked
 * and bd_inode->i_mapping->nrpages of each chained block_device is added
 * to ret.
 *
 * NOTE(review): extraction residue.  Embedded lines 339-342, 347-349,
 * 352, 355-356 and 358-359 are absent, so the declarations of i and ret
 * and the return statement cannot be read from this copy.
 */
338 long nr_blockdev_pages(void)
343 spin_lock(&bdev_lock
);
344 for (i
= 0; i
< ARRAY_SIZE(bdev_hashtable
); i
++) {
345 struct list_head
*head
= &bdev_hashtable
[i
];
346 struct list_head
*lh
;
350 list_for_each(lh
, head
) {
351 struct block_device
*bdev
;
353 bdev
= list_entry(lh
, struct block_device
, bd_hash
);
354 ret
+= bdev
->bd_inode
->i_mapping
->nrpages
;
357 spin_unlock(&bdev_lock
);
361 static inline void __bd_forget(struct inode
*inode
)
363 list_del_init(&inode
->i_devices
);
364 inode
->i_bdev
= NULL
;
365 inode
->i_mapping
= &inode
->i_data
;
/*
 * bdput - drop one reference to a block device.
 * @bdev: device whose bd_count is decremented
 *
 * Visible here: atomic_dec_and_lock() on bd_count with bdev_lock; inside
 * that branch bd_openers is tested, the device is removed from its hash
 * chain, every inode on bd_inodes is passed to __bd_forget(), the lock is
 * released and bd_inode is dropped with iput().
 *
 * NOTE(review): extraction residue.  Embedded lines 369, 371, 373, 377
 * and 380-382 are absent, so the declaration of p and the statement that
 * runs when bd_openers is non-zero cannot be read from this copy.
 */
368 void bdput(struct block_device
*bdev
)
370 if (atomic_dec_and_lock(&bdev
->bd_count
, &bdev_lock
)) {
372 if (bdev
->bd_openers
)
374 list_del(&bdev
->bd_hash
);
375 while ( (p
= bdev
->bd_inodes
.next
) != &bdev
->bd_inodes
) {
376 __bd_forget(list_entry(p
, struct inode
, i_devices
));
378 spin_unlock(&bdev_lock
);
379 iput(bdev
->bd_inode
);
/*
 * bd_acquire - make sure a device inode has a block_device attached.
 * @inode: device special inode
 *
 * Visible here: under bdev_lock, one path increments
 * inode->i_bdev->bd_count and unlocks.  Another path unlocks, obtains a
 * block_device from bdget(kdev_t_to_nr(inode->i_rdev)), retakes the lock
 * and, if inode->i_bdev is still NULL, stores the device, redirects
 * inode->i_mapping to the mapping of the device inode and links the inode
 * on bdev->bd_inodes.  There is also a branch for inode->i_bdev differing
 * from the freshly obtained device.
 *
 * NOTE(review): extraction residue.  Embedded lines 385, 388, 391-392,
 * 395-396, 403 and 405-406 are absent, so the test that picks the first
 * path, the handling of a failed bdget(), the body of the mismatch branch
 * and every return value cannot be read from this copy.
 */
384 int bd_acquire(struct inode
*inode
)
386 struct block_device
*bdev
;
387 spin_lock(&bdev_lock
);
389 atomic_inc(&inode
->i_bdev
->bd_count
);
390 spin_unlock(&bdev_lock
);
393 spin_unlock(&bdev_lock
);
394 bdev
= bdget(kdev_t_to_nr(inode
->i_rdev
));
397 spin_lock(&bdev_lock
);
398 if (!inode
->i_bdev
) {
399 inode
->i_bdev
= bdev
;
400 inode
->i_mapping
= bdev
->bd_inode
->i_mapping
;
401 list_add(&inode
->i_devices
, &bdev
->bd_inodes
);
402 } else if (inode
->i_bdev
!= bdev
)
404 spin_unlock(&bdev_lock
);
/*
 * bd_forget - locked entry point used when an inode is being freed.
 * @inode: inode being freed
 *
 * Visible here: bdev_lock is taken and released.
 *
 * NOTE(review): extraction residue.  Embedded lines 411, 413-414 and 416
 * are absent, so the work done while the lock is held cannot be read from
 * this copy.
 */
408 /* Call when you free inode */
410 void bd_forget(struct inode
*inode
)
412 spin_lock(&bdev_lock
);
415 spin_unlock(&bdev_lock
);
/*
 * bd_claim - claim a block device for a holder.
 * @bdev:   device to claim
 * @holder: opaque owner cookie
 *
 * Visible here: under bdev_lock, when the device has no holder or the
 * holder already equals @holder, bd_holder is set to @holder.
 *
 * NOTE(review): extraction residue.  Embedded lines 419-420, 424-426 and
 * 428-429 are absent, so the result variable, any counter update and the
 * return value cannot be read from this copy.  bd_release() decrements
 * bd_holders, which suggests a matching increment here - confirm against
 * a clean copy.
 */
418 int bd_claim(struct block_device
*bdev
, void *holder
)
421 spin_lock(&bdev_lock
);
422 if (!bdev
->bd_holder
|| bdev
->bd_holder
== holder
) {
423 bdev
->bd_holder
= holder
;
427 spin_unlock(&bdev_lock
);
431 void bd_release(struct block_device
*bdev
)
433 spin_lock(&bdev_lock
);
434 if (!--bdev
->bd_holders
)
435 bdev
->bd_holder
= NULL
;
436 spin_unlock(&bdev_lock
);
/*
 * Tries to open a block device by device number.  Use it ONLY if you
 * really do not have anything better - i.e. when you are behind a
 * truly sucky interface and all you are given is a device number.  _Never_
 * to be used for internal purposes.  If you ever need it - reconsider.
 */
/*
 * open_by_devnum - open a block device given only its device number.
 * @dev:  device number, handed to bdget()
 * @mode: FMODE_* bits; FMODE_WRITE selects O_RDWR, otherwise O_RDONLY
 * @kind: passed through to blkdev_get()
 *
 * Visible here: the device comes from bdget(), err receives the result
 * of blkdev_get(), and the function returns ERR_PTR(err) when err is
 * non-zero and the device otherwise.
 *
 * NOTE(review): extraction residue.  Embedded lines 447, 449, 451 and 454
 * are absent, so the declaration and initial value of err and whatever
 * guards the blkdev_get() call cannot be read from this copy.
 */
446 struct block_device
*open_by_devnum(dev_t dev
, unsigned mode
, int kind
)
448 struct block_device
*bdev
= bdget(dev
);
450 int flags
= mode
& FMODE_WRITE
? O_RDWR
: O_RDONLY
;
452 err
= blkdev_get(bdev
, mode
, flags
, kind
);
453 return err
? ERR_PTR(err
) : bdev
;
/*
 * This routine checks whether a removable medium has been changed,
 * and invalidates all buffer-cache entries in that case.  This
 * is a relatively slow routine, so we have to try to minimize using
 * it.  Thus it is called only upon a 'mount' or 'open'.  This
 * is the best way of combining speed and utility, I think.
 * People changing diskettes in the middle of an operation deserve
 * [the rest of this sentence is missing from this copy].
 */
/*
 * check_disk_change - react to a media change on a block device.
 * @bdev: device to check; bdev->bd_disk and its fops are used
 *
 * Visible here, in order: a test for a missing ->media_changed method; a
 * test for ->media_changed(bdev->bd_disk) returning zero; a call to
 * __invalidate_device(bdev, 0) with a printk when it returns non-zero; an
 * optional ->revalidate_disk(bdev->bd_disk) call; and bd_invalidated set
 * to 1 when the disk has more than one minor.
 *
 * NOTE(review): extraction residue.  Embedded lines 466, 469, 471,
 * 473-474, 477 and 482-483 are absent, so the statements under the first
 * two tests and the final return value cannot be read from this copy.
 */
465 int check_disk_change(struct block_device
*bdev
)
467 struct gendisk
*disk
= bdev
->bd_disk
;
468 struct block_device_operations
* bdops
= disk
->fops
;
470 if (!bdops
->media_changed
)
472 if (!bdops
->media_changed(bdev
->bd_disk
))
475 if (__invalidate_device(bdev
, 0))
476 printk("VFS: busy inodes on changed media.\n");
478 if (bdops
->revalidate_disk
)
479 bdops
->revalidate_disk(bdev
->bd_disk
);
480 if (bdev
->bd_disk
->minors
> 1)
481 bdev
->bd_invalidated
= 1;
/*
 * bd_set_size - record the size of a device and choose its block size.
 * @bdev: device to update
 * @size: stored as-is in bdev->bd_inode->i_size
 *
 * Visible here: bsize starts at bdev_hardsect_size(bdev), a loop runs
 * while bsize is below PAGE_CACHE_SIZE, and afterwards bsize is stored in
 * bd_block_size and blksize_bits(bsize) in the inode i_blkbits.
 *
 * NOTE(review): extraction residue.  Embedded lines 486 and 490-493 are
 * absent, so the loop body - how bsize grows and when the loop stops
 * early - cannot be read from this copy.
 */
485 static void bd_set_size(struct block_device
*bdev
, loff_t size
)
487 unsigned bsize
= bdev_hardsect_size(bdev
);
488 bdev
->bd_inode
->i_size
= size
;
489 while (bsize
< PAGE_CACHE_SIZE
) {
494 bdev
->bd_block_size
= bsize
;
495 bdev
->bd_inode
->i_blkbits
= blksize_bits(bsize
);
/*
 * do_open - common open path behind blkdev_get() and blkdev_open().
 * @bdev:  device being opened
 * @inode: inode handed to the driver ->open() method
 * @file:  file handed to the driver ->open() method; f_mode and f_flags
 *         are reused when the containing disk is opened
 *
 * Visible here, in order:
 *  - the gendisk and partition number come from get_gendisk(); owner is
 *    taken from disk->fops->owner;
 *  - first open (bd_openers == 0): bd_disk and bd_contains are set.  One
 *    branch calls the driver ->open(), sets the size from
 *    get_capacity(disk) << 9, picks a backing_dev_info (falling back to
 *    default_backing_dev_info) and rescans partitions when bd_invalidated
 *    is set.  The other branch gets the whole-disk device with
 *    bdget_disk(disk, 0), opens it through blkdev_get(..., BDEV_RAW),
 *    records it in bd_contains, bumps its bd_part_count under its bd_sem,
 *    copies its backing_dev_info, and takes bd_offset and size from
 *    disk->part[part - 1]; the count is dropped again when the disk is
 *    not GENHD_FL_UP or the partition is missing or empty;
 *  - later opens: a device that contains itself gets the driver ->open()
 *    and an optional rescan; otherwise bd_part_count of the containing
 *    device is incremented under its bd_sem;
 *  - a cleanup tail resets bd_disk, the backing_dev_info and bd_contains,
 *    calling blkdev_put() on the containing device when it differs.
 *
 * NOTE(review): extraction residue.  A large share of the embedded lines
 * between 499 and 598 is absent (among them 502-505, 507-511, 513-514,
 * 518, 522-525, 529, 535-536, 539-541, 543-544, 553-556, 559-563,
 * 567-569, 572 and 576-583), so the declarations of ret, part and p, the
 * locking on bdev->bd_sem, the bd_openers accounting, every error test,
 * label and goto, and the return value cannot be read from this copy.
 * Do not rebuild this function from these fragments.
 */
498 static int do_open(struct block_device
*bdev
, struct inode
*inode
, struct file
*file
)
500 struct module
*owner
= NULL
;
501 struct gendisk
*disk
;
506 disk
= get_gendisk(bdev
->bd_dev
, &part
);
512 owner
= disk
->fops
->owner
;
515 if (!bdev
->bd_openers
) {
516 bdev
->bd_disk
= disk
;
517 bdev
->bd_contains
= bdev
;
519 struct backing_dev_info
*bdi
;
520 if (disk
->fops
->open
) {
521 ret
= disk
->fops
->open(inode
, file
);
526 if (!bdev
->bd_openers
) {
527 bd_set_size(bdev
,(loff_t
)get_capacity(disk
)<<9);
528 bdi
= blk_get_backing_dev_info(bdev
);
530 bdi
= &default_backing_dev_info
;
531 bdev
->bd_inode
->i_data
.backing_dev_info
= bdi
;
533 if (bdev
->bd_invalidated
)
534 rescan_partitions(disk
, bdev
);
537 struct block_device
*whole
;
538 whole
= bdget_disk(disk
, 0);
542 ret
= blkdev_get(whole
, file
->f_mode
, file
->f_flags
, BDEV_RAW
);
545 bdev
->bd_contains
= whole
;
546 down(&whole
->bd_sem
);
547 whole
->bd_part_count
++;
548 p
= disk
->part
[part
- 1];
549 bdev
->bd_inode
->i_data
.backing_dev_info
=
550 whole
->bd_inode
->i_data
.backing_dev_info
;
551 if (!(disk
->flags
& GENHD_FL_UP
) || !p
|| !p
->nr_sects
) {
552 whole
->bd_part_count
--;
557 bdev
->bd_offset
= p
->start_sect
;
558 bd_set_size(bdev
, (loff_t
) p
->nr_sects
<< 9);
564 if (bdev
->bd_contains
== bdev
) {
565 if (bdev
->bd_disk
->fops
->open
) {
566 ret
= bdev
->bd_disk
->fops
->open(inode
, file
);
570 if (bdev
->bd_invalidated
)
571 rescan_partitions(bdev
->bd_disk
, bdev
);
573 down(&bdev
->bd_contains
->bd_sem
);
574 bdev
->bd_contains
->bd_part_count
++;
575 up(&bdev
->bd_contains
->bd_sem
);
584 bdev
->bd_disk
= NULL
;
585 bdev
->bd_inode
->i_data
.backing_dev_info
= &default_backing_dev_info
;
586 if (bdev
!= bdev
->bd_contains
)
587 blkdev_put(bdev
->bd_contains
, BDEV_RAW
);
588 bdev
->bd_contains
= NULL
;
599 int blkdev_get(struct block_device
*bdev
, mode_t mode
, unsigned flags
, int kind
)
602 * This crockload is due to bad choice of ->open() type.
604 * For now, block device ->open() routine must _not_
605 * examine anything in 'inode' argument except ->i_rdev.
607 struct file fake_file
= {};
608 struct dentry fake_dentry
= {};
609 fake_file
.f_mode
= mode
;
610 fake_file
.f_flags
= flags
;
611 fake_file
.f_dentry
= &fake_dentry
;
612 fake_dentry
.d_inode
= bdev
->bd_inode
;
614 return do_open(bdev
, bdev
->bd_inode
, &fake_file
);
/*
 * blkdev_open - ->open entry point for block device special files.
 * @inode: device special inode
 * @filp:  file being opened; O_LARGEFILE is forced on in f_flags
 *
 * Visible here: O_LARGEFILE is OR-ed into filp->f_flags, bdev is read
 * from inode->i_bdev and the function returns do_open(bdev, inode, filp).
 *
 * NOTE(review): extraction residue.  Embedded lines 618, 620-621, 626,
 * 628-629 and 631 are absent, so any statement between the flag update
 * and the read of inode->i_bdev, and the delimiters of the comment inside
 * the body, cannot be read from this copy.
 */
617 int blkdev_open(struct inode
* inode
, struct file
* filp
)
619 struct block_device
*bdev
;
622 * Preserve backwards compatibility and allow large file access
623 * even if userspace doesn't ask for it explicitly. Some mkfs
624 * binary needs it. We might want to drop this workaround
625 * during an unstable branch.
627 filp
->f_flags
|= O_LARGEFILE
;
630 bdev
= inode
->i_bdev
;
632 return do_open(bdev
, inode
, filp
);
/*
 * blkdev_put - drop one open reference on a block device.
 * @bdev: device being closed
 * @kind: not referenced in the visible lines
 *
 * Visible here, in order:
 *  - bd_openers is pre-decremented and tested for zero, with a
 *    sync_blockdev() call on bd_inode->i_bdev in that branch;
 *  - a device that contains itself gets the driver ->release(bd_inode,
 *    NULL) when the method exists; otherwise bd_part_count of the
 *    containing device is decremented under its bd_sem;
 *  - when bd_openers is zero, owner is read from disk->fops->owner,
 *    bd_disk is cleared, the backing_dev_info is reset to
 *    default_backing_dev_info, blkdev_put() is called on the containing
 *    device when it differs from @bdev, and bd_contains is cleared.
 *
 * NOTE(review): extraction residue.  Embedded lines 636-637, 640-642,
 * 644-646, 648-651, 655, 659, 662-665, 670 and 672-677 are absent, so
 * the declaration of ret, the locking on bdev->bd_sem, the remaining
 * statements of the first branch, the release of the module and disk
 * references and the return value cannot be read from this copy.
 */
635 int blkdev_put(struct block_device
*bdev
, int kind
)
638 struct inode
*bd_inode
= bdev
->bd_inode
;
639 struct gendisk
*disk
= bdev
->bd_disk
;
643 if (!--bdev
->bd_openers
) {
647 sync_blockdev(bd_inode
->i_bdev
);
652 if (bdev
->bd_contains
== bdev
) {
653 if (disk
->fops
->release
)
654 ret
= disk
->fops
->release(bd_inode
, NULL
);
656 down(&bdev
->bd_contains
->bd_sem
);
657 bdev
->bd_contains
->bd_part_count
--;
658 up(&bdev
->bd_contains
->bd_sem
);
660 if (!bdev
->bd_openers
) {
661 struct module
*owner
= disk
->fops
->owner
;
666 bdev
->bd_disk
= NULL
;
667 bdev
->bd_inode
->i_data
.backing_dev_info
= &default_backing_dev_info
;
668 if (bdev
!= bdev
->bd_contains
) {
669 blkdev_put(bdev
->bd_contains
, BDEV_RAW
);
671 bdev
->bd_contains
= NULL
;
679 int blkdev_close(struct inode
* inode
, struct file
* filp
)
681 return blkdev_put(inode
->i_bdev
, BDEV_FILE
);
684 static ssize_t
blkdev_file_write(struct file
*file
, const char __user
*buf
,
685 size_t count
, loff_t
*ppos
)
687 struct iovec local_iov
= { .iov_base
= (void __user
*)buf
, .iov_len
= count
};
689 return generic_file_write_nolock(file
, &local_iov
, 1, ppos
);
692 static ssize_t
blkdev_file_aio_write(struct kiocb
*iocb
, const char __user
*buf
,
693 size_t count
, loff_t pos
)
695 struct iovec local_iov
= { .iov_base
= (void __user
*)buf
, .iov_len
= count
};
697 return generic_file_aio_write_nolock(iocb
, &local_iov
, 1, &iocb
->ki_pos
);
701 struct address_space_operations def_blk_aops
= {
702 .readpage
= blkdev_readpage
,
703 .writepage
= blkdev_writepage
,
704 .sync_page
= block_sync_page
,
705 .prepare_write
= blkdev_prepare_write
,
706 .commit_write
= blkdev_commit_write
,
707 .writepages
= generic_writepages
,
708 .direct_IO
= blkdev_direct_IO
,
/*
 * def_blk_fops - file operations table for block device special files.
 *
 * The members visible here route release, llseek, write, aio_write and
 * fsync to functions of this file and the rest to generic_file_* helpers
 * and blkdev_ioctl.
 *
 * NOTE(review): extraction residue.  Embedded lines 712 and 725 are
 * absent, so one initializer between the opening brace and .release, and
 * the closing brace, cannot be read from this copy.
 */
711 struct file_operations def_blk_fops
= {
713 .release
= blkdev_close
,
714 .llseek
= block_llseek
,
715 .read
= generic_file_read
,
716 .write
= blkdev_file_write
,
717 .aio_read
= generic_file_aio_read
,
718 .aio_write
= blkdev_file_aio_write
,
719 .mmap
= generic_file_mmap
,
720 .fsync
= block_fsync
,
721 .ioctl
= blkdev_ioctl
,
722 .readv
= generic_file_readv
,
723 .writev
= generic_file_writev
,
724 .sendfile
= generic_file_sendfile
,
/*
 * ioctl_by_bdev - issue an ioctl on a block device from kernel code.
 * @bdev: target device; its bd_inode is handed to blkdev_ioctl()
 * @cmd:  ioctl command
 * @arg:  ioctl argument
 *
 * Visible here: the current address limit is saved with get_fs() and res
 * receives blkdev_ioctl(bdev->bd_inode, NULL, cmd, arg).
 *
 * NOTE(review): extraction residue.  Embedded lines 728-729, 731 and
 * 733-735 are absent, so the declaration of res, any change and restore
 * of the address limit around the call, and the return statement cannot
 * be read from this copy.
 */
727 int ioctl_by_bdev(struct block_device
*bdev
, unsigned cmd
, unsigned long arg
)
730 mm_segment_t old_fs
= get_fs();
732 res
= blkdev_ioctl(bdev
->bd_inode
, NULL
, cmd
, arg
);
/**
 * lookup_bdev - look up a struct block_device by name
 *
 * @path:	special file representing the block device
 *
 * Get a reference to the block device at @path in the current
 * namespace if possible and return it.  Return ERR_PTR(error)
 * otherwise.
 */
/*
 * lookup_bdev - resolve a path name to its block device.
 * @path: path of a block special file
 *
 * Visible here, in order: an early return of ERR_PTR(-EINVAL); a
 * path_lookup() with LOOKUP_FOLLOW whose error is returned through
 * ERR_PTR(); the inode is taken from nd.dentry->d_inode; tests for a
 * non-block-device mode and for MNT_NODEV on the mount; bd_acquire() on
 * the inode; bdev read from inode->i_bdev; and an error path that turns
 * error into ERR_PTR(error).
 *
 * NOTE(review): extraction residue.  Embedded lines 747, 749-753, 755,
 * 757, 759, 761, 763-764, 766, 768-769 and 771-779 are absent, so the
 * declarations of nd, inode and error, the conditions guarding the early
 * returns, the error codes chosen by the two tests, the release of the
 * path lookup and the final return cannot be read from this copy.
 */
746 struct block_device
*lookup_bdev(const char *path
)
748 struct block_device
*bdev
;
754 return ERR_PTR(-EINVAL
);
756 error
= path_lookup(path
, LOOKUP_FOLLOW
, &nd
);
758 return ERR_PTR(error
);
760 inode
= nd
.dentry
->d_inode
;
762 if (!S_ISBLK(inode
->i_mode
))
765 if (nd
.mnt
->mnt_flags
& MNT_NODEV
)
767 error
= bd_acquire(inode
);
770 bdev
= inode
->i_bdev
;
776 bdev
= ERR_PTR(error
);
/**
 * open_bdev_excl - open a block device by name and set it up for use
 *
 * @path:	special file representing the block device
 * @flags:	%MS_RDONLY for opening read-only
 * @kind:	usage (same as the 4th parameter to blkdev_get)
 * @holder:	owner for exclusion
 *
 * Open the block device described by the special file at @path, claim it
 * for the @holder and properly set it up for @kind usage.
 */
/*
 * open_bdev_excl - open a block device by path and claim it for a holder.
 * @path:   path of a block special file, resolved with lookup_bdev()
 * @flags:  mount flags; only MS_RDONLY is tested
 * @kind:   passed through to blkdev_get()
 * @holder: owner cookie handed to bd_claim()
 *
 * Visible here, in order: mode starts as FMODE_READ; the device comes
 * from lookup_bdev(); there is a test on !(flags & MS_RDONLY); the device
 * is opened with blkdev_get(bdev, mode, 0, kind) and a failure is
 * returned through ERR_PTR(); a second test combines !(flags & MS_RDONLY)
 * with bdev_read_only(bdev); bd_claim() is called; and an error tail
 * calls blkdev_put(bdev, BDEV_FS) before returning ERR_PTR(error).
 *
 * NOTE(review): extraction residue.  Embedded lines 793, 796-797,
 * 799-801, 803, 805, 807, 809, 811-816 and 819 are absent, so the
 * declaration of error, the check of the lookup_bdev() result, the
 * statement under the first MS_RDONLY test, the error code for a
 * read-only device and the success return cannot be read from this copy.
 * NOTE(review): the tail puts the device with BDEV_FS although it was
 * opened with @kind - confirm against a clean copy whether that is
 * intended.
 */
791 struct block_device
*open_bdev_excl(const char *path
, int flags
,
792 int kind
, void *holder
)
794 struct block_device
*bdev
;
795 mode_t mode
= FMODE_READ
;
798 bdev
= lookup_bdev(path
);
802 if (!(flags
& MS_RDONLY
))
804 error
= blkdev_get(bdev
, mode
, 0, kind
);
806 return ERR_PTR(error
);
808 if (!(flags
& MS_RDONLY
) && bdev_read_only(bdev
))
810 error
= bd_claim(bdev
, holder
);
817 blkdev_put(bdev
, BDEV_FS
);
818 return ERR_PTR(error
);
/**
 * close_bdev_excl - release a block device opened by open_bdev_excl()
 *
 * @bdev:	block device to close
 * @kind:	usage (same as the 4th parameter to blkdev_get)
 *
 * This is the counterpart to open_bdev_excl().
 */
/*
 * close_bdev_excl - close a device that was opened by open_bdev_excl().
 * @bdev: device to close
 * @kind: passed through to blkdev_put()
 *
 * Visible here: blkdev_put(bdev, kind) is called.
 *
 * NOTE(review): extraction residue.  Embedded lines 830-831 and anything
 * after 832 are absent, so any statement before the blkdev_put() call
 * (for instance the release of the claim taken by open_bdev_excl()) and
 * the end of the function cannot be read from this copy.
 */
829 void close_bdev_excl(struct block_device
*bdev
, int kind
)
832 blkdev_put(bdev
, kind
);