4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
8 #include <linux/config.h>
9 #include <linux/init.h>
11 #include <linux/fcntl.h>
12 #include <linux/slab.h>
13 #include <linux/kmod.h>
14 #include <linux/major.h>
15 #include <linux/devfs_fs_kernel.h>
16 #include <linux/smp_lock.h>
17 #include <linux/highmem.h>
18 #include <linux/blkdev.h>
19 #include <linux/module.h>
20 #include <linux/blkpg.h>
21 #include <linux/buffer_head.h>
22 #include <linux/mpage.h>
23 #include <linux/mount.h>
24 #include <linux/uio.h>
25 #include <asm/uaccess.h>
/*
 * max_block - size of a block device expressed in blocks.
 *
 * Visible steps: retval starts as the all-ones sector_t value, sz is the
 * i_size of the device's inode, and retval is then set to sz shifted right
 * by blksize_bits(block_size(bdev)).
 *
 * NOTE(review): the braces, whatever condition guards the shift, and the
 * return statement are missing from this listing.
 */
28 static sector_t
max_block(struct block_device
*bdev
)
30 sector_t retval
= ~((sector_t
)0);
31 loff_t sz
= bdev
->bd_inode
->i_size
;
34 unsigned int size
= block_size(bdev
);
35 unsigned int sizebits
= blksize_bits(size
);
36 retval
= (sz
>> sizebits
);
41 /* Kill _all_ buffers, dirty or not.. */
42 static void kill_bdev(struct block_device
*bdev
)
44 invalidate_bdev(bdev
, 1);
45 truncate_inode_pages(bdev
->bd_inode
->i_mapping
, 0);
/*
 * set_blocksize - change the block size used for a block device.
 *
 * Visible steps: size is tested against the power-of-two / [512, PAGE_SIZE]
 * rule and against bdev_hardsect_size(bdev); the current bd_block_size is
 * read into oldsize; then bd_block_size is set to size and the inode's
 * i_blkbits to blksize_bits(size).
 *
 * NOTE(review): the bodies of both tests, the declaration of oldsize, all
 * return statements and several other statements are missing from this
 * listing.
 */
48 int set_blocksize(struct block_device
*bdev
, int size
)
52 /* Size must be a power of two, and between 512 and PAGE_SIZE */
53 if (size
> PAGE_SIZE
|| size
< 512 || (size
& (size
-1)))
56 /* Size cannot be smaller than the size supported by the device */
57 if (size
< bdev_hardsect_size(bdev
))
60 oldsize
= bdev
->bd_block_size
;
64 /* Ok, we're actually changing the blocksize.. */
66 bdev
->bd_block_size
= size
;
67 bdev
->bd_inode
->i_blkbits
= blksize_bits(size
);
/*
 * sb_set_blocksize - set the block size of a superblock and its device.
 *
 * Visible steps: set_blocksize() is called on sb->s_bdev and its result is
 * tested for a negative value; s_blocksize is set to size; a loop starting
 * at bits = 9 shifts size right and counts bits; the count is stored in
 * s_blocksize_bits; sb->s_blocksize is returned.
 *
 * NOTE(review): the declaration of bits, the action taken when
 * set_blocksize() fails, and the loop body are missing from this listing.
 */
72 int sb_set_blocksize(struct super_block
*sb
, int size
)
75 if (set_blocksize(sb
->s_bdev
, size
) < 0)
77 sb
->s_blocksize
= size
;
78 for (bits
= 9, size
>>= 9; size
>>= 1; bits
++)
80 sb
->s_blocksize_bits
= bits
;
81 return sb
->s_blocksize
;
/*
 * sb_min_blocksize - set a superblock block size, taking the device's
 * hardware sector size into account.
 *
 * Visible steps: minsize is read from bdev_hardsect_size(sb->s_bdev) and the
 * result of sb_set_blocksize(sb, size) is returned.
 *
 * NOTE(review): the statement(s) that use minsize are missing from this
 * listing; presumably size is raised to at least minsize -- confirm.
 */
84 int sb_min_blocksize(struct super_block
*sb
, int size
)
86 int minsize
= bdev_hardsect_size(sb
->s_bdev
);
89 return sb_set_blocksize(sb
, size
);
/*
 * blkdev_get_block - map one block of a block-device inode onto a
 * buffer_head.
 *
 * Visible steps: iblock is compared against max_block(inode->i_bdev); the
 * buffer_head gets b_bdev = inode->i_bdev and b_blocknr = iblock (block
 * numbers map one-to-one) and is marked mapped.  'create' is not used in
 * the visible lines.
 *
 * NOTE(review): the return-type line, the action taken when iblock is out
 * of range, and the return statements are missing from this listing.
 */
93 blkdev_get_block(struct inode
*inode
, sector_t iblock
,
94 struct buffer_head
*bh
, int create
)
96 if (iblock
>= max_block(inode
->i_bdev
))
99 bh
->b_bdev
= inode
->i_bdev
;
100 bh
->b_blocknr
= iblock
;
101 set_buffer_mapped(bh
);
/*
 * blkdev_get_blocks - map a run of max_blocks blocks starting at iblock
 * onto a single buffer_head.
 *
 * Visible steps: (iblock + max_blocks) is compared against
 * max_block(inode->i_bdev); the buffer_head gets b_bdev, b_blocknr = iblock
 * and b_size = max_blocks << inode->i_blkbits, and is marked mapped.
 * 'create' is not used in the visible lines.
 *
 * NOTE(review): the return-type line, the action taken when the range is
 * too large, and the return statements are missing from this listing.
 */
106 blkdev_get_blocks(struct inode
*inode
, sector_t iblock
,
107 unsigned long max_blocks
, struct buffer_head
*bh
, int create
)
109 if ((iblock
+ max_blocks
) > max_block(inode
->i_bdev
))
112 bh
->b_bdev
= inode
->i_bdev
;
113 bh
->b_blocknr
= iblock
;
114 bh
->b_size
= max_blocks
<< inode
->i_blkbits
;
115 set_buffer_mapped(bh
);
/*
 * blkdev_direct_IO - direct I/O entry point for block devices.
 *
 * The inode is taken from the file's mapping host
 * (file->f_dentry->d_inode->i_mapping->host) and the request is handed to
 * generic_direct_IO() with inode->i_bdev as the device and
 * blkdev_get_blocks as the block-mapping callback.
 *
 * NOTE(review): the line carrying the return type and storage class is
 * missing from this listing.
 */
120 blkdev_direct_IO(int rw
, struct file
*file
, const struct iovec
*iov
,
121 loff_t offset
, unsigned long nr_segs
)
123 struct inode
*inode
= file
->f_dentry
->d_inode
->i_mapping
->host
;
125 return generic_direct_IO(rw
, inode
, inode
->i_bdev
, iov
, offset
,
126 nr_segs
, blkdev_get_blocks
);
129 static int blkdev_writepage(struct page
* page
)
131 return block_write_full_page(page
, blkdev_get_block
);
134 static int blkdev_readpage(struct file
* file
, struct page
* page
)
136 return block_read_full_page(page
, blkdev_get_block
);
139 static int blkdev_prepare_write(struct file
*file
, struct page
*page
, unsigned from
, unsigned to
)
141 return block_prepare_write(page
, from
, to
, blkdev_get_block
);
/*
 * Commit the byte range [from, to) of a page after a write, delegating to
 * block_commit_write().  The file argument is unused.
 */
static int blkdev_commit_write(struct file *file, struct page *page,
			       unsigned from, unsigned to)
{
	int status;

	status = block_commit_write(page, from, to);
	return status;
}
151 * for a block special file file->f_dentry->d_inode->i_size is zero
152 * so we compute the size by hand (just as in block_read/write above)
/*
 * block_llseek - llseek for block special files.
 *
 * The device size is read from the i_size of the block device's own inode
 * (file->f_dentry->d_inode->i_bdev->bd_inode).  In the visible lines the
 * current f_pos is added to offset, and an offset within [0, size] that
 * differs from f_pos is stored into file->f_pos.
 *
 * NOTE(review): the selection on 'origin', the handling of out-of-range
 * offsets and the return statements are missing from this listing.
 */
154 static loff_t
block_llseek(struct file
*file
, loff_t offset
, int origin
)
157 loff_t size
= file
->f_dentry
->d_inode
->i_bdev
->bd_inode
->i_size
;
167 offset
+= file
->f_pos
;
170 if (offset
>= 0 && offset
<= size
) {
171 if (offset
!= file
->f_pos
) {
172 file
->f_pos
= offset
;
181 * Filp may be NULL when we are called by an msync of a vma
182 * since the vma has no handle.
185 static int block_fsync(struct file
*filp
, struct dentry
*dentry
, int datasync
)
187 struct inode
* inode
= dentry
->d_inode
;
189 return sync_blockdev(inode
->i_bdev
);
196 static struct super_block
*bd_get_sb(struct file_system_type
*fs_type
,
197 int flags
, char *dev_name
, void *data
)
199 return get_sb_pseudo(fs_type
, "bdev:", NULL
, 0x62646576);
/*
 * File-system type descriptor for the bdev pseudo-filesystem; it is the
 * object passed to register_filesystem() and kern_mount() in
 * bdev_cache_init().
 *
 * NOTE(review): only the .kill_sb initializer is visible; the remaining
 * initializers and the closing brace are missing from this listing.
 */
202 static struct file_system_type bd_type
= {
205 .kill_sb
= kill_anon_super
,
/* Mount of the bdev pseudo-filesystem, assigned in bdev_cache_init(). */
static struct vfsmount *bd_mnt;

/* Superblock of that mount (bd_mnt->mnt_sb), kept for writeback. */
struct super_block *blockdev_superblock;
212 * bdev cache handling - shamelessly stolen from inode.c
213 * We use smaller hashtable, though.
217 #define HASH_SIZE (1UL << HASH_BITS)
218 #define HASH_MASK (HASH_SIZE-1)
219 static struct list_head bdev_hashtable
[HASH_SIZE
];
220 static spinlock_t bdev_lock __cacheline_aligned_in_smp
= SPIN_LOCK_UNLOCKED
;
221 static kmem_cache_t
* bdev_cachep
;
223 #define alloc_bdev() \
224 ((struct block_device *) kmem_cache_alloc(bdev_cachep, SLAB_KERNEL))
225 #define destroy_bdev(bdev) kmem_cache_free(bdev_cachep, (bdev))
227 static void init_once(void * foo
, kmem_cache_t
* cachep
, unsigned long flags
)
229 struct block_device
* bdev
= (struct block_device
*) foo
;
231 if ((flags
& (SLAB_CTOR_VERIFY
|SLAB_CTOR_CONSTRUCTOR
)) ==
232 SLAB_CTOR_CONSTRUCTOR
)
234 memset(bdev
, 0, sizeof(*bdev
));
235 sema_init(&bdev
->bd_sem
, 1);
236 INIT_LIST_HEAD(&bdev
->bd_inodes
);
/*
 * bdev_cache_init - boot-time setup of the block-device cache.
 *
 * Visible steps: hash-table list heads are initialised; the "bdev_cache"
 * slab cache is created with init_once as its constructor; the bd_type
 * pseudo-filesystem is registered and mounted via kern_mount(); the
 * resulting superblock is stored in blockdev_superblock.  Three panic()
 * calls cover failures of these steps.
 *
 * NOTE(review): the loop over the hash table, the declarations of the
 * locals, the remaining kmem_cache_create() arguments and the conditions
 * guarding each panic() are missing from this listing.
 */
240 void __init
bdev_cache_init(void)
243 struct list_head
*head
= bdev_hashtable
;
247 INIT_LIST_HEAD(head
);
252 bdev_cachep
= kmem_cache_create("bdev_cache",
253 sizeof(struct block_device
),
254 0, SLAB_HWCACHE_ALIGN
, init_once
,
257 panic("Cannot create bdev_cache SLAB cache");
258 err
= register_filesystem(&bd_type
);
260 panic("Cannot register bdev pseudo-fs");
261 bd_mnt
= kern_mount(&bd_type
);
262 err
= PTR_ERR(bd_mnt
);
264 panic("Cannot create bdev pseudo-fs");
265 blockdev_superblock
= bd_mnt
->mnt_sb
; /* For writeback */
269 * Most likely _very_ bad one - but then it's hardly critical for small
270 * /dev and can be fixed when somebody will need really large one.
272 static inline unsigned long hash(dev_t dev
)
274 unsigned long tmp
= dev
;
275 tmp
= tmp
+ (tmp
>> HASH_BITS
) + (tmp
>> HASH_BITS
*2);
276 return tmp
& HASH_MASK
;
/*
 * bdfind - search one hash chain for the block_device with number dev.
 *
 * Visible steps: the list at head is walked, each entry is converted with
 * list_entry(..., bd_hash), its bd_dev is compared with dev, and bd_count
 * is incremented with atomic_inc().  Both callers visible in this file
 * invoke it with bdev_lock held.
 *
 * NOTE(review): the declaration of p, the statement taken on a bd_dev
 * mismatch, and the return statements are missing from this listing;
 * presumably the reference is taken only on the matching entry -- confirm.
 */
279 static struct block_device
*bdfind(dev_t dev
, struct list_head
*head
)
282 struct block_device
*bdev
;
283 list_for_each(p
, head
) {
284 bdev
= list_entry(p
, struct block_device
, bd_hash
);
285 if (bdev
->bd_dev
!= dev
)
287 atomic_inc(&bdev
->bd_count
);
/*
 * bdget - look up, or create, the block_device for device number dev.
 *
 * Visible steps:
 *  - the hash chain for dev is searched with bdfind() under bdev_lock;
 *  - a new block_device is allocated with alloc_bdev() and an inode is
 *    obtained from the bdev pseudo-filesystem with new_inode();
 *  - the new object is initialised (bd_count = 1, bd_dev, bd_inode,
 *    bd_block_size from the inode's i_blkbits, ...) and the inode is set up
 *    as an S_IFBLK inode using def_blk_aops, GFP_USER and
 *    default_backing_dev_info;
 *  - the chain is searched again under bdev_lock and the new object is
 *    added with list_add();
 *  - a separate visible path unlocks, calls iput() on the new inode and
 *    frees the new object with destroy_bdev().
 *
 * NOTE(review): all conditionals, braces and return statements are missing
 * from this listing, so which path runs when is not shown; presumably the
 * iput()/destroy_bdev() path handles losing a race with a concurrent
 * insertion -- confirm.
 */
293 struct block_device
*bdget(dev_t dev
)
295 struct list_head
* head
= bdev_hashtable
+ hash(dev
);
296 struct block_device
*bdev
, *new_bdev
;
297 spin_lock(&bdev_lock
);
298 bdev
= bdfind(dev
, head
);
299 spin_unlock(&bdev_lock
);
302 new_bdev
= alloc_bdev();
304 struct inode
*inode
= new_inode(bd_mnt
->mnt_sb
);
306 kdev_t kdev
= to_kdev_t(dev
);
308 atomic_set(&new_bdev
->bd_count
,1);
309 new_bdev
->bd_dev
= dev
;
310 new_bdev
->bd_contains
= NULL
;
311 new_bdev
->bd_inode
= inode
;
312 new_bdev
->bd_block_size
= (1 << inode
->i_blkbits
);
313 new_bdev
->bd_part_count
= 0;
314 new_bdev
->bd_invalidated
= 0;
315 inode
->i_mode
= S_IFBLK
;
316 inode
->i_rdev
= kdev
;
317 inode
->i_bdev
= new_bdev
;
318 inode
->i_data
.a_ops
= &def_blk_aops
;
319 inode
->i_data
.gfp_mask
= GFP_USER
;
320 inode
->i_data
.backing_dev_info
= &default_backing_dev_info
;
321 spin_lock(&bdev_lock
);
322 bdev
= bdfind(dev
, head
);
324 list_add(&new_bdev
->bd_hash
, head
);
325 spin_unlock(&bdev_lock
);
328 spin_unlock(&bdev_lock
);
329 iput(new_bdev
->bd_inode
);
331 destroy_bdev(new_bdev
);
/*
 * nr_blockdev_pages - total page-cache pages held by block devices.
 *
 * With bdev_lock held, every bucket of bdev_hashtable is walked and the
 * nrpages count of each device inode's mapping is added to ret.
 *
 * NOTE(review): the declarations of i and ret and the return statement are
 * missing from this listing.
 */
336 long nr_blockdev_pages(void)
341 spin_lock(&bdev_lock
);
342 for (i
= 0; i
< ARRAY_SIZE(bdev_hashtable
); i
++) {
343 struct list_head
*head
= &bdev_hashtable
[i
];
344 struct list_head
*lh
;
348 list_for_each(lh
, head
) {
349 struct block_device
*bdev
;
351 bdev
= list_entry(lh
, struct block_device
, bd_hash
);
352 ret
+= bdev
->bd_inode
->i_mapping
->nrpages
;
355 spin_unlock(&bdev_lock
);
359 static inline void __bd_forget(struct inode
*inode
)
361 list_del_init(&inode
->i_devices
);
362 inode
->i_bdev
= NULL
;
363 inode
->i_mapping
= &inode
->i_data
;
/*
 * bdput - drop a reference to a block_device.
 *
 * atomic_dec_and_lock() decrements bd_count and takes bdev_lock when the
 * count reaches zero.  In that case the visible lines test bd_openers,
 * unhash the device with list_del(), detach every inode still on bd_inodes
 * via __bd_forget(), release the lock and iput() the device inode.
 *
 * NOTE(review): the declaration of p, the statement executed when
 * bd_openers is non-zero, and any statements after iput() are missing from
 * this listing.
 */
366 void bdput(struct block_device
*bdev
)
368 if (atomic_dec_and_lock(&bdev
->bd_count
, &bdev_lock
)) {
370 if (bdev
->bd_openers
)
372 list_del(&bdev
->bd_hash
);
373 while ( (p
= bdev
->bd_inodes
.next
) != &bdev
->bd_inodes
) {
374 __bd_forget(list_entry(p
, struct inode
, i_devices
));
376 spin_unlock(&bdev_lock
);
377 iput(bdev
->bd_inode
);
/*
 * bd_acquire - bind an inode to its block_device and take a reference.
 *
 * Visible steps:
 *  - under bdev_lock, one path increments inode->i_bdev->bd_count and
 *    unlocks;
 *  - another path unlocks, obtains the device with
 *    bdget(kdev_t_to_nr(inode->i_rdev)), retakes the lock and, if the inode
 *    still has no i_bdev, sets i_bdev and i_mapping and links the inode
 *    onto bdev->bd_inodes; an "else if" covers the case where i_bdev is a
 *    different device.
 *
 * NOTE(review): the condition choosing between the two paths, the handling
 * of a failed bdget(), the body of the "else if" branch and the return
 * statements are missing from this listing.
 */
382 int bd_acquire(struct inode
*inode
)
384 struct block_device
*bdev
;
385 spin_lock(&bdev_lock
);
387 atomic_inc(&inode
->i_bdev
->bd_count
);
388 spin_unlock(&bdev_lock
);
391 spin_unlock(&bdev_lock
);
392 bdev
= bdget(kdev_t_to_nr(inode
->i_rdev
));
395 spin_lock(&bdev_lock
);
396 if (!inode
->i_bdev
) {
397 inode
->i_bdev
= bdev
;
398 inode
->i_mapping
= bdev
->bd_inode
->i_mapping
;
399 list_add(&inode
->i_devices
, &bdev
->bd_inodes
);
400 } else if (inode
->i_bdev
!= bdev
)
402 spin_unlock(&bdev_lock
);
406 /* Call when you free inode */
/*
 * bd_forget - takes bdev_lock, does its work, and releases the lock.
 *
 * NOTE(review): the statements executed while the lock is held are missing
 * from this listing; presumably they call __bd_forget(inode) when the inode
 * is bound to a device -- confirm.
 */
408 void bd_forget(struct inode
*inode
)
410 spin_lock(&bdev_lock
);
413 spin_unlock(&bdev_lock
);
/*
 * bd_claim - claim a block device on behalf of 'holder'.
 *
 * Under bdev_lock, when the device has no bd_holder or is already held by
 * the same holder, bd_holder is set to holder.
 *
 * NOTE(review): the rest of the success branch, the failure branch and the
 * return statement are missing from this listing; bd_release() decrements
 * bd_holders, so presumably that counter is incremented here -- confirm.
 */
416 int bd_claim(struct block_device
*bdev
, void *holder
)
419 spin_lock(&bdev_lock
);
420 if (!bdev
->bd_holder
|| bdev
->bd_holder
== holder
) {
421 bdev
->bd_holder
= holder
;
425 spin_unlock(&bdev_lock
);
429 void bd_release(struct block_device
*bdev
)
431 spin_lock(&bdev_lock
);
432 if (!--bdev
->bd_holders
)
433 bdev
->bd_holder
= NULL
;
434 spin_unlock(&bdev_lock
);
437 static const char *blkdevs
[MAX_BLKDEV
];
/*
 * get_blkdev_list - format the table of registered block drivers into p.
 *
 * A "\nBlock devices:\n" header is written first, then for majors in
 * 0..MAX_BLKDEV-1 a "%3d %s\n" line with the major and its registered
 * name; len accumulates the number of characters written.
 *
 * NOTE(review): sprintf() is unbounded, so p must be large enough for the
 * whole table.  The declarations, the test that selects which majors are
 * printed and the return statement are missing from this listing.
 */
439 int get_blkdev_list(char * p
)
444 len
= sprintf(p
, "\nBlock devices:\n");
445 for (i
= 0; i
< MAX_BLKDEV
; i
++) {
447 len
+= sprintf(p
+len
, "%3d %s\n", i
, blkdevs
[i
]);
/*
 * register_blkdev - record 'name' as the driver for a block major.
 *
 * Visible steps: one path scans majors from MAX_BLKDEV-1 down to 1 for a
 * free (NULL) slot and stores name there; another path checks
 * major >= MAX_BLKDEV and then stores name in blkdevs[major].  bdops is not
 * referenced in the visible lines.
 *
 * NOTE(review): the condition selecting the scanning path (presumably
 * major == 0, i.e. dynamic allocation -- confirm), the handling of an
 * already-used major and all return statements are missing from this
 * listing.
 */
452 int register_blkdev(unsigned int major
, const char * name
, struct block_device_operations
*bdops
)
457 for (major
= MAX_BLKDEV
-1; major
> 0; major
--) {
458 if (blkdevs
[major
] == NULL
) {
459 blkdevs
[major
] = name
;
465 if (major
>= MAX_BLKDEV
)
469 blkdevs
[major
] = name
;
/*
 * unregister_blkdev - release a block major previously registered under
 * 'name'.
 *
 * Visible steps: major is range-checked against MAX_BLKDEV, the registered
 * name is compared with name using strcmp(), and the slot is reset to NULL.
 *
 * NOTE(review): the statements taken when either check fails, any check
 * that the slot is non-NULL before strcmp(), and the return statements are
 * missing from this listing.
 */
473 int unregister_blkdev(unsigned int major
, const char * name
)
477 if (major
>= MAX_BLKDEV
)
481 if (strcmp(blkdevs
[major
], name
))
483 blkdevs
[major
] = NULL
;
488 * This routine checks whether a removable media has been changed,
489 * and invalidates all buffer-cache-entries in that case. This
490 * is a relatively slow routine, so we have to try to minimize using
491 * it. Thus it is called only upon a 'mount' or 'open'. This
492 * is the best way of combining speed and utility, I think.
493 * People changing diskettes in the middle of an operation deserve
/*
 * check_disk_change - react to a media change on a removable device.
 *
 * Visible steps: the driver's media_changed method is looked up through
 * bdev->bd_disk->fops and, if present, called; invalidate_device() is then
 * called and "VFS: busy inodes on changed media." is logged when it returns
 * non-zero; the driver's revalidate_disk method is called if present; and
 * bd_invalidated is set when the disk has more than one minor.
 *
 * NOTE(review): the return statements (including those taken when there is
 * no media_changed method or it reports no change) are missing from this
 * listing.
 */
496 int check_disk_change(struct block_device
*bdev
)
498 struct gendisk
*disk
= bdev
->bd_disk
;
499 struct block_device_operations
* bdops
= disk
->fops
;
500 kdev_t dev
= to_kdev_t(bdev
->bd_dev
);
502 if (!bdops
->media_changed
)
504 if (!bdops
->media_changed(bdev
->bd_disk
))
507 if (invalidate_device(dev
, 0))
508 printk("VFS: busy inodes on changed media.\n");
510 if (bdops
->revalidate_disk
)
511 bdops
->revalidate_disk(bdev
->bd_disk
);
512 if (bdev
->bd_disk
->minors
> 1)
513 bdev
->bd_invalidated
= 1;
/*
 * full_check_disk_change - check_disk_change() plus a partition rescan.
 *
 * Visible steps: bdev->bd_contains is compared with bdev itself; when
 * check_disk_change() returns non-zero and bd_invalidated is set,
 * rescan_partitions() is called for the disk.
 *
 * NOTE(review): the statement taken when bdev is not its own container,
 * any locking around the rescan and the return statements are missing from
 * this listing.
 */
517 int full_check_disk_change(struct block_device
*bdev
)
520 if (bdev
->bd_contains
!= bdev
)
523 if (check_disk_change(bdev
) && bdev
->bd_invalidated
) {
524 rescan_partitions(bdev
->bd_disk
, bdev
);
532 * Will die as soon as two remaining callers get converted.
/*
 * __check_disk_change - dev_t based wrapper around
 * full_check_disk_change().
 *
 * Visible steps: the block_device is obtained with bdget(dev), opened with
 * blkdev_get(bdev, FMODE_READ, 0, BDEV_RAW), checked with
 * full_check_disk_change() and closed again with blkdev_put().
 *
 * NOTE(review): the declaration of res, the handling of a failed bdget()
 * or blkdev_get(), and the return statements are missing from this listing.
 */
534 int __check_disk_change(dev_t dev
)
536 struct block_device
*bdev
= bdget(dev
);
540 if (blkdev_get(bdev
, FMODE_READ
, 0, BDEV_RAW
) < 0)
542 res
= full_check_disk_change(bdev
);
543 blkdev_put(bdev
, BDEV_RAW
);
/*
 * bd_set_size - record the size of a block device and derive its block
 * size.
 *
 * Visible steps: bsize starts at bdev_hardsect_size(bdev); the device
 * inode's i_size is set to size; a loop runs while bsize < PAGE_CACHE_SIZE;
 * finally bd_block_size is set to bsize and the inode's i_blkbits to
 * blksize_bits(bsize).
 *
 * NOTE(review): the loop body is missing from this listing, so how bsize
 * grows and when the loop exits early is not shown.
 */
547 static void bd_set_size(struct block_device
*bdev
, loff_t size
)
549 unsigned bsize
= bdev_hardsect_size(bdev
);
550 bdev
->bd_inode
->i_size
= size
;
551 while (bsize
< PAGE_CACHE_SIZE
) {
556 bdev
->bd_block_size
= bsize
;
557 bdev
->bd_inode
->i_blkbits
= blksize_bits(bsize
);
/*
 * do_open - common open path for block devices, shared by blkdev_get() and
 * blkdev_open().
 *
 * Visible steps, in listing order:
 *  - the gendisk and partition number are looked up with
 *    get_gendisk(bdev->bd_dev, &part), and the driver module is taken from
 *    disk->fops->owner;
 *  - first open (bd_openers == 0): bd_disk and bd_contains are set.
 *      . In one branch the driver's ->open is called if present, the size
 *        is set from get_capacity(disk) << 9, a backing_dev_info is chosen
 *        (blk_get_backing_dev_info() or default_backing_dev_info) and the
 *        partition table is rescanned when bd_invalidated is set.
 *      . In the other branch the whole-disk device is obtained with
 *        bdget(MKDEV(disk->major, disk->first_minor)) and opened with
 *        blkdev_get(..., BDEV_RAW); bd_contains is pointed at it, its
 *        bd_part_count is incremented under whole->bd_sem, its
 *        backing_dev_info is inherited, and bd_offset and the size are
 *        taken from the partition entry disk->part[part - 1].  If the disk
 *        is not GENHD_FL_UP or the partition has no sectors, the part
 *        count is decremented again.
 *  - later opens: the extra module reference is dropped; ->open is called
 *    and partitions are rescanned if invalidated, or the container's
 *    bd_part_count is incremented under its semaphore;
 *  - an unwinding sequence resets bd_disk, the backing_dev_info and
 *    bd_contains, puts the container if it is a different device, and
 *    drops the module reference.
 *
 * NOTE(review): the conditionals, braces, labels, gotos, locking of
 * bdev->bd_sem, declarations of ret/part/p, the bd_openers increment and
 * all return statements are missing from this listing, so the exact
 * control flow -- including which condition separates the two first-open
 * branches, presumably whole disk versus partition -- must be confirmed
 * against the complete source.
 */
560 static int do_open(struct block_device
*bdev
, struct inode
*inode
, struct file
*file
)
562 struct module
*owner
= NULL
;
563 struct gendisk
*disk
;
568 disk
= get_gendisk(bdev
->bd_dev
, &part
);
573 owner
= disk
->fops
->owner
;
576 if (!bdev
->bd_openers
) {
577 bdev
->bd_disk
= disk
;
578 bdev
->bd_contains
= bdev
;
580 struct backing_dev_info
*bdi
;
581 if (disk
->fops
->open
) {
582 ret
= disk
->fops
->open(inode
, file
);
587 if (!bdev
->bd_openers
) {
588 bd_set_size(bdev
,(loff_t
)get_capacity(disk
)<<9);
589 bdi
= blk_get_backing_dev_info(bdev
);
591 bdi
= &default_backing_dev_info
;
592 bdev
->bd_inode
->i_data
.backing_dev_info
= bdi
;
594 if (bdev
->bd_invalidated
)
595 rescan_partitions(disk
, bdev
);
598 struct block_device
*whole
;
599 whole
= bdget(MKDEV(disk
->major
, disk
->first_minor
));
603 ret
= blkdev_get(whole
, file
->f_mode
, file
->f_flags
, BDEV_RAW
);
606 bdev
->bd_contains
= whole
;
607 down(&whole
->bd_sem
);
608 whole
->bd_part_count
++;
609 p
= disk
->part
+ part
- 1;
610 bdev
->bd_inode
->i_data
.backing_dev_info
=
611 whole
->bd_inode
->i_data
.backing_dev_info
;
612 if (!(disk
->flags
& GENHD_FL_UP
) || !p
->nr_sects
) {
613 whole
->bd_part_count
--;
618 bdev
->bd_offset
= p
->start_sect
;
619 bd_set_size(bdev
, (loff_t
) p
->nr_sects
<< 9);
625 __MOD_DEC_USE_COUNT(owner
);
626 if (bdev
->bd_contains
== bdev
) {
627 if (bdev
->bd_disk
->fops
->open
) {
628 ret
= bdev
->bd_disk
->fops
->open(inode
, file
);
632 if (bdev
->bd_invalidated
)
633 rescan_partitions(bdev
->bd_disk
, bdev
);
635 down(&bdev
->bd_contains
->bd_sem
);
636 bdev
->bd_contains
->bd_part_count
++;
637 up(&bdev
->bd_contains
->bd_sem
);
646 bdev
->bd_disk
= NULL
;
647 bdev
->bd_inode
->i_data
.backing_dev_info
= &default_backing_dev_info
;
648 if (bdev
!= bdev
->bd_contains
)
649 blkdev_put(bdev
->bd_contains
, BDEV_RAW
);
650 bdev
->bd_contains
= NULL
;
653 __MOD_DEC_USE_COUNT(owner
);
662 int blkdev_get(struct block_device
*bdev
, mode_t mode
, unsigned flags
, int kind
)
665 * This crockload is due to bad choice of ->open() type.
667 * For now, block device ->open() routine must _not_
668 * examine anything in 'inode' argument except ->i_rdev.
670 struct file fake_file
= {};
671 struct dentry fake_dentry
= {};
672 fake_file
.f_mode
= mode
;
673 fake_file
.f_flags
= flags
;
674 fake_file
.f_dentry
= &fake_dentry
;
675 fake_dentry
.d_inode
= bdev
->bd_inode
;
677 return do_open(bdev
, bdev
->bd_inode
, &fake_file
);
/*
 * blkdev_open - file_operations-style open for block special files.
 *
 * Visible steps: O_LARGEFILE is forced on in filp->f_flags (see the
 * compatibility note below), the block_device is taken from inode->i_bdev,
 * and the open is completed by do_open().
 *
 * NOTE(review): the step that binds inode->i_bdev before it is read
 * (presumably a bd_acquire(inode) call -- confirm) is missing from this
 * listing.
 */
680 int blkdev_open(struct inode
* inode
, struct file
* filp
)
682 struct block_device
*bdev
;
685 * Preserve backwards compatibility and allow large file access
686 * even if userspace doesn't ask for it explicitly. Some mkfs
687 * binary needs it. We might want to drop this workaround
688 * during an unstable branch.
690 filp
->f_flags
|= O_LARGEFILE
;
693 bdev
= inode
->i_bdev
;
695 return do_open(bdev
, inode
, filp
);
/*
 * blkdev_put - close one opener of a block device.
 *
 * Visible steps, in listing order:
 *  - sync_blockdev() is called on the device;
 *  - bd_openers is decremented and tested for zero;
 *  - when the device is its own container, the driver's ->release is
 *    called if present, with a NULL file; otherwise the container's
 *    bd_part_count is decremented under its semaphore;
 *  - when no openers remain, the driver module reference is dropped,
 *    bd_disk and the backing_dev_info are reset, the container is put with
 *    blkdev_put(..., BDEV_RAW) if it is a different device, and bd_contains
 *    is cleared.
 *
 * 'kind' is not used in the visible lines.
 *
 * NOTE(review): the declaration of ret, the condition guarding
 * sync_blockdev(), the statement run when bd_openers first reaches zero,
 * any locking of bdev->bd_sem, the final reference drop and the return
 * statement are missing from this listing.
 */
698 int blkdev_put(struct block_device
*bdev
, int kind
)
701 struct inode
*bd_inode
= bdev
->bd_inode
;
702 struct gendisk
*disk
= bdev
->bd_disk
;
709 sync_blockdev(bd_inode
->i_bdev
);
712 if (!--bdev
->bd_openers
)
714 if (bdev
->bd_contains
== bdev
) {
715 if (disk
->fops
->release
)
716 ret
= disk
->fops
->release(bd_inode
, NULL
);
718 down(&bdev
->bd_contains
->bd_sem
);
719 bdev
->bd_contains
->bd_part_count
--;
720 up(&bdev
->bd_contains
->bd_sem
);
722 if (!bdev
->bd_openers
) {
723 struct module
*owner
= disk
->fops
->owner
;
726 __MOD_DEC_USE_COUNT(owner
);
727 bdev
->bd_disk
= NULL
;
728 bdev
->bd_inode
->i_data
.backing_dev_info
= &default_backing_dev_info
;
729 if (bdev
!= bdev
->bd_contains
) {
730 blkdev_put(bdev
->bd_contains
, BDEV_RAW
);
732 bdev
->bd_contains
= NULL
;
740 int blkdev_close(struct inode
* inode
, struct file
* filp
)
742 return blkdev_put(inode
->i_bdev
, BDEV_FILE
);
745 static ssize_t
blkdev_file_write(struct file
*file
, const char *buf
,
746 size_t count
, loff_t
*ppos
)
748 struct iovec local_iov
= { .iov_base
= (void *)buf
, .iov_len
= count
};
750 return generic_file_write_nolock(file
, &local_iov
, 1, ppos
);
753 struct address_space_operations def_blk_aops
= {
754 .readpage
= blkdev_readpage
,
755 .writepage
= blkdev_writepage
,
756 .sync_page
= block_sync_page
,
757 .prepare_write
= blkdev_prepare_write
,
758 .commit_write
= blkdev_commit_write
,
759 .writepages
= generic_writepages
,
760 .vm_writeback
= generic_vm_writeback
,
761 .direct_IO
= blkdev_direct_IO
,
/*
 * File operations for block special files.  The visible entries route
 * release, llseek, write, fsync and ioctl to the block-device specific
 * handlers (blkdev_close, block_llseek, blkdev_file_write, block_fsync,
 * blkdev_ioctl); read, mmap, readv, writev and sendfile use the generic
 * file implementations.
 *
 * NOTE(review): at least one initializer line (presumably .open -- confirm)
 * and the closing brace are missing from this listing.
 */
764 struct file_operations def_blk_fops
= {
766 .release
= blkdev_close
,
767 .llseek
= block_llseek
,
768 .read
= generic_file_read
,
769 .write
= blkdev_file_write
,
770 .mmap
= generic_file_mmap
,
771 .fsync
= block_fsync
,
772 .ioctl
= blkdev_ioctl
,
773 .readv
= generic_file_readv
,
774 .writev
= generic_file_writev
,
775 .sendfile
= generic_file_sendfile
,
/*
 * ioctl_by_bdev - issue a block-device ioctl from inside the kernel.
 *
 * Visible steps: the current address limit is saved with get_fs() and
 * blkdev_ioctl() is called on the device inode with a NULL file.
 *
 * NOTE(review): the declaration of res, the set_fs() calls that presumably
 * widen and later restore the address limit around the ioctl (confirm),
 * and the return statement are missing from this listing.
 */
778 int ioctl_by_bdev(struct block_device
*bdev
, unsigned cmd
, unsigned long arg
)
781 mm_segment_t old_fs
= get_fs();
783 res
= blkdev_ioctl(bdev
->bd_inode
, NULL
, cmd
, arg
);
/*
 * __bdevname - format a device number as "name(major,minor)".
 *
 * The name is looked up in blkdevs[] by MAJOR(dev), with "unknown-block" as
 * the fallback, and the result is formatted into a function-static
 * 32-byte buffer.
 *
 * NOTE(review): because the buffer is static, the result is overwritten by
 * the next call and the function is not reentrant.  sprintf() is unbounded,
 * so a long driver name could overflow the 32 bytes.  The condition
 * selecting the fallback name and the return statement are missing from
 * this listing.
 */
788 const char *__bdevname(dev_t dev
)
790 static char buffer
[32];
791 const char * name
= blkdevs
[MAJOR(dev
)];
794 name
= "unknown-block";
796 sprintf(buffer
, "%s(%d,%d)", name
, MAJOR(dev
), MINOR(dev
));