Merge with Linux 2.5.48.
[linux-2.6/linux-mips.git] / fs / block_dev.c
blob9fd5fc4c3a36239b88e8d472e9094791c529385e
1 /*
2 * linux/fs/block_dev.c
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
6 */
8 #include <linux/config.h>
9 #include <linux/init.h>
10 #include <linux/mm.h>
11 #include <linux/fcntl.h>
12 #include <linux/slab.h>
13 #include <linux/kmod.h>
14 #include <linux/major.h>
15 #include <linux/devfs_fs_kernel.h>
16 #include <linux/smp_lock.h>
17 #include <linux/highmem.h>
18 #include <linux/blkdev.h>
19 #include <linux/module.h>
20 #include <linux/blkpg.h>
21 #include <linux/buffer_head.h>
22 #include <linux/mpage.h>
23 #include <linux/mount.h>
24 #include <linux/uio.h>
25 #include <asm/uaccess.h>
28 static sector_t max_block(struct block_device *bdev)
30 sector_t retval = ~((sector_t)0);
31 loff_t sz = bdev->bd_inode->i_size;
33 if (sz) {
34 unsigned int size = block_size(bdev);
35 unsigned int sizebits = blksize_bits(size);
36 retval = (sz >> sizebits);
38 return retval;
41 /* Kill _all_ buffers, dirty or not.. */
42 static void kill_bdev(struct block_device *bdev)
44 invalidate_bdev(bdev, 1);
45 truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
48 int set_blocksize(struct block_device *bdev, int size)
50 int oldsize;
52 /* Size must be a power of two, and between 512 and PAGE_SIZE */
53 if (size > PAGE_SIZE || size < 512 || (size & (size-1)))
54 return -EINVAL;
56 /* Size cannot be smaller than the size supported by the device */
57 if (size < bdev_hardsect_size(bdev))
58 return -EINVAL;
60 oldsize = bdev->bd_block_size;
61 if (oldsize == size)
62 return 0;
64 /* Ok, we're actually changing the blocksize.. */
65 sync_blockdev(bdev);
66 bdev->bd_block_size = size;
67 bdev->bd_inode->i_blkbits = blksize_bits(size);
68 kill_bdev(bdev);
69 return 0;
72 int sb_set_blocksize(struct super_block *sb, int size)
74 int bits;
75 if (set_blocksize(sb->s_bdev, size) < 0)
76 return 0;
77 sb->s_blocksize = size;
78 for (bits = 9, size >>= 9; size >>= 1; bits++)
80 sb->s_blocksize_bits = bits;
81 return sb->s_blocksize;
84 int sb_min_blocksize(struct super_block *sb, int size)
86 int minsize = bdev_hardsect_size(sb->s_bdev);
87 if (size < minsize)
88 size = minsize;
89 return sb_set_blocksize(sb, size);
92 static int
93 blkdev_get_block(struct inode *inode, sector_t iblock,
94 struct buffer_head *bh, int create)
96 if (iblock >= max_block(inode->i_bdev))
97 return -EIO;
99 bh->b_bdev = inode->i_bdev;
100 bh->b_blocknr = iblock;
101 set_buffer_mapped(bh);
102 return 0;
105 static int
106 blkdev_get_blocks(struct inode *inode, sector_t iblock,
107 unsigned long max_blocks, struct buffer_head *bh, int create)
109 if ((iblock + max_blocks) > max_block(inode->i_bdev))
110 return -EIO;
112 bh->b_bdev = inode->i_bdev;
113 bh->b_blocknr = iblock;
114 bh->b_size = max_blocks << inode->i_blkbits;
115 set_buffer_mapped(bh);
116 return 0;
119 static int
120 blkdev_direct_IO(int rw, struct file *file, const struct iovec *iov,
121 loff_t offset, unsigned long nr_segs)
123 struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
125 return generic_direct_IO(rw, inode, inode->i_bdev, iov, offset,
126 nr_segs, blkdev_get_blocks);
129 static int blkdev_writepage(struct page * page)
131 return block_write_full_page(page, blkdev_get_block);
134 static int blkdev_readpage(struct file * file, struct page * page)
136 return block_read_full_page(page, blkdev_get_block);
139 static int blkdev_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to)
141 return block_prepare_write(page, from, to, blkdev_get_block);
/* commit_write operation: forwards to block_commit_write(); 'file' is unused. */
static int blkdev_commit_write(struct file *file, struct page *page, unsigned from, unsigned to)
{
	int err = block_commit_write(page, from, to);

	return err;
}
150 * private llseek:
151 * for a block special file file->f_dentry->d_inode->i_size is zero
152 * so we compute the size by hand (just as in block_read/write above)
154 static loff_t block_llseek(struct file *file, loff_t offset, int origin)
156 /* ewww */
157 loff_t size = file->f_dentry->d_inode->i_bdev->bd_inode->i_size;
158 loff_t retval;
160 lock_kernel();
162 switch (origin) {
163 case 2:
164 offset += size;
165 break;
166 case 1:
167 offset += file->f_pos;
169 retval = -EINVAL;
170 if (offset >= 0 && offset <= size) {
171 if (offset != file->f_pos) {
172 file->f_pos = offset;
174 retval = offset;
176 unlock_kernel();
177 return retval;
181 * Filp may be NULL when we are called by an msync of a vma
182 * since the vma has no handle.
185 static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
187 struct inode * inode = dentry->d_inode;
189 return sync_blockdev(inode->i_bdev);
193 * pseudo-fs
196 static struct super_block *bd_get_sb(struct file_system_type *fs_type,
197 int flags, char *dev_name, void *data)
199 return get_sb_pseudo(fs_type, "bdev:", NULL, 0x62646576);
202 static struct file_system_type bd_type = {
203 .name = "bdev",
204 .get_sb = bd_get_sb,
205 .kill_sb = kill_anon_super,
/* Mount of the bdev pseudo-fs, set up by bdev_cache_init(). */
static struct vfsmount *bd_mnt;

/* Superblock of that mount; exported for the writeback code. */
struct super_block *blockdev_superblock;
212 * bdev cache handling - shamelessly stolen from inode.c
213 * We use smaller hashtable, though.
216 #define HASH_BITS 6
217 #define HASH_SIZE (1UL << HASH_BITS)
218 #define HASH_MASK (HASH_SIZE-1)
219 static struct list_head bdev_hashtable[HASH_SIZE];
220 static spinlock_t bdev_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
221 static kmem_cache_t * bdev_cachep;
223 #define alloc_bdev() \
224 ((struct block_device *) kmem_cache_alloc(bdev_cachep, SLAB_KERNEL))
225 #define destroy_bdev(bdev) kmem_cache_free(bdev_cachep, (bdev))
227 static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
229 struct block_device * bdev = (struct block_device *) foo;
231 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
232 SLAB_CTOR_CONSTRUCTOR)
234 memset(bdev, 0, sizeof(*bdev));
235 sema_init(&bdev->bd_sem, 1);
236 INIT_LIST_HEAD(&bdev->bd_inodes);
240 void __init bdev_cache_init(void)
242 int i, err;
243 struct list_head *head = bdev_hashtable;
245 i = HASH_SIZE;
246 do {
247 INIT_LIST_HEAD(head);
248 head++;
249 i--;
250 } while (i);
252 bdev_cachep = kmem_cache_create("bdev_cache",
253 sizeof(struct block_device),
254 0, SLAB_HWCACHE_ALIGN, init_once,
255 NULL);
256 if (!bdev_cachep)
257 panic("Cannot create bdev_cache SLAB cache");
258 err = register_filesystem(&bd_type);
259 if (err)
260 panic("Cannot register bdev pseudo-fs");
261 bd_mnt = kern_mount(&bd_type);
262 err = PTR_ERR(bd_mnt);
263 if (IS_ERR(bd_mnt))
264 panic("Cannot create bdev pseudo-fs");
265 blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
269 * Most likely _very_ bad one - but then it's hardly critical for small
270 * /dev and can be fixed when somebody will need really large one.
272 static inline unsigned long hash(dev_t dev)
274 unsigned long tmp = dev;
275 tmp = tmp + (tmp >> HASH_BITS) + (tmp >> HASH_BITS*2);
276 return tmp & HASH_MASK;
279 static struct block_device *bdfind(dev_t dev, struct list_head *head)
281 struct list_head *p;
282 struct block_device *bdev;
283 list_for_each(p, head) {
284 bdev = list_entry(p, struct block_device, bd_hash);
285 if (bdev->bd_dev != dev)
286 continue;
287 atomic_inc(&bdev->bd_count);
288 return bdev;
290 return NULL;
293 struct block_device *bdget(dev_t dev)
295 struct list_head * head = bdev_hashtable + hash(dev);
296 struct block_device *bdev, *new_bdev;
297 spin_lock(&bdev_lock);
298 bdev = bdfind(dev, head);
299 spin_unlock(&bdev_lock);
300 if (bdev)
301 return bdev;
302 new_bdev = alloc_bdev();
303 if (new_bdev) {
304 struct inode *inode = new_inode(bd_mnt->mnt_sb);
305 if (inode) {
306 kdev_t kdev = to_kdev_t(dev);
308 atomic_set(&new_bdev->bd_count,1);
309 new_bdev->bd_dev = dev;
310 new_bdev->bd_contains = NULL;
311 new_bdev->bd_inode = inode;
312 new_bdev->bd_block_size = (1 << inode->i_blkbits);
313 new_bdev->bd_part_count = 0;
314 new_bdev->bd_invalidated = 0;
315 inode->i_mode = S_IFBLK;
316 inode->i_rdev = kdev;
317 inode->i_bdev = new_bdev;
318 inode->i_data.a_ops = &def_blk_aops;
319 inode->i_data.gfp_mask = GFP_USER;
320 inode->i_data.backing_dev_info = &default_backing_dev_info;
321 spin_lock(&bdev_lock);
322 bdev = bdfind(dev, head);
323 if (!bdev) {
324 list_add(&new_bdev->bd_hash, head);
325 spin_unlock(&bdev_lock);
326 return new_bdev;
328 spin_unlock(&bdev_lock);
329 iput(new_bdev->bd_inode);
331 destroy_bdev(new_bdev);
333 return bdev;
336 long nr_blockdev_pages(void)
338 long ret = 0;
339 int i;
341 spin_lock(&bdev_lock);
342 for (i = 0; i < ARRAY_SIZE(bdev_hashtable); i++) {
343 struct list_head *head = &bdev_hashtable[i];
344 struct list_head *lh;
346 if (head == NULL)
347 continue;
348 list_for_each(lh, head) {
349 struct block_device *bdev;
351 bdev = list_entry(lh, struct block_device, bd_hash);
352 ret += bdev->bd_inode->i_mapping->nrpages;
355 spin_unlock(&bdev_lock);
356 return ret;
359 static inline void __bd_forget(struct inode *inode)
361 list_del_init(&inode->i_devices);
362 inode->i_bdev = NULL;
363 inode->i_mapping = &inode->i_data;
366 void bdput(struct block_device *bdev)
368 if (atomic_dec_and_lock(&bdev->bd_count, &bdev_lock)) {
369 struct list_head *p;
370 if (bdev->bd_openers)
371 BUG();
372 list_del(&bdev->bd_hash);
373 while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
374 __bd_forget(list_entry(p, struct inode, i_devices));
376 spin_unlock(&bdev_lock);
377 iput(bdev->bd_inode);
378 destroy_bdev(bdev);
382 int bd_acquire(struct inode *inode)
384 struct block_device *bdev;
385 spin_lock(&bdev_lock);
386 if (inode->i_bdev) {
387 atomic_inc(&inode->i_bdev->bd_count);
388 spin_unlock(&bdev_lock);
389 return 0;
391 spin_unlock(&bdev_lock);
392 bdev = bdget(kdev_t_to_nr(inode->i_rdev));
393 if (!bdev)
394 return -ENOMEM;
395 spin_lock(&bdev_lock);
396 if (!inode->i_bdev) {
397 inode->i_bdev = bdev;
398 inode->i_mapping = bdev->bd_inode->i_mapping;
399 list_add(&inode->i_devices, &bdev->bd_inodes);
400 } else if (inode->i_bdev != bdev)
401 BUG();
402 spin_unlock(&bdev_lock);
403 return 0;
406 /* Call when you free inode */
408 void bd_forget(struct inode *inode)
410 spin_lock(&bdev_lock);
411 if (inode->i_bdev)
412 __bd_forget(inode);
413 spin_unlock(&bdev_lock);
416 int bd_claim(struct block_device *bdev, void *holder)
418 int res = -EBUSY;
419 spin_lock(&bdev_lock);
420 if (!bdev->bd_holder || bdev->bd_holder == holder) {
421 bdev->bd_holder = holder;
422 bdev->bd_holders++;
423 res = 0;
425 spin_unlock(&bdev_lock);
426 return res;
429 void bd_release(struct block_device *bdev)
431 spin_lock(&bdev_lock);
432 if (!--bdev->bd_holders)
433 bdev->bd_holder = NULL;
434 spin_unlock(&bdev_lock);
437 static const char *blkdevs[MAX_BLKDEV];
439 int get_blkdev_list(char * p)
441 int i;
442 int len;
444 len = sprintf(p, "\nBlock devices:\n");
445 for (i = 0; i < MAX_BLKDEV ; i++) {
446 if (blkdevs[i])
447 len += sprintf(p+len, "%3d %s\n", i, blkdevs[i]);
449 return len;
452 int register_blkdev(unsigned int major, const char * name, struct block_device_operations *bdops)
454 if (devfs_only())
455 return 0;
456 if (major == 0) {
457 for (major = MAX_BLKDEV-1; major > 0; major--) {
458 if (blkdevs[major] == NULL) {
459 blkdevs[major] = name;
460 return major;
463 return -EBUSY;
465 if (major >= MAX_BLKDEV)
466 return -EINVAL;
467 if (blkdevs[major])
468 return -EBUSY;
469 blkdevs[major] = name;
470 return 0;
473 int unregister_blkdev(unsigned int major, const char * name)
475 if (devfs_only())
476 return 0;
477 if (major >= MAX_BLKDEV)
478 return -EINVAL;
479 if (!blkdevs[major])
480 return -EINVAL;
481 if (strcmp(blkdevs[major], name))
482 return -EINVAL;
483 blkdevs[major] = NULL;
484 return 0;
488 * This routine checks whether a removable media has been changed,
489 * and invalidates all buffer-cache-entries in that case. This
490 * is a relatively slow routine, so we have to try to minimize using
491 * it. Thus it is called only upon a 'mount' or 'open'. This
492 * is the best way of combining speed and utility, I think.
493 * People changing diskettes in the middle of an operation deserve
494 * to lose :-)
496 int check_disk_change(struct block_device *bdev)
498 struct gendisk *disk = bdev->bd_disk;
499 struct block_device_operations * bdops = disk->fops;
500 kdev_t dev = to_kdev_t(bdev->bd_dev);
502 if (!bdops->media_changed)
503 return 0;
504 if (!bdops->media_changed(bdev->bd_disk))
505 return 0;
507 if (invalidate_device(dev, 0))
508 printk("VFS: busy inodes on changed media.\n");
510 if (bdops->revalidate_disk)
511 bdops->revalidate_disk(bdev->bd_disk);
512 if (bdev->bd_disk->minors > 1)
513 bdev->bd_invalidated = 1;
514 return 1;
517 int full_check_disk_change(struct block_device *bdev)
519 int res = 0;
520 if (bdev->bd_contains != bdev)
521 BUG();
522 down(&bdev->bd_sem);
523 if (check_disk_change(bdev) && bdev->bd_invalidated) {
524 rescan_partitions(bdev->bd_disk, bdev);
525 res = 1;
527 up(&bdev->bd_sem);
528 return res;
532 * Will die as soon as two remaining callers get converted.
534 int __check_disk_change(dev_t dev)
536 struct block_device *bdev = bdget(dev);
537 int res;
538 if (!bdev)
539 return 0;
540 if (blkdev_get(bdev, FMODE_READ, 0, BDEV_RAW) < 0)
541 return 0;
542 res = full_check_disk_change(bdev);
543 blkdev_put(bdev, BDEV_RAW);
544 return res;
547 static void bd_set_size(struct block_device *bdev, loff_t size)
549 unsigned bsize = bdev_hardsect_size(bdev);
550 bdev->bd_inode->i_size = size;
551 while (bsize < PAGE_CACHE_SIZE) {
552 if (size & bsize)
553 break;
554 bsize <<= 1;
556 bdev->bd_block_size = bsize;
557 bdev->bd_inode->i_blkbits = blksize_bits(bsize);
560 static int do_open(struct block_device *bdev, struct inode *inode, struct file *file)
562 struct module *owner = NULL;
563 struct gendisk *disk;
564 int ret = -ENXIO;
565 int part;
567 lock_kernel();
568 disk = get_gendisk(bdev->bd_dev, &part);
569 if (!disk) {
570 bdput(bdev);
571 return ret;
573 owner = disk->fops->owner;
575 down(&bdev->bd_sem);
576 if (!bdev->bd_openers) {
577 bdev->bd_disk = disk;
578 bdev->bd_contains = bdev;
579 if (!part) {
580 struct backing_dev_info *bdi;
581 if (disk->fops->open) {
582 ret = disk->fops->open(inode, file);
583 if (ret)
584 goto out_first;
586 bdev->bd_offset = 0;
587 if (!bdev->bd_openers) {
588 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
589 bdi = blk_get_backing_dev_info(bdev);
590 if (bdi == NULL)
591 bdi = &default_backing_dev_info;
592 bdev->bd_inode->i_data.backing_dev_info = bdi;
594 if (bdev->bd_invalidated)
595 rescan_partitions(disk, bdev);
596 } else {
597 struct hd_struct *p;
598 struct block_device *whole;
599 whole = bdget(MKDEV(disk->major, disk->first_minor));
600 ret = -ENOMEM;
601 if (!whole)
602 goto out_first;
603 ret = blkdev_get(whole, file->f_mode, file->f_flags, BDEV_RAW);
604 if (ret)
605 goto out_first;
606 bdev->bd_contains = whole;
607 down(&whole->bd_sem);
608 whole->bd_part_count++;
609 p = disk->part + part - 1;
610 bdev->bd_inode->i_data.backing_dev_info =
611 whole->bd_inode->i_data.backing_dev_info;
612 if (!(disk->flags & GENHD_FL_UP) || !p->nr_sects) {
613 whole->bd_part_count--;
614 up(&whole->bd_sem);
615 ret = -ENXIO;
616 goto out_first;
618 bdev->bd_offset = p->start_sect;
619 bd_set_size(bdev, (loff_t) p->nr_sects << 9);
620 up(&whole->bd_sem);
622 } else {
623 put_disk(disk);
624 if (owner)
625 __MOD_DEC_USE_COUNT(owner);
626 if (bdev->bd_contains == bdev) {
627 if (bdev->bd_disk->fops->open) {
628 ret = bdev->bd_disk->fops->open(inode, file);
629 if (ret)
630 goto out;
632 if (bdev->bd_invalidated)
633 rescan_partitions(bdev->bd_disk, bdev);
634 } else {
635 down(&bdev->bd_contains->bd_sem);
636 bdev->bd_contains->bd_part_count++;
637 up(&bdev->bd_contains->bd_sem);
640 bdev->bd_openers++;
641 up(&bdev->bd_sem);
642 unlock_kernel();
643 return 0;
645 out_first:
646 bdev->bd_disk = NULL;
647 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
648 if (bdev != bdev->bd_contains)
649 blkdev_put(bdev->bd_contains, BDEV_RAW);
650 bdev->bd_contains = NULL;
651 put_disk(disk);
652 if (owner)
653 __MOD_DEC_USE_COUNT(owner);
654 out:
655 up(&bdev->bd_sem);
656 unlock_kernel();
657 if (ret)
658 bdput(bdev);
659 return ret;
662 int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags, int kind)
665 * This crockload is due to bad choice of ->open() type.
666 * It will go away.
667 * For now, block device ->open() routine must _not_
668 * examine anything in 'inode' argument except ->i_rdev.
670 struct file fake_file = {};
671 struct dentry fake_dentry = {};
672 fake_file.f_mode = mode;
673 fake_file.f_flags = flags;
674 fake_file.f_dentry = &fake_dentry;
675 fake_dentry.d_inode = bdev->bd_inode;
677 return do_open(bdev, bdev->bd_inode, &fake_file);
680 int blkdev_open(struct inode * inode, struct file * filp)
682 struct block_device *bdev;
685 * Preserve backwards compatibility and allow large file access
686 * even if userspace doesn't ask for it explicitly. Some mkfs
687 * binary needs it. We might want to drop this workaround
688 * during an unstable branch.
690 filp->f_flags |= O_LARGEFILE;
692 bd_acquire(inode);
693 bdev = inode->i_bdev;
695 return do_open(bdev, inode, filp);
698 int blkdev_put(struct block_device *bdev, int kind)
700 int ret = 0;
701 struct inode *bd_inode = bdev->bd_inode;
702 struct gendisk *disk = bdev->bd_disk;
704 down(&bdev->bd_sem);
705 lock_kernel();
706 switch (kind) {
707 case BDEV_FILE:
708 case BDEV_FS:
709 sync_blockdev(bd_inode->i_bdev);
710 break;
712 if (!--bdev->bd_openers)
713 kill_bdev(bdev);
714 if (bdev->bd_contains == bdev) {
715 if (disk->fops->release)
716 ret = disk->fops->release(bd_inode, NULL);
717 } else {
718 down(&bdev->bd_contains->bd_sem);
719 bdev->bd_contains->bd_part_count--;
720 up(&bdev->bd_contains->bd_sem);
722 if (!bdev->bd_openers) {
723 struct module *owner = disk->fops->owner;
724 put_disk(disk);
725 if (owner)
726 __MOD_DEC_USE_COUNT(owner);
727 bdev->bd_disk = NULL;
728 bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;
729 if (bdev != bdev->bd_contains) {
730 blkdev_put(bdev->bd_contains, BDEV_RAW);
732 bdev->bd_contains = NULL;
734 unlock_kernel();
735 up(&bdev->bd_sem);
736 bdput(bdev);
737 return ret;
740 int blkdev_close(struct inode * inode, struct file * filp)
742 return blkdev_put(inode->i_bdev, BDEV_FILE);
745 static ssize_t blkdev_file_write(struct file *file, const char *buf,
746 size_t count, loff_t *ppos)
748 struct iovec local_iov = { .iov_base = (void *)buf, .iov_len = count };
750 return generic_file_write_nolock(file, &local_iov, 1, ppos);
753 struct address_space_operations def_blk_aops = {
754 .readpage = blkdev_readpage,
755 .writepage = blkdev_writepage,
756 .sync_page = block_sync_page,
757 .prepare_write = blkdev_prepare_write,
758 .commit_write = blkdev_commit_write,
759 .writepages = generic_writepages,
760 .vm_writeback = generic_vm_writeback,
761 .direct_IO = blkdev_direct_IO,
764 struct file_operations def_blk_fops = {
765 .open = blkdev_open,
766 .release = blkdev_close,
767 .llseek = block_llseek,
768 .read = generic_file_read,
769 .write = blkdev_file_write,
770 .mmap = generic_file_mmap,
771 .fsync = block_fsync,
772 .ioctl = blkdev_ioctl,
773 .readv = generic_file_readv,
774 .writev = generic_file_writev,
775 .sendfile = generic_file_sendfile,
778 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
780 int res;
781 mm_segment_t old_fs = get_fs();
782 set_fs(KERNEL_DS);
783 res = blkdev_ioctl(bdev->bd_inode, NULL, cmd, arg);
784 set_fs(old_fs);
785 return res;
788 const char *__bdevname(dev_t dev)
790 static char buffer[32];
791 const char * name = blkdevs[MAJOR(dev)];
793 if (!name)
794 name = "unknown-block";
796 sprintf(buffer, "%s(%d,%d)", name, MAJOR(dev), MINOR(dev));
797 return buffer;