/*
 *  linux/fs/block_dev.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/config.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/locks.h>
#include <linux/fcntl.h>
#include <linux/malloc.h>
#include <linux/kmod.h>
#include <linux/devfs_fs_kernel.h>
#include <linux/smp_lock.h>

#include <asm/uaccess.h>

extern int *blk_size[];
extern int *blksize_size[];

#define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
#define NBUF 64

ssize_t block_write(struct file * filp, const char * buf,
		    size_t count, loff_t *ppos)
{
	struct inode * inode = filp->f_dentry->d_inode;
	ssize_t blocksize, blocksize_bits, i, buffercount, write_error;
	ssize_t block, blocks;
	loff_t offset;
	ssize_t chars;
	ssize_t written = 0;
	struct buffer_head * bhlist[NBUF];
	size_t size;
	kdev_t dev;
	struct buffer_head * bh, *bufferlist[NBUF];
	register char * p;

	write_error = buffercount = 0;
	dev = inode->i_rdev;
	if ( is_read_only( inode->i_rdev ))
		return -EPERM;
	blocksize = BLOCK_SIZE;
	if (blksize_size[MAJOR(dev)] && blksize_size[MAJOR(dev)][MINOR(dev)])
		blocksize = blksize_size[MAJOR(dev)][MINOR(dev)];

	/* blocksize_bits = log2(blocksize) */
	i = blocksize;
	blocksize_bits = 0;
	while (i != 1) {
		blocksize_bits++;
		i >>= 1;
	}

	block = *ppos >> blocksize_bits;
	offset = *ppos & (blocksize-1);

	if (blk_size[MAJOR(dev)])
		size = ((loff_t) blk_size[MAJOR(dev)][MINOR(dev)] << BLOCK_SIZE_BITS) >> blocksize_bits;
	else
		size = INT_MAX;
	while (count > 0) {
		if (block >= size)
			return written ? written : -ENOSPC;
		chars = blocksize - offset;
		if (chars > count)
			chars = count;

#if 0
		/* get the buffer head */
		{
			struct buffer_head * (*fn)(kdev_t, int, int) = getblk;
			if (chars != blocksize)
				fn = bread;
			bh = fn(dev, block, blocksize);
			if (!bh)
				return written ? written : -EIO;
			if (!buffer_uptodate(bh))
				wait_on_buffer(bh);
		}
#else
		bh = getblk(dev, block, blocksize);
		if (!bh)
			return written ? written : -EIO;

		if (!buffer_uptodate(bh)) {
			if (chars == blocksize)
				wait_on_buffer(bh);
			else {
				bhlist[0] = bh;
				if (!filp->f_reada || !read_ahead[MAJOR(dev)]) {
					/* We do this to force the read of a single buffer */
					blocks = 1;
				} else {
					/* Read-ahead before write */
					blocks = read_ahead[MAJOR(dev)] / (blocksize >> 9) / 2;
					if (block + blocks > size)
						blocks = size - block;
					if (blocks > NBUF)
						blocks = NBUF;
					if (!blocks)
						blocks = 1;
					for (i = 1; i < blocks; i++) {
						bhlist[i] = getblk(dev, block+i, blocksize);
						if (!bhlist[i]) {
							while (i >= 0)
								brelse(bhlist[i--]);
							return written ? written : -EIO;
						}
					}
				}
				ll_rw_block(READ, blocks, bhlist);
				for (i = 1; i < blocks; i++)
					brelse(bhlist[i]);
				wait_on_buffer(bh);
				if (!buffer_uptodate(bh)) {
					brelse(bh);
					return written ? written : -EIO;
				}
			}
		}
#endif
		block++;
		p = offset + bh->b_data;
		offset = 0;
		*ppos += chars;
		written += chars;
		count -= chars;
		copy_from_user(p, buf, chars);
		p += chars;
		buf += chars;
		mark_buffer_uptodate(bh, 1);
		mark_buffer_dirty(bh);
		if (filp->f_flags & O_SYNC)
			bufferlist[buffercount++] = bh;
		else
			brelse(bh);
		if (buffercount == NBUF) {
			ll_rw_block(WRITE, buffercount, bufferlist);
			for (i = 0; i < buffercount; i++) {
				wait_on_buffer(bufferlist[i]);
				if (!buffer_uptodate(bufferlist[i]))
					write_error = 1;
				brelse(bufferlist[i]);
			}
			buffercount = 0;
		}
		balance_dirty(dev);
		if (write_error)
			break;
	}
	if (buffercount) {
		ll_rw_block(WRITE, buffercount, bufferlist);
		for (i = 0; i < buffercount; i++) {
			wait_on_buffer(bufferlist[i]);
			if (!buffer_uptodate(bufferlist[i]))
				write_error = 1;
			brelse(bufferlist[i]);
		}
	}
	filp->f_reada = 1;
	if (write_error)
		return -EIO;
	return written;
}

ssize_t block_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
{
	struct inode * inode = filp->f_dentry->d_inode;
	size_t block;
	loff_t offset;
	ssize_t blocksize;
	ssize_t blocksize_bits, i;
	size_t blocks, rblocks, left;
	int bhrequest, uptodate;
	struct buffer_head ** bhb, ** bhe;
	struct buffer_head * buflist[NBUF];
	struct buffer_head * bhreq[NBUF];
	unsigned int chars;
	loff_t size;
	kdev_t dev;
	ssize_t read;

	dev = inode->i_rdev;
	blocksize = BLOCK_SIZE;
	if (blksize_size[MAJOR(dev)] && blksize_size[MAJOR(dev)][MINOR(dev)])
		blocksize = blksize_size[MAJOR(dev)][MINOR(dev)];
	/* blocksize_bits = log2(blocksize) */
	i = blocksize;
	blocksize_bits = 0;
	while (i != 1) {
		blocksize_bits++;
		i >>= 1;
	}

	offset = *ppos;
	if (blk_size[MAJOR(dev)])
		size = (loff_t) blk_size[MAJOR(dev)][MINOR(dev)] << BLOCK_SIZE_BITS;
	else
		size = (loff_t) INT_MAX << BLOCK_SIZE_BITS;

	if (offset > size)
		left = 0;
	/* size - offset might not fit into left, so check explicitly. */
	else if (size - offset > INT_MAX)
		left = INT_MAX;
	else
		left = size - offset;
	if (left > count)
		left = count;
	if (left <= 0)
		return 0;
	read = 0;
	block = offset >> blocksize_bits;
	offset &= blocksize-1;
	size >>= blocksize_bits;
	rblocks = blocks = (left + offset + blocksize - 1) >> blocksize_bits;
	bhb = bhe = buflist;
	if (filp->f_reada) {
		if (blocks < read_ahead[MAJOR(dev)] / (blocksize >> 9))
			blocks = read_ahead[MAJOR(dev)] / (blocksize >> 9);
		if (rblocks > blocks)
			blocks = rblocks;
	}
	if (block + blocks > size) {
		blocks = size - block;
		if (blocks == 0)
			return 0;
	}

	/* We do this in a two stage process.  We first try to request
	   as many blocks as we can, then we wait for the first one to
	   complete, and then we try to wrap up as many as are actually
	   done.  This routine is rather generic, in that it can be used
	   in a filesystem by substituting the appropriate function in
	   for getblk.

	   This routine is optimized to make maximum use of the various
	   buffers and caches. */

	do {
		bhrequest = 0;
		uptodate = 1;
		while (blocks) {
			--blocks;
			*bhb = getblk(dev, block++, blocksize);
			if (*bhb && !buffer_uptodate(*bhb)) {
				uptodate = 0;
				bhreq[bhrequest++] = *bhb;
			}

			if (++bhb == &buflist[NBUF])
				bhb = buflist;

			/* If the block we have on hand is uptodate, go ahead
			   and complete processing. */
			if (uptodate)
				break;
			if (bhb == bhe)
				break;
		}

		/* Now request them all */
		if (bhrequest) {
			ll_rw_block(READ, bhrequest, bhreq);
		}

		do { /* Finish off all I/O that has actually completed */
			if (*bhe) {
				wait_on_buffer(*bhe);
				if (!buffer_uptodate(*bhe)) {	/* read error? */
					brelse(*bhe);
					if (++bhe == &buflist[NBUF])
						bhe = buflist;
					left = 0;
					break;
				}
			}
			if (left < blocksize - offset)
				chars = left;
			else
				chars = blocksize - offset;
			*ppos += chars;
			left -= chars;
			read += chars;
			if (*bhe) {
				copy_to_user(buf, offset+(*bhe)->b_data, chars);
				brelse(*bhe);
				buf += chars;
			} else {
				/* getblk() failed for this block: hand back zeroes */
				while (chars-- > 0)
					put_user(0, buf++);
			}
			offset = 0;
			if (++bhe == &buflist[NBUF])
				bhe = buflist;
		} while (left > 0 && bhe != bhb && (!*bhe || !buffer_locked(*bhe)));
		if (bhe == bhb && !blocks)
			break;
	} while (left > 0);

	/* Release the read-ahead blocks */
	while (bhe != bhb) {
		brelse(*bhe);
		if (++bhe == &buflist[NBUF])
			bhe = buflist;
	}
	if (!read)
		return -EIO;
	filp->f_reada = 1;
	return read;
}

/*
 *	Filp may be NULL when we are called by an msync of a vma
 *	since the vma has no handle.
 */
static int block_fsync(struct file *filp, struct dentry *dentry, int datasync)
{
	return fsync_dev(dentry->d_inode->i_rdev);
}

/*
 * bdev cache handling - shamelessly stolen from inode.c
 * We use smaller hashtable, though.
 */

#define HASH_BITS	6
#define HASH_SIZE	(1UL << HASH_BITS)
#define HASH_MASK	(HASH_SIZE-1)
static struct list_head bdev_hashtable[HASH_SIZE];
static spinlock_t bdev_lock = SPIN_LOCK_UNLOCKED;
static kmem_cache_t * bdev_cachep;

#define alloc_bdev() \
	((struct block_device *) kmem_cache_alloc(bdev_cachep, SLAB_KERNEL))
#define destroy_bdev(bdev) kmem_cache_free(bdev_cachep, (bdev))

static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
{
	struct block_device * bdev = (struct block_device *) foo;

	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
	    SLAB_CTOR_CONSTRUCTOR) {
		memset(bdev, 0, sizeof(*bdev));
		sema_init(&bdev->bd_sem, 1);
	}
}

void __init bdev_init(void)
{
	int i;
	struct list_head *head = bdev_hashtable;

	i = HASH_SIZE;
	do {
		INIT_LIST_HEAD(head);
		head++;
		i--;
	} while (i);

	bdev_cachep = kmem_cache_create("bdev_cache",
					sizeof(struct block_device),
					0, SLAB_HWCACHE_ALIGN, init_once,
					NULL);
	if (!bdev_cachep)
		panic("Cannot create bdev_cache SLAB cache");
}

/*
 * Most likely _very_ bad one - but then it's hardly critical for small
 * /dev and can be fixed when somebody will need really large one.
 */
static inline unsigned long hash(dev_t dev)
{
	unsigned long tmp = dev;
	tmp = tmp + (tmp >> HASH_BITS) + (tmp >> HASH_BITS*2);
	return tmp & HASH_MASK;
}
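
/*
 * Worked example (illustrative, assuming major 3, minor 1): with
 * HASH_BITS == 6, dev 0x0301 hashes as
 *	tmp = 0x0301 + (0x0301 >> 6) + (0x0301 >> 12)
 *	    = 0x0301 + 0x000C + 0x0000 = 0x030D
 * and 0x030D & HASH_MASK (0x3F) selects bucket 0x0D.
 */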

static struct block_device *bdfind(dev_t dev, struct list_head *head)
{
	struct list_head *p;
	struct block_device *bdev;
	for (p=head->next; p!=head; p=p->next) {
		bdev = list_entry(p, struct block_device, bd_hash);
		if (bdev->bd_dev != dev)
			continue;
		atomic_inc(&bdev->bd_count);
		return bdev;
	}
	return NULL;
}

struct block_device *bdget(dev_t dev)
{
	struct list_head * head = bdev_hashtable + hash(dev);
	struct block_device *bdev, *new_bdev;
	spin_lock(&bdev_lock);
	bdev = bdfind(dev, head);
	spin_unlock(&bdev_lock);
	if (bdev)
		return bdev;
	new_bdev = alloc_bdev();
	if (!new_bdev)
		return NULL;
	atomic_set(&new_bdev->bd_count, 1);
	new_bdev->bd_dev = dev;
	new_bdev->bd_op = NULL;
	spin_lock(&bdev_lock);
	/* recheck under the lock: somebody may have added it meanwhile */
	bdev = bdfind(dev, head);
	if (!bdev) {
		list_add(&new_bdev->bd_hash, head);
		spin_unlock(&bdev_lock);
		return new_bdev;
	}
	spin_unlock(&bdev_lock);
	destroy_bdev(new_bdev);
	return bdev;
}

void bdput(struct block_device *bdev)
{
	if (atomic_dec_and_test(&bdev->bd_count)) {
		spin_lock(&bdev_lock);
		if (atomic_read(&bdev->bd_openers))
			BUG();
		list_del(&bdev->bd_hash);
		spin_unlock(&bdev_lock);
		destroy_bdev(bdev);
	}
}
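
/*
 * Usage sketch (illustrative; the function and device below are
 * hypothetical): callers obtain a reference-counted block_device with
 * bdget() and drop the reference with bdput(), which frees the bdev
 * once the count reaches zero and nobody holds it open.
 */
#if 0
static void example_bdev_ref(dev_t some_dev)
{
	struct block_device *bdev = bdget(some_dev);	/* takes a reference */
	if (!bdev)
		return;					/* allocation failed */
	/* ... inspect bdev->bd_dev, bdev->bd_op here ... */
	bdput(bdev);					/* drop the reference */
}
#endif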

static struct {
	const char *name;
	struct block_device_operations *bdops;
} blkdevs[MAX_BLKDEV] = {
	{ NULL, NULL },
};

int get_blkdev_list(char * p)
{
	int i;
	int len;

	len = sprintf(p, "\nBlock devices:\n");
	for (i = 0; i < MAX_BLKDEV ; i++) {
		if (blkdevs[i].bdops) {
			len += sprintf(p+len, "%3d %s\n", i, blkdevs[i].name);
		}
	}
	return len;
}
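
/*
 * The text built above is what appears in the block half of
 * /proc/devices; with, say, an IDE and the loop driver registered it
 * would look roughly like this (the majors and names shown are the
 * conventional ones, given only as an illustration):
 *
 *	Block devices:
 *	  3 ide0
 *	  7 loop
 */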

/*
	Return the function table of a device.
	Load the driver if needed.
*/
const struct block_device_operations * get_blkfops(unsigned int major)
{
	const struct block_device_operations *ret = NULL;

	/* major 0 is used for non-device mounts */
	if (major && major < MAX_BLKDEV) {
#ifdef CONFIG_KMOD
		if (!blkdevs[major].bdops) {
			char name[20];
			sprintf(name, "block-major-%d", major);
			request_module(name);
		}
#endif
		ret = blkdevs[major].bdops;
	}
	return ret;
}

int register_blkdev(unsigned int major, const char * name, struct block_device_operations *bdops)
{
	if (major == 0) {
		/* major 0 requests dynamic allocation of a free major */
		for (major = MAX_BLKDEV-1; major > 0; major--) {
			if (blkdevs[major].bdops == NULL) {
				blkdevs[major].name = name;
				blkdevs[major].bdops = bdops;
				return major;
			}
		}
		return -EBUSY;
	}
	if (major >= MAX_BLKDEV)
		return -EINVAL;
	if (blkdevs[major].bdops && blkdevs[major].bdops != bdops)
		return -EBUSY;
	blkdevs[major].name = name;
	blkdevs[major].bdops = bdops;
	return 0;
}

int unregister_blkdev(unsigned int major, const char * name)
{
	if (major >= MAX_BLKDEV)
		return -EINVAL;
	if (!blkdevs[major].bdops)
		return -EINVAL;
	if (strcmp(blkdevs[major].name, name))
		return -EINVAL;
	blkdevs[major].name = NULL;
	blkdevs[major].bdops = NULL;
	return 0;
}
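
/*
 * Registration sketch (illustrative; the names and handlers below are
 * hypothetical): a driver fills in a block_device_operations table and
 * claims a major, or passes major 0 to have register_blkdev() pick a
 * free one as the loop above shows.
 */
#if 0
static int example_major;

static struct block_device_operations example_bdops = {
	open:		example_open,		/* hypothetical handlers */
	release:	example_release,
};

static int __init example_init(void)
{
	example_major = register_blkdev(0, "example", &example_bdops);
	if (example_major < 0)
		return example_major;	/* -EBUSY: no free major left */
	return 0;
}

static void example_cleanup(void)
{
	unregister_blkdev(example_major, "example");
}
#endif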

/*
 * This routine checks whether a removable media has been changed,
 * and invalidates all buffer-cache-entries in that case. This
 * is a relatively slow routine, so we have to try to minimize using
 * it. Thus it is called only upon a 'mount' or 'open'. This
 * is the best way of combining speed and utility, I think.
 * People changing diskettes in the middle of an operation deserve
 * to lose :-)
 */
int check_disk_change(kdev_t dev)
{
	int i;
	const struct block_device_operations * bdops = NULL;
	struct super_block * sb;

	i = MAJOR(dev);
	if (i < MAX_BLKDEV)
		bdops = blkdevs[i].bdops;
	if (bdops == NULL) {
		devfs_handle_t de;

		de = devfs_find_handle (NULL, NULL, i, MINOR (dev),
					DEVFS_SPECIAL_BLK, 0);
		if (de) bdops = devfs_get_ops (de);
	}
	if (bdops == NULL)
		return 0;
	if (bdops->check_media_change == NULL)
		return 0;
	if (!bdops->check_media_change(dev))
		return 0;

	printk(KERN_DEBUG "VFS: Disk change detected on device %s\n",
		bdevname(dev));

	sb = get_super(dev);
	if (sb && invalidate_inodes(sb))
		printk("VFS: busy inodes on changed media.\n");

	destroy_buffers(dev);

	if (bdops->revalidate)
		bdops->revalidate(dev);
	return 1;
}

int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
{
	kdev_t rdev = to_kdev_t(bdev->bd_dev);
	struct inode inode_fake;
	int res;
	mm_segment_t old_fs = get_fs();

	if (!bdev->bd_op->ioctl)
		return -EINVAL;
	inode_fake.i_rdev = rdev;
	init_waitqueue_head(&inode_fake.i_wait);
	set_fs(KERNEL_DS);
	res = bdev->bd_op->ioctl(&inode_fake, NULL, cmd, arg);
	set_fs(old_fs);
	return res;
}
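
/*
 * Usage sketch (illustrative): because the fake inode runs under
 * KERNEL_DS, the ioctl argument may point into kernel memory. A caller
 * could, for instance, ask the driver for its size in 512-byte sectors
 * via BLKGETSIZE (the helper below is hypothetical).
 */
#if 0
static long example_device_sectors(struct block_device *bdev)
{
	long sectors = 0;
	if (ioctl_by_bdev(bdev, BLKGETSIZE, (unsigned long) &sectors))
		return 0;	/* ioctl failed or unsupported */
	return sectors;
}
#endif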

int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags, int kind)
{
	int ret = -ENODEV;
	kdev_t rdev = to_kdev_t(bdev->bd_dev); /* this should become bdev */
	down(&bdev->bd_sem);
	if (!bdev->bd_op)
		bdev->bd_op = get_blkfops(MAJOR(rdev));
	if (bdev->bd_op) {
		/*
		 * This crockload is due to bad choice of ->open() type.
		 * It will go away.
		 * For now, block device ->open() routine must _not_
		 * examine anything in 'inode' argument except ->i_rdev.
		 */
		struct file fake_file = {};
		struct dentry fake_dentry = {};
		struct inode *fake_inode = get_empty_inode();
		ret = -ENOMEM;
		if (fake_inode) {
			fake_file.f_mode = mode;
			fake_file.f_flags = flags;
			fake_file.f_dentry = &fake_dentry;
			fake_dentry.d_inode = fake_inode;
			fake_inode->i_rdev = rdev;
			ret = 0;
			if (bdev->bd_op->open)
				ret = bdev->bd_op->open(fake_inode, &fake_file);
			if (!ret)
				atomic_inc(&bdev->bd_openers);
			else if (!atomic_read(&bdev->bd_openers))
				bdev->bd_op = NULL;
			iput(fake_inode);
		}
	}
	up(&bdev->bd_sem);
	return ret;
}

int blkdev_open(struct inode * inode, struct file * filp)
{
	int ret = -ENODEV;
	struct block_device *bdev = inode->i_bdev;
	down(&bdev->bd_sem);
	lock_kernel();
	if (!bdev->bd_op)
		bdev->bd_op = get_blkfops(MAJOR(inode->i_rdev));
	if (bdev->bd_op) {
		ret = 0;
		if (bdev->bd_op->open)
			ret = bdev->bd_op->open(inode, filp);
		if (!ret)
			atomic_inc(&bdev->bd_openers);
		else if (!atomic_read(&bdev->bd_openers))
			bdev->bd_op = NULL;
	}
	unlock_kernel();
	up(&bdev->bd_sem);
	return ret;
}

int blkdev_put(struct block_device *bdev, int kind)
{
	int ret = 0;
	kdev_t rdev = to_kdev_t(bdev->bd_dev); /* this should become bdev */
	down(&bdev->bd_sem);
	/* syncing will go here */
	lock_kernel();
	if (kind == BDEV_FILE || kind == BDEV_FS)
		fsync_dev(rdev);
	if (atomic_dec_and_test(&bdev->bd_openers)) {
		/* invalidating buffers will go here */
		invalidate_buffers(rdev);
	}
	if (bdev->bd_op->release) {
		struct inode * fake_inode = get_empty_inode();
		ret = -ENOMEM;
		if (fake_inode) {
			fake_inode->i_rdev = rdev;
			ret = bdev->bd_op->release(fake_inode, NULL);
			iput(fake_inode);
		}
	}
	if (!atomic_read(&bdev->bd_openers))
		bdev->bd_op = NULL;	/* we can't rely on driver being */
					/* kind to stay around. */
	unlock_kernel();
	up(&bdev->bd_sem);
	return ret;
}

static int blkdev_close(struct inode * inode, struct file * filp)
{
	return blkdev_put(inode->i_bdev, BDEV_FILE);
}

static int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
			unsigned long arg)
{
	if (inode->i_bdev->bd_op->ioctl)
		return inode->i_bdev->bd_op->ioctl(inode, file, cmd, arg);
	return -EINVAL;
}

struct file_operations def_blk_fops = {
	open:		blkdev_open,
	release:	blkdev_close,
	read:		block_read,
	write:		block_write,
	fsync:		block_fsync,
	ioctl:		blkdev_ioctl,
};

const char * bdevname(kdev_t dev)
{
	static char buffer[32];
	const char * name = blkdevs[MAJOR(dev)].name;

	if (!name)
		name = "unknown-block";

	sprintf(buffer, "%s(%d,%d)", name, MAJOR(dev), MINOR(dev));
	return buffer;
}
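
/*
 * Example (illustrative): if major 3 were registered under the name
 * "ide0", bdevname(MKDEV(3, 1)) would return "ide0(3,1)"; unregistered
 * majors fall back to "unknown-block(3,1)". The static buffer means
 * the string is overwritten by the next call, so it must be consumed
 * immediately (as the printk in check_disk_change() does).
 */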