/*
 *  linux/fs/block_dev.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/locks.h>
#include <linux/fcntl.h>
#include <linux/malloc.h>
#include <linux/kmod.h>
#include <linux/devfs_fs_kernel.h>

#include <asm/uaccess.h>

extern int *blk_size[];
extern int *blksize_size[];

#define MAX_BUF_PER_PAGE (PAGE_SIZE / 512)
#define NBUF 64
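/*
 * NBUF bounds how many buffer_heads block_read()/block_write() batch into
 * a single ll_rw_block() call below.
 */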
ssize_t block_write(struct file * filp, const char * buf,
		    size_t count, loff_t *ppos)
{
	struct inode * inode = filp->f_dentry->d_inode;
	ssize_t blocksize, blocksize_bits, i, buffercount, write_error;
	ssize_t block, blocks;
	loff_t offset;
	ssize_t chars;
	ssize_t written = 0;
	struct buffer_head * bhlist[NBUF];
	size_t size;
	kdev_t dev;
	struct buffer_head * bh, *bufferlist[NBUF];
	register char * p;
	write_error = buffercount = 0;
	dev = inode->i_rdev;
	if (is_read_only(inode->i_rdev))
		return -EPERM;
	blocksize = BLOCK_SIZE;
	if (blksize_size[MAJOR(dev)] && blksize_size[MAJOR(dev)][MINOR(dev)])
		blocksize = blksize_size[MAJOR(dev)][MINOR(dev)];

	i = blocksize;
	blocksize_bits = 0;
	while (i != 1) {
		blocksize_bits++;
		i >>= 1;
	}
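	/* e.g. the default BLOCK_SIZE of 1024 gives blocksize_bits = 10 */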
	block = *ppos >> blocksize_bits;
	offset = *ppos & (blocksize - 1);

	if (blk_size[MAJOR(dev)])
		size = ((loff_t) blk_size[MAJOR(dev)][MINOR(dev)] << BLOCK_SIZE_BITS) >> blocksize_bits;
	else
		size = INT_MAX;
	while (count > 0) {
		if (block >= size)
			return written ? written : -ENOSPC;
		chars = blocksize - offset;
		if (chars > count)
			chars = count;
#if 0
		/* get the buffer head */
		{
			struct buffer_head * (*fn)(kdev_t, int, int) = getblk;
			if (chars != blocksize)
				fn = bread;
			bh = fn(dev, block, blocksize);
			if (!bh)
				return written ? written : -EIO;
			if (!buffer_uptodate(bh))
				wait_on_buffer(bh);
		}
#else
		bh = getblk(dev, block, blocksize);
		if (!bh)
			return written ? written : -EIO;

		if (!buffer_uptodate(bh)) {
			if (chars == blocksize) {
				wait_on_buffer(bh);
			} else {
				bhlist[0] = bh;
				if (!filp->f_reada || !read_ahead[MAJOR(dev)]) {
					/* We do this to force the read of a single buffer */
					blocks = 1;
				} else {
					/* Read-ahead before write */
					blocks = read_ahead[MAJOR(dev)] / (blocksize >> 9) / 2;
					if (block + blocks > size)
						blocks = size - block;
					if (blocks > NBUF)
						blocks = NBUF;
					if (!blocks)
						blocks = 1;
					for (i = 1; i < blocks; i++) {
						bhlist[i] = getblk(dev, block + i, blocksize);
						if (!bhlist[i]) {
							while (i >= 0)
								brelse(bhlist[i--]);
							return written ? written : -EIO;
						}
					}
				}
				ll_rw_block(READ, blocks, bhlist);
				for (i = 1; i < blocks; i++)
					brelse(bhlist[i]);
				wait_on_buffer(bh);
				if (!buffer_uptodate(bh)) {
					brelse(bh);
					return written ? written : -EIO;
				}
			}
		}
#endif
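		/*
		 * Note: the branch above is a read-modify-write: a partial-
		 * block write has to read the old contents first, while a
		 * full-block write only waits out any I/O already pending
		 * on the buffer before overwriting it.
		 */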
		block++;
		p = offset + bh->b_data;
		offset = 0;
		*ppos += chars;
		written += chars;
		count -= chars;
		copy_from_user(p, buf, chars);
		p += chars;
		buf += chars;
		mark_buffer_uptodate(bh, 1);
		mark_buffer_dirty(bh, 0);
		if (filp->f_flags & O_SYNC)
			bufferlist[buffercount++] = bh;
		else
			brelse(bh);
		if (buffercount == NBUF) {
			ll_rw_block(WRITE, buffercount, bufferlist);
			for (i = 0; i < buffercount; i++) {
				wait_on_buffer(bufferlist[i]);
				if (!buffer_uptodate(bufferlist[i]))
					write_error = 1;
				brelse(bufferlist[i]);
			}
			buffercount = 0;
		}
		balance_dirty(dev);
		if (write_error)
			break;
	}
	if (buffercount) {
		ll_rw_block(WRITE, buffercount, bufferlist);
		for (i = 0; i < buffercount; i++) {
			wait_on_buffer(bufferlist[i]);
			if (!buffer_uptodate(bufferlist[i]))
				write_error = 1;
			brelse(bufferlist[i]);
		}
	}
	filp->f_reada = 1;
	if (write_error)
		return -EIO;
	return written;
}
ssize_t block_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
{
	struct inode * inode = filp->f_dentry->d_inode;
	size_t block;
	loff_t offset;
	ssize_t blocksize;
	ssize_t blocksize_bits, i;
	size_t blocks, rblocks, left;
	int bhrequest, uptodate;
	struct buffer_head ** bhb, ** bhe;
	struct buffer_head * buflist[NBUF];
	struct buffer_head * bhreq[NBUF];
	unsigned int chars;
	loff_t size;
	kdev_t dev;
	ssize_t read;
	dev = inode->i_rdev;
	blocksize = BLOCK_SIZE;
	if (blksize_size[MAJOR(dev)] && blksize_size[MAJOR(dev)][MINOR(dev)])
		blocksize = blksize_size[MAJOR(dev)][MINOR(dev)];
	i = blocksize;
	blocksize_bits = 0;
	while (i != 1) {
		blocksize_bits++;
		i >>= 1;
	}
	offset = *ppos;
	if (blk_size[MAJOR(dev)])
		size = (loff_t) blk_size[MAJOR(dev)][MINOR(dev)] << BLOCK_SIZE_BITS;
	else
		size = (loff_t) INT_MAX << BLOCK_SIZE_BITS;

	if (offset > size)
		left = 0;
	/* size - offset might not fit into left, so check explicitly. */
	else if (size - offset > INT_MAX)
		left = INT_MAX;
	else
		left = size - offset;
	if (left > count)
		left = count;
	if (left <= 0)
		return 0;
	read = 0;
	block = offset >> blocksize_bits;
	offset &= blocksize - 1;
	size >>= blocksize_bits;
	rblocks = blocks = (left + offset + blocksize - 1) >> blocksize_bits;
	bhb = bhe = buflist;
	if (filp->f_reada) {
		if (blocks < read_ahead[MAJOR(dev)] / (blocksize >> 9))
			blocks = read_ahead[MAJOR(dev)] / (blocksize >> 9);
		if (rblocks > blocks)
			blocks = rblocks;
	}
	if (block + blocks > size) {
		blocks = size - block;
		if (blocks == 0)
			return 0;
	}
	/* We do this in a two stage process.  We first try to request
	   as many blocks as we can, then we wait for the first one to
	   complete, and then we try to wrap up as many as are actually
	   done.  This routine is rather generic, in that it can be used
	   in a filesystem by substituting the appropriate function in
	   for getblk.

	   This routine is optimized to make maximum use of the various
	   buffers and caches. */
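	/*
	 * buflist[] is used as a ring of NBUF entries: bhb is the fill
	 * pointer (blocks we have requested) and bhe the drain pointer
	 * (blocks already copied out to the user), each wrapping back to
	 * buflist when it reaches &buflist[NBUF].
	 */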
	do {
		bhrequest = 0;
		uptodate = 1;
		while (blocks) {
			--blocks;
			*bhb = getblk(dev, block++, blocksize);
			if (*bhb && !buffer_uptodate(*bhb)) {
				uptodate = 0;
				bhreq[bhrequest++] = *bhb;
			}

			if (++bhb == &buflist[NBUF])
				bhb = buflist;

			/* If the block we have on hand is uptodate, go ahead
			   and complete processing. */
			if (uptodate)
				break;
			if (bhb == bhe)
				break;
		}

		/* Now request them all */
		if (bhrequest) {
			ll_rw_block(READ, bhrequest, bhreq);
		}

		do { /* Finish off all I/O that has actually completed */
			if (*bhe) {
				wait_on_buffer(*bhe);
				if (!buffer_uptodate(*bhe)) {	/* read error? */
					brelse(*bhe);
					if (++bhe == &buflist[NBUF])
						bhe = buflist;
					left = 0;
					break;
				}
			}
			if (left < blocksize - offset)
				chars = left;
			else
				chars = blocksize - offset;
			*ppos += chars;
			left -= chars;
			read += chars;
			if (*bhe) {
				copy_to_user(buf, offset + (*bhe)->b_data, chars);
				brelse(*bhe);
				buf += chars;
			} else {
				while (chars-- > 0)
					put_user(0, buf++);
			}
			offset = 0;
			if (++bhe == &buflist[NBUF])
				bhe = buflist;
		} while (left > 0 && bhe != bhb && (!*bhe || !buffer_locked(*bhe)));
		if (bhe == bhb && !blocks)
			break;
	} while (left > 0);
	/* Release the read-ahead blocks */
	while (bhe != bhb) {
		brelse(*bhe);
		if (++bhe == &buflist[NBUF])
			bhe = buflist;
	}
	if (!read)
		return -EIO;
	filp->f_reada = 1;
	return read;
}
/*
 *	Filp may be NULL when we are called by an msync of a vma
 *	since the vma has no handle.
 */
static int block_fsync(struct file *filp, struct dentry *dentry)
{
	return fsync_dev(dentry->d_inode->i_rdev);
}
/*
 * bdev cache handling - shamelessly stolen from inode.c
 * We use a smaller hashtable, though.
 */

#define HASH_BITS	6
#define HASH_SIZE	(1UL << HASH_BITS)
#define HASH_MASK	(HASH_SIZE-1)
static struct list_head bdev_hashtable[HASH_SIZE];
static spinlock_t bdev_lock = SPIN_LOCK_UNLOCKED;
static kmem_cache_t * bdev_cachep;

#define alloc_bdev() \
	((struct block_device *) kmem_cache_alloc(bdev_cachep, SLAB_KERNEL))
#define destroy_bdev(bdev) kmem_cache_free(bdev_cachep, (bdev))
static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
{
	struct block_device * bdev = (struct block_device *) foo;

	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
	    SLAB_CTOR_CONSTRUCTOR) {
		memset(bdev, 0, sizeof(*bdev));
		sema_init(&bdev->bd_sem, 1);
	}
}
void bdev_init(void)
{
	int i;
	struct list_head *head = bdev_hashtable;

	i = HASH_SIZE;
	do {
		INIT_LIST_HEAD(head);
		head++;
		i--;
	} while (i);

	bdev_cachep = kmem_cache_create("bdev_cache",
					sizeof(struct block_device),
					0, SLAB_HWCACHE_ALIGN, init_once,
					NULL);
	if (!bdev_cachep)
		panic("cannot create bdev slab cache");
}
/*
 * Most likely a _very_ bad hash function - but then it's hardly critical
 * for a small /dev, and it can be fixed when somebody needs a really
 * large one.
 */
static inline unsigned long hash(dev_t dev)
{
	unsigned long tmp = dev;
	tmp = tmp + (tmp >> HASH_BITS) + (tmp >> HASH_BITS*2);
	return tmp & HASH_MASK;
}
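/*
 * Worked example (illustrative): for dev = 0x0803, tmp = 2051, so
 * 2051 + (2051 >> 6) + (2051 >> 12) = 2051 + 32 + 0 = 2083, and
 * 2083 & HASH_MASK = 35 - the device lands in bucket 35 of the 64.
 */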
static struct block_device *bdfind(dev_t dev, struct list_head *head)
{
	struct list_head *p;
	struct block_device *bdev;
	for (p = head->next; p != head; p = p->next) {
		bdev = list_entry(p, struct block_device, bd_hash);
		if (bdev->bd_dev != dev)
			continue;
		atomic_inc(&bdev->bd_count);
		return bdev;
	}
	return NULL;
}
struct block_device *bdget(dev_t dev)
{
	struct list_head * head = bdev_hashtable + hash(dev);
	struct block_device *bdev, *new_bdev;
	spin_lock(&bdev_lock);
	bdev = bdfind(dev, head);
	spin_unlock(&bdev_lock);
	if (bdev)
		return bdev;
	new_bdev = alloc_bdev();
	if (!new_bdev)
		return NULL;
	atomic_set(&new_bdev->bd_count, 1);
	new_bdev->bd_dev = dev;
	new_bdev->bd_op = NULL;
	spin_lock(&bdev_lock);
	bdev = bdfind(dev, head);
	if (!bdev) {
		list_add(&new_bdev->bd_hash, head);
		spin_unlock(&bdev_lock);
		return new_bdev;
	}
	spin_unlock(&bdev_lock);
	destroy_bdev(new_bdev);
	return bdev;
}
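/*
 * bdget() is the classic lookup-or-create pattern: allocate outside the
 * lock, re-check the hash chain under bdev_lock, and if another CPU raced
 * us in, destroy our copy and return the existing entry.  A caller pairs
 * it with bdput(), e.g. (hypothetical usage sketch):
 *
 *	struct block_device *bdev = bdget(kdev_t_to_nr(dev));
 *	if (bdev) {
 *		...
 *		bdput(bdev);
 *	}
 */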
void bdput(struct block_device *bdev)
{
	if (atomic_dec_and_test(&bdev->bd_count)) {
		spin_lock(&bdev_lock);
		if (atomic_read(&bdev->bd_openers))
			BUG();
		list_del(&bdev->bd_hash);
		spin_unlock(&bdev_lock);
		destroy_bdev(bdev);
	}
}
static struct {
	const char *name;
	struct block_device_operations *bdops;
} blkdevs[MAX_BLKDEV] = {
	{ NULL, NULL },
};
int get_blkdev_list(char * p)
{
	int i;
	int len;

	len = sprintf(p, "\nBlock devices:\n");
	for (i = 0; i < MAX_BLKDEV; i++) {
		if (blkdevs[i].bdops) {
			len += sprintf(p+len, "%3d %s\n", i, blkdevs[i].name);
		}
	}
	return len;
}
/*
 *	Return the function table of a device.
 *	Load the driver if needed.
 */
const struct block_device_operations * get_blkfops(unsigned int major)
{
	const struct block_device_operations *ret = NULL;

	/* major 0 is used for non-device mounts */
	if (major && major < MAX_BLKDEV) {
#ifdef CONFIG_KMOD
		if (!blkdevs[major].bdops) {
			char name[20];
			sprintf(name, "block-major-%d", major);
			request_module(name);
		}
#endif
		ret = blkdevs[major].bdops;
	}
	return ret;
}
int register_blkdev(unsigned int major, const char * name, struct block_device_operations *bdops)
{
	if (major == 0) {
		for (major = MAX_BLKDEV-1; major > 0; major--) {
			if (blkdevs[major].bdops == NULL) {
				blkdevs[major].name = name;
				blkdevs[major].bdops = bdops;
				return major;
			}
		}
		return -EBUSY;
	}
	if (major >= MAX_BLKDEV)
		return -EINVAL;
	if (blkdevs[major].bdops && blkdevs[major].bdops != bdops)
		return -EBUSY;
	blkdevs[major].name = name;
	blkdevs[major].bdops = bdops;
	return 0;
}
int unregister_blkdev(unsigned int major, const char * name)
{
	if (major >= MAX_BLKDEV)
		return -EINVAL;
	if (!blkdevs[major].bdops)
		return -EINVAL;
	if (strcmp(blkdevs[major].name, name))
		return -EINVAL;
	blkdevs[major].name = NULL;
	blkdevs[major].bdops = NULL;
	return 0;
}
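/*
 * Typical driver usage (sketch; FOO_MAJOR and foo_bdops are hypothetical):
 *
 *	if (register_blkdev(FOO_MAJOR, "foo", &foo_bdops) < 0)
 *		return -EIO;
 *	...
 *	unregister_blkdev(FOO_MAJOR, "foo");
 *
 * Passing major == 0 asks for a dynamically allocated major, which is
 * returned on success; unregistration must pass the same name that was
 * registered.
 */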
/*
 * This routine checks whether a removable media has been changed,
 * and invalidates all buffer-cache-entries in that case. This
 * is a relatively slow routine, so we have to try to minimize using
 * it. Thus it is called only upon a 'mount' or 'open'. This
 * is the best way of combining speed and utility, I think.
 * People changing diskettes in the middle of an operation deserve
 * to lose :-)
 */
int check_disk_change(kdev_t dev)
{
	int i;
	const struct block_device_operations * bdops = NULL;
	struct super_block * sb;

	i = MAJOR(dev);
	if (i < MAX_BLKDEV)
		bdops = blkdevs[i].bdops;
	if (bdops == NULL) {
		devfs_handle_t de;

		de = devfs_find_handle (NULL, NULL, 0, i, MINOR (dev),
					DEVFS_SPECIAL_BLK, 0);
		if (de) bdops = devfs_get_ops (de);
	}
	if (bdops == NULL)
		return 0;
	if (bdops->check_media_change == NULL)
		return 0;
	if (!bdops->check_media_change(dev))
		return 0;

	printk(KERN_DEBUG "VFS: Disk change detected on device %s\n",
		bdevname(dev));

	sb = get_super(dev);
	if (sb && invalidate_inodes(sb))
		printk("VFS: busy inodes on changed media.\n");

	destroy_buffers(dev);

	if (bdops->revalidate)
		bdops->revalidate(dev);
	return 1;
}
int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
{
	kdev_t rdev = to_kdev_t(bdev->bd_dev);
	struct inode inode_fake;
	int res;
	mm_segment_t old_fs = get_fs();

	if (!bdev->bd_op->ioctl)
		return -EINVAL;
	inode_fake.i_rdev = rdev;
	init_waitqueue_head(&inode_fake.i_wait);
	set_fs(KERNEL_DS);
	res = bdev->bd_op->ioctl(&inode_fake, NULL, cmd, arg);
	set_fs(old_fs);
	return res;
}
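/*
 * Note on the set_fs(KERNEL_DS) dance above: it widens the address-limit
 * check so that a driver ioctl handler which copies its argument with
 * copy_from_user()/put_user() will accept a pointer into kernel memory.
 * inode_fake carries only the fields (i_rdev and i_wait) that block
 * driver ioctls are expected to touch.
 */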
int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags, int kind)
{
	int ret = -ENODEV;
	kdev_t rdev = to_kdev_t(bdev->bd_dev); /* this should become bdev */
	down(&bdev->bd_sem);
	if (!bdev->bd_op)
		bdev->bd_op = get_blkfops(MAJOR(rdev));
	if (bdev->bd_op) {
		/*
		 * This crockload is due to bad choice of ->open() type.
		 * It will go away.
		 */
		struct file fake_file = {};
		struct dentry fake_dentry = {};
		struct inode *fake_inode = get_empty_inode();
		ret = -ENOMEM;
		if (fake_inode) {
			fake_file.f_mode = mode;
			fake_file.f_flags = flags;
			fake_file.f_dentry = &fake_dentry;
			fake_dentry.d_inode = fake_inode;
			fake_inode->i_rdev = rdev;
			ret = 0;
			if (bdev->bd_op->open)
				ret = bdev->bd_op->open(fake_inode, &fake_file);
			if (!ret)
				atomic_inc(&bdev->bd_openers);
			iput(fake_inode);
		}
	}
	up(&bdev->bd_sem);
	return ret;
}
int blkdev_open(struct inode * inode, struct file * filp)
{
	int ret = -ENODEV;
	struct block_device *bdev = inode->i_bdev;
	down(&bdev->bd_sem);
	if (!bdev->bd_op)
		bdev->bd_op = get_blkfops(MAJOR(inode->i_rdev));
	if (bdev->bd_op) {
		ret = 0;
		if (bdev->bd_op->open)
			ret = bdev->bd_op->open(inode, filp);
		if (!ret)
			atomic_inc(&bdev->bd_openers);
	}
	up(&bdev->bd_sem);
	return ret;
}
int blkdev_put(struct block_device *bdev, int kind)
{
	int ret = 0;
	kdev_t rdev = to_kdev_t(bdev->bd_dev); /* this should become bdev */
	down(&bdev->bd_sem);
	/* syncing will go here */
	if (kind == BDEV_FILE || kind == BDEV_FS)
		fsync_dev(rdev);
	if (atomic_dec_and_test(&bdev->bd_openers)) {
		/* invalidating buffers will go here */
		invalidate_buffers(rdev);
	}
	if (bdev->bd_op->release) {
		struct inode * fake_inode = get_empty_inode();
		ret = -ENOMEM;
		if (fake_inode) {
			fake_inode->i_rdev = rdev;
			ret = bdev->bd_op->release(fake_inode, NULL);
			iput(fake_inode);
		}
	}
	if (!atomic_read(&bdev->bd_openers))
		bdev->bd_op = NULL;	/* we can't rely on driver being */
					/* kind to stay around. */
	up(&bdev->bd_sem);
	return ret;
}
static int blkdev_close(struct inode * inode, struct file * filp)
{
	return blkdev_put(inode->i_bdev, BDEV_FILE);
}

static int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd,
			unsigned long arg)
{
	if (inode->i_bdev->bd_op->ioctl)
		return inode->i_bdev->bd_op->ioctl(inode, file, cmd, arg);
	return -EINVAL;
}
struct file_operations def_blk_fops = {
	open:		blkdev_open,
	release:	blkdev_close,
	read:		block_read,
	write:		block_write,
	fsync:		block_fsync,
	ioctl:		blkdev_ioctl,
};
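/*
 * Only the named operations are set (GNU C "label:" initialisers); the
 * remaining file operations stay NULL and presumably fall back to the
 * generic VFS behaviour (e.g. the default llseek) for block devices.
 */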
const char * bdevname(kdev_t dev)
{
	static char buffer[32];
	const char * name = blkdevs[MAJOR(dev)].name;

	if (!name)
		name = "unknown-block";

	sprintf(buffer, "%s(%d,%d)", name, MAJOR(dev), MINOR(dev));
	return buffer;
}
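/*
 * Note: the static buffer makes bdevname() non-reentrant; the result is
 * only valid until the next call, so callers should consume it at once
 * (as the printk in check_disk_change() does) rather than keep the
 * pointer around.
 */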