/*
 *  linux/drivers/block/ll_rw_blk.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
 *
 * This handles all read/write requests to block devices
 */
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/config.h>
#include <linux/locks.h>
#include <linux/init.h>
#include <linux/smp_lock.h>

#include <asm/system.h>

#include <linux/blk.h>

#include <linux/module.h>

/*
 * MAC Floppy IWM hooks
 */
#ifdef CONFIG_MAC_FLOPPY_IWM
extern int mac_floppy_init(void);
#endif

/*
 * The request-struct contains all necessary data
 * to load a number of sectors into memory
 */
static struct request all_requests[NR_REQUEST];

/*
 * The "disk" task queue is used to start the actual requests
 */
DECLARE_TASK_QUEUE(tq_disk);

/*
 * Protect the request list against multiple users..
 *
 * With this spinlock the Linux block IO subsystem is 100% SMP threaded
 * from the IRQ event side, and almost 100% SMP threaded from the syscall
 * side (we still have to protect against block device array operations,
 * and the do_request() side is casually still unsafe. The kernel lock
 * protects this part currently.).
 *
 * there is a fair chance that things will work just OK if these functions
 * are called with no global kernel lock held ...
 */
spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED;

/*
 * used to wait on when there are no free requests
 */
DECLARE_WAIT_QUEUE_HEAD(wait_for_request);

/* This specifies how many sectors to read ahead on the disk. */
int read_ahead[MAX_BLKDEV] = {0, };

struct blk_dev_struct blk_dev[MAX_BLKDEV]; /* initialized by blk_dev_init() */

/*
 * blk_size contains the size of all block-devices in units of 1024 bytes:
 *
 * blk_size[MAJOR][MINOR]
 *
 * if (!blk_size[MAJOR]) then no minor size checking is done.
 */
int * blk_size[MAX_BLKDEV] = { NULL, NULL, };

/*
 * blksize_size contains the size of all block-devices:
 *
 * blksize_size[MAJOR][MINOR]
 *
 * if (!blksize_size[MAJOR]) then 1024 bytes is assumed.
 */
int * blksize_size[MAX_BLKDEV] = { NULL, NULL, };

/*
 * hardsect_size contains the size of the hardware sector of a device.
 *
 * hardsect_size[MAJOR][MINOR]
 *
 * if (!hardsect_size[MAJOR])
 *		then 512 bytes is assumed.
 * else
 *		sector_size is hardsect_size[MAJOR][MINOR]
 *
 * This is currently set by some scsi devices and read by the msdos fs driver.
 * Other uses may appear later.
 */
int * hardsect_size[MAX_BLKDEV] = { NULL, NULL, };

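/*
 * Illustrative sketch (added comment, not code from this file): a block
 * driver for a hypothetical major MY_MAJOR with MY_MINORS minors would
 * typically publish its geometry at init time along these lines:
 *
 *	static int my_sizes[MY_MINORS];		- device sizes in 1K units
 *	static int my_blksizes[MY_MINORS];	- soft block size per minor
 *
 *	blk_size[MY_MAJOR] = my_sizes;
 *	blksize_size[MY_MAJOR] = my_blksizes;	- or NULL: 1024 bytes assumed
 *	hardsect_size[MY_MAJOR] = NULL;		- 512-byte hardware sectors
 */
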
/*
 * The following tunes the read-ahead algorithm in mm/filemap.c
 */
int * max_readahead[MAX_BLKDEV] = { NULL, NULL, };

/*
 * Max number of sectors per request
 */
int * max_sectors[MAX_BLKDEV] = { NULL, NULL, };

/*
 * Max number of segments per request
 */
int * max_segments[MAX_BLKDEV] = { NULL, NULL, };

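/*
 * Sketch only (MY_MAJOR, MY_MINORS and the numbers are made up for
 * illustration): a driver that wants per-device request limits installs
 * its own tables here, again indexed by [MAJOR][MINOR]:
 *
 *	static int my_max_sectors[MY_MINORS];	- e.g. filled with 128
 *	static int my_max_segments[MY_MINORS];	- e.g. filled with 32
 *
 *	max_sectors[MY_MAJOR] = my_max_sectors;
 *	max_segments[MY_MAJOR] = my_max_segments;
 *
 * Leaving the pointers NULL makes the helpers below fall back to the
 * global defaults.
 */
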
static inline int get_max_sectors(kdev_t dev)
{
	if (!max_sectors[MAJOR(dev)])
		return MAX_SECTORS;
	return max_sectors[MAJOR(dev)][MINOR(dev)];
}

static inline int get_max_segments(kdev_t dev)
{
	if (!max_segments[MAJOR(dev)])
		return MAX_SEGMENTS;
	return max_segments[MAJOR(dev)][MINOR(dev)];
}

/*
 * Is called with the request spinlock acquired.
 * NOTE: the device-specific queue() functions
 * have to be atomic!
 */
static inline struct request **get_queue(kdev_t dev)
{
	int major = MAJOR(dev);
	struct blk_dev_struct *bdev = blk_dev + major;

	if (bdev->queue)
		return bdev->queue(dev);
	return &blk_dev[major].current_request;
}

/*
 * remove the plug and let it rip..
 */
void unplug_device(void * data)
{
	struct blk_dev_struct * dev = (struct blk_dev_struct *) data;
	int queue_new_request = 0;
	unsigned long flags;

	spin_lock_irqsave(&io_request_lock, flags);
	if (dev->current_request == &dev->plug) {
		struct request * next = dev->plug.next;
		dev->current_request = next;
		if (next || dev->queue) {
			dev->plug.next = NULL;
			queue_new_request = 1;
		}
	}
	if (queue_new_request)
		(dev->request_fn)();
	spin_unlock_irqrestore(&io_request_lock, flags);
}

179 * "plug" the device if there are no outstanding requests: this will
180 * force the transfer to start only after we have put all the requests
183 * This is called with interrupts off and no requests on the queue.
184 * (and with the request spinlock aquired)
186 static inline void plug_device(struct blk_dev_struct
* dev
)
188 if (dev
->current_request
)
190 dev
->current_request
= &dev
->plug
;
191 queue_task(&dev
->plug_tq
, &tq_disk
);
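/*
 * Rough life cycle of a plug (descriptive note added here, not from the
 * original sources): the first request submitted against an idle device
 * parks the dummy &dev->plug entry at the head of the queue and schedules
 * unplug_device() on tq_disk.  Further requests can then be merged and
 * sorted behind the plug without the driver seeing a half-built queue.
 * When somebody runs the task queue, e.g.
 *
 *	run_task_queue(&tq_disk);
 *
 * (as __get_request_wait() does below), unplug_device() removes the plug
 * and calls the driver's request_fn to start the real transfers.
 */
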
/*
 * look for a free request in the first N entries.
 * NOTE: interrupts must be disabled on the way in (on SMP the request queue
 * spinlock has to be acquired), and will still be disabled on the way out.
 */
static inline struct request * get_request(int n, kdev_t dev)
{
	static struct request *prev_found = NULL, *prev_limit = NULL;
	register struct request *req, *limit;

	if (n <= 0)
		panic("get_request(%d): impossible!\n", n);

	limit = all_requests + n;
	if (limit != prev_limit) {
		prev_limit = limit;
		prev_found = all_requests;
	}
	req = prev_found;
	for (;;) {
		req = ((req > all_requests) ? req : limit) - 1;
		if (req->rq_status == RQ_INACTIVE)
			break;
		if (req == prev_found)
			return NULL;
	}
	prev_found = req;
	req->rq_status = RQ_ACTIVE;
	req->rq_dev = dev;
	return req;
}

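/*
 * Descriptive note (added, not from the original source): the scan above
 * walks backwards from the last place a free slot was found, wrapping to
 * "limit - 1" when it falls off the front of all_requests[], and gives up
 * once it comes back around to prev_found.  Remembering prev_found across
 * calls avoids rescanning the same busy slots on every allocation.
 */
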
/*
 * wait until a free request in the first N entries is available.
 */
static struct request * __get_request_wait(int n, kdev_t dev)
{
	register struct request *req;
	DECLARE_WAITQUEUE(wait, current);
	unsigned long flags;

	add_wait_queue(&wait_for_request, &wait);
	for (;;) {
		current->state = TASK_UNINTERRUPTIBLE;
		spin_lock_irqsave(&io_request_lock, flags);
		req = get_request(n, dev);
		spin_unlock_irqrestore(&io_request_lock, flags);
		if (req)
			break;
		run_task_queue(&tq_disk);
		schedule();
	}
	remove_wait_queue(&wait_for_request, &wait);
	current->state = TASK_RUNNING;
	return req;
}

static inline struct request * get_request_wait(int n, kdev_t dev)
{
	register struct request *req;
	unsigned long flags;

	spin_lock_irqsave(&io_request_lock, flags);
	req = get_request(n, dev);
	spin_unlock_irqrestore(&io_request_lock, flags);
	if (req)
		return req;
	return __get_request_wait(n, dev);
}

/* RO fail safe mechanism */

static long ro_bits[MAX_BLKDEV][8];

int is_read_only(kdev_t dev)
{
	int minor, major;

	major = MAJOR(dev);
	minor = MINOR(dev);
	if (major < 0 || major >= MAX_BLKDEV)
		return 0;
	return ro_bits[major][minor >> 5] & (1 << (minor & 31));
}

void set_device_ro(kdev_t dev, int flag)
{
	int minor, major;

	major = MAJOR(dev);
	minor = MINOR(dev);
	if (major < 0 || major >= MAX_BLKDEV)
		return;
	if (flag)
		ro_bits[major][minor >> 5] |= 1 << (minor & 31);
	else
		ro_bits[major][minor >> 5] &= ~(1 << (minor & 31));
}

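/*
 * Worked example (added for illustration): each major gets 8 longs of
 * read-only bits, one bit per minor, so minor 70 lands in word
 * 70 >> 5 = 2 at bit 70 & 31 = 6.  With 8 * 32 = 256 bits this covers
 * the full minor range of a major.
 */
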
static inline void drive_stat_acct(struct request *req,
				   unsigned long nr_sectors, int new_io)
{
	int major = MAJOR(req->rq_dev);
	int minor = MINOR(req->rq_dev);
	unsigned int disk_index;

	switch (major) {
		case DAC960_MAJOR+0:
			disk_index = (minor & 0x00f8) >> 3;
			break;
		case SCSI_DISK0_MAJOR:
			disk_index = (minor & 0x00f0) >> 4;
			break;
		case IDE0_MAJOR:	/* same as HD_MAJOR */
		case XT_DISK_MAJOR:
			disk_index = (minor & 0x0040) >> 6;
			break;
		case IDE1_MAJOR:
			disk_index = ((minor & 0x0040) >> 6) + 2;
			break;
		default:
			return;
	}
	if (disk_index >= DK_NDRIVE)
		return;

	kstat.dk_drive[disk_index] += new_io;
	if (req->cmd == READ) {
		kstat.dk_drive_rio[disk_index] += new_io;
		kstat.dk_drive_rblk[disk_index] += nr_sectors;
	} else if (req->cmd == WRITE) {
		kstat.dk_drive_wio[disk_index] += new_io;
		kstat.dk_drive_wblk[disk_index] += nr_sectors;
	} else
		printk(KERN_ERR "drive_stat_acct: cmd not R/W?\n");
}

/*
 * add-request adds a request to the linked list.
 * It disables interrupts (acquires the request spinlock) so that it can muck
 * with the request-lists in peace. Thus it should be called with no spinlocks
 * held.
 *
 * By this point, req->cmd is always either READ/WRITE, never READA,
 * which is important for drive_stat_acct() above.
 */
void add_request(struct blk_dev_struct * dev, struct request * req)
{
	int major = MAJOR(req->rq_dev);
	struct request * tmp, **current_request;
	unsigned long flags;
	int queue_new_request = 0;

	drive_stat_acct(req, req->nr_sectors, 1);
	req->next = NULL;

	/*
	 * We use the goto to reduce locking complexity
	 */
	spin_lock_irqsave(&io_request_lock, flags);
	current_request = get_queue(req->rq_dev);

	if (!(tmp = *current_request)) {
		*current_request = req;
		if (dev->current_request != &dev->plug)
			queue_new_request = 1;
		goto out;
	}
	for ( ; tmp->next ; tmp = tmp->next) {
		const int after_current = IN_ORDER(tmp, req);
		const int before_next = IN_ORDER(req, tmp->next);

		if (!IN_ORDER(tmp, tmp->next)) {
			if (after_current || before_next)
				break;
		} else {
			if (after_current && before_next)
				break;
		}
	}
	req->next = tmp->next;
	tmp->next = req;

	/* for SCSI devices, call request_fn unconditionally */
	if (scsi_blk_major(major))
		queue_new_request = 1;
	if (major >= COMPAQ_SMART2_MAJOR+0 &&
	    major <= COMPAQ_SMART2_MAJOR+7)
		queue_new_request = 1;
	if (major >= DAC960_MAJOR+0 && major <= DAC960_MAJOR+7)
		queue_new_request = 1;
out:
	if (queue_new_request)
		(dev->request_fn)();
	spin_unlock_irqrestore(&io_request_lock, flags);
}

/*
 * Has to be called with the request spinlock acquired
 */
static inline void attempt_merge(struct request *req, int max_sectors)
{
	struct request *next = req->next;

	if (!next)
		return;
	if (req->sector + req->nr_sectors != next->sector)
		return;
	if (next->sem || req->cmd != next->cmd || req->rq_dev != next->rq_dev ||
	    req->nr_sectors + next->nr_sectors > max_sectors)
		return;
	req->bhtail->b_reqnext = next->bh;
	req->bhtail = next->bhtail;
	req->nr_sectors += next->nr_sectors;
	next->rq_status = RQ_INACTIVE;
	req->next = next->next;
	wake_up(&wait_for_request);
}

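/*
 * Worked example (added for illustration): if req covers sectors 100..107
 * (sector = 100, nr_sectors = 8) and next starts at sector 108 on the same
 * device with the same command, the two are physically contiguous, so the
 * buffer-head chains are spliced, req grows to 8 + next->nr_sectors, and
 * next's slot goes back to RQ_INACTIVE for reuse.
 */
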
void make_request(int major, int rw, struct buffer_head * bh)
{
	unsigned int sector, count;
	struct request * req;
	int rw_ahead, max_req, max_sectors;
	unsigned long flags;

	count = bh->b_size >> 9;
	sector = bh->b_rsector;

	/* It had better not be a new buffer by the time we see it */
	if (buffer_new(bh))
		BUG();

	/* Only one thread can actually submit the I/O. */
	if (test_and_set_bit(BH_Lock, &bh->b_state))
		return;

	if (blk_size[major]) {
		unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1;

		if (maxsector < count || maxsector - count < sector) {
			bh->b_state &= (1 << BH_Lock) | (1 << BH_Mapped);
			/* This may well happen - the kernel calls bread()
			   without checking the size of the device, e.g.,
			   when mounting a device. */
			printk(KERN_INFO
			       "attempt to access beyond end of device\n");
			printk(KERN_INFO "%s: rw=%d, want=%d, limit=%d\n",
			       kdevname(bh->b_rdev), rw,
			       (sector + count) >> 1,
			       blk_size[major][MINOR(bh->b_rdev)]);
			goto end_io;
		}
	}

	rw_ahead = 0;	/* normal case; gets changed below for READA */
	switch (rw) {
		case READA:
			rw_ahead = 1;
			rw = READ;	/* drop into READ */
		case READ:
			if (buffer_uptodate(bh)) /* Hmmph! Already have it */
				goto end_io;
			max_req = NR_REQUEST;	/* reads take precedence */
			break;
		case WRITERAW:
			rw = WRITE;
			goto do_write;	/* Skip the buffer refile */
		case WRITE:
			if (!test_and_clear_bit(BH_Dirty, &bh->b_state))
				goto end_io;	/* Hmmph! Nothing to write */
			refile_buffer(bh);
		do_write:
			/*
			 * We don't allow the write-requests to fill up the
			 * queue completely:  we want some room for reads,
			 * as they take precedence. The last third of the
			 * requests are only for reads.
			 */
			max_req = (NR_REQUEST * 2) / 3;
			break;
		default:
			printk(KERN_ERR "make_request: bad block dev cmd,"
			       " must be R/W/RA/WA\n");
			goto end_io;
	}

	/* We'd better have a real physical mapping!
	   Check this bit only if the buffer was dirty and just locked
	   down by us so at this point flushpage will block and
	   won't clear the mapped bit under us. */
	if (!buffer_mapped(bh))
		BUG();

	/* look for a free request. */
	/* Loop uses two requests, 1 for loop and 1 for the real device.
	 * Cut max_req in half to avoid running out and deadlocking. */
	if ((major == LOOP_MAJOR) || (major == NBD_MAJOR))
		max_req >>= 1;

	/*
	 * Try to coalesce the new request with old requests
	 */
	max_sectors = get_max_sectors(bh->b_rdev);

	/*
	 * Now we acquire the request spinlock, we have to be mega careful
	 * not to schedule or do something nonatomic
	 */
	spin_lock_irqsave(&io_request_lock, flags);
	req = *get_queue(bh->b_rdev);
	if (!req) {
		/* MD and loop can't handle plugging without deadlocking */
		if (major != MD_MAJOR && major != LOOP_MAJOR &&
		    major != DDV_MAJOR && major != NBD_MAJOR)
			plug_device(blk_dev + major); /* is atomic */
	} else switch (major) {
	     case IDE0_MAJOR:	/* same as HD_MAJOR */
	     case MFM_ACORN_MAJOR:
		/*
		 * The scsi disk and cdrom drivers completely remove the request
		 * from the queue when they start processing an entry.  For this
		 * reason it is safe to continue to add links to the top entry
		 * for those devices.
		 *
		 * All other drivers need to jump over the first entry, as that
		 * entry may be busy being processed and we thus can't change it.
		 */
		if (req == blk_dev[major].current_request)
			req = req->next;
		if (!req)
			break;
		/* fall through */

	     case SCSI_DISK0_MAJOR:
	     case SCSI_DISK1_MAJOR:
	     case SCSI_DISK2_MAJOR:
	     case SCSI_DISK3_MAJOR:
	     case SCSI_DISK4_MAJOR:
	     case SCSI_DISK5_MAJOR:
	     case SCSI_DISK6_MAJOR:
	     case SCSI_DISK7_MAJOR:
	     case SCSI_CDROM_MAJOR:
	     case COMPAQ_SMART2_MAJOR+0:
	     case COMPAQ_SMART2_MAJOR+1:
	     case COMPAQ_SMART2_MAJOR+2:
	     case COMPAQ_SMART2_MAJOR+3:
	     case COMPAQ_SMART2_MAJOR+4:
	     case COMPAQ_SMART2_MAJOR+5:
	     case COMPAQ_SMART2_MAJOR+6:
	     case COMPAQ_SMART2_MAJOR+7:
		do {
			if (req->sem)
				continue;
			if (req->cmd != rw)
				continue;
			if (req->nr_sectors + count > max_sectors)
				continue;
			if (req->rq_dev != bh->b_rdev)
				continue;
			/* Can we add it to the end of this request? */
			if (req->sector + req->nr_sectors == sector) {
				req->bhtail->b_reqnext = bh;
				req->bhtail = bh;
				req->nr_sectors += count;
				drive_stat_acct(req, count, 0);
				/* Can we now merge this req with the next? */
				attempt_merge(req, max_sectors);
			/* or to the beginning? */
			} else if (req->sector - count == sector) {
				bh->b_reqnext = req->bh;
				req->bh = bh;
				req->buffer = bh->b_data;
				req->current_nr_sectors = count;
				req->sector = sector;
				req->nr_sectors += count;
				drive_stat_acct(req, count, 0);
			} else
				continue;

			spin_unlock_irqrestore(&io_request_lock, flags);
			return;

		} while ((req = req->next) != NULL);
	}

	/* find an unused request. */
	req = get_request(max_req, bh->b_rdev);
	spin_unlock_irqrestore(&io_request_lock, flags);

	/* if no request available: if rw_ahead, forget it; otherwise try again blocking.. */
	if (!req) {
		if (rw_ahead)
			goto end_io;
		req = __get_request_wait(max_req, bh->b_rdev);
	}

	/* fill up the request-info, and add it to the queue */
	req->cmd = rw;
	req->errors = 0;
	req->sector = sector;
	req->nr_sectors = count;
	req->current_nr_sectors = count;
	req->buffer = bh->b_data;
	req->sem = NULL;
	req->bh = bh;
	req->bhtail = bh;
	req->next = NULL;
	add_request(major + blk_dev, req);
	return;

end_io:
	bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
}

/* This function can be used to request a number of buffers from a block
   device. Currently the only restriction is that all buffers must belong
   to the same device. */

void ll_rw_block(int rw, int nr, struct buffer_head * bh[])
{
	unsigned int major;
	int correct_size;
	struct blk_dev_struct * dev;
	int i;

	dev = NULL;
	if ((major = MAJOR(bh[0]->b_dev)) < MAX_BLKDEV)
		dev = blk_dev + major;
	if (!dev || !dev->request_fn) {
		printk(KERN_ERR
	"ll_rw_block: Trying to read nonexistent block-device %s (%ld)\n",
		kdevname(bh[0]->b_dev), bh[0]->b_blocknr);
		goto sorry;
	}

	/* Determine correct block size for this device. */
	correct_size = BLOCK_SIZE;
	if (blksize_size[major]) {
		i = blksize_size[major][MINOR(bh[0]->b_dev)];
		if (i)
			correct_size = i;
	}

	/* Verify requested block sizes. */
	for (i = 0; i < nr; i++) {
		if (bh[i]->b_size != correct_size) {
			printk(KERN_NOTICE "ll_rw_block: device %s: "
			       "only %d-char blocks implemented (%u)\n",
			       kdevname(bh[0]->b_dev),
			       correct_size, bh[i]->b_size);
			goto sorry;
		}

		/* Md remaps blocks now */
		bh[i]->b_rdev = bh[i]->b_dev;
		bh[i]->b_rsector = bh[i]->b_blocknr * (bh[i]->b_size >> 9);
#ifdef CONFIG_BLK_DEV_MD
		if (major == MD_MAJOR &&
		    md_map(MINOR(bh[i]->b_dev), &bh[i]->b_rdev,
			   &bh[i]->b_rsector, bh[i]->b_size >> 9)) {
			printk(KERN_ERR
			       "Bad md_map in ll_rw_block\n");
			goto sorry;
		}
#endif
	}

	if ((rw & WRITE) && is_read_only(bh[0]->b_dev)) {
		printk(KERN_NOTICE "Can't write to read-only device %s\n",
		       kdevname(bh[0]->b_dev));
		goto sorry;
	}

	for (i = 0; i < nr; i++) {
		set_bit(BH_Req, &bh[i]->b_state);
#ifdef CONFIG_BLK_DEV_MD
		if (MAJOR(bh[i]->b_dev) == MD_MAJOR) {
			md_make_request(MINOR(bh[i]->b_dev), rw, bh[i]);
			continue;
		}
#endif
		make_request(MAJOR(bh[i]->b_rdev), rw, bh[i]);
	}
	return;

sorry:
	for (i = 0; i < nr; i++) {
		mark_buffer_clean(bh[i]); /* remember to refile it */
		clear_bit(BH_Uptodate, &bh[i]->b_state);
		bh[i]->b_end_io(bh[i], 0);
	}
	return;
}

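/*
 * Typical usage sketch (added for illustration; this is roughly what
 * callers such as bread() do, not code from this file): submit the I/O,
 * then sleep on the buffer lock and check the uptodate bit afterwards:
 *
 *	struct buffer_head *bh = getblk(dev, block, size);
 *	ll_rw_block(READ, 1, &bh);
 *	wait_on_buffer(bh);
 *	if (!buffer_uptodate(bh))
 *		... the read failed ...
 */
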
#ifdef CONFIG_STRAM_SWAP
extern int stram_device_init(void);
#endif

/*
 * First step of what used to be end_request
 *
 * 0 means continue with end_that_request_last,
 * 1 means we are done
 */
int
end_that_request_first(struct request *req, int uptodate, char *name)
{
	struct buffer_head * bh;
	int nsect;

	if (!uptodate) {
		printk("end_request: I/O error, dev %s (%s), sector %lu\n",
			kdevname(req->rq_dev), name, req->sector);
		if ((bh = req->bh) != NULL) {
			nsect = bh->b_size >> 9;
			req->nr_sectors--;
			req->nr_sectors &= ~(nsect - 1);
			req->sector += nsect;
			req->sector &= ~(nsect - 1);
		}
	}

	if ((bh = req->bh) != NULL) {
		req->bh = bh->b_reqnext;
		bh->b_reqnext = NULL;
		bh->b_end_io(bh, uptodate);
		if ((bh = req->bh) != NULL) {
			req->current_nr_sectors = bh->b_size >> 9;
			if (req->nr_sectors < req->current_nr_sectors) {
				req->nr_sectors = req->current_nr_sectors;
				printk("end_request: buffer-list destroyed\n");
			}
			req->buffer = bh->b_data;
			return 1;
		}
	}
	return 0;
}

void
end_that_request_last(struct request *req)
{
	if (req->sem != NULL)
		up(req->sem);
	req->rq_status = RQ_INACTIVE;
	wake_up(&wait_for_request);
}

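/*
 * Driver-side sketch (added for illustration; it roughly mirrors the
 * end_request() helper that <linux/blk.h> builds from these two
 * functions): on I/O completion a driver peels off the finished buffer
 * heads and retires the request once nothing is left:
 *
 *	if (!end_that_request_first(req, uptodate, DEVICE_NAME)) {
 *		... dequeue req from the request list ...
 *		end_that_request_last(req);
 *	}
 */
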
int __init blk_dev_init(void)
{
	struct request * req;
	struct blk_dev_struct *dev;

	for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;) {
		dev->request_fn      = NULL;
		dev->queue           = NULL;
		dev->current_request = NULL;
		dev->plug.rq_status  = RQ_INACTIVE;
		dev->plug.cmd        = -1;
		dev->plug.next       = NULL;
		dev->plug_tq.sync    = 0;
		dev->plug_tq.routine = &unplug_device;
		dev->plug_tq.data    = dev;
	}

	req = all_requests + NR_REQUEST;
	while (--req >= all_requests) {
		req->rq_status = RQ_INACTIVE;
		req->next = NULL;
	}
	memset(ro_bits, 0, sizeof(ro_bits));
	memset(max_readahead, 0, sizeof(max_readahead));
	memset(max_sectors, 0, sizeof(max_sectors));
#ifdef CONFIG_AMIGA_Z2RAM
	z2_init();
#endif
#ifdef CONFIG_STRAM_SWAP
	stram_device_init();
#endif
#ifdef CONFIG_BLK_DEV_RAM
	rd_init();
#endif
#ifdef CONFIG_BLK_DEV_LOOP
	loop_init();
#endif
#ifdef CONFIG_ISP16_CDI
	isp16_init();
#endif /* CONFIG_ISP16_CDI */
#ifdef CONFIG_BLK_DEV_IDE
	ide_init();		/* this MUST precede hd_init */
#endif
#ifdef CONFIG_BLK_DEV_HD
	hd_init();
#endif
#ifdef CONFIG_BLK_DEV_PS2
	ps2esdi_init();
#endif
#ifdef CONFIG_BLK_DEV_XD
	xd_init();
#endif
#ifdef CONFIG_BLK_DEV_MFM
	mfm_init();
#endif
#ifdef CONFIG_PARIDE
	{ extern void paride_init(void); paride_init(); };
#endif
#ifdef CONFIG_MAC_FLOPPY
	swim3_init();
#endif
#ifdef CONFIG_BLK_DEV_SWIM_IOP
	swimiop_init();
#endif
#ifdef CONFIG_AMIGA_FLOPPY
	amiga_floppy_init();
#endif
#ifdef CONFIG_ATARI_FLOPPY
	atari_floppy_init();
#endif
#ifdef CONFIG_BLK_DEV_FD
	floppy_init();
#else
#if !defined (__mc68000__) && !defined(CONFIG_PPC) && !defined(__sparc__)\
    && !defined(CONFIG_APUS) && !defined(__sh__)
	outb_p(0xc, 0x3f2);
#endif
#endif
#ifdef CONFIG_ATARI_ACSI
	acsi_init();
#endif /* CONFIG_ATARI_ACSI */
#ifdef CONFIG_BLK_DEV_MD
	md_init();
#endif /* CONFIG_BLK_DEV_MD */
#ifdef CONFIG_APBLOCK
	ap_init();
#endif
#ifdef CONFIG_BLK_DEV_NBD
	nbd_init();
#endif
	return 0;
}

EXPORT_SYMBOL(io_request_lock);
EXPORT_SYMBOL(end_that_request_first);
EXPORT_SYMBOL(end_that_request_last);