/*
 *  linux/drivers/block/ll_rw_blk.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
 * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July 2000
 */

/*
 * This handles all read/write requests to block devices
 */
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/kernel_stat.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/config.h>
#include <linux/locks.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/smp_lock.h>

#include <asm/system.h>
#include <asm/io.h>
#include <linux/blk.h>
#include <linux/highmem.h>
#include <linux/raid/md.h>

#include <linux/module.h>
/*
 * MAC Floppy IWM hooks
 */
#ifdef CONFIG_MAC_FLOPPY_IWM
extern int mac_floppy_init(void);
#endif

extern int lvm_init(void);
/*
 * For the allocated request tables
 */
static kmem_cache_t *request_cachep;
/*
 * The "disk" task queue is used to start the actual requests
 * after a plug
 */
DECLARE_TASK_QUEUE(tq_disk);
/*
 * Protect the request list against multiple users..
 *
 * With this spinlock the Linux block IO subsystem is 100% SMP threaded
 * from the IRQ event side, and almost 100% SMP threaded from the syscall
 * side (we still have to protect against block device array operations,
 * and the do_request() side is still not SMP-safe; the kernel lock
 * protects that part for now).
 *
 * There is a fair chance that things will work just fine if these functions
 * are called with no global kernel lock held ...
 */
spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED;
/* This specifies how many sectors to read ahead on the disk. */

int read_ahead[MAX_BLKDEV];
/* blk_dev_struct is:
 *	*request_fn
 *	*current_request
 */
struct blk_dev_struct blk_dev[MAX_BLKDEV]; /* initialized by blk_dev_init() */
/*
 * blk_size contains the size of all block devices in units of
 * 1024-byte blocks:
 *
 * blk_size[MAJOR][MINOR]
 *
 * if (!blk_size[MAJOR]) then no minor size checking is done.
 */
int * blk_size[MAX_BLKDEV];
/*
 * blksize_size contains the block size of all block devices:
 *
 * blksize_size[MAJOR][MINOR]
 *
 * if (!blksize_size[MAJOR]) then 1024 bytes is assumed.
 */
int * blksize_size[MAX_BLKDEV];
/*
 * hardsect_size contains the size of the hardware sector of a device.
 *
 * hardsect_size[MAJOR][MINOR]
 *
 * if (!hardsect_size[MAJOR])
 *		then 512 bytes is assumed.
 * else
 *		sector_size is hardsect_size[MAJOR][MINOR]
 * This is currently set by some scsi devices and read by the msdos fs driver.
 * Other uses may appear later.
 */
int * hardsect_size[MAX_BLKDEV];
/*
 * The following tunes the read-ahead algorithm in mm/filemap.c
 */
int * max_readahead[MAX_BLKDEV];

/*
 * Max number of sectors per request
 */
int * max_sectors[MAX_BLKDEV];
static inline int get_max_sectors(kdev_t dev)
{
	if (!max_sectors[MAJOR(dev)])
		return MAX_SECTORS;
	return max_sectors[MAJOR(dev)][MINOR(dev)];
}
static inline request_queue_t *__blk_get_queue(kdev_t dev)
{
	struct blk_dev_struct *bdev = blk_dev + MAJOR(dev);

	if (bdev->queue)
		return bdev->queue(dev);
	else
		return &blk_dev[MAJOR(dev)].request_queue;
}
/*
 * NOTE: the device-specific queue() functions
 * have to be atomic!
 */
request_queue_t *blk_get_queue(kdev_t dev)
{
	request_queue_t *ret;
	unsigned long flags;

	spin_lock_irqsave(&io_request_lock, flags);
	ret = __blk_get_queue(dev);
	spin_unlock_irqrestore(&io_request_lock, flags);

	return ret;
}
static int __blk_cleanup_queue(struct list_head *head)
{
	struct list_head *entry;
	struct request *rq;
	int i = 0;

	if (list_empty(head))
		return 0;

	entry = head->next;
	do {
		rq = list_entry(entry, struct request, table);
		entry = entry->next;
		list_del(&rq->table);
		kmem_cache_free(request_cachep, rq);
		i++;
	} while (!list_empty(head));

	return i;
}
/**
 * blk_cleanup_queue - release a &request_queue_t when it is no longer needed
 * @q:    the request queue to be released
 *
 * Description:
 *     blk_cleanup_queue is the pair to blk_init_queue().  It should
 *     be called when a request queue is being released; typically
 *     when a block device is being de-registered.  Currently, its
 *     primary task is to free all the &struct request structures that
 *     were allocated to the queue.
 * Caveat:
 *     Hopefully the low level driver will have finished any
 *     outstanding requests first...
 **/
void blk_cleanup_queue(request_queue_t * q)
{
	int count = QUEUE_NR_REQUESTS;

	count -= __blk_cleanup_queue(&q->request_freelist[READ]);
	count -= __blk_cleanup_queue(&q->request_freelist[WRITE]);

	if (count)
		printk("blk_cleanup_queue: leaked requests (%d)\n", count);

	memset(q, 0, sizeof(*q));
}
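
/*
 * Illustrative sketch (not part of this file): how a driver would pair
 * blk_cleanup_queue() with an earlier blk_init_queue() at module unload
 * time, as the description above requires.  The names mydev_queue and
 * mydev_exit() are hypothetical.
 */
#if 0
static request_queue_t mydev_queue;	/* set up earlier with blk_init_queue() */

static void __exit mydev_exit(void)
{
	/* all outstanding requests should have completed by now */
	blk_cleanup_queue(&mydev_queue);
}
#endif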
/**
 * blk_queue_headactive - indicate whether head of request queue may be active
 * @q:       The queue which this applies to.
 * @active:  A flag indicating whether the head of the queue may be active.
 *
 * Description:
 *    The driver for a block device may choose to leave the currently active
 *    request on the request queue, removing it only when it has completed.
 *    The queue handling routines assume this by default for safety reasons
 *    and will not involve the head of the request queue in any merging or
 *    reordering of requests when the queue is unplugged (and thus may be
 *    working on this particular request).
 *
 *    If a driver removes requests from the queue before processing them, then
 *    it may indicate that it does so, thereby allowing the head of the queue
 *    to be involved in merging and reordering.  This is done by calling
 *    blk_queue_headactive() with an @active flag of %0.
 *
 *    If a driver processes several requests at once, it must remove them (or
 *    at least all but one of them) from the request queue.
 *
 *    When a queue is plugged (see blk_queue_pluggable()) the head will be
 *    assumed to be inactive.
 **/
void blk_queue_headactive(request_queue_t * q, int active)
{
	q->head_active = active;
}
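
/*
 * Illustrative sketch (not part of this file): a driver whose request
 * function takes each request off the queue before working on it can tell
 * the block layer that the queue head is never active, as described above.
 * The mydev_* names are hypothetical; blkdev_dequeue_request() is assumed
 * to be the block layer's dequeue helper.
 */
#if 0
static request_queue_t mydev_queue;

static void mydev_request(request_queue_t *q)
{
	struct request *req;

	while (!list_empty(&q->queue_head)) {
		req = blkdev_entry_to_request(q->queue_head.next);
		blkdev_dequeue_request(req);	/* off the queue before we touch it */
		/* ... hand req to the hardware, complete it later ... */
	}
}

static void mydev_setup(void)
{
	blk_init_queue(&mydev_queue, mydev_request);
	blk_queue_headactive(&mydev_queue, 0);	/* head is never left active */
}
#endif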
/**
 * blk_queue_pluggable - define a plugging function for a request queue
 * @q:    the request queue to which the function will apply
 * @plug: the function to be called to plug a queue
 *
 * Description:
 *    A request queue will be "plugged" if a request is added to it
 *    while it is empty.  This allows a number of requests to be added
 *    before any are processed, thus providing an opportunity for these
 *    requests to be merged or re-ordered.
 *    The default plugging function (generic_plug_device()) sets the
 *    "plugged" flag for the queue and adds a task to the $tq_disk task
 *    queue to unplug the queue and call the request function at a later time.
 *
 *    A device driver may provide an alternate plugging function by passing
 *    it to blk_queue_pluggable().  This function should set the "plugged"
 *    flag if it wants calls to the request function to be deferred, and
 *    should place a task on $tq_disk which will unplug the queue.
 *    Alternatively it can simply do nothing and thereby disable plugging of
 *    the device.
 **/
void blk_queue_pluggable (request_queue_t * q, plug_device_fn *plug)
{
	q->plug_device_fn = plug;
}
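
/*
 * Illustrative sketch (not part of this file): a driver that never wants
 * its queue plugged can install a plug function that does nothing, as the
 * description above suggests.  The mydev_* names are hypothetical.
 */
#if 0
static void mydev_noplug(request_queue_t *q, kdev_t dev)
{
	/* Intentionally empty: the "plugged" flag is never set, so the
	 * request function runs as soon as requests are queued. */
}

static void mydev_setup_queue(request_queue_t *q, request_fn_proc *rfn)
{
	blk_init_queue(q, rfn);
	blk_queue_pluggable(q, mydev_noplug);
}
#endif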
/**
 * blk_queue_make_request - define an alternate make_request function for a device
 * @q:    the request queue for the device to be affected
 * @mfn:  the alternate make_request function
 *
 * Description:
 *    The normal way for &struct buffer_heads to be passed to a device driver
 *    is to collect them into requests on a request queue, and allow the
 *    device driver to select requests off that queue when it is ready.  This
 *    works well for many block devices.  However some block devices
 *    (typically virtual devices such as md or lvm) do not benefit from the
 *    processing on the request queue, and are served best by having the
 *    requests passed directly to them.  This can be achieved by providing a
 *    function to blk_queue_make_request().
 **/
void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
{
	q->make_request_fn = mfn;
}
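
/*
 * Illustrative sketch (not part of this file): a simple stacking driver in
 * the spirit of md/lvm installs its own make_request function that remaps
 * the buffer onto an underlying device.  Returning non-zero tells
 * generic_make_request() (below) to resolve the remapped bh->b_rdev again.
 * The mydev_* names and the linear offset scheme are hypothetical.
 */
#if 0
static kdev_t mydev_backing_dev;		/* device we remap onto */
static unsigned long mydev_sector_offset;	/* start of our slice on it */

static int mydev_make_request(request_queue_t *q, int rw, struct buffer_head *bh)
{
	bh->b_rdev = mydev_backing_dev;
	bh->b_rsector += mydev_sector_offset;
	return 1;	/* not finished: resubmit to the remapped device */
}

static void mydev_setup_remap(request_queue_t *q)
{
	blk_queue_make_request(q, mydev_make_request);
}
#endif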
static inline int ll_new_segment(request_queue_t *q, struct request *req, int max_segments)
{
	if (req->nr_segments < max_segments) {
		req->nr_segments++;
		q->elevator.nr_segments++;
		return 1;
	}
	return 0;
}
static int ll_back_merge_fn(request_queue_t *q, struct request *req,
			    struct buffer_head *bh, int max_segments)
{
	if (req->bhtail->b_data + req->bhtail->b_size == bh->b_data)
		return 1;
	return ll_new_segment(q, req, max_segments);
}
static int ll_front_merge_fn(request_queue_t *q, struct request *req,
			     struct buffer_head *bh, int max_segments)
{
	if (bh->b_data + bh->b_size == req->bh->b_data)
		return 1;
	return ll_new_segment(q, req, max_segments);
}
static int ll_merge_requests_fn(request_queue_t *q, struct request *req,
				struct request *next, int max_segments)
{
	int total_segments = req->nr_segments + next->nr_segments;
	int same_segment;

	same_segment = 0;
	if (req->bhtail->b_data + req->bhtail->b_size == next->bh->b_data) {
		total_segments--;
		same_segment = 1;
	}

	if (total_segments > max_segments)
		return 0;

	q->elevator.nr_segments -= same_segment;
	req->nr_segments = total_segments;
	return 1;
}
/*
 * "plug" the device if there are no outstanding requests: this will
 * force the transfer to start only after we have put all the requests
 * on the list.
 *
 * This is called with interrupts off and no requests on the queue.
 * (and with the request spinlock acquired)
 */
static void generic_plug_device(request_queue_t *q, kdev_t dev)
{
	/*
	 * no need to replug device
	 */
	if (!list_empty(&q->queue_head) || q->plugged)
		return;

	q->plugged = 1;
	queue_task(&q->plug_tq, &tq_disk);
}
/*
 * remove the plug and let it rip..
 */
static inline void __generic_unplug_device(request_queue_t *q)
{
	if (q->plugged) {
		q->plugged = 0;
		if (!list_empty(&q->queue_head))
			q->request_fn(q);
	}
}
static void generic_unplug_device(void *data)
{
	request_queue_t *q = (request_queue_t *) data;
	unsigned long flags;

	spin_lock_irqsave(&io_request_lock, flags);
	__generic_unplug_device(q);
	spin_unlock_irqrestore(&io_request_lock, flags);
}
static void blk_init_free_list(request_queue_t *q)
{
	struct request *rq;
	int i;

	/*
	 * Divide requests in half between read and write.  This used to
	 * be a 2/3 advantage for reads, but now reads can steal from
	 * the write free list.
	 */
	for (i = 0; i < QUEUE_NR_REQUESTS; i++) {
		rq = kmem_cache_alloc(request_cachep, SLAB_KERNEL);
		rq->rq_status = RQ_INACTIVE;
		list_add(&rq->table, &q->request_freelist[i & 1]);
	}

	init_waitqueue_head(&q->wait_for_request);
	spin_lock_init(&q->request_lock);
}
static int __make_request(request_queue_t * q, int rw, struct buffer_head * bh);
/**
 * blk_init_queue - prepare a request queue for use with a block device
 * @q:    The &request_queue_t to be initialised
 * @rfn:  The function to be called to process requests that have been
 *        placed on the queue.
 *
 * Description:
 *    If a block device wishes to use the standard request handling procedures,
 *    which sorts requests and coalesces adjacent requests, then it must
 *    call blk_init_queue().  The function @rfn will be called when there
 *    are requests on the queue that need to be processed.  If the device
 *    supports plugging, then @rfn may not be called immediately when requests
 *    are available on the queue, but may be called at some time later instead.
 *    Plugged queues are generally unplugged when a buffer belonging to one
 *    of the requests on the queue is needed, or due to memory pressure.
 *
 *    @rfn is not required, or even expected, to remove all requests off the
 *    queue, but only as many as it can handle at a time.  If it does leave
 *    requests on the queue, it is responsible for arranging that the requests
 *    get dealt with eventually.
 *
 *    The global spinlock $io_request_lock must be held while manipulating the
 *    requests on the request queue.
 *
 *    The request on the head of the queue is by default assumed to be
 *    potentially active, and it is not considered for re-ordering or merging
 *    whenever the given queue is unplugged.  This behaviour can be changed with
 *    blk_queue_headactive().
 *
 * Note:
 *    blk_init_queue() must be paired with a blk_cleanup_queue() call
 *    when the block device is deactivated (such as at module unload).
 **/
void blk_init_queue(request_queue_t * q, request_fn_proc * rfn)
{
	INIT_LIST_HEAD(&q->queue_head);
	INIT_LIST_HEAD(&q->request_freelist[READ]);
	INIT_LIST_HEAD(&q->request_freelist[WRITE]);
	elevator_init(&q->elevator, ELEVATOR_LINUS);
	blk_init_free_list(q);
	q->request_fn = rfn;
	q->back_merge_fn = ll_back_merge_fn;
	q->front_merge_fn = ll_front_merge_fn;
	q->merge_requests_fn = ll_merge_requests_fn;
	q->make_request_fn = __make_request;
	q->plug_tq.sync = 0;
	q->plug_tq.routine = &generic_unplug_device;
	q->plug_tq.data = q;
	q->plugged = 0;
	/*
	 * These booleans describe the queue properties.  We set the
	 * default (and most common) values here.  Other drivers can
	 * use the appropriate functions to alter the queue properties
	 * as appropriate.
	 */
	q->plug_device_fn = generic_plug_device;
	q->head_active = 1;
}
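
/*
 * Illustrative sketch (not part of this file): a minimal driver using the
 * standard request handling set up by blk_init_queue().  The default
 * head-active behaviour is kept, so the request at the head of the queue
 * stays there until it has completed.  MYDEV_MAJOR, the transfer step and
 * the other mydev_* names are hypothetical; blkdev_dequeue_request() is
 * assumed to be the block layer's dequeue helper.
 */
#if 0
static void mydev_request(request_queue_t *q)
{
	struct request *req;

	while (!list_empty(&q->queue_head)) {
		req = blkdev_entry_to_request(q->queue_head.next);

		/* ... transfer req->current_nr_sectors sectors starting at
		 * req->sector to/from req->buffer here ... */

		if (!end_that_request_first(req, 1, "mydev")) {
			/* whole request finished: dequeue and release it */
			blkdev_dequeue_request(req);
			end_that_request_last(req);
		}
	}
}

static int __init mydev_init(void)
{
	blk_init_queue(&blk_dev[MYDEV_MAJOR].request_queue, mydev_request);
	return 0;
}
#endif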
#define blkdev_free_rq(list) list_entry((list)->next, struct request, table);

/*
 * Get a free request. io_request_lock must be held and interrupts
 * disabled on the way in.
 */
static inline struct request *get_request(request_queue_t *q, int rw)
{
	struct list_head *list = &q->request_freelist[rw];
	struct request *rq;

	/*
	 * Reads get preferential treatment and are allowed to steal
	 * from the write free list if necessary.
	 */
	if (!list_empty(list)) {
		rq = blkdev_free_rq(list);
		goto got_rq;
	}

	/*
	 * if the WRITE list is non-empty, we know that rw is READ
	 * and that the READ list is empty. allow reads to 'steal'
	 * from the WRITE list.
	 */
	if (!list_empty(&q->request_freelist[WRITE])) {
		list = &q->request_freelist[WRITE];
		rq = blkdev_free_rq(list);
		goto got_rq;
	}

	return NULL;

got_rq:
	list_del(&rq->table);
	rq->free_list = list;
	rq->rq_status = RQ_ACTIVE;
	rq->special = NULL;
	rq->q = q;
	return rq;
}
/*
 * No available requests for this queue, unplug the device.
 */
static struct request *__get_request_wait(request_queue_t *q, int rw)
{
	register struct request *rq;
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue_exclusive(&q->wait_for_request, &wait);
	for (;;) {
		__set_current_state(TASK_UNINTERRUPTIBLE);
		spin_lock_irq(&io_request_lock);
		rq = get_request(q, rw);
		spin_unlock_irq(&io_request_lock);
		if (rq)
			break;
		generic_unplug_device(q);
		schedule();
	}
	remove_wait_queue(&q->wait_for_request, &wait);
	current->state = TASK_RUNNING;
	return rq;
}
static inline struct request *get_request_wait(request_queue_t *q, int rw)
{
	register struct request *rq;

	spin_lock_irq(&io_request_lock);
	rq = get_request(q, rw);
	spin_unlock_irq(&io_request_lock);
	if (rq)
		return rq;
	return __get_request_wait(q, rw);
}
/* RO fail safe mechanism */

static long ro_bits[MAX_BLKDEV][8];

int is_read_only(kdev_t dev)
{
	int minor, major;

	major = MAJOR(dev);
	minor = MINOR(dev);
	if (major < 0 || major >= MAX_BLKDEV)
		return 0;
	return ro_bits[major][minor >> 5] & (1 << (minor & 31));
}
void set_device_ro(kdev_t dev, int flag)
{
	int minor, major;

	major = MAJOR(dev);
	minor = MINOR(dev);
	if (major < 0 || major >= MAX_BLKDEV)
		return;
	if (flag)
		ro_bits[major][minor >> 5] |= 1 << (minor & 31);
	else
		ro_bits[major][minor >> 5] &= ~(1 << (minor & 31));
}
inline void drive_stat_acct (kdev_t dev, int rw,
			     unsigned long nr_sectors, int new_io)
{
	unsigned int major = MAJOR(dev);
	unsigned int index;

	index = disk_index(dev);
	if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
		return;

	kstat.dk_drive[major][index] += new_io;
	if (rw == READ) {
		kstat.dk_drive_rio[major][index] += new_io;
		kstat.dk_drive_rblk[major][index] += nr_sectors;
	} else if (rw == WRITE) {
		kstat.dk_drive_wio[major][index] += new_io;
		kstat.dk_drive_wblk[major][index] += nr_sectors;
	} else
		printk(KERN_ERR "drive_stat_acct: cmd not R/W?\n");
}
/*
 * add_request adds a request to the linked list.
 * The caller must already hold the request spinlock (io_request_lock) with
 * interrupts disabled, so that add_request can muck with the request lists
 * in peace.
 *
 * By this point, req->cmd is always either READ/WRITE, never READA,
 * which is important for drive_stat_acct() above.
 */
static inline void add_request(request_queue_t * q, struct request * req,
			       struct list_head *head, int lat)
{
	int major;

	drive_stat_acct(req->rq_dev, req->cmd, req->nr_sectors, 1);

	/*
	 * let selected elevator insert the request
	 */
	q->elevator.elevator_fn(req, &q->elevator, &q->queue_head, head, lat);

	/*
	 * FIXME(eric) I don't understand why there is a need for this
	 * special case code.  It clearly doesn't fit any more with
	 * the new queueing architecture, and it got added in 2.3.10.
	 * I am leaving this in here until I hear back from the COMPAQ
	 * people.
	 */
	major = MAJOR(req->rq_dev);
	if (major >= COMPAQ_SMART2_MAJOR+0 && major <= COMPAQ_SMART2_MAJOR+7)
		(q->request_fn)(q);
	if (major >= COMPAQ_CISS_MAJOR+0 && major <= COMPAQ_CISS_MAJOR+7)
		(q->request_fn)(q);
	if (major >= DAC960_MAJOR+0 && major <= DAC960_MAJOR+7)
		(q->request_fn)(q);
}
/*
 * Must be called with io_request_lock held and interrupts disabled
 */
void inline blkdev_release_request(struct request *req)
{
	req->rq_status = RQ_INACTIVE;

	/*
	 * Request may not have originated from ll_rw_blk
	 */
	if (req->free_list) {
		list_add(&req->table, req->free_list);
		req->free_list = NULL;
		wake_up(&req->q->wait_for_request);
	}
}
/*
 * Has to be called with the request spinlock acquired
 */
static void attempt_merge(request_queue_t * q,
			  struct request *req,
			  int max_sectors,
			  int max_segments)
{
	struct request *next;

	next = blkdev_next_request(req);
	if (req->sector + req->nr_sectors != next->sector)
		return;
	if (req->cmd != next->cmd
	    || req->rq_dev != next->rq_dev
	    || req->nr_sectors + next->nr_sectors > max_sectors
	    || next->sem)
		return;
	/*
	 * If we are not allowed to merge these requests, then
	 * return.  If we are allowed to merge, then the count
	 * will have been updated to the appropriate number,
	 * and we shouldn't do it here too.
	 */
	if (!(q->merge_requests_fn)(q, req, next, max_segments))
		return;

	req->bhtail->b_reqnext = next->bh;
	req->bhtail = next->bhtail;
	req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
	list_del(&next->queue);
	blkdev_release_request(next);
}
static inline void attempt_back_merge(request_queue_t * q,
				      struct request *req,
				      int max_sectors,
				      int max_segments)
{
	if (&req->queue == q->queue_head.prev)
		return;
	attempt_merge(q, req, max_sectors, max_segments);
}
static inline void attempt_front_merge(request_queue_t * q,
				       struct list_head * head,
				       struct request *req,
				       int max_sectors,
				       int max_segments)
{
	struct list_head * prev;

	prev = req->queue.prev;
	if (head == prev)
		return;
	attempt_merge(q, blkdev_entry_to_request(prev), max_sectors, max_segments);
}
static int __make_request(request_queue_t * q, int rw,
			  struct buffer_head * bh)
{
	unsigned int sector, count;
	int max_segments = MAX_SEGMENTS;
	struct request * req = NULL, *freereq = NULL;
	int rw_ahead, max_sectors, el_ret;
	struct list_head *head;
	int latency;
	elevator_t *elevator = &q->elevator;

	count = bh->b_size >> 9;
	sector = bh->b_rsector;

	rw_ahead = 0;	/* normal case; gets changed below for READA */
	switch (rw) {
		case READA:
			rw_ahead = 1;
			rw = READ;	/* drop into READ */
		case READ:
		case WRITE:
			break;
		default:
			BUG();
			goto end_io;
	}

	/* We'd better have a real physical mapping!
	   Check this bit only if the buffer was dirty and just locked
	   down by us so at this point flushpage will block and
	   won't clear the mapped bit under us. */
	if (!buffer_mapped(bh))
		BUG();

	/*
	 * Temporary solution - in 2.5 this will be done by the lowlevel
	 * driver. Create a bounce buffer if the buffer data points into
	 * high memory - keep the original buffer otherwise.
	 */
#if CONFIG_HIGHMEM
	bh = create_bounce(rw, bh);
#endif
	/* look for a free request. */

	/*
	 * Try to coalesce the new request with old requests
	 */
	max_sectors = get_max_sectors(bh->b_rdev);

	latency = elevator_request_latency(elevator, rw);

	/*
	 * Now we acquire the request spinlock, we have to be mega careful
	 * not to schedule or do something nonatomic
	 */
again:
	spin_lock_irq(&io_request_lock);

	/*
	 * skip first entry, for devices with active queue head
	 */
	head = &q->queue_head;
	if (q->head_active && !q->plugged)
		head = head->next;

	if (list_empty(head)) {
		q->plug_device_fn(q, bh->b_rdev); /* is atomic */
		goto get_rq;
	}
	el_ret = elevator->elevator_merge_fn(q, &req, bh, rw, &max_sectors, &max_segments);
	switch (el_ret) {

		case ELEVATOR_BACK_MERGE:
			if (!q->back_merge_fn(q, req, bh, max_segments))
				break;
			req->bhtail->b_reqnext = bh;
			req->bhtail = bh;
			req->nr_sectors = req->hard_nr_sectors += count;
			req->e = elevator;
			drive_stat_acct(req->rq_dev, req->cmd, count, 0);
			attempt_back_merge(q, req, max_sectors, max_segments);
			goto out;

		case ELEVATOR_FRONT_MERGE:
			if (!q->front_merge_fn(q, req, bh, max_segments))
				break;
			bh->b_reqnext = req->bh;
			req->bh = bh;
			req->buffer = bh->b_data;
			req->current_nr_sectors = count;
			req->sector = req->hard_sector = sector;
			req->nr_sectors = req->hard_nr_sectors += count;
			req->e = elevator;
			drive_stat_acct(req->rq_dev, req->cmd, count, 0);
			attempt_front_merge(q, head, req, max_sectors, max_segments);
			goto out;

		/*
		 * elevator says don't/can't merge. get new request
		 */
		case ELEVATOR_NO_MERGE:
			break;

		default:
			printk("elevator returned crap (%d)\n", el_ret);
			BUG();
	}
	/*
	 * Grab a free request from the freelist.  Reads first try their
	 * own queue - if that is empty, we steal from the write list.
	 * Writes must block if the write list is empty, and read aheads
	 * are not crucial.
	 */
get_rq:
	if (freereq) {
		req = freereq;
		freereq = NULL;
	} else if ((req = get_request(q, rw)) == NULL) {
		spin_unlock_irq(&io_request_lock);
		if (rw_ahead)
			goto end_io;

		freereq = __get_request_wait(q, rw);
		goto again;
	}
	/* fill up the request-info, and add it to the queue */
	req->cmd = rw;
	req->errors = 0;
	req->hard_sector = req->sector = sector;
	req->hard_nr_sectors = req->nr_sectors = count;
	req->current_nr_sectors = count;
	req->nr_segments = 1;		/* Always 1 for a new request. */
	req->nr_hw_segments = 1;	/* Always 1 for a new request. */
	req->buffer = bh->b_data;
	req->sem = NULL;
	req->bh = bh;
	req->bhtail = bh;
	req->rq_dev = bh->b_rdev;
	req->e = elevator;
	add_request(q, req, head, latency);
out:
	if (!q->plugged)
		(q->request_fn)(q);
	if (freereq)
		blkdev_release_request(freereq);
	spin_unlock_irq(&io_request_lock);
	return 0;
end_io:
	bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
	return 0;
}
void generic_make_request (int rw, struct buffer_head * bh)
{
	int major = MAJOR(bh->b_rdev);
	request_queue_t *q;

	if (blk_size[major]) {
		unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1;
		unsigned int sector, count;

		count = bh->b_size >> 9;
		sector = bh->b_rsector;

		if (maxsector < count || maxsector - count < sector) {
			bh->b_state &= (1 << BH_Lock) | (1 << BH_Mapped);
			if (blk_size[major][MINOR(bh->b_rdev)]) {

				/* This may well happen - the kernel calls bread()
				   without checking the size of the device, e.g.,
				   when mounting a device. */
				printk(KERN_INFO
				       "attempt to access beyond end of device\n");
				printk(KERN_INFO "%s: rw=%d, want=%d, limit=%d\n",
				       kdevname(bh->b_rdev), rw,
				       (sector + count)>>1,
				       blk_size[major][MINOR(bh->b_rdev)]);
			}
			bh->b_end_io(bh, 0);
			return;
		}
	}

	/*
	 * Resolve the mapping until finished. (drivers are
	 * still free to implement/resolve their own stacking
	 * by explicitly returning 0)
	 */
	/* NOTE: we don't repeat the blk_size check for each new device.
	 * Stacking drivers are expected to know what they are doing.
	 */
	do {
		q = blk_get_queue(bh->b_rdev);
		if (!q) {
			printk(KERN_ERR
			       "generic_make_request: Trying to access nonexistent block-device %s (%ld)\n",
			       kdevname(bh->b_rdev), bh->b_rsector);
			buffer_IO_error(bh);
			break;
		}
	} while (q->make_request_fn(q, rw, bh));
}
/* This function can be used to request a number of buffers from a block
   device. Currently the only restriction is that all buffers must belong
   to the same device */

void ll_rw_block(int rw, int nr, struct buffer_head * bhs[])
{
	struct buffer_head *bh;
	unsigned int major;
	int correct_size;
	int i;

	major = MAJOR(bhs[0]->b_dev);

	/* Determine correct block size for this device. */
	correct_size = BLOCK_SIZE;
	if (blksize_size[major]) {
		i = blksize_size[major][MINOR(bhs[0]->b_dev)];
		if (i)
			correct_size = i;
	}

	/* Verify requested block sizes. */
	for (i = 0; i < nr; i++) {
		bh = bhs[i];
		if (bh->b_size != correct_size) {
			printk(KERN_NOTICE "ll_rw_block: device %s: "
			       "only %d-char blocks implemented (%u)\n",
			       kdevname(bhs[0]->b_dev),
			       correct_size, bh->b_size);
			goto sorry;
		}
	}

	if ((rw & WRITE) && is_read_only(bhs[0]->b_dev)) {
		printk(KERN_NOTICE "Can't write to read-only device %s\n",
		       kdevname(bhs[0]->b_dev));
		goto sorry;
	}

	for (i = 0; i < nr; i++) {
		bh = bhs[i];

		/* Only one thread can actually submit the I/O. */
		if (test_and_set_bit(BH_Lock, &bh->b_state))
			continue;

		set_bit(BH_Req, &bh->b_state);

		switch(rw) {
		case WRITE:
			if (!atomic_set_buffer_clean(bh))
				/* Hmmph! Nothing to write */
				goto end_io;
			__mark_buffer_clean(bh);
			kstat.pgpgout++;
			break;

		case READA:
		case READ:
			if (buffer_uptodate(bh))
				/* Hmmph! Already have it */
				goto end_io;
			kstat.pgpgin++;
			break;
		default:
			BUG();
	end_io:
			bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
			continue;
		}

		/*
		 * First step, 'identity mapping' - RAID or LVM might
		 * further remap this.
		 */
		bh->b_rdev = bh->b_dev;
		bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);

		generic_make_request(rw, bh);
	}
	return;

sorry:
	for (i = 0; i < nr; i++)
		buffer_IO_error(bhs[i]);
}
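
/*
 * Illustrative sketch (not part of this file): the classic way a caller
 * drives ll_rw_block() by hand - grab a buffer with getblk(), submit the
 * read, then sleep until the I/O completes.  In practice most callers use
 * bread(), which wraps exactly this sequence.  mydev_read_block() and the
 * device/block/size values are hypothetical.
 */
#if 0
static struct buffer_head *mydev_read_block(kdev_t dev, int block, int size)
{
	struct buffer_head *bh = getblk(dev, block, size);

	if (!buffer_uptodate(bh)) {
		ll_rw_block(READ, 1, &bh);	/* queue the single buffer */
		wait_on_buffer(bh);		/* sleep until b_end_io runs */
	}
	if (!buffer_uptodate(bh)) {		/* I/O error */
		brelse(bh);
		return NULL;
	}
	return bh;
}
#endif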
#ifdef CONFIG_STRAM_SWAP
extern int stram_device_init (void);
#endif
/*
 * First step of what used to be end_request
 *
 * Returns 0 when all buffers in the request have been completed: the
 * caller should then go on to end_that_request_last().
 * Returns 1 when there are still buffers left to transfer for this request.
 */
int end_that_request_first (struct request *req, int uptodate, char *name)
{
	struct buffer_head * bh;
	int nsect;

	req->errors = 0;
	if (!uptodate)
		printk("end_request: I/O error, dev %s (%s), sector %lu\n",
			kdevname(req->rq_dev), name, req->sector);

	if ((bh = req->bh) != NULL) {
		nsect = bh->b_size >> 9;
		req->bh = bh->b_reqnext;
		bh->b_reqnext = NULL;
		bh->b_end_io(bh, uptodate);
		if ((bh = req->bh) != NULL) {
			req->hard_sector += nsect;
			req->hard_nr_sectors -= nsect;
			req->sector = req->hard_sector;
			req->nr_sectors = req->hard_nr_sectors;

			req->current_nr_sectors = bh->b_size >> 9;
			if (req->nr_sectors < req->current_nr_sectors) {
				req->nr_sectors = req->current_nr_sectors;
				printk("end_request: buffer-list destroyed\n");
			}
			req->buffer = bh->b_data;
			return 1;
		}
	}
	return 0;
}
void end_that_request_last(struct request *req)
{
	if (req->e) {
		printk("end_that_request_last called with non-dequeued req\n");
		BUG();
	}
	if (req->sem != NULL)
		up(req->sem);

	blkdev_release_request(req);
}
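
/*
 * Illustrative sketch (not part of this file): how a driver's completion
 * path typically uses the two helpers above.  Each call to
 * end_that_request_first() retires the buffer at the head of the request;
 * once it returns 0 the request is finished with end_that_request_last().
 * mydev_complete() and the "ok" status are hypothetical; the request is
 * assumed to be dequeued with blkdev_dequeue_request() beforehand, as the
 * req->e check above requires, and io_request_lock is assumed to be held.
 */
#if 0
static void mydev_complete(struct request *req, int ok)
{
	while (end_that_request_first(req, ok, "mydev"))
		;	/* retire every buffer of this request */

	blkdev_dequeue_request(req);	/* take it off the request queue */
	end_that_request_last(req);
}
#endif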
int __init blk_dev_init(void)
{
	struct blk_dev_struct *dev;

	request_cachep = kmem_cache_create("blkdev_requests",
					   sizeof(struct request),
					   0, SLAB_HWCACHE_ALIGN, NULL, NULL);

	if (!request_cachep)
		panic("Can't create request pool slab cache\n");

	for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;)
		dev->queue = NULL;

	memset(ro_bits, 0, sizeof(ro_bits));
	memset(max_readahead, 0, sizeof(max_readahead));
	memset(max_sectors, 0, sizeof(max_sectors));
#ifdef CONFIG_AMIGA_Z2RAM
	z2_init();
#endif
#ifdef CONFIG_STRAM_SWAP
	stram_device_init();
#endif
#ifdef CONFIG_BLK_DEV_RAM
	rd_init();
#endif
#ifdef CONFIG_BLK_DEV_LOOP
	loop_init();
#endif
#ifdef CONFIG_ISP16_CDI
	isp16_init();
#endif
#if defined(CONFIG_IDE) && defined(CONFIG_BLK_DEV_IDE)
	ide_init();		/* this MUST precede hd_init */
#endif
#if defined(CONFIG_IDE) && defined(CONFIG_BLK_DEV_HD)
	hd_init();
#endif
#ifdef CONFIG_BLK_DEV_PS2
	ps2esdi_init();
#endif
#ifdef CONFIG_BLK_DEV_XD
	xd_init();
#endif
#ifdef CONFIG_BLK_DEV_MFM
	mfm_init();
#endif
#ifdef CONFIG_PARIDE
	{ extern void paride_init(void); paride_init(); };
#endif
#ifdef CONFIG_MAC_FLOPPY
	swim3_init();
#endif
#ifdef CONFIG_BLK_DEV_SWIM_IOP
	swimiop_init();
#endif
#ifdef CONFIG_AMIGA_FLOPPY
	amiga_floppy_init();
#endif
#ifdef CONFIG_ATARI_FLOPPY
	atari_floppy_init();
#endif
#ifdef CONFIG_BLK_DEV_FD
	floppy_init();
#else
#if defined(__i386__)	/* Do we even need this? */
	outb_p(0xc, 0x3f2);
#endif
#endif
#ifdef CONFIG_CDU31A
	cdu31a_init();
#endif
#ifdef CONFIG_ATARI_ACSI
	acsi_init();
#endif
#ifdef CONFIG_MCD
	mcd_init();
#endif
#ifdef CONFIG_MCDX
	mcdx_init();
#endif
#ifdef CONFIG_SBPCD
	sbpcd_init();
#endif
#ifdef CONFIG_AZTCD
	aztcd_init();
#endif
#ifdef CONFIG_CDU535
	sony535_init();
#endif
#ifdef CONFIG_GSCD
	gscd_init();
#endif
#ifdef CONFIG_CM206
	cm206_init();
#endif
#ifdef CONFIG_OPTCD
	optcd_init();
#endif
#ifdef CONFIG_SJCD
	sjcd_init();
#endif
#ifdef CONFIG_APBLOCK
	ap_init();
#endif
#ifdef CONFIG_DDV
	ddv_init();
#endif
#ifdef CONFIG_BLK_DEV_NBD
	nbd_init();
#endif
#ifdef CONFIG_MDISK
	mdisk_init();
#endif
#ifdef CONFIG_DASD
	dasd_init();
#endif
#ifdef CONFIG_SUN_JSFLASH
	jsfd_init();
#endif
#ifdef CONFIG_BLK_DEV_LVM
	lvm_init();
#endif
	return 0;
}
EXPORT_SYMBOL(io_request_lock);
EXPORT_SYMBOL(end_that_request_first);
EXPORT_SYMBOL(end_that_request_last);
EXPORT_SYMBOL(blk_init_queue);
EXPORT_SYMBOL(blk_get_queue);
EXPORT_SYMBOL(blk_cleanup_queue);
EXPORT_SYMBOL(blk_queue_headactive);
EXPORT_SYMBOL(blk_queue_pluggable);
EXPORT_SYMBOL(blk_queue_make_request);
EXPORT_SYMBOL(generic_make_request);
EXPORT_SYMBOL(blkdev_release_request);