/*
 * Copyright (C) 2002, Linus Torvalds.
 *
 * 04Jul2002	akpm@zip.com.au
 * 11Sep2002	janetinc@us.ibm.com
 *		added readv/writev support.
 * 29Oct2002	akpm@zip.com.au
 *		rewrote bio_add_page() support.
 * 30Oct2002	pbadari@us.ibm.com
 *		added support for non-aligned IO.
 * 06Nov2002	pbadari@us.ibm.com
 *		added asynchronous IO support.
 */
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
#include <linux/wait.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/rwsem.h>
#include <linux/uio.h>
#include <asm/atomic.h>
/*
 * How many user pages to map in one call to get_user_pages().  This determines
 * the size of a structure on the stack.
 */
#define DIO_PAGES	64
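/*
 * Rough sizing note (arithmetic only, assuming the value of 64 defined
 * above): the pages[] array which this sizes in struct dio occupies
 * 64 * sizeof(struct page *) bytes - 256 bytes with 4-byte pointers and
 * 512 bytes with 8-byte pointers - which is the fixed cost paid for
 * batching get_user_pages() calls.
 */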
/*
 * This code generally works in units of "dio_blocks".  A dio_block is
 * somewhere between the hard sector size and the filesystem block size.  It
 * is determined on a per-invocation basis.  When talking to the filesystem
 * we need to convert dio_blocks to fs_blocks by scaling the dio_block quantity
 * down by dio->blkfactor.  Similarly, fs-blocksize quantities are converted
 * to dio_block quantities by shifting left by blkfactor.
 *
 * If blkfactor is zero then the user's request was aligned to the filesystem's
 * blocksize.
 */
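/*
 * Worked example (figures are illustrative only): a filesystem with
 * 4096-byte blocks has i_blkbits == 12.  If the caller's buffers and file
 * offset are only 512-byte aligned, the dio runs with blkbits == 9, so
 * blkfactor == 12 - 9 == 3.  A run of 16 dio_blocks then spans
 * 16 >> 3 == 2 fs_blocks, and one fs_block expands to 1 << 3 == 8 dio_blocks.
 */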
struct dio {
	/* BIO submission state */
	struct bio *bio;		/* bio under assembly */
	struct inode *inode;
	int rw;
	unsigned blkbits;		/* doesn't change */
	unsigned blkfactor;		/* When we're using an alignment which
					   is finer than the filesystem's soft
					   blocksize, this specifies how much
					   finer.  blkfactor=2 means 1/4-block
					   alignment.  Does not change */
	unsigned start_zero_done;	/* flag: sub-blocksize zeroing has
					   been performed at the start of a
					   write */
	int pages_in_io;		/* approximate total IO pages */
	sector_t block_in_file;		/* Current offset into the underlying
					   file in dio_block units. */
	unsigned blocks_available;	/* At block_in_file.  changes */
	sector_t final_block_in_request;/* doesn't change */
	unsigned first_block_in_page;	/* doesn't change.  Used only once */
	int boundary;			/* prev block is at a boundary */
	int reap_counter;		/* rate limit reaping */
	get_blocks_t *get_blocks;	/* block mapping function */
	sector_t final_block_in_bio;	/* current final block in bio + 1 */
	sector_t next_block_for_io;	/* next block to be put under IO,
					   in dio_blocks units */
	struct buffer_head map_bh;	/* last get_blocks() result */

	/*
	 * Deferred addition of a page to the dio.  These variables are
	 * private to dio_send_cur_page(), submit_page_section() and
	 * dio_bio_add_page().
	 */
	struct page *cur_page;		/* The page */
	unsigned cur_page_offset;	/* Offset into it, in bytes */
	unsigned cur_page_len;		/* Nr of bytes at cur_page_offset */
	sector_t cur_page_block;	/* Where it starts */

	/*
	 * Page fetching state.  These variables belong to dio_refill_pages().
	 */
	int curr_page;			/* changes */
	int total_pages;		/* doesn't change */
	unsigned long curr_user_address;/* changes */

	/*
	 * Page queue.  These variables belong to dio_refill_pages() and
	 * dio_get_page().
	 */
	struct page *pages[DIO_PAGES];	/* page buffer */
	unsigned head;			/* next page to process */
	unsigned tail;			/* last valid page + 1 */
	int page_errors;		/* errno from get_user_pages() */

	/* BIO completion state */
	atomic_t bio_count;		/* nr bios to be completed */
	atomic_t bios_in_flight;	/* nr bios in flight */
	spinlock_t bio_list_lock;	/* protects bio_list */
	struct bio *bio_list;		/* singly linked via bi_private */
	struct task_struct *waiter;	/* waiting task (NULL if none) */

	/* AIO related stuff */
	struct kiocb *iocb;		/* kiocb */
	int is_async;			/* is IO async ? */
	int result;			/* IO result */
};
/*
 * How many pages are in the queue?
 */
static inline unsigned dio_pages_present(struct dio *dio)
{
	return dio->tail - dio->head;
}
/*
 * Go grab and pin some userspace pages.  Typically we'll get 64 at a time.
 */
static int dio_refill_pages(struct dio *dio)
{
	int ret;
	int nr_pages;

	nr_pages = min(dio->total_pages - dio->curr_page, DIO_PAGES);
	down_read(&current->mm->mmap_sem);
	ret = get_user_pages(
		current,			/* Task for fault accounting */
		current->mm,			/* whose pages? */
		dio->curr_user_address,		/* Where from? */
		nr_pages,			/* How many pages? */
		dio->rw == READ,		/* Write to memory? */
		0,				/* force (?) */
		&dio->pages[0],			/* Put results here */
		NULL);				/* vmas */
	up_read(&current->mm->mmap_sem);

	if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
		/*
		 * A memory fault, but the filesystem has some outstanding
		 * mapped blocks.  We need to use those blocks up to avoid
		 * leaking stale data in the file.
		 */
		if (dio->page_errors == 0)
			dio->page_errors = ret;
		dio->pages[0] = ZERO_PAGE(dio->curr_user_address);
		dio->head = 0;
		dio->tail = 1;
		ret = 0;
	} else if (ret >= 0) {
		dio->curr_user_address += ret * PAGE_SIZE;
		dio->curr_page += ret;
		dio->head = 0;
		dio->tail = ret;
		ret = 0;
	}
	return ret;
}
/*
 * Get another userspace page.  Returns an ERR_PTR on error.  Pages are
 * buffered inside the dio so that we can call get_user_pages() against a
 * decent number of pages, less frequently.  To provide nicer use of the
 * L1 cache.
 */
static struct page *dio_get_page(struct dio *dio)
{
	if (dio_pages_present(dio) == 0) {
		int ret;

		ret = dio_refill_pages(dio);
		if (ret)
			return ERR_PTR(ret);
		BUG_ON(dio_pages_present(dio) == 0);
	}
	return dio->pages[dio->head++];
}
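/*
 * Typical caller pattern (sketch only - do_direct_IO() below is the real
 * user):
 *
 *	page = dio_get_page(dio);
 *	if (IS_ERR(page))
 *		return PTR_ERR(page);
 *	... generate (page, offset, len, block) mappings for this page ...
 *	page_cache_release(page);	(drops the get_user_pages() ref)
 */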
/*
 * Called when a BIO has been processed.  If the count goes to zero then IO is
 * complete and we can signal this to the AIO layer.
 */
static void finished_one_bio(struct dio *dio)
{
	if (atomic_dec_and_test(&dio->bio_count)) {
		if (dio->is_async) {
			aio_complete(dio->iocb, dio->result, 0);
			kfree(dio);
		}
	}
}
static int dio_bio_complete(struct dio *dio, struct bio *bio);
/*
 * Asynchronous IO callback.
 */
static int dio_bio_end_aio(struct bio *bio, unsigned int bytes_done, int error)
{
	struct dio *dio = bio->bi_private;

	/* cleanup the bio */
	dio_bio_complete(dio, bio);
	return 0;
}
/*
 * The BIO completion handler simply queues the BIO up for the process-context
 * handler.
 *
 * During I/O bi_private points at the dio.  After I/O, bi_private is used to
 * implement a singly-linked list of completed BIOs, at dio->bio_list.
 */
static int dio_bio_end_io(struct bio *bio, unsigned int bytes_done, int error)
{
	struct dio *dio = bio->bi_private;
	unsigned long flags;

	spin_lock_irqsave(&dio->bio_list_lock, flags);
	bio->bi_private = dio->bio_list;
	dio->bio_list = bio;
	atomic_dec(&dio->bios_in_flight);
	if (dio->waiter && atomic_read(&dio->bios_in_flight) == 0)
		wake_up_process(dio->waiter);
	spin_unlock_irqrestore(&dio->bio_list_lock, flags);
	return 0;
}
static int
dio_bio_alloc(struct dio *dio, struct block_device *bdev,
		sector_t first_sector, int nr_vecs)
{
	struct bio *bio;

	bio = bio_alloc(GFP_KERNEL, nr_vecs);
	bio->bi_bdev = bdev;
	bio->bi_sector = first_sector;
	if (dio->is_async)
		bio->bi_end_io = dio_bio_end_aio;
	else
		bio->bi_end_io = dio_bio_end_io;
	dio->bio = bio;
	return 0;
}
/*
 * In the AIO read case we speculatively dirty the pages before starting IO.
 * During IO completion, any of these pages which happen to have been written
 * back will be redirtied by bio_check_pages_dirty().
 */
static void dio_bio_submit(struct dio *dio)
{
	struct bio *bio = dio->bio;

	bio->bi_private = dio;
	atomic_inc(&dio->bio_count);
	atomic_inc(&dio->bios_in_flight);
	if (dio->is_async && dio->rw == READ)
		bio_set_pages_dirty(bio);
	submit_bio(dio->rw, bio);
}
/*
 * Release any resources in case of a failure
 */
static void dio_cleanup(struct dio *dio)
{
	while (dio_pages_present(dio))
		page_cache_release(dio_get_page(dio));
}
/*
 * Wait for the next BIO to complete.  Remove it and return it.
 */
static struct bio *dio_await_one(struct dio *dio)
{
	unsigned long flags;
	struct bio *bio;

	spin_lock_irqsave(&dio->bio_list_lock, flags);
	while (dio->bio_list == NULL) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (dio->bio_list == NULL) {
			dio->waiter = current;
			spin_unlock_irqrestore(&dio->bio_list_lock, flags);
			io_schedule();
			spin_lock_irqsave(&dio->bio_list_lock, flags);
			dio->waiter = NULL;
		}
		set_current_state(TASK_RUNNING);
	}
	bio = dio->bio_list;
	dio->bio_list = bio->bi_private;
	spin_unlock_irqrestore(&dio->bio_list_lock, flags);
	return bio;
}
/*
 * Process one completed BIO.  No locks are held.
 */
static int dio_bio_complete(struct dio *dio, struct bio *bio)
{
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec;
	int page_no;

	if (dio->is_async && dio->rw == READ) {
		bio_check_pages_dirty(bio);	/* transfers ownership */
	} else {
		for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
			struct page *page = bvec[page_no].bv_page;

			set_page_dirty_lock(page);
			page_cache_release(page);
		}
	}
	finished_one_bio(dio);
	return uptodate ? 0 : -EIO;
}
/*
 * Wait on and process all in-flight BIOs.
 */
static int dio_await_completion(struct dio *dio)
{
	int ret = 0;

	while (atomic_read(&dio->bio_count)) {
		struct bio *bio = dio_await_one(dio);
		int ret2;

		ret2 = dio_bio_complete(dio, bio);
		if (ret == 0)
			ret = ret2;
	}
	return ret;
}
/*
 * A really large O_DIRECT read or write can generate a lot of BIOs.  So
 * to keep the memory consumption sane we periodically reap any completed BIOs
 * during the BIO generation phase.
 *
 * This also helps to limit the peak amount of pinned userspace memory.
 */
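/*
 * Note, derived from the code below: dio_bio_reap() is invoked on each
 * dio_new_bio() call, but it only drains dio->bio_list once reap_counter
 * has counted 64 calls, so completed BIOs are reaped in batches rather
 * than one at a time.
 */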
static int dio_bio_reap(struct dio *dio)
{
	int ret = 0;

	if (dio->reap_counter++ >= 64) {
		while (dio->bio_list) {
			unsigned long flags;
			struct bio *bio;

			spin_lock_irqsave(&dio->bio_list_lock, flags);
			bio = dio->bio_list;
			dio->bio_list = bio->bi_private;
			spin_unlock_irqrestore(&dio->bio_list_lock, flags);
			ret = dio_bio_complete(dio, bio);
		}
		dio->reap_counter = 0;
	}
	return ret;
}
/*
 * Call into the fs to map some more disk blocks.  We record the current number
 * of available blocks at dio->blocks_available.  These are in units of the
 * fs blocksize, (1 << inode->i_blkbits).
 *
 * The fs is allowed to map lots of blocks at once.  If it wants to do that,
 * it uses the passed inode-relative block number as the file offset, as usual.
 *
 * get_blocks() is passed the number of i_blkbits-sized blocks which direct_io
 * has remaining to do.  The fs should not map more than this number of blocks.
 *
 * If the fs has mapped a lot of blocks, it should populate bh->b_size to
 * indicate how much contiguous disk space has been made available at
 * bh->b_blocknr.
 *
 * If *any* of the mapped blocks are new, then the fs must set buffer_new().
 * This isn't very efficient...
 *
 * In the case of filesystem holes: the fs may return an arbitrarily-large
 * hole by returning an appropriate value in b_size and by clearing
 * buffer_mapped().  However the direct-io code will only process holes one
 * block at a time - it will repeatedly call get_blocks() as it walks the hole.
 */
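/*
 * Worked example of the scaling done below (illustrative numbers only):
 * with blkfactor == 3 (512-byte dio_blocks, 4096-byte fs blocks),
 * block_in_file == 16 and final_block_in_request == 37 give
 * fs_startblk == 16 >> 3 == 2 and dio_count == 21.  fs_count starts as
 * 21 >> 3 == 2 and, since 21 & 7 is nonzero, is rounded up to 3, so the
 * filesystem is asked to map at most 3 fs-sized blocks starting at fs block 2.
 */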
static int get_more_blocks(struct dio *dio)
{
	int ret;
	struct buffer_head *map_bh = &dio->map_bh;
	sector_t fs_startblk;	/* Into file, in filesystem-sized blocks */
	unsigned long fs_count;	/* Number of filesystem-sized blocks */
	unsigned long dio_count;/* Number of dio_block-sized blocks */
	unsigned long blkmask;

	/*
	 * If there was a memory error and we've overwritten all the
	 * mapped blocks then we can now return that memory error
	 */
	ret = dio->page_errors;
	if (ret == 0) {
		BUG_ON(dio->block_in_file >= dio->final_block_in_request);
		fs_startblk = dio->block_in_file >> dio->blkfactor;
		dio_count = dio->final_block_in_request - dio->block_in_file;
		fs_count = dio_count >> dio->blkfactor;
		blkmask = (1 << dio->blkfactor) - 1;
		if (dio_count & blkmask)
			fs_count++;

		ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count,
				map_bh, dio->rw == WRITE);
	}
	return ret;
}
/*
 * There is no bio.  Make one now.
 */
static int dio_new_bio(struct dio *dio, sector_t start_sector)
{
	sector_t sector;
	int ret, nr_pages;

	ret = dio_bio_reap(dio);
	if (ret)
		goto out;
	sector = start_sector << (dio->blkbits - 9);
	nr_pages = min(dio->pages_in_io, bio_get_nr_vecs(dio->map_bh.b_bdev));
	BUG_ON(nr_pages <= 0);
	ret = dio_bio_alloc(dio, dio->map_bh.b_bdev, sector, nr_pages);
out:
	return ret;
}
/*
 * Attempt to put the current chunk of 'cur_page' into the current BIO.  If
 * that was successful then update final_block_in_bio and take a ref against
 * the just-added page.
 *
 * Return zero on success.  Non-zero means the caller needs to start a new BIO.
 */
static int dio_bio_add_page(struct dio *dio)
{
	int ret;

	ret = bio_add_page(dio->bio, dio->cur_page,
			dio->cur_page_len, dio->cur_page_offset);
	if (ret == dio->cur_page_len) {
		page_cache_get(dio->cur_page);
		dio->final_block_in_bio = dio->cur_page_block +
			(dio->cur_page_len >> dio->blkbits);
		ret = 0;
	} else {
		ret = 1;
	}
	return ret;
}
/*
 * Put cur_page under IO.  The section of cur_page which is described by
 * cur_page_offset,cur_page_len is put into a BIO.  The section of cur_page
 * starts on-disk at cur_page_block.
 *
 * We take a ref against the page here (on behalf of its presence in the bio).
 *
 * The caller of this function is responsible for removing cur_page from the
 * dio, and for dropping the refcount which came from that presence.
 */
static int dio_send_cur_page(struct dio *dio)
{
	int ret = 0;

	if (dio->bio) {
		/*
		 * See whether this new request is contiguous with the old
		 */
		if (dio->final_block_in_bio != dio->cur_page_block)
			dio_bio_submit(dio);
		/*
		 * Submit now if the underlying fs is about to perform a
		 * metadata read
		 */
		else if (dio->boundary)
			dio_bio_submit(dio);
	}

	if (dio->bio == NULL) {
		ret = dio_new_bio(dio, dio->cur_page_block);
		if (ret)
			goto out;
	}

	if (dio_bio_add_page(dio) != 0) {
		dio_bio_submit(dio);
		ret = dio_new_bio(dio, dio->cur_page_block);
		if (ret == 0)
			ret = dio_bio_add_page(dio);
	}
out:
	return ret;
}
/*
 * An autonomous function to put a chunk of a page under deferred IO.
 *
 * The caller doesn't actually know (or care) whether this piece of page is in
 * a BIO, or is under IO or whatever.  We just take care of all possible
 * situations here.  The separation between the logic of do_direct_IO() and
 * that of submit_page_section() is important for clarity.  Please don't break.
 *
 * The chunk of page starts on-disk at blocknr.
 *
 * We perform deferred IO, by recording the last-submitted page inside our
 * private part of the dio structure.  If possible, we just expand the IO
 * across that page here.
 *
 * If that doesn't work out then we put the old page into the bio and add this
 * page to the dio instead.
 */
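/*
 * Illustration of the deferral (hypothetical call sequence): with 512-byte
 * dio_blocks, three consecutive calls for the same page -
 * (offset 0, len 512, blocknr N), (512, 512, N+1), (1024, 512, N+2) -
 * all satisfy the "can we just grow" test below, so they are merged into a
 * single cur_page span of 1536 bytes and only reach a BIO when a
 * non-contiguous chunk (or a boundary block) finally arrives.
 */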
static int
submit_page_section(struct dio *dio, struct page *page,
		unsigned offset, unsigned len, sector_t blocknr)
{
	int ret = 0;

	/*
	 * Can we just grow the current page's presence in the dio?
	 */
	if (	(dio->cur_page == page) &&
		(dio->cur_page_offset + dio->cur_page_len == offset) &&
		(dio->cur_page_block +
			(dio->cur_page_len >> dio->blkbits) == blocknr)) {
		dio->cur_page_len += len;

		/*
		 * If dio->boundary then we want to schedule the IO now to
		 * avoid metadata seeks.
		 */
		if (dio->boundary) {
			ret = dio_send_cur_page(dio);
			page_cache_release(dio->cur_page);
			dio->cur_page = NULL;
		}
		goto out;
	}

	/*
	 * If there's a deferred page already there then send it.
	 */
	if (dio->cur_page) {
		ret = dio_send_cur_page(dio);
		page_cache_release(dio->cur_page);
		dio->cur_page = NULL;
		if (ret)
			goto out;
	}

	page_cache_get(page);		/* It is in dio */
	dio->cur_page = page;
	dio->cur_page_offset = offset;
	dio->cur_page_len = len;
	dio->cur_page_block = blocknr;
out:
	return ret;
}
/*
 * Clean any dirty buffers in the blockdev mapping which alias newly-created
 * file blocks.  Only called for S_ISREG files - blockdevs do not set
 * buffer_new().
 */
static void clean_blockdev_aliases(struct dio *dio)
{
	unsigned i;

	for (i = 0; i < dio->blocks_available; i++) {
		unmap_underlying_metadata(dio->map_bh.b_bdev,
					dio->map_bh.b_blocknr + i);
	}
}
/*
 * If we are not writing the entire block and get_block() allocated
 * the block for us, we need to fill-in the unused portion of the
 * block with zeros.  This happens only if user-buffer, fileoffset or
 * io length is not filesystem block-size multiple.
 *
 * `end' is zero if we're doing the start of the IO, 1 at the end of the
 * IO.
 */
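/*
 * Worked example (illustrative numbers): with blkfactor == 3 there are
 * 1 << 3 == 8 dio_blocks per fs block.  If block_in_file sits 5 dio_blocks
 * into a newly-allocated fs block (block_in_file & 7 == 5), then at the
 * start of the IO (end == 0) the 5 leading dio_blocks are zeroed, while at
 * the end of the IO (end == 1) the remaining 8 - 5 == 3 trailing dio_blocks
 * are zeroed instead.
 */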
static void dio_zero_block(struct dio *dio, int end)
{
	unsigned dio_blocks_per_fs_block;
	unsigned this_chunk_blocks;	/* In dio_blocks */
	unsigned this_chunk_bytes;
	struct page *page;

	dio->start_zero_done = 1;
	if (!dio->blkfactor || !buffer_new(&dio->map_bh))
		return;

	dio_blocks_per_fs_block = 1 << dio->blkfactor;
	this_chunk_blocks = dio->block_in_file & (dio_blocks_per_fs_block - 1);

	if (!this_chunk_blocks)
		return;

	/*
	 * We need to zero out part of an fs block.  It is either at the
	 * beginning or the end of the fs block.
	 */
	if (end)
		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;

	this_chunk_bytes = this_chunk_blocks << dio->blkbits;

	page = ZERO_PAGE(dio->curr_user_address);
	if (submit_page_section(dio, page, 0, this_chunk_bytes,
				dio->next_block_for_io))
		return;

	dio->next_block_for_io += this_chunk_blocks;
}
/*
 * Walk the user pages, and the file, mapping blocks to disk and generating
 * a sequence of (page,offset,len,block) mappings.  These mappings are injected
 * into submit_page_section(), which takes care of the next stage of submission.
 *
 * Direct IO against a blockdev is different from a file.  Because we can
 * happily perform page-sized but 512-byte aligned IOs.  It is important that
 * blockdev IO be able to have fine alignment and large sizes.
 *
 * So what we do is to permit the ->get_blocks function to populate bh.b_size
 * with the size of IO which is permitted at this offset and this i_blkbits.
 *
 * For best results, the blockdev should be set up with 512-byte i_blkbits and
 * it should set b_size to PAGE_SIZE or more inside get_blocks().  This gives
 * fine alignment but still allows this function to work in PAGE_SIZE units.
 */
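/*
 * Concrete illustration (the numbers are only an example): a blockdev set up
 * with 512-byte i_blkbits whose get_blocks() reports b_size == 4096 gives
 * blocks_available == 4096 >> 9 == 8 dio_blocks per mapping call, so the loop
 * below still consumes whole pages at a time even though the caller's
 * alignment is only 512 bytes.
 */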
static int do_direct_IO(struct dio *dio)
{
	const unsigned blkbits = dio->blkbits;
	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
	struct page *page;
	unsigned block_in_page;
	struct buffer_head *map_bh = &dio->map_bh;
	int ret = 0;

	/* The I/O can start at any block offset within the first page */
	block_in_page = dio->first_block_in_page;

	while (dio->block_in_file < dio->final_block_in_request) {
		page = dio_get_page(dio);
		if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			goto out;
		}

		while (block_in_page < blocks_per_page) {
			unsigned offset_in_page = block_in_page << blkbits;
			unsigned this_chunk_bytes;	/* # of bytes mapped */
			unsigned this_chunk_blocks;	/* # of blocks */
			unsigned u;

			if (dio->blocks_available == 0) {
				/*
				 * Need to go and map some more disk
				 */
				unsigned long blkmask;
				unsigned long dio_remainder;

				ret = get_more_blocks(dio);
				if (ret) {
					page_cache_release(page);
					goto out;
				}
				if (!buffer_mapped(map_bh))
					goto do_holes;

				dio->blocks_available =
						map_bh->b_size >> dio->blkbits;
				dio->next_block_for_io =
					map_bh->b_blocknr << dio->blkfactor;
				if (buffer_new(map_bh))
					clean_blockdev_aliases(dio);

				blkmask = (1 << dio->blkfactor) - 1;
				dio_remainder = (dio->block_in_file & blkmask);

				/*
				 * If we are at the start of IO and that IO
				 * starts partway into a fs-block,
				 * dio_remainder will be non-zero.  If the IO
				 * is a read then we can simply advance the IO
				 * cursor to the first block which is to be
				 * read.  But if the IO is a write and the
				 * block was newly allocated we cannot do that;
				 * the start of the fs block must be zeroed out
				 * on-disk
				 */
				if (!buffer_new(map_bh))
					dio->next_block_for_io += dio_remainder;
				dio->blocks_available -= dio_remainder;
			}
do_holes:
			if (!buffer_mapped(map_bh)) {
				char *kaddr;

				if (dio->block_in_file >=
					i_size_read(dio->inode)>>blkbits) {
					/* We hit eof */
					page_cache_release(page);
					goto out;
				}
				kaddr = kmap_atomic(page, KM_USER0);
				memset(kaddr + (block_in_page << blkbits),
						0, 1 << blkbits);
				flush_dcache_page(page);
				kunmap_atomic(kaddr, KM_USER0);
				dio->block_in_file++;
				block_in_page++;
				goto next_block;
			}

			/*
			 * If we're performing IO which has an alignment which
			 * is finer than the underlying fs, go check to see if
			 * we must zero out the start of this block.
			 */
			if (unlikely(dio->blkfactor && !dio->start_zero_done))
				dio_zero_block(dio, 0);

			/*
			 * Work out, in this_chunk_blocks, how much disk we
			 * can add to this page
			 */
			this_chunk_blocks = dio->blocks_available;
			u = (PAGE_SIZE - offset_in_page) >> blkbits;
			if (this_chunk_blocks > u)
				this_chunk_blocks = u;
			u = dio->final_block_in_request - dio->block_in_file;
			if (this_chunk_blocks > u)
				this_chunk_blocks = u;
			this_chunk_bytes = this_chunk_blocks << blkbits;
			BUG_ON(this_chunk_bytes == 0);

			dio->boundary = buffer_boundary(map_bh);
			ret = submit_page_section(dio, page, offset_in_page,
				this_chunk_bytes, dio->next_block_for_io);
			if (ret) {
				page_cache_release(page);
				goto out;
			}
			dio->next_block_for_io += this_chunk_blocks;

			dio->block_in_file += this_chunk_blocks;
			block_in_page += this_chunk_blocks;
			dio->blocks_available -= this_chunk_blocks;
next_block:
			if (dio->block_in_file > dio->final_block_in_request)
				BUG();
			if (dio->block_in_file == dio->final_block_in_request)
				break;
		}

		/* Drop the ref which was taken in get_user_pages() */
		page_cache_release(page);
		block_in_page = 0;
	}
out:
	return ret;
}
static int
direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
	const struct iovec *iov, loff_t offset, unsigned long nr_segs,
	unsigned blkbits, get_blocks_t get_blocks)
{
	unsigned long user_addr;
	int seg;
	int ret = 0;
	int ret2;
	size_t bytes;
	struct dio *dio;

	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
	if (!dio)
		return -ENOMEM;
	dio->is_async = !is_sync_kiocb(iocb);

	dio->bio = NULL;
	dio->inode = inode;
	dio->rw = rw;
	dio->blkbits = blkbits;
	dio->blkfactor = inode->i_blkbits - blkbits;
	dio->start_zero_done = 0;
	dio->block_in_file = offset >> blkbits;
	dio->blocks_available = 0;

	dio->cur_page = NULL;

	dio->reap_counter = 0;
	dio->get_blocks = get_blocks;
	dio->final_block_in_bio = -1;
	dio->next_block_for_io = -1;

	dio->page_errors = 0;
	dio->result = 0;
	dio->iocb = iocb;
	/*
	 * BIO completion state.
	 *
	 * ->bio_count starts out at one, and we decrement it to zero after all
	 * BIOs are submitted.  This is to avoid the situation where a really
	 * fast (or synchronous) device could take the count to zero while
	 * we're still submitting BIOs.
	 */
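	/*
	 * (The matching drop of this initial reference is the
	 * finished_one_bio() call made below, after the submission loop,
	 * in both the async and the synchronous paths.)
	 */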
	atomic_set(&dio->bio_count, 1);
	atomic_set(&dio->bios_in_flight, 0);
	spin_lock_init(&dio->bio_list_lock);
	dio->bio_list = NULL;
	dio->waiter = NULL;

	dio->pages_in_io = 0;
	for (seg = 0; seg < nr_segs; seg++)
		dio->pages_in_io += (iov[seg].iov_len >> blkbits) + 2;

	for (seg = 0; seg < nr_segs; seg++) {
		user_addr = (unsigned long)iov[seg].iov_base;
		bytes = iov[seg].iov_len;

		/* Index into the first page of the first block */
		dio->first_block_in_page = (user_addr & ~PAGE_MASK) >> blkbits;
		dio->final_block_in_request = dio->block_in_file +
						(bytes >> blkbits);
		/* Page fetching state */
		dio->head = 0;
		dio->tail = 0;
		dio->curr_page = 0;

		dio->total_pages = 0;
		if (user_addr & (PAGE_SIZE-1)) {
			dio->total_pages++;
			bytes -= PAGE_SIZE - (user_addr & (PAGE_SIZE - 1));
		}
		dio->total_pages += (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
		dio->curr_user_address = user_addr;

		ret = do_direct_IO(dio);

		dio->result += iov[seg].iov_len -
			((dio->final_block_in_request - dio->block_in_file) <<
					blkbits);

		if (ret) {
			dio_cleanup(dio);
			break;
		}
	} /* end iovec loop */

	/*
	 * There may be some unwritten disk at the end of a part-written
	 * fs-block-sized block.  Go zero that now.
	 */
	dio_zero_block(dio, 1);

	if (dio->cur_page) {
		ret2 = dio_send_cur_page(dio);
		if (ret == 0)
			ret = ret2;
		page_cache_release(dio->cur_page);
		dio->cur_page = NULL;
	}
	if (dio->bio)
		dio_bio_submit(dio);

	/*
	 * OK, all BIOs are submitted, so we can decrement bio_count to truly
	 * reflect the number of to-be-processed BIOs.
	 */
	if (dio->is_async) {
		ret = dio->result;		/* Bytes written */
		finished_one_bio(dio);		/* This can free the dio */
	} else {
		finished_one_bio(dio);
		ret2 = dio_await_completion(dio);
		if (ret == 0)
			ret = ret2;
		if (ret == 0)
			ret = dio->page_errors;
		if (ret == 0 && dio->result) {
			loff_t i_size = i_size_read(inode);

			ret = dio->result;
			/*
			 * Adjust the return value if the read crossed a
			 * non-block-aligned EOF.
			 */
			if (rw == READ && (offset + ret > i_size))
				ret = i_size - offset;
		}
		kfree(dio);
	}
	return ret;
}
/*
 * This is a library function for use by filesystem drivers.
 */
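/*
 * A minimal usage sketch (not part of this file; the filesystem name, its
 * get_blocks callback, and the exact ->direct_IO prototype are assumptions):
 * a filesystem typically points its address_space_operations ->direct_IO
 * method at a small wrapper which forwards to blockdev_direct_IO(), e.g.
 *
 *	static ssize_t myfs_direct_IO(int rw, struct kiocb *iocb,
 *			const struct iovec *iov, loff_t offset,
 *			unsigned long nr_segs)
 *	{
 *		struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
 *
 *		return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
 *					iov, offset, nr_segs, myfs_get_blocks);
 *	}
 */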
ssize_t
blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
	struct block_device *bdev, const struct iovec *iov, loff_t offset,
	unsigned long nr_segs, get_blocks_t get_blocks)
{
	int seg;
	size_t size;
	unsigned long addr;
	unsigned blkbits = inode->i_blkbits;
	unsigned bdev_blkbits = 0;
	unsigned blocksize_mask = (1 << blkbits) - 1;
	ssize_t retval = -EINVAL;

	bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));

	if (offset & blocksize_mask) {
		blkbits = bdev_blkbits;
		blocksize_mask = (1 << blkbits) - 1;
		if (offset & blocksize_mask)
			goto out;
	}

	/* Check the memory alignment.  Blocks cannot straddle pages */
	for (seg = 0; seg < nr_segs; seg++) {
		addr = (unsigned long)iov[seg].iov_base;
		size = iov[seg].iov_len;
		if ((addr & blocksize_mask) || (size & blocksize_mask)) {
			blkbits = bdev_blkbits;
			blocksize_mask = (1 << blkbits) - 1;
			if ((addr & blocksize_mask) || (size & blocksize_mask))
				goto out;
		}
	}

	retval = direct_io_worker(rw, iocb, inode, iov, offset,
				nr_segs, blkbits, get_blocks);
out:
	return retval;
}
);