/*
 * linux/fs/nfs/direct.c
 *
 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
 *
 * High-performance uncached I/O for the Linux NFS client
 *
 * There are important applications whose performance or correctness
 * depends on uncached access to file data.  Database clusters
 * (multiple copies of the same instance running on separate hosts)
 * implement their own cache coherency protocol that subsumes file
 * system cache protocols.  Applications that process datasets
 * considerably larger than the client's memory do not always benefit
 * from a local cache.  A streaming video server, for instance, has no
 * need to cache the contents of a file.
 *
 * When an application requests uncached I/O, all read and write requests
 * are made directly to the server; data stored or fetched via these
 * requests is not cached in the Linux page cache.  The client does not
 * correct unaligned requests from applications.  All requested bytes are
 * held on permanent storage before a direct write system call returns to
 * an application.
 *
 * Solaris implements an uncached I/O facility called directio() that
 * is used for backups and sequential I/O to very large files.  Solaris
 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
 * an undocumented mount option.
 *
 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
 * help from Andrew Morton.
 *
 * 18 Dec 2001	Initial implementation for 2.4  --cel
 * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
 * 08 Jun 2003	Port to 2.5 APIs  --cel
 * 31 Mar 2004	Handle direct I/O without VFS support  --cel
 */
39 #include <linux/config.h>
40 #include <linux/errno.h>
41 #include <linux/sched.h>
42 #include <linux/kernel.h>
43 #include <linux/smp_lock.h>
44 #include <linux/file.h>
45 #include <linux/pagemap.h>
47 #include <linux/nfs_fs.h>
48 #include <linux/nfs_page.h>
49 #include <linux/sunrpc/clnt.h>
51 #include <asm/system.h>
52 #include <asm/uaccess.h>
/* Debug facility used by the dprintk()/dfprintk() calls in this file. */
#define NFSDBG_FACILITY		NFSDBG_VFS

/* Size of an NFS write verifier: two 32-bit words. */
#define VERF_SIZE		(2 * sizeof(__u32))

/*
 * Cap on the size of one direct I/O segment (4096 pages worth of data),
 * so the page-count arithmetic in nfs_get_user_pages() cannot overflow.
 */
#define MAX_DIRECTIO_SIZE	(4096UL << PAGE_SHIFT)
60 * nfs_get_user_pages - find and set up pages underlying user's buffer
61 * rw: direction (read or write)
62 * user_addr: starting address of this segment of user's buffer
63 * count: size of this segment
64 * @pages: returned array of page struct pointers underlying user's buffer
67 nfs_get_user_pages(int rw
, unsigned long user_addr
, size_t size
,
71 unsigned long page_count
;
74 /* set an arbitrary limit to prevent arithmetic overflow */
75 if (size
> MAX_DIRECTIO_SIZE
)
78 page_count
= (user_addr
+ size
+ PAGE_SIZE
- 1) >> PAGE_SHIFT
;
79 page_count
-= user_addr
>> PAGE_SHIFT
;
81 array_size
= (page_count
* sizeof(struct page
*));
82 *pages
= kmalloc(array_size
, GFP_KERNEL
);
84 down_read(¤t
->mm
->mmap_sem
);
85 result
= get_user_pages(current
, current
->mm
, user_addr
,
86 page_count
, (rw
== READ
), 0,
88 up_read(¤t
->mm
->mmap_sem
);
/**
 * nfs_free_user_pages - tear down page struct array
 * @pages: array of page struct pointers underlying target buffer
 * npages: number of pinned pages to release
 * do_dirty: nonzero when the pages were written to (a read filled them),
 *	so they must be marked dirty before release
 *
 * Drops the reference taken by get_user_pages() on each page and frees
 * the pointer array allocated by nfs_get_user_pages().
 */
static void
nfs_free_user_pages(struct page **pages, int npages, int do_dirty)
{
	int i;

	for (i = 0; i < npages; i++) {
		if (do_dirty)
			set_page_dirty_lock(pages[i]);
		page_cache_release(pages[i]);
	}
	kfree(pages);
}
110 * nfs_direct_read_seg - Read in one iov segment. Generate separate
111 * read RPCs for each "rsize" bytes.
112 * @inode: target inode
113 * @ctx: target file open context
114 * user_addr: starting address of this segment of user's buffer
115 * count: size of this segment
116 * file_offset: offset in file to begin the operation
117 * @pages: array of addresses of page structs defining user's buffer
118 * nr_pages: size of pages array
121 nfs_direct_read_seg(struct inode
*inode
, struct nfs_open_context
*ctx
,
122 unsigned long user_addr
, size_t count
, loff_t file_offset
,
123 struct page
**pages
, int nr_pages
)
125 const unsigned int rsize
= NFS_SERVER(inode
)->rsize
;
128 struct nfs_read_data rdata
= {
136 .fattr
= &rdata
.fattr
,
140 rdata
.args
.pgbase
= user_addr
& ~PAGE_MASK
;
141 rdata
.args
.offset
= file_offset
;
145 rdata
.args
.count
= count
;
146 if (rdata
.args
.count
> rsize
)
147 rdata
.args
.count
= rsize
;
148 rdata
.args
.pages
= &pages
[curpage
];
150 dprintk("NFS: direct read: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
151 rdata
.args
.count
, (long long) rdata
.args
.offset
,
152 user_addr
+ tot_bytes
, rdata
.args
.pgbase
, curpage
);
155 result
= NFS_PROTO(inode
)->read(&rdata
);
161 if (result
== -EISDIR
)
170 rdata
.args
.offset
+= result
;
171 rdata
.args
.pgbase
+= result
;
172 curpage
+= rdata
.args
.pgbase
>> PAGE_SHIFT
;
173 rdata
.args
.pgbase
&= ~PAGE_MASK
;
175 } while (count
!= 0);
177 /* XXX: should we zero the rest of the user's buffer if we
184 * nfs_direct_read - For each iov segment, map the user's buffer
185 * then generate read RPCs.
186 * @inode: target inode
187 * @ctx: target file open context
188 * @iov: array of vectors that define I/O buffer
189 * file_offset: offset in file to begin the operation
190 * nr_segs: size of iovec array
192 * generic_file_direct_IO has already pushed out any non-direct
193 * writes so that this read will see them when we read from the
197 nfs_direct_read(struct inode
*inode
, struct nfs_open_context
*ctx
,
198 const struct iovec
*iov
, loff_t file_offset
,
199 unsigned long nr_segs
)
201 ssize_t tot_bytes
= 0;
202 unsigned long seg
= 0;
204 while ((seg
< nr_segs
) && (tot_bytes
>= 0)) {
208 const struct iovec
*vec
= &iov
[seg
++];
209 unsigned long user_addr
= (unsigned long) vec
->iov_base
;
210 size_t size
= vec
->iov_len
;
212 page_count
= nfs_get_user_pages(READ
, user_addr
, size
, &pages
);
213 if (page_count
< 0) {
214 nfs_free_user_pages(pages
, 0, 0);
220 result
= nfs_direct_read_seg(inode
, ctx
, user_addr
, size
,
221 file_offset
, pages
, page_count
);
223 nfs_free_user_pages(pages
, page_count
, 1);
231 file_offset
+= result
;
240 * nfs_direct_write_seg - Write out one iov segment. Generate separate
241 * write RPCs for each "wsize" bytes, then commit.
242 * @inode: target inode
243 * @ctx: target file open context
244 * user_addr: starting address of this segment of user's buffer
245 * count: size of this segment
246 * file_offset: offset in file to begin the operation
247 * @pages: array of addresses of page structs defining user's buffer
248 * nr_pages: size of pages array
251 nfs_direct_write_seg(struct inode
*inode
, struct nfs_open_context
*ctx
,
252 unsigned long user_addr
, size_t count
, loff_t file_offset
,
253 struct page
**pages
, int nr_pages
)
255 const unsigned int wsize
= NFS_SERVER(inode
)->wsize
;
257 int curpage
, need_commit
, result
, tot_bytes
;
258 struct nfs_writeverf first_verf
;
259 struct nfs_write_data wdata
= {
267 .fattr
= &wdata
.fattr
,
272 wdata
.args
.stable
= NFS_UNSTABLE
;
273 if (IS_SYNC(inode
) || NFS_PROTO(inode
)->version
== 2 || count
<= wsize
)
274 wdata
.args
.stable
= NFS_FILE_SYNC
;
276 nfs_begin_data_update(inode
);
282 wdata
.args
.pgbase
= user_addr
& ~PAGE_MASK
;
283 wdata
.args
.offset
= file_offset
;
285 wdata
.args
.count
= request
;
286 if (wdata
.args
.count
> wsize
)
287 wdata
.args
.count
= wsize
;
288 wdata
.args
.pages
= &pages
[curpage
];
290 dprintk("NFS: direct write: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n",
291 wdata
.args
.count
, (long long) wdata
.args
.offset
,
292 user_addr
+ tot_bytes
, wdata
.args
.pgbase
, curpage
);
295 result
= NFS_PROTO(inode
)->write(&wdata
);
305 memcpy(&first_verf
.verifier
, &wdata
.verf
.verifier
,
307 if (wdata
.verf
.committed
!= NFS_FILE_SYNC
) {
309 if (memcmp(&first_verf
.verifier
,
310 &wdata
.verf
.verifier
, VERF_SIZE
))
315 wdata
.args
.offset
+= result
;
316 wdata
.args
.pgbase
+= result
;
317 curpage
+= wdata
.args
.pgbase
>> PAGE_SHIFT
;
318 wdata
.args
.pgbase
&= ~PAGE_MASK
;
320 } while (request
!= 0);
323 * Commit data written so far, even in the event of an error
326 wdata
.args
.count
= tot_bytes
;
327 wdata
.args
.offset
= file_offset
;
330 result
= NFS_PROTO(inode
)->commit(&wdata
);
333 if (result
< 0 || memcmp(&first_verf
.verifier
,
334 &wdata
.verf
.verifier
,
341 nfs_end_data_update_defer(inode
);
346 wdata
.args
.stable
= NFS_FILE_SYNC
;
351 * nfs_direct_write - For each iov segment, map the user's buffer
352 * then generate write and commit RPCs.
353 * @inode: target inode
354 * @ctx: target file open context
355 * @iov: array of vectors that define I/O buffer
356 * file_offset: offset in file to begin the operation
357 * nr_segs: size of iovec array
359 * Upon return, generic_file_direct_IO invalidates any cached pages
360 * that non-direct readers might access, so they will pick up these
361 * writes immediately.
363 static int nfs_direct_write(struct inode
*inode
, struct nfs_open_context
*ctx
,
364 const struct iovec
*iov
, loff_t file_offset
,
365 unsigned long nr_segs
)
367 ssize_t tot_bytes
= 0;
368 unsigned long seg
= 0;
370 while ((seg
< nr_segs
) && (tot_bytes
>= 0)) {
374 const struct iovec
*vec
= &iov
[seg
++];
375 unsigned long user_addr
= (unsigned long) vec
->iov_base
;
376 size_t size
= vec
->iov_len
;
378 page_count
= nfs_get_user_pages(WRITE
, user_addr
, size
, &pages
);
379 if (page_count
< 0) {
380 nfs_free_user_pages(pages
, 0, 0);
386 result
= nfs_direct_write_seg(inode
, ctx
, user_addr
, size
,
387 file_offset
, pages
, page_count
);
388 nfs_free_user_pages(pages
, page_count
, 0);
396 file_offset
+= result
;
404 * nfs_direct_IO - NFS address space operation for direct I/O
405 * rw: direction (read or write)
406 * @iocb: target I/O control block
407 * @iov: array of vectors that define I/O buffer
408 * file_offset: offset in file to begin the operation
409 * nr_segs: size of iovec array
413 nfs_direct_IO(int rw
, struct kiocb
*iocb
, const struct iovec
*iov
,
414 loff_t file_offset
, unsigned long nr_segs
)
416 ssize_t result
= -EINVAL
;
417 struct file
*file
= iocb
->ki_filp
;
418 struct nfs_open_context
*ctx
;
419 struct dentry
*dentry
= file
->f_dentry
;
420 struct inode
*inode
= dentry
->d_inode
;
423 * No support for async yet
425 if (!is_sync_kiocb(iocb
))
428 ctx
= (struct nfs_open_context
*)file
->private_data
;
431 dprintk("NFS: direct_IO(read) (%s) off/no(%Lu/%lu)\n",
432 dentry
->d_name
.name
, file_offset
, nr_segs
);
434 result
= nfs_direct_read(inode
, ctx
, iov
,
435 file_offset
, nr_segs
);
438 dprintk("NFS: direct_IO(write) (%s) off/no(%Lu/%lu)\n",
439 dentry
->d_name
.name
, file_offset
, nr_segs
);
441 result
= nfs_direct_write(inode
, ctx
, iov
,
442 file_offset
, nr_segs
);
451 * nfs_file_direct_read - file direct read operation for NFS files
452 * @iocb: target I/O control block
453 * @buf: user's buffer into which to read data
454 * count: number of bytes to read
455 * pos: byte offset in file where reading starts
457 * We use this function for direct reads instead of calling
458 * generic_file_aio_read() in order to avoid gfar's check to see if
459 * the request starts before the end of the file. For that check
460 * to work, we must generate a GETATTR before each direct read, and
461 * even then there is a window between the GETATTR and the subsequent
462 * READ where the file size could change. So our preference is simply
463 * to do all reads the application wants, and the server will take
464 * care of managing the end of file boundary.
466 * This function also eliminates unnecessarily updating the file's
467 * atime locally, as the NFS server sets the file's atime, and this
468 * client must read the updated atime from the server back into its
472 nfs_file_direct_read(struct kiocb
*iocb
, char __user
*buf
, size_t count
, loff_t pos
)
474 ssize_t retval
= -EINVAL
;
475 loff_t
*ppos
= &iocb
->ki_pos
;
476 struct file
*file
= iocb
->ki_filp
;
477 struct nfs_open_context
*ctx
=
478 (struct nfs_open_context
*) file
->private_data
;
479 struct dentry
*dentry
= file
->f_dentry
;
480 struct address_space
*mapping
= file
->f_mapping
;
481 struct inode
*inode
= mapping
->host
;
487 dprintk("nfs: direct read(%s/%s, %lu@%lu)\n",
488 dentry
->d_parent
->d_name
.name
, dentry
->d_name
.name
,
489 (unsigned long) count
, (unsigned long) pos
);
491 if (!is_sync_kiocb(iocb
))
496 if (!access_ok(VERIFY_WRITE
, iov
.iov_base
, iov
.iov_len
))
502 if (mapping
->nrpages
) {
503 retval
= filemap_fdatawrite(mapping
);
505 retval
= filemap_fdatawait(mapping
);
510 retval
= nfs_direct_read(inode
, ctx
, &iov
, pos
, 1);
512 *ppos
= pos
+ retval
;
519 * nfs_file_direct_write - file direct write operation for NFS files
520 * @iocb: target I/O control block
521 * @buf: user's buffer from which to write data
522 * count: number of bytes to write
523 * pos: byte offset in file where writing starts
525 * We use this function for direct writes instead of calling
526 * generic_file_aio_write() in order to avoid taking the inode
527 * semaphore and updating the i_size. The NFS server will set
528 * the new i_size and this client must read the updated size
529 * back into its cache. We let the server do generic write
530 * parameter checking and report problems.
532 * We also avoid an unnecessary invocation of generic_osync_inode(),
533 * as it is fairly meaningless to sync the metadata of an NFS file.
535 * We eliminate local atime updates, see direct read above.
537 * We avoid unnecessary page cache invalidations for normal cached
538 * readers of this file.
540 * Note that O_APPEND is not supported for NFS direct writes, as there
541 * is no atomic O_APPEND write facility in the NFS protocol.
544 nfs_file_direct_write(struct kiocb
*iocb
, const char __user
*buf
, size_t count
, loff_t pos
)
546 ssize_t retval
= -EINVAL
;
547 loff_t
*ppos
= &iocb
->ki_pos
;
548 unsigned long limit
= current
->signal
->rlim
[RLIMIT_FSIZE
].rlim_cur
;
549 struct file
*file
= iocb
->ki_filp
;
550 struct nfs_open_context
*ctx
=
551 (struct nfs_open_context
*) file
->private_data
;
552 struct dentry
*dentry
= file
->f_dentry
;
553 struct address_space
*mapping
= file
->f_mapping
;
554 struct inode
*inode
= mapping
->host
;
556 .iov_base
= (char __user
*)buf
,
560 dfprintk(VFS
, "nfs: direct write(%s/%s(%ld), %lu@%lu)\n",
561 dentry
->d_parent
->d_name
.name
, dentry
->d_name
.name
,
562 inode
->i_ino
, (unsigned long) count
, (unsigned long) pos
);
564 if (!is_sync_kiocb(iocb
))
571 if (!access_ok(VERIFY_READ
, iov
.iov_base
, iov
.iov_len
))
574 retval
= file
->f_error
;
579 if (limit
!= RLIM_INFINITY
) {
581 send_sig(SIGXFSZ
, current
, 0);
584 if (count
> limit
- (unsigned long) pos
)
585 count
= limit
- (unsigned long) pos
;
591 if (mapping
->nrpages
) {
592 retval
= filemap_fdatawrite(mapping
);
594 retval
= filemap_fdatawait(mapping
);
599 retval
= nfs_direct_write(inode
, ctx
, &iov
, pos
, 1);
600 if (mapping
->nrpages
)
601 invalidate_inode_pages2(mapping
);
603 *ppos
= pos
+ retval
;