4 * Writing file data over NFS.
6 * We do it like this: When a (user) process wishes to write data to an
7 * NFS file, a write request is allocated that contains the RPC task data
8 * plus some info on the page to be written, and added to the inode's
9 * write chain. If the process writes past the end of the page, an async
10 * RPC call to write the page is scheduled immediately; otherwise, the call
11 * is delayed for a few seconds.
13 * Just like readahead, no async I/O is performed if wsize < PAGE_SIZE.
15 * Write requests are kept on the inode's writeback list. Each entry in
16 * that list references the page (portion) to be written. When the
17 * cache timeout has expired, the RPC task is woken up, and tries to
18 * lock the page. As soon as it manages to do so, the request is moved
19 * from the writeback list to the writelock list.
21 * Note: we must make sure never to confuse the inode passed in the
22 * write_page request with the one in page->inode. As far as I understand
23 * it, these are different when doing a swap-out.
25 * To understand everything that goes on here and in the NFS read code,
26 * one should be aware that a page is locked in exactly one of the following
29 * - A write request is in progress.
30 * - A user process is in generic_file_write/nfs_update_page
31 * - A user process is in generic_file_read
33 * Also note that because of the way pages are invalidated in
34 * nfs_revalidate_inode, the following assertions hold:
36 * - If a page is dirty, there will be no read requests (a page will
37 * not be re-read unless invalidated by nfs_revalidate_inode).
38 * - If the page is not uptodate, there will be no pending write
39 * requests, and no process will be in nfs_update_page.
41 * FIXME: Interaction with the vmscan routines is not optimal yet.
42 * Either vmscan must be made nfs-savvy, or we need a different page
43 * reclaim concept that supports something like FS-independent
44 * buffer_heads with a b_ops-> field.
46 * Copyright (C) 1996, 1997, Olaf Kirch <okir@monad.swb.de>
49 #include <linux/types.h>
50 #include <linux/malloc.h>
51 #include <linux/swap.h>
52 #include <linux/pagemap.h>
53 #include <linux/file.h>
55 #include <linux/sunrpc/clnt.h>
56 #include <linux/nfs_fs.h>
57 #include <asm/uaccess.h>
59 #define NFS_PARANOIA 1
60 #define NFSDBG_FACILITY NFSDBG_PAGECACHE
62 static void nfs_wback_begin(struct rpc_task
*task
);
63 static void nfs_wback_result(struct rpc_task
*task
);
64 static void nfs_cancel_request(struct nfs_wreq
*req
);
69 #define NFS_WRITEBACK_DELAY (10 * HZ)
70 #define NFS_WRITEBACK_MAX 64
73 * Limit number of delayed writes
75 static int nr_write_requests
= 0;
76 static struct rpc_wait_queue write_queue
= RPC_INIT_WAITQ("write_chain");
78 /* Hack for future NFS swap support */
80 # define IS_SWAPFILE(inode) (0)
/*
 * Write a page (portion) to the server with a synchronous RPC call.
 * `offset' is the data offset within the page; `count' the number of
 * bytes to write.  Returns the number of bytes written if any write
 * succeeded, otherwise the (negative) RPC error.
 *
 * NOTE(review): this fragment is garbled by extraction -- several
 * original lines (the `buffer' declaration, the write loop head, and
 * parts of the error handling) are missing; comments below annotate
 * only the visible statements.
 */
84 * Write a page synchronously.
85 * Offset is the data offset within the page.
88 nfs_writepage_sync(struct dentry
*dentry
, struct inode
*inode
,
89 struct page
*page
, unsigned long offset
, unsigned int count
)
91 unsigned int wsize
= NFS_SERVER(inode
)->wsize
;
92 int result
, refresh
= 0, written
= 0;
94 struct nfs_fattr fattr
;
96 dprintk("NFS: nfs_writepage_sync(%s/%s %d@%ld)\n",
97 dentry
->d_parent
->d_name
.name
, dentry
->d_name
.name
,
98 count
, page
->offset
+ offset
);
/* Source data lives inside the page; convert the page-relative
 * offset into an absolute file offset for the WRITE call. */
100 buffer
= (u8
*) page_address(page
) + offset
;
101 offset
+= page
->offset
;
/* NOTE(review): the consequent of this test is missing from the
 * fragment -- presumably it clamps wsize down to `count'; confirm
 * against the full file before relying on it. */
104 if (count
< wsize
&& !IS_SWAPFILE(inode
))
107 result
= nfs_proc_write(NFS_DSERVER(dentry
), NFS_FH(dentry
),
108 IS_SWAPFILE(inode
), offset
, wsize
,
112 /* Must mark the page invalid after I/O error */
113 clear_bit(PG_uptodate
, &page
->flags
);
117 printk("NFS: short write, wsize=%u, result=%d\n",
125 * If we've extended the file, update the inode
126 * now so we don't invalidate the cache.
128 if (offset
> inode
->i_size
)
129 inode
->i_size
= offset
;
133 /* Note: we don't refresh if the call failed (fattr invalid) */
134 if (refresh
&& result
>= 0) {
135 /* See comments in nfs_wback_result */
136 /* N.B. I don't think this is right -- sync writes in order */
137 if (fattr
.size
< inode
->i_size
)
138 fattr
.size
= inode
->i_size
;
139 if (fattr
.mtime
.seconds
< inode
->i_mtime
)
140 printk("nfs_writepage_sync: prior time??\n");
141 /* Solaris 2.5 server seems to send garbled
142 * fattrs occasionally */
143 if (inode
->i_ino
== fattr
.fileid
) {
145 * We expect the mtime value to change, and
146 * don't want to invalidate the caches.
148 inode
->i_mtime
= fattr
.mtime
.seconds
;
149 nfs_refresh_inode(inode
, &fattr
);
152 printk("nfs_writepage_sync: inode %ld, got %u?\n",
153 inode
->i_ino
, fattr
.fileid
);
/* Bytes written win over the error code when anything succeeded. */
156 return written
? written
: result
;
/*
 * Append a writeback request to the tail of the given request list.
 */
static inline void
append_write_request(struct nfs_wreq **q, struct nfs_wreq *wreq)
{
	dprintk("NFS: append_write_request(%p, %p)\n", q, wreq);
	rpc_append_list(q, wreq);
}
/*
 * Unlink a writeback request from the given request list.
 */
static inline void
remove_write_request(struct nfs_wreq **q, struct nfs_wreq *wreq)
{
	dprintk("NFS: remove_write_request(%p, %p)\n", q, wreq);
	rpc_remove_list(q, wreq);
}
/*
 * Scan the inode's writeback list for a request on the given page,
 * created by the current process, that can still be combined with
 * (i.e. not cancelled and not already handed to the RPC layer).
 *
 * NOTE(review): the loop head and the return statements are missing
 * from this fragment; only visible statements are annotated.
 */
180 * Find a non-busy write request for a given page to
181 * try to combine with.
183 static inline struct nfs_wreq
*
184 find_write_request(struct inode
*inode
, struct page
*page
)
186 pid_t pid
= current
->pid
;
187 struct nfs_wreq
*head
, *req
;
189 dprintk("NFS: find_write_request(%x/%ld, %p)\n",
190 inode
->i_dev
, inode
->i_ino
, page
);
/* Empty writeback list: nothing to combine with. */
191 if (!(req
= head
= NFS_WRITEBACK(inode
)))
195 * We can't combine with canceled requests or
196 * requests that have already been started..
198 if (req
->wb_flags
& (NFS_WRITE_CANCELLED
| NFS_WRITE_INPROGRESS
))
/* Match: same page, same originating process. */
201 if (req
->wb_page
== page
&& req
->wb_pid
== pid
)
205 * Ehh, don't keep too many tasks queued..
207 rpc_wake_up_task(&req
->wb_task
);
209 } while ((req
= WB_NEXT(req
)) != head
);
/*
 * NOTE(review): the body of this function is entirely missing from
 * the fragment -- only the comment and signature survived extraction.
 */
214 * Find and release all failed requests for this inode.
217 nfs_check_failed_request(struct inode
* inode
)
/*
 * Try to merge the byte range [first, first+bytes) into an existing
 * write request.  Fails (non-contiguous) when the new range neither
 * touches nor overlaps the request's current range.
 *
 * NOTE(review): the statements that actually extend rqfirst/rqlast to
 * cover the new range, and the return statements, are missing from
 * this fragment; verify against the full file.
 */
224 * Try to merge adjacent write requests. This works only for requests
225 * issued by the same user.
228 update_write_request(struct nfs_wreq
*req
, unsigned int first
,
/* Current extent of the request: [rqfirst, rqlast). */
231 unsigned int rqfirst
= req
->wb_offset
,
232 rqlast
= rqfirst
+ req
->wb_bytes
,
233 last
= first
+ bytes
;
235 dprintk("nfs: trying to update write request %p\n", req
);
237 /* not contiguous? */
238 if (rqlast
< first
|| last
< rqfirst
)
/* Store the (merged) extent back into the request. */
246 req
->wb_offset
= rqfirst
;
247 req
->wb_bytes
= rqlast
- rqfirst
;
/* Slab cache backing all struct nfs_wreq allocations. */
253 static kmem_cache_t
*nfs_wreq_cachep
;
/*
 * Create the nfs_wreq slab cache at module/boot init time.
 * NOTE(review): the trailing kmem_cache_create arguments and the
 * return statements are missing from this fragment.
 */
255 int nfs_init_wreqcache(void)
257 nfs_wreq_cachep
= kmem_cache_create("nfs_wreq",
258 sizeof(struct nfs_wreq
),
259 0, SLAB_HWCACHE_ALIGN
,
261 if (nfs_wreq_cachep
== NULL
)
267 free_write_request(struct nfs_wreq
* req
)
269 if (!--req
->wb_count
)
270 kmem_cache_free(nfs_wreq_cachep
, req
);
/*
 * Allocate and initialize a writeback request for `bytes' bytes at
 * `offset' within `page', bind an RPC task to it, and queue it on
 * the inode's writeback list.  Returns the request, or (presumably)
 * NULL on allocation/credential failure -- the failure-path return
 * lines are missing from this fragment.
 */
274 * Create and initialize a writeback request
276 static inline struct nfs_wreq
*
277 create_write_request(struct file
* file
, struct page
*page
, unsigned int offset
, unsigned int bytes
)
279 struct dentry
*dentry
= file
->f_dentry
;
280 struct inode
*inode
= dentry
->d_inode
;
281 struct rpc_clnt
*clnt
= NFS_CLIENT(inode
);
282 struct nfs_wreq
*wreq
;
283 struct rpc_task
*task
;
285 dprintk("NFS: create_write_request(%s/%s, %ld+%d)\n",
286 dentry
->d_parent
->d_name
.name
, dentry
->d_name
.name
,
287 page
->offset
+ offset
, bytes
);
289 /* FIXME: Enforce hard limit on number of concurrent writes? */
290 wreq
= kmem_cache_alloc(nfs_wreq_cachep
, SLAB_KERNEL
);
293 memset(wreq
, 0, sizeof(*wreq
));
/* The embedded RPC task starts in nfs_wback_begin and reports its
 * result through nfs_wback_result; tk_calldata links back to us. */
295 task
= &wreq
->wb_task
;
296 rpc_init_task(task
, clnt
, nfs_wback_result
, RPC_TASK_NFSWRITE
);
297 task
->tk_calldata
= wreq
;
298 task
->tk_action
= nfs_wback_begin
;
300 rpcauth_lookupcred(task
); /* Obtain user creds */
301 if (task
->tk_status
< 0)
304 /* Put the task on inode's writeback request list. */
305 wreq
->wb_file
= file
;
306 wreq
->wb_pid
= current
->pid
;
307 wreq
->wb_page
= page
;
308 init_waitqueue_head(&wreq
->wb_wait
);
309 wreq
->wb_offset
= offset
;
310 wreq
->wb_bytes
= bytes
;
311 wreq
->wb_count
= 2; /* One for the IO, one for us */
313 append_write_request(&NFS_WRITEBACK(inode
), wreq
);
/* Past 3/4 of the backlog limit: kick the next queued writer. */
315 if (nr_write_requests
++ > NFS_WRITEBACK_MAX
*3/4)
316 rpc_wake_up_next(&write_queue
);
/* Error path: undo the task/allocation (labels are missing from
 * this fragment). */
321 rpc_release_task(task
);
322 kmem_cache_free(nfs_wreq_cachep
, wreq
);
/*
 * Hand a writeback request to the RPC layer: synchronously when the
 * server is congested (or `sync' was requested -- the branch heads
 * are missing from this fragment), otherwise as a delayed async
 * task sleeping on write_queue.  Returns whether the caller should
 * wait for completion.
 */
328 * Schedule a writeback RPC call.
329 * If the server is congested, don't add to our backlog of queued
330 * requests but call it synchronously.
331 * The function returns whether we should wait for the thing or not.
333 * FIXME: Here we could walk the inode's lock list to see whether the
334 * page we're currently writing to has been write-locked by the caller.
335 * If it is, we could schedule an async write request with a long
336 * delay in order to avoid writing back the page until the lock is
340 schedule_write_request(struct nfs_wreq
*req
, int sync
)
342 struct rpc_task
*task
= &req
->wb_task
;
343 struct file
*file
= req
->wb_file
;
344 struct dentry
*dentry
= file
->f_dentry
;
345 struct inode
*inode
= dentry
->d_inode
;
/* Too much backlog or a congested server forces the sync path. */
347 if (NFS_CONGESTED(inode
) || nr_write_requests
>= NFS_WRITEBACK_MAX
)
352 struct rpc_clnt
*clnt
= NFS_CLIENT(inode
);
353 dprintk("NFS: %4d schedule_write_request (sync)\n",
355 /* Page is already locked */
/* Block client-relevant signals around the synchronous RPC
 * execution (the rpc_execute call itself is missing here). */
356 rpc_clnt_sigmask(clnt
, &oldmask
);
358 rpc_clnt_sigunmask(clnt
, &oldmask
);
360 dprintk("NFS: %4d schedule_write_request (async)\n",
/* Async path: mark the task async and let it sleep for
 * NFS_WRITEBACK_DELAY before the write goes out. */
362 task
->tk_flags
|= RPC_TASK_ASYNC
;
363 task
->tk_timeout
= NFS_WRITEBACK_DELAY
;
364 rpc_sleep_on(&write_queue
, task
, NULL
, NULL
);
/*
 * Sleep interruptibly until the request's NFS_WRITE_COMPLETE flag is
 * set, kicking the RPC task first if it has not started.  Returns 0,
 * or -ERESTARTSYS when interrupted by a signal.
 *
 * NOTE(review): the wait loop braces, the schedule() call, and the
 * signal test are missing from this fragment.
 */
371 * Wait for request to complete.
374 wait_on_write_request(struct nfs_wreq
*req
)
376 struct file
*file
= req
->wb_file
;
377 struct dentry
*dentry
= file
->f_dentry
;
378 struct inode
*inode
= dentry
->d_inode
;
379 struct rpc_clnt
*clnt
= NFS_CLIENT(inode
);
380 DECLARE_WAITQUEUE(wait
, current
);
384 /* Make sure it's started.. */
385 if (!WB_INPROGRESS(req
))
386 rpc_wake_up_task(&req
->wb_task
);
/* Mask client signals and join the request's wait queue. */
388 rpc_clnt_sigmask(clnt
, &oldmask
);
389 add_wait_queue(&req
->wb_wait
, &wait
);
391 current
->state
= TASK_INTERRUPTIBLE
;
/* Completion ends the wait; a pending signal aborts it. */
393 if (req
->wb_flags
& NFS_WRITE_COMPLETE
)
395 retval
= -ERESTARTSYS
;
400 remove_wait_queue(&req
->wb_wait
, &wait
);
401 current
->state
= TASK_RUNNING
;
402 rpc_clnt_sigunmask(clnt
, &oldmask
);
407 * Write a page to the server. This will be used for NFS swapping only
408 * (for now), and we currently do this synchronously only.
411 nfs_writepage(struct file
* file
, struct page
*page
)
413 struct dentry
*dentry
= file
->f_dentry
;
414 return nfs_writepage_sync(dentry
, dentry
->d_inode
, page
, 0, PAGE_SIZE
);
/*
 * Update a cached page of an NFS file after a write: merge into an
 * existing delayed request when possible, fall back to a synchronous
 * write for small wsize, otherwise create and schedule a new async
 * request (waiting on it when O_SYNC or scheduling says so).
 *
 * NOTE(review): several control-flow lines (returns, the O_SYNC wait
 * branch head, error handling) are missing from this fragment.
 */
418 * Update and possibly write a cached page of an NFS file.
420 * XXX: Keep an eye on generic_file_read to make sure it doesn't do bad
421 * things with a page scheduled for an RPC call (e.g. invalidate it).
424 nfs_updatepage(struct file
*file
, struct page
*page
, unsigned long offset
, unsigned int count
)
426 struct dentry
*dentry
= file
->f_dentry
;
427 struct inode
*inode
= dentry
->d_inode
;
428 struct nfs_wreq
*req
;
429 int synchronous
= file
->f_flags
& O_SYNC
;
432 dprintk("NFS: nfs_updatepage(%s/%s %d@%ld)\n",
433 dentry
->d_parent
->d_name
.name
, dentry
->d_name
.name
,
434 count
, page
->offset
+offset
);
437 * Try to find a corresponding request on the writeback queue.
438 * If there is one, we can be sure that this request is not
439 * yet being processed, because we hold a lock on the page.
441 * If the request was created by us, update it. Otherwise,
442 * transfer the page lock and flush out the dirty page now.
443 * After returning, generic_file_write will wait on the
444 * page and retry the update.
446 req
= find_write_request(inode
, page
);
/* Fast path: extend our own pending request in place. */
447 if (req
&& req
->wb_file
== file
&& update_write_request(req
, offset
, count
))
451 * If wsize is smaller than page size, update and write
452 * page synchronously.
454 if (NFS_SERVER(inode
)->wsize
< PAGE_SIZE
)
455 return nfs_writepage_sync(dentry
, inode
, page
, offset
, count
);
457 /* Create the write request. */
458 req
= create_write_request(file
, page
, offset
, count
);
463 * Ok, there's another user of this page with the new request..
464 * The IO completion will then free the page and the dentry.
466 atomic_inc(&page
->count
);
469 /* Schedule request */
470 synchronous
= schedule_write_request(req
, synchronous
);
/* A full-page write makes the whole page valid. */
473 if (req
->wb_bytes
== PAGE_SIZE
)
474 set_bit(PG_uptodate
, &page
->flags
);
/* Synchronous completion: wait, cancel on interruption, and take
 * the request's own status as the result. */
478 int status
= wait_on_write_request(req
);
480 nfs_cancel_request(req
);
483 status
= req
->wb_status
;
489 clear_bit(PG_uptodate
, &page
->flags
);
492 free_write_request(req
);
497 * Cancel a write request. We always mark it cancelled,
498 * but if it's already in progress there's no point in
499 * calling rpc_exit, and we don't want to overwrite the
503 nfs_cancel_request(struct nfs_wreq
*req
)
505 req
->wb_flags
|= NFS_WRITE_CANCELLED
;
506 if (!WB_INPROGRESS(req
)) {
507 rpc_exit(&req
->wb_task
, 0);
508 rpc_wake_up_task(&req
->wb_task
);
/*
 * Walk the inode's circular writeback list and cancel every request,
 * or -- when pid != 0 -- only those created by that process.
 *
 * NOTE(review): the loop's break/closing lines are missing from this
 * fragment.
 */
513 * Cancel all writeback requests, both pending and in progress.
516 nfs_cancel_dirty(struct inode
*inode
, pid_t pid
)
518 struct nfs_wreq
*head
, *req
;
520 req
= head
= NFS_WRITEBACK(inode
);
521 while (req
!= NULL
) {
/* pid == 0 means cancel unconditionally. */
522 if (pid
== 0 || req
->wb_pid
== pid
)
523 nfs_cancel_request(req
);
/* Circular list: stop once we wrap back to the head. */
524 if ((req
= WB_NEXT(req
)) == head
)
530 * If we're waiting on somebody else's request
531 * we need to increment the counter during the
532 * wait so that the request doesn't disappear
533 * from under us during the wait..
535 static int FASTCALL(wait_on_other_req(struct nfs_wreq
*));
536 static int wait_on_other_req(struct nfs_wreq
*req
)
540 retval
= wait_on_write_request(req
);
541 free_write_request(req
);
/*
 * NFS_WB(inode, cond): scan the inode's writeback list and wait (via
 * wait_on_other_req) on requests matching `cond' that have not yet
 * completed; yields 0 or the first wait error in `retval'.
 * NOTE(review): several continuation lines of this macro are missing
 * from the fragment -- do not edit without the full definition.
 */
546 * This writes back a set of requests according to the condition.
548 * If this ever gets much more convoluted, use a fn pointer for
551 #define NFS_WB(inode, cond) { int retval = 0 ; \
553 struct nfs_wreq *req = NFS_WRITEBACK(inode); \
554 struct nfs_wreq *head = req; \
557 if (!(req->wb_flags & NFS_WRITE_COMPLETE)) \
559 req = WB_NEXT(req); \
560 if (req == head) goto out; \
562 retval = wait_on_other_req(req); \
564 out: return retval; \
/*
 * Write back every pending request on this inode.
 * NOTE(review): the body is missing from this fragment -- presumably
 * an NFS_WB invocation with an always-true condition; verify.
 */
568 nfs_wb_all(struct inode
*inode
)
/*
 * Flush all pending write requests for one page, via the NFS_WB
 * scan-and-wait macro; done before the page is (re)read.
 */
574 * Write back all requests on one page - we do this before reading it.
577 nfs_wb_page(struct inode
*inode
, struct page
*page
)
579 NFS_WB(inode
, req
->wb_page
== page
);
/*
 * Flush all pending write requests issued through one open file,
 * via the NFS_WB scan-and-wait macro.
 */
583 * Write back all pending writes from one file descriptor..
586 nfs_wb_file(struct inode
*inode
, struct file
*file
)
588 NFS_WB(inode
, req
->wb_file
== file
);
/*
 * Invalidate the inode's dirty state by cancelling every writeback
 * request (pid 0 = cancel regardless of owner).
 */
592 nfs_inval(struct inode
*inode
)
594 nfs_cancel_dirty(inode
,0);
/*
 * Writeback FSM, step 1: the RPC task's initial tk_action.  Marks
 * the request as in progress and fills in the NFS WRITE arguments
 * (file handle, absolute file offset, byte count, data buffer in
 * the page) before passing the task to the RPC call machinery.
 *
 * NOTE(review): the page-locking logic mentioned in the comment and
 * the function's early-exit lines are missing from this fragment.
 */
598 * The following procedures make up the writeback finite state machinery:
600 * 1. Try to lock the page if not yet locked by us,
601 * set up the RPC call info, and pass to the call FSM.
604 nfs_wback_begin(struct rpc_task
*task
)
606 struct nfs_wreq
*req
= (struct nfs_wreq
*) task
->tk_calldata
;
607 struct page
*page
= req
->wb_page
;
608 struct file
*file
= req
->wb_file
;
609 struct dentry
*dentry
= file
->f_dentry
;
611 dprintk("NFS: %4d nfs_wback_begin (%s/%s, status=%d flags=%x)\n",
612 task
->tk_pid
, dentry
->d_parent
->d_name
.name
,
613 dentry
->d_name
.name
, task
->tk_status
, req
->wb_flags
);
617 /* Setup the task struct for a writeback call */
618 req
->wb_flags
|= NFS_WRITE_INPROGRESS
;
619 req
->wb_args
.fh
= NFS_FH(dentry
);
/* File offset = page base + request offset within the page. */
620 req
->wb_args
.offset
= page
->offset
+ req
->wb_offset
;
621 req
->wb_args
.count
= req
->wb_bytes
;
622 req
->wb_args
.buffer
= (void *) (page_address(page
) + req
->wb_offset
);
/* Hand off to the generic RPC call FSM; the reply fattr lands in
 * wb_fattr for nfs_wback_result to process. */
624 rpc_call_setup(task
, NFSPROC_WRITE
, &req
->wb_args
, &req
->wb_fattr
, 0);
/*
 * Writeback FSM, step 2: RPC completion callback.  Records the call
 * status on the request, updates the inode from the returned fattr
 * on success (guarding against out-of-order UDP replies shrinking
 * i_size and against the Solaris 2.5 garbled-fattr bug), invalidates
 * the page on error, then unlinks the request, wakes waiters, and
 * drops the I/O reference.
 *
 * NOTE(review): the error-branch head (presumably testing status),
 * the size-shrink cancellation path, and the page/accounting release
 * lines are missing from this fragment.
 */
630 * 2. Collect the result
633 nfs_wback_result(struct rpc_task
*task
)
635 struct nfs_wreq
*req
= (struct nfs_wreq
*) task
->tk_calldata
;
636 struct file
*file
= req
->wb_file
;
637 struct page
*page
= req
->wb_page
;
638 int status
= task
->tk_status
;
639 struct dentry
*dentry
= file
->f_dentry
;
640 struct inode
*inode
= dentry
->d_inode
;
642 dprintk("NFS: %4d nfs_wback_result (%s/%s, status=%d, flags=%x)\n",
643 task
->tk_pid
, dentry
->d_parent
->d_name
.name
,
644 dentry
->d_name
.name
, status
, req
->wb_flags
);
646 /* Set the WRITE_COMPLETE flag, but leave WRITE_INPROGRESS set */
647 req
->wb_flags
|= NFS_WRITE_COMPLETE
;
648 req
->wb_status
= status
;
/* Error path: flag the page for invalidation and report the error
 * through the file descriptor. */
651 req
->wb_flags
|= NFS_WRITE_INVALIDATE
;
652 file
->f_error
= status
;
653 } else if (!WB_CANCELLED(req
)) {
654 struct nfs_fattr
*fattr
= &req
->wb_fattr
;
655 /* Update attributes as result of writeback.
656 * Beware: when UDP replies arrive out of order, we
657 * may end up overwriting a previous, bigger file size.
659 * When the file size shrinks we cancel all pending
662 if (fattr
->mtime
.seconds
>= inode
->i_mtime
) {
663 if (fattr
->size
< inode
->i_size
)
664 fattr
->size
= inode
->i_size
;
666 /* possible Solaris 2.5 server bug workaround */
667 if (inode
->i_ino
== fattr
->fileid
) {
669 * We expect these values to change, and
670 * don't want to invalidate the caches.
672 inode
->i_size
= fattr
->size
;
673 inode
->i_mtime
= fattr
->mtime
.seconds
;
674 nfs_refresh_inode(inode
, fattr
);
677 printk("nfs_wback_result: inode %ld, got %u?\n",
678 inode
->i_ino
, fattr
->fileid
);
682 rpc_release_task(task
);
684 if (WB_INVALIDATE(req
))
685 clear_bit(PG_uptodate
, &page
->flags
);
/* Unlink from the inode's list, wake anyone sleeping in
 * wait_on_write_request, and drop the I/O reference. */
688 remove_write_request(&NFS_WRITEBACK(inode
), req
);
692 wake_up(&req
->wb_wait
);
693 free_write_request(req
);