/*
 * linux/fs/nfs/write.c
 *
 * Writing file data over NFS.
 *
 * We do it like this: When a (user) process wishes to write data to an
 * NFS file, a write request is allocated that contains the RPC task data
 * plus some info on the page to be written, and added to the inode's
 * write chain. If the process writes past the end of the page, an async
 * RPC call to write the page is scheduled immediately; otherwise, the call
 * is delayed for a few seconds.
 *
 * Just like readahead, no async I/O is performed if wsize < PAGE_SIZE.
 *
 * Write requests are kept on the inode's writeback list. Each entry in
 * that list references the page (portion) to be written. When the
 * cache timeout has expired, the RPC task is woken up, and tries to
 * lock the page. As soon as it manages to do so, the request is moved
 * from the writeback list to the writelock list.
 *
 * Note: we must make sure never to confuse the inode passed in the
 * write_page request with the one in page->inode. As far as I understand
 * it, these are different when doing a swap-out.
 *
 * To understand everything that goes on here and in the NFS read code,
 * one should be aware that a page is locked in exactly one of the following
 * cases:
 *
 *  - A write request is in progress.
 *  - A user process is in generic_file_write/nfs_update_page
 *  - A user process is in generic_file_read
 *
 * Also note that because of the way pages are invalidated in
 * nfs_revalidate_inode, the following assertions hold:
 *
 *  - If a page is dirty, there will be no read requests (a page will
 *    not be re-read unless invalidated by nfs_revalidate_inode).
 *  - If the page is not uptodate, there will be no pending write
 *    requests, and no process will be in nfs_update_page.
 *
 * FIXME: Interaction with the vmscan routines is not optimal yet.
 * Either vmscan must be made nfs-savvy, or we need a different page
 * reclaim concept that supports something like FS-independent
 * buffer_heads with a b_ops-> field.
 *
 * Copyright (C) 1996, 1997, Olaf Kirch <okir@monad.swb.de>
 */
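
/*
 * Rough lifetime of a delayed write (a summary of the code below, for
 * orientation): nfs_updatepage() either merges the new bytes into an
 * existing request via update_write_request() or builds a new one with
 * create_write_request(); schedule_write_request() then runs the RPC
 * task synchronously or parks it on the write_queue; nfs_wback_begin()
 * marshals the WRITE call arguments once the task runs; and
 * nfs_wback_result() updates the inode attributes and releases the
 * request.
 */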
#include <linux/types.h>
#include <linux/malloc.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/file.h>

#include <linux/sunrpc/clnt.h>
#include <linux/nfs_fs.h>
#include <asm/uaccess.h>

#define NFS_PARANOIA 1
#define NFSDBG_FACILITY		NFSDBG_PAGECACHE

static void	nfs_wback_begin(struct rpc_task *task);
static void	nfs_wback_result(struct rpc_task *task);
static void	nfs_cancel_request(struct nfs_wreq *req);
/*
 * Cache parameters
 */
#define NFS_WRITEBACK_DELAY	(10 * HZ)
#define NFS_WRITEBACK_MAX	64

/*
 * Limit number of delayed writes
 */
static int	nr_write_requests = 0;
static struct rpc_wait_queue write_queue = RPC_INIT_WAITQ("write_chain");
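
/*
 * How these limits are applied below: create_write_request() kicks the
 * write_queue once more than 3/4 of NFS_WRITEBACK_MAX requests are
 * outstanding, and schedule_write_request() stops queueing altogether,
 * falling back to a fully synchronous write, once the limit is reached
 * or the server is congested.
 */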
/* Hack for future NFS swap support */
#ifndef IS_SWAPFILE
# define IS_SWAPFILE(inode)	(0)
#endif
/*
 * Write a page synchronously.
 * Offset is the data offset within the page.
 */
static int
nfs_writepage_sync(struct dentry *dentry, struct inode *inode,
		   struct page *page, unsigned long offset, unsigned int count)
{
	unsigned int	wsize = NFS_SERVER(inode)->wsize;
	int		result, refresh = 0, written = 0;
	u8		*buffer;
	struct nfs_fattr fattr;

	dprintk("NFS: nfs_writepage_sync(%s/%s %d@%ld)\n",
		dentry->d_parent->d_name.name, dentry->d_name.name,
		count, page->offset + offset);

	buffer = (u8 *) page_address(page) + offset;
	offset += page->offset;

	do {
		if (count < wsize && !IS_SWAPFILE(inode))
			wsize = count;

		result = nfs_proc_write(NFS_DSERVER(dentry), NFS_FH(dentry),
					IS_SWAPFILE(inode), offset, wsize,
					buffer, &fattr);

		if (result < 0) {
			/* Must mark the page invalid after I/O error */
			clear_bit(PG_uptodate, &page->flags);
			goto io_error;
		}
		if (result != wsize)
			printk("NFS: short write, wsize=%u, result=%d\n",
					wsize, result);
		refresh = 1;
		buffer  += wsize;
		offset  += wsize;
		written += wsize;
		count   -= wsize;
		/*
		 * If we've extended the file, update the inode
		 * now so we don't invalidate the cache.
		 */
		if (offset > inode->i_size)
			inode->i_size = offset;
	} while (count);

io_error:
	/* Note: we don't refresh if the call failed (fattr invalid) */
	if (refresh && result >= 0) {
		/* See comments in nfs_wback_result */
		/* N.B. I don't think this is right -- sync writes in order */
		if (fattr.size < inode->i_size)
			fattr.size = inode->i_size;
		if (fattr.mtime.seconds < inode->i_mtime)
			printk("nfs_writepage_sync: prior time??\n");
		/* Solaris 2.5 server seems to send garbled
		 * fattrs occasionally */
		if (inode->i_ino == fattr.fileid) {
			/*
			 * We expect the mtime value to change, and
			 * don't want to invalidate the caches.
			 */
			inode->i_mtime = fattr.mtime.seconds;
			nfs_refresh_inode(inode, &fattr);
		}
		else
			printk("nfs_writepage_sync: inode %ld, got %u?\n",
				inode->i_ino, fattr.fileid);
	}

	return written? written : result;
}
/*
 * Append a writeback request to a list
 */
static inline void
append_write_request(struct nfs_wreq **q, struct nfs_wreq *wreq)
{
	dprintk("NFS: append_write_request(%p, %p)\n", q, wreq);
	rpc_append_list(q, wreq);
}

/*
 * Remove a writeback request from a list
 */
static inline void
remove_write_request(struct nfs_wreq **q, struct nfs_wreq *wreq)
{
	dprintk("NFS: remove_write_request(%p, %p)\n", q, wreq);
	rpc_remove_list(q, wreq);
}
/*
 * Find a non-busy write request for a given page to
 * try to combine with.
 */
static inline struct nfs_wreq *
find_write_request(struct inode *inode, struct page *page)
{
	pid_t pid = current->pid;
	struct nfs_wreq	*head, *req;

	dprintk("NFS: find_write_request(%x/%ld, %p)\n",
		inode->i_dev, inode->i_ino, page);
	if (!(req = head = NFS_WRITEBACK(inode)))
		return NULL;
	do {
		/*
		 * We can't combine with canceled requests or
		 * requests that have already been started..
		 */
		if (req->wb_flags & (NFS_WRITE_CANCELLED | NFS_WRITE_INPROGRESS))
			continue;

		if (req->wb_page == page && req->wb_pid == pid)
			return req;

		/*
		 * Ehh, don't keep too many tasks queued..
		 */
		rpc_wake_up_task(&req->wb_task);

	} while ((req = WB_NEXT(req)) != head);
	return NULL;
}
/*
 * Find and release all failed requests for this inode.
 */
int
nfs_check_failed_request(struct inode * inode)
{
	/* FIXME! */
	return 0;
}
/*
 * Try to merge adjacent write requests. This works only for requests
 * issued by the same user.
 */
static inline int
update_write_request(struct nfs_wreq *req, unsigned int first,
			unsigned int bytes)
{
	unsigned int	rqfirst = req->wb_offset,
			rqlast = rqfirst + req->wb_bytes,
			last = first + bytes;

	dprintk("nfs: trying to update write request %p\n", req);

	/* not contiguous? */
	if (rqlast < first || last < rqfirst)
		return 0;

	if (first < rqfirst)
		rqfirst = first;
	if (rqlast < last)
		rqlast = last;

	req->wb_offset = rqfirst;
	req->wb_bytes  = rqlast - rqfirst;
	req->wb_count++;

	return 1;
}
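
/*
 * Worked example (illustrative numbers): a request covering bytes
 * 0-1023 of the page (wb_offset=0, wb_bytes=1024), updated with
 * first=1024 and bytes=512, passes the contiguity test above
 * (rqlast == first) and grows to wb_offset=0, wb_bytes=1536. An
 * update at first=2048 would leave a hole, so it is rejected and
 * ends up in a request of its own.
 */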
static kmem_cache_t *nfs_wreq_cachep;

int nfs_init_wreqcache(void)
{
	nfs_wreq_cachep = kmem_cache_create("nfs_wreq",
					    sizeof(struct nfs_wreq),
					    0, SLAB_HWCACHE_ALIGN,
					    NULL, NULL);
	if (nfs_wreq_cachep == NULL)
		return -ENOMEM;
	return 0;
}
static inline void
free_write_request(struct nfs_wreq * req)
{
	if (!--req->wb_count)
		kmem_cache_free(nfs_wreq_cachep, req);
}
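
/*
 * Reference counting, as used below: create_write_request() starts
 * wb_count at 2 ("one for the IO, one for us"); nfs_updatepage() drops
 * its reference via free_write_request() when it is done with the
 * request, and nfs_wback_result() drops the I/O reference on
 * completion. update_write_request() takes an extra reference for each
 * merged update, which the merging caller releases the same way.
 */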
/*
 * Create and initialize a writeback request
 */
static inline struct nfs_wreq *
create_write_request(struct file * file, struct page *page, unsigned int offset, unsigned int bytes)
{
	struct dentry	*dentry = file->f_dentry;
	struct inode	*inode = dentry->d_inode;
	struct rpc_clnt	*clnt = NFS_CLIENT(inode);
	struct nfs_wreq *wreq;
	struct rpc_task	*task;

	dprintk("NFS: create_write_request(%s/%s, %ld+%d)\n",
		dentry->d_parent->d_name.name, dentry->d_name.name,
		page->offset + offset, bytes);

	/* FIXME: Enforce hard limit on number of concurrent writes? */
	wreq = kmem_cache_alloc(nfs_wreq_cachep, SLAB_KERNEL);
	if (!wreq)
		goto out_fail;
	memset(wreq, 0, sizeof(*wreq));

	task = &wreq->wb_task;
	rpc_init_task(task, clnt, nfs_wback_result, RPC_TASK_NFSWRITE);
	task->tk_calldata = wreq;
	task->tk_action = nfs_wback_begin;

	rpcauth_lookupcred(task);	/* Obtain user creds */
	if (task->tk_status < 0)
		goto out_req;

	/* Put the task on inode's writeback request list. */
	wreq->wb_file = file;
	wreq->wb_pid = current->pid;
	wreq->wb_page = page;
	init_waitqueue_head(&wreq->wb_wait);
	wreq->wb_offset = offset;
	wreq->wb_bytes = bytes;
	wreq->wb_count = 2;		/* One for the IO, one for us */

	append_write_request(&NFS_WRITEBACK(inode), wreq);

	if (nr_write_requests++ > NFS_WRITEBACK_MAX*3/4)
		rpc_wake_up_next(&write_queue);

	return wreq;

out_req:
	rpc_release_task(task);
	kmem_cache_free(nfs_wreq_cachep, wreq);
out_fail:
	return NULL;
}
/*
 * Schedule a writeback RPC call.
 * If the server is congested, don't add to our backlog of queued
 * requests but call it synchronously.
 * The function returns whether we should wait for the thing or not.
 *
 * FIXME: Here we could walk the inode's lock list to see whether the
 * page we're currently writing to has been write-locked by the caller.
 * If it is, we could schedule an async write request with a long
 * delay in order to avoid writing back the page until the lock is
 * released.
 */
static inline int
schedule_write_request(struct nfs_wreq *req, int sync)
{
	struct rpc_task	*task = &req->wb_task;
	struct file	*file = req->wb_file;
	struct dentry	*dentry = file->f_dentry;
	struct inode	*inode = dentry->d_inode;

	if (NFS_CONGESTED(inode) || nr_write_requests >= NFS_WRITEBACK_MAX)
		sync = 1;

	if (sync) {
		sigset_t	oldmask;
		struct rpc_clnt *clnt = NFS_CLIENT(inode);
		dprintk("NFS: %4d schedule_write_request (sync)\n",
			task->tk_pid);
		/* Page is already locked */
		rpc_clnt_sigmask(clnt, &oldmask);
		rpc_execute(task);
		rpc_clnt_sigunmask(clnt, &oldmask);
	} else {
		dprintk("NFS: %4d schedule_write_request (async)\n",
			task->tk_pid);
		task->tk_flags |= RPC_TASK_ASYNC;
		task->tk_timeout = NFS_WRITEBACK_DELAY;
		rpc_sleep_on(&write_queue, task, NULL, NULL);
	}

	return sync;
}
/*
 * Wait for request to complete.
 */
static int
wait_on_write_request(struct nfs_wreq *req)
{
	struct file	*file = req->wb_file;
	struct dentry	*dentry = file->f_dentry;
	struct inode	*inode = dentry->d_inode;
	struct rpc_clnt	*clnt = NFS_CLIENT(inode);
	DECLARE_WAITQUEUE(wait, current);
	sigset_t	oldmask;
	int		retval;

	/* Make sure it's started.. */
	if (!WB_INPROGRESS(req))
		rpc_wake_up_task(&req->wb_task);

	rpc_clnt_sigmask(clnt, &oldmask);
	add_wait_queue(&req->wb_wait, &wait);
	for (;;) {
		current->state = TASK_INTERRUPTIBLE;
		retval = 0;
		if (req->wb_flags & NFS_WRITE_COMPLETE)
			break;
		retval = -ERESTARTSYS;
		if (signalled())
			break;
		schedule();
	}
	remove_wait_queue(&req->wb_wait, &wait);
	current->state = TASK_RUNNING;
	rpc_clnt_sigunmask(clnt, &oldmask);
	return retval;
}
/*
 * Write a page to the server. This will be used for NFS swapping only
 * (for now), and we currently do this synchronously only.
 */
int
nfs_writepage(struct file * file, struct page *page)
{
	struct dentry *dentry = file->f_dentry;
	return nfs_writepage_sync(dentry, dentry->d_inode, page, 0, PAGE_SIZE);
}
/*
 * Update and possibly write a cached page of an NFS file.
 *
 * XXX: Keep an eye on generic_file_read to make sure it doesn't do bad
 * things with a page scheduled for an RPC call (e.g. invalidate it).
 */
int
nfs_updatepage(struct file *file, struct page *page, unsigned long offset, unsigned int count)
{
	struct dentry	*dentry = file->f_dentry;
	struct inode	*inode = dentry->d_inode;
	struct nfs_wreq	*req;
	int		synchronous = file->f_flags & O_SYNC;
	int		retval;

	dprintk("NFS: nfs_updatepage(%s/%s %d@%ld)\n",
		dentry->d_parent->d_name.name, dentry->d_name.name,
		count, page->offset+offset);

	/*
	 * Try to find a corresponding request on the writeback queue.
	 * If there is one, we can be sure that this request is not
	 * yet being processed, because we hold a lock on the page.
	 *
	 * If the request was created by us, update it. Otherwise,
	 * transfer the page lock and flush out the dirty page now.
	 * After returning, generic_file_write will wait on the
	 * page and retry the update.
	 */
	req = find_write_request(inode, page);
	if (req && req->wb_file == file && update_write_request(req, offset, count))
		goto updated;

	/*
	 * If wsize is smaller than page size, update and write
	 * page synchronously.
	 */
	if (NFS_SERVER(inode)->wsize < PAGE_SIZE)
		return nfs_writepage_sync(dentry, inode, page, offset, count);

	/* Create the write request. */
	req = create_write_request(file, page, offset, count);
	if (!req)
		return -ENOBUFS;

	/*
	 * Ok, there's another user of this page with the new request..
	 * The IO completion will then free the page and the dentry.
	 */
	atomic_inc(&page->count);
	file->f_count++;

	/* Schedule request */
	synchronous = schedule_write_request(req, synchronous);

updated:
	if (req->wb_bytes == PAGE_SIZE)
		set_bit(PG_uptodate, &page->flags);

	retval = count;
	if (synchronous) {
		int status = wait_on_write_request(req);
		if (status) {
			nfs_cancel_request(req);
			retval = status;
		} else {
			status = req->wb_status;
			if (status < 0)
				retval = status;
		}

		if (retval < 0)
			clear_bit(PG_uptodate, &page->flags);
	}

	free_write_request(req);
	return retval;
}
/*
 * Cancel a write request. We always mark it cancelled,
 * but if it's already in progress there's no point in
 * calling rpc_exit, and we don't want to overwrite the
 * tk_status field.
 */
static void
nfs_cancel_request(struct nfs_wreq *req)
{
	req->wb_flags |= NFS_WRITE_CANCELLED;
	if (!WB_INPROGRESS(req)) {
		rpc_exit(&req->wb_task, 0);
		rpc_wake_up_task(&req->wb_task);
	}
}
/*
 * Cancel all writeback requests, both pending and in progress.
 */
static void
nfs_cancel_dirty(struct inode *inode, pid_t pid)
{
	struct nfs_wreq *head, *req;

	req = head = NFS_WRITEBACK(inode);
	while (req != NULL) {
		if (pid == 0 || req->wb_pid == pid)
			nfs_cancel_request(req);
		if ((req = WB_NEXT(req)) == head)
			break;
	}
}
/*
 * If we're waiting on somebody else's request
 * we need to increment the counter during the
 * wait so that the request doesn't disappear
 * from under us during the wait..
 */
static int FASTCALL(wait_on_other_req(struct nfs_wreq *));
static int wait_on_other_req(struct nfs_wreq *req)
{
	int retval;
	req->wb_count++;
	retval = wait_on_write_request(req);
	free_write_request(req);
	return retval;
}
/*
 * This writes back a set of requests according to the condition.
 *
 * If this ever gets much more convoluted, use a fn pointer for
 * the condition..
 */
#define NFS_WB(inode, cond) { int retval = 0 ; \
	do { \
		struct nfs_wreq *req = NFS_WRITEBACK(inode); \
		struct nfs_wreq *head = req; \
		if (!req) break; \
		for (;;) { \
			if (!(req->wb_flags & NFS_WRITE_COMPLETE)) \
				if (cond) break; \
			req = WB_NEXT(req); \
			if (req == head) goto out; \
		} \
		retval = wait_on_other_req(req); \
	} while (!retval); \
out:	return retval; \
}
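
/*
 * Note on the macro: `cond' is evaluated with the macro's own `req'
 * variable in scope, which is how nfs_wb_page() and nfs_wb_file()
 * below can refer to req->wb_page and req->wb_file. The macro also
 * supplies the `out' label and the return statement, so it must form
 * the entire body of its caller.
 */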
int
nfs_wb_all(struct inode *inode)
{
	NFS_WB(inode, 1);
}

/*
 * Write back all requests on one page - we do this before reading it.
 */
int
nfs_wb_page(struct inode *inode, struct page *page)
{
	NFS_WB(inode, req->wb_page == page);
}

/*
 * Write back all pending writes from one file descriptor..
 */
int
nfs_wb_file(struct inode *inode, struct file *file)
{
	NFS_WB(inode, req->wb_file == file);
}

void
nfs_inval(struct inode *inode)
{
	nfs_cancel_dirty(inode,0);
}
/*
 * The following procedures make up the writeback finite state machinery:
 *
 * 1. Try to lock the page if not yet locked by us,
 *    set up the RPC call info, and pass to the call FSM.
 */
static void
nfs_wback_begin(struct rpc_task *task)
{
	struct nfs_wreq	*req = (struct nfs_wreq *) task->tk_calldata;
	struct page	*page = req->wb_page;
	struct file	*file = req->wb_file;
	struct dentry	*dentry = file->f_dentry;

	dprintk("NFS: %4d nfs_wback_begin (%s/%s, status=%d flags=%x)\n",
		task->tk_pid, dentry->d_parent->d_name.name,
		dentry->d_name.name, task->tk_status, req->wb_flags);

	task->tk_status = 0;

	/* Setup the task struct for a writeback call */
	req->wb_flags |= NFS_WRITE_INPROGRESS;
	req->wb_args.fh = NFS_FH(dentry);
	req->wb_args.offset = page->offset + req->wb_offset;
	req->wb_args.count = req->wb_bytes;
	req->wb_args.buffer = (void *) (page_address(page) + req->wb_offset);

	rpc_call_setup(task, NFSPROC_WRITE, &req->wb_args, &req->wb_fattr, 0);

	return;
}
/*
 * 2. Collect the result
 */
static void
nfs_wback_result(struct rpc_task *task)
{
	struct nfs_wreq *req = (struct nfs_wreq *) task->tk_calldata;
	struct file	*file = req->wb_file;
	struct page	*page = req->wb_page;
	int		status = task->tk_status;
	struct dentry	*dentry = file->f_dentry;
	struct inode	*inode = dentry->d_inode;

	dprintk("NFS: %4d nfs_wback_result (%s/%s, status=%d, flags=%x)\n",
		task->tk_pid, dentry->d_parent->d_name.name,
		dentry->d_name.name, status, req->wb_flags);

	/* Set the WRITE_COMPLETE flag, but leave WRITE_INPROGRESS set */
	req->wb_flags |= NFS_WRITE_COMPLETE;
	req->wb_status = status;

	if (status < 0) {
		req->wb_flags |= NFS_WRITE_INVALIDATE;
		file->f_error = status;
	} else if (!WB_CANCELLED(req)) {
		struct nfs_fattr *fattr = &req->wb_fattr;
		/* Update attributes as result of writeback.
		 * Beware: when UDP replies arrive out of order, we
		 * may end up overwriting a previous, bigger file size.
		 *
		 * When the file size shrinks we cancel all pending
		 * writebacks.
		 */
		if (fattr->mtime.seconds >= inode->i_mtime) {
			if (fattr->size < inode->i_size)
				fattr->size = inode->i_size;

			/* possible Solaris 2.5 server bug workaround */
			if (inode->i_ino == fattr->fileid) {
				/*
				 * We expect these values to change, and
				 * don't want to invalidate the caches.
				 */
				inode->i_size  = fattr->size;
				inode->i_mtime = fattr->mtime.seconds;
				nfs_refresh_inode(inode, fattr);
			}
			else
				printk("nfs_wback_result: inode %ld, got %u?\n",
					inode->i_ino, fattr->fileid);
		}
	}

	rpc_release_task(task);

	if (WB_INVALIDATE(req))
		clear_bit(PG_uptodate, &page->flags);

	__free_page(page);
	remove_write_request(&NFS_WRITEBACK(inode), req);
	nr_write_requests--;
	fput(req->wb_file);

	wake_up(&req->wb_wait);
	free_write_request(req);
}