eventfd/kaio integration fix
[linux-2.6.22.y-op.git] / fs / read_write.c
blob4d03008f015b9ae6d2565cdbc179d034687c3d08
/*
 *  linux/fs/read_write.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */
7 #include <linux/slab.h>
8 #include <linux/stat.h>
9 #include <linux/fcntl.h>
10 #include <linux/file.h>
11 #include <linux/uio.h>
12 #include <linux/smp_lock.h>
13 #include <linux/fsnotify.h>
14 #include <linux/security.h>
15 #include <linux/module.h>
16 #include <linux/syscalls.h>
17 #include <linux/pagemap.h>
18 #include "read_write.h"
20 #include <asm/uaccess.h>
21 #include <asm/unistd.h>
23 const struct file_operations generic_ro_fops = {
24 .llseek = generic_file_llseek,
25 .read = do_sync_read,
26 .aio_read = generic_file_aio_read,
27 .mmap = generic_file_readonly_mmap,
28 .sendfile = generic_file_sendfile,
31 EXPORT_SYMBOL(generic_ro_fops);
33 loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
35 long long retval;
36 struct inode *inode = file->f_mapping->host;
38 mutex_lock(&inode->i_mutex);
39 switch (origin) {
40 case SEEK_END:
41 offset += inode->i_size;
42 break;
43 case SEEK_CUR:
44 offset += file->f_pos;
46 retval = -EINVAL;
47 if (offset>=0 && offset<=inode->i_sb->s_maxbytes) {
48 if (offset != file->f_pos) {
49 file->f_pos = offset;
50 file->f_version = 0;
52 retval = offset;
54 mutex_unlock(&inode->i_mutex);
55 return retval;
58 EXPORT_SYMBOL(generic_file_llseek);
60 loff_t remote_llseek(struct file *file, loff_t offset, int origin)
62 long long retval;
64 lock_kernel();
65 switch (origin) {
66 case SEEK_END:
67 offset += i_size_read(file->f_path.dentry->d_inode);
68 break;
69 case SEEK_CUR:
70 offset += file->f_pos;
72 retval = -EINVAL;
73 if (offset>=0 && offset<=file->f_path.dentry->d_inode->i_sb->s_maxbytes) {
74 if (offset != file->f_pos) {
75 file->f_pos = offset;
76 file->f_version = 0;
78 retval = offset;
80 unlock_kernel();
81 return retval;
83 EXPORT_SYMBOL(remote_llseek);
85 loff_t no_llseek(struct file *file, loff_t offset, int origin)
87 return -ESPIPE;
89 EXPORT_SYMBOL(no_llseek);
91 loff_t default_llseek(struct file *file, loff_t offset, int origin)
93 long long retval;
95 lock_kernel();
96 switch (origin) {
97 case SEEK_END:
98 offset += i_size_read(file->f_path.dentry->d_inode);
99 break;
100 case SEEK_CUR:
101 offset += file->f_pos;
103 retval = -EINVAL;
104 if (offset >= 0) {
105 if (offset != file->f_pos) {
106 file->f_pos = offset;
107 file->f_version = 0;
109 retval = offset;
111 unlock_kernel();
112 return retval;
114 EXPORT_SYMBOL(default_llseek);
116 loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
118 loff_t (*fn)(struct file *, loff_t, int);
120 fn = no_llseek;
121 if (file->f_mode & FMODE_LSEEK) {
122 fn = default_llseek;
123 if (file->f_op && file->f_op->llseek)
124 fn = file->f_op->llseek;
126 return fn(file, offset, origin);
128 EXPORT_SYMBOL(vfs_llseek);
130 asmlinkage off_t sys_lseek(unsigned int fd, off_t offset, unsigned int origin)
132 off_t retval;
133 struct file * file;
134 int fput_needed;
136 retval = -EBADF;
137 file = fget_light(fd, &fput_needed);
138 if (!file)
139 goto bad;
141 retval = -EINVAL;
142 if (origin <= SEEK_MAX) {
143 loff_t res = vfs_llseek(file, offset, origin);
144 retval = res;
145 if (res != (loff_t)retval)
146 retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
148 fput_light(file, fput_needed);
149 bad:
150 return retval;
#ifdef __ARCH_WANT_SYS_LLSEEK
/*
 * sys_llseek - seek with a 64-bit offset passed as two halves
 * @fd:          descriptor to seek on
 * @offset_high: upper 32 bits of the requested offset
 * @offset_low:  lower part of the requested offset
 * @result:      user pointer that receives the new position
 * @origin:      seek mode; values above SEEK_MAX are rejected
 *
 * Returns 0 once the new position has been copied to @result, -EBADF
 * when fget_light() finds no file for @fd, -EINVAL for an unknown
 * @origin, the (negative) vfs_llseek() error truncated to int, or
 * -EFAULT when the copy to user space fails.
 */
asmlinkage long sys_llseek(unsigned int fd, unsigned long offset_high,
			   unsigned long offset_low, loff_t __user * result,
			   unsigned int origin)
{
	struct file *file;
	loff_t offset;
	int fput_needed;
	int retval;

	file = fget_light(fd, &fput_needed);
	if (!file)
		return -EBADF;

	if (origin > SEEK_MAX) {
		retval = -EINVAL;
	} else {
		offset = vfs_llseek(file,
				    ((loff_t) offset_high << 32) | offset_low,
				    origin);

		if (offset < 0)
			retval = (int)offset;
		else if (copy_to_user(result, &offset, sizeof(offset)))
			retval = -EFAULT;
		else
			retval = 0;
	}

	fput_light(file, fput_needed);
	return retval;
}
#endif
189 * rw_verify_area doesn't like huge counts. We limit
190 * them to something that fits in "int" so that others
191 * won't have to do range checks all the time.
193 #define MAX_RW_COUNT (INT_MAX & PAGE_CACHE_MASK)
195 int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
197 struct inode *inode;
198 loff_t pos;
200 inode = file->f_path.dentry->d_inode;
201 if (unlikely((ssize_t) count < 0))
202 goto Einval;
203 pos = *ppos;
204 if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
205 goto Einval;
207 if (unlikely(inode->i_flock && MANDATORY_LOCK(inode))) {
208 int retval = locks_mandatory_area(
209 read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
210 inode, file, pos, count);
211 if (retval < 0)
212 return retval;
214 return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
216 Einval:
217 return -EINVAL;
220 static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
222 set_current_state(TASK_UNINTERRUPTIBLE);
223 if (!kiocbIsKicked(iocb))
224 schedule();
225 else
226 kiocbClearKicked(iocb);
227 __set_current_state(TASK_RUNNING);
230 ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
232 struct iovec iov = { .iov_base = buf, .iov_len = len };
233 struct kiocb kiocb;
234 ssize_t ret;
236 init_sync_kiocb(&kiocb, filp);
237 kiocb.ki_pos = *ppos;
238 kiocb.ki_left = len;
240 for (;;) {
241 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
242 if (ret != -EIOCBRETRY)
243 break;
244 wait_on_retry_sync_kiocb(&kiocb);
247 if (-EIOCBQUEUED == ret)
248 ret = wait_on_sync_kiocb(&kiocb);
249 *ppos = kiocb.ki_pos;
250 return ret;
253 EXPORT_SYMBOL(do_sync_read);
255 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
257 ssize_t ret;
259 if (!(file->f_mode & FMODE_READ))
260 return -EBADF;
261 if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
262 return -EINVAL;
263 if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
264 return -EFAULT;
266 ret = rw_verify_area(READ, file, pos, count);
267 if (ret >= 0) {
268 count = ret;
269 ret = security_file_permission (file, MAY_READ);
270 if (!ret) {
271 if (file->f_op->read)
272 ret = file->f_op->read(file, buf, count, pos);
273 else
274 ret = do_sync_read(file, buf, count, pos);
275 if (ret > 0) {
276 fsnotify_access(file->f_path.dentry);
277 add_rchar(current, ret);
279 inc_syscr(current);
283 return ret;
286 EXPORT_SYMBOL(vfs_read);
288 ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
290 struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
291 struct kiocb kiocb;
292 ssize_t ret;
294 init_sync_kiocb(&kiocb, filp);
295 kiocb.ki_pos = *ppos;
296 kiocb.ki_left = len;
298 for (;;) {
299 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
300 if (ret != -EIOCBRETRY)
301 break;
302 wait_on_retry_sync_kiocb(&kiocb);
305 if (-EIOCBQUEUED == ret)
306 ret = wait_on_sync_kiocb(&kiocb);
307 *ppos = kiocb.ki_pos;
308 return ret;
311 EXPORT_SYMBOL(do_sync_write);
313 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
315 ssize_t ret;
317 if (!(file->f_mode & FMODE_WRITE))
318 return -EBADF;
319 if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
320 return -EINVAL;
321 if (unlikely(!access_ok(VERIFY_READ, buf, count)))
322 return -EFAULT;
324 ret = rw_verify_area(WRITE, file, pos, count);
325 if (ret >= 0) {
326 count = ret;
327 ret = security_file_permission (file, MAY_WRITE);
328 if (!ret) {
329 if (file->f_op->write)
330 ret = file->f_op->write(file, buf, count, pos);
331 else
332 ret = do_sync_write(file, buf, count, pos);
333 if (ret > 0) {
334 fsnotify_modify(file->f_path.dentry);
335 add_wchar(current, ret);
337 inc_syscw(current);
341 return ret;
344 EXPORT_SYMBOL(vfs_write);
346 static inline loff_t file_pos_read(struct file *file)
348 return file->f_pos;
351 static inline void file_pos_write(struct file *file, loff_t pos)
353 file->f_pos = pos;
356 asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
358 struct file *file;
359 ssize_t ret = -EBADF;
360 int fput_needed;
362 file = fget_light(fd, &fput_needed);
363 if (file) {
364 loff_t pos = file_pos_read(file);
365 ret = vfs_read(file, buf, count, &pos);
366 file_pos_write(file, pos);
367 fput_light(file, fput_needed);
370 return ret;
372 EXPORT_SYMBOL_GPL(sys_read);
374 asmlinkage ssize_t sys_write(unsigned int fd, const char __user * buf, size_t count)
376 struct file *file;
377 ssize_t ret = -EBADF;
378 int fput_needed;
380 file = fget_light(fd, &fput_needed);
381 if (file) {
382 loff_t pos = file_pos_read(file);
383 ret = vfs_write(file, buf, count, &pos);
384 file_pos_write(file, pos);
385 fput_light(file, fput_needed);
388 return ret;
391 asmlinkage ssize_t sys_pread64(unsigned int fd, char __user *buf,
392 size_t count, loff_t pos)
394 struct file *file;
395 ssize_t ret = -EBADF;
396 int fput_needed;
398 if (pos < 0)
399 return -EINVAL;
401 file = fget_light(fd, &fput_needed);
402 if (file) {
403 ret = -ESPIPE;
404 if (file->f_mode & FMODE_PREAD)
405 ret = vfs_read(file, buf, count, &pos);
406 fput_light(file, fput_needed);
409 return ret;
412 asmlinkage ssize_t sys_pwrite64(unsigned int fd, const char __user *buf,
413 size_t count, loff_t pos)
415 struct file *file;
416 ssize_t ret = -EBADF;
417 int fput_needed;
419 if (pos < 0)
420 return -EINVAL;
422 file = fget_light(fd, &fput_needed);
423 if (file) {
424 ret = -ESPIPE;
425 if (file->f_mode & FMODE_PWRITE)
426 ret = vfs_write(file, buf, count, &pos);
427 fput_light(file, fput_needed);
430 return ret;
/*
 * Reduce an iovec's length in-place.  Return the resulting number of segments
 *
 * @iov:     array of segments; the segment in which the running total
 *           reaches @to has its iov_len cut down so that the total is
 *           exactly @to
 * @nr_segs: number of entries in @iov
 * @to:      total length, in bytes, to shorten to
 *
 * Segments after the truncated one are left untouched; the return value
 * tells the caller how many leading segments now make up the vector.
 * When the segments sum to less than @to nothing is modified and
 * @nr_segs is returned.
 */
unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
{
	unsigned long seg = 0;
	size_t len = 0;

	while (seg < nr_segs) {
		seg++;
		/*
		 * len never exceeds to at this point, so to - len cannot
		 * underflow, whereas len + iov->iov_len could wrap around
		 * for a very large segment and skip the truncation.
		 */
		if (iov->iov_len >= to - len) {
			iov->iov_len = to - len;
			break;
		}
		len += iov->iov_len;
		iov++;
	}
	return seg;
}
453 ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
454 unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
456 struct kiocb kiocb;
457 ssize_t ret;
459 init_sync_kiocb(&kiocb, filp);
460 kiocb.ki_pos = *ppos;
461 kiocb.ki_left = len;
462 kiocb.ki_nbytes = len;
464 for (;;) {
465 ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
466 if (ret != -EIOCBRETRY)
467 break;
468 wait_on_retry_sync_kiocb(&kiocb);
471 if (ret == -EIOCBQUEUED)
472 ret = wait_on_sync_kiocb(&kiocb);
473 *ppos = kiocb.ki_pos;
474 return ret;
477 /* Do it by hand, with file-ops */
478 ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
479 unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
481 struct iovec *vector = iov;
482 ssize_t ret = 0;
484 while (nr_segs > 0) {
485 void __user *base;
486 size_t len;
487 ssize_t nr;
489 base = vector->iov_base;
490 len = vector->iov_len;
491 vector++;
492 nr_segs--;
494 nr = fn(filp, base, len, ppos);
496 if (nr < 0) {
497 if (!ret)
498 ret = nr;
499 break;
501 ret += nr;
502 if (nr != len)
503 break;
506 return ret;
509 /* A write operation does a read from user space and vice versa */
510 #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
512 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
513 unsigned long nr_segs, unsigned long fast_segs,
514 struct iovec *fast_pointer,
515 struct iovec **ret_pointer)
517 unsigned long seg;
518 ssize_t ret;
519 struct iovec *iov = fast_pointer;
522 * SuS says "The readv() function *may* fail if the iovcnt argument
523 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
524 * traditionally returned zero for zero segments, so...
526 if (nr_segs == 0) {
527 ret = 0;
528 goto out;
532 * First get the "struct iovec" from user memory and
533 * verify all the pointers
535 if (nr_segs > UIO_MAXIOV) {
536 ret = -EINVAL;
537 goto out;
539 if (nr_segs > fast_segs) {
540 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
541 if (iov == NULL) {
542 ret = -ENOMEM;
543 goto out;
546 if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
547 ret = -EFAULT;
548 goto out;
552 * According to the Single Unix Specification we should return EINVAL
553 * if an element length is < 0 when cast to ssize_t or if the
554 * total length would overflow the ssize_t return value of the
555 * system call.
557 ret = 0;
558 for (seg = 0; seg < nr_segs; seg++) {
559 void __user *buf = iov[seg].iov_base;
560 ssize_t len = (ssize_t)iov[seg].iov_len;
562 /* see if we we're about to use an invalid len or if
563 * it's about to overflow ssize_t */
564 if (len < 0 || (ret + len < ret)) {
565 ret = -EINVAL;
566 goto out;
568 if (unlikely(!access_ok(vrfy_dir(type), buf, len))) {
569 ret = -EFAULT;
570 goto out;
573 ret += len;
575 out:
576 *ret_pointer = iov;
577 return ret;
580 static ssize_t do_readv_writev(int type, struct file *file,
581 const struct iovec __user * uvector,
582 unsigned long nr_segs, loff_t *pos)
584 size_t tot_len;
585 struct iovec iovstack[UIO_FASTIOV];
586 struct iovec *iov = iovstack;
587 ssize_t ret;
588 io_fn_t fn;
589 iov_fn_t fnv;
591 if (!file->f_op) {
592 ret = -EINVAL;
593 goto out;
596 ret = rw_copy_check_uvector(type, uvector, nr_segs,
597 ARRAY_SIZE(iovstack), iovstack, &iov);
598 if (ret <= 0)
599 goto out;
601 tot_len = ret;
602 ret = rw_verify_area(type, file, pos, tot_len);
603 if (ret < 0)
604 goto out;
605 ret = security_file_permission(file, type == READ ? MAY_READ : MAY_WRITE);
606 if (ret)
607 goto out;
609 fnv = NULL;
610 if (type == READ) {
611 fn = file->f_op->read;
612 fnv = file->f_op->aio_read;
613 } else {
614 fn = (io_fn_t)file->f_op->write;
615 fnv = file->f_op->aio_write;
618 if (fnv)
619 ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
620 pos, fnv);
621 else
622 ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
624 out:
625 if (iov != iovstack)
626 kfree(iov);
627 if ((ret + (type == READ)) > 0) {
628 if (type == READ)
629 fsnotify_access(file->f_path.dentry);
630 else
631 fsnotify_modify(file->f_path.dentry);
633 return ret;
636 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
637 unsigned long vlen, loff_t *pos)
639 if (!(file->f_mode & FMODE_READ))
640 return -EBADF;
641 if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
642 return -EINVAL;
644 return do_readv_writev(READ, file, vec, vlen, pos);
647 EXPORT_SYMBOL(vfs_readv);
649 ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
650 unsigned long vlen, loff_t *pos)
652 if (!(file->f_mode & FMODE_WRITE))
653 return -EBADF;
654 if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
655 return -EINVAL;
657 return do_readv_writev(WRITE, file, vec, vlen, pos);
660 EXPORT_SYMBOL(vfs_writev);
662 asmlinkage ssize_t
663 sys_readv(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
665 struct file *file;
666 ssize_t ret = -EBADF;
667 int fput_needed;
669 file = fget_light(fd, &fput_needed);
670 if (file) {
671 loff_t pos = file_pos_read(file);
672 ret = vfs_readv(file, vec, vlen, &pos);
673 file_pos_write(file, pos);
674 fput_light(file, fput_needed);
677 if (ret > 0)
678 add_rchar(current, ret);
679 inc_syscr(current);
680 return ret;
683 asmlinkage ssize_t
684 sys_writev(unsigned long fd, const struct iovec __user *vec, unsigned long vlen)
686 struct file *file;
687 ssize_t ret = -EBADF;
688 int fput_needed;
690 file = fget_light(fd, &fput_needed);
691 if (file) {
692 loff_t pos = file_pos_read(file);
693 ret = vfs_writev(file, vec, vlen, &pos);
694 file_pos_write(file, pos);
695 fput_light(file, fput_needed);
698 if (ret > 0)
699 add_wchar(current, ret);
700 inc_syscw(current);
701 return ret;
704 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
705 size_t count, loff_t max)
707 struct file * in_file, * out_file;
708 struct inode * in_inode, * out_inode;
709 loff_t pos;
710 ssize_t retval;
711 int fput_needed_in, fput_needed_out;
714 * Get input file, and verify that it is ok..
716 retval = -EBADF;
717 in_file = fget_light(in_fd, &fput_needed_in);
718 if (!in_file)
719 goto out;
720 if (!(in_file->f_mode & FMODE_READ))
721 goto fput_in;
722 retval = -EINVAL;
723 in_inode = in_file->f_path.dentry->d_inode;
724 if (!in_inode)
725 goto fput_in;
726 if (!in_file->f_op || !in_file->f_op->sendfile)
727 goto fput_in;
728 retval = -ESPIPE;
729 if (!ppos)
730 ppos = &in_file->f_pos;
731 else
732 if (!(in_file->f_mode & FMODE_PREAD))
733 goto fput_in;
734 retval = rw_verify_area(READ, in_file, ppos, count);
735 if (retval < 0)
736 goto fput_in;
737 count = retval;
739 retval = security_file_permission (in_file, MAY_READ);
740 if (retval)
741 goto fput_in;
744 * Get output file, and verify that it is ok..
746 retval = -EBADF;
747 out_file = fget_light(out_fd, &fput_needed_out);
748 if (!out_file)
749 goto fput_in;
750 if (!(out_file->f_mode & FMODE_WRITE))
751 goto fput_out;
752 retval = -EINVAL;
753 if (!out_file->f_op || !out_file->f_op->sendpage)
754 goto fput_out;
755 out_inode = out_file->f_path.dentry->d_inode;
756 retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
757 if (retval < 0)
758 goto fput_out;
759 count = retval;
761 retval = security_file_permission (out_file, MAY_WRITE);
762 if (retval)
763 goto fput_out;
765 if (!max)
766 max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
768 pos = *ppos;
769 retval = -EINVAL;
770 if (unlikely(pos < 0))
771 goto fput_out;
772 if (unlikely(pos + count > max)) {
773 retval = -EOVERFLOW;
774 if (pos >= max)
775 goto fput_out;
776 count = max - pos;
779 retval = in_file->f_op->sendfile(in_file, ppos, count, file_send_actor, out_file);
781 if (retval > 0) {
782 add_rchar(current, retval);
783 add_wchar(current, retval);
786 inc_syscr(current);
787 inc_syscw(current);
788 if (*ppos > max)
789 retval = -EOVERFLOW;
791 fput_out:
792 fput_light(out_file, fput_needed_out);
793 fput_in:
794 fput_light(in_file, fput_needed_in);
795 out:
796 return retval;
799 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t __user *offset, size_t count)
801 loff_t pos;
802 off_t off;
803 ssize_t ret;
805 if (offset) {
806 if (unlikely(get_user(off, offset)))
807 return -EFAULT;
808 pos = off;
809 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
810 if (unlikely(put_user(pos, offset)))
811 return -EFAULT;
812 return ret;
815 return do_sendfile(out_fd, in_fd, NULL, count, 0);
818 asmlinkage ssize_t sys_sendfile64(int out_fd, int in_fd, loff_t __user *offset, size_t count)
820 loff_t pos;
821 ssize_t ret;
823 if (offset) {
824 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
825 return -EFAULT;
826 ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
827 if (unlikely(put_user(pos, offset)))
828 return -EFAULT;
829 return ret;
832 return do_sendfile(out_fd, in_fd, NULL, count, 0);