inotify: fix race
[linux-2.6.22.y-op.git] / fs / eventfd.c
blob2ce19c000d2adb40afc17205edcb937cbb84d730
1 /*
2 * fs/eventfd.c
4 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
6 */
8 #include <linux/file.h>
9 #include <linux/poll.h>
10 #include <linux/init.h>
11 #include <linux/fs.h>
12 #include <linux/sched.h>
13 #include <linux/kernel.h>
14 #include <linux/list.h>
15 #include <linux/spinlock.h>
16 #include <linux/anon_inodes.h>
17 #include <linux/eventfd.h>
19 struct eventfd_ctx {
20 wait_queue_head_t wqh;
22 * Every time that a write(2) is performed on an eventfd, the
23 * value of the __u64 being written is added to "count" and a
24 * wakeup is performed on "wqh". A read(2) will return the "count"
25 * value to userspace, and will reset "count" to zero. The kernel
26 * size eventfd_signal() also, adds to the "count" counter and
27 * issue a wakeup.
29 __u64 count;
33 * Adds "n" to the eventfd counter "count". Returns "n" in case of
34 * success, or a value lower then "n" in case of coutner overflow.
35 * This function is supposed to be called by the kernel in paths
36 * that do not allow sleeping. In this function we allow the counter
37 * to reach the ULLONG_MAX value, and we signal this as overflow
38 * condition by returining a POLLERR to poll(2).
40 int eventfd_signal(struct file *file, int n)
42 struct eventfd_ctx *ctx = file->private_data;
43 unsigned long flags;
45 if (n < 0)
46 return -EINVAL;
47 spin_lock_irqsave(&ctx->wqh.lock, flags);
48 if (ULLONG_MAX - ctx->count < n)
49 n = (int) (ULLONG_MAX - ctx->count);
50 ctx->count += n;
51 if (waitqueue_active(&ctx->wqh))
52 wake_up_locked(&ctx->wqh);
53 spin_unlock_irqrestore(&ctx->wqh.lock, flags);
55 return n;
58 static int eventfd_release(struct inode *inode, struct file *file)
60 kfree(file->private_data);
61 return 0;
64 static unsigned int eventfd_poll(struct file *file, poll_table *wait)
66 struct eventfd_ctx *ctx = file->private_data;
67 unsigned int events = 0;
68 unsigned long flags;
70 poll_wait(file, &ctx->wqh, wait);
72 spin_lock_irqsave(&ctx->wqh.lock, flags);
73 if (ctx->count > 0)
74 events |= POLLIN;
75 if (ctx->count == ULLONG_MAX)
76 events |= POLLERR;
77 if (ULLONG_MAX - 1 > ctx->count)
78 events |= POLLOUT;
79 spin_unlock_irqrestore(&ctx->wqh.lock, flags);
81 return events;
84 static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
85 loff_t *ppos)
87 struct eventfd_ctx *ctx = file->private_data;
88 ssize_t res;
89 __u64 ucnt;
90 DECLARE_WAITQUEUE(wait, current);
92 if (count < sizeof(ucnt))
93 return -EINVAL;
94 spin_lock_irq(&ctx->wqh.lock);
95 res = -EAGAIN;
96 ucnt = ctx->count;
97 if (ucnt > 0)
98 res = sizeof(ucnt);
99 else if (!(file->f_flags & O_NONBLOCK)) {
100 __add_wait_queue(&ctx->wqh, &wait);
101 for (res = 0;;) {
102 set_current_state(TASK_INTERRUPTIBLE);
103 if (ctx->count > 0) {
104 ucnt = ctx->count;
105 res = sizeof(ucnt);
106 break;
108 if (signal_pending(current)) {
109 res = -ERESTARTSYS;
110 break;
112 spin_unlock_irq(&ctx->wqh.lock);
113 schedule();
114 spin_lock_irq(&ctx->wqh.lock);
116 __remove_wait_queue(&ctx->wqh, &wait);
117 __set_current_state(TASK_RUNNING);
119 if (res > 0) {
120 ctx->count = 0;
121 if (waitqueue_active(&ctx->wqh))
122 wake_up_locked(&ctx->wqh);
124 spin_unlock_irq(&ctx->wqh.lock);
125 if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
126 return -EFAULT;
128 return res;
131 static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
132 loff_t *ppos)
134 struct eventfd_ctx *ctx = file->private_data;
135 ssize_t res;
136 __u64 ucnt;
137 DECLARE_WAITQUEUE(wait, current);
139 if (count < sizeof(ucnt))
140 return -EINVAL;
141 if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
142 return -EFAULT;
143 if (ucnt == ULLONG_MAX)
144 return -EINVAL;
145 spin_lock_irq(&ctx->wqh.lock);
146 res = -EAGAIN;
147 if (ULLONG_MAX - ctx->count > ucnt)
148 res = sizeof(ucnt);
149 else if (!(file->f_flags & O_NONBLOCK)) {
150 __add_wait_queue(&ctx->wqh, &wait);
151 for (res = 0;;) {
152 set_current_state(TASK_INTERRUPTIBLE);
153 if (ULLONG_MAX - ctx->count > ucnt) {
154 res = sizeof(ucnt);
155 break;
157 if (signal_pending(current)) {
158 res = -ERESTARTSYS;
159 break;
161 spin_unlock_irq(&ctx->wqh.lock);
162 schedule();
163 spin_lock_irq(&ctx->wqh.lock);
165 __remove_wait_queue(&ctx->wqh, &wait);
166 __set_current_state(TASK_RUNNING);
168 if (res > 0) {
169 ctx->count += ucnt;
170 if (waitqueue_active(&ctx->wqh))
171 wake_up_locked(&ctx->wqh);
173 spin_unlock_irq(&ctx->wqh.lock);
175 return res;
178 static const struct file_operations eventfd_fops = {
179 .release = eventfd_release,
180 .poll = eventfd_poll,
181 .read = eventfd_read,
182 .write = eventfd_write,
185 struct file *eventfd_fget(int fd)
187 struct file *file;
189 file = fget(fd);
190 if (!file)
191 return ERR_PTR(-EBADF);
192 if (file->f_op != &eventfd_fops) {
193 fput(file);
194 return ERR_PTR(-EINVAL);
197 return file;
200 asmlinkage long sys_eventfd(unsigned int count)
202 int error, fd;
203 struct eventfd_ctx *ctx;
204 struct file *file;
205 struct inode *inode;
207 ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
208 if (!ctx)
209 return -ENOMEM;
211 init_waitqueue_head(&ctx->wqh);
212 ctx->count = count;
215 * When we call this, the initialization must be complete, since
216 * anon_inode_getfd() will install the fd.
218 error = anon_inode_getfd(&fd, &inode, &file, "[eventfd]",
219 &eventfd_fops, ctx);
220 if (!error)
221 return fd;
223 kfree(ctx);
224 return error;