Linux-2.4.0-test2
[davej-history.git] / fs / select.c
/*
 * This file contains the procedures for the handling of select and poll
 *
 * Created for Linux based loosely upon Mathius Lattner's minix
 * patches by Peter MacDonald. Heavily edited by Linus.
 *
 *  4 February 1994
 *     COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
 *     flag set in its personality we do *not* modify the given timeout
 *     parameter to reflect time remaining.
 *
 *  24 January 2000
 *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation
 *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
 */

#include <linux/malloc.h>
#include <linux/smp_lock.h>
#include <linux/poll.h>
#include <linux/file.h>

#include <asm/uaccess.h>

#define ROUND_UP(x,y) (((x)+(y)-1)/(y))
#define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)

/*
 * Ok, Peter made a complicated, but straightforward multiple_wait() function.
 * I have rewritten this, taking some shortcuts: This code may not be easy to
 * follow, but it should be free of race-conditions, and it's practical. If you
 * understand what I'm doing here, then you understand how the Linux
 * sleep/wakeup mechanism works.
 *
 * Two very simple procedures, poll_wait() and free_wait(), do all the
 * work.  poll_wait() is an inline function defined in <linux/poll.h>,
 * as all select/poll functions have to call it to add an entry to the
 * poll table.
 */

/*
 * I rewrote this again to make the poll_table size variable, take some
 * more shortcuts, improve responsiveness, and remove another race that
 * Linus noticed.  -- jrs
 */
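
/*
 * Illustrative sketch only (not part of this file): a typical f_op->poll
 * implementation is expected to call poll_wait() before returning its
 * ready mask, roughly along these lines.  The mydev structure, read_wait
 * queue and data_ready() helper below are hypothetical:
 *
 *	static unsigned int mydev_poll(struct file *file, poll_table *wait)
 *	{
 *		struct mydev *dev = file->private_data;
 *		unsigned int mask = 0;
 *
 *		poll_wait(file, &dev->read_wait, wait);
 *		if (data_ready(dev))
 *			mask |= POLLIN | POLLRDNORM;
 *		return mask;
 *	}
 */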

static poll_table* alloc_wait(int nfds)
{
        poll_table* out;
        poll_table* walk;

        out = (poll_table *) __get_free_page(GFP_KERNEL);
        if (out == NULL)
                return NULL;
        out->nr = 0;
        out->entry = (struct poll_table_entry *)(out + 1);
        out->next = NULL;
        nfds -= __MAX_POLL_TABLE_ENTRIES;
        walk = out;
        while (nfds > 0) {
                poll_table *tmp = (poll_table *) __get_free_page(GFP_KERNEL);
                if (!tmp) {
                        while (out != NULL) {
                                tmp = out->next;
                                free_page((unsigned long)out);
                                out = tmp;
                        }
                        return NULL;
                }
                tmp->nr = 0;
                tmp->entry = (struct poll_table_entry *)(tmp + 1);
                tmp->next = NULL;
                walk->next = tmp;
                walk = tmp;
                nfds -= __MAX_POLL_TABLE_ENTRIES;
        }
        return out;
}
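
/*
 * Tear down a poll table built by alloc_wait()/__pollwait(): remove every
 * registered wait queue entry, drop the file references taken in
 * __pollwait(), then free the chained table pages.
 */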
static void free_wait(poll_table * p)
{
        struct poll_table_entry * entry;
        poll_table *old;

        while (p) {
                entry = p->entry + p->nr;
                while (p->nr > 0) {
                        p->nr--;
                        entry--;
                        remove_wait_queue(entry->wait_address, &entry->wait);
                        fput(entry->filp);
                }
                old = p;
                p = p->next;
                free_page((unsigned long) old);
        }
}
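
/*
 * Callback behind poll_wait(): record the file and wait queue head in the
 * next free poll_table_entry (walking to the next chained page when the
 * current one is full) and put the current task on that wait queue.  The
 * reference taken with get_file() here is released in free_wait().
 */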
void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
        for (;;) {
                if (p->nr < __MAX_POLL_TABLE_ENTRIES) {
                        struct poll_table_entry * entry;
                        entry = p->entry + p->nr;
                        get_file(filp);
                        entry->filp = filp;
                        entry->wait_address = wait_address;
                        init_waitqueue_entry(&entry->wait, current);
                        add_wait_queue(wait_address, &entry->wait);
                        p->nr++;
                        return;
                }
                p = p->next;
        }
}

#define __IN(fds, n)		(fds->in + n)
#define __OUT(fds, n)		(fds->out + n)
#define __EX(fds, n)		(fds->ex + n)
#define __RES_IN(fds, n)	(fds->res_in + n)
#define __RES_OUT(fds, n)	(fds->res_out + n)
#define __RES_EX(fds, n)	(fds->res_ex + n)

#define BITS(fds, n)		(*__IN(fds, n)|*__OUT(fds, n)|*__EX(fds, n))
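
/*
 * Scan the caller's in/out/ex bitmaps: return -EBADF if any requested
 * descriptor is not actually open, otherwise return the highest requested
 * descriptor plus one, so do_select() only has to walk that far.
 */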
static int max_select_fd(unsigned long n, fd_set_bits *fds)
{
        unsigned long *open_fds;
        unsigned long set;
        int max;

        /* handle last incomplete long-word first */
        set = ~(~0UL << (n & (__NFDBITS-1)));
        n /= __NFDBITS;
        open_fds = current->files->open_fds->fds_bits + n;
        max = 0;
        if (set) {
                set &= BITS(fds, n);
                if (set) {
                        if (!(set & ~*open_fds))
                                goto get_max;
                        return -EBADF;
                }
        }
        while (n) {
                open_fds--;
                n--;
                set = BITS(fds, n);
                if (!set)
                        continue;
                if (set & ~*open_fds)
                        return -EBADF;
                if (max)
                        continue;
        get_max:
                do {
                        max++;
                        set >>= 1;
                } while (set);
                max += n * __NFDBITS;
        }

        return max;
}

#define BIT(i)		(1UL << ((i)&(__NFDBITS-1)))
#define MEM(i,m)	((m)+(unsigned)(i)/__NFDBITS)
#define ISSET(i,m)	(((i)&*(m)) != 0)
#define SET(i,m)	(*(m) |= (i))

#define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR)
#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
#define POLLEX_SET (POLLPRI)
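
/*
 * Core select loop.  When the call may block, each file's ->poll() method
 * is handed the poll table on the first pass so the task is queued on the
 * relevant wait queues; the table pointer is cleared as soon as a ready
 * descriptor is found and after the first full pass, so later calls only
 * report status.  The loop sleeps in schedule_timeout() until a descriptor
 * is ready, the timeout expires or a signal is pending.
 */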
int do_select(int n, fd_set_bits *fds, long *timeout)
{
        poll_table *wait, *orig_wait;
        int retval, i, off;
        long __timeout = *timeout;

        orig_wait = wait = NULL;

        read_lock(&current->files->file_lock);
        retval = max_select_fd(n, fds);
        read_unlock(&current->files->file_lock);

        if (retval < 0)
                return retval;
        n = retval;
        if (__timeout) {
                orig_wait = wait = alloc_wait(n);
                if (!wait)
                        return -ENOMEM;
        }
        retval = 0;
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
                for (i = 0; i < n; i++) {
                        unsigned long bit = BIT(i);
                        unsigned long mask;
                        struct file *file;

                        off = i / __NFDBITS;
                        if (!(bit & BITS(fds, off)))
                                continue;
                        file = fget(i);
                        mask = POLLNVAL;
                        if (file) {
                                mask = DEFAULT_POLLMASK;
                                if (file->f_op && file->f_op->poll)
                                        mask = file->f_op->poll(file, wait);
                                fput(file);
                        }
                        if ((mask & POLLIN_SET) && ISSET(bit, __IN(fds,off))) {
                                SET(bit, __RES_IN(fds,off));
                                retval++;
                                wait = NULL;
                        }
                        if ((mask & POLLOUT_SET) && ISSET(bit, __OUT(fds,off))) {
                                SET(bit, __RES_OUT(fds,off));
                                retval++;
                                wait = NULL;
                        }
                        if ((mask & POLLEX_SET) && ISSET(bit, __EX(fds,off))) {
                                SET(bit, __RES_EX(fds,off));
                                retval++;
                                wait = NULL;
                        }
                }
                wait = NULL;
                if (retval || !__timeout || signal_pending(current))
                        break;
                __timeout = schedule_timeout(__timeout);
        }
        current->state = TASK_RUNNING;

        free_wait(orig_wait);

        /*
         * Update the caller's timeout with the time remaining.
         */
        *timeout = __timeout;
        return retval;
}

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND, which restarts only when you want it to.
 */
#define MAX_SELECT_SECONDS \
	((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
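
/*
 * select(2) entry point: convert the timeout to jiffies, copy the three
 * user fd_sets into kernel bitmaps (six in all, counting the result sets),
 * run do_select(), then write back the result sets and, unless the
 * personality has STICKY_TIMEOUTS set, the remaining time.
 */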
asmlinkage long
sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
{
        fd_set_bits fds;
        char *bits;
        long timeout;
        int ret, size;

        timeout = MAX_SCHEDULE_TIMEOUT;
        if (tvp) {
                time_t sec, usec;

                if ((ret = verify_area(VERIFY_READ, tvp, sizeof(*tvp)))
                    || (ret = __get_user(sec, &tvp->tv_sec))
                    || (ret = __get_user(usec, &tvp->tv_usec)))
                        goto out_nofds;

                ret = -EINVAL;
                if (sec < 0 || usec < 0)
                        goto out_nofds;

                if ((unsigned long) sec < MAX_SELECT_SECONDS) {
                        timeout = ROUND_UP(usec, 1000000/HZ);
                        timeout += sec * (unsigned long) HZ;
                }
        }

        ret = -EINVAL;
        if (n < 0)
                goto out_nofds;

        if (n > current->files->max_fdset)
                n = current->files->max_fdset;

        /*
         * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
         * and since we use fd_sets we need to allocate memory in units of
         * long-words.
         */
        ret = -ENOMEM;
        size = FDS_BYTES(n);
        bits = kmalloc(6 * size, GFP_KERNEL);
        if (!bits)
                goto out_nofds;
        fds.in      = (unsigned long *)  bits;
        fds.out     = (unsigned long *) (bits +   size);
        fds.ex      = (unsigned long *) (bits + 2*size);
        fds.res_in  = (unsigned long *) (bits + 3*size);
        fds.res_out = (unsigned long *) (bits + 4*size);
        fds.res_ex  = (unsigned long *) (bits + 5*size);

        if ((ret = get_fd_set(n, inp, fds.in)) ||
            (ret = get_fd_set(n, outp, fds.out)) ||
            (ret = get_fd_set(n, exp, fds.ex)))
                goto out;
        zero_fd_set(n, fds.res_in);
        zero_fd_set(n, fds.res_out);
        zero_fd_set(n, fds.res_ex);

        ret = do_select(n, &fds, &timeout);

        if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
                time_t sec = 0, usec = 0;

                if (timeout) {
                        sec = timeout / HZ;
                        usec = timeout % HZ;
                        usec *= (1000000/HZ);
                }
                put_user(sec, &tvp->tv_sec);
                put_user(usec, &tvp->tv_usec);
        }

        if (ret < 0)
                goto out;
        if (!ret) {
                ret = -ERESTARTNOHAND;
                if (signal_pending(current))
                        goto out;
                ret = 0;
        }

        set_fd_set(n, inp, fds.res_in);
        set_fd_set(n, outp, fds.res_out);
        set_fd_set(n, exp, fds.res_ex);

out:
        kfree(bits);
out_nofds:
        return ret;
}

#define POLLFD_PER_PAGE  ((PAGE_SIZE) / sizeof(struct pollfd))
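
/*
 * Poll one chunk of pollfd entries: query each file's ->poll() method,
 * mask the result with the requested events (POLLERR and POLLHUP are
 * always reported), store it in revents, and count ready descriptors.
 * Once something is ready the poll table pointer is cleared so no further
 * wait queue entries are added.
 */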
static void do_pollfd(unsigned int num, struct pollfd * fdpage,
        poll_table ** pwait, int *count)
{
        int i;

        for (i = 0; i < num; i++) {
                int fd;
                unsigned int mask;
                struct pollfd *fdp;

                mask = 0;
                fdp = fdpage + i;
                fd = fdp->fd;
                if (fd >= 0) {
                        struct file * file = fget(fd);
                        mask = POLLNVAL;
                        if (file != NULL) {
                                mask = DEFAULT_POLLMASK;
                                if (file->f_op && file->f_op->poll)
                                        mask = file->f_op->poll(file, *pwait);
                                mask &= fdp->events | POLLERR | POLLHUP;
                                fput(file);
                        }
                        if (mask) {
                                *pwait = NULL;
                                (*count)++;
                        }
                }
                fdp->revents = mask;
        }
}
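
/*
 * Main poll loop over the page-sized chunks: scan every chunk, then sleep
 * in schedule_timeout() until at least one descriptor is ready, the
 * timeout expires or a signal is pending.
 */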
static int do_poll(unsigned int nfds, unsigned int nchunks, unsigned int nleft,
        struct pollfd *fds[], poll_table *wait, long timeout)
{
        int count = 0;

        for (;;) {
                unsigned int i;

                set_current_state(TASK_INTERRUPTIBLE);
                for (i = 0; i < nchunks; i++)
                        do_pollfd(POLLFD_PER_PAGE, fds[i], &wait, &count);
                if (nleft)
                        do_pollfd(nleft, fds[nchunks], &wait, &count);
                wait = NULL;
                if (count || !timeout || signal_pending(current))
                        break;
                timeout = schedule_timeout(timeout);
        }
        current->state = TASK_RUNNING;
        return count;
}
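
/*
 * poll(2) entry point.  The pollfd array is copied from user space into
 * PAGE_SIZE chunks (see the 24 January 2000 note above), so nfds is no
 * longer limited by what a single kmalloc() can hold; after do_poll()
 * only the revents fields are copied back.
 */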
asmlinkage long sys_poll(struct pollfd * ufds, unsigned int nfds, long timeout)
{
        int i, j, fdcount, err;
        struct pollfd **fds;
        poll_table *wait = NULL;
        int nchunks, nleft;

        /* Do a sanity check on nfds ... */
        if (nfds > current->files->max_fds)
                return -EINVAL;

        if (timeout) {
                /* Careful about overflow in the intermediate values */
                if ((unsigned long) timeout < MAX_SCHEDULE_TIMEOUT / HZ)
                        timeout = (unsigned long)(timeout*HZ+999)/1000+1;
                else /* Negative or overflow */
                        timeout = MAX_SCHEDULE_TIMEOUT;
        }

        if (timeout) {
                wait = alloc_wait(nfds);
                if (!wait)
                        return -ENOMEM;
        }
        err = -ENOMEM;

        fds = NULL;
        if (nfds != 0) {
                fds = (struct pollfd **)kmalloc(
                        (1 + (nfds - 1) / POLLFD_PER_PAGE) * sizeof(struct pollfd *),
                        GFP_KERNEL);
                if (fds == NULL)
                        goto out;
        }

        nchunks = 0;
        nleft = nfds;
        while (nleft > POLLFD_PER_PAGE) { /* allocate complete PAGE_SIZE chunks */
                fds[nchunks] = (struct pollfd *)__get_free_page(GFP_KERNEL);
                if (fds[nchunks] == NULL)
                        goto out_fds;
                nchunks++;
                nleft -= POLLFD_PER_PAGE;
        }
        if (nleft) { /* allocate last PAGE_SIZE chunk, only nleft elements used */
                fds[nchunks] = (struct pollfd *)__get_free_page(GFP_KERNEL);
                if (fds[nchunks] == NULL)
                        goto out_fds;
        }

        err = -EFAULT;
        for (i = 0; i < nchunks; i++)
                if (copy_from_user(fds[i], ufds + i*POLLFD_PER_PAGE, PAGE_SIZE))
                        goto out_fds1;
        if (nleft) {
                if (copy_from_user(fds[nchunks], ufds + nchunks*POLLFD_PER_PAGE,
                                nleft * sizeof(struct pollfd)))
                        goto out_fds1;
        }

        fdcount = do_poll(nfds, nchunks, nleft, fds, wait, timeout);

        /* OK, now copy the revents fields back to user space. */
        for (i = 0; i < nchunks; i++)
                for (j = 0; j < POLLFD_PER_PAGE; j++, ufds++)
                        __put_user((fds[i] + j)->revents, &ufds->revents);
        if (nleft)
                for (j = 0; j < nleft; j++, ufds++)
                        __put_user((fds[nchunks] + j)->revents, &ufds->revents);

        err = fdcount;
        if (!fdcount && signal_pending(current))
                err = -EINTR;

out_fds1:
        if (nleft)
                free_page((unsigned long)(fds[nchunks]));
out_fds:
        for (i = 0; i < nchunks; i++)
                free_page((unsigned long)(fds[i]));
        if (nfds != 0)
                kfree(fds);
out:
        free_wait(wait);
        return err;
}