/*	$NetBSD: sys_select.c,v 1.13 2009/03/21 13:11:14 ad Exp $	*/

/*-
 * Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)sys_generic.c	8.9 (Berkeley) 2/14/95
 */

/*
 * System calls relating to files.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.13 2009/03/21 13:11:14 ad Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/socketvar.h>
#include <sys/sleepq.h>

/* Flags for lwp::l_selflag. */
#define SEL_RESET	0	/* awoken, interrupted, or not yet polling */
#define SEL_SCANNING	1	/* polling descriptors */
#define SEL_BLOCKING	2	/* about to block on select_cv */

/* Per-CPU state for select()/poll(). */
#if MAXCPUS > 32
#error adjust this code
#endif
typedef struct selcpu {
	kmutex_t	*sc_lock;
	sleepq_t	sc_sleepq;
	int		sc_ncoll;
	uint32_t	sc_mask;
} selcpu_t;

static int	selscan(lwp_t *, fd_mask *, fd_mask *, int, register_t *);
static int	pollscan(lwp_t *, struct pollfd *, int, register_t *);
static void	selclear(void);

static syncobj_t select_sobj = {
	SOBJ_SLEEPQ_FIFO,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};

/*
 * Select system call.
 */
int
sys___pselect50(struct lwp *l, const struct sys___pselect50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int)				nd;
		syscallarg(fd_set *)			in;
		syscallarg(fd_set *)			ou;
		syscallarg(fd_set *)			ex;
		syscallarg(const struct timespec *)	ts;
		syscallarg(sigset_t *)			mask;
	} */
	struct timespec	ats, *ts = NULL;
	sigset_t	amask, *mask = NULL;
	int		error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		ts = &ats;
	}
	if (SCARG(uap, mask) != NULL) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), ts, mask);
}

int
inittimeleft(struct timespec *ts, struct timespec *sleepts)
{
	if (itimespecfix(ts))
		return -1;
	getnanouptime(sleepts);
	return 0;
}

int
gettimeleft(struct timespec *ts, struct timespec *sleepts)
{
	/*
	 * We have to recalculate the timeout on every retry.
	 */
	struct timespec sleptts;
	/*
	 * reduce ts by elapsed time
	 * based on monotonic time scale
	 */
	getnanouptime(&sleptts);
	timespecadd(ts, sleepts, ts);
	timespecsub(ts, &sleptts, ts);
	*sleepts = sleptts;
	return tstohz(ts);
}
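
/*
 * Illustrative sketch (not part of the original file): how a caller is
 * expected to use inittimeleft()/gettimeleft() to keep a relative timeout
 * accurate across retries, mirroring the loops in selcommon() and
 * pollcommon() below.  The example_* names are hypothetical.
 */
#if 0
static int
example_wait(struct timespec *ts)
{
	struct timespec sleepts;
	int timo = 0;

	/* Capture the starting point on the monotonic clock. */
	if (ts != NULL && inittimeleft(ts, &sleepts) == -1)
		return EINVAL;
	for (;;) {
		if (example_condition())	/* hypothetical readiness test */
			return 0;
		/* Recompute the remaining time before each sleep. */
		if (ts != NULL && (timo = gettimeleft(ts, &sleepts)) <= 0)
			return EWOULDBLOCK;
		/* ... block for at most 'timo' ticks, then retry ... */
	}
}
#endif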

int
sys___select50(struct lwp *l, const struct sys___select50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int)			nd;
		syscallarg(fd_set *)		in;
		syscallarg(fd_set *)		ou;
		syscallarg(fd_set *)		ex;
		syscallarg(struct timeval *)	tv;
	} */
	struct timeval atv;
	struct timespec ats, *ts = NULL;
	int error;

	if (SCARG(uap, tv)) {
		error = copyin(SCARG(uap, tv), (void *)&atv, sizeof(atv));
		if (error)
			return error;
		TIMEVAL_TO_TIMESPEC(&atv, &ats);
		ts = &ats;
	}

	return selcommon(l, retval, SCARG(uap, nd), SCARG(uap, in),
	    SCARG(uap, ou), SCARG(uap, ex), ts, NULL);
}

int
selcommon(lwp_t *l, register_t *retval, int nd, fd_set *u_in,
    fd_set *u_ou, fd_set *u_ex, struct timespec *ts, sigset_t *mask)
{
	char		smallbits[howmany(FD_SETSIZE, NFDBITS) *
			    sizeof(fd_mask) * 6];
	proc_t		* const p = l->l_proc;
	char		*bits;
	int		ncoll, error, timo;
	size_t		ni;
	sigset_t	oldmask;
	struct timespec	sleepts;
	selcpu_t	*sc;
	kmutex_t	*lock;

	error = 0;
	if (nd < 0)
		return (EINVAL);
	if (nd > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nd = p->p_fd->fd_nfiles;
	}
	ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
	if (ni * 6 > sizeof(smallbits)) {
		bits = kmem_alloc(ni * 6, KM_SLEEP);
		if (bits == NULL)
			return ENOMEM;
	} else
		bits = smallbits;

#define	getbits(name, x)						\
	if (u_ ## name) {						\
		error = copyin(u_ ## name, bits + ni * x, ni);		\
		if (error)						\
			goto done;					\
	} else								\
		memset(bits + ni * x, 0, ni);
	getbits(in, 0);
	getbits(ou, 1);
	getbits(ex, 2);
#undef	getbits

	timo = 0;
	if (ts && inittimeleft(ts, &sleepts) == -1) {
		error = EINVAL;
		goto done;
	}

	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(p->p_lock);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(p->p_lock);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	sc = curcpu()->ci_data.cpu_selcpu;
	lock = sc->sc_lock;
	l->l_selcpu = sc;
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/*
		 * No need to lock.  If this is overwritten by another
		 * value while scanning, we will retry below.  We only
		 * need to see exact state from the descriptors that
		 * we are about to poll, and lock activity resulting
		 * from fo_poll is enough to provide an up to date value
		 * for new polling activity.
		 */
		l->l_selflag = SEL_SCANNING;
		ncoll = sc->sc_ncoll;

		error = selscan(l, (fd_mask *)(bits + ni * 0),
		    (fd_mask *)(bits + ni * 3), nd, retval);

		if (error || *retval)
			break;
		if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0)
			break;
		mutex_spin_enter(lock);
		if (l->l_selflag != SEL_SCANNING || sc->sc_ncoll != ncoll) {
			mutex_spin_exit(lock);
			continue;
		}
		l->l_selflag = SEL_BLOCKING;
		l->l_kpriority = true;
		sleepq_enter(&sc->sc_sleepq, l, lock);
		sleepq_enqueue(&sc->sc_sleepq, sc, "select", &select_sobj);
		error = sleepq_block(timo, true);
		if (error != 0)
			break;
	}
	selclear();

	if (mask) {
		mutex_enter(p->p_lock);
		l->l_sigmask = oldmask;
		mutex_exit(p->p_lock);
	}

 done:
	/* select is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0 && u_in != NULL)
		error = copyout(bits + ni * 3, u_in, ni);
	if (error == 0 && u_ou != NULL)
		error = copyout(bits + ni * 4, u_ou, ni);
	if (error == 0 && u_ex != NULL)
		error = copyout(bits + ni * 5, u_ex, ni);
	if (bits != smallbits)
		kmem_free(bits, ni * 6);
	return (error);
}

static int
selscan(lwp_t *l, fd_mask *ibitp, fd_mask *obitp, int nfd,
    register_t *retval)
{
	static const int flag[3] = { POLLRDNORM | POLLHUP | POLLERR,
			       POLLWRNORM | POLLHUP | POLLERR,
			       POLLRDBAND };
	int msk, i, j, fd, n;
	fd_mask ibits, obits;
	file_t *fp;

	n = 0;
	for (msk = 0; msk < 3; msk++) {
		for (i = 0; i < nfd; i += NFDBITS) {
			ibits = *ibitp++;
			obits = 0;
			while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
				ibits &= ~(1 << j);
				if ((fp = fd_getfile(fd)) == NULL)
					return (EBADF);
				if ((*fp->f_ops->fo_poll)(fp, flag[msk])) {
					obits |= (1 << j);
					n++;
				}
				fd_putfile(fd);
			}
			*obitp++ = obits;
		}
	}
	*retval = n;
	return (0);
}

/*
 * Poll system call.
 */
int
sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval)
{
	/* {
		syscallarg(struct pollfd *)	fds;
		syscallarg(u_int)		nfds;
		syscallarg(int)			timeout;
	} */
	struct timespec	ats, *ts = NULL;

	if (SCARG(uap, timeout) != INFTIM) {
		ats.tv_sec = SCARG(uap, timeout) / 1000;
		ats.tv_nsec = (SCARG(uap, timeout) % 1000) * 1000000;
		ts = &ats;
	}

	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
	    ts, NULL);
}

/*
 * Poll system call.
 */
int
sys___pollts50(struct lwp *l, const struct sys___pollts50_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(struct pollfd *)		fds;
		syscallarg(u_int)			nfds;
		syscallarg(const struct timespec *)	ts;
		syscallarg(const sigset_t *)		mask;
	} */
	struct timespec	ats, *ts = NULL;
	sigset_t	amask, *mask = NULL;
	int		error;

	if (SCARG(uap, ts)) {
		error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
		if (error)
			return error;
		ts = &ats;
	}
	if (SCARG(uap, mask)) {
		error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
		if (error)
			return error;
		mask = &amask;
	}

	return pollcommon(l, retval, SCARG(uap, fds), SCARG(uap, nfds),
	    ts, mask);
}

int
pollcommon(lwp_t *l, register_t *retval, struct pollfd *u_fds, u_int nfds,
    struct timespec *ts, sigset_t *mask)
{
	struct pollfd	smallfds[32];
	struct pollfd	*fds;
	proc_t		* const p = l->l_proc;
	sigset_t	oldmask;
	int		ncoll, error, timo;
	size_t		ni;
	struct timespec	sleepts;
	selcpu_t	*sc;
	kmutex_t	*lock;

	if (nfds > p->p_fd->fd_nfiles) {
		/* forgiving; slightly wrong */
		nfds = p->p_fd->fd_nfiles;
	}
	ni = nfds * sizeof(struct pollfd);
	if (ni > sizeof(smallfds)) {
		fds = kmem_alloc(ni, KM_SLEEP);
		if (fds == NULL)
			return ENOMEM;
	} else
		fds = smallfds;

	error = copyin(u_fds, fds, ni);
	if (error)
		goto done;

	timo = 0;
	if (ts && inittimeleft(ts, &sleepts) == -1) {
		error = EINVAL;
		goto done;
	}

	if (mask) {
		sigminusset(&sigcantmask, mask);
		mutex_enter(p->p_lock);
		oldmask = l->l_sigmask;
		l->l_sigmask = *mask;
		mutex_exit(p->p_lock);
	} else
		oldmask = l->l_sigmask;	/* XXXgcc */

	sc = curcpu()->ci_data.cpu_selcpu;
	lock = sc->sc_lock;
	l->l_selcpu = sc;
	SLIST_INIT(&l->l_selwait);
	for (;;) {
		/*
		 * No need to lock.  If this is overwritten by another
		 * value while scanning, we will retry below.  We only
		 * need to see exact state from the descriptors that
		 * we are about to poll, and lock activity resulting
		 * from fo_poll is enough to provide an up to date value
		 * for new polling activity.
		 */
		ncoll = sc->sc_ncoll;
		l->l_selflag = SEL_SCANNING;

		error = pollscan(l, fds, nfds, retval);

		if (error || *retval)
			break;
		if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0)
			break;
		mutex_spin_enter(lock);
		if (l->l_selflag != SEL_SCANNING || sc->sc_ncoll != ncoll) {
			mutex_spin_exit(lock);
			continue;
		}
		l->l_selflag = SEL_BLOCKING;
		l->l_kpriority = true;
		sleepq_enter(&sc->sc_sleepq, l, lock);
		sleepq_enqueue(&sc->sc_sleepq, sc, "select", &select_sobj);
		error = sleepq_block(timo, true);
		if (error != 0)
			break;
	}
	selclear();

	if (mask) {
		mutex_enter(p->p_lock);
		l->l_sigmask = oldmask;
		mutex_exit(p->p_lock);
	}

 done:
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	if (error == 0)
		error = copyout(fds, u_fds, ni);
	if (fds != smallfds)
		kmem_free(fds, ni);
	return (error);
}

static int
pollscan(lwp_t *l, struct pollfd *fds, int nfd, register_t *retval)
{
	int i, n;
	file_t *fp;

	n = 0;
	for (i = 0; i < nfd; i++, fds++) {
		if (fds->fd < 0) {
			fds->revents = 0;
		} else if ((fp = fd_getfile(fds->fd)) == NULL) {
			fds->revents = POLLNVAL;
			n++;
		} else {
			fds->revents = (*fp->f_ops->fo_poll)(fp,
			    fds->events | POLLERR | POLLHUP);
			if (fds->revents != 0)
				n++;
			fd_putfile(fds->fd);
		}
	}
	*retval = n;
	return (0);
}

/*ARGSUSED*/
int
seltrue(dev_t dev, int events, lwp_t *l)
{

	return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
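
/*
 * Illustrative sketch (not part of the original file): a device that never
 * blocks can route its poll entry point through seltrue(), which reports
 * the read/write events as always pending.  The examplepoll() name is
 * hypothetical.
 */
#if 0
static int
examplepoll(dev_t dev, int events, lwp_t *l)
{

	/* No wait condition to record: report readiness immediately. */
	return seltrue(dev, events, l);
}
#endif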

/*
 * Record a select request.  Concurrency issues:
 *
 * The caller holds the same lock across calls to selrecord() and
 * selnotify(), so we don't need to consider a concurrent wakeup
 * while in this routine.
 *
 * The only activity we need to guard against is selclear(), called by
 * another thread that is exiting selcommon() or pollcommon().
 * `sel_lwp' can only become non-NULL while the caller's lock is held,
 * so it cannot become non-NULL due to a change made by another thread
 * while we are in this routine.  It can only become _NULL_ due to a
 * call to selclear().
 *
 * If it is non-NULL and != selector there is the potential for
 * selclear() to be called by another thread.  If either of those
 * conditions are true, we're not interested in touching the `named
 * waiter' part of the selinfo record because we need to record a
 * collision.  Hence there is no need for additional locking in this
 * routine.
 */
void
selrecord(lwp_t *selector, struct selinfo *sip)
{
	selcpu_t *sc;
	lwp_t *other;

	KASSERT(selector == curlwp);

	sc = selector->l_selcpu;
	other = sip->sel_lwp;

	if (other == selector) {
		/* `selector' has already claimed it. */
		KASSERT(sip->sel_cpu == sc);
	} else if (other == NULL) {
		/*
		 * First named waiter, although there may be unnamed
		 * waiters (collisions).  Issue a memory barrier to
		 * ensure that we access sel_lwp (above) before other
		 * fields - this guards against a call to selclear().
		 */
		membar_enter();
		sip->sel_lwp = selector;
		SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
		/* Replace selinfo's lock with our chosen CPU's lock. */
		sip->sel_cpu = sc;
	} else {
		/* Multiple waiters: record a collision. */
		sip->sel_collision |= sc->sc_mask;
		KASSERT(sip->sel_cpu != NULL);
	}
}
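
/*
 * Illustrative sketch (not part of the original file): the usual shape of
 * a driver poll routine that calls selrecord().  With the driver's own
 * lock held, it reports events that are already pending, otherwise it
 * records the calling LWP so that a later selnotify() can wake it.  The
 * example_softc structure and example_lookup() are hypothetical.
 */
#if 0
struct example_softc {
	kmutex_t	sc_dlock;	/* also held around selnotify() */
	struct selinfo	sc_rsel;
	bool		sc_data_ready;
};

static int
example_rdpoll(dev_t dev, int events, lwp_t *l)
{
	struct example_softc *sc = example_lookup(dev);
	int revents = 0;

	mutex_enter(&sc->sc_dlock);
	if ((events & (POLLIN | POLLRDNORM)) != 0) {
		if (sc->sc_data_ready)
			revents |= events & (POLLIN | POLLRDNORM);
		else
			selrecord(l, &sc->sc_rsel);
	}
	mutex_exit(&sc->sc_dlock);
	return revents;
}
#endif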

/*
 * Do a wakeup when a selectable event occurs.  Concurrency issues:
 *
 * As per selrecord(), the caller's object lock is held.  If there
 * is a named waiter, we must acquire the associated selcpu's lock
 * in order to synchronize with selclear() and pollers going to sleep
 * in selcommon() and/or pollcommon().
 *
 * sip->sel_cpu cannot change at this point, as it is only changed
 * in selrecord(), and concurrent calls to selrecord() are locked
 * out by the caller.
 */
void
selnotify(struct selinfo *sip, int events, long knhint)
{
	selcpu_t *sc;
	uint32_t mask;
	int index, oflag, swapin;
	lwp_t *l;
	kmutex_t *lock;

	KNOTE(&sip->sel_klist, knhint);

	if (sip->sel_lwp != NULL) {
		/* One named LWP is waiting. */
		swapin = 0;
		sc = sip->sel_cpu;
		lock = sc->sc_lock;
		mutex_spin_enter(lock);
		/* Still there? */
		if (sip->sel_lwp != NULL) {
			l = sip->sel_lwp;
			/*
			 * If thread is sleeping, wake it up.  If it's not
			 * yet asleep, it will notice the change in state
			 * and will re-poll the descriptors.
			 */
			oflag = l->l_selflag;
			l->l_selflag = SEL_RESET;
			if (oflag == SEL_BLOCKING && l->l_mutex == lock) {
				KASSERT(l->l_wchan == sc);
				swapin = sleepq_unsleep(l, false);
			}
		}
		mutex_spin_exit(lock);
		if (swapin)
			uvm_kick_scheduler();
	}

	if ((mask = sip->sel_collision) != 0) {
		/*
		 * There was a collision (multiple waiters): we must
		 * inform all potentially interested waiters.
		 */
		sip->sel_collision = 0;
		do {
			index = ffs(mask) - 1;
			mask &= ~(1 << index);
			sc = cpu_lookup(index)->ci_data.cpu_selcpu;
			lock = sc->sc_lock;
			mutex_spin_enter(lock);
			sc->sc_ncoll++;
			sleepq_wake(&sc->sc_sleepq, sc, (u_int)-1, lock);
		} while (__predict_false(mask != 0));
	}
}
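
/*
 * Illustrative sketch (not part of the original file): the hypothetical
 * driver from the selrecord() example marks data available and calls
 * selnotify() under the same lock it holds around selrecord(), waking any
 * select()/poll() waiters.
 */
#if 0
static void
example_rxintr(struct example_softc *sc)
{

	mutex_enter(&sc->sc_dlock);
	sc->sc_data_ready = true;
	selnotify(&sc->sc_rsel, POLLIN | POLLRDNORM, 0);
	mutex_exit(&sc->sc_dlock);
}
#endif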

/*
 * Remove an LWP from all objects that it is waiting for.  Concurrency
 * issues:
 *
 * The object owner's (e.g. device driver) lock is not held here.  Calls
 * can be made to selrecord() and we do not synchronize against those
 * directly using locks.  However, we use `sel_lwp' to lock out changes.
 * Before clearing it we must use memory barriers to ensure that we can
 * safely traverse the list of selinfo records.
 */
static void
selclear(void)
{
	struct selinfo *sip, *next;
	selcpu_t *sc;
	lwp_t *l;
	kmutex_t *lock;

	l = curlwp;
	sc = l->l_selcpu;
	lock = sc->sc_lock;

	mutex_spin_enter(lock);
	for (sip = SLIST_FIRST(&l->l_selwait); sip != NULL; sip = next) {
		KASSERT(sip->sel_lwp == l);
		KASSERT(sip->sel_cpu == l->l_selcpu);
		/*
		 * Read link to next selinfo record, if any.
		 * It's no longer safe to touch `sip' after clearing
		 * `sel_lwp', so ensure that the read of `sel_chain'
		 * completes before the clearing of sel_lwp becomes
		 * globally visible.
		 */
		next = SLIST_NEXT(sip, sel_chain);
		membar_exit();
		/* Release the record for another named waiter to use. */
		sip->sel_lwp = NULL;
	}
	mutex_spin_exit(lock);
}

/*
 * Initialize the select/poll system calls.  Called once for each
 * CPU in the system, as they are attached.
 */
void
selsysinit(struct cpu_info *ci)
{
	selcpu_t *sc;

	sc = kmem_alloc(roundup2(sizeof(selcpu_t), coherency_unit) +
	    coherency_unit, KM_SLEEP);
	sc = (void *)roundup2((uintptr_t)sc, coherency_unit);
	sc->sc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
	sleepq_init(&sc->sc_sleepq);
	sc->sc_ncoll = 0;
	sc->sc_mask = (1 << cpu_index(ci));
	ci->ci_data.cpu_selcpu = sc;
}

/*
 * Initialize a selinfo record.
 */
void
selinit(struct selinfo *sip)
{

	memset(sip, 0, sizeof(*sip));
}

/*
 * Destroy a selinfo record.  The owning object must not gain new
 * references while this is in progress: all activity on the record
 * must be stopped.
 *
 * Concurrency issues: we only need guard against a call to selclear()
 * by a thread exiting selcommon() and/or pollcommon().  The caller has
 * prevented further references being made to the selinfo record via
 * selrecord(), and it won't call selwakeup() again.
 */
void
seldestroy(struct selinfo *sip)
{
	selcpu_t *sc;
	kmutex_t *lock;
	lwp_t *l;

	if (sip->sel_lwp == NULL)
		return;

	/*
	 * Lock out selclear().  The selcpu pointer can't change while
	 * we are here since it is only ever changed in selrecord(),
	 * and that will not be entered again for this record because
	 * it is dying.
	 */
	KASSERT(sip->sel_cpu != NULL);
	sc = sip->sel_cpu;
	lock = sc->sc_lock;
	mutex_spin_enter(lock);
	if ((l = sip->sel_lwp) != NULL) {
		/*
		 * This should rarely happen, so although SLIST_REMOVE()
		 * is slow, using it here is not a problem.
		 */
		KASSERT(l->l_selcpu == sc);
		SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
		sip->sel_lwp = NULL;
	}
	mutex_spin_exit(lock);
}
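
/*
 * Illustrative sketch (not part of the original file): a selinfo record
 * embedded in the hypothetical softc above is set up with selinit() at
 * attach time and torn down with seldestroy() at detach time, once the
 * device can no longer generate events.
 */
#if 0
static void
example_attach(struct example_softc *sc)
{

	mutex_init(&sc->sc_dlock, MUTEX_DEFAULT, IPL_NONE);
	selinit(&sc->sc_rsel);
}

static void
example_detach(struct example_softc *sc)
{

	seldestroy(&sc->sc_rsel);
	mutex_destroy(&sc->sc_dlock);
}
#endif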

int
pollsock(struct socket *so, const struct timespec *tsp, int events)
{
	int		ncoll, error, timo;
	struct timespec	sleepts, ts;
	selcpu_t	*sc;
	lwp_t		*l;
	kmutex_t	*lock;

	timo = 0;
	if (tsp != NULL) {
		ts = *tsp;
		if (inittimeleft(&ts, &sleepts) == -1)
			return EINVAL;
	}

	l = curlwp;
	sc = l->l_cpu->ci_data.cpu_selcpu;
	lock = sc->sc_lock;
	l->l_selcpu = sc;
	SLIST_INIT(&l->l_selwait);
	error = 0;
	for (;;) {
		/*
		 * No need to lock.  If this is overwritten by another
		 * value while scanning, we will retry below.  We only
		 * need to see exact state from the descriptors that
		 * we are about to poll, and lock activity resulting
		 * from fo_poll is enough to provide an up to date value
		 * for new polling activity.
		 */
		ncoll = sc->sc_ncoll;
		l->l_selflag = SEL_SCANNING;
		if (sopoll(so, events) != 0)
			break;
		if (tsp && (timo = gettimeleft(&ts, &sleepts)) <= 0)
			break;
		mutex_spin_enter(lock);
		if (l->l_selflag != SEL_SCANNING || sc->sc_ncoll != ncoll) {
			mutex_spin_exit(lock);
			continue;
		}
		l->l_selflag = SEL_BLOCKING;
		sleepq_enter(&sc->sc_sleepq, l, lock);
		sleepq_enqueue(&sc->sc_sleepq, sc, "pollsock", &select_sobj);
		error = sleepq_block(timo, true);
		if (error != 0)
			break;
	}
	selclear();
	/* poll is not restarted after signals... */
	if (error == ERESTART)
		error = EINTR;
	if (error == EWOULDBLOCK)
		error = 0;
	return (error);
}
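
/*
 * Illustrative sketch (not part of the original file): in-kernel code can
 * use pollsock() to wait for activity on a socket with a bounded timeout.
 * The five second value is hypothetical; as written above, pollsock()
 * returns 0 once the events are pending or the timeout expires, and EINTR
 * if the wait was interrupted by a signal.
 */
#if 0
static int
example_wait_for_reply(struct socket *so)
{
	struct timespec ts;

	ts.tv_sec = 5;
	ts.tv_nsec = 0;
	return pollsock(so, &ts, POLLIN | POLLRDNORM);
}
#endif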