sys/kern/uipc_socket2.c

   1 /*
   2  * Copyright (c) 2005 Jeffrey M. Hsu.  All rights reserved.
   3  * Copyright (c) 1982, 1986, 1988, 1990, 1993
   4  *      The Regents of the University of California.  All rights reserved.
   5  *
   6  * Redistribution and use in source and binary forms, with or without
   7  * modification, are permitted provided that the following conditions
   8  * are met:
   9  * 1. Redistributions of source code must retain the above copyright
  10  *    notice, this list of conditions and the following disclaimer.
  11  * 2. Redistributions in binary form must reproduce the above copyright
  12  *    notice, this list of conditions and the following disclaimer in the
  13  *    documentation and/or other materials provided with the distribution.
  14  * 3. Neither the name of the University nor the names of its contributors
  15  *    may be used to endorse or promote products derived from this software
  16  *    without specific prior written permission.
  17  *
  18  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  28  * SUCH DAMAGE.
  29  *
  30  *      @(#)uipc_socket2.c      8.1 (Berkeley) 6/10/93
  31  * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.17 2002/08/31 19:04:55 dwmalone Exp $
  32  */
  33
  34 #include "opt_param.h"
  35 #include <sys/param.h>
  36 #include <sys/systm.h>
  37 #include <sys/domain.h>
  38 #include <sys/file.h>   /* for maxfiles */
  39 #include <sys/kernel.h>
  40 #include <sys/ktr.h>
  41 #include <sys/proc.h>
  42 #include <sys/malloc.h>
  43 #include <sys/mbuf.h>
  44 #include <sys/protosw.h>
  45 #include <sys/resourcevar.h>
  46 #include <sys/stat.h>
  47 #include <sys/socket.h>
  48 #include <sys/socketvar.h>
  49 #include <sys/socketops.h>
  50 #include <sys/signalvar.h>
  51 #include <sys/sysctl.h>
  52 #include <sys/event.h>
  53
  54 #include <sys/thread2.h>
  55 #include <sys/msgport2.h>
  56 #include <sys/socketvar2.h>
  57
  58 #include <net/netisr2.h>
  59
/*
 * KTR trace points for the wakeups issued on a listen socket when a new
 * connection is queued (see the logsowakeup() calls in sonewconn_faddr()).
 * KTR_SOWAKEUP defaults to KTR_ALL unless it was defined beforehand.
 */
#ifndef KTR_SOWAKEUP
#define KTR_SOWAKEUP    KTR_ALL
#endif
KTR_INFO_MASTER(sowakeup);
KTR_INFO(KTR_SOWAKEUP, sowakeup, nconn_start, 0, "newconn sorwakeup start");
KTR_INFO(KTR_SOWAKEUP, sowakeup, nconn_end, 1, "newconn sorwakeup end");
KTR_INFO(KTR_SOWAKEUP, sowakeup, nconn_wakeupstart, 2, "newconn wakeup start");
KTR_INFO(KTR_SOWAKEUP, sowakeup, nconn_wakeupend, 3, "newconn wakeup end");
/* Log the sowakeup_<name> trace record declared above. */
#define logsowakeup(name)       KTR_LOG(sowakeup_ ## name)
  69
/* Socket limit; finalized at boot by init_maxsockets(). */
int     maxsockets;

/*
 * Primitive routines for operating on sockets and socket buffers
 */

/* Upper bound applied by ssb_reserve() to user socket buffers (rl != NULL). */
u_long  sb_max = SB_MAX;
u_long  sb_max_adj =
    SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */

/* Multiplier ssb_reserve() applies to the byte reservation to get ssb_mbmax. */
static  u_long sb_efficiency = 8;       /* parameter for ssb_reserve() */
  81
  82 /************************************************************************
  83  * signalsockbuf procedures                                             *
  84  ************************************************************************/
  85
  86 /*
  87  * Wait for data to arrive at/drain from a socket buffer.
  88  *
  89  * NOTE: Caller must generally hold the ssb_lock (client side lock) since
  90  *       WAIT/WAKEUP only works for one client at a time.
  91  *
  92  * NOTE: Caller always retries whatever operation it was waiting on.
  93  */
  94 int
  95 ssb_wait(struct signalsockbuf *ssb)
  96 {
  97         uint32_t flags;
  98         int pflags;
  99         int error;
 100
 101         pflags = (ssb->ssb_flags & SSB_NOINTR) ? 0 : PCATCH;
 102
 103         for (;;) {
 104                 flags = ssb->ssb_flags;
 105                 cpu_ccfence();
 106
 107                 /*
 108                  * WAKEUP and WAIT interlock each other.  We can catch the
 109                  * race by checking to see if WAKEUP has already been set,
 110                  * and only setting WAIT if WAKEUP is clear.
 111                  */
 112                 if (flags & SSB_WAKEUP) {
 113                         if (atomic_cmpset_int(&ssb->ssb_flags, flags,
 114                                               flags & ~SSB_WAKEUP)) {
 115                                 error = 0;
 116                                 break;
 117                         }
 118                         continue;
 119                 }
 120
 121                 /*
 122                  * Only set WAIT if WAKEUP is clear.
 123                  */
 124                 tsleep_interlock(&ssb->ssb_cc, pflags);
 125                 if (atomic_cmpset_int(&ssb->ssb_flags, flags,
 126                                       flags | SSB_WAIT)) {
 127                         error = tsleep(&ssb->ssb_cc, pflags | PINTERLOCKED,
 128                                        "sbwait", ssb->ssb_timeo);
 129                         break;
 130                 }
 131         }
 132         return (error);
 133 }
 134
 135 /*
 136  * Lock a sockbuf already known to be locked;
 137  * return any error returned from sleep (EINTR).
 138  */
 139 int
 140 _ssb_lock(struct signalsockbuf *ssb)
 141 {
 142         uint32_t flags;
 143         int pflags;
 144         int error;
 145
 146         pflags = (ssb->ssb_flags & SSB_NOINTR) ? 0 : PCATCH;
 147
 148         for (;;) {
 149                 flags = ssb->ssb_flags;
 150                 cpu_ccfence();
 151                 if (flags & SSB_LOCK) {
 152                         tsleep_interlock(&ssb->ssb_flags, pflags);
 153                         if (atomic_cmpset_int(&ssb->ssb_flags, flags,
 154                                               flags | SSB_WANT)) {
 155                                 error = tsleep(&ssb->ssb_flags,
 156                                                pflags | PINTERLOCKED,
 157                                                "sblock", 0);
 158                                 if (error)
 159                                         break;
 160                         }
 161                 } else {
 162                         if (atomic_cmpset_int(&ssb->ssb_flags, flags,
 163                                               flags | SSB_LOCK)) {
 164                                 lwkt_gettoken(&ssb->ssb_token);
 165                                 error = 0;
 166                                 break;
 167                         }
 168                 }
 169         }
 170         return (error);
 171 }
 172
 173 /*
 174  * This does the same for sockbufs.  Note that the xsockbuf structure,
 175  * since it is always embedded in a socket, does not include a self
 176  * pointer nor a length.  We make this entry point public in case
 177  * some other mechanism needs it.
 178  */
 179 void
 180 ssbtoxsockbuf(struct signalsockbuf *ssb, struct xsockbuf *xsb)
 181 {
 182         xsb->sb_cc = ssb->ssb_cc;
 183         xsb->sb_hiwat = ssb->ssb_hiwat;
 184         xsb->sb_mbcnt = ssb->ssb_mbcnt;
 185         xsb->sb_mbmax = ssb->ssb_mbmax;
 186         xsb->sb_lowat = ssb->ssb_lowat;
 187         xsb->sb_flags = ssb->ssb_flags;
 188         xsb->sb_timeo = ssb->ssb_timeo;
 189 }
 190
 191
 192 /************************************************************************
 193  * Procedures which manipulate socket state flags, wakeups, etc.        *
 194  ************************************************************************
 195  *
 196  * Normal sequence from the active (originating) side is that
 197  * soisconnecting() is called during processing of connect() call, resulting
 198  * in an eventual call to soisconnected() if/when the connection is
 199  * established.  When the connection is torn down soisdisconnecting() is
 200  * called during processing of disconnect() call, and soisdisconnected() is
 201  * called when the connection to the peer is totally severed.
 202  *
 203  * The semantics of these routines are such that connectionless protocols
 204  * can call soisconnected() and soisdisconnected() only, bypassing the
 205  * in-progress calls when setting up a ``connection'' takes no time.
 206  *
 207  * From the passive side, a socket is created with two queues of sockets:
 208  * so_incomp for connections in progress and so_comp for connections
 209  * already made and awaiting user acceptance.  As a protocol is preparing
 210  * incoming connections, it creates a socket structure queued on so_incomp
 211  * by calling sonewconn().  When the connection is established,
 212  * soisconnected() is called, and transfers the socket structure to so_comp,
 213  * making it available to accept().
 214  *
 215  * If a socket is closed with sockets on either so_incomp or so_comp, these
 216  * sockets are dropped.
 217  *
 218  * If higher level protocols are implemented in the kernel, the wakeups
 219  * done here will sometimes cause software-interrupt process scheduling.
 220  */
 221
 222 void
 223 soisconnecting(struct socket *so)
 224 {
 225         soclrstate(so, SS_ISCONNECTED | SS_ISDISCONNECTING);
 226         sosetstate(so, SS_ISCONNECTING);
 227 }
 228
 229 void
 230 soisconnected(struct socket *so)
 231 {
 232         struct socket *head;
 233
 234         while ((head = so->so_head) != NULL) {
 235                 lwkt_getpooltoken(head);
 236                 if (so->so_head == head)
 237                         break;
 238                 lwkt_relpooltoken(head);
 239         }
 240
 241         soclrstate(so, SS_ISCONNECTING | SS_ISDISCONNECTING | SS_ISCONFIRMING);
 242         sosetstate(so, SS_ISCONNECTED);
 243         if (head && (so->so_state & SS_INCOMP)) {
 244                 if ((so->so_options & SO_ACCEPTFILTER) != 0) {
 245                         so->so_upcall = head->so_accf->so_accept_filter->accf_callback;
 246                         so->so_upcallarg = head->so_accf->so_accept_filter_arg;
 247                         atomic_set_int(&so->so_rcv.ssb_flags, SSB_UPCALL);
 248                         so->so_options &= ~SO_ACCEPTFILTER;
 249                         so->so_upcall(so, so->so_upcallarg, 0);
 250                         lwkt_relpooltoken(head);
 251                         return;
 252                 }
 253
 254                 /*
 255                  * Listen socket are not per-cpu.
 256                  */
 257                 KKASSERT((so->so_state & (SS_COMP | SS_INCOMP)) == SS_INCOMP);
 258                 TAILQ_REMOVE(&head->so_incomp, so, so_list);
 259                 head->so_incqlen--;
 260                 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
 261                 head->so_qlen++;
 262                 sosetstate(so, SS_COMP);
 263                 soclrstate(so, SS_INCOMP);
 264
 265                 /*
 266                  * XXX head may be on a different protocol thread.
 267                  *     sorwakeup()->sowakeup() is hacked atm.
 268                  */
 269                 sorwakeup(head);
 270                 wakeup_one(&head->so_timeo);
 271         } else {
 272                 wakeup(&so->so_timeo);
 273                 sorwakeup(so);
 274                 sowwakeup(so);
 275         }
 276         if (head)
 277                 lwkt_relpooltoken(head);
 278 }
 279
 280 void
 281 soisdisconnecting(struct socket *so)
 282 {
 283         soclrstate(so, SS_ISCONNECTING);
 284         sosetstate(so, SS_ISDISCONNECTING | SS_CANTRCVMORE | SS_CANTSENDMORE);
 285         wakeup((caddr_t)&so->so_timeo);
 286         sowwakeup(so);
 287         sorwakeup(so);
 288 }
 289
 290 void
 291 soisdisconnected(struct socket *so)
 292 {
 293         soclrstate(so, SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING);
 294         sosetstate(so, SS_CANTRCVMORE | SS_CANTSENDMORE | SS_ISDISCONNECTED);
 295         wakeup((caddr_t)&so->so_timeo);
 296         sbdrop(&so->so_snd.sb, so->so_snd.ssb_cc);
 297         sowwakeup(so);
 298         sorwakeup(so);
 299 }
 300
 301 void
 302 soisreconnecting(struct socket *so)
 303 {
 304         soclrstate(so, SS_ISDISCONNECTING | SS_ISDISCONNECTED |
 305                        SS_CANTRCVMORE | SS_CANTSENDMORE);
 306         sosetstate(so, SS_ISCONNECTING);
 307 }
 308
 309 void
 310 soisreconnected(struct socket *so)
 311 {
 312         soclrstate(so, SS_ISDISCONNECTED | SS_CANTRCVMORE | SS_CANTSENDMORE);
 313         soisconnected(so);
 314 }
 315
 316 /*
 317  * Set or change the message port a socket receives commands on.
 318  *
 319  * XXX
 320  */
 321 void
 322 sosetport(struct socket *so, lwkt_port_t port)
 323 {
 324         so->so_port = port;
 325 }
 326
 327 /*
 328  * When an attempt at a new connection is noted on a socket
 329  * which accepts connections, sonewconn is called.  If the
 330  * connection is possible (subject to space constraints, etc.)
 331  * then we allocate a new structure, propoerly linked into the
 332  * data structure of the original socket, and return this.
 333  * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
 334  *
 335  * The new socket is returned with one ref and so_pcb assigned.
 336  * The reference is implied by so_pcb.
 337  */
 338 struct socket *
 339 sonewconn_faddr(struct socket *head, int connstatus,
 340     const struct sockaddr *faddr, boolean_t keep_ref)
 341 {
 342         struct socket *so;
 343         struct socket *sp;
 344         struct pru_attach_info ai;
 345
 346         if (head->so_qlen > 3 * head->so_qlimit / 2)
 347                 return (NULL);
 348         so = soalloc(1, head->so_proto);
 349         if (so == NULL)
 350                 return (NULL);
 351
 352         /*
 353          * Set the port prior to attaching the inpcb to the current
 354          * cpu's protocol thread (which should be the current thread
 355          * but might not be in all cases).  This serializes any pcb ops
 356          * which occur to our cpu allowing us to complete the attachment
 357          * without racing anything.
 358          */
 359         if (head->so_proto->pr_flags & PR_SYNC_PORT)
 360                 sosetport(so, &netisr_sync_port);
 361         else
 362                 sosetport(so, netisr_cpuport(mycpuid));
 363         if ((head->so_options & SO_ACCEPTFILTER) != 0)
 364                 connstatus = 0;
 365         so->so_head = head;
 366         so->so_type = head->so_type;
 367         so->so_options = head->so_options &~ SO_ACCEPTCONN;
 368         so->so_linger = head->so_linger;
 369
 370         /*
 371          * NOTE: Clearing NOFDREF implies referencing the so with
 372          *       soreference().
 373          */
 374         so->so_state = head->so_state | SS_NOFDREF | SS_ASSERTINPROG;
 375         so->so_cred = crhold(head->so_cred);
 376         ai.sb_rlimit = NULL;
 377         ai.p_ucred = NULL;
 378         ai.fd_rdir = NULL;              /* jail code cruft XXX JH */
 379
 380         /*
 381          * Reserve space and call pru_attach.  We can direct-call the
 382          * function since we're already in the protocol thread.
 383          */
 384         if (soreserve(so, head->so_snd.ssb_hiwat,
 385                       head->so_rcv.ssb_hiwat, NULL) ||
 386             so_pru_attach_direct(so, 0, &ai)) {
 387                 so->so_head = NULL;
 388                 soclrstate(so, SS_ASSERTINPROG);
 389                 sofree(so);             /* remove implied pcb ref */
 390                 return (NULL);
 391         }
 392         KKASSERT(((so->so_proto->pr_flags & PR_ASYNC_RCVD) == 0 &&
 393             so->so_refs == 2) ||        /* attach + our base ref */
 394            ((so->so_proto->pr_flags & PR_ASYNC_RCVD) &&
 395             so->so_refs == 3));         /* + async rcvd ref */
 396         if (keep_ref) {
 397                 /*
 398                  * Keep the reference; caller will free it.
 399                  */
 400         } else {
 401                 sofree(so);
 402         }
 403         KKASSERT(so->so_port != NULL);
 404         so->so_rcv.ssb_lowat = head->so_rcv.ssb_lowat;
 405         so->so_snd.ssb_lowat = head->so_snd.ssb_lowat;
 406         so->so_rcv.ssb_timeo = head->so_rcv.ssb_timeo;
 407         so->so_snd.ssb_timeo = head->so_snd.ssb_timeo;
 408
 409         if (head->so_rcv.ssb_flags & SSB_AUTOLOWAT)
 410                 so->so_rcv.ssb_flags |= SSB_AUTOLOWAT;
 411         else
 412                 so->so_rcv.ssb_flags &= ~SSB_AUTOLOWAT;
 413
 414         if (head->so_snd.ssb_flags & SSB_AUTOLOWAT)
 415                 so->so_snd.ssb_flags |= SSB_AUTOLOWAT;
 416         else
 417                 so->so_snd.ssb_flags &= ~SSB_AUTOLOWAT;
 418
 419         if (head->so_rcv.ssb_flags & SSB_AUTOSIZE)
 420                 so->so_rcv.ssb_flags |= SSB_AUTOSIZE;
 421         else
 422                 so->so_rcv.ssb_flags &= ~SSB_AUTOSIZE;
 423
 424         if (head->so_snd.ssb_flags & SSB_AUTOSIZE)
 425                 so->so_snd.ssb_flags |= SSB_AUTOSIZE;
 426         else
 427                 so->so_snd.ssb_flags &= ~SSB_AUTOSIZE;
 428
 429         /*
 430          * Save the faddr, if the information is provided and
 431          * the protocol can perform the saving opertation.
 432          */
 433         if (faddr != NULL && so->so_proto->pr_usrreqs->pru_savefaddr != NULL)
 434                 so->so_proto->pr_usrreqs->pru_savefaddr(so, faddr);
 435
 436         lwkt_getpooltoken(head);
 437         if (connstatus) {
 438                 KKASSERT((so->so_state & (SS_INCOMP | SS_COMP)) == 0);
 439                 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
 440                 head->so_qlen++;
 441                 /*
 442                  * Set connstatus within head token, so that the accepted
 443                  * socket will have connstatus (SS_ISCONNECTED) set.
 444                  */
 445                 sosetstate(so, SS_COMP | connstatus);
 446         } else {
 447                 if (head->so_incqlen > head->so_qlimit) {
 448                         sp = TAILQ_FIRST(&head->so_incomp);
 449                         KKASSERT((sp->so_state & (SS_INCOMP | SS_COMP)) ==
 450                             SS_INCOMP);
 451                         TAILQ_REMOVE(&head->so_incomp, sp, so_list);
 452                         head->so_incqlen--;
 453                         soclrstate(sp, SS_INCOMP);
 454                         soabort_async(sp, TRUE);
 455                 }
 456                 KKASSERT((so->so_state & (SS_INCOMP | SS_COMP)) == 0);
 457                 TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
 458                 head->so_incqlen++;
 459                 sosetstate(so, SS_INCOMP);
 460         }
 461         /*
 462          * Clear SS_ASSERTINPROG within head token, so that it will not
 463          * race against accept-close or abort for "synchronous" sockets,
 464          * e.g. unix socket, on other CPUs.
 465          */
 466         soclrstate(so, SS_ASSERTINPROG);
 467         lwkt_relpooltoken(head);
 468
 469         if (connstatus) {
 470                 /*
 471                  * XXX head may be on a different protocol thread.
 472                  *     sorwakeup()->sowakeup() is hacked atm.
 473                  */
 474                 logsowakeup(nconn_start);
 475                 sorwakeup(head);
 476                 logsowakeup(nconn_end);
 477
 478                 logsowakeup(nconn_wakeupstart);
 479                 wakeup((caddr_t)&head->so_timeo);
 480                 logsowakeup(nconn_wakeupend);
 481         }
 482         return (so);
 483 }
 484
 485 struct socket *
 486 sonewconn(struct socket *head, int connstatus)
 487 {
 488         return sonewconn_faddr(head, connstatus, NULL, FALSE /* don't ref */);
 489 }
 490
 491 /*
 492  * Socantsendmore indicates that no more data will be sent on the
 493  * socket; it would normally be applied to a socket when the user
 494  * informs the system that no more data is to be sent, by the protocol
 495  * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 496  * will be received, and will normally be applied to the socket by a
 497  * protocol when it detects that the peer will send no more data.
 498  * Data queued for reading in the socket may yet be read.
 499  */
 500 void
 501 socantsendmore(struct socket *so)
 502 {
 503         sosetstate(so, SS_CANTSENDMORE);
 504         sowwakeup(so);
 505 }
 506
 507 void
 508 socantrcvmore(struct socket *so)
 509 {
 510         sosetstate(so, SS_CANTRCVMORE);
 511         sorwakeup(so);
 512 }
 513
 514 /*
 515  * Wakeup processes waiting on a socket buffer.  Do asynchronous notification
 516  * via SIGIO if the socket has the SS_ASYNC flag set.
 517  *
 518  * For users waiting on send/recv try to avoid unnecessary context switch
 519  * thrashing.  Particularly for senders of large buffers (needs to be
 520  * extended to sel and aio? XXX)
 521  *
 522  * WARNING!  Can be called on a foreign socket from the wrong protocol
 523  *           thread.  aka is called on the 'head' listen socket when
 524  *           a new connection comes in.
 525  */
 526
 527 void
 528 sowakeup(struct socket *so, struct signalsockbuf *ssb)
 529 {
 530         uint32_t flags;
 531
 532         /*
 533          * Atomically check the flags.  When no special features are being
 534          * used, WAIT is clear, and WAKEUP is already set, we can simply
 535          * return.  The upcoming synchronous waiter will not block.
 536          */
 537         flags = atomic_fetchadd_int(&ssb->ssb_flags, 0);
 538         if ((flags & SSB_NOTIFY_MASK) == 0) {
 539                 if (flags & SSB_WAKEUP)
 540                         return;
 541         }
 542
 543         /*
 544          * Check conditions, set the WAKEUP flag, and clear and signal if
 545          * the WAIT flag is found to be set.  This interlocks against the
 546          * client side.
 547          */
 548         for (;;) {
 549                 long space;
 550
 551                 flags = ssb->ssb_flags;
 552                 cpu_ccfence();
 553                 if (ssb->ssb_flags & SSB_PREALLOC)
 554                         space = ssb_space_prealloc(ssb);
 555                 else
 556                         space = ssb_space(ssb);
 557
 558                 if ((ssb == &so->so_snd && space >= ssb->ssb_lowat) ||
 559                     (ssb == &so->so_rcv && ssb->ssb_cc >= ssb->ssb_lowat) ||
 560                     (ssb == &so->so_snd && (so->so_state & SS_CANTSENDMORE)) ||
 561                     (ssb == &so->so_rcv && (so->so_state & SS_CANTRCVMORE))
 562                 ) {
 563                         if (atomic_cmpset_int(&ssb->ssb_flags, flags,
 564                                           (flags | SSB_WAKEUP) & ~SSB_WAIT)) {
 565                                 if (flags & SSB_WAIT)
 566                                         wakeup(&ssb->ssb_cc);
 567                                 break;
 568                         }
 569                 } else {
 570                         break;
 571                 }
 572         }
 573
 574         /*
 575          * Misc other events
 576          */
 577         if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
 578                 pgsigio(so->so_sigio, SIGIO, 0);
 579         if (ssb->ssb_flags & SSB_UPCALL)
 580                 (*so->so_upcall)(so, so->so_upcallarg, M_NOWAIT);
 581         KNOTE(&ssb->ssb_kq.ki_note, 0);
 582
 583         /*
 584          * This is a bit of a hack.  Multiple threads can wind up scanning
 585          * ssb_mlist concurrently due to the fact that this function can be
 586          * called on a foreign socket, so we can't afford to block here.
 587          *
 588          * We need the pool token for (so) (likely the listne socket if
 589          * SSB_MEVENT is set) because the predicate function may have
 590          * to access the accept queue.
 591          */
 592         if (ssb->ssb_flags & SSB_MEVENT) {
 593                 struct netmsg_so_notify *msg, *nmsg;
 594
 595                 lwkt_getpooltoken(so);
 596                 TAILQ_FOREACH_MUTABLE(msg, &ssb->ssb_mlist, nm_list, nmsg) {
 597                         if (msg->nm_predicate(msg)) {
 598                                 TAILQ_REMOVE(&ssb->ssb_mlist, msg, nm_list);
 599                                 lwkt_replymsg(&msg->base.lmsg,
 600                                               msg->base.lmsg.ms_error);
 601                         }
 602                 }
 603                 if (TAILQ_EMPTY(&ssb->ssb_mlist))
 604                         atomic_clear_int(&ssb->ssb_flags, SSB_MEVENT);
 605                 lwkt_relpooltoken(so);
 606         }
 607 }
 608
 609 /*
 610  * Socket buffer (struct signalsockbuf) utility routines.
 611  *
 612  * Each socket contains two socket buffers: one for sending data and
 613  * one for receiving data.  Each buffer contains a queue of mbufs,
 614  * information about the number of mbufs and amount of data in the
 615  * queue, and other fields allowing kevent()/select()/poll() statements
 616  * and notification on data availability to be implemented.
 617  *
 618  * Data stored in a socket buffer is maintained as a list of records.
 619  * Each record is a list of mbufs chained together with the m_next
 620  * field.  Records are chained together with the m_nextpkt field. The upper
 621  * level routine soreceive() expects the following conventions to be
 622  * observed when placing information in the receive buffer:
 623  *
 624  * 1. If the protocol requires each message be preceded by the sender's
 625  *    name, then a record containing that name must be present before
 626  *    any associated data (mbuf's must be of type MT_SONAME).
 627  * 2. If the protocol supports the exchange of ``access rights'' (really
 628  *    just additional data associated with the message), and there are
 629  *    ``rights'' to be received, then a record containing this data
 630  *    should be present (mbuf's must be of type MT_RIGHTS).
 631  * 3. If a name or rights record exists, then it must be followed by
 632  *    a data record, perhaps of zero length.
 633  *
 634  * Before using a new socket structure it is first necessary to reserve
  635  * buffer space to the socket, by calling ssb_reserve().  This should commit
 636  * some of the available buffer space in the system buffer pool for the
 637  * socket (currently, it does nothing but enforce limits).  The space
 638  * should be released by calling ssb_release() when the socket is destroyed.
 639  */
 640 int
 641 soreserve(struct socket *so, u_long sndcc, u_long rcvcc, struct rlimit *rl)
 642 {
 643         if (so->so_snd.ssb_lowat == 0)
 644                 atomic_set_int(&so->so_snd.ssb_flags, SSB_AUTOLOWAT);
 645         if (ssb_reserve(&so->so_snd, sndcc, so, rl) == 0)
 646                 goto bad;
 647         if (ssb_reserve(&so->so_rcv, rcvcc, so, rl) == 0)
 648                 goto bad2;
 649         if (so->so_rcv.ssb_lowat == 0)
 650                 so->so_rcv.ssb_lowat = 1;
 651         if (so->so_snd.ssb_lowat == 0)
 652                 so->so_snd.ssb_lowat = MCLBYTES;
 653         if (so->so_snd.ssb_lowat > so->so_snd.ssb_hiwat)
 654                 so->so_snd.ssb_lowat = so->so_snd.ssb_hiwat;
 655         return (0);
 656 bad2:
 657         ssb_release(&so->so_snd, so);
 658 bad:
 659         return (ENOBUFS);
 660 }
 661
 662 static int
 663 sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS)
 664 {
 665         int error = 0;
 666         u_long old_sb_max = sb_max;
 667
 668         error = SYSCTL_OUT(req, arg1, sizeof(int));
 669         if (error || !req->newptr)
 670                 return (error);
 671         error = SYSCTL_IN(req, arg1, sizeof(int));
 672         if (error)
 673                 return (error);
 674         if (sb_max < MSIZE + MCLBYTES) {
 675                 sb_max = old_sb_max;
 676                 return (EINVAL);
 677         }
 678         sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
 679         return (0);
 680 }
 681
 682 /*
 683  * Allot mbufs to a signalsockbuf.
 684  *
 685  * Attempt to scale mbmax so that mbcnt doesn't become limiting
 686  * if buffering efficiency is near the normal case.
 687  *
 688  * sb_max only applies to user-sockets (where rl != NULL).  It does
 689  * not apply to kernel sockets or kernel-controlled sockets.  Note
 690  * that NFS overrides the sockbuf limits created when nfsd creates
 691  * a socket.
 692  */
 693 int
 694 ssb_reserve(struct signalsockbuf *ssb, u_long cc, struct socket *so,
 695             struct rlimit *rl)
 696 {
 697         /*
 698          * rl will only be NULL when we're in an interrupt (eg, in tcp_input)
 699          * or when called from netgraph (ie, ngd_attach)
 700          */
 701         if (rl && cc > sb_max_adj)
 702                 cc = sb_max_adj;
 703         if (!chgsbsize(so->so_cred->cr_uidinfo, &ssb->ssb_hiwat, cc,
 704                        rl ? rl->rlim_cur : RLIM_INFINITY)) {
 705                 return (0);
 706         }
 707         if (rl)
 708                 ssb->ssb_mbmax = min(cc * sb_efficiency, sb_max);
 709         else
 710                 ssb->ssb_mbmax = cc * sb_efficiency;
 711
 712         /*
 713          * AUTOLOWAT is set on send buffers and prevents large writes
 714          * from generating a huge number of context switches.
 715          */
 716         if (ssb->ssb_flags & SSB_AUTOLOWAT) {
 717                 ssb->ssb_lowat = ssb->ssb_hiwat / 4;
 718                 if (ssb->ssb_lowat < MCLBYTES)
 719                         ssb->ssb_lowat = MCLBYTES;
 720         }
 721         if (ssb->ssb_lowat > ssb->ssb_hiwat)
 722                 ssb->ssb_lowat = ssb->ssb_hiwat;
 723         return (1);
 724 }
 725
 726 /*
 727  * Free mbufs held by a socket, and reserved mbuf space.
 728  */
 729 void
 730 ssb_release(struct signalsockbuf *ssb, struct socket *so)
 731 {
 732         sbflush(&ssb->sb);
 733         (void)chgsbsize(so->so_cred->cr_uidinfo, &ssb->ssb_hiwat, 0,
 734             RLIM_INFINITY);
 735         ssb->ssb_mbmax = 0;
 736 }
 737
 738 /*
 739  * Some routines that return EOPNOTSUPP for entry points that are not
 740  * supported by a protocol.  Fill in as needed.
 741  */
 742 void
 743 pr_generic_notsupp(netmsg_t msg)
 744 {
 745         lwkt_replymsg(&msg->lmsg, EOPNOTSUPP);
 746 }
 747
 748 int
 749 pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
 750            struct mbuf *top, struct mbuf *control, int flags,
 751            struct thread *td)
 752 {
 753         if (top)
 754                 m_freem(top);
 755         if (control)
 756                 m_freem(control);
 757         return (EOPNOTSUPP);
 758 }
 759
 760 int
 761 pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
 762                       struct uio *uio, struct sockbuf *sio,
 763                       struct mbuf **controlp, int *flagsp)
 764 {
 765         return (EOPNOTSUPP);
 766 }
 767
 768 /*
 769  * This isn't really a ``null'' operation, but it's the default one
 770  * and doesn't do anything destructive.
 771  */
 772 void
 773 pru_sense_null(netmsg_t msg)
 774 {
 775         msg->sense.nm_stat->st_blksize = msg->base.nm_so->so_snd.ssb_hiwat;
 776         lwkt_replymsg(&msg->lmsg, 0);
 777 }
 778
 779 /*
 780  * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.  Callers
 781  * of this routine assume that it always succeeds, so we have to use a
 782  * blockable allocation even though we might be called from a critical thread.
 783  */
 784 struct sockaddr *
 785 dup_sockaddr(const struct sockaddr *sa)
 786 {
 787         struct sockaddr *sa2;
 788
 789         sa2 = kmalloc(sa->sa_len, M_SONAME, M_INTWAIT);
 790         bcopy(sa, sa2, sa->sa_len);
 791         return (sa2);
 792 }
 793
 794 /*
 795  * Create an external-format (``xsocket'') structure using the information
 796  * in the kernel-format socket structure pointed to by so.  This is done
 797  * to reduce the spew of irrelevant information over this interface,
 798  * to isolate user code from changes in the kernel structure, and
 799  * potentially to provide information-hiding if we decide that
 800  * some of this information should be hidden from users.
 801  */
 802 void
 803 sotoxsocket(struct socket *so, struct xsocket *xso)
 804 {
 805         xso->xso_len = sizeof *xso;
 806         xso->xso_so = so;
 807         xso->so_type = so->so_type;
 808         xso->so_options = so->so_options;
 809         xso->so_linger = so->so_linger;
 810         xso->so_state = so->so_state;
 811         xso->so_pcb = so->so_pcb;
 812         xso->xso_protocol = so->so_proto->pr_protocol;
 813         xso->xso_family = so->so_proto->pr_domain->dom_family;
 814         xso->so_qlen = so->so_qlen;
 815         xso->so_incqlen = so->so_incqlen;
 816         xso->so_qlimit = so->so_qlimit;
 817         xso->so_timeo = so->so_timeo;
 818         xso->so_error = so->so_error;
 819         xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
 820         xso->so_oobmark = so->so_oobmark;
 821         ssbtoxsockbuf(&so->so_snd, &xso->so_snd);
 822         ssbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
 823         xso->so_uid = so->so_cred->cr_uid;
 824 }
 825
/*
 * Here is the definition of some of the basic objects in the kern.ipc
 * branch of the MIB.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/*
 * This takes the place of kern.maxsockbuf, which moved to kern.ipc.
 *
 * NOTE! sb_max only applies to user-created socket buffers.
 *
 * NOTE(review): sb_max and sb_efficiency are declared u_long in this
 * file but are exported below as int-typed OIDs (CTLTYPE_INT "I" and
 * SYSCTL_INT respectively) -- confirm the width mismatch is intended.
 */
static int dummy;
SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");
/* kern.ipc.maxsockbuf: read/write, validated by sysctl_handle_sb_max(). */
SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_INT|CTLFLAG_RW,
    &sb_max, 0, sysctl_handle_sb_max, "I", "Maximum socket buffer size");
/* kern.ipc.maxsockets: read-only view of maxsockets. */
SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD,
    &maxsockets, 0, "Maximum number of sockets available");
/* kern.ipc.sockbuf_waste_factor: the multiplier used by ssb_reserve(). */
SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
    &sb_efficiency, 0,
    "Socket buffer limit scaler");
 846
 847 /*
 848  * Initialize maxsockets
 849  */
 850 static void
 851 init_maxsockets(void *ignored)
 852 {
 853     TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
 854     maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
 855 }
 856 SYSINIT(param, SI_BOOT1_TUNABLES, SI_ORDER_ANY,
 857         init_maxsockets, NULL);
 858