net/ipv4/tcp.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp.c,v 1.173 2000/08/15 20:15:23 davem Exp $
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while
  26  *                                      sk->inuse=1 and was trying to connect
  27  *                                      (tcp_err()).
  28  *              Alan Cox        :       All icmp error handling was broken
  29  *                                      pointers passed where wrong and the
  30  *                                      socket was looked up backwards. Nobody
  31  *                                      tested any icmp error code obviously.
  32  *              Alan Cox        :       tcp_err() now handled properly. It
  33  *                                      wakes people on errors. poll
  34  *                                      behaves and the icmp error race
  35  *                                      has gone by moving it into sock.c
  36  *              Alan Cox        :       tcp_send_reset() fixed to work for
  37  *                                      everything not just packets for
  38  *                                      unknown sockets.
  39  *              Alan Cox        :       tcp option processing.
  40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
  41  *                                      syn rule wrong]
  42  *              Herp Rosmanith  :       More reset fixes
  43  *              Alan Cox        :       No longer acks invalid rst frames.
  44  *                                      Acking any kind of RST is right out.
  45  *              Alan Cox        :       Sets an ignore me flag on an rst
  46  *                                      receive otherwise odd bits of prattle
  47  *                                      escape still
  48  *              Alan Cox        :       Fixed another acking RST frame bug.
  49  *                                      Should stop LAN workplace lockups.
  50  *              Alan Cox        :       Some tidyups using the new skb list
  51  *                                      facilities
  52  *              Alan Cox        :       sk->keepopen now seems to work
  53  *              Alan Cox        :       Pulls options out correctly on accepts
  54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56  *                                      bit to skb ops.
  57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
  58  *                                      nasty.
  59  *              Alan Cox        :       Added some better commenting, as the
  60  *                                      tcp is hard to follow
  61  *              Alan Cox        :       Removed incorrect check for 20 * psh
  62  *      Michael O'Reilly        :       ack < copied bug fix.
  63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64  *              Alan Cox        :       FIN with no memory -> CRASH
  65  *              Alan Cox        :       Added socket option proto entries.
  66  *                                      Also added awareness of them to accept.
  67  *              Alan Cox        :       Added TCP options (SOL_TCP)
  68  *              Alan Cox        :       Switched wakeup calls to callbacks,
  69  *                                      so the kernel can layer network
  70  *                                      sockets.
  71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73  *              Alan Cox        :       RST frames sent on unsynchronised
  74  *                                      state ack error.
  75  *              Alan Cox        :       Put in missing check for SYN bit.
  76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
  77  *                                      window non shrink trick.
  78  *              Alan Cox        :       Added a couple of small NET2E timer
  79  *                                      fixes
  80  *              Charles Hedrick :       TCP fixes
  81  *              Toomas Tamm     :       TCP window fixes
  82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83  *              Charles Hedrick :       Rewrote most of it to actually work
  84  *              Linus           :       Rewrote tcp_read() and URG handling
  85  *                                      completely
  86  *              Gerhard Koerting:       Fixed some missing timer handling
  87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88  *              Gerhard Koerting:       PC/TCP workarounds
  89  *              Adam Caldwell   :       Assorted timer/timing errors
  90  *              Matthew Dillon  :       Fixed another RST bug
  91  *              Alan Cox        :       Move to kernel side addressing changes.
  92  *              Alan Cox        :       Beginning work on TCP fastpathing
  93  *                                      (not yet usable)
  94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95  *              Alan Cox        :       TCP fast path debugging
  96  *              Alan Cox        :       Window clamping
  97  *              Michael Riepe   :       Bug in tcp_check()
  98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  99  *              Matt Dillon     :       Yet more small nasties remove from the
 100  *                                      TCP code (Be very nice to this man if
 101  *                                      tcp finally works 100%) 8)
 102  *              Alan Cox        :       BSD accept semantics.
 103  *              Alan Cox        :       Reset on closedown bug.
 104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105  *              Michael Pall    :       Handle poll() after URG properly in
 106  *                                      all cases.
 107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
 108  *                                      (multi URG PUSH broke rlogin).
 109  *              Michael Pall    :       Fix the multi URG PUSH problem in
 110  *                                      tcp_readable(), poll() after URG
 111  *                                      works now.
 112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
 113  *                                      BSD api.
 114  *              Alan Cox        :       Changed the semantics of sk->socket to
 115  *                                      fix a race and a signal problem with
 116  *                                      accept() and async I/O.
 117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120  *                                      clients/servers which listen in on
 121  *                                      fixed ports.
 122  *              Alan Cox        :       Cleaned the above up and shrank it to
 123  *                                      a sensible code size.
 124  *              Alan Cox        :       Self connect lockup fix.
 125  *              Alan Cox        :       No connect to multicast.
 126  *              Ross Biro       :       Close unaccepted children on master
 127  *                                      socket close.
 128  *              Alan Cox        :       Reset tracing code.
 129  *              Alan Cox        :       Spurious resets on shutdown.
 130  *              Alan Cox        :       Giant 15 minute/60 second timer error
 131  *              Alan Cox        :       Small whoops in polling before an
 132  *                                      accept.
 133  *              Alan Cox        :       Kept the state trace facility since
 134  *                                      it's handy for debugging.
 135  *              Alan Cox        :       More reset handler fixes.
 136  *              Alan Cox        :       Started rewriting the code based on
 137  *                                      the RFC's for other useful protocol
 138  *                                      references see: Comer, KA9Q NOS, and
 139  *                                      for a reference on the difference
 140  *                                      between specifications and how BSD
 141  *                                      works see the 4.4lite source.
 142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
 143  *                                      close.
 144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146  *              Alan Cox        :       Reimplemented timers as per the RFC
 147  *                                      and using multiple timers for sanity.
 148  *              Alan Cox        :       Small bug fixes, and a lot of new
 149  *                                      comments.
 150  *              Alan Cox        :       Fixed dual reader crash by locking
 151  *                                      the buffers (much like datagram.c)
 152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153  *                                      now gets fed up of retrying without
 154  *                                      (even a no space) answer.
 155  *              Alan Cox        :       Extracted closing code better
 156  *              Alan Cox        :       Fixed the closing state machine to
 157  *                                      resemble the RFC.
 158  *              Alan Cox        :       More 'per spec' fixes.
 159  *              Jorge Cwik      :       Even faster checksumming.
 160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161  *                                      only frames. At least one pc tcp stack
 162  *                                      generates them.
 163  *              Alan Cox        :       Cache last socket.
 164  *              Alan Cox        :       Per route irtt.
 165  *              Matt Day        :       poll()->select() match BSD precisely on error
 166  *              Alan Cox        :       New buffers
 167  *              Marc Tamsky     :       Various sk->prot->retransmits and
 168  *                                      sk->retransmits misupdating fixed.
 169  *                                      Fixed tcp_write_timeout: stuck close,
 170  *                                      and TCP syn retries gets used now.
 171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172  *                                      ack if state is TCP_CLOSED.
 173  *              Alan Cox        :       Look up device on a retransmit - routes may
 174  *                                      change. Doesn't yet cope with MSS shrink right
 175  *                                      but its a start!
 176  *              Marc Tamsky     :       Closing in closing fixes.
 177  *              Mike Shaver     :       RFC1122 verifications.
 178  *              Alan Cox        :       rcv_saddr errors.
 179  *              Alan Cox        :       Block double connect().
 180  *              Alan Cox        :       Small hooks for enSKIP.
 181  *              Alexey Kuznetsov:       Path MTU discovery.
 182  *              Alan Cox        :       Support soft errors.
 183  *              Alan Cox        :       Fix MTU discovery pathological case
 184  *                                      when the remote claims no mtu!
 185  *              Marc Tamsky     :       TCP_CLOSE fix.
 186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
 187  *                                      window but wrong (fixes NT lpd problems)
 188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
 189  *              Joerg Reuter    :       No modification of locked buffers in
 190  *                                      tcp_do_retransmit()
 191  *              Eric Schenk     :       Changed receiver side silly window
 192  *                                      avoidance algorithm to BSD style
 193  *                                      algorithm. This doubles throughput
 194  *                                      against machines running Solaris,
 195  *                                      and seems to result in general
 196  *                                      improvement.
 197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
 198  *      Willy Konynenberg       :       Transparent proxying support.
 199  *      Mike McLagan            :       Routing by source
 200  *              Keith Owens     :       Do proper merging with partial SKB's in
 201  *                                      tcp_do_sendmsg to avoid burstiness.
 202  *              Eric Schenk     :       Fix fast close down bug with
 203  *                                      shutdown() followed by close().
 204  *              Andi Kleen      :       Make poll agree with SIGIO
 205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
 206  *                                      lingertime == 0 (RFC 793 ABORT Call)
 207  *
 208  *              This program is free software; you can redistribute it and/or
 209  *              modify it under the terms of the GNU General Public License
 210  *              as published by the Free Software Foundation; either version
 211  *              2 of the License, or(at your option) any later version.
 212  *
 213  * Description of States:
 214  *
 215  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 216  *
 217  *      TCP_SYN_RECV            received a connection request, sent ack,
 218  *                              waiting for final ack in three-way handshake.
 219  *
 220  *      TCP_ESTABLISHED         connection established
 221  *
 222  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 223  *                              transmission of remaining buffered data
 224  *
 225  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 226  *                              to shutdown
 227  *
 228  *      TCP_CLOSING             both sides have shutdown but we still have
 229  *                              data we have to finish sending
 230  *
 231  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 232  *                              closed, can only be entered from FIN_WAIT2
 233  *                              or CLOSING.  Required because the other end
 234  *                              may not have gotten our last ACK causing it
 235  *                              to retransmit the data packet (which we ignore)
 236  *
 237  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 238  *                              us to finish writing our data and to shutdown
 239  *                              (we have to close() to move on to LAST_ACK)
 240  *
 241  *      TCP_LAST_ACK            out side has shutdown after remote has
 242  *                              shutdown.  There may still be data in our
 243  *                              buffer that we have to finish sending
 244  *
 245  *      TCP_CLOSE               socket is finished
 246  */
 247
 248 /*
 249  * RFC1122 status:
 250  * NOTE: I'm not going to be doing comments in the code for this one except
 251  * for violations and the like.  tcp.c is just too big... If I say something
 252  * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 253  * with Alan. -- MS 950903
 254  * [Note: Most of the TCP code has been rewriten/redesigned since this
 255  *  RFC1122 check. It is probably not correct anymore. It should be redone
 256  *  before 2.2. -AK]
 257  *
 258  * Use of PSH (4.2.2.2)
 259  *   MAY aggregate data sent without the PSH flag. (does)
 260  *   MAY queue data received without the PSH flag. (does)
 261  *   SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 262  *   MAY implement PSH on send calls. (doesn't, thus:)
 263  *     MUST NOT buffer data indefinitely (doesn't [1 second])
 264  *     MUST set PSH on last segment (does)
 265  *   MAY pass received PSH to application layer (doesn't)
 266  *   SHOULD send maximum-sized segment whenever possible. (almost always does)
 267  *
 268  * Window Size (4.2.2.3, 4.2.2.16)
 269  *   MUST treat window size as an unsigned number (does)
 270  *   SHOULD treat window size as a 32-bit number (does not)
 271  *   MUST NOT shrink window once it is offered (does not normally)
 272  *
 273  * Urgent Pointer (4.2.2.4)
 274  * **MUST point urgent pointer to last byte of urgent data (not right
 275  *     after). (doesn't, to be like BSD. That's configurable, but defaults
 276  *      to off)
 277  *   MUST inform application layer asynchronously of incoming urgent
 278  *     data. (does)
 279  *   MUST provide application with means of determining the amount of
 280  *     urgent data pending. (does)
 281  * **MUST support urgent data sequence of arbitrary length. (doesn't, but
 282  *   it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 283  *      [Follows BSD 1 byte of urgent data]
 284  *
 285  * TCP Options (4.2.2.5)
 286  *   MUST be able to receive TCP options in any segment. (does)
 287  *   MUST ignore unsupported options (does)
 288  *
 289  * Maximum Segment Size Option (4.2.2.6)
 290  *   MUST implement both sending and receiving MSS. (does, but currently
 291  *      only uses the smaller of both of them)
 292  *   SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
 293  *     it always). (does, even when MSS == 536, which is legal)
 294  *   MUST assume MSS == 536 if no MSS received at connection setup (does)
 295  *   MUST calculate "effective send MSS" correctly:
 296  *     min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 297  *     (does - but allows operator override)
 298  *
 299  * TCP Checksum (4.2.2.7)
 300  *   MUST generate and check TCP checksum. (does)
 301  *
 302  * Initial Sequence Number Selection (4.2.2.8)
 303  *   MUST use the RFC 793 clock selection mechanism.  (doesn't, but it's
 304  *     OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 305  *     necessary for 10Mbps networks - and harder than BSD to spoof!
 306  *     With syncookies we don't)
 307  *
 308  * Simultaneous Open Attempts (4.2.2.10)
 309  *   MUST support simultaneous open attempts (does)
 310  *
 311  * Recovery from Old Duplicate SYN (4.2.2.11)
 312  *   MUST keep track of active vs. passive open (does)
 313  *
 314  * RST segment (4.2.2.12)
 315  *   SHOULD allow an RST segment to contain data (does, but doesn't do
 316  *     anything with it, which is standard)
 317  *
 318  * Closing a Connection (4.2.2.13)
 319  *   MUST inform application of whether connection was closed by RST or
 320  *     normal close. (does)
 321  *   MAY allow "half-duplex" close (treat connection as closed for the
 322  *     local app, even before handshake is done). (does)
 323  *   MUST linger in TIME_WAIT for 2 * MSL (does)
 324  *
 325  * Retransmission Timeout (4.2.2.15)
 326  *   MUST implement Jacobson's slow start and congestion avoidance
 327  *     stuff. (does)
 328  *
 329  * Probing Zero Windows (4.2.2.17)
 330  *   MUST support probing of zero windows. (does)
 331  *   MAY keep offered window closed indefinitely. (does)
 332  *   MUST allow remote window to stay closed indefinitely. (does)
 333  *
 334  * Passive Open Calls (4.2.2.18)
 335  *   MUST NOT let new passive open affect other connections. (doesn't)
 336  *   MUST support passive opens (LISTENs) concurrently. (does)
 337  *
 338  * Time to Live (4.2.2.19)
 339  *   MUST make TCP TTL configurable. (does - IP_TTL option)
 340  *
 341  * Event Processing (4.2.2.20)
 342  *   SHOULD queue out-of-order segments. (does)
 343  *   MUST aggregate ACK segments whenever possible. (does but badly)
 344  *
 345  * Retransmission Timeout Calculation (4.2.3.1)
 346  *   MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 347  *     calculation. (does, or at least explains them in the comments 8*b)
 348  *  SHOULD initialize RTO to 0 and RTT to 3. (does)
 349  *
 350  * When to Send an ACK Segment (4.2.3.2)
 351  *   SHOULD implement delayed ACK. (does)
 352  *   MUST keep ACK delay < 0.5 sec. (does)
 353  *
 354  * When to Send a Window Update (4.2.3.3)
 355  *   MUST implement receiver-side SWS. (does)
 356  *
 357  * When to Send Data (4.2.3.4)
 358  *   MUST implement sender-side SWS. (does)
 359  *   SHOULD implement Nagle algorithm. (does)
 360  *
 361  * TCP Connection Failures (4.2.3.5)
 362  *  MUST handle excessive retransmissions "properly" (see the RFC). (does)
 363  *   SHOULD inform application layer of soft errors. (does)
 364  *
 365  * TCP Keep-Alives (4.2.3.6)
 366  *   MAY provide keep-alives. (does)
 367  *   MUST make keep-alives configurable on a per-connection basis. (does)
 368  *   MUST default to no keep-alives. (does)
 369  *   MUST make keep-alive interval configurable. (does)
 370  *   MUST make default keep-alive interval > 2 hours. (does)
 371  *   MUST NOT interpret failure to ACK keep-alive packet as dead
 372  *     connection. (doesn't)
 373  *   SHOULD send keep-alive with no data. (does)
 374  *
 375  * TCP Multihoming (4.2.3.7)
 376  *   MUST get source address from IP layer before sending first
 377  *     SYN. (does)
 378  *   MUST use same local address for all segments of a connection. (does)
 379  *
 380  * IP Options (4.2.3.8)
 381  *   MUST ignore unsupported IP options. (does)
 382  *   MAY support Time Stamp and Record Route. (does)
 383  *   MUST allow application to specify a source route. (does)
 384  *   MUST allow received Source Route option to set route for all future
 385  *     segments on this connection. (does not (security issues))
 386  *
 387  * ICMP messages (4.2.3.9)
 388  *   MUST act on ICMP errors. (does)
 389  *   MUST slow transmission upon receipt of a Source Quench. (doesn't anymore
 390  *   because that is deprecated now by the IETF, can be turned on)
 391  *   MUST NOT abort connection upon receipt of soft Destination
 392  *     Unreachables (0, 1, 5), Time Exceededs and Parameter
 393  *     Problems. (doesn't)
 394  *   SHOULD report soft Destination Unreachables etc. to the
 395  *     application. (does, except during SYN_RECV and may drop messages
 396  *     in some rare cases before accept() - ICMP is unreliable)
 397  *   SHOULD abort connection upon receipt of hard Destination Unreachable
 398  *     messages (2, 3, 4). (does, but see above)
 399  *
 400  * Remote Address Validation (4.2.3.10)
 401  *   MUST reject as an error OPEN for invalid remote IP address. (does)
 402  *   MUST ignore SYN with invalid source address. (does)
 403  *   MUST silently discard incoming SYN for broadcast/multicast
 404  *     address. (does)
 405  *
 406  * Asynchronous Reports (4.2.4.1)
 407  * MUST provide mechanism for reporting soft errors to application
 408  *     layer. (does)
 409  *
 410  * Type of Service (4.2.4.2)
 411  *   MUST allow application layer to set Type of Service. (does IP_TOS)
 412  *
 413  * (Whew. -- MS 950903)
 414  * (Updated by AK, but not complete yet.)
 415  **/
 416
 417 #include <linux/config.h>
 418 #include <linux/types.h>
 419 #include <linux/fcntl.h>
 420 #include <linux/poll.h>
 421 #include <linux/init.h>
 422 #include <linux/smp_lock.h>
 423
 424 #include <net/icmp.h>
 425 #include <net/tcp.h>
 426
 427 #include <asm/uaccess.h>
 428
 429 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 430
 431 struct tcp_mib  tcp_statistics[NR_CPUS*2];
 432
 433 kmem_cache_t *tcp_openreq_cachep;
 434 kmem_cache_t *tcp_bucket_cachep;
 435 kmem_cache_t *tcp_timewait_cachep;
 436
 437 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
 438
 439 int sysctl_tcp_mem[3] = { 0, };
 440 int sysctl_tcp_wmem[3] = { 4*1024, 16*1024, 128*1024 };
 441 int sysctl_tcp_rmem[3] = { 4*1024, 87380, 87380*2 };
 442
 443 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
 444 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
 445
 446 /* Pressure flag: try to collapse.
 447  * Technical note: it is used by multiple contexts non atomically.
 448  * All the tcp_mem_schedule() is of this nature: accounting
 449  * is strict, actions are advisory and have some latency. */
 450 int tcp_memory_pressure;
 451
 452 #define TCP_PAGES(amt) (((amt)+TCP_MEM_QUANTUM-1)/TCP_MEM_QUANTUM)
 453
 454 int tcp_mem_schedule(struct sock *sk, int size, int kind)
 455 {
 456         int amt = TCP_PAGES(size);
 457
 458         sk->forward_alloc += amt*TCP_MEM_QUANTUM;
 459         atomic_add(amt, &tcp_memory_allocated);
 460
 461         /* Under limit. */
 462         if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
 463                 if (tcp_memory_pressure)
 464                         tcp_memory_pressure = 0;
 465                 return 1;
 466         }
 467
 468         /* Over hard limit. */
 469         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
 470                 tcp_enter_memory_pressure();
 471                 goto suppress_allocation;
 472         }
 473
 474         /* Under pressure. */
 475         if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
 476                 tcp_enter_memory_pressure();
 477
 478         if (kind) {
 479                 if (atomic_read(&sk->rmem_alloc) < sysctl_tcp_rmem[0])
 480                         return 1;
 481         } else {
 482                 if (sk->wmem_queued < sysctl_tcp_wmem[0])
 483                         return 1;
 484         }
 485
 486         if (!tcp_memory_pressure ||
 487             sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated)
 488             * TCP_PAGES(sk->wmem_queued+atomic_read(&sk->rmem_alloc)+
 489                         sk->forward_alloc))
 490                 return 1;
 491
 492 suppress_allocation:
 493
 494         if (kind == 0) {
 495                 tcp_moderate_sndbuf(sk);
 496
 497                 /* Fail only if socket is _under_ its sndbuf.
 498                  * In this case we cannot block, so that we have to fail.
 499                  */
 500                 if (sk->wmem_queued+size >= sk->sndbuf)
 501                         return 1;
 502         }
 503
 504         /* Alas. Undo changes. */
 505         sk->forward_alloc -= amt*TCP_MEM_QUANTUM;
 506         atomic_sub(amt, &tcp_memory_allocated);
 507         return 0;
 508 }
 509
 510 void __tcp_mem_reclaim(struct sock *sk)
 511 {
 512         if (sk->forward_alloc >= TCP_MEM_QUANTUM) {
 513                 atomic_sub(sk->forward_alloc/TCP_MEM_QUANTUM, &tcp_memory_allocated);
 514                 sk->forward_alloc &= (TCP_MEM_QUANTUM-1);
 515                 if (tcp_memory_pressure &&
 516                     atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
 517                         tcp_memory_pressure = 0;
 518         }
 519 }
 520
 521 void tcp_rfree(struct sk_buff *skb)
 522 {
 523         struct sock *sk = skb->sk;
 524
 525         atomic_sub(skb->truesize, &sk->rmem_alloc);
 526         sk->forward_alloc += skb->truesize;
 527 }
 528
 529 /*
 530  * LISTEN is a special case for poll..
 531  */
 532 static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
 533 {
 534         return sk->tp_pinfo.af_tcp.accept_queue ? (POLLIN | POLLRDNORM) : 0;
 535 }
 536
 537 /*
 538  *      Wait for a TCP event.
 539  *
 540  *      Note that we don't need to lock the socket, as the upper poll layers
 541  *      take care of normal races (between the test and the event) and we don't
 542  *      go look at any of the socket buffers directly.
 543  */
 544 unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
 545 {
 546         unsigned int mask;
 547         struct sock *sk = sock->sk;
 548         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 549
 550         poll_wait(file, sk->sleep, wait);
 551         if (sk->state == TCP_LISTEN)
 552                 return tcp_listen_poll(sk, wait);
 553
 554         /* Socket is not locked. We are protected from async events
 555            by poll logic and correct handling of state changes
 556            made by another threads is impossible in any case.
 557          */
 558
 559         mask = 0;
 560         if (sk->err)
 561                 mask = POLLERR;
 562
 563         /*
 564          * POLLHUP is certainly not done right. But poll() doesn't
 565          * have a notion of HUP in just one direction, and for a
 566          * socket the read side is more interesting.
 567          *
 568          * Some poll() documentation says that POLLHUP is incompatible
 569          * with the POLLOUT/POLLWR flags, so somebody should check this
 570          * all. But careful, it tends to be safer to return too many
 571          * bits than too few, and you can easily break real applications
 572          * if you don't tell them that something has hung up!
 573          *
 574          * Check-me.
 575          *
 576          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
 577          * our fs/select.c). It means that after we received EOF,
 578          * poll always returns immediately, making impossible poll() on write()
 579          * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
 580          * if and only if shutdown has been made in both directions.
 581          * Actually, it is interesting to look how Solaris and DUX
 582          * solve this dilemma. I would prefer, if PULLHUP were maskable,
 583          * then we could set it on SND_SHUTDOWN. BTW examples given
 584          * in Stevens' books assume exactly this behaviour, it explains
 585          * why PULLHUP is incompatible with POLLOUT.    --ANK
 586          *
 587          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
 588          * blocking on fresh not-connected or disconnected socket. --ANK
 589          */
 590         if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE)
 591                 mask |= POLLHUP;
 592         if (sk->shutdown & RCV_SHUTDOWN)
 593                 mask |= POLLIN | POLLRDNORM;
 594
 595         /* Connected? */
 596         if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
 597                 /* Potential race condition. If read of tp below will
 598                  * escape above sk->state, we can be illegally awaken
 599                  * in SYN_* states. */
 600                 if ((tp->rcv_nxt != tp->copied_seq) &&
 601                     (tp->urg_seq != tp->copied_seq ||
 602                      tp->rcv_nxt != tp->copied_seq+1 ||
 603                      sk->urginline || !tp->urg_data))
 604                         mask |= POLLIN | POLLRDNORM;
 605
 606                 if (!(sk->shutdown & SEND_SHUTDOWN)) {
 607                         if (tcp_wspace(sk) >= tcp_min_write_space(sk)) {
 608                                 mask |= POLLOUT | POLLWRNORM;
 609                         } else {  /* send SIGIO later */
 610                                 set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
 611                                 set_bit(SOCK_NOSPACE, &sk->socket->flags);
 612
 613                                 /* Race breaker. If space is freed after
 614                                  * wspace test but before the flags are set,
 615                                  * IO signal will be lost.
 616                                  */
 617                                 if (tcp_wspace(sk) >= tcp_min_write_space(sk))
 618                                         mask |= POLLOUT | POLLWRNORM;
 619                         }
 620                 }
 621
 622                 if (tp->urg_data & TCP_URG_VALID)
 623                         mask |= POLLPRI;
 624         }
 625         return mask;
 626 }
 627
 628 /*
 629  *      TCP socket write_space callback. Not used.
 630  */
 631 void tcp_write_space(struct sock *sk)
 632 {
 633 }
 634
 635 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 636 {
 637         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 638         int answ;
 639
 640         switch(cmd) {
 641         case SIOCINQ:
 642                 if (sk->state == TCP_LISTEN)
 643                         return(-EINVAL);
 644
 645                 lock_sock(sk);
 646                 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
 647                         answ = 0;
 648                 else if (sk->urginline || !tp->urg_data ||
 649                          before(tp->urg_seq,tp->copied_seq) ||
 650                          !before(tp->urg_seq,tp->rcv_nxt)) {
 651                         answ = tp->rcv_nxt - tp->copied_seq;
 652
 653                         /* Subtract 1, if FIN is in queue. */
 654                         if (answ && !skb_queue_empty(&sk->receive_queue))
 655                                 answ -= ((struct sk_buff*)sk->receive_queue.prev)->h.th->fin;
 656                 } else
 657                         answ = tp->urg_seq - tp->copied_seq;
 658                 release_sock(sk);
 659                 break;
 660         case SIOCATMARK:
 661                 {
 662                         answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
 663                         break;
 664                 }
 665         case SIOCOUTQ:
 666                 if (sk->state == TCP_LISTEN)
 667                         return(-EINVAL);
 668
 669                 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
 670                         answ = 0;
 671                 else
 672                         answ = tp->write_seq - tp->snd_una;
 673                 break;
 674         default:
 675                 return(-ENOIOCTLCMD);
 676         };
 677
 678         return put_user(answ, (int *)arg);
 679 }
 680
 681
 682 int tcp_listen_start(struct sock *sk)
 683 {
 684         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 685         struct tcp_listen_opt *lopt;
 686
 687         sk->max_ack_backlog = 0;
 688         sk->ack_backlog = 0;
 689         tp->accept_queue = tp->accept_queue_tail = NULL;
 690         tp->syn_wait_lock = RW_LOCK_UNLOCKED;
 691
 692         lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
 693         if (!lopt)
 694                 return -ENOMEM;
 695
 696         memset(lopt, 0, sizeof(struct tcp_listen_opt));
 697         for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
 698                 if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog)
 699                         break;
 700
 701         write_lock_bh(&tp->syn_wait_lock);
 702         tp->listen_opt = lopt;
 703         write_unlock_bh(&tp->syn_wait_lock);
 704
 705         /* There is race window here: we announce ourselves listening,
 706          * but this transition is still not validated by get_port().
 707          * It is OK, because this socket enters to hash table only
 708          * after validation is complete.
 709          */
 710         sk->state = TCP_LISTEN;
 711         if (sk->prot->get_port(sk, sk->num) == 0) {
 712                 sk->sport = htons(sk->num);
 713
 714                 sk_dst_reset(sk);
 715                 sk->prot->hash(sk);
 716
 717                 return 0;
 718         }
 719
 720         sk->state = TCP_CLOSE;
 721         write_lock_bh(&tp->syn_wait_lock);
 722         tp->listen_opt = NULL;
 723         write_unlock_bh(&tp->syn_wait_lock);
 724         kfree(lopt);
 725         return -EADDRINUSE;
 726 }
 727
 728 /*
 729  *      This routine closes sockets which have been at least partially
 730  *      opened, but not yet accepted.
 731  */
 732
 733 static void tcp_listen_stop (struct sock *sk)
 734 {
 735         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 736         struct tcp_listen_opt *lopt = tp->listen_opt;
 737         struct open_request *acc_req = tp->accept_queue;
 738         struct open_request *req;
 739         int i;
 740
 741         tcp_delete_keepalive_timer(sk);
 742
 743         /* make all the listen_opt local to us */
 744         write_lock_bh(&tp->syn_wait_lock);
 745         tp->listen_opt =NULL;
 746         write_unlock_bh(&tp->syn_wait_lock);
 747         tp->accept_queue = tp->accept_queue_tail = NULL;
 748
 749         if (lopt->qlen) {
 750                 for (i=0; i<TCP_SYNQ_HSIZE; i++) {
 751                         while ((req = lopt->syn_table[i]) != NULL) {
 752                                 lopt->syn_table[i] = req->dl_next;
 753                                 lopt->qlen--;
 754                                 tcp_openreq_free(req);
 755
 756                 /* Following specs, it would be better either to send FIN
 757                  * (and enter FIN-WAIT-1, it is normal close)
 758                  * or to send active reset (abort).
 759                  * Certainly, it is pretty dangerous while synflood, but it is
 760                  * bad justification for our negligence 8)
 761                  * To be honest, we are not able to make either
 762                  * of the variants now.                 --ANK
 763                  */
 764                         }
 765                 }
 766         }
 767         BUG_TRAP(lopt->qlen == 0);
 768
 769         kfree(lopt);
 770
 771         while ((req=acc_req) != NULL) {
 772                 struct sock *child = req->sk;
 773
 774                 acc_req = req->dl_next;
 775
 776                 local_bh_disable();
 777                 bh_lock_sock(child);
 778                 BUG_TRAP(child->lock.users==0);
 779                 sock_hold(child);
 780
 781                 tcp_disconnect(child, O_NONBLOCK);
 782
 783                 sock_orphan(child);
 784
 785                 atomic_inc(&tcp_orphan_count);
 786
 787                 tcp_destroy_sock(child);
 788
 789                 bh_unlock_sock(child);
 790                 local_bh_enable();
 791                 sock_put(child);
 792
 793                 tcp_acceptq_removed(sk);
 794                 tcp_openreq_fastfree(req);
 795         }
 796         BUG_TRAP(sk->ack_backlog == 0);
 797 }
 798
 799 /*
 800  *      Wait for a socket to get into the connected state
 801  *
 802  *      Note: Must be called with the socket locked.
 803  */
 804 static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p)
 805 {
 806         struct task_struct *tsk = current;
 807         DECLARE_WAITQUEUE(wait, tsk);
 808
 809         while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
 810                 if(sk->err)
 811                         return sock_error(sk);
 812                 if((1 << sk->state) &
 813                    ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 814                         if(sk->keepopen && !(flags&MSG_NOSIGNAL))
 815                                 send_sig(SIGPIPE, tsk, 0);
 816                         return -EPIPE;
 817                 }
 818                 if(!*timeo_p)
 819                         return -EAGAIN;
 820                 if(signal_pending(tsk))
 821                         return sock_intr_errno(*timeo_p);
 822
 823                 __set_task_state(tsk, TASK_INTERRUPTIBLE);
 824                 add_wait_queue(sk->sleep, &wait);
 825                 sk->tp_pinfo.af_tcp.write_pending++;
 826
 827                 release_sock(sk);
 828                 *timeo_p = schedule_timeout(*timeo_p);
 829                 lock_sock(sk);
 830
 831                 __set_task_state(tsk, TASK_RUNNING);
 832                 remove_wait_queue(sk->sleep, &wait);
 833                 sk->tp_pinfo.af_tcp.write_pending--;
 834         }
 835         return 0;
 836 }
 837
 838 static inline int tcp_memory_free(struct sock *sk)
 839 {
 840         return sk->wmem_queued < sk->sndbuf;
 841 }
 842
 843 /*
 844  *      Wait for more memory for a socket
 845  */
 846 static long wait_for_tcp_memory(struct sock * sk, long timeo)
 847 {
 848         long vm_wait = 0;
 849         long current_timeo = timeo;
 850         DECLARE_WAITQUEUE(wait, current);
 851
 852         if (tcp_memory_free(sk))
 853                 current_timeo = vm_wait = (net_random()%(HZ/5))+2;
 854
 855         clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
 856
 857         add_wait_queue(sk->sleep, &wait);
 858         for (;;) {
 859                 set_bit(SOCK_NOSPACE, &sk->socket->flags);
 860
 861                 set_current_state(TASK_INTERRUPTIBLE);
 862
 863                 if (signal_pending(current))
 864                         break;
 865                 if (tcp_memory_free(sk) && !vm_wait)
 866                         break;
 867                 if (sk->shutdown & SEND_SHUTDOWN)
 868                         break;
 869                 if (sk->err)
 870                         break;
 871                 release_sock(sk);
 872                 if (!tcp_memory_free(sk) || vm_wait)
 873                         current_timeo = schedule_timeout(current_timeo);
 874                 lock_sock(sk);
 875                 if (vm_wait) {
 876                         if (timeo != MAX_SCHEDULE_TIMEOUT &&
 877                             (timeo -= vm_wait-current_timeo) < 0)
 878                                 timeo = 0;
 879                         break;
 880                 } else {
 881                         timeo = current_timeo;
 882                 }
 883         }
 884         current->state = TASK_RUNNING;
 885         remove_wait_queue(sk->sleep, &wait);
 886         return timeo;
 887 }
 888
 889 /* When all user supplied data has been queued set the PSH bit */
 890 #define PSH_NEEDED (seglen == 0 && iovlen == 0)
 891
 892 /*
 893  *      This routine copies from a user buffer into a socket,
 894  *      and starts the transmit system.
 895  */
 896
 897 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size)
 898 {
 899         struct iovec *iov;
 900         struct tcp_opt *tp;
 901         struct sk_buff *skb;
 902         int iovlen, flags;
 903         int mss_now;
 904         int err, copied;
 905         long timeo;
 906
 907         err = 0;
 908         tp = &(sk->tp_pinfo.af_tcp);
 909
 910         lock_sock(sk);
 911         TCP_CHECK_TIMER(sk);
 912
 913         flags = msg->msg_flags;
 914
 915         timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
 916
 917         /* Wait for a connection to finish. */
 918         if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 919                 if((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
 920                         goto out_unlock;
 921
 922         /* This should be in poll */
 923         clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
 924
 925         mss_now = tcp_current_mss(sk);
 926
 927         /* Ok commence sending. */
 928         iovlen = msg->msg_iovlen;
 929         iov = msg->msg_iov;
 930         copied = 0;
 931
 932         while(--iovlen >= 0) {
 933                 int seglen=iov->iov_len;
 934                 unsigned char * from=iov->iov_base;
 935
 936                 iov++;
 937
 938                 while(seglen > 0) {
 939                         int copy, tmp, queue_it;
 940
 941                         if (err)
 942                                 goto do_fault2;
 943
 944                         /* Stop on errors. */
 945                         if (sk->err)
 946                                 goto do_sock_err;
 947
 948                         /* Make sure that we are established. */
 949                         if (sk->shutdown & SEND_SHUTDOWN)
 950                                 goto do_shutdown;
 951
 952                         /* Now we need to check if we have a half
 953                          * built packet we can tack some data onto.
 954                          */
 955                         if (tp->send_head && !(flags & MSG_OOB)) {
 956                                 skb = sk->write_queue.prev;
 957                                 copy = skb->len;
 958                                 /* If the remote does SWS avoidance we should
 959                                  * queue the best we can if not we should in
 960                                  * fact send multiple packets...
 961                                  * A method for detecting this would be most
 962                                  * welcome.
 963                                  */
 964                                 if (skb_tailroom(skb) > 0 &&
 965                                     (mss_now - copy) > 0) {
 966                                         int last_byte_was_odd = (copy % 4);
 967
 968                                         copy = mss_now - copy;
 969                                         if(copy > skb_tailroom(skb))
 970                                                 copy = skb_tailroom(skb);
 971                                         if(copy > seglen)
 972                                                 copy = seglen;
 973                                         if(last_byte_was_odd) {
 974                                                 if(copy_from_user(skb_put(skb, copy),
 975                                                                   from, copy))
 976                                                         err = -EFAULT;
 977                                                 skb->csum = csum_partial(skb->data,
 978                                                                          skb->len, 0);
 979                                         } else {
 980                                                 skb->csum =
 981                                                         csum_and_copy_from_user(
 982                                                         from, skb_put(skb, copy),
 983                                                         copy, skb->csum, &err);
 984                                         }
 985                                         /*
 986                                          * FIXME: the *_user functions should
 987                                          *        return how much data was
 988                                          *        copied before the fault
 989                                          *        occurred and then a partial
 990                                          *        packet with this data should
 991                                          *        be sent.  Unfortunately
 992                                          *        csum_and_copy_from_user doesn't
 993                                          *        return this information.
 994                                          *        ATM it might send partly zeroed
 995                                          *        data in this case.
 996                                          */
 997                                         tp->write_seq += copy;
 998                                         TCP_SKB_CB(skb)->end_seq += copy;
 999                                         from += copy;
1000                                         copied += copy;
1001                                         seglen -= copy;
1002                                         if (PSH_NEEDED ||
1003                                             after(tp->write_seq, tp->pushed_seq+(tp->max_window>>1))) {
1004                                                 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1005                                                 tp->pushed_seq = tp->write_seq;
1006                                         }
1007                                         continue;
1008                                 }
1009                         }
1010
1011                         copy = min(seglen, mss_now);
1012
1013                         /* Determine how large of a buffer to allocate.  */
1014                         tmp = MAX_TCP_HEADER + 15 + tp->mss_cache;
1015                         if (copy < mss_now && !(flags & MSG_OOB)) {
1016                                 /* What is happening here is that we want to
1017                                  * tack on later members of the users iovec
1018                                  * if possible into a single frame.  When we
1019                                  * leave this loop our we check to see if
1020                                  * we can send queued frames onto the wire.
1021                                  */
1022                                 queue_it = 1;
1023                         } else {
1024                                 queue_it = 0;
1025                         }
1026
1027                         skb = NULL;
1028                         if (tcp_memory_free(sk))
1029                                 skb = tcp_alloc_skb(sk, tmp, GFP_KERNEL);
1030                         if (skb == NULL) {
1031                                 /* If we didn't get any memory, we need to sleep. */
1032                                 set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
1033                                 set_bit(SOCK_NOSPACE, &sk->socket->flags);
1034
1035                                 if (!timeo) {
1036                                         err = -EAGAIN;
1037                                         goto do_interrupted;
1038                                 }
1039                                 if (signal_pending(current)) {
1040                                         err = sock_intr_errno(timeo);
1041                                         goto do_interrupted;
1042                                 }
1043                                 __tcp_push_pending_frames(sk, tp, mss_now);
1044                                 timeo = wait_for_tcp_memory(sk, timeo);
1045
1046                                 /* If SACK's were formed or PMTU events happened,
1047                                  * we must find out about it.
1048                                  */
1049                                 mss_now = tcp_current_mss(sk);
1050                                 continue;
1051                         }
1052
1053                         seglen -= copy;
1054
1055                         /* Prepare control bits for TCP header creation engine. */
1056                         TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
1057                         if (PSH_NEEDED ||
1058                             after(tp->write_seq+copy, tp->pushed_seq+(tp->max_window>>1))) {
1059                                 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK|TCPCB_FLAG_PSH;
1060                                 tp->pushed_seq = tp->write_seq + copy;
1061                         } else {
1062                                 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
1063                         }
1064                         TCP_SKB_CB(skb)->sacked = 0;
1065                         if (flags & MSG_OOB) {
1066                                 /* Funny. 8) This makes URG fully meaningless.
1067                                  * Well, OK. It does not contradict to anything yet. */
1068                                 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_URG;
1069                                 TCP_SKB_CB(skb)->urg_ptr = copy;
1070                         } else
1071                                 TCP_SKB_CB(skb)->urg_ptr = 0;
1072
1073                         /* TCP data bytes are SKB_PUT() on top, later
1074                          * TCP+IP+DEV headers are SKB_PUSH()'d beneath.
1075                          * Reserve header space and checksum the data.
1076                          */
1077                         skb_reserve(skb, MAX_TCP_HEADER);
1078                         skb->csum = csum_and_copy_from_user(from,
1079                                         skb_put(skb, copy), copy, 0, &err);
1080
1081                         if (err)
1082                                 goto do_fault;
1083
1084                         from += copy;
1085                         copied += copy;
1086
1087                         TCP_SKB_CB(skb)->seq = tp->write_seq;
1088                         TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy;
1089
1090                         /* This advances tp->write_seq for us. */
1091                         tcp_send_skb(sk, skb, queue_it, mss_now);
1092                 }
1093         }
1094         err = copied;
1095 out:
1096         __tcp_push_pending_frames(sk, tp, mss_now);
1097         TCP_CHECK_TIMER(sk);
1098 out_unlock:
1099         release_sock(sk);
1100         return err;
1101
1102 do_sock_err:
1103         if(copied)
1104                 err = copied;
1105         else
1106                 err = sock_error(sk);
1107         goto out;
1108 do_shutdown:
1109         if(copied)
1110                 err = copied;
1111         else {
1112                 if (!(flags&MSG_NOSIGNAL))
1113                         send_sig(SIGPIPE, current, 0);
1114                 err = -EPIPE;
1115         }
1116         goto out;
1117 do_interrupted:
1118         if(copied)
1119                 err = copied;
1120         goto out;
1121 do_fault:
1122         __kfree_skb(skb);
1123 do_fault2:
1124         err = -EFAULT;
1125         goto out;
1126 }
1127
1128 #undef PSH_NEEDED
1129
1130 /*
1131  *      Handle reading urgent data. BSD has very simple semantics for
1132  *      this, no blocking and very strange errors 8)
1133  */
1134
1135 static int tcp_recv_urg(struct sock * sk, long timeo,
1136                         struct msghdr *msg, int len, int flags,
1137                         int *addr_len)
1138 {
1139         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1140
1141         /* No URG data to read. */
1142         if (sk->urginline || !tp->urg_data || tp->urg_data == TCP_URG_READ)
1143                 return -EINVAL; /* Yes this is right ! */
1144
1145         if (sk->state==TCP_CLOSE && !sk->done)
1146                 return -ENOTCONN;
1147
1148         if (tp->urg_data & TCP_URG_VALID) {
1149                 int err = 0;
1150                 char c = tp->urg_data;
1151
1152                 if (!(flags & MSG_PEEK))
1153                         tp->urg_data = TCP_URG_READ;
1154
1155                 /* Read urgent data. */
1156                 msg->msg_flags|=MSG_OOB;
1157
1158                 if(len>0) {
1159                         if (!(flags & MSG_PEEK))
1160                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1161                         len = 1;
1162                 } else
1163                         msg->msg_flags|=MSG_TRUNC;
1164
1165                 return err ? -EFAULT : len;
1166         }
1167
1168         if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN))
1169                 return 0;
1170
1171         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1172          * the available implementations agree in this case:
1173          * this call should never block, independent of the
1174          * blocking state of the socket.
1175          * Mike <pall@rz.uni-karlsruhe.de>
1176          */
1177         return -EAGAIN;
1178 }
1179
1180 /*
1181  *      Release a skb if it is no longer needed. This routine
1182  *      must be called with interrupts disabled or with the
1183  *      socket locked so that the sk_buff queue operation is ok.
1184  */
1185
1186 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
1187 {
1188         __skb_unlink(skb, &sk->receive_queue);
1189         __kfree_skb(skb);
1190 }
1191
1192 /* Clean up the receive buffer for full frames taken by the user,
1193  * then send an ACK if necessary.  COPIED is the number of bytes
1194  * tcp_recvmsg has given to the user so far, it speeds up the
1195  * calculation of whether or not we must ACK for the sake of
1196  * a window update.
1197  */
1198 static void cleanup_rbuf(struct sock *sk, int copied)
1199 {
1200         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1201         struct sk_buff *skb;
1202         int time_to_ack = 0;
1203
1204         /* NOTE! The socket must be locked, so that we don't get
1205          * a messed-up receive queue.
1206          */
1207         while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
1208                 if (!skb->used)
1209                         break;
1210                 tcp_eat_skb(sk, skb);
1211         }
1212
1213         if (tcp_ack_scheduled(tp)) {
1214                    /* Delayed ACKs frequently hit locked sockets during bulk receive. */
1215                 if (tp->ack.blocked
1216                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1217                     || tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss
1218                     /*
1219                      * If this read emptied read buffer, we send ACK, if
1220                      * connection is not bidirectional, user drained
1221                      * receive buffer and there was a small segment
1222                      * in queue.
1223                      */
1224                     || (copied > 0 &&
1225                         (tp->ack.pending&TCP_ACK_PUSHED) &&
1226                         !tp->ack.pingpong &&
1227                         atomic_read(&sk->rmem_alloc) == 0)) {
1228                         time_to_ack = 1;
1229                 }
1230         }
1231
1232         /* We send an ACK if we can now advertise a non-zero window
1233          * which has been raised "significantly".
1234          *
1235          * Even if window raised up to infinity, do not send window open ACK
1236          * in states, where we will not receive more. It is useless.
1237          */
1238         if(copied > 0 && !time_to_ack && !(sk->shutdown&RCV_SHUTDOWN)) {
1239                 __u32 rcv_window_now = tcp_receive_window(tp);
1240
1241                 /* Optimize, __tcp_select_window() is not cheap. */
1242                 if (2*rcv_window_now <= tp->window_clamp) {
1243                         __u32 new_window = __tcp_select_window(sk);
1244
1245                         /* Send ACK now, if this read freed lots of space
1246                          * in our buffer. Certainly, new_window is new window.
1247                          * We can advertise it now, if it is not less than current one.
1248                          * "Lots" means "at least twice" here.
1249                          */
1250                         if(new_window && new_window >= 2*rcv_window_now)
1251                                 time_to_ack = 1;
1252                 }
1253         }
1254         if (time_to_ack)
1255                 tcp_send_ack(sk);
1256 }
1257
1258 /* Now socket state including sk->err is changed only under lock,
1259  * hence we may omit checks after joining wait queue.
1260  * We check receive queue before schedule() only as optimization;
1261  * it is very likely that release_sock() added new data.
1262  */
1263
1264 static long tcp_data_wait(struct sock *sk, long timeo)
1265 {
1266         DECLARE_WAITQUEUE(wait, current);
1267
1268         add_wait_queue(sk->sleep, &wait);
1269
1270         __set_current_state(TASK_INTERRUPTIBLE);
1271
1272         set_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1273         release_sock(sk);
1274
1275         if (skb_queue_empty(&sk->receive_queue))
1276                 timeo = schedule_timeout(timeo);
1277
1278         lock_sock(sk);
1279         clear_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1280
1281         remove_wait_queue(sk->sleep, &wait);
1282         __set_current_state(TASK_RUNNING);
1283         return timeo;
1284 }
1285
1286 static void tcp_prequeue_process(struct sock *sk)
1287 {
1288         struct sk_buff *skb;
1289         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1290
1291         net_statistics[smp_processor_id()*2+1].TCPPrequeued += skb_queue_len(&tp->ucopy.prequeue);
1292
1293         /* RX process wants to run with disabled BHs, though it is not necessary */
1294         local_bh_disable();
1295         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1296                 sk->backlog_rcv(sk, skb);
1297         local_bh_enable();
1298
1299         /* Clear memory counter. */
1300         tp->ucopy.memory = 0;
1301 }
1302
1303 /*
1304  *      This routine copies from a sock struct into the user buffer.
1305  *
1306  *      Technical note: in 2.3 we work on _locked_ socket, so that
1307  *      tricks with *seq access order and skb->users are not required.
1308  *      Probably, code can be easily improved even more.
1309  */
1310
1311 int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
1312                 int len, int nonblock, int flags, int *addr_len)
1313 {
1314         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1315         int copied = 0;
1316         u32 peek_seq;
1317         u32 *seq;
1318         unsigned long used;
1319         int err;
1320         int target;             /* Read at least this many bytes */
1321         long timeo;
1322         struct task_struct *user_recv = NULL;
1323
1324         lock_sock(sk);
1325
1326         TCP_CHECK_TIMER(sk);
1327
1328         err = -ENOTCONN;
1329         if (sk->state == TCP_LISTEN)
1330                 goto out;
1331
1332         timeo = sock_rcvtimeo(sk, nonblock);
1333
1334         /* Urgent data needs to be handled specially. */
1335         if (flags & MSG_OOB)
1336                 goto recv_urg;
1337
1338         seq = &tp->copied_seq;
1339         if (flags & MSG_PEEK) {
1340                 peek_seq = tp->copied_seq;
1341                 seq = &peek_seq;
1342         }
1343
1344         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1345
1346         do {
1347                 struct sk_buff * skb;
1348                 u32 offset;
1349
1350                 /* Are we at urgent data? Stop if we have read anything. */
1351                 if (copied && tp->urg_data && tp->urg_seq == *seq)
1352                         break;
1353
1354                 /* We need to check signals first, to get correct SIGURG
1355                  * handling. FIXME: Need to check this doesnt impact 1003.1g
1356                  * and move it down to the bottom of the loop
1357                  */
1358                 if (signal_pending(current)) {
1359                         if (copied)
1360                                 break;
1361                         copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1362                         break;
1363                 }
1364
1365                 /* Next get a buffer. */
1366
1367                 skb = skb_peek(&sk->receive_queue);
1368                 do {
1369                         if (!skb)
1370                                 break;
1371
1372                         /* Now that we have two receive queues this
1373                          * shouldn't happen.
1374                          */
1375                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1376                                 printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
1377                                        *seq, TCP_SKB_CB(skb)->seq);
1378                                 break;
1379                         }
1380                         offset = *seq - TCP_SKB_CB(skb)->seq;
1381                         if (skb->h.th->syn)
1382                                 offset--;
1383                         if (offset < skb->len)
1384                                 goto found_ok_skb;
1385                         if (skb->h.th->fin)
1386                                 goto found_fin_ok;
1387                         if (!(flags & MSG_PEEK))
1388                                 skb->used = 1;
1389                         skb = skb->next;
1390                 } while (skb != (struct sk_buff *)&sk->receive_queue);
1391
1392                 /* Well, if we have backlog, try to process it now yet. */
1393
1394                 if (copied >= target && sk->backlog.tail == NULL)
1395                         break;
1396
1397                 if (copied) {
1398                         if (sk->err ||
1399                             sk->state == TCP_CLOSE ||
1400                             (sk->shutdown & RCV_SHUTDOWN) ||
1401                             !timeo)
1402                                 break;
1403                 } else {
1404                         if (sk->done)
1405                                 break;
1406
1407                         if (sk->err) {
1408                                 copied = sock_error(sk);
1409                                 break;
1410                         }
1411
1412                         if (sk->shutdown & RCV_SHUTDOWN)
1413                                 break;
1414
1415                         if (sk->state == TCP_CLOSE) {
1416                                 if (!sk->done) {
1417                                         /* This occurs when user tries to read
1418                                          * from never connected socket.
1419                                          */
1420                                         copied = -ENOTCONN;
1421                                         break;
1422                                 }
1423                                 break;
1424                         }
1425
1426                         if (!timeo) {
1427                                 copied = -EAGAIN;
1428                                 break;
1429                         }
1430                 }
1431
1432                 cleanup_rbuf(sk, copied);
1433
1434                 if (tp->ucopy.task == user_recv) {
1435                         /* Install new reader */
1436                         if (user_recv == NULL && !(flags&(MSG_TRUNC|MSG_PEEK))) {
1437                                 user_recv = current;
1438                                 tp->ucopy.task = user_recv;
1439                                 tp->ucopy.iov = msg->msg_iov;
1440                         }
1441
1442                         tp->ucopy.len = len;
1443
1444                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt || (flags&(MSG_PEEK|MSG_TRUNC)));
1445
1446                         /* Ugly... If prequeue is not empty, we have to
1447                          * process it before releasing socket, otherwise
1448                          * order will be broken at second iteration.
1449                          * More elegant solution is required!!!
1450                          *
1451                          * Look: we have the following (pseudo)queues:
1452                          *
1453                          * 1. packets in flight
1454                          * 2. backlog
1455                          * 3. prequeue
1456                          * 4. receive_queue
1457                          *
1458                          * Each queue can be processed only if the next ones
1459                          * are empty. At this point we have empty receive_queue.
1460                          * But prequeue _can_ be not empty after second iteration,
1461                          * when we jumped to start of loop because backlog
1462                          * processing added something to receive_queue.
1463                          * We cannot release_sock(), because backlog contains
1464                          * packets arrived _after_ prequeued ones.
1465                          *
1466                          * Shortly, algorithm is clear --- to process all
1467                          * the queues in order. We could make it more directly,
1468                          * requeueing packets from backlog to prequeue, if
1469                          * is not empty. It is more elegant, but eats cycles,
1470                          * unfortunately.
1471                          */
1472                         if (skb_queue_len(&tp->ucopy.prequeue))
1473                                 goto do_prequeue;
1474
1475                         /* __ Set realtime policy in scheduler __ */
1476                 }
1477
1478                 if (copied >= target) {
1479                         /* Do not sleep, just process backlog. */
1480                         release_sock(sk);
1481                         lock_sock(sk);
1482                 } else {
1483                         timeo = tcp_data_wait(sk, timeo);
1484                 }
1485
1486                 if (user_recv) {
1487                         int chunk;
1488
1489                         /* __ Restore normal policy in scheduler __ */
1490
1491                         if ((chunk = len - tp->ucopy.len) != 0) {
1492                                 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromBacklog += chunk;
1493                                 len -= chunk;
1494                                 copied += chunk;
1495                         }
1496
1497                         if (tp->rcv_nxt == tp->copied_seq &&
1498                             skb_queue_len(&tp->ucopy.prequeue)) {
1499 do_prequeue:
1500                                 tcp_prequeue_process(sk);
1501
1502                                 if ((chunk = len - tp->ucopy.len) != 0) {
1503                                         net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1504                                         len -= chunk;
1505                                         copied += chunk;
1506                                 }
1507                         }
1508                 }
1509                 continue;
1510
1511         found_ok_skb:
1512                 /* Ok so how much can we use? */
1513                 used = skb->len - offset;
1514                 if (len < used)
1515                         used = len;
1516
1517                 /* Do we have urgent data here? */
1518                 if (tp->urg_data) {
1519                         u32 urg_offset = tp->urg_seq - *seq;
1520                         if (urg_offset < used) {
1521                                 if (!urg_offset) {
1522                                         if (!sk->urginline) {
1523                                                 ++*seq;
1524                                                 offset++;
1525                                                 used--;
1526                                         }
1527                                 } else
1528                                         used = urg_offset;
1529                         }
1530                 }
1531
1532                 err = 0;
1533                 if (!(flags&MSG_TRUNC)) {
1534                         err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used);
1535                         if (err) {
1536                                 /* Exception. Bailout! */
1537                                 if (!copied)
1538                                         copied = -EFAULT;
1539                                 break;
1540                         }
1541                 }
1542
1543                 *seq += used;
1544                 copied += used;
1545                 len -= used;
1546
1547                 if (after(tp->copied_seq,tp->urg_seq)) {
1548                         tp->urg_data = 0;
1549                         if (skb_queue_len(&tp->out_of_order_queue) == 0
1550 #ifdef TCP_FORMAL_WINDOW
1551                             && tcp_receive_window(tp)
1552 #endif
1553                             ) {
1554                                 tcp_fast_path_on(tp);
1555                         }
1556                 }
1557                 if (used + offset < skb->len)
1558                         continue;
1559
1560                 /*      Process the FIN. We may also need to handle PSH
1561                  *      here and make it break out of MSG_WAITALL.
1562                  */
1563                 if (skb->h.th->fin)
1564                         goto found_fin_ok;
1565                 if (flags & MSG_PEEK)
1566                         continue;
1567                 skb->used = 1;
1568                 tcp_eat_skb(sk, skb);
1569                 continue;
1570
1571         found_fin_ok:
1572                 ++*seq;
1573                 if (flags & MSG_PEEK)
1574                         break;
1575
1576                 /* All is done. */
1577                 skb->used = 1;
1578                 break;
1579         } while (len > 0);
1580
1581         if (user_recv) {
1582                 if (skb_queue_len(&tp->ucopy.prequeue)) {
1583                         int chunk;
1584
1585                         tp->ucopy.len = copied > 0 ? len : 0;
1586
1587                         tcp_prequeue_process(sk);
1588
1589                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1590                                 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1591                                 len -= chunk;
1592                                 copied += chunk;
1593                         }
1594                 }
1595
1596                 tp->ucopy.task = NULL;
1597                 tp->ucopy.len = 0;
1598         }
1599
1600         /* According to UNIX98, msg_name/msg_namelen are ignored
1601          * on connected socket. I was just happy when found this 8) --ANK
1602          */
1603
1604         /* Clean up data we have read: This will do ACK frames. */
1605         cleanup_rbuf(sk, copied);
1606
1607         TCP_CHECK_TIMER(sk);
1608         release_sock(sk);
1609         return copied;
1610
1611 out:
1612         TCP_CHECK_TIMER(sk);
1613         release_sock(sk);
1614         return err;
1615
1616 recv_urg:
1617         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1618         goto out;
1619 }
1620
1621 /*
1622  *      State processing on a close. This implements the state shift for
1623  *      sending our FIN frame. Note that we only send a FIN for some
1624  *      states. A shutdown() may have already sent the FIN, or we may be
1625  *      closed.
1626  */
1627
1628 static unsigned char new_state[16] = {
1629   /* current state:        new state:      action:      */
1630   /* (Invalid)          */ TCP_CLOSE,
1631   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1632   /* TCP_SYN_SENT       */ TCP_CLOSE,
1633   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1634   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1635   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1636   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1637   /* TCP_CLOSE          */ TCP_CLOSE,
1638   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1639   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1640   /* TCP_LISTEN         */ TCP_CLOSE,
1641   /* TCP_CLOSING        */ TCP_CLOSING,
1642 };
1643
1644 static int tcp_close_state(struct sock *sk)
1645 {
1646         int next = (int) new_state[sk->state];
1647         int ns = (next & TCP_STATE_MASK);
1648
1649         tcp_set_state(sk, ns);
1650
1651         return (next & TCP_ACTION_FIN);
1652 }
1653
1654 /*
1655  *      Shutdown the sending side of a connection. Much like close except
1656  *      that we don't receive shut down or set sk->dead.
1657  */
1658
1659 void tcp_shutdown(struct sock *sk, int how)
1660 {
1661         /*      We need to grab some memory, and put together a FIN,
1662          *      and then put it into the queue to be sent.
1663          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1664          */
1665         if (!(how & SEND_SHUTDOWN))
1666                 return;
1667
1668         /* If we've already sent a FIN, or it's a closed state, skip this. */
1669         if ((1 << sk->state) &
1670             (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
1671                 /* Clear out any half completed packets.  FIN if needed. */
1672                 if (tcp_close_state(sk))
1673                         tcp_send_fin(sk);
1674         }
1675 }
1676
1677
1678 /*
1679  *      Return 1 if we still have things to send in our buffers.
1680  */
1681
1682 static inline int closing(struct sock * sk)
1683 {
1684         return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
1685 }
1686
1687 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1688 {
1689         /* First the read buffer. */
1690         __skb_queue_purge(&sk->receive_queue);
1691
1692         /* Next, the error queue. */
1693         __skb_queue_purge(&sk->error_queue);
1694
1695         /* Next, the write queue. */
1696         BUG_TRAP(skb_queue_empty(&sk->write_queue));
1697
1698         /* Account for returned memory. */
1699         tcp_mem_reclaim(sk);
1700
1701         BUG_TRAP(sk->wmem_queued == 0);
1702         BUG_TRAP(sk->forward_alloc == 0);
1703
1704         /* It is _impossible_ for the backlog to contain anything
1705          * when we get here.  All user references to this socket
1706          * have gone away, only the net layer knows can touch it.
1707          */
1708 }
1709
1710 /*
1711  * At this point, there should be no process reference to this
1712  * socket, and thus no user references at all.  Therefore we
1713  * can assume the socket waitqueue is inactive and nobody will
1714  * try to jump onto it.
1715  */
1716 void tcp_destroy_sock(struct sock *sk)
1717 {
1718         BUG_TRAP(sk->state==TCP_CLOSE);
1719         BUG_TRAP(sk->dead);
1720
1721         /* It cannot be in hash table! */
1722         BUG_TRAP(sk->pprev==NULL);
1723
1724         /* It it has not 0 sk->num, it must be bound */
1725         BUG_TRAP(!sk->num || sk->prev!=NULL);
1726
1727 #ifdef TCP_DEBUG
1728         if (sk->zapped) {
1729                 printk("TCP: double destroy sk=%p\n", sk);
1730                 sock_hold(sk);
1731         }
1732         sk->zapped = 1;
1733 #endif
1734
1735         sk->prot->destroy(sk);
1736
1737         tcp_kill_sk_queues(sk);
1738
1739 #ifdef INET_REFCNT_DEBUG
1740         if (atomic_read(&sk->refcnt) != 1) {
1741                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt));
1742         }
1743 #endif
1744
1745         atomic_dec(&tcp_orphan_count);
1746         sock_put(sk);
1747 }
1748
1749 void tcp_close(struct sock *sk, long timeout)
1750 {
1751         struct sk_buff *skb;
1752         int data_was_unread = 0;
1753
1754         lock_sock(sk);
1755         sk->shutdown = SHUTDOWN_MASK;
1756
1757         if(sk->state == TCP_LISTEN) {
1758                 tcp_set_state(sk, TCP_CLOSE);
1759
1760                 /* Special case. */
1761                 tcp_listen_stop(sk);
1762
1763                 goto adjudge_to_death;
1764         }
1765
1766         /*  We need to flush the recv. buffs.  We do this only on the
1767          *  descriptor close, not protocol-sourced closes, because the
1768          *  reader process may not have drained the data yet!
1769          */
1770         while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
1771                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin;
1772                 data_was_unread += len;
1773                 __kfree_skb(skb);
1774         }
1775
1776         tcp_mem_reclaim(sk);
1777
1778         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1779          * 3.10, we send a RST here because data was lost.  To
1780          * witness the awful effects of the old behavior of always
1781          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1782          * a bulk GET in an FTP client, suspend the process, wait
1783          * for the client to advertise a zero window, then kill -9
1784          * the FTP client, wheee...  Note: timeout is always zero
1785          * in such a case.
1786          */
1787         if(data_was_unread != 0) {
1788                 /* Unread data was tossed, zap the connection. */
1789                 NET_INC_STATS_USER(TCPAbortOnClose);
1790                 tcp_set_state(sk, TCP_CLOSE);
1791                 tcp_send_active_reset(sk, GFP_KERNEL);
1792         } else if (sk->linger && sk->lingertime==0) {
1793                 /* Check zero linger _after_ checking for unread data. */
1794                 sk->prot->disconnect(sk, 0);
1795                 NET_INC_STATS_USER(TCPAbortOnData);
1796         } else if (tcp_close_state(sk)) {
1797                 /* We FIN if the application ate all the data before
1798                  * zapping the connection.
1799                  */
1800
1801                 /* RED-PEN. Formally speaking, we have broken TCP state
1802                  * machine. State transitions:
1803                  *
1804                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1805                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1806                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1807                  *
1808                  * are legal only when FIN has been sent (i.e. in window),
1809                  * rather than queued out of window. Purists blame.
1810                  *
1811                  * F.e. "RFC state" is ESTABLISHED,
1812                  * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1813                  *
1814                  * The visible declinations are that sometimes
1815                  * we enter time-wait state, when it is not required really
1816                  * (harmless), do not send active resets, when they are
1817                  * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1818                  * they look as CLOSING or LAST_ACK for Linux)
1819                  * Probably, I missed some more holelets.
1820                  *                                              --ANK
1821                  */
1822                 tcp_send_fin(sk);
1823         }
1824
1825         if (timeout) {
1826                 struct task_struct *tsk = current;
1827                 DECLARE_WAITQUEUE(wait, current);
1828
1829                 add_wait_queue(sk->sleep, &wait);
1830
1831                 do {
1832                         set_current_state(TASK_INTERRUPTIBLE);
1833                         if (!closing(sk))
1834                                 break;
1835                         release_sock(sk);
1836                         timeout = schedule_timeout(timeout);
1837                         lock_sock(sk);
1838                 } while (!signal_pending(tsk) && timeout);
1839
1840                 tsk->state = TASK_RUNNING;
1841                 remove_wait_queue(sk->sleep, &wait);
1842         }
1843
1844 adjudge_to_death:
1845         /* It is the last release_sock in its life. It will remove backlog. */
1846         release_sock(sk);
1847
1848
1849         /* Now socket is owned by kernel and we acquire BH lock
1850            to finish close. No need to check for user refs.
1851          */
1852         local_bh_disable();
1853         bh_lock_sock(sk);
1854         BUG_TRAP(sk->lock.users==0);
1855
1856         sock_hold(sk);
1857         sock_orphan(sk);
1858
1859         /*      This is a (useful) BSD violating of the RFC. There is a
1860          *      problem with TCP as specified in that the other end could
1861          *      keep a socket open forever with no application left this end.
1862          *      We use a 3 minute timeout (about the same as BSD) then kill
1863          *      our end. If they send after that then tough - BUT: long enough
1864          *      that we won't make the old 4*rto = almost no time - whoops
1865          *      reset mistake.
1866          *
1867          *      Nope, it was not mistake. It is really desired behaviour
1868          *      f.e. on http servers, when such sockets are useless, but
1869          *      consume significant resources. Let's do it with special
1870          *      linger2 option.                                 --ANK
1871          */
1872
1873         if (sk->state == TCP_FIN_WAIT2) {
1874                 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1875                 if (tp->linger2 < 0) {
1876                         tcp_set_state(sk, TCP_CLOSE);
1877                         tcp_send_active_reset(sk, GFP_ATOMIC);
1878                         NET_INC_STATS_BH(TCPAbortOnLinger);
1879                 } else {
1880                         int tmo = tcp_fin_time(tp);
1881
1882                         if (tmo > TCP_TIMEWAIT_LEN) {
1883                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1884                         } else {
1885                                 atomic_inc(&tcp_orphan_count);
1886                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1887                                 goto out;
1888                         }
1889                 }
1890         }
1891         if (sk->state != TCP_CLOSE) {
1892                 tcp_mem_reclaim(sk);
1893                 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1894                     (sk->wmem_queued > SOCK_MIN_SNDBUF &&
1895                      atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1896                         if (net_ratelimit())
1897                                 printk(KERN_INFO "TCP: too many of orphaned sockets\n");
1898                         tcp_set_state(sk, TCP_CLOSE);
1899                         tcp_send_active_reset(sk, GFP_ATOMIC);
1900                         NET_INC_STATS_BH(TCPAbortOnMemory);
1901                 }
1902         }
1903         atomic_inc(&tcp_orphan_count);
1904
1905         if (sk->state == TCP_CLOSE)
1906                 tcp_destroy_sock(sk);
1907         /* Otherwise, socket is reprieved until protocol close. */
1908
1909 out:
1910         bh_unlock_sock(sk);
1911         local_bh_enable();
1912         sock_put(sk);
1913 }
1914
1915 /* These states need RST on ABORT according to RFC793 */
1916
1917 extern __inline__ int tcp_need_reset(int state)
1918 {
1919         return ((1 << state) &
1920                 (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
1921                  TCPF_FIN_WAIT2|TCPF_SYN_RECV));
1922 }
1923
1924 int tcp_disconnect(struct sock *sk, int flags)
1925 {
1926         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1927         int old_state;
1928         int err = 0;
1929
1930         old_state = sk->state;
1931         if (old_state != TCP_CLOSE)
1932                 tcp_set_state(sk, TCP_CLOSE);
1933
1934         /* ABORT function of RFC793 */
1935         if (old_state == TCP_LISTEN) {
1936                 tcp_listen_stop(sk);
1937         } else if (tcp_need_reset(old_state) ||
1938                    (tp->snd_nxt != tp->write_seq &&
1939                     (1<<old_state)&(TCPF_CLOSING|TCPF_LAST_ACK))) {
1940                 /* The last check adjusts for discrepance of Linux wrt. RFC
1941                  * states
1942                  */
1943                 tcp_send_active_reset(sk, gfp_any());
1944                 sk->err = ECONNRESET;
1945         } else if (old_state == TCP_SYN_SENT)
1946                 sk->err = ECONNRESET;
1947
1948         tcp_clear_xmit_timers(sk);
1949         __skb_queue_purge(&sk->receive_queue);
1950         tcp_writequeue_purge(sk);
1951         __skb_queue_purge(&tp->out_of_order_queue);
1952
1953         sk->dport = 0;
1954
1955         sk->rcv_saddr = 0;
1956         sk->saddr = 0;
1957 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1958         memset(&sk->net_pinfo.af_inet6.saddr, 0, 16);
1959         memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16);
1960 #endif
1961
1962         sk->shutdown = 0;
1963         sk->done = 0;
1964         tp->srtt = 0;
1965         if ((tp->write_seq += tp->max_window+2) == 0)
1966                 tp->write_seq = 1;
1967         tp->backoff = 0;
1968         tp->snd_cwnd = 2;
1969         tp->probes_out = 0;
1970         tp->packets_out = 0;
1971         tp->snd_ssthresh = 0x7fffffff;
1972         tp->snd_cwnd_cnt = 0;
1973         tp->ca_state = TCP_CA_Open;
1974         tcp_clear_retrans(tp);
1975         tcp_delack_init(tp);
1976         tp->send_head = NULL;
1977         tp->saw_tstamp = 0;
1978         tcp_sack_reset(tp);
1979         __sk_dst_reset(sk);
1980
1981         BUG_TRAP(!sk->num || sk->prev);
1982
1983         sk->error_report(sk);
1984         return err;
1985 }
1986
1987 /*
1988  *      Wait for an incoming connection, avoid race
1989  *      conditions. This must be called with the socket locked.
1990  */
1991 static int wait_for_connect(struct sock * sk, long timeo)
1992 {
1993         DECLARE_WAITQUEUE(wait, current);
1994         int err;
1995
1996         /*
1997          * True wake-one mechanism for incoming connections: only
1998          * one process gets woken up, not the 'whole herd'.
1999          * Since we do not 'race & poll' for established sockets
2000          * anymore, the common case will execute the loop only once.
2001          *
2002          * Subtle issue: "add_wait_queue_exclusive()" will be added
2003          * after any current non-exclusive waiters, and we know that
2004          * it will always _stay_ after any new non-exclusive waiters
2005          * because all non-exclusive waiters are added at the
2006          * beginning of the wait-queue. As such, it's ok to "drop"
2007          * our exclusiveness temporarily when we get woken up without
2008          * having to remove and re-insert us on the wait queue.
2009          */
2010         add_wait_queue_exclusive(sk->sleep, &wait);
2011         for (;;) {
2012                 current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE;
2013                 release_sock(sk);
2014                 if (sk->tp_pinfo.af_tcp.accept_queue == NULL)
2015                         timeo = schedule_timeout(timeo);
2016                 lock_sock(sk);
2017                 err = 0;
2018                 if (sk->tp_pinfo.af_tcp.accept_queue)
2019                         break;
2020                 err = -EINVAL;
2021                 if (sk->state != TCP_LISTEN)
2022                         break;
2023                 err = sock_intr_errno(timeo);
2024                 if (signal_pending(current))
2025                         break;
2026                 err = -EAGAIN;
2027                 if (!timeo)
2028                         break;
2029         }
2030         current->state = TASK_RUNNING;
2031         remove_wait_queue(sk->sleep, &wait);
2032         return err;
2033 }
2034
2035 /*
2036  *      This will accept the next outstanding connection.
2037  */
2038
2039 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2040 {
2041         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2042         struct open_request *req;
2043         struct sock *newsk;
2044         int error;
2045
2046         lock_sock(sk);
2047
2048         /* We need to make sure that this socket is listening,
2049          * and that it has something pending.
2050          */
2051         error = -EINVAL;
2052         if (sk->state != TCP_LISTEN)
2053                 goto out;
2054
2055         /* Find already established connection */
2056         if (!tp->accept_queue) {
2057                 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2058
2059                 /* If this is a non blocking socket don't sleep */
2060                 error = -EAGAIN;
2061                 if (!timeo)
2062                         goto out;
2063
2064                 error = wait_for_connect(sk, timeo);
2065                 if (error)
2066                         goto out;
2067         }
2068
2069         req = tp->accept_queue;
2070         if ((tp->accept_queue = req->dl_next) == NULL)
2071                 tp->accept_queue_tail = NULL;
2072
2073         newsk = req->sk;
2074         tcp_acceptq_removed(sk);
2075         tcp_openreq_fastfree(req);
2076         BUG_TRAP(newsk->state != TCP_SYN_RECV);
2077         release_sock(sk);
2078         return newsk;
2079
2080 out:
2081         release_sock(sk);
2082         *err = error;
2083         return NULL;
2084 }
2085
2086 /*
2087  *      Socket option code for TCP.
2088  */
2089
2090 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
2091                    int optlen)
2092 {
2093         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2094         int val;
2095         int err = 0;
2096
2097         if (level != SOL_TCP)
2098                 return tp->af_specific->setsockopt(sk, level, optname,
2099                                                    optval, optlen);
2100
2101         if(optlen<sizeof(int))
2102                 return -EINVAL;
2103
2104         if (get_user(val, (int *)optval))
2105                 return -EFAULT;
2106
2107         lock_sock(sk);
2108
2109         switch(optname) {
2110         case TCP_MAXSEG:
2111                 /* values greater than interface MTU won't take effect.  however at
2112                  * the point when this call is done we typically don't yet know
2113                  * which interface is going to be used
2114                  */
2115                 if(val < 8 || val > MAX_TCP_WINDOW) {
2116                         err = -EINVAL;
2117                         break;
2118                 }
2119                 tp->user_mss = val;
2120                 break;
2121
2122         case TCP_NODELAY:
2123                 /* You cannot try to use this and TCP_CORK in
2124                  * tandem, so let the user know.
2125                  */
2126                 if (tp->nonagle == 2) {
2127                         err = -EINVAL;
2128                         break;
2129                 }
2130                 tp->nonagle = (val == 0) ? 0 : 1;
2131                 if (val)
2132                         tcp_push_pending_frames(sk, tp);
2133                 break;
2134
2135         case TCP_CORK:
2136                 /* When set indicates to always queue non-full frames.
2137                  * Later the user clears this option and we transmit
2138                  * any pending partial frames in the queue.  This is
2139                  * meant to be used alongside sendfile() to get properly
2140                  * filled frames when the user (for example) must write
2141                  * out headers with a write() call first and then use
2142                  * sendfile to send out the data parts.
2143                  *
2144                  * You cannot try to use TCP_NODELAY and this mechanism
2145                  * at the same time, so let the user know.
2146                  */
2147                 if (tp->nonagle == 1) {
2148                         err = -EINVAL;
2149                         break;
2150                 }
2151                 if (val != 0) {
2152                         tp->nonagle = 2;
2153                 } else {
2154                         tp->nonagle = 0;
2155
2156                         tcp_push_pending_frames(sk, tp);
2157                 }
2158                 break;
2159
2160         case TCP_KEEPIDLE:
2161                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2162                         err = -EINVAL;
2163                 else {
2164                         tp->keepalive_time = val * HZ;
2165                         if (sk->keepopen && !((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))) {
2166                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2167                                 if (tp->keepalive_time > elapsed)
2168                                         elapsed = tp->keepalive_time - elapsed;
2169                                 else
2170                                         elapsed = 0;
2171                                 tcp_reset_keepalive_timer(sk, elapsed);
2172                         }
2173                 }
2174                 break;
2175         case TCP_KEEPINTVL:
2176                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2177                         err = -EINVAL;
2178                 else
2179                         tp->keepalive_intvl = val * HZ;
2180                 break;
2181         case TCP_KEEPCNT:
2182                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2183                         err = -EINVAL;
2184                 else
2185                         tp->keepalive_probes = val;
2186                 break;
2187         case TCP_SYNCNT:
2188                 if (val < 1 || val > MAX_TCP_SYNCNT)
2189                         err = -EINVAL;
2190                 else
2191                         tp->syn_retries = val;
2192                 break;
2193
2194         case TCP_LINGER2:
2195                 if (val < 0)
2196                         tp->linger2 = -1;
2197                 else if (val > sysctl_tcp_fin_timeout/HZ)
2198                         tp->linger2 = 0;
2199                 else
2200                         tp->linger2 = val*HZ;
2201                 break;
2202
2203         case TCP_DEFER_ACCEPT:
2204                 tp->defer_accept = 0;
2205                 if (val > 0) {
2206                         /* Translate value in seconds to number of retransmits */
2207                         while (val > ((TCP_TIMEOUT_INIT/HZ)<<tp->defer_accept))
2208                                 tp->defer_accept++;
2209                         tp->defer_accept++;
2210                 }
2211                 break;
2212
2213         case TCP_WINDOW_CLAMP:
2214                 if (val==0) {
2215                         if (sk->state != TCP_CLOSE) {
2216                                 err = -EINVAL;
2217                                 break;
2218                         }
2219                         tp->window_clamp = 0;
2220                 } else {
2221                         tp->window_clamp = val<SOCK_MIN_RCVBUF/2 ?
2222                                 SOCK_MIN_RCVBUF/2 : val;
2223                 }
2224                 break;
2225
2226         default:
2227                 err = -ENOPROTOOPT;
2228                 break;
2229         };
2230         release_sock(sk);
2231         return err;
2232 }
2233
2234 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
2235                    int *optlen)
2236 {
2237         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2238         int val, len;
2239
2240         if(level != SOL_TCP)
2241                 return tp->af_specific->getsockopt(sk, level, optname,
2242                                                    optval, optlen);
2243
2244         if(get_user(len,optlen))
2245                 return -EFAULT;
2246
2247         len = min(len, sizeof(int));
2248
2249         switch(optname) {
2250         case TCP_MAXSEG:
2251                 val = tp->mss_cache;
2252                 if (val == 0 && ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN)))
2253                         val = tp->user_mss;
2254                 break;
2255         case TCP_NODELAY:
2256                 val = (tp->nonagle == 1);
2257                 break;
2258         case TCP_CORK:
2259                 val = (tp->nonagle == 2);
2260                 break;
2261         case TCP_KEEPIDLE:
2262                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time)/HZ;
2263                 break;
2264         case TCP_KEEPINTVL:
2265                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl)/HZ;
2266                 break;
2267         case TCP_KEEPCNT:
2268                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2269                 break;
2270         case TCP_SYNCNT:
2271                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2272                 break;
2273         case TCP_LINGER2:
2274                 val = tp->linger2;
2275                 if (val > 0)
2276                         val = (val ? : sysctl_tcp_fin_timeout)/HZ;
2277                 break;
2278         case TCP_DEFER_ACCEPT:
2279                 val = tp->defer_accept == 0 ? 0 : (TCP_TIMEOUT_INIT<<(tp->defer_accept-1));
2280                 break;
2281         case TCP_WINDOW_CLAMP:
2282                 val = tp->window_clamp;
2283                 break;
2284         default:
2285                 return -ENOPROTOOPT;
2286         };
2287
2288         if(put_user(len, optlen))
2289                 return -EFAULT;
2290         if(copy_to_user(optval, &val,len))
2291                 return -EFAULT;
2292         return 0;
2293 }
2294
2295
2296 extern void __skb_cb_too_small_for_tcp(int, int);
2297
2298 void __init tcp_init(void)
2299 {
2300         struct sk_buff *skb = NULL;
2301         unsigned long goal;
2302         int order, i;
2303
2304         if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2305                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2306                                            sizeof(skb->cb));
2307
2308         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2309                                                    sizeof(struct open_request),
2310                                                0, SLAB_HWCACHE_ALIGN,
2311                                                NULL, NULL);
2312         if(!tcp_openreq_cachep)
2313                 panic("tcp_init: Cannot alloc open_request cache.");
2314
2315         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2316                                               sizeof(struct tcp_bind_bucket),
2317                                               0, SLAB_HWCACHE_ALIGN,
2318                                               NULL, NULL);
2319         if(!tcp_bucket_cachep)
2320                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2321
2322         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2323                                                 sizeof(struct tcp_tw_bucket),
2324                                                 0, SLAB_HWCACHE_ALIGN,
2325                                                 NULL, NULL);
2326         if(!tcp_timewait_cachep)
2327                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2328
2329         /* Size and allocate the main established and bind bucket
2330          * hash tables.
2331          *
2332          * The methodology is similar to that of the buffer cache.
2333          */
2334         goal = num_physpages >> (23 - PAGE_SHIFT);
2335
2336         for(order = 0; (1UL << order) < goal; order++)
2337                 ;
2338         do {
2339                 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2340                         sizeof(struct tcp_ehash_bucket);
2341                 tcp_ehash_size >>= 1;
2342                 while (tcp_ehash_size & (tcp_ehash_size-1))
2343                         tcp_ehash_size--;
2344                 tcp_ehash = (struct tcp_ehash_bucket *)
2345                         __get_free_pages(GFP_ATOMIC, order);
2346         } while (tcp_ehash == NULL && --order > 0);
2347
2348         if (!tcp_ehash)
2349                 panic("Failed to allocate TCP established hash table\n");
2350         for (i = 0; i < (tcp_ehash_size<<1); i++) {
2351                 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2352                 tcp_ehash[i].chain = NULL;
2353         }
2354
2355         do {
2356                 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2357                         sizeof(struct tcp_bind_hashbucket);
2358                 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2359                         continue;
2360                 tcp_bhash = (struct tcp_bind_hashbucket *)
2361                         __get_free_pages(GFP_ATOMIC, order);
2362         } while (tcp_bhash == NULL && --order >= 0);
2363
2364         if (!tcp_bhash)
2365                 panic("Failed to allocate TCP bind hash table\n");
2366         for (i = 0; i < tcp_bhash_size; i++) {
2367                 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2368                 tcp_bhash[i].chain = NULL;
2369         }
2370
2371         /* Try to be a bit smarter and adjust defaults depending
2372          * on available memory.
2373          */
2374         if (order > 4) {
2375                 sysctl_local_port_range[0] = 32768;
2376                 sysctl_local_port_range[1] = 61000;
2377                 sysctl_tcp_max_tw_buckets = 180000;
2378                 sysctl_tcp_max_orphans = 4096<<(order-4);
2379                 sysctl_max_syn_backlog = 1024;
2380         } else if (order < 3) {
2381                 sysctl_local_port_range[0] = 1024*(3-order);
2382                 sysctl_tcp_max_tw_buckets >>= (3-order);
2383                 sysctl_tcp_max_orphans >>= (3-order);
2384                 sysctl_max_syn_backlog = 128;
2385         }
2386         tcp_port_rover = sysctl_local_port_range[0] - 1;
2387
2388         sysctl_tcp_mem[0] = 64<<order;
2389         sysctl_tcp_mem[1] = 200<<order;
2390         sysctl_tcp_mem[2] = 256<<order;
2391         if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 512)
2392                 sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 512;
2393         if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 512)
2394                 sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 512;
2395
2396         if (order < 3) {
2397                 sysctl_tcp_wmem[2] = 64*1024;
2398                 sysctl_tcp_rmem[0] = PAGE_SIZE;
2399                 sysctl_tcp_rmem[1] = 43689;
2400                 sysctl_tcp_rmem[2] = 2*43689;
2401         }
2402
2403         printk("TCP: Hash tables configured (established %d bind %d)\n",
2404                tcp_ehash_size<<1, tcp_bhash_size);
2405 }