[davej-history.git] net/ipv4/tcp.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol (TCP).
8 * Version: $Id: tcp.c,v 1.180 2000/11/28 17:04:09 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
22 * Fixes:
23 * Alan Cox : Numerous verify_area() calls
24 * Alan Cox : Set the ACK bit on a reset
25 * Alan Cox : Stopped it crashing if it closed while
26 * sk->inuse=1 and was trying to connect
27 * (tcp_err()).
28 * Alan Cox : All icmp error handling was broken
29 * pointers passed where wrong and the
30 * socket was looked up backwards. Nobody
31 * tested any icmp error code obviously.
32 * Alan Cox : tcp_err() now handled properly. It
33 * wakes people on errors. poll
34 * behaves and the icmp error race
35 * has gone by moving it into sock.c
36 * Alan Cox : tcp_send_reset() fixed to work for
37 * everything not just packets for
38 * unknown sockets.
39 * Alan Cox : tcp option processing.
40 * Alan Cox : Reset tweaked (still not 100%) [Had
41 * syn rule wrong]
42 * Herp Rosmanith : More reset fixes
43 * Alan Cox : No longer acks invalid rst frames.
44 * Acking any kind of RST is right out.
45 * Alan Cox : Sets an ignore me flag on an rst
46 * receive otherwise odd bits of prattle
47 * escape still
48 * Alan Cox : Fixed another acking RST frame bug.
49 * Should stop LAN workplace lockups.
50 * Alan Cox : Some tidyups using the new skb list
51 * facilities
52 * Alan Cox : sk->keepopen now seems to work
53 * Alan Cox : Pulls options out correctly on accepts
54 * Alan Cox : Fixed assorted sk->rqueue->next errors
55 * Alan Cox : PSH doesn't end a TCP read. Switched a
56 * bit to skb ops.
57 * Alan Cox : Tidied tcp_data to avoid a potential
58 * nasty.
59 * Alan Cox : Added some better commenting, as the
60 * tcp is hard to follow
61 * Alan Cox : Removed incorrect check for 20 * psh
62 * Michael O'Reilly : ack < copied bug fix.
63 * Johannes Stille : Misc tcp fixes (not all in yet).
64 * Alan Cox : FIN with no memory -> CRASH
65 * Alan Cox : Added socket option proto entries.
66 * Also added awareness of them to accept.
67 * Alan Cox : Added TCP options (SOL_TCP)
68 * Alan Cox : Switched wakeup calls to callbacks,
69 * so the kernel can layer network
70 * sockets.
71 * Alan Cox : Use ip_tos/ip_ttl settings.
72 * Alan Cox : Handle FIN (more) properly (we hope).
73 * Alan Cox : RST frames sent on unsynchronised
74 * state ack error.
75 * Alan Cox : Put in missing check for SYN bit.
76 * Alan Cox : Added tcp_select_window() aka NET2E
77 * window non shrink trick.
78 * Alan Cox : Added a couple of small NET2E timer
79 * fixes
80 * Charles Hedrick : TCP fixes
81 * Toomas Tamm : TCP window fixes
82 * Alan Cox : Small URG fix to rlogin ^C ack fight
83 * Charles Hedrick : Rewrote most of it to actually work
84 * Linus : Rewrote tcp_read() and URG handling
85 * completely
86 * Gerhard Koerting: Fixed some missing timer handling
87 * Matthew Dillon : Reworked TCP machine states as per RFC
88 * Gerhard Koerting: PC/TCP workarounds
89 * Adam Caldwell : Assorted timer/timing errors
90 * Matthew Dillon : Fixed another RST bug
91 * Alan Cox : Move to kernel side addressing changes.
92 * Alan Cox : Beginning work on TCP fastpathing
93 * (not yet usable)
94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
95 * Alan Cox : TCP fast path debugging
96 * Alan Cox : Window clamping
97 * Michael Riepe : Bug in tcp_check()
98 * Matt Dillon : More TCP improvements and RST bug fixes
99 * Matt Dillon : Yet more small nasties remove from the
100 * TCP code (Be very nice to this man if
101 * tcp finally works 100%) 8)
102 * Alan Cox : BSD accept semantics.
103 * Alan Cox : Reset on closedown bug.
104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
105 * Michael Pall : Handle poll() after URG properly in
106 * all cases.
107 * Michael Pall : Undo the last fix in tcp_read_urg()
108 * (multi URG PUSH broke rlogin).
109 * Michael Pall : Fix the multi URG PUSH problem in
110 * tcp_readable(), poll() after URG
111 * works now.
112 * Michael Pall : recv(...,MSG_OOB) never blocks in the
113 * BSD api.
114 * Alan Cox : Changed the semantics of sk->socket to
115 * fix a race and a signal problem with
116 * accept() and async I/O.
117 * Alan Cox : Relaxed the rules on tcp_sendto().
118 * Yury Shevchuk : Really fixed accept() blocking problem.
119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
120 * clients/servers which listen in on
121 * fixed ports.
122 * Alan Cox : Cleaned the above up and shrank it to
123 * a sensible code size.
124 * Alan Cox : Self connect lockup fix.
125 * Alan Cox : No connect to multicast.
126 * Ross Biro : Close unaccepted children on master
127 * socket close.
128 * Alan Cox : Reset tracing code.
129 * Alan Cox : Spurious resets on shutdown.
130 * Alan Cox : Giant 15 minute/60 second timer error
131 * Alan Cox : Small whoops in polling before an
132 * accept.
133 * Alan Cox : Kept the state trace facility since
134 * it's handy for debugging.
135 * Alan Cox : More reset handler fixes.
136 * Alan Cox : Started rewriting the code based on
137 * the RFC's for other useful protocol
138 * references see: Comer, KA9Q NOS, and
139 * for a reference on the difference
140 * between specifications and how BSD
141 * works see the 4.4lite source.
142 * A.N.Kuznetsov : Don't time wait on completion of tidy
143 * close.
144 * Linus Torvalds : Fin/Shutdown & copied_seq changes.
145 * Linus Torvalds : Fixed BSD port reuse to work first syn
146 * Alan Cox : Reimplemented timers as per the RFC
147 * and using multiple timers for sanity.
148 * Alan Cox : Small bug fixes, and a lot of new
149 * comments.
150 * Alan Cox : Fixed dual reader crash by locking
151 * the buffers (much like datagram.c)
152 * Alan Cox : Fixed stuck sockets in probe. A probe
153 * now gets fed up of retrying without
154 * (even a no space) answer.
155 * Alan Cox : Extracted closing code better
156 * Alan Cox : Fixed the closing state machine to
157 * resemble the RFC.
158 * Alan Cox : More 'per spec' fixes.
159 * Jorge Cwik : Even faster checksumming.
160 * Alan Cox : tcp_data() doesn't ack illegal PSH
161 * only frames. At least one pc tcp stack
162 * generates them.
163 * Alan Cox : Cache last socket.
164 * Alan Cox : Per route irtt.
165 * Matt Day : poll()->select() match BSD precisely on error
166 * Alan Cox : New buffers
167 * Marc Tamsky : Various sk->prot->retransmits and
168 * sk->retransmits misupdating fixed.
169 * Fixed tcp_write_timeout: stuck close,
170 * and TCP syn retries gets used now.
171 * Mark Yarvis : In tcp_read_wakeup(), don't send an
172 * ack if state is TCP_CLOSED.
173 * Alan Cox : Look up device on a retransmit - routes may
174 * change. Doesn't yet cope with MSS shrink right
175 * but it's a start!
176 * Marc Tamsky : Closing in closing fixes.
177 * Mike Shaver : RFC1122 verifications.
178 * Alan Cox : rcv_saddr errors.
179 * Alan Cox : Block double connect().
180 * Alan Cox : Small hooks for enSKIP.
181 * Alexey Kuznetsov: Path MTU discovery.
182 * Alan Cox : Support soft errors.
183 * Alan Cox : Fix MTU discovery pathological case
184 * when the remote claims no mtu!
185 * Marc Tamsky : TCP_CLOSE fix.
186 * Colin (G3TNE) : Send a reset on syn ack replies in
187 * window but wrong (fixes NT lpd problems)
188 * Pedro Roque : Better TCP window handling, delayed ack.
189 * Joerg Reuter : No modification of locked buffers in
190 * tcp_do_retransmit()
191 * Eric Schenk : Changed receiver side silly window
192 * avoidance algorithm to BSD style
193 * algorithm. This doubles throughput
194 * against machines running Solaris,
195 * and seems to result in general
196 * improvement.
197 * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
198 * Willy Konynenberg : Transparent proxying support.
199 * Mike McLagan : Routing by source
200 * Keith Owens : Do proper merging with partial SKB's in
201 * tcp_do_sendmsg to avoid burstiness.
202 * Eric Schenk : Fix fast close down bug with
203 * shutdown() followed by close().
204 * Andi Kleen : Make poll agree with SIGIO
205 * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
206 * lingertime == 0 (RFC 793 ABORT Call)
208 * This program is free software; you can redistribute it and/or
209 * modify it under the terms of the GNU General Public License
210 * as published by the Free Software Foundation; either version
211 * 2 of the License, or (at your option) any later version.
213 * Description of States:
215 * TCP_SYN_SENT sent a connection request, waiting for ack
217 * TCP_SYN_RECV received a connection request, sent ack,
218 * waiting for final ack in three-way handshake.
220 * TCP_ESTABLISHED connection established
222 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
223 * transmission of remaining buffered data
225 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
226 * to shutdown
228 * TCP_CLOSING both sides have shutdown but we still have
229 * data we have to finish sending
231 * TCP_TIME_WAIT timeout to catch resent junk before entering
232 * closed, can only be entered from FIN_WAIT2
233 * or CLOSING. Required because the other end
234 * may not have gotten our last ACK causing it
235 * to retransmit the data packet (which we ignore)
237 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
238 * us to finish writing our data and to shutdown
239 * (we have to close() to move on to LAST_ACK)
241 * TCP_LAST_ACK our side has shutdown after remote has
242 * shutdown. There may still be data in our
243 * buffer that we have to finish sending
245 * TCP_CLOSE socket is finished
249 * RFC1122 status:
250 * NOTE: I'm not going to be doing comments in the code for this one except
251 * for violations and the like. tcp.c is just too big... If I say something
252 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
253 * with Alan. -- MS 950903
254 * [Note: Most of the TCP code has been rewritten/redesigned since this
255 * RFC1122 check. It is probably not correct anymore. It should be redone
256 * before 2.2. -AK]
258 * Use of PSH (4.2.2.2)
259 * MAY aggregate data sent without the PSH flag. (does)
260 * MAY queue data received without the PSH flag. (does)
261 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
262 * MAY implement PSH on send calls. (doesn't, thus:)
263 * MUST NOT buffer data indefinitely (doesn't [1 second])
264 * MUST set PSH on last segment (does)
265 * MAY pass received PSH to application layer (doesn't)
266 * SHOULD send maximum-sized segment whenever possible. (almost always does)
268 * Window Size (4.2.2.3, 4.2.2.16)
269 * MUST treat window size as an unsigned number (does)
270 * SHOULD treat window size as a 32-bit number (does not)
271 * MUST NOT shrink window once it is offered (does not normally)
273 * Urgent Pointer (4.2.2.4)
274 * **MUST point urgent pointer to last byte of urgent data (not right
275 * after). (doesn't, to be like BSD. That's configurable, but defaults
276 * to off)
277 * MUST inform application layer asynchronously of incoming urgent
278 * data. (does)
279 * MUST provide application with means of determining the amount of
280 * urgent data pending. (does)
281 * **MUST support urgent data sequence of arbitrary length. (doesn't, but
282 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
283 * [Follows BSD 1 byte of urgent data]
285 * TCP Options (4.2.2.5)
286 * MUST be able to receive TCP options in any segment. (does)
287 * MUST ignore unsupported options (does)
289 * Maximum Segment Size Option (4.2.2.6)
290 * MUST implement both sending and receiving MSS. (does, but currently
291 * only uses the smaller of both of them)
292 * SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
293 * it always). (does, even when MSS == 536, which is legal)
294 * MUST assume MSS == 536 if no MSS received at connection setup (does)
295 * MUST calculate "effective send MSS" correctly:
296 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
297 * (does - but allows operator override)
299 * TCP Checksum (4.2.2.7)
300 * MUST generate and check TCP checksum. (does)
302 * Initial Sequence Number Selection (4.2.2.8)
303 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's
304 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
305 * necessary for 10Mbps networks - and harder than BSD to spoof!
306 * With syncookies we don't)
308 * Simultaneous Open Attempts (4.2.2.10)
309 * MUST support simultaneous open attempts (does)
311 * Recovery from Old Duplicate SYN (4.2.2.11)
312 * MUST keep track of active vs. passive open (does)
314 * RST segment (4.2.2.12)
315 * SHOULD allow an RST segment to contain data (does, but doesn't do
316 * anything with it, which is standard)
318 * Closing a Connection (4.2.2.13)
319 * MUST inform application of whether connection was closed by RST or
320 * normal close. (does)
321 * MAY allow "half-duplex" close (treat connection as closed for the
322 * local app, even before handshake is done). (does)
323 * MUST linger in TIME_WAIT for 2 * MSL (does)
325 * Retransmission Timeout (4.2.2.15)
326 * MUST implement Jacobson's slow start and congestion avoidance
327 * stuff. (does)
329 * Probing Zero Windows (4.2.2.17)
330 * MUST support probing of zero windows. (does)
331 * MAY keep offered window closed indefinitely. (does)
332 * MUST allow remote window to stay closed indefinitely. (does)
334 * Passive Open Calls (4.2.2.18)
335 * MUST NOT let new passive open affect other connections. (doesn't)
336 * MUST support passive opens (LISTENs) concurrently. (does)
338 * Time to Live (4.2.2.19)
339 * MUST make TCP TTL configurable. (does - IP_TTL option)
341 * Event Processing (4.2.2.20)
342 * SHOULD queue out-of-order segments. (does)
343 * MUST aggregate ACK segments whenever possible. (does but badly)
345 * Retransmission Timeout Calculation (4.2.3.1)
346 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO
347 * calculation. (does, or at least explains them in the comments 8*b)
348 * SHOULD initialize RTO to 0 and RTT to 3. (does)
350 * When to Send an ACK Segment (4.2.3.2)
351 * SHOULD implement delayed ACK. (does)
352 * MUST keep ACK delay < 0.5 sec. (does)
354 * When to Send a Window Update (4.2.3.3)
355 * MUST implement receiver-side SWS. (does)
357 * When to Send Data (4.2.3.4)
358 * MUST implement sender-side SWS. (does)
359 * SHOULD implement Nagle algorithm. (does)
361 * TCP Connection Failures (4.2.3.5)
362 * MUST handle excessive retransmissions "properly" (see the RFC). (does)
363 * SHOULD inform application layer of soft errors. (does)
365 * TCP Keep-Alives (4.2.3.6)
366 * MAY provide keep-alives. (does)
367 * MUST make keep-alives configurable on a per-connection basis. (does)
368 * MUST default to no keep-alives. (does)
369 * MUST make keep-alive interval configurable. (does)
370 * MUST make default keep-alive interval > 2 hours. (does)
371 * MUST NOT interpret failure to ACK keep-alive packet as dead
372 * connection. (doesn't)
373 * SHOULD send keep-alive with no data. (does)
375 * TCP Multihoming (4.2.3.7)
376 * MUST get source address from IP layer before sending first
377 * SYN. (does)
378 * MUST use same local address for all segments of a connection. (does)
380 * IP Options (4.2.3.8)
381 * MUST ignore unsupported IP options. (does)
382 * MAY support Time Stamp and Record Route. (does)
383 * MUST allow application to specify a source route. (does)
384 * MUST allow received Source Route option to set route for all future
385 * segments on this connection. (does not (security issues))
387 * ICMP messages (4.2.3.9)
388 * MUST act on ICMP errors. (does)
389 * MUST slow transmission upon receipt of a Source Quench. (doesn't anymore
390 * because that is deprecated now by the IETF, can be turned on)
391 * MUST NOT abort connection upon receipt of soft Destination
392 * Unreachables (0, 1, 5), Time Exceededs and Parameter
393 * Problems. (doesn't)
394 * SHOULD report soft Destination Unreachables etc. to the
395 * application. (does, except during SYN_RECV and may drop messages
396 * in some rare cases before accept() - ICMP is unreliable)
397 * SHOULD abort connection upon receipt of hard Destination Unreachable
398 * messages (2, 3, 4). (does, but see above)
400 * Remote Address Validation (4.2.3.10)
401 * MUST reject as an error OPEN for invalid remote IP address. (does)
402 * MUST ignore SYN with invalid source address. (does)
403 * MUST silently discard incoming SYN for broadcast/multicast
404 * address. (does)
406 * Asynchronous Reports (4.2.4.1)
407 * MUST provide mechanism for reporting soft errors to application
408 * layer. (does)
410 * Type of Service (4.2.4.2)
411 * MUST allow application layer to set Type of Service. (does IP_TOS)
413 * (Whew. -- MS 950903)
414 * (Updated by AK, but not complete yet.)
417 #include <linux/config.h>
418 #include <linux/types.h>
419 #include <linux/fcntl.h>
420 #include <linux/poll.h>
421 #include <linux/init.h>
422 #include <linux/smp_lock.h>
424 #include <net/icmp.h>
425 #include <net/tcp.h>
427 #include <asm/uaccess.h>
429 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
431 struct tcp_mib tcp_statistics[NR_CPUS*2];
433 kmem_cache_t *tcp_openreq_cachep;
434 kmem_cache_t *tcp_bucket_cachep;
435 kmem_cache_t *tcp_timewait_cachep;
437 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
439 int sysctl_tcp_mem[3];
440 int sysctl_tcp_wmem[3] = { 4*1024, 16*1024, 128*1024 };
441 int sysctl_tcp_rmem[3] = { 4*1024, 87380, 87380*2 };
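/* tcp_mem_schedule() below reads sysctl_tcp_mem[] as
 * [0] = low threshold, [1] = pressure threshold, [2] = hard limit
 * for the global tcp_memory_allocated count (in TCP_MEM_QUANTUM units),
 * and uses sysctl_tcp_wmem[0]/sysctl_tcp_rmem[0] as the per-socket
 * send/receive allocations (in bytes) that are still granted while
 * the system is under memory pressure.
 */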
443 atomic_t tcp_memory_allocated; /* Current allocated memory. */
444 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
446 /* Pressure flag: try to collapse.
447 * Technical note: it is used by multiple contexts non atomically.
448 * All the tcp_mem_schedule() is of this nature: accounting
449 * is strict, actions are advisory and have some latency. */
450 int tcp_memory_pressure;
452 #define TCP_PAGES(amt) (((amt)+TCP_MEM_QUANTUM-1)/TCP_MEM_QUANTUM)
454 int tcp_mem_schedule(struct sock *sk, int size, int kind)
456 int amt = TCP_PAGES(size);
458 sk->forward_alloc += amt*TCP_MEM_QUANTUM;
459 atomic_add(amt, &tcp_memory_allocated);
461 /* Under limit. */
462 if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
463 if (tcp_memory_pressure)
464 tcp_memory_pressure = 0;
465 return 1;
468 /* Over hard limit. */
469 if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
470 tcp_enter_memory_pressure();
471 goto suppress_allocation;
474 /* Under pressure. */
475 if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
476 tcp_enter_memory_pressure();
478 if (kind) {
479 if (atomic_read(&sk->rmem_alloc) < sysctl_tcp_rmem[0])
480 return 1;
481 } else {
482 if (sk->wmem_queued < sysctl_tcp_wmem[0])
483 return 1;
486 if (!tcp_memory_pressure ||
487 sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated)
488 * TCP_PAGES(sk->wmem_queued+atomic_read(&sk->rmem_alloc)+
489 sk->forward_alloc))
490 return 1;
492 suppress_allocation:
494 if (kind == 0) {
495 tcp_moderate_sndbuf(sk);
497 /* Fail only if socket is _under_ its sndbuf.
498 * In this case we cannot block, so that we have to fail.
500 if (sk->wmem_queued+size >= sk->sndbuf)
501 return 1;
504 /* Alas. Undo changes. */
505 sk->forward_alloc -= amt*TCP_MEM_QUANTUM;
506 atomic_sub(amt, &tcp_memory_allocated);
507 return 0;
510 void __tcp_mem_reclaim(struct sock *sk)
512 if (sk->forward_alloc >= TCP_MEM_QUANTUM) {
513 atomic_sub(sk->forward_alloc/TCP_MEM_QUANTUM, &tcp_memory_allocated);
514 sk->forward_alloc &= (TCP_MEM_QUANTUM-1);
515 if (tcp_memory_pressure &&
516 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
517 tcp_memory_pressure = 0;
521 void tcp_rfree(struct sk_buff *skb)
523 struct sock *sk = skb->sk;
525 atomic_sub(skb->truesize, &sk->rmem_alloc);
526 sk->forward_alloc += skb->truesize;
530 * LISTEN is a special case for poll..
532 static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
534 return sk->tp_pinfo.af_tcp.accept_queue ? (POLLIN | POLLRDNORM) : 0;
538 * Wait for a TCP event.
540 * Note that we don't need to lock the socket, as the upper poll layers
541 * take care of normal races (between the test and the event) and we don't
542 * go look at any of the socket buffers directly.
544 unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
546 unsigned int mask;
547 struct sock *sk = sock->sk;
548 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
550 poll_wait(file, sk->sleep, wait);
551 if (sk->state == TCP_LISTEN)
552 return tcp_listen_poll(sk, wait);
554 /* Socket is not locked. We are protected from async events
555 by poll logic and correct handling of state changes
556 made by other threads is impossible in any case.
559 mask = 0;
560 if (sk->err)
561 mask = POLLERR;
564 * POLLHUP is certainly not done right. But poll() doesn't
565 * have a notion of HUP in just one direction, and for a
566 * socket the read side is more interesting.
568 * Some poll() documentation says that POLLHUP is incompatible
569 * with the POLLOUT/POLLWR flags, so somebody should check this
570 * all. But careful, it tends to be safer to return too many
571 * bits than too few, and you can easily break real applications
572 * if you don't tell them that something has hung up!
574 * Check-me.
576 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
577 * our fs/select.c). It means that after we received EOF,
578 * poll always returns immediately, making poll() for write() impossible
579 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
580 * if and only if shutdown has been made in both directions.
581 * Actually, it is interesting to look at how Solaris and DUX
582 * solve this dilemma. I would prefer, if POLLHUP were maskable,
583 * then we could set it on SND_SHUTDOWN. BTW examples given
584 * in Stevens' books assume exactly this behaviour, which explains
585 * why POLLHUP is incompatible with POLLOUT. --ANK
587 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
588 * blocking on fresh not-connected or disconnected socket. --ANK
590 if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE)
591 mask |= POLLHUP;
592 if (sk->shutdown & RCV_SHUTDOWN)
593 mask |= POLLIN | POLLRDNORM;
595 /* Connected? */
596 if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
597 /* Potential race condition. If read of tp below will
598 * escape above sk->state, we can be illegally awakened
599 * in SYN_* states. */
600 if ((tp->rcv_nxt != tp->copied_seq) &&
601 (tp->urg_seq != tp->copied_seq ||
602 tp->rcv_nxt != tp->copied_seq+1 ||
603 sk->urginline || !tp->urg_data))
604 mask |= POLLIN | POLLRDNORM;
606 if (!(sk->shutdown & SEND_SHUTDOWN)) {
607 if (tcp_wspace(sk) >= tcp_min_write_space(sk)) {
608 mask |= POLLOUT | POLLWRNORM;
609 } else { /* send SIGIO later */
610 set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
611 set_bit(SOCK_NOSPACE, &sk->socket->flags);
613 /* Race breaker. If space is freed after
614 * wspace test but before the flags are set,
615 * IO signal will be lost.
617 if (tcp_wspace(sk) >= tcp_min_write_space(sk))
618 mask |= POLLOUT | POLLWRNORM;
622 if (tp->urg_data & TCP_URG_VALID)
623 mask |= POLLPRI;
625 return mask;
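As a reading aid for the semantics tcp_poll() implements (POLLPRI for urgent data, POLLIN once data or the peer's FIN is queued, POLLOUT while tcp_wspace() is large enough, POLLHUP only when both directions are shut down), here is a minimal userspace sketch. It is not part of this file; the helper name and the descriptor fd are invented for the example, and fd is assumed to be a connected TCP socket.

#include <poll.h>
#include <stdio.h>

/* Illustrative only: wait on a connected TCP socket 'fd' and decode
 * the revents bits that tcp_poll() above can report. */
static void wait_on_tcp_socket(int fd)
{
	struct pollfd pfd;

	pfd.fd = fd;
	pfd.events = POLLIN | POLLOUT | POLLPRI;
	pfd.revents = 0;

	if (poll(&pfd, 1, 5000) <= 0)	/* 5 second timeout, or error */
		return;

	if (pfd.revents & POLLERR)
		printf("pending socket error\n");
	if (pfd.revents & POLLPRI)
		printf("urgent (out-of-band) data pending\n");
	if (pfd.revents & POLLIN)
		printf("data readable, or EOF after the peer's FIN\n");
	if (pfd.revents & POLLOUT)
		printf("room in the send buffer\n");
	if (pfd.revents & POLLHUP)
		printf("both directions shut down\n");
}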
629 * TCP socket write_space callback. Not used.
631 void tcp_write_space(struct sock *sk)
635 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
637 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
638 int answ;
640 switch(cmd) {
641 case SIOCINQ:
642 if (sk->state == TCP_LISTEN)
643 return(-EINVAL);
645 lock_sock(sk);
646 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
647 answ = 0;
648 else if (sk->urginline || !tp->urg_data ||
649 before(tp->urg_seq,tp->copied_seq) ||
650 !before(tp->urg_seq,tp->rcv_nxt)) {
651 answ = tp->rcv_nxt - tp->copied_seq;
653 /* Subtract 1, if FIN is in queue. */
654 if (answ && !skb_queue_empty(&sk->receive_queue))
655 answ -= ((struct sk_buff*)sk->receive_queue.prev)->h.th->fin;
656 } else
657 answ = tp->urg_seq - tp->copied_seq;
658 release_sock(sk);
659 break;
660 case SIOCATMARK:
662 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
663 break;
665 case SIOCOUTQ:
666 if (sk->state == TCP_LISTEN)
667 return(-EINVAL);
669 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
670 answ = 0;
671 else
672 answ = tp->write_seq - tp->snd_una;
673 break;
674 default:
675 return(-ENOIOCTLCMD);
678 return put_user(answ, (int *)arg);
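The three ioctls handled above are what ordinary applications use to inspect the queues: SIOCINQ (bytes not yet read, rcv_nxt - copied_seq minus a queued FIN), SIOCOUTQ (bytes not yet acknowledged, write_seq - snd_una) and SIOCATMARK (whether the next byte to read is at the urgent mark). A hedged userspace sketch follows; the helper name is invented and fd is assumed to be a connected TCP socket.

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>	/* SIOCINQ, SIOCOUTQ, SIOCATMARK */

/* Illustrative only: query queue state of a connected TCP socket. */
static void show_tcp_queues(int fd)
{
	int inq = 0, outq = 0, atmark = 0;

	if (ioctl(fd, SIOCINQ, &inq) == 0)
		printf("unread bytes in receive queue: %d\n", inq);
	if (ioctl(fd, SIOCOUTQ, &outq) == 0)
		printf("unacknowledged bytes in send queue: %d\n", outq);
	if (ioctl(fd, SIOCATMARK, &atmark) == 0)
		printf("at urgent mark: %s\n", atmark ? "yes" : "no");
}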
682 int tcp_listen_start(struct sock *sk)
684 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
685 struct tcp_listen_opt *lopt;
687 sk->max_ack_backlog = 0;
688 sk->ack_backlog = 0;
689 tp->accept_queue = tp->accept_queue_tail = NULL;
690 tp->syn_wait_lock = RW_LOCK_UNLOCKED;
692 lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
693 if (!lopt)
694 return -ENOMEM;
696 memset(lopt, 0, sizeof(struct tcp_listen_opt));
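/* Size the SYN queue: max_qlen_log becomes the log2 of the smallest
 * power of two that is >= sysctl_max_syn_backlog, with a floor of
 * 64 entries (the loop starts at 6).
 */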
697 for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
698 if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog)
699 break;
701 write_lock_bh(&tp->syn_wait_lock);
702 tp->listen_opt = lopt;
703 write_unlock_bh(&tp->syn_wait_lock);
705 /* There is a race window here: we announce ourselves listening,
706 * but this transition is still not validated by get_port().
707 * It is OK, because this socket enters the hash table only
708 * after validation is complete.
710 sk->state = TCP_LISTEN;
711 if (sk->prot->get_port(sk, sk->num) == 0) {
712 sk->sport = htons(sk->num);
714 sk_dst_reset(sk);
715 sk->prot->hash(sk);
717 return 0;
720 sk->state = TCP_CLOSE;
721 write_lock_bh(&tp->syn_wait_lock);
722 tp->listen_opt = NULL;
723 write_unlock_bh(&tp->syn_wait_lock);
724 kfree(lopt);
725 return -EADDRINUSE;
729 * This routine closes sockets which have been at least partially
730 * opened, but not yet accepted.
733 static void tcp_listen_stop (struct sock *sk)
735 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
736 struct tcp_listen_opt *lopt = tp->listen_opt;
737 struct open_request *acc_req = tp->accept_queue;
738 struct open_request *req;
739 int i;
741 tcp_delete_keepalive_timer(sk);
743 /* make all the listen_opt local to us */
744 write_lock_bh(&tp->syn_wait_lock);
745 tp->listen_opt = NULL;
746 write_unlock_bh(&tp->syn_wait_lock);
747 tp->accept_queue = tp->accept_queue_tail = NULL;
749 if (lopt->qlen) {
750 for (i=0; i<TCP_SYNQ_HSIZE; i++) {
751 while ((req = lopt->syn_table[i]) != NULL) {
752 lopt->syn_table[i] = req->dl_next;
753 lopt->qlen--;
754 tcp_openreq_free(req);
756 /* Following specs, it would be better either to send FIN
757 * (and enter FIN-WAIT-1, it is normal close)
758 * or to send active reset (abort).
759 * Certainly, it is pretty dangerous while synflood, but it is
760 * bad justification for our negligence 8)
761 * To be honest, we are not able to make either
762 * of the variants now. --ANK
767 BUG_TRAP(lopt->qlen == 0);
769 kfree(lopt);
771 while ((req=acc_req) != NULL) {
772 struct sock *child = req->sk;
774 acc_req = req->dl_next;
776 local_bh_disable();
777 bh_lock_sock(child);
778 BUG_TRAP(child->lock.users==0);
779 sock_hold(child);
781 tcp_disconnect(child, O_NONBLOCK);
783 sock_orphan(child);
785 atomic_inc(&tcp_orphan_count);
787 tcp_destroy_sock(child);
789 bh_unlock_sock(child);
790 local_bh_enable();
791 sock_put(child);
793 tcp_acceptq_removed(sk);
794 tcp_openreq_fastfree(req);
796 BUG_TRAP(sk->ack_backlog == 0);
800 * Wait for a socket to get into the connected state
802 * Note: Must be called with the socket locked.
804 static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p)
806 struct task_struct *tsk = current;
807 DECLARE_WAITQUEUE(wait, tsk);
809 while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
810 if(sk->err)
811 return sock_error(sk);
812 if((1 << sk->state) &
813 ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
814 if(sk->keepopen && !(flags&MSG_NOSIGNAL))
815 send_sig(SIGPIPE, tsk, 0);
816 return -EPIPE;
818 if(!*timeo_p)
819 return -EAGAIN;
820 if(signal_pending(tsk))
821 return sock_intr_errno(*timeo_p);
823 __set_task_state(tsk, TASK_INTERRUPTIBLE);
824 add_wait_queue(sk->sleep, &wait);
825 sk->tp_pinfo.af_tcp.write_pending++;
827 release_sock(sk);
828 *timeo_p = schedule_timeout(*timeo_p);
829 lock_sock(sk);
831 __set_task_state(tsk, TASK_RUNNING);
832 remove_wait_queue(sk->sleep, &wait);
833 sk->tp_pinfo.af_tcp.write_pending--;
835 return 0;
838 static inline int tcp_memory_free(struct sock *sk)
840 return sk->wmem_queued < sk->sndbuf;
844 * Wait for more memory for a socket
846 static long wait_for_tcp_memory(struct sock * sk, long timeo)
848 long vm_wait = 0;
849 long current_timeo = timeo;
850 DECLARE_WAITQUEUE(wait, current);
852 if (tcp_memory_free(sk))
853 current_timeo = vm_wait = (net_random()%(HZ/5))+2;
855 clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
857 add_wait_queue(sk->sleep, &wait);
858 for (;;) {
859 set_bit(SOCK_NOSPACE, &sk->socket->flags);
861 set_current_state(TASK_INTERRUPTIBLE);
863 if (signal_pending(current))
864 break;
865 if (tcp_memory_free(sk) && !vm_wait)
866 break;
867 if (sk->shutdown & SEND_SHUTDOWN)
868 break;
869 if (sk->err)
870 break;
871 release_sock(sk);
872 if (!tcp_memory_free(sk) || vm_wait)
873 current_timeo = schedule_timeout(current_timeo);
874 lock_sock(sk);
875 if (vm_wait) {
876 if (timeo != MAX_SCHEDULE_TIMEOUT &&
877 (timeo -= vm_wait-current_timeo) < 0)
878 timeo = 0;
879 break;
880 } else {
881 timeo = current_timeo;
884 current->state = TASK_RUNNING;
885 remove_wait_queue(sk->sleep, &wait);
886 return timeo;
889 /* When all user supplied data has been queued set the PSH bit */
890 #define PSH_NEEDED (seglen == 0 && iovlen == 0)
893 * This routine copies from a user buffer into a socket,
894 * and starts the transmit system.
897 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size)
899 struct iovec *iov;
900 struct tcp_opt *tp;
901 struct sk_buff *skb;
902 int iovlen, flags;
903 int mss_now;
904 int err, copied;
905 long timeo;
907 err = 0;
908 tp = &(sk->tp_pinfo.af_tcp);
910 lock_sock(sk);
911 TCP_CHECK_TIMER(sk);
913 flags = msg->msg_flags;
915 timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
917 /* Wait for a connection to finish. */
918 if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
919 if((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
920 goto out_unlock;
922 /* This should be in poll */
923 clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
925 mss_now = tcp_current_mss(sk);
927 /* Ok commence sending. */
928 iovlen = msg->msg_iovlen;
929 iov = msg->msg_iov;
930 copied = 0;
932 while (--iovlen >= 0) {
933 int seglen=iov->iov_len;
934 unsigned char * from=iov->iov_base;
936 iov++;
938 while (seglen > 0) {
939 int copy, tmp, queue_it;
941 if (err)
942 goto do_fault2;
944 /* Stop on errors. */
945 if (sk->err)
946 goto do_sock_err;
948 /* Make sure that we are established. */
949 if (sk->shutdown & SEND_SHUTDOWN)
950 goto do_shutdown;
952 /* Now we need to check if we have a half
953 * built packet we can tack some data onto.
955 skb = sk->write_queue.prev;
956 if (tp->send_head &&
957 (mss_now - skb->len) > 0) {
958 copy = skb->len;
959 if (skb_tailroom(skb) > 0) {
960 int last_byte_was_odd = (copy % 4);
962 copy = mss_now - copy;
963 if(copy > skb_tailroom(skb))
964 copy = skb_tailroom(skb);
965 if(copy > seglen)
966 copy = seglen;
967 if(last_byte_was_odd) {
968 if(copy_from_user(skb_put(skb, copy),
969 from, copy))
970 err = -EFAULT;
971 skb->csum = csum_partial(skb->data,
972 skb->len, 0);
973 } else {
974 skb->csum =
975 csum_and_copy_from_user(
976 from, skb_put(skb, copy),
977 copy, skb->csum, &err);
980 * FIXME: the *_user functions should
981 * return how much data was
982 * copied before the fault
983 * occurred and then a partial
984 * packet with this data should
985 * be sent. Unfortunately
986 * csum_and_copy_from_user doesn't
987 * return this information.
988 * ATM it might send partly zeroed
989 * data in this case.
991 tp->write_seq += copy;
992 TCP_SKB_CB(skb)->end_seq += copy;
993 from += copy;
994 copied += copy;
995 seglen -= copy;
996 if (PSH_NEEDED ||
997 after(tp->write_seq, tp->pushed_seq+(tp->max_window>>1))) {
998 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
999 tp->pushed_seq = tp->write_seq;
1001 if (flags&MSG_OOB) {
1002 tp->urg_mode = 1;
1003 tp->snd_up = tp->write_seq;
1004 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
1006 continue;
1007 } else {
1008 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
1009 tp->pushed_seq = tp->write_seq;
1013 copy = min(seglen, mss_now);
1015 /* Determine how large of a buffer to allocate. */
1016 tmp = MAX_TCP_HEADER + 15 + tp->mss_cache;
1017 if (copy < mss_now && !(flags & MSG_OOB)) {
1018 /* What is happening here is that we want to
1019 * tack on later members of the user's iovec
1020 * if possible into a single frame. When we
1021 * leave this loop we check to see if
1022 * we can send queued frames onto the wire.
1024 queue_it = 1;
1025 } else {
1026 queue_it = 0;
1029 skb = NULL;
1030 if (tcp_memory_free(sk))
1031 skb = tcp_alloc_skb(sk, tmp, sk->allocation);
1032 if (skb == NULL) {
1033 /* If we didn't get any memory, we need to sleep. */
1034 set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
1035 set_bit(SOCK_NOSPACE, &sk->socket->flags);
1037 __tcp_push_pending_frames(sk, tp, mss_now, 1);
1039 if (!timeo) {
1040 err = -EAGAIN;
1041 goto do_interrupted;
1043 if (signal_pending(current)) {
1044 err = sock_intr_errno(timeo);
1045 goto do_interrupted;
1047 timeo = wait_for_tcp_memory(sk, timeo);
1049 /* If SACK's were formed or PMTU events happened,
1050 * we must find out about it.
1052 mss_now = tcp_current_mss(sk);
1053 continue;
1056 seglen -= copy;
1058 /* Prepare control bits for TCP header creation engine. */
1059 if (PSH_NEEDED ||
1060 after(tp->write_seq+copy, tp->pushed_seq+(tp->max_window>>1))) {
1061 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK|TCPCB_FLAG_PSH;
1062 tp->pushed_seq = tp->write_seq + copy;
1063 } else {
1064 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
1066 TCP_SKB_CB(skb)->sacked = 0;
1067 if (flags & MSG_OOB) {
1068 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
1069 tp->urg_mode = 1;
1070 tp->snd_up = tp->write_seq + copy;
1073 /* TCP data bytes are SKB_PUT() on top, later
1074 * TCP+IP+DEV headers are SKB_PUSH()'d beneath.
1075 * Reserve header space and checksum the data.
1077 skb_reserve(skb, MAX_TCP_HEADER);
1078 skb->csum = csum_and_copy_from_user(from,
1079 skb_put(skb, copy), copy, 0, &err);
1081 if (err)
1082 goto do_fault;
1084 from += copy;
1085 copied += copy;
1087 TCP_SKB_CB(skb)->seq = tp->write_seq;
1088 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy;
1090 /* This advances tp->write_seq for us. */
1091 tcp_send_skb(sk, skb, queue_it, mss_now);
1094 err = copied;
1095 out:
1096 __tcp_push_pending_frames(sk, tp, mss_now, tp->nonagle);
1097 out_unlock:
1098 TCP_CHECK_TIMER(sk);
1099 release_sock(sk);
1100 return err;
1102 do_sock_err:
1103 if (copied)
1104 err = copied;
1105 else
1106 err = sock_error(sk);
1107 goto out;
1108 do_shutdown:
1109 if (copied)
1110 err = copied;
1111 else {
1112 if (!(flags&MSG_NOSIGNAL))
1113 send_sig(SIGPIPE, current, 0);
1114 err = -EPIPE;
1116 goto out;
1117 do_interrupted:
1118 if (copied)
1119 err = copied;
1120 goto out_unlock;
1121 do_fault:
1122 __kfree_skb(skb);
1123 do_fault2:
1124 if (copied)
1125 err = copied;
1126 else
1127 err = -EFAULT;
1128 goto out;
1131 #undef PSH_NEEDED
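One user-visible consequence of the loop above: when tcp_memory_free() fails and the caller passed MSG_DONTWAIT (or set a zero send timeout), tcp_sendmsg() returns the bytes already copied, or -EAGAIN if nothing was copied, instead of sleeping in wait_for_tcp_memory(). A small userspace sketch of coping with that follows; it is illustrative only, the helper name is invented, and fd is assumed to be a connected TCP socket.

#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>

/* Illustrative only: push 'len' bytes without blocking, handling the
 * short writes / EAGAIN that tcp_sendmsg() can return when the send
 * buffer fills up. */
static ssize_t send_all_nonblock(int fd, const char *buf, size_t len)
{
	size_t done = 0;

	while (done < len) {
		ssize_t n = send(fd, buf + done, len - done,
				 MSG_DONTWAIT | MSG_NOSIGNAL);
		if (n > 0) {
			done += n;
			continue;
		}
		if (n < 0 && errno == EINTR)
			continue;
		if (n < 0 && errno == EAGAIN)
			break;		/* caller should poll() for POLLOUT */
		return -1;		/* e.g. EPIPE after SEND_SHUTDOWN */
	}
	return done;
}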
1134 * Handle reading urgent data. BSD has very simple semantics for
1135 * this, no blocking and very strange errors 8)
1138 static int tcp_recv_urg(struct sock * sk, long timeo,
1139 struct msghdr *msg, int len, int flags,
1140 int *addr_len)
1142 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1144 /* No URG data to read. */
1145 if (sk->urginline || !tp->urg_data || tp->urg_data == TCP_URG_READ)
1146 return -EINVAL; /* Yes this is right ! */
1148 if (sk->state==TCP_CLOSE && !sk->done)
1149 return -ENOTCONN;
1151 if (tp->urg_data & TCP_URG_VALID) {
1152 int err = 0;
1153 char c = tp->urg_data;
1155 if (!(flags & MSG_PEEK))
1156 tp->urg_data = TCP_URG_READ;
1158 /* Read urgent data. */
1159 msg->msg_flags|=MSG_OOB;
1161 if(len>0) {
1162 if (!(flags & MSG_PEEK))
1163 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1164 len = 1;
1165 } else
1166 msg->msg_flags|=MSG_TRUNC;
1168 return err ? -EFAULT : len;
1171 if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN))
1172 return 0;
1174 /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
1175 * the available implementations agree in this case:
1176 * this call should never block, independent of the
1177 * blocking state of the socket.
1178 * Mike <pall@rz.uni-karlsruhe.de>
1180 return -EAGAIN;
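From userspace the behaviour coded above looks like this: the last byte of a send() flagged MSG_OOB becomes the single urgent byte (BSD style, as noted in the RFC1122 comments in the header), recv() with MSG_OOB never blocks, and it fails with EINVAL when SO_OOBINLINE is set or the byte was already consumed. The sketch below is illustrative only and not part of this file; both descriptors are assumed to be the two ends of an established connection.

#include <errno.h>
#include <stdio.h>
#include <sys/socket.h>

/* Illustrative only: send one urgent byte, then fetch it out of band. */
static void oob_demo(int sender_fd, int receiver_fd)
{
	char c;

	/* The last byte of an MSG_OOB send becomes the urgent byte. */
	send(sender_fd, "!", 1, MSG_OOB);

	/* Never blocks: one byte, or -1 with EAGAIN/EINVAL. */
	if (recv(receiver_fd, &c, 1, MSG_OOB) == 1)
		printf("urgent byte: %c\n", c);
	else if (errno == EAGAIN)
		printf("no urgent data pending yet\n");
	else if (errno == EINVAL)
		printf("data is inline (SO_OOBINLINE) or already read\n");
}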
1184 * Release a skb if it is no longer needed. This routine
1185 * must be called with interrupts disabled or with the
1186 * socket locked so that the sk_buff queue operation is ok.
1189 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
1191 __skb_unlink(skb, &sk->receive_queue);
1192 __kfree_skb(skb);
1195 /* Clean up the receive buffer for full frames taken by the user,
1196 * then send an ACK if necessary. COPIED is the number of bytes
1197 * tcp_recvmsg has given to the user so far, it speeds up the
1198 * calculation of whether or not we must ACK for the sake of
1199 * a window update.
1201 static void cleanup_rbuf(struct sock *sk, int copied)
1203 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1204 struct sk_buff *skb;
1205 int time_to_ack = 0;
1207 /* NOTE! The socket must be locked, so that we don't get
1208 * a messed-up receive queue.
1210 while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
1211 if (!skb->used)
1212 break;
1213 tcp_eat_skb(sk, skb);
1216 if (tcp_ack_scheduled(tp)) {
1217 /* Delayed ACKs frequently hit locked sockets during bulk receive. */
1218 if (tp->ack.blocked
1219 /* Once-per-two-segments ACK was not sent by tcp_input.c */
1220 || tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss
1222 * If this read emptied the read buffer, we send an ACK, if
1223 * the connection is not bidirectional, the user drained the
1224 * receive buffer and there was a small segment
1225 * in the queue.
1227 || (copied > 0 &&
1228 (tp->ack.pending&TCP_ACK_PUSHED) &&
1229 !tp->ack.pingpong &&
1230 atomic_read(&sk->rmem_alloc) == 0)) {
1231 time_to_ack = 1;
1235 /* We send an ACK if we can now advertise a non-zero window
1236 * which has been raised "significantly".
1238 * Even if the window is raised up to infinity, do not send a window-open ACK
1239 * in states where we will not receive more data. It is useless.
1241 if(copied > 0 && !time_to_ack && !(sk->shutdown&RCV_SHUTDOWN)) {
1242 __u32 rcv_window_now = tcp_receive_window(tp);
1244 /* Optimize, __tcp_select_window() is not cheap. */
1245 if (2*rcv_window_now <= tp->window_clamp) {
1246 __u32 new_window = __tcp_select_window(sk);
1248 /* Send ACK now, if this read freed lots of space
1249 * in our buffer. Certainly, new_window is new window.
1250 * We can advertise it now, if it is not less than current one.
1251 * "Lots" means "at least twice" here.
1253 if(new_window && new_window >= 2*rcv_window_now)
1254 time_to_ack = 1;
1257 if (time_to_ack)
1258 tcp_send_ack(sk);
1261 /* Now socket state including sk->err is changed only under lock,
1262 * hence we may omit checks after joining wait queue.
1263 * We check receive queue before schedule() only as optimization;
1264 * it is very likely that release_sock() added new data.
1267 static long tcp_data_wait(struct sock *sk, long timeo)
1269 DECLARE_WAITQUEUE(wait, current);
1271 add_wait_queue(sk->sleep, &wait);
1273 __set_current_state(TASK_INTERRUPTIBLE);
1275 set_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1276 release_sock(sk);
1278 if (skb_queue_empty(&sk->receive_queue))
1279 timeo = schedule_timeout(timeo);
1281 lock_sock(sk);
1282 clear_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1284 remove_wait_queue(sk->sleep, &wait);
1285 __set_current_state(TASK_RUNNING);
1286 return timeo;
1289 static void tcp_prequeue_process(struct sock *sk)
1291 struct sk_buff *skb;
1292 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1294 net_statistics[smp_processor_id()*2+1].TCPPrequeued += skb_queue_len(&tp->ucopy.prequeue);
1296 /* RX process wants to run with disabled BHs, though it is not necessary */
1297 local_bh_disable();
1298 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1299 sk->backlog_rcv(sk, skb);
1300 local_bh_enable();
1302 /* Clear memory counter. */
1303 tp->ucopy.memory = 0;
1307 * This routine copies from a sock struct into the user buffer.
1309 * Technical note: in 2.3 we work on _locked_ socket, so that
1310 * tricks with *seq access order and skb->users are not required.
1311 * Probably, code can be easily improved even more.
1314 int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
1315 int len, int nonblock, int flags, int *addr_len)
1317 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1318 int copied = 0;
1319 u32 peek_seq;
1320 u32 *seq;
1321 unsigned long used;
1322 int err;
1323 int target; /* Read at least this many bytes */
1324 long timeo;
1325 struct task_struct *user_recv = NULL;
1327 lock_sock(sk);
1329 TCP_CHECK_TIMER(sk);
1331 err = -ENOTCONN;
1332 if (sk->state == TCP_LISTEN)
1333 goto out;
1335 timeo = sock_rcvtimeo(sk, nonblock);
1337 /* Urgent data needs to be handled specially. */
1338 if (flags & MSG_OOB)
1339 goto recv_urg;
1341 seq = &tp->copied_seq;
1342 if (flags & MSG_PEEK) {
1343 peek_seq = tp->copied_seq;
1344 seq = &peek_seq;
1347 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1349 do {
1350 struct sk_buff * skb;
1351 u32 offset;
1353 /* Are we at urgent data? Stop if we have read anything. */
1354 if (copied && tp->urg_data && tp->urg_seq == *seq)
1355 break;
1357 /* We need to check signals first, to get correct SIGURG
1358 * handling. FIXME: Need to check this doesn't impact 1003.1g
1359 * and move it down to the bottom of the loop
1361 if (signal_pending(current)) {
1362 if (copied)
1363 break;
1364 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1365 break;
1368 /* Next get a buffer. */
1370 skb = skb_peek(&sk->receive_queue);
1371 do {
1372 if (!skb)
1373 break;
1375 /* Now that we have two receive queues this
1376 * shouldn't happen.
1378 if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1379 printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
1380 *seq, TCP_SKB_CB(skb)->seq);
1381 break;
1383 offset = *seq - TCP_SKB_CB(skb)->seq;
1384 if (skb->h.th->syn)
1385 offset--;
1386 if (offset < skb->len)
1387 goto found_ok_skb;
1388 if (skb->h.th->fin)
1389 goto found_fin_ok;
1390 if (!(flags & MSG_PEEK))
1391 skb->used = 1;
1392 skb = skb->next;
1393 } while (skb != (struct sk_buff *)&sk->receive_queue);
1395 /* Well, if we have backlog, try to process it now. */
1397 if (copied >= target && sk->backlog.tail == NULL)
1398 break;
1400 if (copied) {
1401 if (sk->err ||
1402 sk->state == TCP_CLOSE ||
1403 (sk->shutdown & RCV_SHUTDOWN) ||
1404 !timeo)
1405 break;
1406 } else {
1407 if (sk->done)
1408 break;
1410 if (sk->err) {
1411 copied = sock_error(sk);
1412 break;
1415 if (sk->shutdown & RCV_SHUTDOWN)
1416 break;
1418 if (sk->state == TCP_CLOSE) {
1419 if (!sk->done) {
1420 /* This occurs when user tries to read
1421 * from never connected socket.
1423 copied = -ENOTCONN;
1424 break;
1426 break;
1429 if (!timeo) {
1430 copied = -EAGAIN;
1431 break;
1435 cleanup_rbuf(sk, copied);
1437 if (tp->ucopy.task == user_recv) {
1438 /* Install new reader */
1439 if (user_recv == NULL && !(flags&(MSG_TRUNC|MSG_PEEK))) {
1440 user_recv = current;
1441 tp->ucopy.task = user_recv;
1442 tp->ucopy.iov = msg->msg_iov;
1445 tp->ucopy.len = len;
1447 BUG_TRAP(tp->copied_seq == tp->rcv_nxt || (flags&(MSG_PEEK|MSG_TRUNC)));
1449 /* Ugly... If prequeue is not empty, we have to
1450 * process it before releasing socket, otherwise
1451 * order will be broken at second iteration.
1452 * More elegant solution is required!!!
1454 * Look: we have the following (pseudo)queues:
1456 * 1. packets in flight
1457 * 2. backlog
1458 * 3. prequeue
1459 * 4. receive_queue
1461 * Each queue can be processed only if the next ones
1462 * are empty. At this point we have empty receive_queue.
1463 * But prequeue _can_ be not empty after second iteration,
1464 * when we jumped to start of loop because backlog
1465 * processing added something to receive_queue.
1466 * We cannot release_sock(), because backlog contains
1467 * packets arrived _after_ prequeued ones.
1469 * Shortly, algorithm is clear --- to process all
1470 * the queues in order. We could make it more directly,
1471 * requeueing packets from backlog to prequeue, if it
1472 * is not empty. It is more elegant, but eats cycles,
1473 * unfortunately.
1475 if (skb_queue_len(&tp->ucopy.prequeue))
1476 goto do_prequeue;
1478 /* __ Set realtime policy in scheduler __ */
1481 if (copied >= target) {
1482 /* Do not sleep, just process backlog. */
1483 release_sock(sk);
1484 lock_sock(sk);
1485 } else {
1486 timeo = tcp_data_wait(sk, timeo);
1489 if (user_recv) {
1490 int chunk;
1492 /* __ Restore normal policy in scheduler __ */
1494 if ((chunk = len - tp->ucopy.len) != 0) {
1495 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromBacklog += chunk;
1496 len -= chunk;
1497 copied += chunk;
1500 if (tp->rcv_nxt == tp->copied_seq &&
1501 skb_queue_len(&tp->ucopy.prequeue)) {
1502 do_prequeue:
1503 tcp_prequeue_process(sk);
1505 if ((chunk = len - tp->ucopy.len) != 0) {
1506 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1507 len -= chunk;
1508 copied += chunk;
1512 continue;
1514 found_ok_skb:
1515 /* Ok so how much can we use? */
1516 used = skb->len - offset;
1517 if (len < used)
1518 used = len;
1520 /* Do we have urgent data here? */
1521 if (tp->urg_data) {
1522 u32 urg_offset = tp->urg_seq - *seq;
1523 if (urg_offset < used) {
1524 if (!urg_offset) {
1525 if (!sk->urginline) {
1526 ++*seq;
1527 offset++;
1528 used--;
1530 } else
1531 used = urg_offset;
1535 err = 0;
1536 if (!(flags&MSG_TRUNC)) {
1537 err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used);
1538 if (err) {
1539 /* Exception. Bailout! */
1540 if (!copied)
1541 copied = -EFAULT;
1542 break;
1546 *seq += used;
1547 copied += used;
1548 len -= used;
1550 if (after(tp->copied_seq,tp->urg_seq)) {
1551 tp->urg_data = 0;
1552 if (skb_queue_len(&tp->out_of_order_queue) == 0
1553 #ifdef TCP_FORMAL_WINDOW
1554 && tcp_receive_window(tp)
1555 #endif
1557 tcp_fast_path_on(tp);
1560 if (used + offset < skb->len)
1561 continue;
1563 /* Process the FIN. We may also need to handle PSH
1564 * here and make it break out of MSG_WAITALL.
1566 if (skb->h.th->fin)
1567 goto found_fin_ok;
1568 if (flags & MSG_PEEK)
1569 continue;
1570 skb->used = 1;
1571 tcp_eat_skb(sk, skb);
1572 continue;
1574 found_fin_ok:
1575 ++*seq;
1576 if (flags & MSG_PEEK)
1577 break;
1579 /* All is done. */
1580 skb->used = 1;
1581 break;
1582 } while (len > 0);
1584 if (user_recv) {
1585 if (skb_queue_len(&tp->ucopy.prequeue)) {
1586 int chunk;
1588 tp->ucopy.len = copied > 0 ? len : 0;
1590 tcp_prequeue_process(sk);
1592 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1593 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1594 len -= chunk;
1595 copied += chunk;
1599 tp->ucopy.task = NULL;
1600 tp->ucopy.len = 0;
1603 /* According to UNIX98, msg_name/msg_namelen are ignored
1604 * on connected socket. I was just happy when found this 8) --ANK
1607 /* Clean up data we have read: This will do ACK frames. */
1608 cleanup_rbuf(sk, copied);
1610 TCP_CHECK_TIMER(sk);
1611 release_sock(sk);
1612 return copied;
1614 out:
1615 TCP_CHECK_TIMER(sk);
1616 release_sock(sk);
1617 return err;
1619 recv_urg:
1620 err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1621 goto out;
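The target computed from sock_rcvlowat() above is what gives recv() its MSG_WAITALL and SO_RCVLOWAT behaviour: the copy loop keeps waiting until at least target bytes have been delivered, or the connection ends, or a signal arrives. A minimal userspace sketch (illustrative only; the helper name is invented and fd is assumed to be a blocking, connected socket):

#include <sys/types.h>
#include <sys/socket.h>

/* Illustrative only: read a fixed-size record in one call. Without
 * MSG_WAITALL recv() may return as soon as any data is queued; with it,
 * tcp_recvmsg() keeps waiting until 'len' bytes were copied, unless the
 * connection ends or a signal interrupts the call. */
static ssize_t read_record(int fd, void *buf, size_t len)
{
	return recv(fd, buf, len, MSG_WAITALL);
}

Raising SO_RCVLOWAT with setsockopt() should have a similar effect on plain recv() calls, since sock_rcvlowat() folds the low-water mark into target.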
1625 * State processing on a close. This implements the state shift for
1626 * sending our FIN frame. Note that we only send a FIN for some
1627 * states. A shutdown() may have already sent the FIN, or we may be
1628 * closed.
1631 static unsigned char new_state[16] = {
1632 /* current state: new state: action: */
1633 /* (Invalid) */ TCP_CLOSE,
1634 /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1635 /* TCP_SYN_SENT */ TCP_CLOSE,
1636 /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1637 /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
1638 /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
1639 /* TCP_TIME_WAIT */ TCP_CLOSE,
1640 /* TCP_CLOSE */ TCP_CLOSE,
1641 /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN,
1642 /* TCP_LAST_ACK */ TCP_LAST_ACK,
1643 /* TCP_LISTEN */ TCP_CLOSE,
1644 /* TCP_CLOSING */ TCP_CLOSING,
1647 static int tcp_close_state(struct sock *sk)
1649 int next = (int) new_state[sk->state];
1650 int ns = (next & TCP_STATE_MASK);
1652 tcp_set_state(sk, ns);
1654 return (next & TCP_ACTION_FIN);
1658 * Shutdown the sending side of a connection. Much like close except
1659 * that we don't receive shut down or set sk->dead.
1662 void tcp_shutdown(struct sock *sk, int how)
1664 /* We need to grab some memory, and put together a FIN,
1665 * and then put it into the queue to be sent.
1666 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1668 if (!(how & SEND_SHUTDOWN))
1669 return;
1671 /* If we've already sent a FIN, or it's a closed state, skip this. */
1672 if ((1 << sk->state) &
1673 (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
1674 /* Clear out any half completed packets. FIN if needed. */
1675 if (tcp_close_state(sk))
1676 tcp_send_fin(sk);
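Userspace reaches this path through shutdown(fd, SHUT_WR): tcp_close_state() moves the socket towards FIN_WAIT1 or LAST_ACK and tcp_send_fin() queues the FIN, while the receive side stays usable. A sketch of the usual half-close idiom follows; it is illustrative only, and the helper name, buffer size and fd are invented.

#include <sys/types.h>
#include <sys/socket.h>
#include <unistd.h>

/* Illustrative only: send a request, half-close so the peer sees our FIN,
 * then drain its reply until it closes its side as well. */
static void request_and_drain(int fd, const char *req, size_t len)
{
	char buf[4096];

	send(fd, req, len, 0);
	shutdown(fd, SHUT_WR);		/* ends up in tcp_shutdown() above */

	while (recv(fd, buf, sizeof(buf), 0) > 0)
		;			/* consume the reply */

	close(fd);
}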
1682 * Return 1 if we still have things to send in our buffers.
1685 static inline int closing(struct sock * sk)
1687 return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
1690 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1692 /* First the read buffer. */
1693 __skb_queue_purge(&sk->receive_queue);
1695 /* Next, the error queue. */
1696 __skb_queue_purge(&sk->error_queue);
1698 /* Next, the write queue. */
1699 BUG_TRAP(skb_queue_empty(&sk->write_queue));
1701 /* Account for returned memory. */
1702 tcp_mem_reclaim(sk);
1704 BUG_TRAP(sk->wmem_queued == 0);
1705 BUG_TRAP(sk->forward_alloc == 0);
1707 /* It is _impossible_ for the backlog to contain anything
1708 * when we get here. All user references to this socket
1709 * have gone away; only the net layer can still touch it.
1714 * At this point, there should be no process reference to this
1715 * socket, and thus no user references at all. Therefore we
1716 * can assume the socket waitqueue is inactive and nobody will
1717 * try to jump onto it.
1719 void tcp_destroy_sock(struct sock *sk)
1721 BUG_TRAP(sk->state==TCP_CLOSE);
1722 BUG_TRAP(sk->dead);
1724 /* It cannot be in hash table! */
1725 BUG_TRAP(sk->pprev==NULL);
1727 /* If it has a nonzero sk->num, it must be bound */
1728 BUG_TRAP(!sk->num || sk->prev!=NULL);
1730 #ifdef TCP_DEBUG
1731 if (sk->zapped) {
1732 printk("TCP: double destroy sk=%p\n", sk);
1733 sock_hold(sk);
1735 sk->zapped = 1;
1736 #endif
1738 sk->prot->destroy(sk);
1740 tcp_kill_sk_queues(sk);
1742 #ifdef INET_REFCNT_DEBUG
1743 if (atomic_read(&sk->refcnt) != 1) {
1744 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt));
1746 #endif
1748 atomic_dec(&tcp_orphan_count);
1749 sock_put(sk);
1752 void tcp_close(struct sock *sk, long timeout)
1754 struct sk_buff *skb;
1755 int data_was_unread = 0;
1757 lock_sock(sk);
1758 sk->shutdown = SHUTDOWN_MASK;
1760 if(sk->state == TCP_LISTEN) {
1761 tcp_set_state(sk, TCP_CLOSE);
1763 /* Special case. */
1764 tcp_listen_stop(sk);
1766 goto adjudge_to_death;
1769 /* We need to flush the recv. buffs. We do this only on the
1770 * descriptor close, not protocol-sourced closes, because the
1771 * reader process may not have drained the data yet!
1773 while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
1774 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin;
1775 data_was_unread += len;
1776 __kfree_skb(skb);
1779 tcp_mem_reclaim(sk);
1781 /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1782 * 3.10, we send a RST here because data was lost. To
1783 * witness the awful effects of the old behavior of always
1784 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1785 * a bulk GET in an FTP client, suspend the process, wait
1786 * for the client to advertise a zero window, then kill -9
1787 * the FTP client, wheee... Note: timeout is always zero
1788 * in such a case.
1790 if(data_was_unread != 0) {
1791 /* Unread data was tossed, zap the connection. */
1792 NET_INC_STATS_USER(TCPAbortOnClose);
1793 tcp_set_state(sk, TCP_CLOSE);
1794 tcp_send_active_reset(sk, GFP_KERNEL);
1795 } else if (sk->linger && sk->lingertime==0) {
1796 /* Check zero linger _after_ checking for unread data. */
1797 sk->prot->disconnect(sk, 0);
1798 NET_INC_STATS_USER(TCPAbortOnData);
1799 } else if (tcp_close_state(sk)) {
1800 /* We FIN if the application ate all the data before
1801 * zapping the connection.
1804 /* RED-PEN. Formally speaking, we have broken TCP state
1805 * machine. State transitions:
1807 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1808 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1809 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1811 * are legal only when FIN has been sent (i.e. in window),
1812 * rather than queued out of window. Purists blame.
1814 * F.e. "RFC state" is ESTABLISHED,
1815 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1817 * The visible deviations are that sometimes
1818 * we enter time-wait state, when it is not required really
1819 * (harmless), do not send active resets, when they are
1820 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1821 * they look as CLOSING or LAST_ACK for Linux)
1822 * Probably, I missed some more holelets.
1823 * --ANK
1825 tcp_send_fin(sk);
1828 if (timeout) {
1829 struct task_struct *tsk = current;
1830 DECLARE_WAITQUEUE(wait, current);
1832 add_wait_queue(sk->sleep, &wait);
1834 do {
1835 set_current_state(TASK_INTERRUPTIBLE);
1836 if (!closing(sk))
1837 break;
1838 release_sock(sk);
1839 timeout = schedule_timeout(timeout);
1840 lock_sock(sk);
1841 } while (!signal_pending(tsk) && timeout);
1843 tsk->state = TASK_RUNNING;
1844 remove_wait_queue(sk->sleep, &wait);
1847 adjudge_to_death:
1848 /* It is the last release_sock in its life. It will remove backlog. */
1849 release_sock(sk);
1852 /* Now socket is owned by kernel and we acquire BH lock
1853 to finish close. No need to check for user refs.
1855 local_bh_disable();
1856 bh_lock_sock(sk);
1857 BUG_TRAP(sk->lock.users==0);
1859 sock_hold(sk);
1860 sock_orphan(sk);
1862 /* This is a (useful) BSD violation of the RFC. There is a
1863 * problem with TCP as specified in that the other end could
1864 * keep a socket open forever with no application left at this end.
1865 * We use a 3 minute timeout (about the same as BSD) then kill
1866 * our end. If they send after that then tough - BUT: long enough
1867 * that we won't make the old 4*rto = almost no time - whoops
1868 * reset mistake.
1870 * Nope, it was not a mistake. It is really desired behaviour
1871 * f.e. on http servers, when such sockets are useless, but
1872 * consume significant resources. Let's do it with special
1873 * linger2 option. --ANK
1876 if (sk->state == TCP_FIN_WAIT2) {
1877 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1878 if (tp->linger2 < 0) {
1879 tcp_set_state(sk, TCP_CLOSE);
1880 tcp_send_active_reset(sk, GFP_ATOMIC);
1881 NET_INC_STATS_BH(TCPAbortOnLinger);
1882 } else {
1883 int tmo = tcp_fin_time(tp);
1885 if (tmo > TCP_TIMEWAIT_LEN) {
1886 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1887 } else {
1888 atomic_inc(&tcp_orphan_count);
1889 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1890 goto out;
1894 if (sk->state != TCP_CLOSE) {
1895 tcp_mem_reclaim(sk);
1896 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
1897 (sk->wmem_queued > SOCK_MIN_SNDBUF &&
1898 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
1899 if (net_ratelimit())
1900 printk(KERN_INFO "TCP: too many orphaned sockets\n");
1901 tcp_set_state(sk, TCP_CLOSE);
1902 tcp_send_active_reset(sk, GFP_ATOMIC);
1903 NET_INC_STATS_BH(TCPAbortOnMemory);
1906 atomic_inc(&tcp_orphan_count);
1908 if (sk->state == TCP_CLOSE)
1909 tcp_destroy_sock(sk);
1910 /* Otherwise, socket is reprieved until protocol close. */
1912 out:
1913 bh_unlock_sock(sk);
1914 local_bh_enable();
1915 sock_put(sk);
1916 }
1918 /* These states need RST on ABORT according to RFC793 */
1920 extern __inline__ int tcp_need_reset(int state)
1921 {
1922 return ((1 << state) &
1923 (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
1924 TCPF_FIN_WAIT2|TCPF_SYN_RECV));
1925 }
1927 int tcp_disconnect(struct sock *sk, int flags)
1928 {
1929 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1930 int old_state;
1931 int err = 0;
1933 old_state = sk->state;
1934 if (old_state != TCP_CLOSE)
1935 tcp_set_state(sk, TCP_CLOSE);
1937 /* ABORT function of RFC793 */
1938 if (old_state == TCP_LISTEN) {
1939 tcp_listen_stop(sk);
1940 } else if (tcp_need_reset(old_state) ||
1941 (tp->snd_nxt != tp->write_seq &&
1942 (1<<old_state)&(TCPF_CLOSING|TCPF_LAST_ACK))) {
1943 /* The last check adjusts for the discrepancy between Linux and the RFC
1944 * states.
1945 */
1946 tcp_send_active_reset(sk, gfp_any());
1947 sk->err = ECONNRESET;
1948 } else if (old_state == TCP_SYN_SENT)
1949 sk->err = ECONNRESET;
1951 tcp_clear_xmit_timers(sk);
1952 __skb_queue_purge(&sk->receive_queue);
1953 tcp_writequeue_purge(sk);
1954 __skb_queue_purge(&tp->out_of_order_queue);
1956 sk->dport = 0;
1958 if (!(sk->userlocks&SOCK_BINDADDR_LOCK)) {
1959 sk->rcv_saddr = 0;
1960 sk->saddr = 0;
1961 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1962 memset(&sk->net_pinfo.af_inet6.saddr, 0, 16);
1963 memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16);
1964 #endif
1965 }
1967 sk->shutdown = 0;
1968 sk->done = 0;
1969 tp->srtt = 0;
1970 if ((tp->write_seq += tp->max_window+2) == 0)
1971 tp->write_seq = 1;
1972 tp->backoff = 0;
1973 tp->snd_cwnd = 2;
1974 tp->probes_out = 0;
1975 tp->packets_out = 0;
1976 tp->snd_ssthresh = 0x7fffffff;
1977 tp->snd_cwnd_cnt = 0;
1978 tp->ca_state = TCP_CA_Open;
1979 tcp_clear_retrans(tp);
1980 tcp_delack_init(tp);
1981 tp->send_head = NULL;
1982 tp->saw_tstamp = 0;
1983 tcp_sack_reset(tp);
1984 __sk_dst_reset(sk);
1986 BUG_TRAP(!sk->num || sk->prev);
1988 sk->error_report(sk);
1989 return err;
1990 }
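/* Editor's note (illustrative, not part of this file): one user-visible way
 * to reach tcp_disconnect() above is connect()ing an already-connected TCP
 * socket to an address whose family is AF_UNSPEC; inet_stream_connect()
 * then calls the protocol's disconnect hook. A rough sketch, assuming `fd`
 * is an established TCP socket:
 *
 *	struct sockaddr unspec;
 *	memset(&unspec, 0, sizeof(unspec));
 *	unspec.sa_family = AF_UNSPEC;
 *	if (connect(fd, &unspec, sizeof(unspec)) < 0)
 *		perror("disconnect");
 *
 * Depending on the old state the peer may see a RST (see tcp_need_reset()
 * above) and the local socket returns to CLOSE with any queued data purged.
 */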
1992 /*
1993 * Wait for an incoming connection, avoid race
1994 * conditions. This must be called with the socket locked.
1995 */
1996 static int wait_for_connect(struct sock * sk, long timeo)
1997 {
1998 DECLARE_WAITQUEUE(wait, current);
1999 int err;
2001 /*
2002 * True wake-one mechanism for incoming connections: only
2003 * one process gets woken up, not the 'whole herd'.
2004 * Since we do not 'race & poll' for established sockets
2005 * anymore, the common case will execute the loop only once.
2007 * Subtle issue: "add_wait_queue_exclusive()" will be added
2008 * after any current non-exclusive waiters, and we know that
2009 * it will always _stay_ after any new non-exclusive waiters
2010 * because all non-exclusive waiters are added at the
2011 * beginning of the wait-queue. As such, it's ok to "drop"
2012 * our exclusiveness temporarily when we get woken up without
2013 * having to remove and re-insert us on the wait queue.
2014 */
2015 add_wait_queue_exclusive(sk->sleep, &wait);
2016 for (;;) {
2017 current->state = TASK_INTERRUPTIBLE;
2018 release_sock(sk);
2019 if (sk->tp_pinfo.af_tcp.accept_queue == NULL)
2020 timeo = schedule_timeout(timeo);
2021 lock_sock(sk);
2022 err = 0;
2023 if (sk->tp_pinfo.af_tcp.accept_queue)
2024 break;
2025 err = -EINVAL;
2026 if (sk->state != TCP_LISTEN)
2027 break;
2028 err = sock_intr_errno(timeo);
2029 if (signal_pending(current))
2030 break;
2031 err = -EAGAIN;
2032 if (!timeo)
2033 break;
2034 }
2035 current->state = TASK_RUNNING;
2036 remove_wait_queue(sk->sleep, &wait);
2037 return err;
2038 }
2040 /*
2041 * This will accept the next outstanding connection.
2042 */
2044 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2045 {
2046 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2047 struct open_request *req;
2048 struct sock *newsk;
2049 int error;
2051 lock_sock(sk);
2053 /* We need to make sure that this socket is listening,
2054 * and that it has something pending.
2055 */
2056 error = -EINVAL;
2057 if (sk->state != TCP_LISTEN)
2058 goto out;
2060 /* Find already established connection */
2061 if (!tp->accept_queue) {
2062 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2064 /* If this is a non-blocking socket, don't sleep */
2065 error = -EAGAIN;
2066 if (!timeo)
2067 goto out;
2069 error = wait_for_connect(sk, timeo);
2070 if (error)
2071 goto out;
2072 }
2074 req = tp->accept_queue;
2075 if ((tp->accept_queue = req->dl_next) == NULL)
2076 tp->accept_queue_tail = NULL;
2078 newsk = req->sk;
2079 tcp_acceptq_removed(sk);
2080 tcp_openreq_fastfree(req);
2081 BUG_TRAP(newsk->state != TCP_SYN_RECV);
2082 release_sock(sk);
2083 return newsk;
2085 out:
2086 release_sock(sk);
2087 *err = error;
2088 return NULL;
2089 }
2091 /*
2092 * Socket option code for TCP.
2093 */
2095 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
2096 int optlen)
2097 {
2098 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2099 int val;
2100 int err = 0;
2102 if (level != SOL_TCP)
2103 return tp->af_specific->setsockopt(sk, level, optname,
2104 optval, optlen);
2106 if(optlen<sizeof(int))
2107 return -EINVAL;
2109 if (get_user(val, (int *)optval))
2110 return -EFAULT;
2112 lock_sock(sk);
2114 switch(optname) {
2115 case TCP_MAXSEG:
2116 /* Values greater than the interface MTU won't take effect. However,
2117 * at the point when this call is made we typically don't yet know
2118 * which interface is going to be used.
2119 */
2120 if(val < 8 || val > MAX_TCP_WINDOW) {
2121 err = -EINVAL;
2122 break;
2123 }
2124 tp->user_mss = val;
2125 break;
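/* Editor's note: illustrative user-space use of TCP_MAXSEG (sketch only,
 * not kernel code). The value stored in tp->user_mss acts as an upper bound
 * on the MSS used when the connection is set up, so it is normally set
 * before connect():
 *
 *	int mss = 1400;         // e.g. leave headroom for tunnel encapsulation
 *	setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss));
 *
 * As noted above, values larger than the path MTU cannot take effect, and
 * values outside [8, MAX_TCP_WINDOW] are rejected with -EINVAL here.
 */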
2127 case TCP_NODELAY:
2128 /* You cannot try to use this and TCP_CORK in
2129 * tandem, so let the user know.
2130 */
2131 if (tp->nonagle == 2) {
2132 err = -EINVAL;
2133 break;
2134 }
2135 tp->nonagle = (val == 0) ? 0 : 1;
2136 if (val)
2137 tcp_push_pending_frames(sk, tp);
2138 break;
2140 case TCP_CORK:
2141 /* When set, this indicates that non-full frames should always be queued.
2142 * Later the user clears this option and we transmit
2143 * any pending partial frames in the queue. This is
2144 * meant to be used alongside sendfile() to get properly
2145 * filled frames when the user (for example) must write
2146 * out headers with a write() call first and then use
2147 * sendfile to send out the data parts.
2149 * You cannot try to use TCP_NODELAY and this mechanism
2150 * at the same time, so let the user know.
2151 */
2152 if (tp->nonagle == 1) {
2153 err = -EINVAL;
2154 break;
2155 }
2156 if (val != 0) {
2157 tp->nonagle = 2;
2158 } else {
2159 tp->nonagle = 0;
2161 tcp_push_pending_frames(sk, tp);
2162 }
2163 break;
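/* Editor's note: a minimal user-space sketch of the TCP_CORK pattern the
 * comment above describes (illustrative only, error handling omitted;
 * `fd` is a connected TCP socket, `hdr`/`filefd` are hypothetical):
 *
 *	int on = 1, off = 0;
 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));   // hold partial frames
 *	write(fd, hdr, hdr_len);                                  // header queued, not sent alone
 *	sendfile(fd, filefd, NULL, file_len);                     // body packed behind the header
 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off)); // flush the pending partial frame
 *
 * As enforced above, TCP_CORK and TCP_NODELAY are mutually exclusive on
 * this kernel; setting one while the other is active returns -EINVAL.
 */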
2165 case TCP_KEEPIDLE:
2166 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2167 err = -EINVAL;
2168 else {
2169 tp->keepalive_time = val * HZ;
2170 if (sk->keepopen && !((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))) {
2171 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2172 if (tp->keepalive_time > elapsed)
2173 elapsed = tp->keepalive_time - elapsed;
2174 else
2175 elapsed = 0;
2176 tcp_reset_keepalive_timer(sk, elapsed);
2177 }
2178 }
2179 break;
2180 case TCP_KEEPINTVL:
2181 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2182 err = -EINVAL;
2183 else
2184 tp->keepalive_intvl = val * HZ;
2185 break;
2186 case TCP_KEEPCNT:
2187 if (val < 1 || val > MAX_TCP_KEEPCNT)
2188 err = -EINVAL;
2189 else
2190 tp->keepalive_probes = val;
2191 break;
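/* Editor's note: the three keepalive knobs above only matter once
 * SO_KEEPALIVE is enabled. Illustrative user-space sketch (the values are
 * arbitrary examples):
 *
 *	int on = 1, idle = 60, intvl = 10, cnt = 5;
 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE,  &idle,  sizeof(idle));  // first probe after 60 s idle
 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl)); // 10 s between probes
 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT,   &cnt,   sizeof(cnt));   // give up after 5 unanswered probes
 *
 * Note that TCP_KEEPIDLE also reschedules the keepalive timer immediately
 * on an established connection, using the elapsed-time computation above.
 */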
2192 case TCP_SYNCNT:
2193 if (val < 1 || val > MAX_TCP_SYNCNT)
2194 err = -EINVAL;
2195 else
2196 tp->syn_retries = val;
2197 break;
2199 case TCP_LINGER2:
2200 if (val < 0)
2201 tp->linger2 = -1;
2202 else if (val > sysctl_tcp_fin_timeout/HZ)
2203 tp->linger2 = 0;
2204 else
2205 tp->linger2 = val*HZ;
2206 break;
2208 case TCP_DEFER_ACCEPT:
2209 tp->defer_accept = 0;
2210 if (val > 0) {
2211 /* Translate value in seconds to number of retransmits */
2212 while (val > ((TCP_TIMEOUT_INIT/HZ)<<tp->defer_accept))
2213 tp->defer_accept++;
2214 tp->defer_accept++;
2215 }
2216 break;
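/* Editor's note: a worked example of the seconds-to-retransmits translation
 * above, assuming TCP_TIMEOUT_INIT is 3 seconds (3*HZ). For val = 10:
 *
 *	defer_accept = 0: 10 > (3 << 0) = 3  -> defer_accept = 1
 *	defer_accept = 1: 10 > (3 << 1) = 6  -> defer_accept = 2
 *	defer_accept = 2: 10 > (3 << 2) = 12 -> loop stops
 *	final increment                      -> defer_accept = 3
 *
 * i.e. defer_accept ends up one greater than the smallest n for which
 * val <= (TCP_TIMEOUT_INIT/HZ) << n; tcp_getsockopt() below converts the
 * stored count back to TCP_TIMEOUT_INIT << (defer_accept - 1).
 */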
2218 case TCP_WINDOW_CLAMP:
2219 if (val==0) {
2220 if (sk->state != TCP_CLOSE) {
2221 err = -EINVAL;
2222 break;
2223 }
2224 tp->window_clamp = 0;
2225 } else {
2226 tp->window_clamp = val<SOCK_MIN_RCVBUF/2 ?
2227 SOCK_MIN_RCVBUF/2 : val;
2228 }
2229 break;
2231 default:
2232 err = -ENOPROTOOPT;
2233 break;
2234 }
2235 release_sock(sk);
2236 return err;
2237 }
2239 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
2240 int *optlen)
2241 {
2242 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2243 int val, len;
2245 if(level != SOL_TCP)
2246 return tp->af_specific->getsockopt(sk, level, optname,
2247 optval, optlen);
2249 if(get_user(len,optlen))
2250 return -EFAULT;
2252 len = min(len, sizeof(int));
2254 switch(optname) {
2255 case TCP_MAXSEG:
2256 val = tp->mss_cache;
2257 if (val == 0 && ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN)))
2258 val = tp->user_mss;
2259 break;
2260 case TCP_NODELAY:
2261 val = (tp->nonagle == 1);
2262 break;
2263 case TCP_CORK:
2264 val = (tp->nonagle == 2);
2265 break;
2266 case TCP_KEEPIDLE:
2267 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time)/HZ;
2268 break;
2269 case TCP_KEEPINTVL:
2270 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl)/HZ;
2271 break;
2272 case TCP_KEEPCNT:
2273 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2274 break;
2275 case TCP_SYNCNT:
2276 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2277 break;
2278 case TCP_LINGER2:
2279 val = tp->linger2;
2280 if (val > 0)
2281 val = (val ? : sysctl_tcp_fin_timeout)/HZ;
2282 break;
2283 case TCP_DEFER_ACCEPT:
2284 val = tp->defer_accept == 0 ? 0 : (TCP_TIMEOUT_INIT<<(tp->defer_accept-1));
2285 break;
2286 case TCP_WINDOW_CLAMP:
2287 val = tp->window_clamp;
2288 break;
2289 case TCP_INFO:
2290 {
2291 struct tcp_info info;
2292 u32 now = tcp_time_stamp;
2294 if(get_user(len,optlen))
2295 return -EFAULT;
2296 info.tcpi_state = sk->state;
2297 info.tcpi_ca_state = tp->ca_state;
2298 info.tcpi_retransmits = tp->retransmits;
2299 info.tcpi_probes = tp->probes_out;
2300 info.tcpi_backoff = tp->backoff;
2301 info.tcpi_options = 0;
2302 if (tp->tstamp_ok)
2303 info.tcpi_options |= TCPI_OPT_TIMESTAMPS;
2304 if (tp->sack_ok)
2305 info.tcpi_options |= TCPI_OPT_SACK;
2306 if (tp->wscale_ok) {
2307 info.tcpi_options |= TCPI_OPT_WSCALE;
2308 info.tcpi_snd_wscale = tp->snd_wscale;
2309 info.tcpi_rcv_wscale = tp->rcv_wscale;
2310 } else {
2311 info.tcpi_snd_wscale = 0;
2312 info.tcpi_rcv_wscale = 0;
2313 }
2314 #ifdef CONFIG_INET_ECN
2315 if (tp->ecn_flags&TCP_ECN_OK)
2316 info.tcpi_options |= TCPI_OPT_ECN;
2317 #endif
2319 info.tcpi_rto = (1000000*tp->rto)/HZ;
2320 info.tcpi_ato = (1000000*tp->ack.ato)/HZ;
2321 info.tcpi_snd_mss = tp->mss_cache;
2322 info.tcpi_rcv_mss = tp->ack.rcv_mss;
2324 info.tcpi_unacked = tp->packets_out;
2325 info.tcpi_sacked = tp->sacked_out;
2326 info.tcpi_lost = tp->lost_out;
2327 info.tcpi_retrans = tp->retrans_out;
2328 info.tcpi_fackets = tp->fackets_out;
2330 info.tcpi_last_data_sent = ((now - tp->lsndtime)*1000)/HZ;
2331 info.tcpi_last_ack_sent = 0;
2332 info.tcpi_last_data_recv = ((now - tp->ack.lrcvtime)*1000)/HZ;
2333 info.tcpi_last_ack_recv = ((now - tp->rcv_tstamp)*1000)/HZ;
2335 info.tcpi_pmtu = tp->pmtu_cookie;
2336 info.tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2337 info.tcpi_rtt = ((1000000*tp->srtt)/HZ)>>3;
2338 info.tcpi_rttvar = ((1000000*tp->mdev)/HZ)>>2;
2339 info.tcpi_snd_ssthresh = tp->snd_ssthresh;
2340 info.tcpi_snd_cwnd = tp->snd_cwnd;
2341 info.tcpi_advmss = tp->advmss;
2342 info.tcpi_reordering = tp->reordering;
2344 len = min(len, sizeof(info));
2345 if(put_user(len, optlen))
2346 return -EFAULT;
2347 if(copy_to_user(optval, &info,len))
2348 return -EFAULT;
2349 return 0;
2350 }
2351 default:
2352 return -ENOPROTOOPT;
2353 }
2355 if(put_user(len, optlen))
2356 return -EFAULT;
2357 if(copy_to_user(optval, &val,len))
2358 return -EFAULT;
2359 return 0;
2360 }
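/* Editor's note: illustrative user-space read of the TCP_INFO block handled
 * above (sketch only; assumes a struct tcp_info definition matching this
 * kernel is visible to the application):
 *
 *	struct tcp_info info;
 *	socklen_t len = sizeof(info);
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
 *		printf("rtt=%uus rttvar=%uus cwnd=%u snd_mss=%u\n",
 *		       info.tcpi_rtt, info.tcpi_rttvar,
 *		       info.tcpi_snd_cwnd, info.tcpi_snd_mss);
 *
 * Only min(len, sizeof(info)) bytes are copied back, so callers with a
 * shorter buffer get a truncated but well-formed structure.
 */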
2363 extern void __skb_cb_too_small_for_tcp(int, int);
2365 void __init tcp_init(void)
2366 {
2367 struct sk_buff *skb = NULL;
2368 unsigned long goal;
2369 int order, i;
2371 if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2372 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2373 sizeof(skb->cb));
2375 tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2376 sizeof(struct open_request),
2377 0, SLAB_HWCACHE_ALIGN,
2378 NULL, NULL);
2379 if(!tcp_openreq_cachep)
2380 panic("tcp_init: Cannot alloc open_request cache.");
2382 tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2383 sizeof(struct tcp_bind_bucket),
2384 0, SLAB_HWCACHE_ALIGN,
2385 NULL, NULL);
2386 if(!tcp_bucket_cachep)
2387 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2389 tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2390 sizeof(struct tcp_tw_bucket),
2391 0, SLAB_HWCACHE_ALIGN,
2392 NULL, NULL);
2393 if(!tcp_timewait_cachep)
2394 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2396 /* Size and allocate the main established and bind bucket
2397 * hash tables.
2399 * The methodology is similar to that of the buffer cache.
2400 */
2401 goal = num_physpages >> (23 - PAGE_SHIFT);
2403 for(order = 0; (1UL << order) < goal; order++)
2404 ;
2405 do {
2406 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2407 sizeof(struct tcp_ehash_bucket);
2408 tcp_ehash_size >>= 1;
2409 while (tcp_ehash_size & (tcp_ehash_size-1))
2410 tcp_ehash_size--;
2411 tcp_ehash = (struct tcp_ehash_bucket *)
2412 __get_free_pages(GFP_ATOMIC, order);
2413 } while (tcp_ehash == NULL && --order > 0);
2415 if (!tcp_ehash)
2416 panic("Failed to allocate TCP established hash table\n");
2417 for (i = 0; i < (tcp_ehash_size<<1); i++) {
2418 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2419 tcp_ehash[i].chain = NULL;
2420 }
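/* Editor's note: a worked example of the sizing above (the numbers depend
 * on PAGE_SIZE and structure sizes, so treat them as illustrative). With
 * 4 KB pages and 128 MB of RAM, num_physpages = 32768, so:
 *
 *	goal  = 32768 >> (23 - 12) = 16 pages  (one page per 8 MB of RAM)
 *	order = 4                              (1 << 4 = 16 satisfies the loop)
 *
 * The established hash is then carved out of 16 pages; its entry count is
 * halved and rounded down to a power of two because the second half of the
 * same table is used for TIME-WAIT sockets (hence the tcp_ehash_size<<1 in
 * the init loop and in the final printk).
 */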
2422 do {
2423 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2424 sizeof(struct tcp_bind_hashbucket);
2425 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2426 continue;
2427 tcp_bhash = (struct tcp_bind_hashbucket *)
2428 __get_free_pages(GFP_ATOMIC, order);
2429 } while (tcp_bhash == NULL && --order >= 0);
2431 if (!tcp_bhash)
2432 panic("Failed to allocate TCP bind hash table\n");
2433 for (i = 0; i < tcp_bhash_size; i++) {
2434 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2435 tcp_bhash[i].chain = NULL;
2436 }
2438 /* Try to be a bit smarter and adjust defaults depending
2439 * on available memory.
2440 */
2441 if (order > 4) {
2442 sysctl_local_port_range[0] = 32768;
2443 sysctl_local_port_range[1] = 61000;
2444 sysctl_tcp_max_tw_buckets = 180000;
2445 sysctl_tcp_max_orphans = 4096<<(order-4);
2446 sysctl_max_syn_backlog = 1024;
2447 } else if (order < 3) {
2448 sysctl_local_port_range[0] = 1024*(3-order);
2449 sysctl_tcp_max_tw_buckets >>= (3-order);
2450 sysctl_tcp_max_orphans >>= (3-order);
2451 sysctl_max_syn_backlog = 128;
2452 }
2453 tcp_port_rover = sysctl_local_port_range[0] - 1;
2455 sysctl_tcp_mem[0] = 64<<order;
2456 sysctl_tcp_mem[1] = 200<<order;
2457 sysctl_tcp_mem[2] = 256<<order;
2458 if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 512)
2459 sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 512;
2460 if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 512)
2461 sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 512;
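/* Editor's note: continuing the order = 4 example, the memory-pressure
 * thresholds (counted in pages) start out as
 *
 *	sysctl_tcp_mem = { 64<<4, 200<<4, 256<<4 } = { 1024, 3200, 4096 }
 *
 * and the two clamps above then pull the lower thresholds to within 512
 * pages of the next one:
 *
 *	tcp_mem[1] = 4096 - 512 = 3584
 *	tcp_mem[0] = 3584 - 512 = 3072
 */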
2463 if (order < 3) {
2464 sysctl_tcp_wmem[2] = 64*1024;
2465 sysctl_tcp_rmem[0] = PAGE_SIZE;
2466 sysctl_tcp_rmem[1] = 43689;
2467 sysctl_tcp_rmem[2] = 2*43689;
2468 }
2470 printk("TCP: Hash tables configured (established %d bind %d)\n",
2471 tcp_ehash_size<<1, tcp_bhash_size);