net/ipv4/tcp.c (davej-history.git, import of 2.4.0-test3pre8, blob dbf680233fc4e9a5808f7731f1fb6c7684964f6b)
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol (TCP).
8 * Version: $Id: tcp.c,v 1.170 2000/07/08 00:20:43 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
22 * Fixes:
23 * Alan Cox : Numerous verify_area() calls
24 * Alan Cox : Set the ACK bit on a reset
25 * Alan Cox : Stopped it crashing if it closed while
26 * sk->inuse=1 and was trying to connect
27 * (tcp_err()).
28 * Alan Cox : All icmp error handling was broken
29 * pointers passed were wrong and the
30 * socket was looked up backwards. Nobody
31 * tested any icmp error code obviously.
32 * Alan Cox : tcp_err() now handled properly. It
33 * wakes people on errors. poll
34 * behaves and the icmp error race
35 * has gone by moving it into sock.c
36 * Alan Cox : tcp_send_reset() fixed to work for
37 * everything not just packets for
38 * unknown sockets.
39 * Alan Cox : tcp option processing.
40 * Alan Cox : Reset tweaked (still not 100%) [Had
41 * syn rule wrong]
42 * Herp Rosmanith : More reset fixes
43 * Alan Cox : No longer acks invalid rst frames.
44 * Acking any kind of RST is right out.
45 * Alan Cox : Sets an ignore me flag on an rst
46 * receive otherwise odd bits of prattle
47 * escape still
48 * Alan Cox : Fixed another acking RST frame bug.
49 * Should stop LAN workplace lockups.
50 * Alan Cox : Some tidyups using the new skb list
51 * facilities
52 * Alan Cox : sk->keepopen now seems to work
53 * Alan Cox : Pulls options out correctly on accepts
54 * Alan Cox : Fixed assorted sk->rqueue->next errors
55 * Alan Cox : PSH doesn't end a TCP read. Switched a
56 * bit to skb ops.
57 * Alan Cox : Tidied tcp_data to avoid a potential
58 * nasty.
59 * Alan Cox : Added some better commenting, as the
60 * tcp is hard to follow
61 * Alan Cox : Removed incorrect check for 20 * psh
62 * Michael O'Reilly : ack < copied bug fix.
63 * Johannes Stille : Misc tcp fixes (not all in yet).
64 * Alan Cox : FIN with no memory -> CRASH
65 * Alan Cox : Added socket option proto entries.
66 * Also added awareness of them to accept.
67 * Alan Cox : Added TCP options (SOL_TCP)
68 * Alan Cox : Switched wakeup calls to callbacks,
69 * so the kernel can layer network
70 * sockets.
71 * Alan Cox : Use ip_tos/ip_ttl settings.
72 * Alan Cox : Handle FIN (more) properly (we hope).
73 * Alan Cox : RST frames sent on unsynchronised
74 * state ack error.
75 * Alan Cox : Put in missing check for SYN bit.
76 * Alan Cox : Added tcp_select_window() aka NET2E
77 * window non shrink trick.
78 * Alan Cox : Added a couple of small NET2E timer
79 * fixes
80 * Charles Hedrick : TCP fixes
81 * Toomas Tamm : TCP window fixes
82 * Alan Cox : Small URG fix to rlogin ^C ack fight
83 * Charles Hedrick : Rewrote most of it to actually work
84 * Linus : Rewrote tcp_read() and URG handling
85 * completely
86 * Gerhard Koerting: Fixed some missing timer handling
87 * Matthew Dillon : Reworked TCP machine states as per RFC
88 * Gerhard Koerting: PC/TCP workarounds
89 * Adam Caldwell : Assorted timer/timing errors
90 * Matthew Dillon : Fixed another RST bug
91 * Alan Cox : Move to kernel side addressing changes.
92 * Alan Cox : Beginning work on TCP fastpathing
93 * (not yet usable)
94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
95 * Alan Cox : TCP fast path debugging
96 * Alan Cox : Window clamping
97 * Michael Riepe : Bug in tcp_check()
98 * Matt Dillon : More TCP improvements and RST bug fixes
99 * Matt Dillon : Yet more small nasties removed from the
100 * TCP code (Be very nice to this man if
101 * tcp finally works 100%) 8)
102 * Alan Cox : BSD accept semantics.
103 * Alan Cox : Reset on closedown bug.
104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
105 * Michael Pall : Handle poll() after URG properly in
106 * all cases.
107 * Michael Pall : Undo the last fix in tcp_read_urg()
108 * (multi URG PUSH broke rlogin).
109 * Michael Pall : Fix the multi URG PUSH problem in
110 * tcp_readable(), poll() after URG
111 * works now.
112 * Michael Pall : recv(...,MSG_OOB) never blocks in the
113 * BSD api.
114 * Alan Cox : Changed the semantics of sk->socket to
115 * fix a race and a signal problem with
116 * accept() and async I/O.
117 * Alan Cox : Relaxed the rules on tcp_sendto().
118 * Yury Shevchuk : Really fixed accept() blocking problem.
119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
120 * clients/servers which listen in on
121 * fixed ports.
122 * Alan Cox : Cleaned the above up and shrank it to
123 * a sensible code size.
124 * Alan Cox : Self connect lockup fix.
125 * Alan Cox : No connect to multicast.
126 * Ross Biro : Close unaccepted children on master
127 * socket close.
128 * Alan Cox : Reset tracing code.
129 * Alan Cox : Spurious resets on shutdown.
130 * Alan Cox : Giant 15 minute/60 second timer error
131 * Alan Cox : Small whoops in polling before an
132 * accept.
133 * Alan Cox : Kept the state trace facility since
134 * it's handy for debugging.
135 * Alan Cox : More reset handler fixes.
136 * Alan Cox : Started rewriting the code based on
137 * the RFC's for other useful protocol
138 * references see: Comer, KA9Q NOS, and
139 * for a reference on the difference
140 * between specifications and how BSD
141 * works see the 4.4lite source.
142 * A.N.Kuznetsov : Don't time wait on completion of tidy
143 * close.
144 * Linus Torvalds : Fin/Shutdown & copied_seq changes.
145 * Linus Torvalds : Fixed BSD port reuse to work first syn
146 * Alan Cox : Reimplemented timers as per the RFC
147 * and using multiple timers for sanity.
148 * Alan Cox : Small bug fixes, and a lot of new
149 * comments.
150 * Alan Cox : Fixed dual reader crash by locking
151 * the buffers (much like datagram.c)
152 * Alan Cox : Fixed stuck sockets in probe. A probe
153 * now gets fed up of retrying without
154 * (even a no space) answer.
155 * Alan Cox : Extracted closing code better
156 * Alan Cox : Fixed the closing state machine to
157 * resemble the RFC.
158 * Alan Cox : More 'per spec' fixes.
159 * Jorge Cwik : Even faster checksumming.
160 * Alan Cox : tcp_data() doesn't ack illegal PSH
161 * only frames. At least one pc tcp stack
162 * generates them.
163 * Alan Cox : Cache last socket.
164 * Alan Cox : Per route irtt.
165 * Matt Day : poll()->select() match BSD precisely on error
166 * Alan Cox : New buffers
167 * Marc Tamsky : Various sk->prot->retransmits and
168 * sk->retransmits misupdating fixed.
169 * Fixed tcp_write_timeout: stuck close,
170 * and TCP syn retries gets used now.
171 * Mark Yarvis : In tcp_read_wakeup(), don't send an
172 * ack if state is TCP_CLOSED.
173 * Alan Cox : Look up device on a retransmit - routes may
174 * change. Doesn't yet cope with MSS shrink right,
175 * but it's a start!
176 * Marc Tamsky : Closing in closing fixes.
177 * Mike Shaver : RFC1122 verifications.
178 * Alan Cox : rcv_saddr errors.
179 * Alan Cox : Block double connect().
180 * Alan Cox : Small hooks for enSKIP.
181 * Alexey Kuznetsov: Path MTU discovery.
182 * Alan Cox : Support soft errors.
183 * Alan Cox : Fix MTU discovery pathological case
184 * when the remote claims no mtu!
185 * Marc Tamsky : TCP_CLOSE fix.
186 * Colin (G3TNE) : Send a reset on syn ack replies in
187 * window but wrong (fixes NT lpd problems)
188 * Pedro Roque : Better TCP window handling, delayed ack.
189 * Joerg Reuter : No modification of locked buffers in
190 * tcp_do_retransmit()
191 * Eric Schenk : Changed receiver side silly window
192 * avoidance algorithm to BSD style
193 * algorithm. This doubles throughput
194 * against machines running Solaris,
195 * and seems to result in general
196 * improvement.
197 * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
198 * Willy Konynenberg : Transparent proxying support.
199 * Mike McLagan : Routing by source
200 * Keith Owens : Do proper merging with partial SKB's in
201 * tcp_do_sendmsg to avoid burstiness.
202 * Eric Schenk : Fix fast close down bug with
203 * shutdown() followed by close().
204 * Andi Kleen : Make poll agree with SIGIO
205 * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
206 * lingertime == 0 (RFC 793 ABORT Call)
208 * This program is free software; you can redistribute it and/or
209 * modify it under the terms of the GNU General Public License
210 * as published by the Free Software Foundation; either version
211 * 2 of the License, or (at your option) any later version.
213 * Description of States:
215 * TCP_SYN_SENT sent a connection request, waiting for ack
217 * TCP_SYN_RECV received a connection request, sent ack,
218 * waiting for final ack in three-way handshake.
220 * TCP_ESTABLISHED connection established
222 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
223 * transmission of remaining buffered data
225 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
226 * to shutdown
228 * TCP_CLOSING both sides have shutdown but we still have
229 * data we have to finish sending
231 * TCP_TIME_WAIT timeout to catch resent junk before entering
232 * closed, can only be entered from FIN_WAIT2
233 * or CLOSING. Required because the other end
234 * may not have gotten our last ACK causing it
235 * to retransmit the data packet (which we ignore)
237 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
238 * us to finish writing our data and to shutdown
239 * (we have to close() to move on to LAST_ACK)
241 * TCP_LAST_ACK our side has shutdown after remote has
242 * shutdown. There may still be data in our
243 * buffer that we have to finish sending
245 * TCP_CLOSE socket is finished
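 *
 *	Illustrative note: each TCP_xxx state above has a matching TCPF_xxx
 *	bit, TCPF_xxx == (1 << TCP_xxx), so the code below tests membership
 *	in a set of states with a single mask operation, e.g.
 *
 *		if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 *			err = wait_for_tcp_connect(sk, flags, &timeo);
 *
 *	This is a sketch of the idiom only; the real call sites also handle
 *	errors and timeouts.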
249 * RFC1122 status:
250 * NOTE: I'm not going to be doing comments in the code for this one except
251 * for violations and the like. tcp.c is just too big... If I say something
252 * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
253 * with Alan. -- MS 950903
254 * [Note: Most of the TCP code has been rewritten/redesigned since this
255 * RFC1122 check. It is probably not correct anymore. It should be redone
256 * before 2.2. -AK]
258 * Use of PSH (4.2.2.2)
259 * MAY aggregate data sent without the PSH flag. (does)
260 * MAY queue data received without the PSH flag. (does)
261 * SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
262 * MAY implement PSH on send calls. (doesn't, thus:)
263 * MUST NOT buffer data indefinitely (doesn't [1 second])
264 * MUST set PSH on last segment (does)
265 * MAY pass received PSH to application layer (doesn't)
266 * SHOULD send maximum-sized segment whenever possible. (almost always does)
268 * Window Size (4.2.2.3, 4.2.2.16)
269 * MUST treat window size as an unsigned number (does)
270 * SHOULD treat window size as a 32-bit number (does not)
271 * MUST NOT shrink window once it is offered (does not normally)
273 * Urgent Pointer (4.2.2.4)
274 * **MUST point urgent pointer to last byte of urgent data (not right
275 * after). (doesn't, to be like BSD. That's configurable, but defaults
276 * to off)
277 * MUST inform application layer asynchronously of incoming urgent
278 * data. (does)
279 * MUST provide application with means of determining the amount of
280 * urgent data pending. (does)
281 * **MUST support urgent data sequence of arbitrary length. (doesn't, but
282 * it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
283 * [Follows BSD 1 byte of urgent data]
285 * TCP Options (4.2.2.5)
286 * MUST be able to receive TCP options in any segment. (does)
287 * MUST ignore unsupported options (does)
289 * Maximum Segment Size Option (4.2.2.6)
290 * MUST implement both sending and receiving MSS. (does, but currently
291 * only uses the smaller of both of them)
292 * SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
293 * it always). (does, even when MSS == 536, which is legal)
294 * MUST assume MSS == 536 if no MSS received at connection setup (does)
295 * MUST calculate "effective send MSS" correctly:
296 * min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
297 * (does - but allows operator override)
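 *
 *	Illustration of the "effective send MSS" rule above, as a sketch with
 *	hypothetical variable names rather than the kernel's own:
 *
 *		eff_mss = min(physical_mtu, remote_mss + 20)
 *				- sizeof(struct tcphdr) - ip_options_len;
 *
 *	e.g. for a 1500 byte MTU, remote MSS 1460 and no IP options this is
 *	min(1500, 1480) - 20 - 0 = 1460 bytes of payload per segment.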
299 * TCP Checksum (4.2.2.7)
300 * MUST generate and check TCP checksum. (does)
302 * Initial Sequence Number Selection (4.2.2.8)
303 * MUST use the RFC 793 clock selection mechanism. (doesn't, but it's
304 * OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
305 * necessary for 10Mbps networks - and harder than BSD to spoof!
306 * With syncookies we don't)
308 * Simultaneous Open Attempts (4.2.2.10)
309 * MUST support simultaneous open attempts (does)
311 * Recovery from Old Duplicate SYN (4.2.2.11)
312 * MUST keep track of active vs. passive open (does)
314 * RST segment (4.2.2.12)
315 * SHOULD allow an RST segment to contain data (does, but doesn't do
316 * anything with it, which is standard)
318 * Closing a Connection (4.2.2.13)
319 * MUST inform application of whether connection was closed by RST or
320 * normal close. (does)
321 * MAY allow "half-duplex" close (treat connection as closed for the
322 * local app, even before handshake is done). (does)
323 * MUST linger in TIME_WAIT for 2 * MSL (does)
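 *
 *	Illustration (user space, assuming a connected TCP socket "fd"):
 *	the RFC 793 ABORT call mentioned in the changelog above maps onto
 *	SO_LINGER with l_onoff = 1, l_linger = 0, which makes close() send
 *	a RST instead of a FIN and skip TIME_WAIT:
 *
 *		struct linger l = { 1, 0 };
 *		setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *		close(fd);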
325 * Retransmission Timeout (4.2.2.15)
326 * MUST implement Jacobson's slow start and congestion avoidance
327 * stuff. (does)
329 * Probing Zero Windows (4.2.2.17)
330 * MUST support probing of zero windows. (does)
331 * MAY keep offered window closed indefinitely. (does)
332 * MUST allow remote window to stay closed indefinitely. (does)
334 * Passive Open Calls (4.2.2.18)
335 * MUST NOT let new passive open affect other connections. (doesn't)
336 * MUST support passive opens (LISTENs) concurrently. (does)
338 * Time to Live (4.2.2.19)
339 * MUST make TCP TTL configurable. (does - IP_TTL option)
341 * Event Processing (4.2.2.20)
342 * SHOULD queue out-of-order segments. (does)
343 * MUST aggregate ACK segments whenever possible. (does but badly)
345 * Retransmission Timeout Calculation (4.2.3.1)
346 * MUST implement Karn's algorithm and Jacobson's algorithm for RTO
347 * calculation. (does, or at least explains them in the comments 8*b)
348 * SHOULD initialize RTO to 3 and RTT to 0. (does)
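 *
 *	Illustration: a sketch of the classic integer form of Jacobson's
 *	estimator that this requirement refers to (variable names are not
 *	the ones used in tcp_input.c; srtt is kept scaled by 8, mdev by 4):
 *
 *		err   = measured_rtt - (srtt >> 3);
 *		srtt += err;
 *		if (err < 0)
 *			err = -err;
 *		err  -= (mdev >> 2);
 *		mdev += err;
 *		rto   = (srtt >> 3) + mdev;
 *
 *	Karn's algorithm then simply excludes retransmitted segments from
 *	the measured_rtt samples.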
350 * When to Send an ACK Segment (4.2.3.2)
351 * SHOULD implement delayed ACK. (does)
352 * MUST keep ACK delay < 0.5 sec. (does)
354 * When to Send a Window Update (4.2.3.3)
355 * MUST implement receiver-side SWS. (does)
357 * When to Send Data (4.2.3.4)
358 * MUST implement sender-side SWS. (does)
359 * SHOULD implement Nagle algorithm. (does)
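 *
 *	Illustration (user space, assuming a connected TCP socket "fd"):
 *	the Nagle algorithm referred to above can be disabled per socket
 *	with the TCP_NODELAY option:
 *
 *		int on = 1;
 *		setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));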
361 * TCP Connection Failures (4.2.3.5)
362 * MUST handle excessive retransmissions "properly" (see the RFC). (does)
363 * SHOULD inform application layer of soft errors. (does)
365 * TCP Keep-Alives (4.2.3.6)
366 * MAY provide keep-alives. (does)
367 * MUST make keep-alives configurable on a per-connection basis. (does)
368 * MUST default to no keep-alives. (does)
369 * MUST make keep-alive interval configurable. (does)
370 * MUST make default keep-alive interval > 2 hours. (does)
371 * MUST NOT interpret failure to ACK keep-alive packet as dead
372 * connection. (doesn't)
373 * SHOULD send keep-alive with no data. (does)
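 *
 *	Illustration (user space, assuming a connected TCP socket "fd"):
 *	the per-connection keep-alive switch mentioned above is the
 *	standard SO_KEEPALIVE option, which defaults to off:
 *
 *		int on = 1;
 *		setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));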
375 * TCP Multihoming (4.2.3.7)
376 * MUST get source address from IP layer before sending first
377 * SYN. (does)
378 * MUST use same local address for all segments of a connection. (does)
380 * IP Options (4.2.3.8)
381 * MUST ignore unsupported IP options. (does)
382 * MAY support Time Stamp and Record Route. (does)
383 * MUST allow application to specify a source route. (does)
384 * MUST allow received Source Route option to set route for all future
385 * segments on this connection. (does not (security issues))
387 * ICMP messages (4.2.3.9)
388 * MUST act on ICMP errors. (does)
389 * MUST slow transmission upon receipt of a Source Quench. (doesn't anymore
390 * because that is deprecated now by the IETF, can be turned on)
391 * MUST NOT abort connection upon receipt of soft Destination
392 * Unreachables (0, 1, 5), Time Exceededs and Parameter
393 * Problems. (doesn't)
394 * SHOULD report soft Destination Unreachables etc. to the
395 * application. (does, except during SYN_RECV and may drop messages
396 * in some rare cases before accept() - ICMP is unreliable)
397 * SHOULD abort connection upon receipt of hard Destination Unreachable
398 * messages (2, 3, 4). (does, but see above)
400 * Remote Address Validation (4.2.3.10)
401 * MUST reject as an error OPEN for invalid remote IP address. (does)
402 * MUST ignore SYN with invalid source address. (does)
403 * MUST silently discard incoming SYN for broadcast/multicast
404 * address. (does)
406 * Asynchronous Reports (4.2.4.1)
407 * MUST provide mechanism for reporting soft errors to application
408 * layer. (does)
410 * Type of Service (4.2.4.2)
411 * MUST allow application layer to set Type of Service. (does IP_TOS)
413 * (Whew. -- MS 950903)
414 * (Updated by AK, but not complete yet.)
417 #include <linux/config.h>
418 #include <linux/types.h>
419 #include <linux/fcntl.h>
420 #include <linux/poll.h>
421 #include <linux/init.h>
422 #include <linux/smp_lock.h>
424 #include <net/icmp.h>
425 #include <net/tcp.h>
427 #include <asm/uaccess.h>
429 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
431 struct tcp_mib tcp_statistics[NR_CPUS*2];
433 kmem_cache_t *tcp_openreq_cachep;
434 kmem_cache_t *tcp_bucket_cachep;
435 kmem_cache_t *tcp_timewait_cachep;
437 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
440 * LISTEN is a special case for poll..
442 static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
444 return sk->tp_pinfo.af_tcp.accept_queue ? (POLLIN | POLLRDNORM) : 0;
448 * Wait for a TCP event.
450 * Note that we don't need to lock the socket, as the upper poll layers
451 * take care of normal races (between the test and the event) and we don't
452 * go look at any of the socket buffers directly.
454 unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
456 unsigned int mask;
457 struct sock *sk = sock->sk;
458 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
460 poll_wait(file, sk->sleep, wait);
461 if (sk->state == TCP_LISTEN)
462 return tcp_listen_poll(sk, wait);
464 /* Socket is not locked. We are protected from async events
465 by poll logic, and correct handling of state changes
466 made by other threads is impossible in any case.
469 mask = 0;
470 if (sk->err)
471 mask = POLLERR;
474 * POLLHUP is certainly not done right. But poll() doesn't
475 * have a notion of HUP in just one direction, and for a
476 * socket the read side is more interesting.
478 * Some poll() documentation says that POLLHUP is incompatible
479 * with the POLLOUT/POLLWR flags, so somebody should check this
480 * all. But careful, it tends to be safer to return too many
481 * bits than too few, and you can easily break real applications
482 * if you don't tell them that something has hung up!
484 * Check-me.
486 * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
487 * our fs/select.c). It means that after we have received EOF,
488 * poll always returns immediately, making poll() on write() impossible
489 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
490 * if and only if shutdown has been made in both directions.
491 * Actually, it is interesting to look at how Solaris and DUX
492 * solve this dilemma. I would prefer, if POLLHUP were maskable,
493 * that we could then set it on SEND_SHUTDOWN. BTW the examples given
494 * in Stevens' books assume exactly this behaviour, which explains
495 * why POLLHUP is incompatible with POLLOUT. --ANK
497 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
498 * blocking on fresh not-connected or disconnected socket. --ANK
500 if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE)
501 mask |= POLLHUP;
502 if (sk->shutdown & RCV_SHUTDOWN)
503 mask |= POLLIN | POLLRDNORM;
505 /* Connected? */
506 if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
507 if ((tp->rcv_nxt != tp->copied_seq) &&
508 (tp->urg_seq != tp->copied_seq ||
509 tp->rcv_nxt != tp->copied_seq+1 ||
510 sk->urginline || !tp->urg_data))
511 mask |= POLLIN | POLLRDNORM;
513 if (!(sk->shutdown & SEND_SHUTDOWN)) {
514 if (sock_wspace(sk) >= tcp_min_write_space(sk)) {
515 mask |= POLLOUT | POLLWRNORM;
516 } else { /* send SIGIO later */
517 set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
518 set_bit(SOCK_NOSPACE, &sk->socket->flags);
520 /* Race breaker. If space is freed after
521 * wspace test but before the flags are set,
522 * IO signal will be lost.
524 if (sock_wspace(sk) >= tcp_min_write_space(sk))
525 mask |= POLLOUT | POLLWRNORM;
529 if (tp->urg_data & TCP_URG_VALID)
530 mask |= POLLPRI;
532 return mask;
536 * Socket write_space callback.
537 * This (or rather the sock_wake_async) should agree with poll.
539 * WARNING. This callback is called, when socket is not locked.
541 * This wakeup is used by TCP only as dead-lock breaker, real
542 * wakeup occurs when incoming ack frees some space in buffer.
544 void tcp_write_space(struct sock *sk)
546 struct socket *sock;
548 read_lock(&sk->callback_lock);
549 if ((sock = sk->socket) != NULL && atomic_read(&sk->wmem_alloc) == 0) {
550 if (test_bit(SOCK_NOSPACE, &sock->flags)) {
551 if (sk->sleep && waitqueue_active(sk->sleep)) {
552 clear_bit(SOCK_NOSPACE, &sock->flags);
553 wake_up_interruptible(sk->sleep);
557 if (sock->fasync_list)
558 sock_wake_async(sock, 2, POLL_OUT);
560 read_unlock(&sk->callback_lock);
563 /* Listening TCP sockets never sleep to wait for memory, so
564 * it is completely silly to wake them up on queue space
565 * available events. So we hook them up to this dummy callback.
567 static void tcp_listen_write_space(struct sock *sk)
571 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
573 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
574 int answ;
576 switch(cmd) {
577 case SIOCINQ:
578 if (sk->state == TCP_LISTEN)
579 return(-EINVAL);
581 lock_sock(sk);
582 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
583 answ = 0;
584 else if (sk->urginline || !tp->urg_data ||
585 before(tp->urg_seq,tp->copied_seq) ||
586 !before(tp->urg_seq,tp->rcv_nxt)) {
587 answ = tp->rcv_nxt - tp->copied_seq;
589 /* Subtract 1, if FIN is in queue. */
590 if (answ && !skb_queue_empty(&sk->receive_queue))
591 answ -= ((struct sk_buff*)sk->receive_queue.prev)->h.th->fin;
592 } else
593 answ = tp->urg_seq - tp->copied_seq;
594 release_sock(sk);
595 break;
596 case SIOCATMARK:
598 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
599 break;
601 case SIOCOUTQ:
602 if (sk->state == TCP_LISTEN)
603 return(-EINVAL);
605 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
606 answ = 0;
607 else
608 answ = tp->write_seq - tp->snd_una;
609 break;
610 default:
611 return(-ENOIOCTLCMD);
614 return put_user(answ, (int *)arg);
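/* Illustrative user-space view of the ioctls handled above; a sketch only,
 * assuming "fd" is a connected TCP socket and ignoring error handling:
 *
 *	int inq, outq, at_mark;
 *	ioctl(fd, SIOCINQ,    &inq);	   (bytes readable right now)
 *	ioctl(fd, SIOCOUTQ,   &outq);	   (bytes still unacknowledged)
 *	ioctl(fd, SIOCATMARK, &at_mark);   (non-zero at the urgent mark)
 *
 * On Linux SIOCINQ has the same value as FIONREAD.
 */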
618 int tcp_listen_start(struct sock *sk)
620 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
621 struct tcp_listen_opt *lopt;
623 sk->max_ack_backlog = 0;
624 sk->ack_backlog = 0;
625 tp->accept_queue = tp->accept_queue_tail = NULL;
626 tp->syn_wait_lock = RW_LOCK_UNLOCKED;
628 lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
629 if (!lopt)
630 return -ENOMEM;
632 memset(lopt, 0, sizeof(struct tcp_listen_opt));
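/* Size the SYN queue limit: pick the smallest max_qlen_log such that
 * 2^max_qlen_log >= sysctl_max_syn_backlog, with a floor of 64 entries.
 */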
633 for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
634 if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog)
635 break;
637 write_lock_bh(&tp->syn_wait_lock);
638 tp->listen_opt = lopt;
639 write_unlock_bh(&tp->syn_wait_lock);
641 /* There is a race window here: we announce ourselves listening,
642 * but this transition is still not validated by get_port().
643 * It is OK, because this socket enters the hash table only
644 * after validation is complete.
646 sk->state = TCP_LISTEN;
647 if (sk->prot->get_port(sk, sk->num) == 0) {
648 sk->sport = htons(sk->num);
650 sk->write_space = tcp_listen_write_space;
651 sk_dst_reset(sk);
652 sk->prot->hash(sk);
654 return 0;
657 sk->state = TCP_CLOSE;
658 write_lock_bh(&tp->syn_wait_lock);
659 tp->listen_opt = NULL;
660 write_unlock_bh(&tp->syn_wait_lock);
661 kfree(lopt);
662 return -EADDRINUSE;
666 * This routine closes sockets which have been at least partially
667 * opened, but not yet accepted.
670 static void tcp_listen_stop (struct sock *sk)
672 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
673 struct tcp_listen_opt *lopt = tp->listen_opt;
674 struct open_request *acc_req = tp->accept_queue;
675 struct open_request *req;
676 int i;
678 tcp_delete_keepalive_timer(sk);
680 /* make all the listen_opt local to us */
681 write_lock_bh(&tp->syn_wait_lock);
682 tp->listen_opt = NULL;
683 write_unlock_bh(&tp->syn_wait_lock);
684 tp->accept_queue = tp->accept_queue_tail = NULL;
686 if (lopt->qlen) {
687 for (i=0; i<TCP_SYNQ_HSIZE; i++) {
688 while ((req = lopt->syn_table[i]) != NULL) {
689 lopt->syn_table[i] = req->dl_next;
690 lopt->qlen--;
691 tcp_openreq_free(req);
693 /* Following specs, it would be better either to send FIN
694 * (and enter FIN-WAIT-1, it is normal close)
695 * or to send active reset (abort).
696 * Certainly, it is pretty dangerous during a synflood, but that is
697 * a bad justification for our negligence 8)
698 * To be honest, we are not able to implement either
699 * of the variants now. --ANK
704 BUG_TRAP(lopt->qlen == 0);
706 kfree(lopt);
708 while ((req=acc_req) != NULL) {
709 struct sock *child = req->sk;
711 acc_req = req->dl_next;
713 local_bh_disable();
714 bh_lock_sock(child);
715 BUG_TRAP(child->lock.users==0);
716 sock_hold(child);
718 tcp_disconnect(child, O_NONBLOCK);
720 sock_orphan(child);
722 atomic_inc(&tcp_orphan_count);
724 tcp_destroy_sock(child);
726 bh_unlock_sock(child);
727 local_bh_enable();
728 sock_put(child);
730 tcp_acceptq_removed(sk);
731 tcp_openreq_fastfree(req);
733 BUG_TRAP(sk->ack_backlog == 0);
737 * Wait for a socket to get into the connected state
739 * Note: Must be called with the socket locked.
741 static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p)
743 struct task_struct *tsk = current;
744 DECLARE_WAITQUEUE(wait, tsk);
746 while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
747 if(sk->err)
748 return sock_error(sk);
749 if((1 << sk->state) &
750 ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
751 if(sk->keepopen && !(flags&MSG_NOSIGNAL))
752 send_sig(SIGPIPE, tsk, 0);
753 return -EPIPE;
755 if(!*timeo_p)
756 return -EAGAIN;
757 if(signal_pending(tsk))
758 return sock_intr_errno(*timeo_p);
760 __set_task_state(tsk, TASK_INTERRUPTIBLE);
761 add_wait_queue(sk->sleep, &wait);
762 sk->tp_pinfo.af_tcp.write_pending++;
764 release_sock(sk);
765 *timeo_p = schedule_timeout(*timeo_p);
766 lock_sock(sk);
768 __set_task_state(tsk, TASK_RUNNING);
769 remove_wait_queue(sk->sleep, &wait);
770 sk->tp_pinfo.af_tcp.write_pending--;
772 return 0;
775 static inline int tcp_memory_free(struct sock *sk)
777 return atomic_read(&sk->wmem_alloc) < sk->sndbuf;
781 * Wait for more memory for a socket
783 static long wait_for_tcp_memory(struct sock * sk, long timeo)
785 if (!tcp_memory_free(sk)) {
786 DECLARE_WAITQUEUE(wait, current);
788 clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
790 add_wait_queue(sk->sleep, &wait);
791 for (;;) {
792 set_bit(SOCK_NOSPACE, &sk->socket->flags);
794 set_current_state(TASK_INTERRUPTIBLE);
796 if (signal_pending(current))
797 break;
798 if (tcp_memory_free(sk))
799 break;
800 if (sk->shutdown & SEND_SHUTDOWN)
801 break;
802 if (sk->err)
803 break;
804 release_sock(sk);
805 if (!tcp_memory_free(sk))
806 timeo = schedule_timeout(timeo);
807 lock_sock(sk);
809 current->state = TASK_RUNNING;
810 remove_wait_queue(sk->sleep, &wait);
812 return timeo;
815 /* When all user supplied data has been queued set the PSH bit */
816 #define PSH_NEEDED (seglen == 0 && iovlen == 0)
819 * This routine copies from a user buffer into a socket,
820 * and starts the transmit system.
823 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size)
825 struct iovec *iov;
826 struct tcp_opt *tp;
827 struct sk_buff *skb;
828 int iovlen, flags;
829 int mss_now;
830 int err, copied;
831 long timeo;
833 err = 0;
834 tp = &(sk->tp_pinfo.af_tcp);
836 lock_sock(sk);
837 TCP_CHECK_TIMER(sk);
839 flags = msg->msg_flags;
841 timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
843 /* Wait for a connection to finish. */
844 if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
845 if((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
846 goto out_unlock;
848 /* This should be in poll */
849 clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
851 mss_now = tcp_current_mss(sk);
853 /* Ok commence sending. */
854 iovlen = msg->msg_iovlen;
855 iov = msg->msg_iov;
856 copied = 0;
858 while(--iovlen >= 0) {
859 int seglen=iov->iov_len;
860 unsigned char * from=iov->iov_base;
862 iov++;
864 while(seglen > 0) {
865 int copy, tmp, queue_it;
867 if (err)
868 goto do_fault2;
870 /* Stop on errors. */
871 if (sk->err)
872 goto do_sock_err;
874 /* Make sure that we are established. */
875 if (sk->shutdown & SEND_SHUTDOWN)
876 goto do_shutdown;
878 /* Now we need to check if we have a half
879 * built packet we can tack some data onto.
881 if (tp->send_head && !(flags & MSG_OOB)) {
882 skb = sk->write_queue.prev;
883 copy = skb->len;
884 /* If the remote does SWS avoidance we should
885 * queue the best we can if not we should in
886 * fact send multiple packets...
887 * A method for detecting this would be most
888 * welcome.
890 if (skb_tailroom(skb) > 0 &&
891 (mss_now - copy) > 0) {
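/* Note: if the data already in this skb does not end on a 4-byte
 * boundary we cannot fold the checksum of the new user copy into
 * skb->csum, so below we copy first and then recompute the checksum
 * over the whole skb.
 */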
892 int last_byte_was_odd = (copy % 4);
894 copy = mss_now - copy;
895 if(copy > skb_tailroom(skb))
896 copy = skb_tailroom(skb);
897 if(copy > seglen)
898 copy = seglen;
899 if(last_byte_was_odd) {
900 if(copy_from_user(skb_put(skb, copy),
901 from, copy))
902 err = -EFAULT;
903 skb->csum = csum_partial(skb->data,
904 skb->len, 0);
905 } else {
906 skb->csum =
907 csum_and_copy_from_user(
908 from, skb_put(skb, copy),
909 copy, skb->csum, &err);
912 * FIXME: the *_user functions should
913 * return how much data was
914 * copied before the fault
915 * occurred and then a partial
916 * packet with this data should
917 * be sent. Unfortunately
918 * csum_and_copy_from_user doesn't
919 * return this information.
920 * ATM it might send partly zeroed
921 * data in this case.
923 tp->write_seq += copy;
924 TCP_SKB_CB(skb)->end_seq += copy;
925 from += copy;
926 copied += copy;
927 seglen -= copy;
928 if (PSH_NEEDED)
929 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
930 continue;
934 /* A chunk was here doing something strange
935 * with psh etc. It is deleted, because it was
936 * evident non-sense. --ANK
939 copy = min(seglen, mss_now);
941 /* Determine how large of a buffer to allocate. */
942 tmp = MAX_TCP_HEADER + 15;
943 if (copy < mss_now && !(flags & MSG_OOB)) {
944 tmp += mss_now;
946 /* What is happening here is that we want to
947 * tack on later members of the users iovec
948 * if possible into a single frame. When we
949 * leave this loop our caller checks to see if
950 * we can send queued frames onto the wire.
951 * See tcp_v[46]_sendmsg() for this.
953 queue_it = 1;
954 } else {
955 tmp += copy;
956 queue_it = 0;
959 if (tcp_memory_free(sk)) {
960 skb = alloc_skb(tmp, GFP_KERNEL);
961 if (skb == NULL)
962 goto do_oom;
963 skb_set_owner_w(skb, sk);
964 } else {
965 /* If we didn't get any memory, we need to sleep. */
966 set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
967 set_bit(SOCK_NOSPACE, &sk->socket->flags);
969 if (!timeo) {
970 err = -EAGAIN;
971 goto do_interrupted;
973 if (signal_pending(current)) {
974 err = sock_intr_errno(timeo);
975 goto do_interrupted;
977 __tcp_push_pending_frames(sk, tp, mss_now);
978 timeo = wait_for_tcp_memory(sk, timeo);
980 /* If SACK's were formed or PMTU events happened,
981 * we must find out about it.
983 mss_now = tcp_current_mss(sk);
984 continue;
987 seglen -= copy;
989 /* Prepare control bits for TCP header creation engine. */
990 TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK |
991 ((PSH_NEEDED) ?
992 TCPCB_FLAG_PSH : 0));
993 TCP_SKB_CB(skb)->sacked = 0;
994 if (flags & MSG_OOB) {
995 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_URG;
996 TCP_SKB_CB(skb)->urg_ptr = copy;
997 } else
998 TCP_SKB_CB(skb)->urg_ptr = 0;
1000 /* TCP data bytes are SKB_PUT() on top, later
1001 * TCP+IP+DEV headers are SKB_PUSH()'d beneath.
1002 * Reserve header space and checksum the data.
1004 skb_reserve(skb, MAX_TCP_HEADER);
1005 skb->csum = csum_and_copy_from_user(from,
1006 skb_put(skb, copy), copy, 0, &err);
1008 if (err)
1009 goto do_fault;
1011 from += copy;
1012 copied += copy;
1014 TCP_SKB_CB(skb)->seq = tp->write_seq;
1015 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy;
1017 /* This advances tp->write_seq for us. */
1018 tcp_send_skb(sk, skb, queue_it, mss_now);
1021 err = copied;
1022 out:
1023 __tcp_push_pending_frames(sk, tp, mss_now);
1024 TCP_CHECK_TIMER(sk);
1025 out_unlock:
1026 release_sock(sk);
1027 return err;
1029 do_sock_err:
1030 if(copied)
1031 err = copied;
1032 else
1033 err = sock_error(sk);
1034 goto out;
1035 do_shutdown:
1036 if(copied)
1037 err = copied;
1038 else {
1039 if (!(flags&MSG_NOSIGNAL))
1040 send_sig(SIGPIPE, current, 0);
1041 err = -EPIPE;
1043 goto out;
1044 do_oom:
1045 err = copied ? : -ENOBUFS;
1046 goto out;
1047 do_interrupted:
1048 if(copied)
1049 err = copied;
1050 goto out;
1051 do_fault:
1052 kfree_skb(skb);
1053 do_fault2:
1054 err = -EFAULT;
1055 goto out;
1058 #undef PSH_NEEDED
1061 * Handle reading urgent data. BSD has very simple semantics for
1062 * this, no blocking and very strange errors 8)
1065 static int tcp_recv_urg(struct sock * sk, long timeo,
1066 struct msghdr *msg, int len, int flags,
1067 int *addr_len)
1069 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1071 /* No URG data to read. */
1072 if (sk->urginline || !tp->urg_data || tp->urg_data == TCP_URG_READ)
1073 return -EINVAL; /* Yes this is right ! */
1075 if (sk->done)
1076 return -ENOTCONN;
1078 if (tp->urg_data & TCP_URG_VALID) {
1079 int err = 0;
1080 char c = tp->urg_data;
1082 if (!(flags & MSG_PEEK))
1083 tp->urg_data = TCP_URG_READ;
1085 /* Read urgent data. */
1086 msg->msg_flags|=MSG_OOB;
1088 if(len>0) {
1089 if (!(flags & MSG_PEEK))
1090 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1091 len = 1;
1092 } else
1093 msg->msg_flags|=MSG_TRUNC;
1095 return err ? -EFAULT : len;
1098 /* Do not set sk->done, it is set only by normal data receive */
1099 if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN))
1100 return 0;
1102 /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
1103 * the available implementations agree in this case:
1104 * this call should never block, independent of the
1105 * blocking state of the socket.
1106 * Mike <pall@rz.uni-karlsruhe.de>
1108 return -EAGAIN;
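/* Illustrative user-space counterpart of the above, a sketch only,
 * assuming "fd" is a connected TCP socket with SO_OOBINLINE left off:
 *
 *	char oob;
 *	int n = recv(fd, &oob, 1, MSG_OOB);
 *
 * n == 1 returns the single urgent byte; n == -1 with errno EAGAIN or
 * EINVAL means there is no unread urgent data. As noted above, the call
 * never blocks, regardless of the socket's blocking mode.
 */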
1112 * Release a skb if it is no longer needed. This routine
1113 * must be called with interrupts disabled or with the
1114 * socket locked so that the sk_buff queue operation is ok.
1117 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
1119 __skb_unlink(skb, &sk->receive_queue);
1120 BUG_TRAP(atomic_read(&skb->users) == 1);
1121 /* Well, if I missed something then punishment will be terrible oops. */
1122 __kfree_skb(skb);
1125 /* Clean up the receive buffer for full frames taken by the user,
1126 * then send an ACK if necessary. COPIED is the number of bytes
1127 * tcp_recvmsg has given to the user so far, it speeds up the
1128 * calculation of whether or not we must ACK for the sake of
1129 * a window update.
1131 static void cleanup_rbuf(struct sock *sk, int copied)
1133 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1134 struct sk_buff *skb;
1135 int time_to_ack = 0;
1137 /* NOTE! The socket must be locked, so that we don't get
1138 * a messed-up receive queue.
1140 while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
1141 if (!skb->used)
1142 break;
1143 tcp_eat_skb(sk, skb);
1146 if (tp->ack.pending) {
1147 /* Delayed ACKs frequently hit locked sockets during bulk receive. */
1148 if (tp->ack.blocked
1149 #ifdef TCP_MORE_COARSE_ACKS
1150 /* Once-per-two-segments ACK was not sent by tcp_input.c */
1151 || tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss
1152 #endif
1154 * If this read emptied the read buffer, we send an ACK when:
1156 * -- ATO estimator diverged. In this case it is useless
1157 * to delay ACK, it will miss in any case.
1159 * -- The second condition is triggered when we did not
1160 * ACK 8 segments, regardless of their size.
1161 * Linux senders allocate full-sized frames even for one byte
1162 * packets, so that the default queue for MTU=8K can hold
1163 * only 8 packets. Note that no workarounds other
1164 * than counting packets are possible. If the sender selected
1165 * a small sndbuf or has a larger mtu, the lockup will still
1166 * occur. Well, not a lockup, but a 10-20msec gap.
1167 * It is essentially a dead lockup for 1Gb ethernet
1168 * and loopback :-). The value 8 covers all reasonable
1169 * cases and we may receive packets of any size
1170 * at the maximal possible rate now.
1172 || (copied > 0 &&
1173 (tp->ack.ato >= TCP_DELACK_MAX || tp->ack.rcv_segs > 7) &&
1174 !tp->ack.pingpong &&
1175 atomic_read(&sk->rmem_alloc) == 0)) {
1176 time_to_ack = 1;
1180 /* We send an ACK if we can now advertise a non-zero window
1181 * which has been raised "significantly".
1183 * Even if window raised up to infinity, do not send window open ACK
1184 * in states, where we will not receive more. It is useless.
1186 if(copied > 0 && !time_to_ack && !(sk->shutdown&RCV_SHUTDOWN)) {
1187 __u32 rcv_window_now = tcp_receive_window(tp);
1188 __u32 new_window = __tcp_select_window(sk);
1190 /* Send ACK now, if this read freed lots of space
1191 * in our buffer. Certainly, new_window is new window.
1192 * We can advertise it now, if it is not less than current one.
1193 * "Lots" means "at least twice" here.
1195 if(new_window && new_window >= 2*rcv_window_now)
1196 time_to_ack = 1;
1198 if (time_to_ack)
1199 tcp_send_ack(sk);
1202 /* Now socket state including sk->err is changed only under lock,
1203 * hence we may omit checks after joining wait queue.
1204 * We check the receive queue before schedule() only as an optimization;
1205 * it is very likely that release_sock() added new data.
1208 static long tcp_data_wait(struct sock *sk, long timeo)
1210 DECLARE_WAITQUEUE(wait, current);
1212 add_wait_queue(sk->sleep, &wait);
1214 __set_current_state(TASK_INTERRUPTIBLE);
1216 set_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1217 release_sock(sk);
1219 if (skb_queue_empty(&sk->receive_queue))
1220 timeo = schedule_timeout(timeo);
1222 lock_sock(sk);
1223 clear_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1225 remove_wait_queue(sk->sleep, &wait);
1226 __set_current_state(TASK_RUNNING);
1227 return timeo;
1230 static void tcp_prequeue_process(struct sock *sk)
1232 struct sk_buff *skb;
1233 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1235 net_statistics[smp_processor_id()*2+1].TCPPrequeued += skb_queue_len(&tp->ucopy.prequeue);
1237 /* RX process wants to run with disabled BHs, though it is not necessary */
1238 local_bh_disable();
1239 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1240 sk->backlog_rcv(sk, skb);
1241 local_bh_enable();
1243 /* Clear memory counter. */
1244 tp->ucopy.memory = 0;
1248 * This routine copies from a sock struct into the user buffer.
1250 * Technical note: in 2.3 we work on _locked_ socket, so that
1251 * tricks with *seq access order and skb->users are not required.
1252 * Probably, code can be easily improved even more.
1255 int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
1256 int len, int nonblock, int flags, int *addr_len)
1258 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1259 int copied = 0;
1260 u32 peek_seq;
1261 u32 *seq;
1262 unsigned long used;
1263 int err;
1264 int target; /* Read at least this many bytes */
1265 long timeo;
1266 struct task_struct *user_recv = NULL;
1268 lock_sock(sk);
1270 TCP_CHECK_TIMER(sk);
1272 err = -ENOTCONN;
1273 if (sk->state == TCP_LISTEN)
1274 goto out;
1276 timeo = sock_rcvtimeo(sk, nonblock);
1278 /* Urgent data needs to be handled specially. */
1279 if (flags & MSG_OOB)
1280 goto recv_urg;
1282 seq = &tp->copied_seq;
1283 if (flags & MSG_PEEK) {
1284 peek_seq = tp->copied_seq;
1285 seq = &peek_seq;
1288 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1290 do {
1291 struct sk_buff * skb;
1292 u32 offset;
1294 /* Are we at urgent data? Stop if we have read anything. */
1295 if (copied && tp->urg_data && tp->urg_seq == *seq)
1296 break;
1298 /* We need to check signals first, to get correct SIGURG
1299 * handling. FIXME: Need to check this doesn't impact 1003.1g
1300 * and move it down to the bottom of the loop
1302 if (signal_pending(current)) {
1303 if (copied)
1304 break;
1305 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1306 break;
1309 /* Next get a buffer. */
1311 skb = skb_peek(&sk->receive_queue);
1312 do {
1313 if (!skb)
1314 break;
1316 /* Now that we have two receive queues this
1317 * shouldn't happen.
1319 if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1320 printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
1321 *seq, TCP_SKB_CB(skb)->seq);
1322 break;
1324 offset = *seq - TCP_SKB_CB(skb)->seq;
1325 if (skb->h.th->syn)
1326 offset--;
1327 if (offset < skb->len)
1328 goto found_ok_skb;
1329 if (skb->h.th->fin)
1330 goto found_fin_ok;
1331 if (!(flags & MSG_PEEK))
1332 skb->used = 1;
1333 skb = skb->next;
1334 } while (skb != (struct sk_buff *)&sk->receive_queue);
1336 /* Well, if we have backlog, try to process it now. */
1338 if (copied >= target && sk->backlog.tail == NULL)
1339 break;
1341 if (copied) {
1342 if (sk->err ||
1343 sk->state == TCP_CLOSE ||
1344 (sk->shutdown & RCV_SHUTDOWN) ||
1345 !timeo)
1346 break;
1347 } else {
1348 if (sk->err) {
1349 copied = sock_error(sk);
1350 break;
1353 if (sk->shutdown & RCV_SHUTDOWN) {
1354 if (!(flags&MSG_PEEK))
1355 sk->done = 1;
1356 break;
1359 if (sk->state == TCP_CLOSE) {
1360 if (sk->done) {
1361 copied = -ENOTCONN;
1362 break;
1363 } else if (!(flags&MSG_PEEK))
1364 sk->done = 1;
1365 break;
1368 if (!timeo) {
1369 copied = -EAGAIN;
1370 break;
1374 cleanup_rbuf(sk, copied);
1376 if (tp->ucopy.task == user_recv) {
1377 /* Install new reader */
1378 if (user_recv == NULL && !(flags&(MSG_TRUNC|MSG_PEEK))) {
1379 user_recv = current;
1380 tp->ucopy.task = user_recv;
1381 tp->ucopy.iov = msg->msg_iov;
1384 tp->ucopy.len = len;
1386 BUG_TRAP(tp->copied_seq == tp->rcv_nxt || (flags&(MSG_PEEK|MSG_TRUNC)));
1388 /* Ugly... If the prequeue is not empty, we have to
1389 * process it before releasing the socket, otherwise
1390 * the order will be broken at the second iteration.
1391 * A more elegant solution is required!!!
1393 * Look: we have the following (pseudo)queues:
1395 * 1. packets in flight
1396 * 2. backlog
1397 * 3. prequeue
1398 * 4. receive_queue
1400 * Each queue can be processed only if the next ones
1401 * are empty. At this point we have empty receive_queue.
1402 * But prequeue _can_ be non-empty after the second iteration,
1403 * when we jumped to the start of the loop because backlog
1404 * processing added something to receive_queue.
1405 * We cannot release_sock(), because backlog contains
1406 * packets that arrived _after_ the prequeued ones.
1408 * In short, the algorithm is clear --- process all
1409 * the queues in order. We could do it more directly,
1410 * requeueing packets from backlog to prequeue if it
1411 * is not empty. That is more elegant, but eats cycles,
1412 * unfortunately.
1414 if (skb_queue_len(&tp->ucopy.prequeue))
1415 goto do_prequeue;
1417 /* __ Set realtime policy in scheduler __ */
1420 if (copied >= target) {
1421 /* Do not sleep, just process backlog. */
1422 release_sock(sk);
1423 lock_sock(sk);
1424 } else {
1425 timeo = tcp_data_wait(sk, timeo);
1428 if (user_recv) {
1429 int chunk;
1431 /* __ Restore normal policy in scheduler __ */
1433 if ((chunk = len - tp->ucopy.len) != 0) {
1434 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromBacklog += chunk;
1435 len -= chunk;
1436 copied += chunk;
1439 if (tp->rcv_nxt == tp->copied_seq &&
1440 skb_queue_len(&tp->ucopy.prequeue)) {
1441 do_prequeue:
1442 tcp_prequeue_process(sk);
1444 if ((chunk = len - tp->ucopy.len) != 0) {
1445 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1446 len -= chunk;
1447 copied += chunk;
1451 continue;
1453 found_ok_skb:
1454 /* Ok so how much can we use? */
1455 used = skb->len - offset;
1456 if (len < used)
1457 used = len;
1459 /* Do we have urgent data here? */
1460 if (tp->urg_data) {
1461 u32 urg_offset = tp->urg_seq - *seq;
1462 if (urg_offset < used) {
1463 if (!urg_offset) {
1464 if (!sk->urginline) {
1465 ++*seq;
1466 offset++;
1467 used--;
1469 } else
1470 used = urg_offset;
1474 err = 0;
1475 if (!(flags&MSG_TRUNC)) {
1476 err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used);
1477 if (err) {
1478 /* Exception. Bailout! */
1479 if (!copied)
1480 copied = -EFAULT;
1481 break;
1485 *seq += used;
1486 copied += used;
1487 len -= used;
1489 if (after(tp->copied_seq,tp->urg_seq)) {
1490 tp->urg_data = 0;
1491 if (skb_queue_len(&tp->out_of_order_queue) == 0
1492 #ifdef TCP_FORMAL_WINDOW
1493 && tcp_receive_window(tp)
1494 #endif
1496 tcp_fast_path_on(tp);
1499 if (used + offset < skb->len)
1500 continue;
1502 /* Process the FIN. We may also need to handle PSH
1503 * here and make it break out of MSG_WAITALL.
1505 if (skb->h.th->fin)
1506 goto found_fin_ok;
1507 if (flags & MSG_PEEK)
1508 continue;
1509 skb->used = 1;
1510 tcp_eat_skb(sk, skb);
1511 continue;
1513 found_fin_ok:
1514 ++*seq;
1515 if (flags & MSG_PEEK)
1516 break;
1518 /* All is done. */
1519 skb->used = 1;
1520 break;
1521 } while (len > 0);
1523 if (user_recv) {
1524 if (skb_queue_len(&tp->ucopy.prequeue)) {
1525 int chunk;
1527 tp->ucopy.len = copied > 0 ? len : 0;
1529 tcp_prequeue_process(sk);
1531 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1532 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1533 len -= chunk;
1534 copied += chunk;
1538 tp->ucopy.task = NULL;
1539 tp->ucopy.len = 0;
1542 /* According to UNIX98, msg_name/msg_namelen are ignored
1543 * on a connected socket. I was just happy when I found this 8) --ANK
1546 /* Clean up data we have read: This will do ACK frames. */
1547 cleanup_rbuf(sk, copied);
1549 TCP_CHECK_TIMER(sk);
1550 release_sock(sk);
1551 return copied;
1553 out:
1554 TCP_CHECK_TIMER(sk);
1555 release_sock(sk);
1556 return err;
1558 recv_urg:
1559 err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1560 goto out;
1564 * State processing on a close. This implements the state shift for
1565 * sending our FIN frame. Note that we only send a FIN for some
1566 * states. A shutdown() may have already sent the FIN, or we may be
1567 * closed.
1570 static unsigned char new_state[16] = {
1571 /* current state: new state: action: */
1572 /* (Invalid) */ TCP_CLOSE,
1573 /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1574 /* TCP_SYN_SENT */ TCP_CLOSE,
1575 /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1576 /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
1577 /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
1578 /* TCP_TIME_WAIT */ TCP_CLOSE,
1579 /* TCP_CLOSE */ TCP_CLOSE,
1580 /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN,
1581 /* TCP_LAST_ACK */ TCP_LAST_ACK,
1582 /* TCP_LISTEN */ TCP_CLOSE,
1583 /* TCP_CLOSING */ TCP_CLOSING,
1586 static int tcp_close_state(struct sock *sk)
1588 int next = (int) new_state[sk->state];
1589 int ns = (next & TCP_STATE_MASK);
1591 tcp_set_state(sk, ns);
1593 return (next & TCP_ACTION_FIN);
1597 * Shutdown the sending side of a connection. Much like close except
1598 * that we don't shut down the receive side or set sk->dead.
1601 void tcp_shutdown(struct sock *sk, int how)
1603 /* We need to grab some memory, and put together a FIN,
1604 * and then put it into the queue to be sent.
1605 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1607 if (!(how & SEND_SHUTDOWN))
1608 return;
1610 /* If we've already sent a FIN, or it's a closed state, skip this. */
1611 if ((1 << sk->state) &
1612 (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
1613 /* Clear out any half completed packets. FIN if needed. */
1614 if (tcp_close_state(sk))
1615 tcp_send_fin(sk);
1621 * Return 1 if we still have things to send in our buffers.
1624 static inline int closing(struct sock * sk)
1626 return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
1629 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1631 /* First the read buffer. */
1632 skb_queue_purge(&sk->receive_queue);
1634 /* Next, the error queue. */
1635 skb_queue_purge(&sk->error_queue);
1637 /* Next, the write queue. */
1638 BUG_TRAP(skb_queue_empty(&sk->write_queue));
1640 /* It is _impossible_ for the backlog to contain anything
1641 * when we get here. All user references to this socket
1642 * have gone away; only the net layer can touch it.
1647 * At this point, there should be no process reference to this
1648 * socket, and thus no user references at all. Therefore we
1649 * can assume the socket waitqueue is inactive and nobody will
1650 * try to jump onto it.
1652 void tcp_destroy_sock(struct sock *sk)
1654 BUG_TRAP(sk->state==TCP_CLOSE);
1655 BUG_TRAP(sk->dead);
1657 /* It cannot be in hash table! */
1658 BUG_TRAP(sk->pprev==NULL);
1660 /* If it has a non-zero sk->num, it must be bound */
1661 BUG_TRAP(!sk->num || sk->prev!=NULL);
1663 #ifdef TCP_DEBUG
1664 if (sk->zapped) {
1665 printk("TCP: double destroy sk=%p\n", sk);
1666 sock_hold(sk);
1668 sk->zapped = 1;
1669 #endif
1671 sk->prot->destroy(sk);
1673 tcp_kill_sk_queues(sk);
1675 #ifdef INET_REFCNT_DEBUG
1676 if (atomic_read(&sk->refcnt) != 1) {
1677 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt));
1679 #endif
1681 atomic_dec(&tcp_orphan_count);
1682 sock_put(sk);
1685 void tcp_close(struct sock *sk, long timeout)
1687 struct sk_buff *skb;
1688 int data_was_unread = 0;
1690 lock_sock(sk);
1691 sk->shutdown = SHUTDOWN_MASK;
1693 if(sk->state == TCP_LISTEN) {
1694 tcp_set_state(sk, TCP_CLOSE);
1696 /* Special case. */
1697 tcp_listen_stop(sk);
1699 goto adjudge_to_death;
1702 /* We need to flush the recv. buffs. We do this only on the
1703 * descriptor close, not protocol-sourced closes, because the
1704 * reader process may not have drained the data yet!
1706 while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
1707 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin;
1708 data_was_unread += len;
1709 kfree_skb(skb);
1712 /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1713 * 3.10, we send a RST here because data was lost. To
1714 * witness the awful effects of the old behavior of always
1715 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1716 * a bulk GET in an FTP client, suspend the process, wait
1717 * for the client to advertise a zero window, then kill -9
1718 * the FTP client, wheee... Note: timeout is always zero
1719 * in such a case.
1721 if(data_was_unread != 0) {
1722 /* Unread data was tossed, zap the connection. */
1723 tcp_set_state(sk, TCP_CLOSE);
1724 tcp_send_active_reset(sk, GFP_KERNEL);
1725 } else if (sk->linger && sk->lingertime==0) {
1726 /* Check zero linger _after_ checking for unread data. */
1727 sk->prot->disconnect(sk, 0);
1728 } else if (tcp_close_state(sk)) {
1729 /* We FIN if the application ate all the data before
1730 * zapping the connection.
1733 /* RED-PEN. Formally speaking, we have broken TCP state
1734 * machine. State transitions:
1736 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1737 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1738 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1740 * are legal only when FIN has been sent (i.e. in window),
1741 * rather than queued out of window. Purists blame.
1743 * F.e. "RFC state" is ESTABLISHED,
1744 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1746 * The visible deviations are that sometimes
1747 * we enter the time-wait state when it is not really required
1748 * (harmless), and do not send active resets when they are
1749 * required by the specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1750 * they look like CLOSING or LAST_ACK to Linux).
1751 * Probably, I missed some more holelets.
1752 * --ANK
1754 tcp_send_fin(sk);
1757 if (timeout) {
1758 struct task_struct *tsk = current;
1759 DECLARE_WAITQUEUE(wait, current);
1761 add_wait_queue(sk->sleep, &wait);
1763 do {
1764 set_current_state(TASK_INTERRUPTIBLE);
1765 if (!closing(sk))
1766 break;
1767 release_sock(sk);
1768 timeout = schedule_timeout(timeout);
1769 lock_sock(sk);
1770 } while (!signal_pending(tsk) && timeout);
1772 tsk->state = TASK_RUNNING;
1773 remove_wait_queue(sk->sleep, &wait);
1776 adjudge_to_death:
1777 /* It is the last release_sock in its life. It will remove backlog. */
1778 release_sock(sk);
1781 /* Now socket is owned by kernel and we acquire BH lock
1782 to finish close. No need to check for user refs.
1784 local_bh_disable();
1785 bh_lock_sock(sk);
1786 BUG_TRAP(sk->lock.users==0);
1788 sock_hold(sk);
1789 sock_orphan(sk);
1791 /* This is a (useful) BSD violation of the RFC. There is a
1792 * problem with TCP as specified in that the other end could
1793 * keep a socket open forever with no application left at this end.
1794 * We use a 3 minute timeout (about the same as BSD) and then kill
1795 * our end. If they send after that then tough - BUT: long enough
1796 * that we won't make the old 4*rto = almost no time - whoops
1797 * reset mistake.
1799 * Nope, it was not mistake. It is really desired behaviour
1800 * f.e. on http servers, when such sockets are useless, but
1801 * consume significant resources. Let's do it with special
1802 * linger2 option. --ANK
1805 if (sk->state == TCP_FIN_WAIT2) {
1806 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1807 if (tp->linger2 < 0) {
1808 tcp_set_state(sk, TCP_CLOSE);
1809 tcp_send_active_reset(sk, GFP_ATOMIC);
1810 } else {
1811 int tmo = tcp_fin_time(tp);
1813 if (tmo > TCP_TIMEWAIT_LEN) {
1814 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1815 } else {
1816 atomic_inc(&tcp_orphan_count);
1817 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1818 goto out;
1822 if (sk->state != TCP_CLOSE &&
1823 atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans) {
1824 if (net_ratelimit())
1825 printk(KERN_INFO "TCP: too many of orphaned sockets\n");
1826 tcp_set_state(sk, TCP_CLOSE);
1827 tcp_send_active_reset(sk, GFP_ATOMIC);
1829 atomic_inc(&tcp_orphan_count);
1831 if (sk->state == TCP_CLOSE)
1832 tcp_destroy_sock(sk);
1833 /* Otherwise, socket is reprieved until protocol close. */
1835 out:
1836 bh_unlock_sock(sk);
1837 local_bh_enable();
1838 sock_put(sk);
1841 /* These states need RST on ABORT according to RFC793 */
1843 extern __inline__ int tcp_need_reset(int state)
1845 return ((1 << state) &
1846 (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
1847 TCPF_FIN_WAIT2|TCPF_SYN_RECV));
1850 int tcp_disconnect(struct sock *sk, int flags)
1852 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1853 int old_state;
1854 int err = 0;
1856 old_state = sk->state;
1857 if (old_state != TCP_CLOSE)
1858 tcp_set_state(sk, TCP_CLOSE);
1860 /* ABORT function of RFC793 */
1861 if (old_state == TCP_LISTEN) {
1862 tcp_listen_stop(sk);
1863 } else if (tcp_need_reset(old_state) ||
1864 (tp->snd_nxt != tp->write_seq &&
1865 (1<<old_state)&(TCPF_CLOSING|TCPF_LAST_ACK))) {
1866 /* The last check adjusts for the discrepancy of Linux wrt. RFC
1867 * states
1869 tcp_send_active_reset(sk, gfp_any());
1870 sk->err = ECONNRESET;
1871 } else if (old_state == TCP_SYN_SENT)
1872 sk->err = ECONNRESET;
1874 tcp_clear_xmit_timers(sk);
1875 __skb_queue_purge(&sk->receive_queue);
1876 __skb_queue_purge(&sk->write_queue);
1877 __skb_queue_purge(&tp->out_of_order_queue);
1879 sk->dport = 0;
1881 sk->rcv_saddr = 0;
1882 sk->saddr = 0;
1883 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1884 memset(&sk->net_pinfo.af_inet6.saddr, 0, 16);
1885 memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16);
1886 #endif
1888 sk->shutdown = 0;
1889 sk->done = 0;
1890 sk->write_space = tcp_write_space;
1891 tp->srtt = 0;
1892 if (sysctl_tcp_tw_recycle) {
1893 if ((tp->write_seq += 2) == 0)
1894 tp->write_seq = 1;
1895 } else {
1896 tp->write_seq = 0;
1898 tp->backoff = 0;
1899 tp->snd_cwnd = 2;
1900 tp->probes_out = 0;
1901 tp->packets_out = 0;
1902 tp->high_seq = 0;
1903 tp->snd_ssthresh = 0x7fffffff;
1904 tp->snd_cwnd_cnt = 0;
1905 tp->dup_acks = 0;
1906 tcp_delack_init(tp);
1907 tp->send_head = tp->retrans_head = NULL;
1908 tp->saw_tstamp = 0;
1909 __sk_dst_reset(sk);
1911 BUG_TRAP(!sk->num || sk->prev);
1913 sk->error_report(sk);
1914 return err;
1915 }
1917 /*
1918 * Wait for an incoming connection, avoid race
1919 * conditions. This must be called with the socket locked,
1920 * and without the kernel lock held.
1921 */
1922 static int wait_for_connect(struct sock * sk, long timeo)
1923 {
1924 DECLARE_WAITQUEUE(wait, current);
1925 int err;
1927 /*
1928 * True wake-one mechanism for incoming connections: only
1929 * one process gets woken up, not the 'whole herd'.
1930 * Since we do not 'race & poll' for established sockets
1931 * anymore, the common case will execute the loop only once.
1933 * Subtle issue: "add_wait_queue_exclusive()" will be added
1934 * after any current non-exclusive waiters, and we know that
1935 * it will always _stay_ after any new non-exclusive waiters
1936 * because all non-exclusive waiters are added at the
1937 * beginning of the wait-queue. As such, it's ok to "drop"
1938 * our exclusiveness temporarily when we get woken up without
1939 * having to remove and re-insert us on the wait queue.
1940 */
1941 add_wait_queue_exclusive(sk->sleep, &wait);
1942 for (;;) {
1943 current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE;
1944 release_sock(sk);
1945 if (sk->tp_pinfo.af_tcp.accept_queue == NULL)
1946 timeo = schedule_timeout(timeo);
1947 lock_sock(sk);
1948 err = 0;
1949 if (sk->tp_pinfo.af_tcp.accept_queue)
1950 break;
1951 err = -EINVAL;
1952 if (sk->state != TCP_LISTEN)
1953 break;
1954 err = sock_intr_errno(timeo);
1955 if (signal_pending(current))
1956 break;
1957 err = -EAGAIN;
1958 if (!timeo)
1959 break;
1960 }
1961 current->state = TASK_RUNNING;
1962 remove_wait_queue(sk->sleep, &wait);
1963 return err;
1964 }
1966 /*
1967 * This will accept the next outstanding connection.
1968 *
1969 * Be careful about race conditions here - this is subtle.
1970 */
1972 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1973 {
1974 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1975 struct open_request *req;
1976 struct sock *newsk;
1977 int error;
1979 lock_sock(sk);
1981 /* We need to make sure that this socket is listening,
1982 * and that it has something pending.
1983 */
1984 error = -EINVAL;
1985 if (sk->state != TCP_LISTEN)
1986 goto out;
1988 /* Find already established connection */
1989 if (!tp->accept_queue) {
1990 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1992 /* If this is a non-blocking socket, don't sleep */
1993 error = -EAGAIN;
1994 if (!timeo)
1995 goto out;
1997 error = wait_for_connect(sk, timeo);
1998 if (error)
1999 goto out;
2000 }
2002 req = tp->accept_queue;
2003 if ((tp->accept_queue = req->dl_next) == NULL)
2004 tp->accept_queue_tail = NULL;
2006 newsk = req->sk;
2007 tcp_acceptq_removed(sk);
2008 tcp_openreq_fastfree(req);
2009 BUG_TRAP(newsk->state != TCP_SYN_RECV);
2010 release_sock(sk);
2011 return newsk;
2013 out:
2014 release_sock(sk);
2015 *err = error;
2016 return NULL;
2017 }
2019 /*
2020 * Socket option code for TCP.
2021 */
2023 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
2024 int optlen)
2025 {
2026 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2027 int val;
2028 int err = 0;
2030 if (level != SOL_TCP)
2031 return tp->af_specific->setsockopt(sk, level, optname,
2032 optval, optlen);
2034 if(optlen<sizeof(int))
2035 return -EINVAL;
2037 if (get_user(val, (int *)optval))
2038 return -EFAULT;
2040 lock_sock(sk);
2042 switch(optname) {
2043 case TCP_MAXSEG:
2044 /* Values greater than the interface MTU won't take effect. However,
2045 * at the point when this call is made we typically don't yet know
2046 * which interface is going to be used.
2047 */
2048 if(val < 8 || val > MAX_TCP_WINDOW) {
2049 err = -EINVAL;
2050 break;
2051 }
2052 tp->user_mss = val;
2053 break;
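/* Illustrative sketch (editor's example, not part of the original file):
 * TCP_MAXSEG is normally set before connect(), since the value only caps
 * the MSS that will be negotiated and, as the comment above notes, anything
 * above the interface MTU cannot take effect.  Assumes "fd" is a newly
 * created, not yet connected TCP socket and "addr" a filled-in sockaddr_in.
 *
 *	int mss = 1200;
 *	setsockopt(fd, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss));
 *	connect(fd, (struct sockaddr *)&addr, sizeof(addr));
 */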
2055 case TCP_NODELAY:
2056 /* You cannot try to use this and TCP_CORK in
2057 * tandem, so let the user know.
2058 */
2059 if (tp->nonagle == 2) {
2060 err = -EINVAL;
2061 break;
2062 }
2063 tp->nonagle = (val == 0) ? 0 : 1;
2064 if (val)
2065 tcp_push_pending_frames(sk, tp);
2066 break;
2068 case TCP_CORK:
2069 /* When set, this indicates that non-full frames should always be queued.
2070 * Later the user clears this option and we transmit
2071 * any pending partial frames in the queue. This is
2072 * meant to be used alongside sendfile() to get properly
2073 * filled frames when the user (for example) must write
2074 * out headers with a write() call first and then use
2075 * sendfile to send out the data parts.
2077 * You cannot try to use TCP_NODELAY and this mechanism
2078 * at the same time, so let the user know.
2079 */
2080 if (tp->nonagle == 1) {
2081 err = -EINVAL;
2082 break;
2083 }
2084 if (val != 0) {
2085 tp->nonagle = 2;
2086 } else {
2087 tp->nonagle = 0;
2089 tcp_push_pending_frames(sk, tp);
2090 }
2091 break;
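/* Illustrative sketch (editor's example, not part of the original file) of
 * the header-plus-sendfile() pattern described in the comment above.
 * Assumes "sock" is a connected TCP socket, "header"/"header_len" an
 * application buffer and "file_fd" an open file of length "len"; error
 * handling omitted.  Note that in this kernel TCP_CORK and TCP_NODELAY are
 * mutually exclusive.
 *
 *	int on = 1, off = 0;
 *	setsockopt(sock, SOL_TCP, TCP_CORK, &on, sizeof(on));
 *	write(sock, header, header_len);      -- queued while corked
 *	sendfile(sock, file_fd, NULL, len);   -- fills out full-sized frames
 *	setsockopt(sock, SOL_TCP, TCP_CORK, &off, sizeof(off));
 *	                                      -- uncork: push any partial frame
 */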
2093 case TCP_KEEPIDLE:
2094 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2095 err = -EINVAL;
2096 else {
2097 tp->keepalive_time = val * HZ;
2098 if (sk->keepopen) {
2099 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2100 if (tp->keepalive_time > elapsed)
2101 elapsed = tp->keepalive_time - elapsed;
2102 else
2103 elapsed = 0;
2104 tcp_reset_keepalive_timer(sk, elapsed);
2105 }
2106 }
2107 break;
2108 case TCP_KEEPINTVL:
2109 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2110 err = -EINVAL;
2111 else
2112 tp->keepalive_intvl = val * HZ;
2113 break;
2114 case TCP_KEEPCNT:
2115 if (val < 1 || val > MAX_TCP_KEEPCNT)
2116 err = -EINVAL;
2117 else
2118 tp->keepalive_probes = val;
2119 break;
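/* Illustrative sketch (editor's example, not part of the original file):
 * the three keepalive knobs above only take effect once SO_KEEPALIVE is
 * enabled on the socket.  Assumes "fd" is a connected TCP socket; the
 * values below start probing after 10 minutes of idle time, probe every
 * 60 seconds and drop the connection after 5 unanswered probes.
 *
 *	int on = 1, idle = 600, intvl = 60, cnt = 5;
 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
 *	setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
 *	setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
 *	setsockopt(fd, SOL_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
 */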
2120 case TCP_SYNCNT:
2121 if (val < 1 || val > MAX_TCP_SYNCNT)
2122 err = -EINVAL;
2123 else
2124 tp->syn_retries = val;
2125 break;
2127 case TCP_LINGER2:
2128 if (val < 0)
2129 tp->linger2 = -1;
2130 else if (val > sysctl_tcp_fin_timeout/HZ)
2131 tp->linger2 = 0;
2132 else
2133 tp->linger2 = val*HZ;
2134 break;
2136 case TCP_DEFER_ACCEPT:
2137 tp->defer_accept = 0;
2138 if (val > 0) {
2139 /* Translate value in seconds to number of retransmits */
2140 while (val > ((TCP_TIMEOUT_INIT/HZ)<<tp->defer_accept))
2141 tp->defer_accept++;
2142 tp->defer_accept++;
2143 }
2144 break;
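/* Worked example (editorial, assuming the usual TCP_TIMEOUT_INIT of 3*HZ):
 * a request of val = 10 seconds runs the loop above while 10 > 3 and
 * 10 > 6, stops at 10 <= 12 with defer_accept == 2, and the final
 * increment makes it 3 - i.e. the value in seconds is rounded up to the
 * number of SYN-ACK retransmit intervals (3s, 6s, 12s, ...) that covers it.
 */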
2146 case TCP_WINDOW_CLAMP:
2147 if (val==0) {
2148 if (sk->state != TCP_CLOSE) {
2149 err = -EINVAL;
2150 break;
2151 }
2152 tp->window_clamp = 0;
2153 } else {
2154 tp->window_clamp = val<SOCK_MIN_RCVBUF/2 ?
2155 SOCK_MIN_SNDBUF : val;
2156 }
2157 break;
2159 default:
2160 err = -ENOPROTOOPT;
2161 break;
2162 }
2163 release_sock(sk);
2164 return err;
2165 }
2167 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
2168 int *optlen)
2169 {
2170 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2171 int val, len;
2173 if(level != SOL_TCP)
2174 return tp->af_specific->getsockopt(sk, level, optname,
2175 optval, optlen);
2177 if(get_user(len,optlen))
2178 return -EFAULT;
2180 len = min(len, sizeof(int));
2182 switch(optname) {
2183 case TCP_MAXSEG:
2184 val = tp->mss_cache;
2185 if (val == 0 && ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN)))
2186 val = tp->user_mss;
2187 break;
2188 case TCP_NODELAY:
2189 val = (tp->nonagle == 1);
2190 break;
2191 case TCP_CORK:
2192 val = (tp->nonagle == 2);
2193 break;
2194 case TCP_KEEPIDLE:
2195 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time)/HZ;
2196 break;
2197 case TCP_KEEPINTVL:
2198 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl)/HZ;
2199 break;
2200 case TCP_KEEPCNT:
2201 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2202 break;
2203 case TCP_SYNCNT:
2204 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2205 break;
2206 case TCP_LINGER2:
2207 val = tp->linger2;
2208 if (val > 0)
2209 val = (val ? : sysctl_tcp_fin_timeout)/HZ;
2210 break;
2211 case TCP_DEFER_ACCEPT:
2212 val = tp->defer_accept == 0 ? 0 : (TCP_TIMEOUT_INIT<<(tp->defer_accept-1));
2213 break;
2214 case TCP_WINDOW_CLAMP:
2215 val = tp->window_clamp;
2216 break;
2217 default:
2218 return -ENOPROTOOPT;
2219 }
2221 if(put_user(len, optlen))
2222 return -EFAULT;
2223 if(copy_to_user(optval, &val,len))
2224 return -EFAULT;
2225 return 0;
2226 }
2229 extern void __skb_cb_too_small_for_tcp(int, int);
2231 void __init tcp_init(void)
2232 {
2233 struct sk_buff *skb = NULL;
2234 unsigned long goal;
2235 int order, i;
2237 if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2238 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2239 sizeof(skb->cb));
2241 tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2242 sizeof(struct open_request),
2243 0, SLAB_HWCACHE_ALIGN,
2244 NULL, NULL);
2245 if(!tcp_openreq_cachep)
2246 panic("tcp_init: Cannot alloc open_request cache.");
2248 tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2249 sizeof(struct tcp_bind_bucket),
2250 0, SLAB_HWCACHE_ALIGN,
2251 NULL, NULL);
2252 if(!tcp_bucket_cachep)
2253 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2255 tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2256 sizeof(struct tcp_tw_bucket),
2257 0, SLAB_HWCACHE_ALIGN,
2258 NULL, NULL);
2259 if(!tcp_timewait_cachep)
2260 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2262 /* Size and allocate the main established and bind bucket
2263 * hash tables.
2265 * The methodology is similar to that of the buffer cache.
2266 */
2267 goal = num_physpages >> (23 - PAGE_SHIFT);
2269 for(order = 0; (1UL << order) < goal; order++)
2270 ;
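/* Worked example (editorial): with 4 KB pages (PAGE_SHIFT == 12) and
 * 128 MB of RAM, num_physpages is 32768, so goal = 32768 >> 11 = 16 pages
 * and the loop above ends with order = 4, i.e. the first allocation
 * attempt below asks for a 64 KB established-hash table.
 */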
2271 do {
2272 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2273 sizeof(struct tcp_ehash_bucket);
2274 tcp_ehash_size >>= 1;
2275 while (tcp_ehash_size & (tcp_ehash_size-1))
2276 tcp_ehash_size--;
2277 tcp_ehash = (struct tcp_ehash_bucket *)
2278 __get_free_pages(GFP_ATOMIC, order);
2279 } while (tcp_ehash == NULL && --order > 0);
2281 if (!tcp_ehash)
2282 panic("Failed to allocate TCP established hash table\n");
2283 for (i = 0; i < (tcp_ehash_size<<1); i++) {
2284 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2285 tcp_ehash[i].chain = NULL;
2286 }
2288 do {
2289 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2290 sizeof(struct tcp_bind_hashbucket);
2291 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2292 continue;
2293 tcp_bhash = (struct tcp_bind_hashbucket *)
2294 __get_free_pages(GFP_ATOMIC, order);
2295 } while (tcp_bhash == NULL && --order >= 0);
2297 if (!tcp_bhash)
2298 panic("Failed to allocate TCP bind hash table\n");
2299 for (i = 0; i < tcp_bhash_size; i++) {
2300 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2301 tcp_bhash[i].chain = NULL;
2302 }
2304 /* Try to be a bit smarter and adjust defaults depending
2305 * on available memory.
2306 */
2307 if (order > 4) {
2308 sysctl_local_port_range[0] = 32768;
2309 sysctl_local_port_range[1] = 61000;
2310 sysctl_tcp_max_tw_buckets = 180000;
2311 sysctl_tcp_max_orphans = 4096<<(order-4);
2312 sysctl_max_syn_backlog = 1024;
2313 } else if (order < 3) {
2314 sysctl_local_port_range[0] = 1024*(3-order);
2315 sysctl_tcp_max_tw_buckets >>= (3-order);
2316 sysctl_tcp_max_orphans >>= (3-order);
2317 sysctl_max_syn_backlog = 128;
2318 }
2319 tcp_port_rover = sysctl_local_port_range[0] - 1;
2321 printk("TCP: Hash tables configured (established %d bind %d)\n",
2322 tcp_ehash_size<<1, tcp_bhash_size);
2323 }