net/ipv4/tcp.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp.c,v 1.158 2000/01/21 23:45:57 davem Exp $
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while
  26  *                                      sk->inuse=1 and was trying to connect
  27  *                                      (tcp_err()).
  28  *              Alan Cox        :       All icmp error handling was broken
  29  *                                      pointers passed were wrong and the
  30  *                                      socket was looked up backwards. Nobody
  31  *                                      tested any icmp error code obviously.
  32  *              Alan Cox        :       tcp_err() now handled properly. It
  33  *                                      wakes people on errors. poll
  34  *                                      behaves and the icmp error race
  35  *                                      has gone by moving it into sock.c
  36  *              Alan Cox        :       tcp_send_reset() fixed to work for
  37  *                                      everything not just packets for
  38  *                                      unknown sockets.
  39  *              Alan Cox        :       tcp option processing.
  40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
  41  *                                      syn rule wrong]
  42  *              Herp Rosmanith  :       More reset fixes
  43  *              Alan Cox        :       No longer acks invalid rst frames.
  44  *                                      Acking any kind of RST is right out.
  45  *              Alan Cox        :       Sets an ignore me flag on an rst
  46  *                                      receive otherwise odd bits of prattle
  47  *                                      escape still
  48  *              Alan Cox        :       Fixed another acking RST frame bug.
  49  *                                      Should stop LAN workplace lockups.
  50  *              Alan Cox        :       Some tidyups using the new skb list
  51  *                                      facilities
  52  *              Alan Cox        :       sk->keepopen now seems to work
  53  *              Alan Cox        :       Pulls options out correctly on accepts
  54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56  *                                      bit to skb ops.
  57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
  58  *                                      nasty.
  59  *              Alan Cox        :       Added some better commenting, as the
  60  *                                      tcp is hard to follow
  61  *              Alan Cox        :       Removed incorrect check for 20 * psh
  62  *      Michael O'Reilly        :       ack < copied bug fix.
  63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64  *              Alan Cox        :       FIN with no memory -> CRASH
  65  *              Alan Cox        :       Added socket option proto entries.
  66  *                                      Also added awareness of them to accept.
  67  *              Alan Cox        :       Added TCP options (SOL_TCP)
  68  *              Alan Cox        :       Switched wakeup calls to callbacks,
  69  *                                      so the kernel can layer network
  70  *                                      sockets.
  71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73  *              Alan Cox        :       RST frames sent on unsynchronised
  74  *                                      state ack error.
  75  *              Alan Cox        :       Put in missing check for SYN bit.
  76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
  77  *                                      window non shrink trick.
  78  *              Alan Cox        :       Added a couple of small NET2E timer
  79  *                                      fixes
  80  *              Charles Hedrick :       TCP fixes
  81  *              Toomas Tamm     :       TCP window fixes
  82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83  *              Charles Hedrick :       Rewrote most of it to actually work
  84  *              Linus           :       Rewrote tcp_read() and URG handling
  85  *                                      completely
  86  *              Gerhard Koerting:       Fixed some missing timer handling
  87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88  *              Gerhard Koerting:       PC/TCP workarounds
  89  *              Adam Caldwell   :       Assorted timer/timing errors
  90  *              Matthew Dillon  :       Fixed another RST bug
  91  *              Alan Cox        :       Move to kernel side addressing changes.
  92  *              Alan Cox        :       Beginning work on TCP fastpathing
  93  *                                      (not yet usable)
  94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95  *              Alan Cox        :       TCP fast path debugging
  96  *              Alan Cox        :       Window clamping
  97  *              Michael Riepe   :       Bug in tcp_check()
  98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  99  *              Matt Dillon     :       Yet more small nasties removed from the
 100  *                                      TCP code (Be very nice to this man if
 101  *                                      tcp finally works 100%) 8)
 102  *              Alan Cox        :       BSD accept semantics.
 103  *              Alan Cox        :       Reset on closedown bug.
 104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105  *              Michael Pall    :       Handle poll() after URG properly in
 106  *                                      all cases.
 107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
 108  *                                      (multi URG PUSH broke rlogin).
 109  *              Michael Pall    :       Fix the multi URG PUSH problem in
 110  *                                      tcp_readable(), poll() after URG
 111  *                                      works now.
 112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
 113  *                                      BSD api.
 114  *              Alan Cox        :       Changed the semantics of sk->socket to
 115  *                                      fix a race and a signal problem with
 116  *                                      accept() and async I/O.
 117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120  *                                      clients/servers which listen in on
 121  *                                      fixed ports.
 122  *              Alan Cox        :       Cleaned the above up and shrank it to
 123  *                                      a sensible code size.
 124  *              Alan Cox        :       Self connect lockup fix.
 125  *              Alan Cox        :       No connect to multicast.
 126  *              Ross Biro       :       Close unaccepted children on master
 127  *                                      socket close.
 128  *              Alan Cox        :       Reset tracing code.
 129  *              Alan Cox        :       Spurious resets on shutdown.
 130  *              Alan Cox        :       Giant 15 minute/60 second timer error
 131  *              Alan Cox        :       Small whoops in polling before an
 132  *                                      accept.
 133  *              Alan Cox        :       Kept the state trace facility since
 134  *                                      it's handy for debugging.
 135  *              Alan Cox        :       More reset handler fixes.
 136  *              Alan Cox        :       Started rewriting the code based on
 137  *                                      the RFC's for other useful protocol
 138  *                                      references see: Comer, KA9Q NOS, and
 139  *                                      for a reference on the difference
 140  *                                      between specifications and how BSD
 141  *                                      works see the 4.4lite source.
 142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
 143  *                                      close.
 144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146  *              Alan Cox        :       Reimplemented timers as per the RFC
 147  *                                      and using multiple timers for sanity.
 148  *              Alan Cox        :       Small bug fixes, and a lot of new
 149  *                                      comments.
 150  *              Alan Cox        :       Fixed dual reader crash by locking
 151  *                                      the buffers (much like datagram.c)
 152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153  *                                      now gets fed up of retrying without
 154  *                                      (even a no space) answer.
 155  *              Alan Cox        :       Extracted closing code better
 156  *              Alan Cox        :       Fixed the closing state machine to
 157  *                                      resemble the RFC.
 158  *              Alan Cox        :       More 'per spec' fixes.
 159  *              Jorge Cwik      :       Even faster checksumming.
 160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161  *                                      only frames. At least one pc tcp stack
 162  *                                      generates them.
 163  *              Alan Cox        :       Cache last socket.
 164  *              Alan Cox        :       Per route irtt.
 165  *              Matt Day        :       poll()->select() match BSD precisely on error
 166  *              Alan Cox        :       New buffers
 167  *              Marc Tamsky     :       Various sk->prot->retransmits and
 168  *                                      sk->retransmits misupdating fixed.
 169  *                                      Fixed tcp_write_timeout: stuck close,
 170  *                                      and TCP syn retries gets used now.
 171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172  *                                      ack if state is TCP_CLOSED.
 173  *              Alan Cox        :       Look up device on a retransmit - routes may
 174  *                                      change. Doesn't yet cope with MSS shrink right
 175  *                                      but it's a start!
 176  *              Marc Tamsky     :       Closing in closing fixes.
 177  *              Mike Shaver     :       RFC1122 verifications.
 178  *              Alan Cox        :       rcv_saddr errors.
 179  *              Alan Cox        :       Block double connect().
 180  *              Alan Cox        :       Small hooks for enSKIP.
 181  *              Alexey Kuznetsov:       Path MTU discovery.
 182  *              Alan Cox        :       Support soft errors.
 183  *              Alan Cox        :       Fix MTU discovery pathological case
 184  *                                      when the remote claims no mtu!
 185  *              Marc Tamsky     :       TCP_CLOSE fix.
 186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
 187  *                                      window but wrong (fixes NT lpd problems)
 188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
 189  *              Joerg Reuter    :       No modification of locked buffers in
 190  *                                      tcp_do_retransmit()
 191  *              Eric Schenk     :       Changed receiver side silly window
 192  *                                      avoidance algorithm to BSD style
 193  *                                      algorithm. This doubles throughput
 194  *                                      against machines running Solaris,
 195  *                                      and seems to result in general
 196  *                                      improvement.
 197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
 198  *      Willy Konynenberg       :       Transparent proxying support.
 199  *      Mike McLagan            :       Routing by source
 200  *              Keith Owens     :       Do proper merging with partial SKB's in
 201  *                                      tcp_do_sendmsg to avoid burstiness.
 202  *              Eric Schenk     :       Fix fast close down bug with
 203  *                                      shutdown() followed by close().
 204  *              Andi Kleen :    Make poll agree with SIGIO
 205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
 206  *                                      lingertime == 0 (RFC 793 ABORT Call)
 207  *
 208  *              This program is free software; you can redistribute it and/or
 209  *              modify it under the terms of the GNU General Public License
 210  *              as published by the Free Software Foundation; either version
 211  *              2 of the License, or(at your option) any later version.
 212  *
 213  * Description of States:
 214  *
 215  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 216  *
 217  *      TCP_SYN_RECV            received a connection request, sent ack,
 218  *                              waiting for final ack in three-way handshake.
 219  *
 220  *      TCP_ESTABLISHED         connection established
 221  *
 222  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 223  *                              transmission of remaining buffered data
 224  *
 225  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 226  *                              to shutdown
 227  *
 228  *      TCP_CLOSING             both sides have shutdown but we still have
 229  *                              data we have to finish sending
 230  *
 231  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 232  *                              closed, can only be entered from FIN_WAIT2
 233  *                              or CLOSING.  Required because the other end
 234  *                              may not have gotten our last ACK causing it
 235  *                              to retransmit the data packet (which we ignore)
 236  *
 237  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 238  *                              us to finish writing our data and to shutdown
 239  *                              (we have to close() to move on to LAST_ACK)
 240  *
 241  *      TCP_LAST_ACK            our side has shutdown after remote has
 242  *                              shutdown.  There may still be data in our
 243  *                              buffer that we have to finish sending
 244  *
 245  *      TCP_CLOSE               socket is finished
 246  */
 247
 248 /*
 249  * RFC1122 status:
 250  * NOTE: I'm not going to be doing comments in the code for this one except
 251  * for violations and the like.  tcp.c is just too big... If I say something
 252  * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 253  * with Alan. -- MS 950903
 254  * [Note: Most of the TCP code has been rewritten/redesigned since this
 255  *  RFC1122 check. It is probably not correct anymore. It should be redone
 256  *  before 2.2. -AK]
 257  *
 258  * Use of PSH (4.2.2.2)
 259  *   MAY aggregate data sent without the PSH flag. (does)
 260  *   MAY queue data received without the PSH flag. (does)
 261  *   SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 262  *   MAY implement PSH on send calls. (doesn't, thus:)
 263  *     MUST NOT buffer data indefinitely (doesn't [1 second])
 264  *     MUST set PSH on last segment (does)
 265  *   MAY pass received PSH to application layer (doesn't)
 266  *   SHOULD send maximum-sized segment whenever possible. (almost always does)
 267  *
 268  * Window Size (4.2.2.3, 4.2.2.16)
 269  *   MUST treat window size as an unsigned number (does)
 270  *   SHOULD treat window size as a 32-bit number (does not)
 271  *   MUST NOT shrink window once it is offered (does not normally)
 272  *
 273  * Urgent Pointer (4.2.2.4)
 274  * **MUST point urgent pointer to last byte of urgent data (not right
 275  *     after). (doesn't, to be like BSD. That's configurable, but defaults
 276  *      to off)
 277  *   MUST inform application layer asynchronously of incoming urgent
 278  *     data. (does)
 279  *   MUST provide application with means of determining the amount of
 280  *     urgent data pending. (does)
 281  * **MUST support urgent data sequence of arbitrary length. (doesn't, but
 282  *   it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 283  *      [Follows BSD 1 byte of urgent data]
 284  *
 285  * TCP Options (4.2.2.5)
 286  *   MUST be able to receive TCP options in any segment. (does)
 287  *   MUST ignore unsupported options (does)
 288  *
 289  * Maximum Segment Size Option (4.2.2.6)
 290  *   MUST implement both sending and receiving MSS. (does, but currently
 291  *      only uses the smaller of both of them)
 292  *   SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
 293  *     it always). (does, even when MSS == 536, which is legal)
 294  *   MUST assume MSS == 536 if no MSS received at connection setup (does)
 295  *   MUST calculate "effective send MSS" correctly:
 296  *     min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 297  *     (does - but allows operator override)
 298  *
 299  * TCP Checksum (4.2.2.7)
 300  *   MUST generate and check TCP checksum. (does)
 301  *
 302  * Initial Sequence Number Selection (4.2.2.8)
 303  *   MUST use the RFC 793 clock selection mechanism.  (doesn't, but it's
 304  *     OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 305  *     necessary for 10Mbps networks - and harder than BSD to spoof!
 306  *     With syncookies we don't)
 307  *
 308  * Simultaneous Open Attempts (4.2.2.10)
 309  *   MUST support simultaneous open attempts (does)
 310  *
 311  * Recovery from Old Duplicate SYN (4.2.2.11)
 312  *   MUST keep track of active vs. passive open (does)
 313  *
 314  * RST segment (4.2.2.12)
 315  *   SHOULD allow an RST segment to contain data (does, but doesn't do
 316  *     anything with it, which is standard)
 317  *
 318  * Closing a Connection (4.2.2.13)
 319  *   MUST inform application of whether connection was closed by RST or
 320  *     normal close. (does)
 321  *   MAY allow "half-duplex" close (treat connection as closed for the
 322  *     local app, even before handshake is done). (does)
 323  *   MUST linger in TIME_WAIT for 2 * MSL (does)
 324  *
 325  * Retransmission Timeout (4.2.2.15)
 326  *   MUST implement Jacobson's slow start and congestion avoidance
 327  *     stuff. (does)
 328  *
 329  * Probing Zero Windows (4.2.2.17)
 330  *   MUST support probing of zero windows. (does)
 331  *   MAY keep offered window closed indefinitely. (does)
 332  *   MUST allow remote window to stay closed indefinitely. (does)
 333  *
 334  * Passive Open Calls (4.2.2.18)
 335  *   MUST NOT let new passive open affect other connections. (doesn't)
 336  *   MUST support passive opens (LISTENs) concurrently. (does)
 337  *
 338  * Time to Live (4.2.2.19)
 339  *   MUST make TCP TTL configurable. (does - IP_TTL option)
 340  *
 341  * Event Processing (4.2.2.20)
 342  *   SHOULD queue out-of-order segments. (does)
 343  *   MUST aggregate ACK segments whenever possible. (does but badly)
 344  *
 345  * Retransmission Timeout Calculation (4.2.3.1)
 346  *   MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 347  *     calculation. (does, or at least explains them in the comments 8*b)
 348  *   SHOULD initialize RTT to 0 and RTO to 3 seconds. (does)
 349  *
 350  * When to Send an ACK Segment (4.2.3.2)
 351  *   SHOULD implement delayed ACK. (does)
 352  *   MUST keep ACK delay < 0.5 sec. (does)
 353  *
 354  * When to Send a Window Update (4.2.3.3)
 355  *   MUST implement receiver-side SWS. (does)
 356  *
 357  * When to Send Data (4.2.3.4)
 358  *   MUST implement sender-side SWS. (does)
 359  *   SHOULD implement Nagle algorithm. (does)
 360  *
 361  * TCP Connection Failures (4.2.3.5)
 362  *  MUST handle excessive retransmissions "properly" (see the RFC). (does)
 363  *   SHOULD inform application layer of soft errors. (does)
 364  *
 365  * TCP Keep-Alives (4.2.3.6)
 366  *   MAY provide keep-alives. (does)
 367  *   MUST make keep-alives configurable on a per-connection basis. (does)
 368  *   MUST default to no keep-alives. (does)
 369  *   MUST make keep-alive interval configurable. (does)
 370  *   MUST make default keep-alive interval >= 2 hours. (does)
 371  *   MUST NOT interpret failure to ACK keep-alive packet as dead
 372  *     connection. (doesn't)
 373  *   SHOULD send keep-alive with no data. (does)
 374  *
 375  * TCP Multihoming (4.2.3.7)
 376  *   MUST get source address from IP layer before sending first
 377  *     SYN. (does)
 378  *   MUST use same local address for all segments of a connection. (does)
 379  *
 380  * IP Options (4.2.3.8)
 381  *   MUST ignore unsupported IP options. (does)
 382  *   MAY support Time Stamp and Record Route. (does)
 383  *   MUST allow application to specify a source route. (does)
 384  *   MUST allow received Source Route option to set route for all future
 385  *     segments on this connection. (does not (security issues))
 386  *
 387  * ICMP messages (4.2.3.9)
 388  *   MUST act on ICMP errors. (does)
 389  *   MUST slow transmission upon receipt of a Source Quench. (doesn't anymore
 390  *   because that is deprecated now by the IETF, can be turned on)
 391  *   MUST NOT abort connection upon receipt of soft Destination
 392  *     Unreachables (0, 1, 5), Time Exceededs and Parameter
 393  *     Problems. (doesn't)
 394  *   SHOULD report soft Destination Unreachables etc. to the
 395  *     application. (does, except during SYN_RECV and may drop messages
 396  *     in some rare cases before accept() - ICMP is unreliable)
 397  *   SHOULD abort connection upon receipt of hard Destination Unreachable
 398  *     messages (2, 3, 4). (does, but see above)
 399  *
 400  * Remote Address Validation (4.2.3.10)
 401  *   MUST reject as an error OPEN for invalid remote IP address. (does)
 402  *   MUST ignore SYN with invalid source address. (does)
 403  *   MUST silently discard incoming SYN for broadcast/multicast
 404  *     address. (does)
 405  *
 406  * Asynchronous Reports (4.2.4.1)
 407  * MUST provide mechanism for reporting soft errors to application
 408  *     layer. (does)
 409  *
 410  * Type of Service (4.2.4.2)
 411  *   MUST allow application layer to set Type of Service. (does IP_TOS)
 412  *
 413  * (Whew. -- MS 950903)
 414  * (Updated by AK, but not complete yet.)
 415  **/
 416
 417 #include <linux/config.h>
 418 #include <linux/types.h>
 419 #include <linux/fcntl.h>
 420 #include <linux/poll.h>
 421 #include <linux/init.h>
 422 #include <linux/smp_lock.h>
 423
 424 #include <net/icmp.h>
 425 #include <net/tcp.h>
 426
 427 #include <asm/uaccess.h>
 428
 429 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 430
 431 struct tcp_mib  tcp_statistics[NR_CPUS*2];
 432
 433 kmem_cache_t *tcp_openreq_cachep;
 434 kmem_cache_t *tcp_bucket_cachep;
 435 kmem_cache_t *tcp_timewait_cachep;
 436
 437 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
 438
 439 /*
 440  * LISTEN is a special case for poll..
 441  */
 442 static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
 443 {
 444         return sk->tp_pinfo.af_tcp.accept_queue ? (POLLIN | POLLRDNORM) : 0;
 445 }
 446
 447 /*
 448  *      Compute minimal free write space needed to queue new packets.
 449  */
 450 #define tcp_min_write_space(__sk) \
 451         (atomic_read(&(__sk)->wmem_alloc) / 2)
 452
 453 /*
 454  *      Wait for a TCP event.
 455  *
 456  *      Note that we don't need to lock the socket, as the upper poll layers
 457  *      take care of normal races (between the test and the event) and we don't
 458  *      go look at any of the socket buffers directly.
 459  */
 460 unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
 461 {
 462         unsigned int mask;
 463         struct sock *sk = sock->sk;
 464         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 465
 466         poll_wait(file, sk->sleep, wait);
 467         if (sk->state == TCP_LISTEN)
 468                 return tcp_listen_poll(sk, wait);
 469
 470         /* Socket is not locked. We are protected from async events
 471            by poll logic and correct handling of state changes
 472            made by another threads is impossible in any case.
 473          */
 474
 475         mask = 0;
 476         if (sk->err)
 477                 mask = POLLERR;
 478
 479         /*
 480          * POLLHUP is certainly not done right. But poll() doesn't
 481          * have a notion of HUP in just one direction, and for a
 482          * socket the read side is more interesting.
 483          *
 484          * Some poll() documentation says that POLLHUP is incompatible
 485          * with the POLLOUT/POLLWR flags, so somebody should check this
 486          * all. But careful, it tends to be safer to return too many
 487          * bits than too few, and you can easily break real applications
 488          * if you don't tell them that something has hung up!
 489          *
 490          * Check-me.
 491          *
 492          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
 493          * our fs/select.c). It means that after we received EOF,
 494          * poll always returns immediately, making impossible poll() on write()
 495          * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
 496          * if and only if shutdown has been made in both directions.
 497          * Actually, it is interesting to look how Solaris and DUX
 498          * solve this dilemma. I would prefer, if PULLHUP were maskable,
 499          * then we could set it on SND_SHUTDOWN. BTW examples given
 500          * in Stevens' books assume exactly this behaviour, it explains
 501          * why PULLHUP is incompatible with POLLOUT.    --ANK
 502          *
 503          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
 504          * blocking on fresh not-connected or disconnected socket. --ANK
 505          */
 506         if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE)
 507                 mask |= POLLHUP;
 508         if (sk->shutdown & RCV_SHUTDOWN)
 509                 mask |= POLLIN | POLLRDNORM;
 510
 511         /* Connected? */
 512         if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
 513                 if ((tp->rcv_nxt != tp->copied_seq) &&
 514                     (tp->urg_seq != tp->copied_seq ||
 515                      tp->rcv_nxt != tp->copied_seq+1 ||
 516                      sk->urginline || !tp->urg_data))
 517                         mask |= POLLIN | POLLRDNORM;
 518
 519                 if (!(sk->shutdown & SEND_SHUTDOWN)) {
 520                         if (sock_wspace(sk) >= tcp_min_write_space(sk)) {
 521                                 mask |= POLLOUT | POLLWRNORM;
 522                         } else {  /* send SIGIO later */
 523                                 sk->socket->flags |= SO_NOSPACE;
 524                         }
 525                 }
 526
 527                 if (tp->urg_data & TCP_URG_VALID)
 528                         mask |= POLLPRI;
 529         }
 530         return mask;
 531 }
 532
 533 /*
 534  *      Socket write_space callback.
 535  *      This (or rather the sock_wake_async) should agree with poll.
 536  *
 537  *      WARNING. This callback is called from any context (process,
 538  *      bh or irq). Do not make anything more smart from it.
 539  */
 540 void tcp_write_space(struct sock *sk)
 541 {
 542         read_lock(&sk->callback_lock);
 543         if (!sk->dead) {
 544                 /* Why??!! Does it really not overshedule? --ANK */
 545                 wake_up_interruptible(sk->sleep);
 546
 547                 if (sock_wspace(sk) >= tcp_min_write_space(sk))
 548                         sock_wake_async(sk->socket, 2, POLL_OUT);
 549         }
 550         read_unlock(&sk->callback_lock);
 551 }
 552
 553 /* Listening TCP sockets never sleep to wait for memory, so
 554  * it is completely silly to wake them up on queue space
 555  * available events.  So we hook them up to this dummy callback.
 556  */
 557 static void tcp_listen_write_space(struct sock *sk)
 558 {
 559 }
 560
 561 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 562 {
             /* TCP ioctl handler for SIOCINQ, SIOCATMARK and SIOCOUTQ.
              *
              * The integer answer is stored with put_user() at the user
              * address carried in arg, and put_user()'s result is returned.
              * SIOCINQ and SIOCOUTQ return -EINVAL on a listening socket;
              * any other command returns -ENOIOCTLCMD.
              */
 563         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 564         int answ;
 565
 566         switch(cmd) {
 567         case SIOCINQ:
 568                 if (sk->state == TCP_LISTEN)
 569                         return(-EINVAL);
 570
                     /* Answer, computed with the socket locked:
                      *  - 0 while still in SYN_SENT/SYN_RECV;
                      *  - rcv_nxt - copied_seq when urgent data is inline,
                      *    absent, or its sequence number is outside
                      *    [copied_seq, rcv_nxt);
                      *  - otherwise only the bytes up to the urgent mark,
                      *    urg_seq - copied_seq.
                      * (before() is presumably a wrap-safe sequence-number
                      * "less than" -- confirm against its definition.)
                      */
 571                 lock_sock(sk);
 572                 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
 573                         answ = 0;
 574                 else if (sk->urginline || !tp->urg_data ||
 575                          before(tp->urg_seq,tp->copied_seq) ||
 576                          !before(tp->urg_seq,tp->rcv_nxt))
 577                         answ = tp->rcv_nxt - tp->copied_seq;
 578                 else
 579                         answ = tp->urg_seq - tp->copied_seq;
 580                 release_sock(sk);
 581                 break;
 582         case SIOCATMARK:
 583                 {
                             /* 1 when urgent data is recorded and the read
                              * position sits exactly on the urgent sequence
                              * number, else 0.
                              * NOTE(review): unlike SIOCINQ this reads tp
                              * without lock_sock() -- confirm that is
                              * intended.
                              */
 584                         answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
 585                         break;
 586                 }
 587         case SIOCOUTQ:
 588                 if (sk->state == TCP_LISTEN)
 589                         return(-EINVAL);
 590
                     /* 0 while connecting, otherwise write_seq - snd_una
                      * (also sampled without lock_sock()).
                      */
 591                 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
 592                         answ = 0;
 593                 else
 594                         answ = tp->write_seq - tp->snd_una;
 595                 break;
 596         default:
 597                 return(-ENOIOCTLCMD);
 598         };
 599
 600         return put_user(answ, (int *)arg);
 601 }
 602
 603
 604 int tcp_listen_start(struct sock *sk)
 605 {
             /* Switch sk into TCP_LISTEN.
              *
              * Resets the accept-backlog counters and accept queue, allocates
              * and installs a zeroed tcp_listen_opt, obtains a local port if
              * the socket has none (sk->num == 0), hashes the socket and
              * replaces its write_space callback with the listening dummy.
              *
              * Returns 0 on success, -ENOMEM if the tcp_listen_opt cannot be
              * allocated, or -EAGAIN if get_port() fails (the socket is then
              * put back into TCP_CLOSE and the listen_opt freed).
              */
 606         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 607         struct tcp_listen_opt *lopt;
 608
 609         sk->max_ack_backlog = 0;
 610         sk->ack_backlog = 0;
 611         tp->accept_queue = NULL;
 612         tp->syn_wait_lock = RW_LOCK_UNLOCKED;
 613
 614         lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
 615         if (!lopt)
 616                 return -ENOMEM;
 617
 618         memset(lopt, 0, sizeof(struct tcp_listen_opt));
             /* max_qlen_log = smallest value >= 6 whose power of two reaches
              * sysctl_max_syn_backlog.
              * NOTE(review): the loop has no upper bound; it relies on
              * sysctl_max_syn_backlog being small enough for 1<<n to reach
              * it without overflowing int -- confirm the sysctl is limited.
              */
 619         for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
 620                 if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog)
 621                         break;
 622
             /* Publish the table under the write side of syn_wait_lock. */
 623         write_lock_bh(&tp->syn_wait_lock);
 624         tp->listen_opt = lopt;
 625         write_unlock_bh(&tp->syn_wait_lock);
 626
 627         sk->state = TCP_LISTEN;
 628         if (sk->num == 0) {
                     /* No local port yet: ask the protocol for one, and undo
                      * everything done so far if that fails.
                      */
 629                 if (sk->prot->get_port(sk, 0) != 0) {
 630                         sk->state = TCP_CLOSE;
 631                         write_lock_bh(&tp->syn_wait_lock);
 632                         tp->listen_opt = NULL;
 633                         write_unlock_bh(&tp->syn_wait_lock);
 634                         kfree(lopt);
 635                         return -EAGAIN;
 636                 }
 637                 sk->sport = htons(sk->num);
 638         } else {
                     /* Already has a port: sk->prev is treated as the
                      * tcp_bind_bucket and its fastreuse flag is cleared.
                      */
 639                 if (sk->prev)
 640                         ((struct tcp_bind_bucket*)sk->prev)->fastreuse = 0;
 641         }
 642
 643         sk_dst_reset(sk);
 644         sk->prot->hash(sk);
 645         sk->socket->flags |= SO_ACCEPTCON;
 646         sk->write_space = tcp_listen_write_space;
 647
 648         return 0;
 649 }
 650
 651 /*
 652  *      This routine closes sockets which have been at least partially
 653  *      opened, but not yet accepted.
      *
      *      It stops the keepalive timer, detaches tp->listen_opt (under
      *      syn_wait_lock) and tp->accept_queue from the listener, frees
      *      every request still chained in the SYN table, and finally
      *      disconnects, orphans and destroys the child socket of each
      *      request that was waiting on the accept queue.
 654  */
 655
 656 static void tcp_listen_stop (struct sock *sk)
 657 {
 658         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
             /* NOTE(review): lopt is dereferenced below without a NULL check;
              * presumably callers only get here after tcp_listen_start()
              * succeeded -- confirm.
              */
 659         struct tcp_listen_opt *lopt = tp->listen_opt;
 660         struct open_request *acc_req = tp->accept_queue;
 661         struct open_request *req;
 662         int i;
 663
 664         tcp_delete_keepalive_timer(sk);
 665
 666         /* make all the listen_opt local to us */
 667         write_lock_bh(&tp->syn_wait_lock);
 668         tp->listen_opt =NULL;
 669         write_unlock_bh(&tp->syn_wait_lock);
 670         tp->accept_queue = NULL;
 671
             /* Drain every hash chain of the SYN table, keeping qlen in step. */
 672         if (lopt->qlen) {
 673                 for (i=0; i<TCP_SYNQ_HSIZE; i++) {
 674                         while ((req = lopt->syn_table[i]) != NULL) {
 675                                 lopt->syn_table[i] = req->dl_next;
 676                                 lopt->qlen--;
 677                                 tcp_openreq_free(req);
 678
 679                 /* Following specs, it would be better either to send FIN
 680                  * (and enter FIN-WAIT-1, it is normal close)
 681                  * or to send active reset (abort).
 682                  * Certainly, it is pretty dangerous while synflood, but it is
 683                  * bad justification for our negligence 8)
 684                  * To be honest, we are not able to make either
 685                  * of the variants now.                 --ANK
 686                  */
 687                         }
 688                 }
 689         }
 690         BUG_TRAP(lopt->qlen == 0);
 691
 692         kfree(lopt);
 693
             /* Tear down each child that was queued for accept(). */
 694         while ((req=acc_req) != NULL) {
 695                 struct sock *child = req->sk;
 696
                     /* Step to the next request first: req itself is handed
                      * to tcp_openreq_fastfree() at the bottom of the loop.
                      */
 697                 acc_req = req->dl_next;
 698
 699                 local_bh_disable();
 700                 bh_lock_sock(child);
 701                 BUG_TRAP(child->lock.users==0);
                     /* Reference taken here is dropped by sock_put() after the
                      * unlock; presumably it keeps child valid across
                      * tcp_destroy_sock() -- confirm.
                      */
 702                 sock_hold(child);
 703
 704                 tcp_disconnect(child, O_NONBLOCK);
 705
 706                 sock_orphan(child);
 707
 708                 atomic_inc(&tcp_orphan_count);
 709
 710                 tcp_destroy_sock(child);
 711
 712                 bh_unlock_sock(child);
 713                 local_bh_enable();
 714                 sock_put(child);
 715
 716                 tcp_acceptq_removed(sk);
 717                 tcp_openreq_fastfree(req);
 718         }
 719         BUG_TRAP(sk->ack_backlog == 0);
 720 }
 721
 722 /*
 723  *      Wait for a socket to get into the connected state
 724  *
 725  *      Note: Must be called with the socket locked.
      *
      *      The socket lock is dropped around each sleep and re-taken
      *      before the state is examined again.  *timeo_p is updated with
      *      the value returned by schedule_timeout().
      *
      *      Returns 0 once the state is ESTABLISHED or CLOSE_WAIT.
      *      Otherwise returns sock_error() if sk->err is set, -EPIPE if
      *      the state is neither SYN_SENT nor SYN_RECV (SIGPIPE is sent
      *      first when sk->keepopen is set and MSG_NOSIGNAL is not),
      *      -EAGAIN when *timeo_p is zero, or -ERESTARTSYS when a signal
      *      is pending.
 726  */
 727 static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p)
 728 {
 729         struct task_struct *tsk = current;
 730         DECLARE_WAITQUEUE(wait, tsk);
 731
 732         while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
 733                 if(sk->err)
 734                         return sock_error(sk);
 735                 if((1 << sk->state) &
 736                    ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 737                         if(sk->keepopen && !(flags&MSG_NOSIGNAL))
 738                                 send_sig(SIGPIPE, tsk, 0);
 739                         return -EPIPE;
 740                 }
 741                 if(!*timeo_p)
 742                         return -EAGAIN;
 743                 if(signal_pending(tsk))
 744                         return -ERESTARTSYS;
 745
                     /* Become a sleeper on sk->sleep before dropping the
                      * socket lock; write_pending is raised for the duration
                      * of the sleep.
                      */
 746                 __set_task_state(tsk, TASK_INTERRUPTIBLE);
 747                 add_wait_queue(sk->sleep, &wait);
 748                 sk->tp_pinfo.af_tcp.write_pending++;
 749
 750                 release_sock(sk);
 751                 *timeo_p = schedule_timeout(*timeo_p);
 752                 lock_sock(sk);
 753
 754                 __set_task_state(tsk, TASK_RUNNING);
 755                 remove_wait_queue(sk->sleep, &wait);
 756                 sk->tp_pinfo.af_tcp.write_pending--;
 757         }
 758         return 0;
 759 }
 760
 761 static inline int tcp_memory_free(struct sock *sk)
 762 {
             /* Non-zero while the allocated write memory (wmem_alloc) is
              * still below the socket's send buffer limit (sndbuf).
              */
 763         return atomic_read(&sk->wmem_alloc) < sk->sndbuf;
 764 }
 765
 766 /*
 767  *      Wait for more memory for a socket
      *
      *      Returns at once, with timeo unchanged, when tcp_memory_free()
      *      already holds.  Otherwise clears SO_NOSPACE and sleeps on
      *      sk->sleep until a signal is pending, write memory is free
      *      again, the send side is shut down or sk->err is set.  The
      *      socket lock is released around each sleep.  The return value
      *      is the last value handed back by schedule_timeout().
      *
      *      NOTE(review): none of the exit tests looks at timeo, so an
      *      expired timeout by itself does not end the loop -- confirm
      *      this is intended.
 768  */
 769 static long wait_for_tcp_memory(struct sock * sk, long timeo)
 770 {
 771         if (!tcp_memory_free(sk)) {
 772                 DECLARE_WAITQUEUE(wait, current);
 773
 774                 sk->socket->flags &= ~SO_NOSPACE;
 775                 add_wait_queue(sk->sleep, &wait);
 776                 for (;;) {
 777                         set_current_state(TASK_INTERRUPTIBLE);
 778
 779                         if (signal_pending(current))
 780                                 break;
 781                         if (tcp_memory_free(sk))
 782                                 break;
 783                         if (sk->shutdown & SEND_SHUTDOWN)
 784                                 break;
 785                         if (sk->err)
 786                                 break;
 787                         release_sock(sk);
                             /* Re-test after release_sock(): only sleep if
                              * memory is still short at this point.
                              */
 788                         if (!tcp_memory_free(sk))
 789                                 timeo = schedule_timeout(timeo);
 790                         lock_sock(sk);
 791                 }
 792                 current->state = TASK_RUNNING;
 793                 remove_wait_queue(sk->sleep, &wait);
 794         }
 795         return timeo;
 796 }
 797
 798 /* When all user supplied data has been queued set the PSH bit */
     /* (Reads the caller's local variables seglen and iovlen, so it is only
      * usable inside tcp_sendmsg(); it is #undef'd right after that function.)
      */
 799 #define PSH_NEEDED (seglen == 0 && iovlen == 0)
 800
 801 /*
 802  *      This routine copies from a user buffer into a socket,
 803  *      and starts the transmit system.
      *
      *      sk   - sending socket; locked here for the whole operation
      *      msg  - msg_iov/msg_iovlen describe the user buffers, msg_flags
      *             supplies MSG_DONTWAIT, MSG_OOB and MSG_NOSIGNAL
      *      size - not referenced in this function
      *
      *      Returns the number of bytes queued if anything was copied
      *      before a socket error, shutdown or interruption; otherwise a
      *      negative errno: -EPIPE, -EAGAIN, -ERESTARTSYS, the value of
      *      sock_error(), or whatever wait_for_tcp_connect() reported.
      *      A user-copy fault that reaches do_fault/do_fault2 returns
      *      -EFAULT even when earlier data was already queued.
 804  */
 805
 806 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size)
 807 {
 808         struct iovec *iov;
 809         struct tcp_opt *tp;
 810         struct sk_buff *skb;
 811         int iovlen, flags;
 812         int mss_now;
 813         int err, copied;
 814         long timeo;
 815
 816         err = 0;
 817         tp = &(sk->tp_pinfo.af_tcp);
 818
 819         lock_sock(sk);
 820         TCP_CHECK_TIMER(sk);
 821
 822         flags = msg->msg_flags;
 823
 824         timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
 825
 826         /* Wait for a connection to finish. */
             /* On failure this jumps to out_unlock, skipping the push at
              * "out:" because mss_now has not been computed yet.
              */
 827         if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 828                 if((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
 829                         goto out_unlock;
 830
 831         /* This should be in poll */
 832         sk->socket->flags &= ~SO_NOSPACE; /* clear SIGIO XXX */
 833
 834         mss_now = tcp_current_mss(sk);
 835
 836         /* Ok commence sending. */
 837         iovlen = msg->msg_iovlen;
 838         iov = msg->msg_iov;
 839         copied = 0;
 840
             /* Outer loop: one iovec entry per pass.  iovlen is decremented
              * up front, so it is 0 while the last entry is being handled,
              * which is what PSH_NEEDED tests.
              */
 841         while(--iovlen >= 0) {
 842                 int seglen=iov->iov_len;
 843                 unsigned char * from=iov->iov_base;
 844
 845                 iov++;
 846
                     /* Inner loop: queue this entry at most one MSS at a time. */
 847                 while(seglen > 0) {
 848                         int copy, tmp, queue_it;
 849
                             /* err can only be set here by a faulting copy in
                              * the tail-append path of the previous pass.
                              */
 850                         if (err)
 851                                 goto do_fault2;
 852
 853                         /* Stop on errors. */
 854                         if (sk->err)
 855                                 goto do_sock_err;
 856
 857                         /* Make sure that we are established. */
 858                         if (sk->shutdown & SEND_SHUTDOWN)
 859                                 goto do_shutdown;
 860
 861                         /* Now we need to check if we have a half
 862                          * built packet we can tack some data onto.
 863                          */
 864                         if (tp->send_head && !(flags & MSG_OOB)) {
 865                                 skb = sk->write_queue.prev;
 866                                 copy = skb->len;
 867                                 /* If the remote does SWS avoidance we should
 868                                  * queue the best we can if not we should in
 869                                  * fact send multiple packets...
 870                                  * A method for detecting this would be most
 871                                  * welcome.
 872                                  */
 873                                 if (skb_tailroom(skb) > 0 &&
 874                                     (mss_now - copy) > 0) {
                                             /* Despite the name this is non-zero
                                              * whenever the current length is not
                                              * a multiple of 4.  In that case the
                                              * data is copied plainly and the
                                              * checksum recomputed over the whole
                                              * skb instead of being continued.
                                              */
 875                                         int last_byte_was_odd = (copy % 4);
 876
                                             /* Append no more than fits in the
                                              * MSS, the tailroom and this segment.
                                              */
 877                                         copy = mss_now - copy;
 878                                         if(copy > skb_tailroom(skb))
 879                                                 copy = skb_tailroom(skb);
 880                                         if(copy > seglen)
 881                                                 copy = seglen;
 882                                         if(last_byte_was_odd) {
 883                                                 if(copy_from_user(skb_put(skb, copy),
 884                                                                   from, copy))
 885                                                         err = -EFAULT;
 886                                                 skb->csum = csum_partial(skb->data,
 887                                                                          skb->len, 0);
 888                                         } else {
 889                                                 skb->csum =
 890                                                         csum_and_copy_from_user(
 891                                                         from, skb_put(skb, copy),
 892                                                         copy, skb->csum, &err);
 893                                         }
 894                                         /*
 895                                          * FIXME: the *_user functions should
 896                                          *        return how much data was
 897                                          *        copied before the fault
 898                                          *        occurred and then a partial
 899                                          *        packet with this data should
 900                                          *        be sent.  Unfortunately
 901                                          *        csum_and_copy_from_user doesn't
 902                                          *        return this information.
 903                                          *        ATM it might send partly zeroed
 904                                          *        data in this case.
 905                                          */
                                             /* NOTE(review): a fault here is only
                                              * noticed by the "if (err)" test at the
                                              * top of the next inner pass.  If this
                                              * copy consumed the last bytes of the
                                              * last iovec, the success path after
                                              * the loops overwrites err with copied
                                              * -- confirm.
                                              */
 906                                         tp->write_seq += copy;
 907                                         TCP_SKB_CB(skb)->end_seq += copy;
 908                                         from += copy;
 909                                         copied += copy;
 910                                         seglen -= copy;
 911                                         if (PSH_NEEDED)
 912                                                 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 913                                         continue;
 914                                 }
 915                         }
 916
 917                         /* A chunk was here doing something strange
 918                          * with psh etc. It is deleted, because it was
 919                          * evident non-sense.                    --ANK
 920                          */
 921
 922                         copy = min(seglen, mss_now);
 923
 924                         /* Determine how large of a buffer to allocate.  */
 925                         tmp = MAX_TCP_HEADER + 15;
 926                         if (copy < mss_now && !(flags & MSG_OOB)) {
 927                                 tmp += mss_now;
 928
 929                                 /* What is happening here is that we want to
 930                                  * tack on later members of the users iovec
 931                                  * if possible into a single frame.  When we
 932                                  * leave this loop our caller checks to see if
 933                                  * we can send queued frames onto the wire.
 934                                  * See tcp_v[46]_sendmsg() for this.
 935                                  */
 936                                 queue_it = 1;
 937                         } else {
 938                                 tmp += copy;
 939                                 queue_it = 0;
 940                         }
 941                         skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL);
 942
 943                         /* If we didn't get any memory, we need to sleep. */
 944                         if (skb == NULL) {
 945                                 sk->socket->flags |= SO_NOSPACE;
 946                                 if (!timeo) {
 947                                         err = -EAGAIN;
 948                                         goto do_interrupted;
 949                                 }
 950                                 if (signal_pending(current)) {
 951                                         err = -ERESTARTSYS;
 952                                         goto do_interrupted;
 953                                 }
                                     /* Push what is already queued before
                                      * sleeping for memory.
                                      */
 954                                 __tcp_push_pending_frames(sk, tp, mss_now);
 955                                 timeo = wait_for_tcp_memory(sk, timeo);
 956
 957                                 /* If SACK's were formed or PMTU events happened,
 958                                  * we must find out about it.
 959                                  */
 960                                 mss_now = tcp_current_mss(sk);
 961                                 continue;
 962                         }
 963
                             /* seglen is reduced before PSH_NEEDED is evaluated
                              * below, so PSH is set on the skb that carries the
                              * final bytes of the final iovec entry.
                              */
 964                         seglen -= copy;
 965
 966                         /* Prepare control bits for TCP header creation engine. */
 967                         TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK |
 968                                                   ((PSH_NEEDED) ?
 969                                                    TCPCB_FLAG_PSH : 0));
 970                         TCP_SKB_CB(skb)->sacked = 0;
 971                         if (flags & MSG_OOB) {
 972                                 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_URG;
 973                                 TCP_SKB_CB(skb)->urg_ptr = copy;
 974                         } else
 975                                 TCP_SKB_CB(skb)->urg_ptr = 0;
 976
 977                         /* TCP data bytes are SKB_PUT() on top, later
 978                          * TCP+IP+DEV headers are SKB_PUSH()'d beneath.
 979                          * Reserve header space and checksum the data.
 980                          */
 981                         skb_reserve(skb, MAX_TCP_HEADER);
 982                         skb->csum = csum_and_copy_from_user(from,
 983                                         skb_put(skb, copy), copy, 0, &err);
 984
 985                         if (err)
 986                                 goto do_fault;
 987
 988                         from += copy;
 989                         copied += copy;
 990
 991                         TCP_SKB_CB(skb)->seq = tp->write_seq;
 992                         TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy;
 993
 994                         /* This advances tp->write_seq for us. */
 995                         tcp_send_skb(sk, skb, queue_it, mss_now);
 996                 }
 997         }
             /* Everything was queued: clear sk->err and report the byte count. */
 998         sk->err = 0;
 999         err = copied;
1000         goto out;
1001
     /* The three labels below report a partial byte count when some data was
      * already queued, and the error itself only when nothing was.
      */
1002 do_sock_err:
1003         if(copied)
1004                 err = copied;
1005         else
1006                 err = sock_error(sk);
1007         goto out;
1008 do_shutdown:
1009         if(copied)
1010                 err = copied;
1011         else {
1012                 if (!(flags&MSG_NOSIGNAL))
1013                         send_sig(SIGPIPE, current, 0);
1014                 err = -EPIPE;
1015         }
1016         goto out;
1017 do_interrupted:
1018         if(copied)
1019                 err = copied;
1020         goto out;
     /* do_fault frees the freshly allocated skb that was never queued;
      * do_fault2 is entered when there is no such skb to free.
      */
1021 do_fault:
1022         kfree_skb(skb);
1023 do_fault2:
1024         err = -EFAULT;
1025 out:
1026         __tcp_push_pending_frames(sk, tp, mss_now);
1027         TCP_CHECK_TIMER(sk);
1028 out_unlock:
1029         release_sock(sk);
             /* NOTE(review): this push runs after release_sock(), i.e. without
              * the socket lock, and is also reached on the out_unlock path --
              * confirm it is safe and intended.
              */
1030         tcp_push_pending_frames(sk, tp);
1031         return err;
1032 }
1033
1034 #undef PSH_NEEDED
1035
1036 /*
1037  *      Handle reading urgent data. BSD has very simple semantics for
1038  *      this, no blocking and very strange errors 8)
      *
      *      timeo and addr_len are not referenced in this function.
      *
      *      Returns:
      *        -EINVAL   urgent data is delivered inline, none is recorded,
      *                  or it has already been read (TCP_URG_READ)
      *        -ENOTCONN sk->done is set
      *        1         one urgent byte was copied to msg->msg_iov
      *        len       (i.e. <= 0) when len was not positive; MSG_TRUNC is
      *                  then set in msg->msg_flags
      *        -EFAULT   the copy to the iovec failed
      *        0         no valid urgent byte and the socket is closed or
      *                  shut down for receive
      *        -EAGAIN   otherwise; this call never blocks
1039  */
1040
1041 static int tcp_recv_urg(struct sock * sk, long timeo,
1042                         struct msghdr *msg, int len, int flags,
1043                         int *addr_len)
1044 {
1045         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1046
1047         /* No URG data to read. */
1048         if (sk->urginline || !tp->urg_data || tp->urg_data == TCP_URG_READ)
1049                 return -EINVAL; /* Yes this is right ! */
1050
1051         if (sk->done)
1052                 return -ENOTCONN;
1053
1054         if (tp->urg_data & TCP_URG_VALID) {
1055                 int err = 0;
                     /* Conversion to char keeps only the low bits of urg_data;
                      * presumably that is the urgent byte itself, with the
                      * TCP_URG_* flags stored above it -- confirm.
                      */
1056                 char c = tp->urg_data;
1057
                     /* Unless peeking, mark the byte as consumed. */
1058                 if (!(flags & MSG_PEEK))
1059                         tp->urg_data = TCP_URG_READ;
1060
1061                 /* Read urgent data. */
1062                 msg->msg_flags|=MSG_OOB;
1063
1064                 if(len>0) {
1065                         err = memcpy_toiovec(msg->msg_iov, &c, 1);
1066                         len = 1;
1067                 } else
1068                         msg->msg_flags|=MSG_TRUNC;
1069
1070                 return err ? -EFAULT : len;
1071         }
1072
1073         /* Do not set sk->done, it is set only by normal data receive */
1074         if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN))
1075                 return 0;
1076
1077         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1078          * the available implementations agree in this case:
1079          * this call should never block, independent of the
1080          * blocking state of the socket.
1081          * Mike <pall@rz.uni-karlsruhe.de>
1082          */
1083         return -EAGAIN;
1084 }
1085
1086 /*
1087  *      Release a skb if it is no longer needed. This routine
1088  *      must be called with interrupts disabled or with the
1089  *      socket locked so that the sk_buff queue operation is ok.
      *
      *      The skb is unlinked from sk->receive_queue and then freed
      *      with __kfree_skb(); BUG_TRAP checks that skb->users is
      *      exactly 1 at that point.
1090  */
1091
1092 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
1093 {
1094         __skb_unlink(skb, &sk->receive_queue);
1095         BUG_TRAP(atomic_read(&skb->users) == 1);
1096         /* Well, if I missed something then punishment will be terrible oops. */
1097         __kfree_skb(skb);
1098 }
1099
1100 /* Clean up the receive buffer for full frames taken by the user,
1101  * then send an ACK if necessary.  COPIED is the number of bytes
1102  * tcp_recvmsg has given to the user so far, it speeds up the
1103  * calculation of whether or not we must ACK for the sake of
1104  * a window update.
      *
      * An ACK is sent when any of these holds:
      *  - an ACK is pending and was blocked (tp->ack.blocked);
      *  - an ACK is pending and more than ack.rcv_mss bytes have arrived
      *    since rcv_wup;
      *  - data was copied, receive is not shut down, and the window that
      *    could now be advertised is non-zero and at least twice the
      *    current one, subject to the window_clamp test below.
1105  */
1106 static void cleanup_rbuf(struct sock *sk, int copied)
1107 {
1108         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1109         struct sk_buff *skb;
1110         int time_to_ack;
1111
1112         /* NOTE! The socket must be locked, so that we don't get
1113          * a messed-up receive queue.
1114          */
             /* Free skbs from the head of the queue for as long as they are
              * flagged used; stop at the first one that is not.
              */
1115         while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
1116                 if (!skb->used)
1117                         break;
1118                 tcp_eat_skb(sk, skb);
1119         }
1120
1121         /* Delayed ACKs frequently hit locked sockets during bulk receive. */
1122         time_to_ack = tp->ack.blocked && tp->ack.pending;
1123 #if 1/*def CONFIG_TCP_MORE_COARSE_ACKS*/
1124         if (tp->ack.pending &&
1125             (tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss)
1126                 time_to_ack = 1;
1127 #endif
1128
1129         /* We send an ACK if we can now advertise a non-zero window
1130          * which has been raised "significantly".
1131          *
1132          * Even if window raised up to infinity, do not send window open ACK
1133          * in states, where we will not receive more. It is useless.
1134          */
1135         if(copied > 0 && !time_to_ack && !(sk->shutdown&RCV_SHUTDOWN)) {
1136                 __u32 rcv_window_now = tcp_receive_window(tp);
1137                 __u32 new_window = __tcp_select_window(sk);
1138
1139                 /* We won't be raising the window any further than
1140                  * the window-clamp allows.  Our window selection
1141                  * also keeps things a nice multiple of MSS.  These
1142                  * checks are necessary to prevent spurious ACKs
1143                  * which don't advertise a larger window.
1144                  */
1145                 if((new_window && (new_window >= rcv_window_now * 2)) &&
1146                    ((rcv_window_now + tp->ack.rcv_mss) <= tp->window_clamp))
1147                         time_to_ack = 1;
1148         }
1149         if (time_to_ack)
1150                 tcp_send_ack(sk);
1151 }
1152
1153 /* Now socket state including sk->err is changed only under lock,
1154  * hence we may omit checks after joining wait queue.
1155  * We check receive queue before schedule() only as optimization;
1156  * it is very likely that release_sock() added new data.
      *
      * Sleeps interruptibly on sk->sleep for at most timeo, with the
      * socket lock released and SO_WAITDATA set for the duration.  The
      * lock is held again on return.  Returns the value handed back by
      * schedule_timeout(), or timeo unchanged when the receive queue was
      * already non-empty after release_sock() and no sleep took place.
1157  */
1158
1159 static long tcp_data_wait(struct sock *sk, long timeo)
1160 {
1161         DECLARE_WAITQUEUE(wait, current);
1162
             /* Join the wait queue and change task state before the lock is
              * dropped, so the sleeper is registered by the time
              * release_sock() runs.
              */
1163         add_wait_queue(sk->sleep, &wait);
1164
1165         __set_current_state(TASK_INTERRUPTIBLE);
1166
1167         sk->socket->flags |= SO_WAITDATA;
1168         release_sock(sk);
1169
1170         if (skb_queue_empty(&sk->receive_queue))
1171                 timeo = schedule_timeout(timeo);
1172
1173         lock_sock(sk);
1174         sk->socket->flags &= ~SO_WAITDATA;
1175
1176         remove_wait_queue(sk->sleep, &wait);
1177         __set_current_state(TASK_RUNNING);
1178         return timeo;
1179 }
1180
1181 static void tcp_prequeue_process(struct sock *sk)
1182 {
             /* Drain tp->ucopy.prequeue: every queued skb is handed to
              * sk->backlog_rcv() with bottom halves disabled, after which the
              * prequeue memory counter is reset.  The number of skbs queued
              * on entry is added to this CPU's TCPPrequeued statistic.
              */
1183         struct sk_buff *skb;
1184         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1185
1186         net_statistics[smp_processor_id()*2+1].TCPPrequeued += skb_queue_len(&tp->ucopy.prequeue);
1187
1188         /* RX process wants to run with disabled BHs, though it is not necessary */
1189         local_bh_disable();
1190         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1191                 sk->backlog_rcv(sk, skb);
1192         local_bh_enable();
1193
1194         /* Clear memory counter. */
1195         tp->ucopy.memory = 0;
1196 }
1197
1198 /*
1199  *      This routine copies from a sock struct into the user buffer.
1200  *
1201  *      Technical note: in 2.3 we work on _locked_ socket, so that
1202  *      tricks with *seq access order and skb->users are not required.
1203  *      Probably, code can be easily improved even more.
1204  */
1205
1206 int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
1207                 int len, int nonblock, int flags, int *addr_len)
1208 {
1209         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1210         int copied = 0;
1211         u32 peek_seq;
1212         u32 *seq;
1213         unsigned long used;
1214         int err;
1215         int target;             /* Read at least this many bytes */
1216         long timeo;
1217         struct task_struct *user_recv = NULL;
1218
1219         lock_sock(sk);
1220
1221         TCP_CHECK_TIMER(sk);
1222
1223
1224         if (sk->err)
1225                 goto out_err;
1226
1227         err = -ENOTCONN;
1228         if (sk->state == TCP_LISTEN)
1229                 goto out;
1230
1231         timeo = sock_rcvtimeo(sk, nonblock);
1232
1233         /* Urgent data needs to be handled specially. */
1234         if (flags & MSG_OOB)
1235                 goto recv_urg;
1236
1237         seq = &tp->copied_seq;
1238         if (flags & MSG_PEEK) {
1239                 peek_seq = tp->copied_seq;
1240                 seq = &peek_seq;
1241         }
1242
1243         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1244
1245         /*
1246          *      BUG BUG BUG
1247          *      This violates 1003.1g compliance. We must wait for
1248          *      data to exist even if we read none!
1249          */
1250
1251         while (len > 0) {
1252                 struct sk_buff * skb;
1253                 u32 offset;
1254
1255                 /* Are we at urgent data? Stop if we have read anything. */
1256                 if (copied && tp->urg_data && tp->urg_seq == *seq)
1257                         break;
1258
1259                 /* We need to check signals first, to get correct SIGURG
1260                  * handling. FIXME: Need to check this doesnt impact 1003.1g
1261                  * and move it down to the bottom of the loop
1262                  */
1263                 if (signal_pending(current)) {
1264                         if (copied)
1265                                 break;
1266                         copied = -ERESTARTSYS;
1267                         if (!timeo)
1268                                 copied = -EAGAIN;
1269                         break;
1270                 }
1271
1272                 /* Next get a buffer. */
1273
1274                 skb = skb_peek(&sk->receive_queue);
1275                 do {
1276                         if (!skb)
1277                                 break;
1278
1279                         /* Now that we have two receive queues this
1280                          * shouldn't happen.
1281                          */
1282                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1283                                 printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
1284                                        *seq, TCP_SKB_CB(skb)->seq);
1285                                 break;
1286                         }
1287                         offset = *seq - TCP_SKB_CB(skb)->seq;
1288                         if (skb->h.th->syn)
1289                                 offset--;
1290                         if (offset < skb->len)
1291                                 goto found_ok_skb;
1292                         if (skb->h.th->fin)
1293                                 goto found_fin_ok;
1294                         if (!(flags & MSG_PEEK))
1295                                 skb->used = 1;
1296                         skb = skb->next;
1297                 } while (skb != (struct sk_buff *)&sk->receive_queue);
1298
1299                 /* Well, if we have backlog, try to process it now yet. */
1300
1301                 if (copied >= target && sk->backlog.tail == NULL)
1302                         break;
1303
1304                 if (copied) {
1305                         if (sk->err ||
1306                             sk->state == TCP_CLOSE ||
1307                             (sk->shutdown & RCV_SHUTDOWN) ||
1308                             !timeo)
1309                                 break;
1310                 } else {
1311                         if (sk->err) {
1312                                 copied = sock_error(sk);
1313                                 break;
1314                         }
1315
1316                         if (sk->done) {
1317                                 copied = -ENOTCONN;
1318                                 break;
1319                         }
1320
1321                         if (sk->state == TCP_CLOSE) {
1322                                 if (!(flags&MSG_PEEK))
1323                                         sk->done = 1;
1324                                 break;
1325                         }
1326
1327                         if (sk->shutdown & RCV_SHUTDOWN)
1328                                 break;
1329
1330                         if (!timeo) {
1331                                 copied = -EAGAIN;
1332                                 break;
1333                         }
1334                 }
1335
1336                 cleanup_rbuf(sk, copied);
1337
1338                 if (tp->ucopy.task == user_recv) {
1339                         /* Install new reader */
1340                         if (user_recv == NULL && !(flags&MSG_PEEK)) {
1341                                 user_recv = current;
1342                                 tp->ucopy.task = user_recv;
1343                                 tp->ucopy.iov = msg->msg_iov;
1344                         }
1345
1346                         tp->ucopy.len = len;
1347
1348                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt);
1349
1350                         /* __ Set realtime policy in scheduler __ */
1351                 }
1352
1353                 if (copied >= target) {
1354                         /* Do not sleep, just process backlog. */
1355                         release_sock(sk);
1356                         lock_sock(sk);
1357                 } else {
1358                         timeo = tcp_data_wait(sk, timeo);
1359                 }
1360
1361                 if (user_recv) {
1362                         int chunk;
1363
1364                         /* __ Restore normal policy in scheduler __ */
1365
1366                         if ((chunk = len - tp->ucopy.len) != 0) {
1367                                 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromBacklog += chunk;
1368                                 len -= chunk;
1369                                 copied += chunk;
1370                         }
1371
1372                         if (tp->rcv_nxt == tp->copied_seq &&
1373                             skb_queue_len(&tp->ucopy.prequeue)) {
1374                                 tcp_prequeue_process(sk);
1375
1376                                 if ((chunk = len - tp->ucopy.len) != 0) {
1377                                         net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1378                                         len -= chunk;
1379                                         copied += chunk;
1380                                 }
1381                         }
1382 #if 1/*def CONFIG_TCP_MORE_COARSE_ACKS*/
1383                         if (tp->ack.pending &&
1384                             (tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss)
1385                                 tcp_send_ack(sk);
1386 #endif
1387                 }
1388                 continue;
1389
1390         found_ok_skb:
1391                 /* Ok so how much can we use? */
1392                 used = skb->len - offset;
1393                 if (len < used)
1394                         used = len;
1395
1396                 /* Do we have urgent data here? */
1397                 if (tp->urg_data) {
1398                         u32 urg_offset = tp->urg_seq - *seq;
1399                         if (urg_offset < used) {
1400                                 if (!urg_offset) {
1401                                         if (!sk->urginline) {
1402                                                 ++*seq;
1403                                                 offset++;
1404                                                 used--;
1405                                         }
1406                                 } else
1407                                         used = urg_offset;
1408                         }
1409                 }
1410
1411                 err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used);
1412                 if (err) {
1413                         /* Exception. Bailout! */
1414                         if (!copied)
1415                                 copied = -EFAULT;
1416                         break;
1417                 }
1418
1419                 *seq += used;
1420                 copied += used;
1421                 len -= used;
1422
1423                 if (after(tp->copied_seq,tp->urg_seq)) {
1424                         tp->urg_data = 0;
1425                         if (skb_queue_len(&tp->out_of_order_queue) == 0
1426 #ifdef TCP_FORMAL_WINDOW
1427                             && tcp_receive_window(tp)
1428 #endif
1429                             ) {
1430                                 tcp_fast_path_on(tp);
1431                         }
1432                 }
1433                 if (used + offset < skb->len)
1434                         continue;
1435
1436                 /*      Process the FIN. We may also need to handle PSH
1437                  *      here and make it break out of MSG_WAITALL.
1438                  */
1439                 if (skb->h.th->fin)
1440                         goto found_fin_ok;
1441                 if (flags & MSG_PEEK)
1442                         continue;
1443                 skb->used = 1;
1444                 tcp_eat_skb(sk, skb);
1445
1446 #ifdef CONFIG_TCP_LESS_COARSE_ACKS
1447                 /* Possible improvement. When sender is faster than receiver,
1448                  * traffic looks like: fill window ... wait for window open ...
1449                  * fill window. We lose at least one rtt, because call
1450                  * cleanup_rbuf only once. Probably, if "len" was large
1451                  * we should insert several intermediate cleanup_rbuf(s).
1452                  *
1453                  * F.e.:
1454                  */
1455                 do {
1456                         u32 full_space = min(tp->window_clamp, tcp_full_space(sk));
1457
1458                         /* Try to ACK, if total buffer length is larger
1459                            than maximal window and if rcv_window has
1460                            chances to increase twice. It will result
1461                            to exponentially decreased ACKing during
1462                            read to huge (usually, mmapped) buffer.
1463                          */
1464                         if (len >= full_space && tp->rcv_wnd <= full_space/2)
1465                                 cleanup_rbuf(sk, copied);
1466                 } while (0);
1467 #endif
1468                 continue;
1469
1470         found_fin_ok:
1471                 ++*seq;
1472                 if (flags & MSG_PEEK)
1473                         break;
1474
1475                 /* All is done. */
1476                 skb->used = 1;
1477                 break;
1478         }
1479
1480         if (user_recv) {
1481                 if (skb_queue_len(&tp->ucopy.prequeue)) {
1482                         int chunk;
1483
1484                         tp->ucopy.len = copied > 0 ? len : 0;
1485
1486                         tcp_prequeue_process(sk);
1487
1488                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1489                                 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1490                                 len -= chunk;
1491                                 copied += chunk;
1492                         }
1493                 }
1494
1495                 tp->ucopy.task = NULL;
1496                 tp->ucopy.len = 0;
1497         }
1498
1499         /* According to UNIX98, msg_name/msg_namelen are ignored
1500          * on connected socket. I was just happy when found this 8) --ANK
1501          */
1502
1503         /* Clean up data we have read: This will do ACK frames. */
1504         cleanup_rbuf(sk, copied);
1505
1506         TCP_CHECK_TIMER(sk);
1507         release_sock(sk);
1508         return copied;
1509
1510 out_err:
1511         err = sock_error(sk);
1512
1513 out:
1514         TCP_CHECK_TIMER(sk);
1515         release_sock(sk);
1516         return err;
1517
1518 recv_urg:
1519         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1520         goto out;
1521 }
1522
1523 /*
1524  *      State processing on a close. This implements the state shift for
1525  *      sending our FIN frame. Note that we only send a FIN for some
1526  *      states. A shutdown() may have already sent the FIN, or we may be
1527  *      closed.
      *
      *      The table is indexed by the current sk->state; tcp_close_state()
      *      does the lookup.  Each entry holds the state to move to, with
      *      TCP_ACTION_FIN or'ed in when that transition requires sending a
      *      FIN.  Only 12 of the 16 slots have explicit initializers; the
      *      remaining ones are implicitly zero.
1528  */
1529
1530 static unsigned char new_state[16] = {
1531   /* current state:        new state:      action:      */
1532   /* (Invalid)          */ TCP_CLOSE,
1533   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1534   /* TCP_SYN_SENT       */ TCP_CLOSE,
1535   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1536   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1537   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1538   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1539   /* TCP_CLOSE          */ TCP_CLOSE,
1540   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1541   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1542   /* TCP_LISTEN         */ TCP_CLOSE,
1543   /* TCP_CLOSING        */ TCP_CLOSING,
1544 };
1545
     /*
      *      Apply the close transition for sk: look up new_state[] by the
      *      current state, switch to the state named there and report whether
      *      the table also asks for a FIN.  The return value is non-zero
      *      exactly when the entry has TCP_ACTION_FIN set.
      */
1546 static int tcp_close_state(struct sock *sk)
1547 {
1548         const int entry = (int) new_state[sk->state];
1549
1550         /* Mask out everything but the state number before switching. */
1551         tcp_set_state(sk, entry & TCP_STATE_MASK);
1552
1553         return entry & TCP_ACTION_FIN;
1554 }
1555
1556 /*
1557  *      Shut down the sending half of a connection.  This resembles close,
1558  *      but the receive side is not shut down and sk->dead is not set.
1559  *      Original note, Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92:
1560  *      a FIN is put together and placed on the queue to be sent.
      */
1561 void tcp_shutdown(struct sock *sk, int how)
1562 {
1563         const int fin_unsent = TCPF_ESTABLISHED | TCPF_SYN_SENT |
1564                                TCPF_SYN_RECV | TCPF_CLOSE_WAIT;
1565
1566         /* Requests that leave the send direction alone are ignored. */
1567         if (!(how & SEND_SHUTDOWN))
1568                 return;
1569
1570         /* A FIN already went out, or the state is a closed one: skip. */
1571         if (!((1 << sk->state) & fin_unsent))
1572                 return;
1573
1574         /* Change state first, then send the FIN only if one is called for. */
1575         if (tcp_close_state(sk))
1576                 tcp_send_fin(sk);
1577 }
1578
1579
1580 /*
1581  *      Non-zero while sk is in FIN_WAIT1, CLOSING or LAST_ACK, the states
1582  *      tcp_close() keeps waiting in; zero in every other state.
1583  */
1584 static inline int closing(struct sock * sk)
1585 {
1586         return (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK) & (1 << sk->state);
1587 }
1588
     /*
      *      Empty the queues of a socket that is being destroyed: the receive
      *      and error queues are purged; the write queue is only checked, it
      *      is expected to be empty already.
      */
1589 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1590 {
1591         /* First the read buffer. */
1592         skb_queue_purge(&sk->receive_queue);
1593
1594         /* Next, the error queue. */
1595         skb_queue_purge(&sk->error_queue);
1596
1597         /* Next, the write queue. */
1598         BUG_TRAP(skb_queue_empty(&sk->write_queue));
1599
1600         /* It is _impossible_ for the backlog to contain anything
1601          * when we get here.  All user references to this socket
1602          * have gone away, only the net layer can touch it.
1603          */
1604 }
1605
1606 /*
1607  * At this point, there should be no process reference to this
1608  * socket, and thus no user references at all.  Therefore we
1609  * can assume the socket waitqueue is inactive and nobody will
1610  * try to jump onto it.
      *
      * Final teardown of a closed, orphaned socket: check the
      * preconditions, run the protocol's destroy hook, purge the
      * queues, take the socket out of the orphan count and drop one
      * reference with sock_put().
1611  */
1612 void tcp_destroy_sock(struct sock *sk)
1613 {
          /* The socket must already be closed and marked dead. */
1614         BUG_TRAP(sk->state==TCP_CLOSE);
1615         BUG_TRAP(sk->dead);
1616
1617         /* It cannot be in hash table! */
1618         BUG_TRAP(sk->pprev==NULL);
1619
1620         /* If sk->num is non-zero, the socket must be bound */
1621         BUG_TRAP(!sk->num || sk->prev!=NULL);
1622
1623 #ifdef TCP_DEBUG
          /* sk->zapped marks a socket that already went through here.  On a
           * repeated call an extra reference is taken, presumably so that the
           * sock_put() below does not drop the count a second time.
           */
1624         if (sk->zapped) {
1625                 printk("TCP: double destroy sk=%p\n", sk);
1626                 sock_hold(sk);
1627         }
1628         sk->zapped = 1;
1629 #endif
1630
1631         sk->prot->destroy(sk);
1632
1633         tcp_kill_sk_queues(sk);
1634
1635 #ifdef INET_REFCNT_DEBUG
          /* Report when more than one reference is still outstanding. */
1636         if (atomic_read(&sk->refcnt) != 1) {
1637                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt));
1638         }
1639 #endif
1640
          /* Undo the orphan accounting (tcp_close(), for one, increments
           * tcp_orphan_count right before calling us), then drop a reference.
           */
1641         atomic_dec(&tcp_orphan_count);
1642         sock_put(sk);
1643 }
1644
     /*
      *      Close a TCP socket.
      *
      *      sk:       the socket being closed.
      *      timeout:  how long to wait for the closing states to finish; it is
      *                handed to schedule_timeout().  Zero means do not wait.
      *
      *      Both directions are marked shut down.  A listening socket is
      *      simply closed and its listen machinery stopped.  Otherwise unread
      *      receive data is thrown away and the connection is either reset
      *      (data was unread), disconnected (linger with zero lingertime), or
      *      moved along the close state table with a FIN if the table asks for
      *      one.  After the optional wait the socket is orphaned and then
      *      destroyed at once if it is in TCP_CLOSE, handed to the time-wait
      *      code or the keepalive timer if it is in FIN_WAIT2, or left for the
      *      protocol to finish.
      */
1645 void tcp_close(struct sock *sk, long timeout)
1646 {
1647         struct sk_buff *skb;
1648         int data_was_unread = 0;
1649
1650         lock_sock(sk);
1651         sk->shutdown = SHUTDOWN_MASK;
1652
1653         if(sk->state == TCP_LISTEN) {
1654                 tcp_set_state(sk, TCP_CLOSE);
1655
1656                 /* Special case. */
1657                 tcp_listen_stop(sk);
1658
1659                 goto adjudge_to_death;
1660         }
1661
1662         /*  We need to flush the recv. buffs.  We do this only on the
1663          *  descriptor close, not protocol-sourced closes, because the
1664          *  reader process may not have drained the data yet!
1665          */
1666         while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
                  /* Sequence space covered by the segment, not counting a FIN. */
1667                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin;
1668                 data_was_unread += len;
1669                 kfree_skb(skb);
1670         }
1671
1672         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1673          * 3.10, we send a RST here because data was lost.  To
1674          * witness the awful effects of the old behavior of always
1675          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1676          * a bulk GET in an FTP client, suspend the process, wait
1677          * for the client to advertise a zero window, then kill -9
1678          * the FTP client, wheee...  Note: timeout is always zero
1679          * in such a case.
1680          */
1681         if(data_was_unread != 0) {
1682                 /* Unread data was tossed, zap the connection. */
1683                 tcp_set_state(sk, TCP_CLOSE);
1684                 tcp_send_active_reset(sk, GFP_KERNEL);
1685         } else if (sk->linger && sk->lingertime==0) {
1686                 /* Check zero linger _after_ checking for unread data. */
1687                 sk->prot->disconnect(sk, 0);
1688         } else if (tcp_close_state(sk)) {
1689                 /* We FIN if the application ate all the data before
1690                  * zapping the connection.
1691                  */
1692
1693                 /* RED-PEN. Formally speaking, we have broken TCP state
1694                  * machine. State transitions:
1695                  *
1696                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1697                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1698                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1699                  *
1700                  * are legal only when FIN has been sent (i.e. in window),
1701                  * rather than queued out of window. Purists blame.
1702                  *
1703                  * F.e. "RFC state" is ESTABLISHED,
1704                  * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1705                  *
1706                  * The visible declinations are that sometimes
1707                  * we enter time-wait state, when it is not required really
1708                  * (harmless), do not send active resets, when they are
1709                  * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1710                  * they look as CLOSING or LAST_ACK for Linux)
1711                  * Probably, I missed some more holelets.
1712                  *                                              --ANK
1713                  */
1714                 tcp_send_fin(sk);
1715         }
1716
          /* Sleep interruptibly while the socket is still in one of the
           * states closing() reports (FIN_WAIT1, CLOSING, LAST_ACK).  The wait
           * ends when that is no longer true, the timeout runs out or a
           * signal is pending.  The socket lock is dropped while sleeping.
           */
1717         if (timeout) {
1718                 struct task_struct *tsk = current;
1719                 DECLARE_WAITQUEUE(wait, current);
1720
1721                 add_wait_queue(sk->sleep, &wait);
1722
1723                 do {
1724                         set_current_state(TASK_INTERRUPTIBLE);
1725                         if (!closing(sk))
1726                                 break;
1727                         release_sock(sk);
1728                         timeout = schedule_timeout(timeout);
1729                         lock_sock(sk);
1730                 } while (!signal_pending(tsk) && timeout);
1731
1732                 tsk->state = TASK_RUNNING;
1733                 remove_wait_queue(sk->sleep, &wait);
1734         }
1735
1736 adjudge_to_death:
1737         /* It is the last release_sock in its life. It will remove backlog. */
1738         release_sock(sk);
1739
1740
1741         /* Now socket is owned by kernel and we acquire BH lock
1742            to finish close. No need to check for user refs.
1743          */
1744         local_bh_disable();
1745         bh_lock_sock(sk);
1746         BUG_TRAP(sk->lock.users==0);
1747
          /* Keep a reference for the rest of this function; it is dropped by
           * the sock_put() at the very end.
           */
1748         sock_hold(sk);
1749         sock_orphan(sk);
1750
1751         /*      This is a (useful) BSD violating of the RFC. There is a
1752          *      problem with TCP as specified in that the other end could
1753          *      keep a socket open forever with no application left this end.
1754          *      We use a 3 minute timeout (about the same as BSD) then kill
1755          *      our end. If they send after that then tough - BUT: long enough
1756          *      that we won't make the old 4*rto = almost no time - whoops
1757          *      reset mistake.
1758          *
1759          *      Nope, it was not mistake. It is really desired behaviour
1760          *      f.e. on http servers, when such sockets are useless, but
1761          *      consume significant resources. Let's do it with special
1762          *      linger2 option.                                 --ANK
1763          */
1764
1765         if (sk->state == TCP_FIN_WAIT2) {
1766                 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
                  /* A negative linger2 means: do not linger in FIN_WAIT2 at
                   * all, reset the connection right away.
                   */
1767                 if (tp->linger2 < 0) {
1768                         tcp_set_state(sk, TCP_CLOSE);
1769                         tcp_send_active_reset(sk, GFP_ATOMIC);
1770                 } else {
1771                         int tmo = tcp_fin_time(tp);
1772
                          /* Timeouts longer than TCP_TIMEWAIT_LEN stay on this
                           * socket and use the keepalive timer; shorter ones
                           * are handed to tcp_time_wait().
                           */
1773                         if (tmo > TCP_TIMEWAIT_LEN) {
1774                                 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
1775                         } else {
                                  /* This path jumps past the common
                                   * atomic_inc() below, so the orphan is
                                   * counted here.
                                   */
1776                                 atomic_inc(&tcp_orphan_count);
1777                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1778                                 goto out;
1779                         }
1780                 }
1781         }
          /* Over the orphan limit: do not keep this one around, reset it. */
1782         if (sk->state != TCP_CLOSE &&
1783             atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans) {
1784                 if (net_ratelimit())
1785                         printk(KERN_INFO "TCP: too many of orphaned sockets\n");
1786                 tcp_set_state(sk, TCP_CLOSE);
1787                 tcp_send_active_reset(sk, GFP_ATOMIC);
1788         }
1789         atomic_inc(&tcp_orphan_count);
1790
1791         if (sk->state == TCP_CLOSE)
1792                 tcp_destroy_sock(sk);
1793         /* Otherwise, socket is reprieved until protocol close. */
1794
1795 out:
1796         bh_unlock_sock(sk);
1797         local_bh_enable();
1798         sock_put(sk);
1799 }
1800
1801 /* RFC793 ABORT: non-zero for the states in which a RST has to be sent */
1802
1803 extern __inline__ int tcp_need_reset(int state)
1804 {
1805         const int rst_states = TCPF_ESTABLISHED | TCPF_SYN_RECV | TCPF_FIN_WAIT1 |
1806                                TCPF_FIN_WAIT2 | TCPF_CLOSE_WAIT;
1807         return (1 << state) & rst_states;
1808 }
1809
     /*
      *      Abort whatever the socket is doing and put it back into an
      *      unconnected TCP_CLOSE state.
      *
      *      sk:     the socket to disconnect.
      *      flags:  not used in this function.
      *
      *      A listening socket has its listen machinery stopped.  A
      *      connection in a state that requires it gets an active reset and
      *      sk->err is set to ECONNRESET; a socket in SYN_SENT only gets the
      *      error.  Timers, queues, addresses and the per-connection TCP
      *      variables are then cleared and the error_report callback is run.
      *
      *      Always returns 0 (err is never changed after initialisation).
      */
1810 int tcp_disconnect(struct sock *sk, int flags)
1811 {
1812         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1813         int old_state;
1814         int err = 0;
1815
          /* Remember where we came from; the decisions below depend on it. */
1816         old_state = sk->state;
1817         if (old_state != TCP_CLOSE)
1818                 tcp_set_state(sk, TCP_CLOSE);
1819
1820         /* ABORT function of RFC793 */
1821         if (old_state == TCP_LISTEN) {
1822                 tcp_listen_stop(sk);
1823         } else if (tcp_need_reset(old_state) ||
1824                    (tp->snd_nxt != tp->write_seq &&
1825                     (1<<old_state)&(TCPF_CLOSING|TCPF_LAST_ACK))) {
1826                 /* The last check adjusts for discrepance of Linux wrt. RFC
1827                  * states
1828                  */
1829                 tcp_send_active_reset(sk, gfp_any());
1830                 sk->err = ECONNRESET;
1831         } else if (old_state == TCP_SYN_SENT)
1832                 sk->err = ECONNRESET;
1833
          /* Stop the transmit timers and drop everything that is queued. */
1834         tcp_clear_xmit_timers(sk);
1835         __skb_queue_purge(&sk->receive_queue);
1836         __skb_queue_purge(&sk->write_queue);
1837         __skb_queue_purge(&tp->out_of_order_queue);
1838
          /* Forget the peer port and the source addresses. */
1839         sk->dport = 0;
1840
1841         sk->rcv_saddr = 0;
1842         sk->saddr = 0;
1843 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1844         memset(&sk->net_pinfo.af_inet6.saddr, 0, 16);
1845         memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16);
1846 #endif
1847
1848         sk->shutdown = 0;
1849         sk->done = 0;
1850         sk->write_space = tcp_write_space;
1851         tp->srtt = 0;
          /* With tcp_tw_recycle set, write_seq is advanced (never left at 0)
           * instead of being cleared.  NOTE(review): presumably so that a
           * reused socket does not start again from the same sequence number;
           * confirm against the connect path.
           */
1852         if (sysctl_tcp_tw_recycle) {
1853                 if ((tp->write_seq += 2) == 0)
1854                         tp->write_seq = 1;
1855         } else {
1856                 tp->write_seq = 0;
1857         }
          /* Reset the remaining per-connection send and ACK state. */
1858         tp->backoff = 0;
1859         tp->snd_cwnd = 2;
1860         tp->probes_out = 0;
1861         tp->packets_out = 0;
1862         tp->high_seq = 0;
1863         tp->snd_ssthresh = 0x7fffffff;
1864         tp->snd_cwnd_cnt = 0;
1865         tp->dup_acks = 0;
1866         tcp_delack_init(tp);
1867         tp->send_head = tp->retrans_head = NULL;
1868         tp->saw_tstamp = 0;
1869         __sk_dst_reset(sk);
1870
          /* Same invariant as in tcp_destroy_sock(): a port implies a binding. */
1871         BUG_TRAP(!sk->num || sk->prev);
1872
          /* Run the error callback; sk->err may have been set above. */
1873         sk->error_report(sk);
1874         return err;
1875 }
1876
1877 /*
1878  *      Wait for an incoming connection, avoid race
1879  *      conditions. This must be called with the socket locked,
1880  *      and without the kernel lock held.
      *
      *      sk:     the listening socket.
      *      timeo:  remaining wait time, handed to schedule_timeout().
      *
      *      Returns 0 once the accept queue is non-empty, -EINVAL if the
      *      socket is no longer in TCP_LISTEN, -ERESTARTSYS if a signal is
      *      pending and -EAGAIN when the timeout has run out.  The checks are
      *      made in that order.  The socket is locked again on return.
1881  */
1882 static int wait_for_connect(struct sock * sk, long timeo)
1883 {
1884         DECLARE_WAITQUEUE(wait, current);
1885         int err;
1886
1887         /*
1888          * True wake-one mechanism for incoming connections: only
1889          * one process gets woken up, not the 'whole herd'.
1890          * Since we do not 'race & poll' for established sockets
1891          * anymore, the common case will execute the loop only once.
1892          *
1893          * Subtle issue: "add_wait_queue_exclusive()" will be added
1894          * after any current non-exclusive waiters, and we know that
1895          * it will always _stay_ after any new non-exclusive waiters
1896          * because all non-exclusive waiters are added at the
1897          * beginning of the wait-queue. As such, it's ok to "drop"
1898          * our exclusiveness temporarily when we get woken up without
1899          * having to remove and re-insert us on the wait queue.
1900          */
1901         add_wait_queue_exclusive(sk->sleep, &wait);
1902         for (;;) {
                  /* The task state is set before the lock is dropped, and
                   * the queue is looked at once more before actually
                   * sleeping.
                   */
1903                 current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE;
1904                 release_sock(sk);
1905                 if (sk->tp_pinfo.af_tcp.accept_queue == NULL)
1906                         timeo = schedule_timeout(timeo);
1907                 lock_sock(sk);
                  /* err is loaded ahead of each test so that the break
                   * leaves the matching code behind.
                   */
1908                 err = 0;
1909                 if (sk->tp_pinfo.af_tcp.accept_queue)
1910                         break;
1911                 err = -EINVAL;
1912                 if (sk->state != TCP_LISTEN)
1913                         break;
1914                 err = -ERESTARTSYS;
1915                 if (signal_pending(current))
1916                         break;
1917                 err = -EAGAIN;
1918                 if (!timeo)
1919                         break;
1920         }
1921         current->state = TASK_RUNNING;
1922         remove_wait_queue(sk->sleep, &wait);
1923         return err;
1924 }
1925
1926 /*
1927  *      This will accept the next outstanding connection.
1928  *
1929  *      Be careful about race conditions here - this is subtle.
      *
      *      sk:     the listening socket.
      *      flags:  only O_NONBLOCK is looked at; it is passed to
      *              sock_rcvtimeo() to obtain the wait time.
      *      err:    receives a negative errno when NULL is returned.  It is
      *              not written on success.
      *
      *      Returns the socket of the first request on the accept queue, or
      *      NULL with *err set to -EINVAL (not listening), -EAGAIN (queue
      *      empty and no time to wait) or the error from wait_for_connect().
1930  */
1931
1932 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1933 {
1934         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1935         struct open_request *req;
1936         struct sock *newsk;
1937         int error;
1938         long timeo;
1939
1940         lock_sock(sk);
1941
1942         /* We need to make sure that this socket is listening,
1943          * and that it has something pending.
1944          */
1945         error = -EINVAL;
1946         if (sk->state != TCP_LISTEN)
1947                 goto out;
1948
1949         timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1950
1951         /* Find already established connection */
1952         if (!tp->accept_queue) {
1953                 /* If this is a non blocking socket don't sleep */
1954                 error = -EAGAIN;
1955                 if (!timeo)
1956                         goto out;
1957
1958                 error = wait_for_connect(sk, timeo);
1959                 if (error)
1960                         goto out;
1961         }
1962
          /* Unlink the request at the head of the accept queue. */
1963         req = tp->accept_queue;
1964         tp->accept_queue = req->dl_next;
1965
          /* Only the request's socket is handed back; the request itself is
           * freed here.
           */
1966         newsk = req->sk;
1967         tcp_acceptq_removed(sk);
1968         tcp_openreq_fastfree(req);
1969         BUG_TRAP(newsk->state != TCP_SYN_RECV);
1970         release_sock(sk);
1971         return newsk;
1972
1973 out:
1974         release_sock(sk);
1975         *err = error;
1976         return NULL;
1977 }
1978
1979 /*
1980  *      Socket option code for TCP.
      *
      *      tcp_setsockopt() sets one option on sk.  Levels other than SOL_TCP
      *      are passed on to the address-family handler.  For SOL_TCP the
      *      option value is read from user space as an int (optlen must be at
      *      least sizeof(int)) and applied with the socket locked.
      *
      *      Returns 0 on success, -EINVAL for a short optlen or a value that is
      *      out of range or conflicts with the current settings, -EFAULT when
      *      the value cannot be read, and -ENOPROTOOPT for an unknown optname.
1981  */
1982
1983 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
1984                    int optlen)
1985 {
1986         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1987         int val;
1988         int err = 0;
1989
1990         if (level != SOL_TCP)
1991                 return tp->af_specific->setsockopt(sk, level, optname,
1992                                                    optval, optlen);
1993
1994         if(optlen<sizeof(int))
1995                 return -EINVAL;
1996
1997         if (get_user(val, (int *)optval))
1998                 return -EFAULT;
1999
2000         lock_sock(sk);
2001
2002         switch(optname) {
2003         case TCP_MAXSEG:
2004                 /* values greater than interface MTU won't take effect.  however at
2005                  * the point when this call is done we typically don't yet know
2006                  * which interface is going to be used
2007                  */
2008                 if(val < 8 || val > MAX_TCP_WINDOW) {
2009                         err = -EINVAL;
2010                         break;
2011                 }
2012                 tp->user_mss = val;
2013                 break;
2014
2015         case TCP_NODELAY:
2016                 /* You cannot try to use this and TCP_CORK in
2017                  * tandem, so let the user know.
2018                  */
2019                 if (tp->nonagle == 2) {
2020                         err = -EINVAL;
2021                         break;
2022                 }
2023                 tp->nonagle = (val == 0) ? 0 : 1;
2024                 break;
2025
2026         case TCP_CORK:
2027                 /* When set indicates to always queue non-full frames.
2028                  * Later the user clears this option and we transmit
2029                  * any pending partial frames in the queue.  This is
2030                  * meant to be used alongside sendfile() to get properly
2031                  * filled frames when the user (for example) must write
2032                  * out headers with a write() call first and then use
2033                  * sendfile to send out the data parts.
2034                  *
2035                  * You cannot try to use TCP_NODELAY and this mechanism
2036                  * at the same time, so let the user know.
2037                  */
2038                 if (tp->nonagle == 1) {
2039                         err = -EINVAL;
2040                         break;
2041                 }
2042                 if (val != 0) {
2043                         tp->nonagle = 2;
2044                 } else {
2045                         tp->nonagle = 0;
2046
2047                         tcp_push_pending_frames(sk, tp);
2048                 }
2049                 break;
2050
2051         case TCP_KEEPIDLE:
2052                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2053                         err = -EINVAL;
2054                 else {
2055                         tp->keepalive_time = val * HZ;
                          /* Re-arm the timer only on sockets that have
                           * keepalive switched on and are neither closed nor
                           * listening; for the others the new value is just
                           * stored.  The timer runs for whatever is left of
                           * the new idle time since the last receive.
                           */
2056                         if (sk->keepopen && !((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))) {
2057                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2058                                 if (tp->keepalive_time > elapsed)
2059                                         elapsed = tp->keepalive_time - elapsed;
2060                                 else
2061                                         elapsed = 0;
2062                                 tcp_reset_keepalive_timer(sk, elapsed);
2063                         }
2064                 }
2065                 break;
2066         case TCP_KEEPINTVL:
2067                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2068                         err = -EINVAL;
2069                 else
2070                         tp->keepalive_intvl = val * HZ;
2071                 break;
2072         case TCP_KEEPCNT:
2073                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2074                         err = -EINVAL;
2075                 else
2076                         tp->keepalive_probes = val;
2077                 break;
2078         case TCP_SYNCNT:
2079                 if (val < 1 || val > MAX_TCP_SYNCNT)
2080                         err = -EINVAL;
2081                 else
2082                         tp->syn_retries = val;
2083                 break;
2084
2085         case TCP_LINGER2:
                  /* Negative: stored as -1.  Larger than the system-wide
                   * fin timeout: stored as 0.  Anything else is kept,
                   * converted from seconds to jiffies.
                   */
2086                 if (val < 0)
2087                         tp->linger2 = -1;
2088                 else if (val > sysctl_tcp_fin_timeout/HZ)
2089                         tp->linger2 = 0;
2090                 else
2091                         tp->linger2 = val*HZ;
2092                 break;
2093
2094         case TCP_DEFER_ACCEPT:
2095                 tp->defer_accept = 0;
2096                 if (val > 0) {
2097                         /* Translate value in seconds to number of retransmits */
2098                         while (val > ((TCP_TIMEOUT_INIT/HZ)<<tp->defer_accept))
2099                                 tp->defer_accept++;
2100                         tp->defer_accept++;
2101                 }
2102                 break;
2103
2104         case TCP_WINDOW_CLAMP:
2105                 if (val==0) {
                          /* The clamp can only be removed on a closed socket. */
2106                         if (sk->state != TCP_CLOSE) {
2107                                 err = -EINVAL;
2108                                 break;
2109                         }
2110                         tp->window_clamp = 0;
2111                 } else {
                          /* Requests below the floor are raised to the floor
                           * they were compared against (half the minimum
                           * receive buffer), not to the minimum send buffer,
                           * which is unrelated to the receive window.
                           */
2112                         tp->window_clamp = val<SOCK_MIN_RCVBUF/2 ?
2113                                 SOCK_MIN_RCVBUF/2 : val;
2114                 }
2115                 break;
2116
2117         default:
2118                 err = -ENOPROTOOPT;
2119                 break;
2120         };
2121         release_sock(sk);
2122         return err;
2123 }
2124
     /*
      *      Read one TCP-level socket option.
      *
      *      Levels other than SOL_TCP are passed on to the address-family
      *      handler.  For SOL_TCP the option is reported as an int: at most
      *      sizeof(int) bytes are copied to optval, and the length actually
      *      used is written back through optlen.
      *
      *      Time-valued options are reported in seconds, the unit
      *      tcp_setsockopt() takes them in.
      *
      *      Returns 0 on success, -EFAULT when user memory cannot be read or
      *      written, and -ENOPROTOOPT for an unknown optname.
      */
2125 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
2126                    int *optlen)
2127 {
2128         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2129         int val, len;
2130
2131         if(level != SOL_TCP)
2132                 return tp->af_specific->getsockopt(sk, level, optname,
2133                                                    optval, optlen);
2134
2135         if(get_user(len,optlen))
2136                 return -EFAULT;
2137
2138         len = min(len, sizeof(int));
2139
2140         switch(optname) {
2141         case TCP_MAXSEG:
                  /* Before a connection exists there is no cached MSS yet, so
                   * the value requested by the user is reported instead.
                   */
2142                 val = tp->mss_cache;
2143                 if (val == 0 && ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN)))
2144                         val = tp->user_mss;
2145                 break;
2146         case TCP_NODELAY:
2147                 val = (tp->nonagle == 1);
2148                 break;
2149         case TCP_CORK:
2150                 val = (tp->nonagle == 2);
2151                 break;
             /* For the next four options a per-socket value of zero means
              * "not set"; the system-wide default is reported in that case.
              */
2152         case TCP_KEEPIDLE:
2153                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time)/HZ;
2154                 break;
2155         case TCP_KEEPINTVL:
2156                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl)/HZ;
2157                 break;
2158         case TCP_KEEPCNT:
2159                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2160                 break;
2161         case TCP_SYNCNT:
2162                 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2163                 break;
2164         case TCP_LINGER2:
2165                 val = tp->linger2;
                  /* A negative value is reported as stored.  Zero must take
                   * this branch too, otherwise the fallback to the system
                   * default below could never be reached.
                   */
2166                 if (val >= 0)
2167                         val = (val ? : sysctl_tcp_fin_timeout)/HZ;
2168                 break;
2169         case TCP_DEFER_ACCEPT:
                  /* Inverse of the setsockopt translation: retransmit count
                   * back to seconds, hence TCP_TIMEOUT_INIT/HZ.
                   */
2170                 val = tp->defer_accept == 0 ? 0 : ((TCP_TIMEOUT_INIT/HZ)<<(tp->defer_accept-1));
2171                 break;
2172         case TCP_WINDOW_CLAMP:
2173                 val = tp->window_clamp;
2174                 break;
2175         default:
2176                 return -ENOPROTOOPT;
2177         };
2178
2179         if(put_user(len, optlen))
2180                 return -EFAULT;
2181         if(copy_to_user(optval, &val,len))
2182                 return -EFAULT;
2183         return 0;
2184 }
2185
2186
2187 extern void __skb_cb_too_small_for_tcp(int, int);
2188
2189 void __init tcp_init(void)
2190 {
2191         struct sk_buff *skb = NULL;
2192         unsigned long goal;
2193         int order, i;
2194
2195         if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2196                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2197                                            sizeof(skb->cb));
2198
2199         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2200                                                    sizeof(struct open_request),
2201                                                0, SLAB_HWCACHE_ALIGN,
2202                                                NULL, NULL);
2203         if(!tcp_openreq_cachep)
2204                 panic("tcp_init: Cannot alloc open_request cache.");
2205
2206         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2207                                               sizeof(struct tcp_bind_bucket),
2208                                               0, SLAB_HWCACHE_ALIGN,
2209                                               NULL, NULL);
2210         if(!tcp_bucket_cachep)
2211                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2212
2213         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2214                                                 sizeof(struct tcp_tw_bucket),
2215                                                 0, SLAB_HWCACHE_ALIGN,
2216                                                 NULL, NULL);
2217         if(!tcp_timewait_cachep)
2218                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2219
2220         /* Size and allocate the main established and bind bucket
2221          * hash tables.
2222          *
2223          * The methodology is similar to that of the buffer cache.
2224          */
2225         goal = num_physpages >> (23 - PAGE_SHIFT);
2226
2227         for(order = 0; (1UL << order) < goal; order++)
2228                 ;
2229         do {
2230                 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2231                         sizeof(struct tcp_ehash_bucket);
2232                 tcp_ehash_size >>= 1;
2233                 while (tcp_ehash_size & (tcp_ehash_size-1))
2234                         tcp_ehash_size--;
2235                 tcp_ehash = (struct tcp_ehash_bucket *)
2236                         __get_free_pages(GFP_ATOMIC, order);
2237         } while (tcp_ehash == NULL && --order > 0);
2238
2239         if (!tcp_ehash)
2240                 panic("Failed to allocate TCP established hash table\n");
2241         for (i = 0; i < (tcp_ehash_size<<1); i++) {
2242                 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2243                 tcp_ehash[i].chain = NULL;
2244         }
2245
2246         do {
2247                 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2248                         sizeof(struct tcp_bind_hashbucket);
2249                 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2250                         continue;
2251                 tcp_bhash = (struct tcp_bind_hashbucket *)
2252                         __get_free_pages(GFP_ATOMIC, order);
2253         } while (tcp_bhash == NULL && --order >= 0);
2254
2255         if (!tcp_bhash)
2256                 panic("Failed to allocate TCP bind hash table\n");
2257         for (i = 0; i < tcp_bhash_size; i++) {
2258                 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2259                 tcp_bhash[i].chain = NULL;
2260         }
2261
2262         /* Try to be a bit smarter and adjust defaults depending
2263          * on available memory.
2264          */
2265         if (order > 4) {
2266                 sysctl_local_port_range[0] = 32768;
2267                 sysctl_local_port_range[1] = 61000;
2268                 sysctl_tcp_max_tw_buckets = 180000;
2269                 sysctl_tcp_max_orphans = 4096<<(order-4);
2270                 sysctl_max_syn_backlog = 1024;
2271         } else if (order < 3) {
2272                 sysctl_local_port_range[0] = 1024*(3-order);
2273                 sysctl_tcp_max_tw_buckets >>= (3-order);
2274                 sysctl_tcp_max_orphans >>= (3-order);
2275                 sysctl_max_syn_backlog = 128;
2276         }
2277         tcp_port_rover = sysctl_local_port_range[0] - 1;
2278
2279         printk("TCP: Hash tables configured (established %d bind %d)\n",
2280                tcp_ehash_size<<1, tcp_bhash_size);
2281 }