net/ipv4/tcp.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp.c,v 1.119 1998/08/26 12:04:14 davem Exp $
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while
  26  *                                      sk->inuse=1 and was trying to connect
  27  *                                      (tcp_err()).
  28  *              Alan Cox        :       All icmp error handling was broken
  29  *                                      pointers passed were wrong and the
  30  *                                      socket was looked up backwards. Nobody
  31  *                                      tested any icmp error code obviously.
  32  *              Alan Cox        :       tcp_err() now handled properly. It
  33  *                                      wakes people on errors. poll
  34  *                                      behaves and the icmp error race
  35  *                                      has gone by moving it into sock.c
  36  *              Alan Cox        :       tcp_send_reset() fixed to work for
  37  *                                      everything not just packets for
  38  *                                      unknown sockets.
  39  *              Alan Cox        :       tcp option processing.
  40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
  41  *                                      syn rule wrong]
  42  *              Herp Rosmanith  :       More reset fixes
  43  *              Alan Cox        :       No longer acks invalid rst frames.
  44  *                                      Acking any kind of RST is right out.
  45  *              Alan Cox        :       Sets an ignore me flag on an rst
  46  *                                      receive otherwise odd bits of prattle
  47  *                                      escape still
  48  *              Alan Cox        :       Fixed another acking RST frame bug.
  49  *                                      Should stop LAN workplace lockups.
  50  *              Alan Cox        :       Some tidyups using the new skb list
  51  *                                      facilities
  52  *              Alan Cox        :       sk->keepopen now seems to work
  53  *              Alan Cox        :       Pulls options out correctly on accepts
  54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56  *                                      bit to skb ops.
  57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
  58  *                                      nasty.
  59  *              Alan Cox        :       Added some better commenting, as the
  60  *                                      tcp is hard to follow
  61  *              Alan Cox        :       Removed incorrect check for 20 * psh
  62  *      Michael O'Reilly        :       ack < copied bug fix.
  63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64  *              Alan Cox        :       FIN with no memory -> CRASH
  65  *              Alan Cox        :       Added socket option proto entries.
  66  *                                      Also added awareness of them to accept.
  67  *              Alan Cox        :       Added TCP options (SOL_TCP)
  68  *              Alan Cox        :       Switched wakeup calls to callbacks,
  69  *                                      so the kernel can layer network
  70  *                                      sockets.
  71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73  *              Alan Cox        :       RST frames sent on unsynchronised
  74  *                                      state ack error.
  75  *              Alan Cox        :       Put in missing check for SYN bit.
  76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
  77  *                                      window non shrink trick.
  78  *              Alan Cox        :       Added a couple of small NET2E timer
  79  *                                      fixes
  80  *              Charles Hedrick :       TCP fixes
  81  *              Toomas Tamm     :       TCP window fixes
  82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83  *              Charles Hedrick :       Rewrote most of it to actually work
  84  *              Linus           :       Rewrote tcp_read() and URG handling
  85  *                                      completely
  86  *              Gerhard Koerting:       Fixed some missing timer handling
  87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88  *              Gerhard Koerting:       PC/TCP workarounds
  89  *              Adam Caldwell   :       Assorted timer/timing errors
  90  *              Matthew Dillon  :       Fixed another RST bug
  91  *              Alan Cox        :       Move to kernel side addressing changes.
  92  *              Alan Cox        :       Beginning work on TCP fastpathing
  93  *                                      (not yet usable)
  94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95  *              Alan Cox        :       TCP fast path debugging
  96  *              Alan Cox        :       Window clamping
  97  *              Michael Riepe   :       Bug in tcp_check()
  98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  99  *              Matt Dillon     :       Yet more small nasties remove from the
 100  *                                      TCP code (Be very nice to this man if
 101  *                                      tcp finally works 100%) 8)
 102  *              Alan Cox        :       BSD accept semantics.
 103  *              Alan Cox        :       Reset on closedown bug.
 104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105  *              Michael Pall    :       Handle poll() after URG properly in
 106  *                                      all cases.
 107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
 108  *                                      (multi URG PUSH broke rlogin).
 109  *              Michael Pall    :       Fix the multi URG PUSH problem in
 110  *                                      tcp_readable(), poll() after URG
 111  *                                      works now.
 112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
 113  *                                      BSD api.
 114  *              Alan Cox        :       Changed the semantics of sk->socket to
 115  *                                      fix a race and a signal problem with
 116  *                                      accept() and async I/O.
 117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120  *                                      clients/servers which listen in on
 121  *                                      fixed ports.
 122  *              Alan Cox        :       Cleaned the above up and shrank it to
 123  *                                      a sensible code size.
 124  *              Alan Cox        :       Self connect lockup fix.
 125  *              Alan Cox        :       No connect to multicast.
 126  *              Ross Biro       :       Close unaccepted children on master
 127  *                                      socket close.
 128  *              Alan Cox        :       Reset tracing code.
 129  *              Alan Cox        :       Spurious resets on shutdown.
 130  *              Alan Cox        :       Giant 15 minute/60 second timer error
 131  *              Alan Cox        :       Small whoops in polling before an
 132  *                                      accept.
 133  *              Alan Cox        :       Kept the state trace facility since
 134  *                                      it's handy for debugging.
 135  *              Alan Cox        :       More reset handler fixes.
 136  *              Alan Cox        :       Started rewriting the code based on
 137  *                                      the RFC's for other useful protocol
 138  *                                      references see: Comer, KA9Q NOS, and
 139  *                                      for a reference on the difference
 140  *                                      between specifications and how BSD
 141  *                                      works see the 4.4lite source.
 142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
 143  *                                      close.
 144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146  *              Alan Cox        :       Reimplemented timers as per the RFC
 147  *                                      and using multiple timers for sanity.
 148  *              Alan Cox        :       Small bug fixes, and a lot of new
 149  *                                      comments.
 150  *              Alan Cox        :       Fixed dual reader crash by locking
 151  *                                      the buffers (much like datagram.c)
 152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153  *                                      now gets fed up of retrying without
 154  *                                      (even a no space) answer.
 155  *              Alan Cox        :       Extracted closing code better
 156  *              Alan Cox        :       Fixed the closing state machine to
 157  *                                      resemble the RFC.
 158  *              Alan Cox        :       More 'per spec' fixes.
 159  *              Jorge Cwik      :       Even faster checksumming.
 160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161  *                                      only frames. At least one pc tcp stack
 162  *                                      generates them.
 163  *              Alan Cox        :       Cache last socket.
 164  *              Alan Cox        :       Per route irtt.
 165  *              Matt Day        :       poll()->select() match BSD precisely on error
 166  *              Alan Cox        :       New buffers
 167  *              Marc Tamsky     :       Various sk->prot->retransmits and
 168  *                                      sk->retransmits misupdating fixed.
 169  *                                      Fixed tcp_write_timeout: stuck close,
 170  *                                      and TCP syn retries gets used now.
 171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172  *                                      ack if state is TCP_CLOSED.
 173  *              Alan Cox        :       Look up device on a retransmit - routes may
 174  *                                      change. Doesn't yet cope with MSS shrink right
 175  *                                      but its a start!
 176  *              Marc Tamsky     :       Closing in closing fixes.
 177  *              Mike Shaver     :       RFC1122 verifications.
 178  *              Alan Cox        :       rcv_saddr errors.
 179  *              Alan Cox        :       Block double connect().
 180  *              Alan Cox        :       Small hooks for enSKIP.
 181  *              Alexey Kuznetsov:       Path MTU discovery.
 182  *              Alan Cox        :       Support soft errors.
 183  *              Alan Cox        :       Fix MTU discovery pathological case
 184  *                                      when the remote claims no mtu!
 185  *              Marc Tamsky     :       TCP_CLOSE fix.
 186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
 187  *                                      window but wrong (fixes NT lpd problems)
 188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
 189  *              Joerg Reuter    :       No modification of locked buffers in
 190  *                                      tcp_do_retransmit()
 191  *              Eric Schenk     :       Changed receiver side silly window
 192  *                                      avoidance algorithm to BSD style
 193  *                                      algorithm. This doubles throughput
 194  *                                      against machines running Solaris,
 195  *                                      and seems to result in general
 196  *                                      improvement.
 197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
 198  *      Willy Konynenberg       :       Transparent proxying support.
 199  *      Mike McLagan            :       Routing by source
 200  *              Keith Owens     :       Do proper merging with partial SKB's in
 201  *                                      tcp_do_sendmsg to avoid burstiness.
 202  *              Eric Schenk     :       Fix fast close down bug with
 203  *                                      shutdown() followed by close().
 204  *              Andi Kleen :    Make poll agree with SIGIO
 205  *
 206  *              This program is free software; you can redistribute it and/or
 207  *              modify it under the terms of the GNU General Public License
 208  *              as published by the Free Software Foundation; either version
 209  *              2 of the License, or(at your option) any later version.
 210  *
 211  * Description of States:
 212  *
 213  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 214  *
 215  *      TCP_SYN_RECV            received a connection request, sent ack,
 216  *                              waiting for final ack in three-way handshake.
 217  *
 218  *      TCP_ESTABLISHED         connection established
 219  *
 220  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 221  *                              transmission of remaining buffered data
 222  *
 223  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 224  *                              to shutdown
 225  *
 226  *      TCP_CLOSING             both sides have shutdown but we still have
 227  *                              data we have to finish sending
 228  *
 229  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 230  *                              closed, can only be entered from FIN_WAIT2
 231  *                              or CLOSING.  Required because the other end
 232  *                              may not have gotten our last ACK causing it
 233  *                              to retransmit the data packet (which we ignore)
 234  *
 235  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 236  *                              us to finish writing our data and to shutdown
 237  *                              (we have to close() to move on to LAST_ACK)
 238  *
 239  *      TCP_LAST_ACK            our side has shutdown after remote has
 240  *                              shutdown.  There may still be data in our
 241  *                              buffer that we have to finish sending
 242  *
 243  *      TCP_CLOSE               socket is finished
 244  */
 245
 246 /*
 247  * RFC1122 status:
 248  * NOTE: I'm not going to be doing comments in the code for this one except
 249  * for violations and the like.  tcp.c is just too big... If I say something
 250  * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 251  * with Alan. -- MS 950903
 252  * [Note: Most of the TCP code has been rewritten/redesigned since this
 253  *  RFC1122 check. It is probably not correct anymore. It should be redone
 254  *  before 2.2. -AK]
 255  *
 256  * Use of PSH (4.2.2.2)
 257  *   MAY aggregate data sent without the PSH flag. (does)
 258  *   MAY queue data received without the PSH flag. (does)
 259  *   SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 260  *   MAY implement PSH on send calls. (doesn't, thus:)
 261  *     MUST NOT buffer data indefinitely (doesn't [1 second])
 262  *     MUST set PSH on last segment (does)
 263  *   MAY pass received PSH to application layer (doesn't)
 264  *   SHOULD send maximum-sized segment whenever possible. (almost always does)
 265  *
 266  * Window Size (4.2.2.3, 4.2.2.16)
 267  *   MUST treat window size as an unsigned number (does)
 268  *   SHOULD treat window size as a 32-bit number (does not)
 269  *   MUST NOT shrink window once it is offered (does not normally)
 270  *
 271  * Urgent Pointer (4.2.2.4)
 272  * **MUST point urgent pointer to last byte of urgent data (not right
 273  *     after). (doesn't, to be like BSD. That's configurable, but defaults
 274  *      to off)
 275  *   MUST inform application layer asynchronously of incoming urgent
 276  *     data. (does)
 277  *   MUST provide application with means of determining the amount of
 278  *     urgent data pending. (does)
 279  * **MUST support urgent data sequence of arbitrary length. (doesn't, but
 280  *   it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 281  *      [Follows BSD 1 byte of urgent data]
 282  *
 283  * TCP Options (4.2.2.5)
 284  *   MUST be able to receive TCP options in any segment. (does)
 285  *   MUST ignore unsupported options (does)
 286  *
 287  * Maximum Segment Size Option (4.2.2.6)
 288  *   MUST implement both sending and receiving MSS. (does, but currently
 289  *      only uses the smaller of both of them)
 290  *   SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
 291  *     it always). (does, even when MSS == 536, which is legal)
 292  *   MUST assume MSS == 536 if no MSS received at connection setup (does)
 293  *   MUST calculate "effective send MSS" correctly:
 294  *     min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 295  *     (does - but allows operator override)
 296  *
 297  * TCP Checksum (4.2.2.7)
 298  *   MUST generate and check TCP checksum. (does)
 299  *
 300  * Initial Sequence Number Selection (4.2.2.8)
 301  *   MUST use the RFC 793 clock selection mechanism.  (doesn't, but it's
 302  *     OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 303  *     necessary for 10Mbps networks - and harder than BSD to spoof!
 304  *     With syncookies we don't)
 305  *
 306  * Simultaneous Open Attempts (4.2.2.10)
 307  *   MUST support simultaneous open attempts (does)
 308  *
 309  * Recovery from Old Duplicate SYN (4.2.2.11)
 310  *   MUST keep track of active vs. passive open (does)
 311  *
 312  * RST segment (4.2.2.12)
 313  *   SHOULD allow an RST segment to contain data (does, but doesn't do
 314  *     anything with it, which is standard)
 315  *
 316  * Closing a Connection (4.2.2.13)
 317  *   MUST inform application of whether connection was closed by RST or
 318  *     normal close. (does)
 319  *   MAY allow "half-duplex" close (treat connection as closed for the
 320  *     local app, even before handshake is done). (does)
 321  *   MUST linger in TIME_WAIT for 2 * MSL (does)
 322  *
 323  * Retransmission Timeout (4.2.2.15)
 324  *   MUST implement Jacobson's slow start and congestion avoidance
 325  *     stuff. (does)
 326  *
 327  * Probing Zero Windows (4.2.2.17)
 328  *   MUST support probing of zero windows. (does)
 329  *   MAY keep offered window closed indefinitely. (does)
 330  *   MUST allow remote window to stay closed indefinitely. (does)
 331  *
 332  * Passive Open Calls (4.2.2.18)
 333  *   MUST NOT let new passive open affect other connections. (doesn't)
 334  *   MUST support passive opens (LISTENs) concurrently. (does)
 335  *
 336  * Time to Live (4.2.2.19)
 337  *   MUST make TCP TTL configurable. (does - IP_TTL option)
 338  *
 339  * Event Processing (4.2.2.20)
 340  *   SHOULD queue out-of-order segments. (does)
 341  *   MUST aggregate ACK segments whenever possible. (does but badly)
 342  *
 343  * Retransmission Timeout Calculation (4.2.3.1)
 344  *   MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 345  *     calculation. (does, or at least explains them in the comments 8*b)
 346  *   SHOULD initialize RTT to 0 and RTO to 3 seconds. (does)
 347  *
 348  * When to Send an ACK Segment (4.2.3.2)
 349  *   SHOULD implement delayed ACK. (does)
 350  *   MUST keep ACK delay < 0.5 sec. (does)
 351  *
 352  * When to Send a Window Update (4.2.3.3)
 353  *   MUST implement receiver-side SWS. (does)
 354  *
 355  * When to Send Data (4.2.3.4)
 356  *   MUST implement sender-side SWS. (does)
 357  *   SHOULD implement Nagle algorithm. (does)
 358  *
 359  * TCP Connection Failures (4.2.3.5)
 360  *  MUST handle excessive retransmissions "properly" (see the RFC). (does)
 361  *   SHOULD inform application layer of soft errors. (does)
 362  *
 363  * TCP Keep-Alives (4.2.3.6)
 364  *   MAY provide keep-alives. (does)
 365  *   MUST make keep-alives configurable on a per-connection basis. (does)
 366  *   MUST default to no keep-alives. (does)
 367  *   MUST make keep-alive interval configurable. (does)
 368  *   MUST make default keep-alive interval > 2 hours. (does)
 369  *   MUST NOT interpret failure to ACK keep-alive packet as dead
 370  *     connection. (doesn't)
 371  *   SHOULD send keep-alive with no data. (does)
 372  *
 373  * TCP Multihoming (4.2.3.7)
 374  *   MUST get source address from IP layer before sending first
 375  *     SYN. (does)
 376  *   MUST use same local address for all segments of a connection. (does)
 377  *
 378  * IP Options (4.2.3.8)
 379  *   MUST ignore unsupported IP options. (does)
 380  *   MAY support Time Stamp and Record Route. (does)
 381  *   MUST allow application to specify a source route. (does)
 382  *   MUST allow received Source Route option to set route for all future
 383  *     segments on this connection. (does not (security issues))
 384  *
 385  * ICMP messages (4.2.3.9)
 386  *   MUST act on ICMP errors. (does)
 387  *   MUST slow transmission upon receipt of a Source Quench. (doesn't anymore
 388  *   because that is deprecated now by the IETF, can be turned on)
 389  *   MUST NOT abort connection upon receipt of soft Destination
 390  *     Unreachables (0, 1, 5), Time Exceededs and Parameter
 391  *     Problems. (doesn't)
 392  *   SHOULD report soft Destination Unreachables etc. to the
 393  *     application. (does, except during SYN_RECV and may drop messages
 394  *     in some rare cases before accept() - ICMP is unreliable)
 395  *   SHOULD abort connection upon receipt of hard Destination Unreachable
 396  *     messages (2, 3, 4). (does, but see above)
 397  *
 398  * Remote Address Validation (4.2.3.10)
 399  *   MUST reject as an error OPEN for invalid remote IP address. (does)
 400  *   MUST ignore SYN with invalid source address. (does)
 401  *   MUST silently discard incoming SYN for broadcast/multicast
 402  *     address. (does)
 403  *
 404  * Asynchronous Reports (4.2.4.1)
 405  * MUST provide mechanism for reporting soft errors to application
 406  *     layer. (does)
 407  *
 408  * Type of Service (4.2.4.2)
 409  *   MUST allow application layer to set Type of Service. (does IP_TOS)
 410  *
 411  * (Whew. -- MS 950903)
 412  * (Updated by AK, but not complete yet.)
 413  **/
 414
 415 #include <linux/types.h>
 416 #include <linux/fcntl.h>
 417 #include <linux/poll.h>
 418 #include <linux/init.h>
 419
 420 #include <net/icmp.h>
 421 #include <net/tcp.h>
 422
 423 #include <asm/uaccess.h>
 424
 425 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;      /* starts at the compile-time TCP_FIN_TIMEOUT; presumably adjustable via sysctl -- confirm */
 426
 427 struct tcp_mib  tcp_statistics;                    /* file-global TCP MIB record; updated outside this view (presumably SNMP-style counters -- confirm) */
 428
 429 kmem_cache_t *tcp_openreq_cachep;                  /* cache handle; presumably for struct open_request objects -- confirm where it is created */
 430 kmem_cache_t *tcp_bucket_cachep;                   /* cache handle; presumably for bind/hash bucket objects -- confirm */
 431 kmem_cache_t *tcp_timewait_cachep;                 /* cache handle; presumably for TIME_WAIT state objects -- confirm */
 432
 433 /*
 434  *      Find someone to 'accept'. Must be called with
 435  *      the socket locked or with interrupts disabled
 436  */
 437
 438 static struct open_request *tcp_find_established(struct tcp_opt *tp,
 439                                                  struct open_request **prevp)
 440 {
 441         struct open_request *req = tp->syn_wait_queue;
 442         struct open_request *prev = (struct open_request *)&tp->syn_wait_queue;
 443         while(req) {
 444                 if (req->sk &&
 445                     ((1 << req->sk->state) &
 446                      ~(TCPF_SYN_SENT|TCPF_SYN_RECV)))
 447                         break;
 448                 prev = req;
 449                 req = req->dl_next;
 450         }
 451         *prevp = prev;
 452         return req;
 453 }
 454
 455 /*
 456  *      Walk down the receive queue counting readable data.
 457  *
 458  *      Must be called with the socket lock held.
 459  */
 460
 461 static int tcp_readable(struct sock *sk)
 462 {
 463         unsigned long counted;
 464         unsigned long amount;
 465         struct sk_buff *skb;
 466         int sum;
 467
 468         SOCK_DEBUG(sk, "tcp_readable: %p - ",sk);
 469
 470         skb = skb_peek(&sk->receive_queue);
 471         if (skb == NULL) {
 472                 SOCK_DEBUG(sk, "empty\n");
 473                 return(0);
 474         }
 475
 476         counted = sk->tp_pinfo.af_tcp.copied_seq;       /* Where we are at the moment */
 477         amount = 0;
 478
 479         /* Do until a push or until we are out of data. */
 480         do {
 481                 /* Found a hole so stops here. */
 482                 if (before(counted, TCP_SKB_CB(skb)->seq))      /* should not happen */
 483                         break;
 484
 485                 /* Length - header but start from where we are up to
 486                  * avoid overlaps.
 487                  */
 488                 sum = skb->len - (counted - TCP_SKB_CB(skb)->seq);
 489                 if (sum >= 0) {
 490                         /* Add it up, move on. */
 491                         amount += sum;
 492                         counted += sum;
 493                         if (skb->h.th->syn)
 494                                 counted++;
 495                 }
 496
 497                 /* Don't count urg data ... but do it in the right place!
 498                  * Consider: "old_data (ptr is here) URG PUSH data"
 499                  * The old code would stop at the first push because
 500                  * it counted the urg (amount==1) and then does amount--
 501                  * *after* the loop.  This means tcp_readable() always
 502                  * returned zero if any URG PUSH was in the queue, even
 503                  * though there was normal data available. If we subtract
 504                  * the urg data right here, we even get it to work for more
 505                  * than one URG PUSH skb without normal data.
 506                  * This means that poll() finally works now with urg data
 507                  * in the queue.  Note that rlogin was never affected
 508                  * because it doesn't use poll(); it uses two processes
 509                  * and a blocking read().  And the queue scan in tcp_read()
 510                  * was correct.  Mike <pall@rz.uni-karlsruhe.de>
 511                  */
 512
 513                 /* Don't count urg data. */
 514                 if (skb->h.th->urg)
 515                         amount--;
 516 #if 0
 517                 if (amount && skb->h.th->psh) break;
 518 #endif
 519                 skb = skb->next;
 520         } while(skb != (struct sk_buff *)&sk->receive_queue);
 521
 522         SOCK_DEBUG(sk, "got %lu bytes.\n",amount);
 523         return(amount);
 524 }
 525
 526 /*
 527  * LISTEN is a special case for poll..
 528  */
 529 static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
 530 {
 531         struct open_request *req, *dummy;
 532
 533         lock_sock(sk);
 534         req = tcp_find_established(&sk->tp_pinfo.af_tcp, &dummy);
 535         release_sock(sk);
 536         if (req)
 537                 return POLLIN | POLLRDNORM;
 538         return 0;
 539 }
 540
 541 /*
 542  *      Compute minimal free write space needed to queue new packets.
 543  */
 544 static inline int tcp_min_write_space(struct sock *sk, struct tcp_opt *tp)
 545 {
 546         int space;
 547 #if 1 /* This needs benchmarking and real world tests */
 548         space = max(tp->mss_cache + 128, MIN_WRITE_SPACE);
 549 #else /* 2.0 way */
 550         /* More than half of the socket queue free? */
 551         space = atomic_read(&sk->wmem_alloc) / 2;
 552 #endif
 553         return space;
 554 }
 555
 556 /*
 557  *      Wait for a TCP event.
 558  *
 559  *      Note that we don't need to lock the socket, as the upper poll layers
 560  *      take care of normal races (between the test and the event) and we don't
 561  *      go look at any of the socket buffers directly.
 562  */
 563 unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
 564 {
 565         unsigned int mask;
 566         struct sock *sk = sock->sk;
 567         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 568
 569         poll_wait(file, sk->sleep, wait);
 570         if (sk->state == TCP_LISTEN)
 571                 return tcp_listen_poll(sk, wait);
 572
 573         mask = 0;
 574         if (sk->err)
 575                 mask = POLLERR;
 576         /* Connected? */
 577         if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE)) {
 578                 if (sk->shutdown & RCV_SHUTDOWN)
 579                         mask |= POLLHUP;
 580
 581                 if ((tp->rcv_nxt != tp->copied_seq) &&
 582                     (tp->urg_seq != tp->copied_seq ||
 583                      tp->rcv_nxt != tp->copied_seq+1 ||
 584                      sk->urginline || !tp->urg_data))
 585                         mask |= POLLIN | POLLRDNORM;
 586
 587                 /* Always wake the user up when an error occurred */
 588                 if (sock_wspace(sk) >= tcp_min_write_space(sk, tp) || sk->err)
 589                         mask |= POLLOUT | POLLWRNORM;
 590                 if (tp->urg_data & URG_VALID)
 591                         mask |= POLLPRI;
 592         }
 593         return mask;
 594 }
 595
 596 /*
 597  *      Socket write_space callback.
 598  *      This (or rather the sock_wake_async) should agree with poll.
 599  */
 600 void tcp_write_space(struct sock *sk)
 601 {
 602         if (sk->dead)
 603                 return;
 604
 605         wake_up_interruptible(sk->sleep);
 606         if (sock_wspace(sk) >=
 607             tcp_min_write_space(sk, &(sk->tp_pinfo.af_tcp)))
 608                 sock_wake_async(sk->socket, 2);
 609 }
 610
 611
 612 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 613 {
 614         int answ;
 615
 616         switch(cmd) {
 617         case TIOCINQ:
 618 #ifdef FIXME    /* FIXME: */
 619         case FIONREAD:
 620 #endif
 621                 if (sk->state == TCP_LISTEN)
 622                         return(-EINVAL);
 623                 lock_sock(sk);
 624                 answ = tcp_readable(sk);
 625                 release_sock(sk);
 626                 break;
 627         case SIOCATMARK:
 628                 {
 629                         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 630                         answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
 631                         break;
 632                 }
 633         case TIOCOUTQ:
 634                 if (sk->state == TCP_LISTEN)
 635                         return(-EINVAL);
 636                 answ = sock_wspace(sk);
 637                 break;
 638         default:
 639                 return(-ENOIOCTLCMD);
 640         };
 641
 642         return put_user(answ, (int *)arg);
 643 }
 644
 645 /*
 646  *      Wait for a socket to get into the connected state
 647  *
 648  *      Note: must be called with the socket locked.
 649  */
 650 static int wait_for_tcp_connect(struct sock * sk, int flags)
 651 {
 652         struct task_struct *tsk = current;
 653         struct wait_queue wait = { tsk, NULL };
 654
 655         while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
 656                 if(sk->err)
 657                         return sock_error(sk);
 658                 if((1 << sk->state) &
 659                    ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 660                         if(sk->keepopen && !(flags&MSG_NOSIGNAL))
 661                                 send_sig(SIGPIPE, tsk, 0);
 662                         return -EPIPE;
 663                 }
 664                 if(flags & MSG_DONTWAIT)
 665                         return -EAGAIN;
 666                 if(signal_pending(tsk))
 667                         return -ERESTARTSYS;
 668
 669                 tsk->state = TASK_INTERRUPTIBLE;
 670                 add_wait_queue(sk->sleep, &wait);
 671                 release_sock(sk);
 672
 673                 if (((1 << sk->state) & ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT)) &&
 674                     sk->err == 0)
 675                         schedule();
 676
 677                 tsk->state = TASK_RUNNING;
 678                 remove_wait_queue(sk->sleep, &wait);
 679                 lock_sock(sk);
 680         }
 681         return 0;
 682 }
 683
 684 static inline int tcp_memory_free(struct sock *sk)
 685 {
 686         return atomic_read(&sk->wmem_alloc) < sk->sndbuf;
 687 }
 688
 689 /*
 690  *      Wait for more memory for a socket
 691  */
 692 static void wait_for_tcp_memory(struct sock * sk)
 693 {
 694         release_sock(sk);
 695         if (!tcp_memory_free(sk)) {
 696                 struct wait_queue wait = { current, NULL };
 697
 698                 sk->socket->flags &= ~SO_NOSPACE;
 699                 add_wait_queue(sk->sleep, &wait);
 700                 for (;;) {
 701                         if (signal_pending(current))
 702                                 break;
 703                         current->state = TASK_INTERRUPTIBLE;
 704                         if (tcp_memory_free(sk))
 705                                 break;
 706                         if (sk->shutdown & SEND_SHUTDOWN)
 707                                 break;
 708                         if (sk->err)
 709                                 break;
 710                         schedule();
 711                 }
 712                 current->state = TASK_RUNNING;
 713                 remove_wait_queue(sk->sleep, &wait);
 714         }
 715         lock_sock(sk);
 716 }
 717
 718 /*
 719  *      This routine copies from a user buffer into a socket,
 720  *      and starts the transmit system.
 721  *
 722  *      Note: must be called with the socket locked.
 723  */
 724
 725 int tcp_do_sendmsg(struct sock *sk, int iovlen, struct iovec *iov, int flags)
 726 {
 727         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 728         int mss_now;
 729         int err = 0;
 730         int copied  = 0;
 731
 732         /* Verify that the socket is locked */
 733         if (!atomic_read(&sk->sock_readers))
 734                 printk("tcp_do_sendmsg: socket not locked!\n");
 735
 736         /* Wait for a connection to finish. */
 737         if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 738                 if((err = wait_for_tcp_connect(sk, flags)) != 0)
 739                         return err;
 740
 741         mss_now = tcp_current_mss(sk);
 742
 743         /* Ok commence sending. */
 744         while(--iovlen >= 0) {
 745                 int seglen=iov->iov_len;
 746                 unsigned char * from=iov->iov_base;
 747
 748                 iov++;
 749
 750                 while(seglen > 0) {
 751                         int copy, tmp, queue_it;
 752                         struct sk_buff *skb;
 753
 754                         if (err)
 755                                 return -EFAULT;
 756
 757                         /* Stop on errors. */
 758                         if (sk->err)
 759                                 goto do_sock_err;
 760
 761                         /* Make sure that we are established. */
 762                         if (sk->shutdown & SEND_SHUTDOWN)
 763                                 goto do_shutdown;
 764
 765                         /* Now we need to check if we have a half
 766                          * built packet we can tack some data onto.
 767                          */
 768                         if (tp->send_head && !(flags & MSG_OOB)) {
 769                                 skb = sk->write_queue.prev;
 770                                 copy = skb->len;
 771                                 /* If the remote does SWS avoidance we should
 772                                  * queue the best we can if not we should in
 773                                  * fact send multiple packets...
 774                                  * A method for detecting this would be most
 775                                  * welcome.
 776                                  */
 777                                 if (skb_tailroom(skb) > 0 &&
 778                                     (mss_now - copy) > 0 &&
 779                                     tp->snd_nxt < TCP_SKB_CB(skb)->end_seq) {
 780                                         int last_byte_was_odd = (copy % 4);
 781
 782                                         copy = mss_now - copy;
 783                                         if(copy > skb_tailroom(skb))
 784                                                 copy = skb_tailroom(skb);
 785                                         if(copy > seglen)
 786                                                 copy = seglen;
 787                                         if(last_byte_was_odd) {
 788                                                 if(copy_from_user(skb_put(skb, copy),
 789                                                                   from, copy))
 790                                                         err = -EFAULT;
 791                                                 skb->csum = csum_partial(skb->data,
 792                                                                          skb->len, 0);
 793                                         } else {
 794                                                 skb->csum =
 795                                                         csum_and_copy_from_user(
 796                                                         from, skb_put(skb, copy),
 797                                                         copy, skb->csum, &err);
 798                                         }
 799                                         tp->write_seq += copy;
 800                                         TCP_SKB_CB(skb)->end_seq += copy;
 801                                         from += copy;
 802                                         copied += copy;
 803                                         seglen -= copy;
 804                                         if(!seglen && !iovlen)
 805                                                 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 806                                         continue;
 807                                 }
 808                         }
 809
 810                         /* We also need to worry about the window.  If
 811                          * window < 1/2 the maximum window we've seen
 812                          * from this host, don't use it.  This is
 813                          * sender side silly window prevention, as
 814                          * specified in RFC1122.  (Note that this is
 815                          * different than earlier versions of SWS
 816                          * prevention, e.g. RFC813.).  What we
 817                          * actually do is use the whole MSS.  Since
 818                          * the results in the right edge of the packet
 819                          * being outside the window, it will be queued
 820                          * for later rather than sent.
 821                          */
 822                         copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
 823                         if(copy >= (tp->max_window >> 1))
 824                                 copy = min(copy, mss_now);
 825                         else
 826                                 copy = mss_now;
 827                         if(copy > seglen)
 828                                 copy = seglen;
 829
 830                         tmp = MAX_HEADER + sk->prot->max_header;
 831                         queue_it = 0;
 832                         if (copy < min(mss_now, tp->max_window >> 1) &&
 833                             !(flags & MSG_OOB)) {
 834                                 tmp += min(mss_now, tp->max_window);
 835
 836                                 /* What is happening here is that we want to
 837                                  * tack on later members of the users iovec
 838                                  * if possible into a single frame.  When we
 839                                  * leave this loop our caller checks to see if
 840                                  * we can send queued frames onto the wire.
 841                                  * See tcp_v[46]_sendmsg() for this.
 842                                  */
 843                                 queue_it = 1;
 844                         } else {
 845                                 tmp += copy;
 846                         }
 847                         skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL);
 848
 849                         /* If we didn't get any memory, we need to sleep. */
 850                         if (skb == NULL) {
 851                                 sk->socket->flags |= SO_NOSPACE;
 852                                 if (flags&MSG_DONTWAIT) {
 853                                         err = -EAGAIN;
 854                                         goto do_interrupted;
 855                                 }
 856                                 if (signal_pending(current)) {
 857                                         err = -ERESTARTSYS;
 858                                         goto do_interrupted;
 859                                 }
 860                                 wait_for_tcp_memory(sk);
 861
 862                                 /* If SACK's were formed or PMTU events happened,
 863                                  * we must find out about it.
 864                                  */
 865                                 mss_now = tcp_current_mss(sk);
 866                                 continue;
 867                         }
 868
 869                         seglen -= copy;
 870
 871                         /* Prepare control bits for TCP header creation engine. */
 872                         TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK |
 873                                                   ((!seglen && !iovlen) ?
 874                                                    TCPCB_FLAG_PSH : 0));
 875                         TCP_SKB_CB(skb)->sacked = 0;
 876                         if (flags & MSG_OOB) {
 877                                 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_URG;
 878                                 TCP_SKB_CB(skb)->urg_ptr = copy;
 879                         } else
 880                                 TCP_SKB_CB(skb)->urg_ptr = 0;
 881
 882                         /* TCP data bytes are SKB_PUT() on top, later
 883                          * TCP+IP+DEV headers are SKB_PUSH()'d beneath.
 884                          * Reserve header space and checksum the data.
 885                          */
 886                         skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
 887                         skb->csum = csum_and_copy_from_user(from,
 888                                         skb_put(skb, copy), copy, 0, &err);
 889
 890                         from += copy;
 891                         copied += copy;
 892
 893                         TCP_SKB_CB(skb)->seq = tp->write_seq;
 894                         TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy;
 895
 896                         /* This advances tp->write_seq for us. */
 897                         tcp_send_skb(sk, skb, queue_it);
 898                 }
 899         }
 900         sk->err = 0;
 901         if (err)
 902                 return -EFAULT;
 903         return copied;
 904
 905 do_sock_err:
 906         if(copied)
 907                 return copied;
 908         return sock_error(sk);
 909 do_shutdown:
 910         if(copied)
 911                 return copied;
 912         if (!(flags&MSG_NOSIGNAL))
 913                 send_sig(SIGPIPE, current, 0);
 914         return -EPIPE;
 915 do_interrupted:
 916         if(copied)
 917                 return copied;
 918         return err;
 919 }
 920
 921 /*
 922  *      Send an ack if one is backlogged at this point. Ought to merge
 923  *      this with tcp_send_ack().
 924  *      This is called for delayed acks also.
 925  */
 926
 927 void tcp_read_wakeup(struct sock *sk)
 928 {
 929         /* If we're closed, don't send an ack, or we'll get a RST
 930          * from the closed destination.
 931          */
 932         if (sk->state != TCP_CLOSE)
 933                 tcp_send_ack(sk);
 934 }
 935
 936 /*
 937  *      Handle reading urgent data. BSD has very simple semantics for
 938  *      this, no blocking and very strange errors 8)
 939  */
 940
 941 static int tcp_recv_urg(struct sock * sk, int nonblock,
 942                         struct msghdr *msg, int len, int flags,
 943                         int *addr_len)
 944 {
 945         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 946
 947         /* No URG data to read. */
 948         if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ)
 949                 return -EINVAL; /* Yes this is right ! */
 950
 951         if (sk->err)
 952                 return sock_error(sk);
 953
 954         if (sk->done)
 955                 return -ENOTCONN;
 956
 957         if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) {
 958                 sk->done = 1;
 959                 return 0;
 960         }
 961
 962         lock_sock(sk);
 963         if (tp->urg_data & URG_VALID) {
 964                 int err = 0;
 965                 char c = tp->urg_data;
 966
 967                 if (!(flags & MSG_PEEK))
 968                         tp->urg_data = URG_READ;
 969
 970                 if(msg->msg_name)
 971                         tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
 972                                                        msg->msg_name);
 973
 974                 if(addr_len)
 975                         *addr_len = tp->af_specific->sockaddr_len;
 976
 977                 /* Read urgent data. */
 978                 msg->msg_flags|=MSG_OOB;
 979                 release_sock(sk);
 980
 981                 if(len>0)
 982                 {
 983                         err = memcpy_toiovec(msg->msg_iov, &c, 1);
 984                         /* N.B. already set above ... */
 985                         msg->msg_flags|=MSG_OOB;
 986                 }
 987                 else
 988                         msg->msg_flags|=MSG_TRUNC;
 989
 990                 /* N.B. Is this right?? If len == 0 we didn't read any data */
 991                 return err ? -EFAULT : 1;
 992         }
 993         release_sock(sk);
 994
 995         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
 996          * the available implementations agree in this case:
 997          * this call should never block, independent of the
 998          * blocking state of the socket.
 999          * Mike <pall@rz.uni-karlsruhe.de>
1000          */
1001         return -EAGAIN;
1002 }
1003
1004 /*
1005  *      Release a skb if it is no longer needed. This routine
1006  *      must be called with interrupts disabled or with the
1007  *      socket locked so that the sk_buff queue operation is ok.
1008  */
1009
1010 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
1011 {
1012         __skb_unlink(skb, &sk->receive_queue);
1013         kfree_skb(skb);
1014 }
1015
1016 /* Clean up the receive buffer for full frames taken by the user,
1017  * then send an ACK if necessary.  COPIED is the number of bytes
1018  * tcp_recvmsg has given to the user so far, it speeds up the
1019  * calculation of whether or not we must ACK for the sake of
1020  * a window update.
1021  */
1022 static void cleanup_rbuf(struct sock *sk, int copied)
1023 {
1024         struct sk_buff *skb;
1025
1026         /* NOTE! The socket must be locked, so that we don't get
1027          * a messed-up receive queue.
1028          */
1029         while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
1030                 if (!skb->used || atomic_read(&skb->users) > 1)
1031                         break;
1032                 tcp_eat_skb(sk, skb);
1033         }
1034
1035         SOCK_DEBUG(sk, "sk->rspace = %lu\n", sock_rspace(sk));
1036
1037         /* We send an ACK if we can now advertise a non-zero window
1038          * which has been raised "significantly".
1039          */
1040         if(copied > 0) {
1041                 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1042                 __u32 rcv_window_now = tcp_receive_window(tp);
1043
1044                 /* We won't be raising the window any further than
1045                  * the window-clamp allows.  Our window selection
1046                  * also keeps things a nice multiple of MSS.  These
1047                  * checks are necessary to prevent spurious ACKs
1048                  * which don't advertize a larger window.
1049                  */
1050                 if((copied >= rcv_window_now) &&
1051                    ((rcv_window_now + tp->mss_cache) <= tp->window_clamp))
1052                         tcp_read_wakeup(sk);
1053         }
1054 }
1055
1056
/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	sk	  socket to read from
 *	msg	  message header; data goes to msg_iov, the peer address
 *		  to msg_name if set and something was copied
 *	len	  maximum number of bytes to copy
 *	nonblock  non-zero: return -EAGAIN instead of sleeping
 *	flags	  MSG_OOB (handled by tcp_recv_urg), MSG_PEEK (read via a
 *		  private copy of the sequence pointer so copied_seq is
 *		  not advanced), MSG_WAITALL (try to read all len bytes)
 *	addr_len  if non-NULL, receives the socket address length
 *
 *	Returns the number of bytes copied (possibly 0), or a negative
 *	error: -ENOTCONN, -ERESTARTSYS, -EAGAIN, -EFAULT or the pending
 *	socket error.
 */

int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
		int len, int nonblock, int flags, int *addr_len)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct wait_queue wait = { current, NULL };
	int copied = 0;
	u32 peek_seq;
	volatile u32 *seq;	/* So gcc doesn't overoptimise */
	unsigned long used;
	int err = 0;
	int target = 1;		/* Read at least this many bytes */

	if (sk->state == TCP_LISTEN)
		return -ENOTCONN;

	/* Urgent data needs to be handled specially. */
	if (flags & MSG_OOB)
		return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);

	/*	Copying sequence to update. This is volatile to handle
	 *	the multi-reader case neatly (memcpy_to/fromfs might be
	 *	inline and thus not flush cached variables otherwise).
	 *	With MSG_PEEK we advance a local copy instead, leaving
	 *	tp->copied_seq untouched.
	 */
	peek_seq = tp->copied_seq;
	seq = &tp->copied_seq;
	if (flags & MSG_PEEK)
		seq = &peek_seq;

	/* Handle the POSIX bogosity MSG_WAITALL. */
	if (flags & MSG_WAITALL)
		target=len;

	add_wait_queue(sk->sleep, &wait);
	lock_sock(sk);

	/*
	 *	BUG BUG BUG
	 *	This violates 1003.1g compliance. We must wait for
	 *	data to exist even if we read none!
	 */

	while (len > 0) {
		struct sk_buff * skb;
		u32 offset;

		/* Are we at urgent data? Stop if we have read anything. */
		if (copied && tp->urg_data && tp->urg_seq == *seq)
			break;

		/* We need to check signals first, to get correct SIGURG
		 * handling. FIXME: Need to check this doesn't impact 1003.1g
		 * and move it down to the bottom of the loop
		 */
		if (signal_pending(current)) {
			if (copied)
				break;
			copied = -ERESTARTSYS;
			if (nonblock)
				copied = -EAGAIN;
			break;
		}

		/* Next get a buffer.  The task state is set before the
		 * queue is scanned so that a wakeup arriving after the
		 * scan is not lost when we call schedule() below.
		 */
		current->state = TASK_INTERRUPTIBLE;

		skb = skb_peek(&sk->receive_queue);
		do {
			if (!skb)
				break;

			/* Now that we have two receive queues this
			 * shouldn't happen.
			 */
			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
				printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
				       *seq, TCP_SKB_CB(skb)->seq);
				break;
			}
			/* Offset of the read pointer inside this buffer;
			 * a SYN occupies one sequence number but no data.
			 */
			offset = *seq - TCP_SKB_CB(skb)->seq;
			if (skb->h.th->syn)
				offset--;
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			/* Fully consumed: mark it so cleanup_rbuf() can
			 * free it, unless we are only peeking.
			 */
			if (!(flags & MSG_PEEK))
				skb->used = 1;
			skb = skb->next;
		} while (skb != (struct sk_buff *)&sk->receive_queue);

		/* No usable data queued: decide whether to return or sleep. */
		if (copied >= target)
			break;

		if (sk->err && !(flags&MSG_PEEK)) {
			copied = sock_error(sk);
			break;
		}

		if (sk->shutdown & RCV_SHUTDOWN) {
			sk->done = 1;
			break;
		}

		if (sk->state == TCP_CLOSE) {
			/* The first read on a closed socket returns what
			 * was copied so far; later ones get -ENOTCONN.
			 */
			if (!sk->done) {
				sk->done = 1;
				break;
			}
			copied = -ENOTCONN;
			break;
		}

		if (nonblock) {
			copied = -EAGAIN;
			break;
		}

		/* Free what we have consumed, drop the lock and sleep
		 * until woken, then rescan the queue.
		 */
		cleanup_rbuf(sk, copied);
		release_sock(sk);
		sk->socket->flags |= SO_WAITDATA;
		schedule();
		sk->socket->flags &= ~SO_WAITDATA;
		lock_sock(sk);
		continue;

	found_ok_skb:
		/*	Lock the buffer. We can be fairly relaxed as
		 *	an interrupt will never steal a buffer we are
		 *	using unless I've missed something serious in
		 *	tcp_data.
		 */
		atomic_inc(&skb->users);

		/* Ok so how much can we use? */
		used = skb->len - offset;
		if (len < used)
			used = len;

		/* Do we have urgent data here? */
		if (tp->urg_data) {
			u32 urg_offset = tp->urg_seq - *seq;
			if (urg_offset < used) {
				if (!urg_offset) {
					/* We are sitting on the urgent byte:
					 * skip it unless it is delivered
					 * inline.
					 */
					if (!sk->urginline) {
						++*seq;
						offset++;
						used--;
					}
				} else
					/* Stop just short of the urgent byte. */
					used = urg_offset;
			}
		}

		/*	Copy it - We _MUST_ update *seq first so that we
		 *	don't ever double read when we have dual readers
		 */
		*seq += used;

		/*	This memcpy_toiovec can sleep. If it sleeps and we
		 *	do a second read it relies on the skb->users to avoid
		 *	a crash when cleanup_rbuf() gets called.
		 */
		err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used);
		if (err) {
			/* Exception. Bailout! */
			atomic_dec(&skb->users);
			copied = -EFAULT;
			break;
		}

		copied += used;
		len -= used;

		/*	We now will not sleep again until we are finished
		 *	with skb. Sorry if you are doing the SMP port
		 *	but you'll just have to fix it neatly ;)
		 */
		atomic_dec(&skb->users);

		/* Once the read pointer has passed the urgent byte the
		 * urgent state is cleared.
		 */
		if (after(tp->copied_seq,tp->urg_seq))
			tp->urg_data = 0;
		if (used + offset < skb->len)
			continue;

		/*	Process the FIN. We may also need to handle PSH
		 *	here and make it break out of MSG_WAITALL.
		 */
		if (skb->h.th->fin)
			goto found_fin_ok;
		if (flags & MSG_PEEK)
			continue;
		skb->used = 1;
		if (atomic_read(&skb->users) == 1)
			tcp_eat_skb(sk, skb);
		continue;

	found_fin_ok:
		/* The FIN occupies one sequence number. */
		++*seq;
		if (flags & MSG_PEEK)
			break;

		/* All is done. */
		skb->used = 1;
		sk->shutdown |= RCV_SHUTDOWN;
		break;
	}

	if(copied > 0 && msg->msg_name)
		tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
					       msg->msg_name);

	if(addr_len)
		*addr_len = tp->af_specific->sockaddr_len;

	remove_wait_queue(sk->sleep, &wait);
	current->state = TASK_RUNNING;

	/* Clean up data we have read: This will do ACK frames. */
	cleanup_rbuf(sk, copied);
	release_sock(sk);
	return copied;
}
1283
1284 /*
1285  * Check whether to renew the timer.
1286  */
1287 static inline void tcp_check_fin_timer(struct sock *sk)
1288 {
1289         if (sk->state == TCP_FIN_WAIT2 && !sk->timer.prev)
1290                 tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
1291 }
1292
/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states. A shutdown() may have already sent the FIN, or we may be
 *	closed.
 *
 *	The table is indexed by the current sk->state.  Each entry holds
 *	the state to move to, optionally OR-ed with TCP_ACTION_FIN when a
 *	FIN has to be sent as part of the transition; tcp_close_state()
 *	separates the two with TCP_STATE_MASK.  Only the first twelve of
 *	the sixteen slots are given explicitly.
 */

static unsigned char new_state[16] = {
  /* current state:        new state:      action:	*/
  /* (Invalid)		*/ TCP_CLOSE,
  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_SYN_SENT	*/ TCP_CLOSE,
  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
  /* TCP_TIME_WAIT	*/ TCP_CLOSE,
  /* TCP_CLOSE		*/ TCP_CLOSE,
  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
  /* TCP_LISTEN		*/ TCP_CLOSE,
  /* TCP_CLOSING	*/ TCP_CLOSING,
};
1315
1316 static int tcp_close_state(struct sock *sk, int dead)
1317 {
1318         int next = (int) new_state[sk->state];
1319         int ns = (next & TCP_STATE_MASK);
1320
1321         tcp_set_state(sk, ns);
1322
1323         /*      This is a (useful) BSD violating of the RFC. There is a
1324          *      problem with TCP as specified in that the other end could
1325          *      keep a socket open forever with no application left this end.
1326          *      We use a 3 minute timeout (about the same as BSD) then kill
1327          *      our end. If they send after that then tough - BUT: long enough
1328          *      that we won't make the old 4*rto = almost no time - whoops
1329          *      reset mistake.
1330          */
1331         if (dead)
1332                 tcp_check_fin_timer(sk);
1333
1334         return (next & TCP_ACTION_FIN);
1335 }
1336
1337 /*
1338  *      Shutdown the sending side of a connection. Much like close except
1339  *      that we don't receive shut down or set sk->dead.
1340  */
1341
1342 void tcp_shutdown(struct sock *sk, int how)
1343 {
1344         /*      We need to grab some memory, and put together a FIN,
1345          *      and then put it into the queue to be sent.
1346          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1347          */
1348         if (!(how & SEND_SHUTDOWN))
1349                 return;
1350
1351         /* If we've already sent a FIN, or it's a closed state, skip this. */
1352         if ((1 << sk->state) &
1353             (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
1354                 lock_sock(sk);
1355
1356                 /* Flag that the sender has shutdown. */
1357                 sk->shutdown |= SEND_SHUTDOWN;
1358
1359                 /* Clear out any half completed packets.  FIN if needed. */
1360                 if (tcp_close_state(sk,0))
1361                         tcp_send_fin(sk);
1362
1363                 release_sock(sk);
1364         }
1365 }
1366
1367
1368 /*
1369  *      Return 1 if we still have things to send in our buffers.
1370  */
1371
1372 static inline int closing(struct sock * sk)
1373 {
1374         return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
1375 }
1376
1377 /*
1378  *      This routine closes sockets which have been at least partially
1379  *      opened, but not yet accepted. Currently it is only called by
1380  *      tcp_close, and timeout mirrors the value there.
1381  */
1382
1383 static void tcp_close_pending (struct sock *sk)
1384 {
1385         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1386         struct open_request *req = tp->syn_wait_queue;
1387
1388         while(req) {
1389                 struct open_request *iter;
1390
1391                 if (req->sk)
1392                         tcp_close(req->sk, 0);
1393
1394                 iter = req;
1395                 req = req->dl_next;
1396
1397                 (*iter->class->destructor)(iter);
1398                 tcp_dec_slow_timer(TCP_SLT_SYNACK);
1399                 sk->ack_backlog--;
1400                 tcp_openreq_free(iter);
1401         }
1402
1403         tcp_synq_init(tp);
1404 }
1405
1406 void tcp_close(struct sock *sk, unsigned long timeout)
1407 {
1408         struct sk_buff *skb;
1409         int data_was_unread = 0;
1410
1411         /*
1412          * Check whether the socket is locked ... supposedly
1413          * it's impossible to tcp_close() a locked socket.
1414          */
1415         if (atomic_read(&sk->sock_readers))
1416                 printk("tcp_close: socket already locked!\n");
1417
1418         /* We need to grab some memory, and put together a FIN,
1419          * and then put it into the queue to be sent.
1420          */
1421         lock_sock(sk);
1422         if(sk->state == TCP_LISTEN) {
1423                 /* Special case. */
1424                 tcp_set_state(sk, TCP_CLOSE);
1425                 tcp_close_pending(sk);
1426                 release_sock(sk);
1427                 sk->dead = 1;
1428                 return;
1429         }
1430
1431         /* It is questionable, what the role of this is now.
1432          * In any event either it should be removed, or
1433          * increment of SLT_KEEPALIVE be done, this is causing
1434          * big problems.  For now I comment it out.  -DaveM
1435          */
1436         /* sk->keepopen = 1; */
1437         sk->shutdown = SHUTDOWN_MASK;
1438
1439         if (!sk->dead)
1440                 sk->state_change(sk);
1441
1442         /*  We need to flush the recv. buffs.  We do this only on the
1443          *  descriptor close, not protocol-sourced closes, because the
1444          *  reader process may not have drained the data yet!
1445          */
1446         while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
1447                 data_was_unread++;
1448                 kfree_skb(skb);
1449         }
1450
1451         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1452          * 3.10, we send a RST here because data was lost.  To
1453          * witness the awful effects of the old behavior of always
1454          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1455          * a bulk GET in an FTP client, suspend the process, wait
1456          * for the client to advertise a zero window, then kill -9
1457          * the FTP client, wheee...  Note: timeout is always zero
1458          * in such a case.
1459          */
1460         if(data_was_unread != 0) {
1461                 /* Unread data was tossed, zap the connection. */
1462                 tcp_set_state(sk, TCP_CLOSE);
1463                 tcp_send_active_reset(sk);
1464         } else if (tcp_close_state(sk,1)) {
1465                 /* We FIN if the application ate all the data before
1466                  * zapping the connection.
1467                  */
1468                 tcp_send_fin(sk);
1469         }
1470
1471         if (timeout) {
1472                 struct task_struct *tsk = current;
1473                 struct wait_queue wait = { tsk, NULL };
1474
1475                 tsk->timeout = timeout;
1476                 add_wait_queue(sk->sleep, &wait);
1477                 release_sock(sk);
1478
1479                 while (1) {
1480                         tsk->state = TASK_INTERRUPTIBLE;
1481                         if (!closing(sk))
1482                                 break;
1483                         schedule();
1484                         if (signal_pending(tsk) || !tsk->timeout)
1485                                 break;
1486                 }
1487
1488                 tsk->timeout=0;
1489                 tsk->state = TASK_RUNNING;
1490                 remove_wait_queue(sk->sleep, &wait);
1491
1492                 lock_sock(sk);
1493         }
1494
1495         /* Now that the socket is dead, if we are in the FIN_WAIT2 state
1496          * we may need to set up a timer.
1497          */
1498         tcp_check_fin_timer(sk);
1499
1500         sk->dead = 1;
1501         release_sock(sk);
1502 }
1503
1504 /*
1505  *      Wait for an incoming connection, avoid race
1506  *      conditions. This must be called with the socket locked.
1507  */
1508 static struct open_request * wait_for_connect(struct sock * sk,
1509                                               struct open_request **pprev)
1510 {
1511         struct wait_queue wait = { current, NULL };
1512         struct open_request *req;
1513
1514         add_wait_queue(sk->sleep, &wait);
1515         for (;;) {
1516                 current->state = TASK_INTERRUPTIBLE;
1517                 release_sock(sk);
1518                 schedule();
1519                 lock_sock(sk);
1520                 req = tcp_find_established(&(sk->tp_pinfo.af_tcp), pprev);
1521                 if (req)
1522                         break;
1523                 if (signal_pending(current))
1524                         break;
1525         }
1526         current->state = TASK_RUNNING;
1527         remove_wait_queue(sk->sleep, &wait);
1528         return req;
1529 }
1530
1531 /*
1532  *      This will accept the next outstanding connection.
1533  *
1534  *      Be careful about race conditions here - this is subtle.
1535  */
1536
1537 struct sock *tcp_accept(struct sock *sk, int flags)
1538 {
1539         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1540         struct open_request *req, *prev;
1541         struct sock *newsk = NULL;
1542         int error;
1543
1544         lock_sock(sk);
1545
1546         /* We need to make sure that this socket is listening,
1547          * and that it has something pending.
1548          */
1549         error = EINVAL;
1550         if (sk->state != TCP_LISTEN)
1551                 goto out;
1552
1553         /* Find already established connection */
1554         req = tcp_find_established(tp, &prev);
1555         if (!req) {
1556                 /* If this is a non blocking socket don't sleep */
1557                 error = EAGAIN;
1558                 if (flags & O_NONBLOCK)
1559                         goto out;
1560
1561                 error = ERESTARTSYS;
1562                 req = wait_for_connect(sk, &prev);
1563                 if (!req)
1564                         goto out;
1565         }
1566
1567         tcp_synq_unlink(tp, req, prev);
1568         newsk = req->sk;
1569         req->class->destructor(req);
1570         tcp_openreq_free(req);
1571         sk->ack_backlog--;
1572
1573         /*
1574          * This does not pass any already set errors on the new socket
1575          * to the user, but they will be returned on the first socket operation
1576          * after the accept.
1577          */
1578
1579         error = 0;
1580 out:
1581         release_sock(sk);
1582         sk->err = error;
1583         return newsk;
1584 }
1585
1586 /*
1587  *      Socket option code for TCP.
1588  */
1589
1590 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
1591                    int optlen)
1592 {
1593         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1594         int val;
1595
1596         if (level != SOL_TCP)
1597                 return tp->af_specific->setsockopt(sk, level, optname,
1598                                                    optval, optlen);
1599
1600         if(optlen<sizeof(int))
1601                 return -EINVAL;
1602
1603         if (get_user(val, (int *)optval))
1604                 return -EFAULT;
1605
1606         switch(optname) {
1607                 case TCP_MAXSEG:
1608 /* values greater than interface MTU won't take effect.  however at
1609  * the point when this call is done we typically don't yet know
1610  * which interface is going to be used
1611  */
1612                         if(val<1||val>MAX_WINDOW)
1613                                 return -EINVAL;
1614                         tp->user_mss=val;
1615                         return 0;
1616                 case TCP_NODELAY:
1617                         sk->nonagle=(val==0)?0:1;
1618                         return 0;
1619                 default:
1620                         return(-ENOPROTOOPT);
1621         };
1622 }
1623
1624 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
1625                    int *optlen)
1626 {
1627         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1628         int val;
1629         int len;
1630
1631         if(level != SOL_TCP)
1632                 return tp->af_specific->getsockopt(sk, level, optname,
1633                                                    optval, optlen);
1634
1635         if(get_user(len,optlen))
1636                 return -EFAULT;
1637
1638         len = min(len,sizeof(int));
1639
1640         switch(optname) {
1641                 case TCP_MAXSEG:
1642                         val=tp->user_mss;
1643                         break;
1644                 case TCP_NODELAY:
1645                         val=sk->nonagle;
1646                         break;
1647                 default:
1648                         return(-ENOPROTOOPT);
1649         };
1650
1651         if(put_user(len, optlen))
1652                 return -EFAULT;
1653         if(copy_to_user(optval, &val,len))
1654                 return -EFAULT;
1655         return 0;
1656 }
1657
1658 void tcp_set_keepalive(struct sock *sk, int val)
1659 {
1660         if (!sk->keepopen && val)
1661                 tcp_inc_slow_timer(TCP_SLT_KEEPALIVE);
1662         else if (sk->keepopen && !val)
1663                 tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
1664 }
1665
1666 extern void __skb_cb_too_small_for_tcp(int, int);
1667
1668 void __init tcp_init(void)
1669 {
1670         struct sk_buff *skb = NULL;
1671
1672         if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
1673                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
1674                                            sizeof(skb->cb));
1675
1676         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
1677                                                    sizeof(struct open_request),
1678                                                0, SLAB_HWCACHE_ALIGN,
1679                                                NULL, NULL);
1680         if(!tcp_openreq_cachep)
1681                 panic("tcp_init: Cannot alloc open_request cache.");
1682
1683         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
1684                                               sizeof(struct tcp_bind_bucket),
1685                                               0, SLAB_HWCACHE_ALIGN,
1686                                               NULL, NULL);
1687         if(!tcp_bucket_cachep)
1688                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
1689
1690         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
1691                                                 sizeof(struct tcp_tw_bucket),
1692                                                 0, SLAB_HWCACHE_ALIGN,
1693                                                 NULL, NULL);
1694         if(!tcp_timewait_cachep)
1695                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
1696 }