net/ipv4/tcp.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp.c,v 1.151 1999/09/07 02:31:21 davem Exp $
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while
  26  *                                      sk->inuse=1 and was trying to connect
  27  *                                      (tcp_err()).
  28  *              Alan Cox        :       All icmp error handling was broken
  29  *                                      pointers passed where wrong and the
  30  *                                      socket was looked up backwards. Nobody
  31  *                                      tested any icmp error code obviously.
  32  *              Alan Cox        :       tcp_err() now handled properly. It
  33  *                                      wakes people on errors. poll
  34  *                                      behaves and the icmp error race
  35  *                                      has gone by moving it into sock.c
  36  *              Alan Cox        :       tcp_send_reset() fixed to work for
  37  *                                      everything not just packets for
  38  *                                      unknown sockets.
  39  *              Alan Cox        :       tcp option processing.
  40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
  41  *                                      syn rule wrong]
  42  *              Herp Rosmanith  :       More reset fixes
  43  *              Alan Cox        :       No longer acks invalid rst frames.
  44  *                                      Acking any kind of RST is right out.
  45  *              Alan Cox        :       Sets an ignore me flag on an rst
  46  *                                      receive otherwise odd bits of prattle
  47  *                                      escape still
  48  *              Alan Cox        :       Fixed another acking RST frame bug.
  49  *                                      Should stop LAN workplace lockups.
  50  *              Alan Cox        :       Some tidyups using the new skb list
  51  *                                      facilities
  52  *              Alan Cox        :       sk->keepopen now seems to work
  53  *              Alan Cox        :       Pulls options out correctly on accepts
  54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56  *                                      bit to skb ops.
  57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
  58  *                                      nasty.
  59  *              Alan Cox        :       Added some better commenting, as the
  60  *                                      tcp is hard to follow
  61  *              Alan Cox        :       Removed incorrect check for 20 * psh
  62  *      Michael O'Reilly        :       ack < copied bug fix.
  63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64  *              Alan Cox        :       FIN with no memory -> CRASH
  65  *              Alan Cox        :       Added socket option proto entries.
  66  *                                      Also added awareness of them to accept.
  67  *              Alan Cox        :       Added TCP options (SOL_TCP)
  68  *              Alan Cox        :       Switched wakeup calls to callbacks,
  69  *                                      so the kernel can layer network
  70  *                                      sockets.
  71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73  *              Alan Cox        :       RST frames sent on unsynchronised
  74  *                                      state ack error.
  75  *              Alan Cox        :       Put in missing check for SYN bit.
  76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
  77  *                                      window non shrink trick.
  78  *              Alan Cox        :       Added a couple of small NET2E timer
  79  *                                      fixes
  80  *              Charles Hedrick :       TCP fixes
  81  *              Toomas Tamm     :       TCP window fixes
  82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83  *              Charles Hedrick :       Rewrote most of it to actually work
  84  *              Linus           :       Rewrote tcp_read() and URG handling
  85  *                                      completely
  86  *              Gerhard Koerting:       Fixed some missing timer handling
  87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88  *              Gerhard Koerting:       PC/TCP workarounds
  89  *              Adam Caldwell   :       Assorted timer/timing errors
  90  *              Matthew Dillon  :       Fixed another RST bug
  91  *              Alan Cox        :       Move to kernel side addressing changes.
  92  *              Alan Cox        :       Beginning work on TCP fastpathing
  93  *                                      (not yet usable)
  94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95  *              Alan Cox        :       TCP fast path debugging
  96  *              Alan Cox        :       Window clamping
  97  *              Michael Riepe   :       Bug in tcp_check()
  98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  99  *              Matt Dillon     :       Yet more small nasties remove from the
 100  *                                      TCP code (Be very nice to this man if
 101  *                                      tcp finally works 100%) 8)
 102  *              Alan Cox        :       BSD accept semantics.
 103  *              Alan Cox        :       Reset on closedown bug.
 104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105  *              Michael Pall    :       Handle poll() after URG properly in
 106  *                                      all cases.
 107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
 108  *                                      (multi URG PUSH broke rlogin).
 109  *              Michael Pall    :       Fix the multi URG PUSH problem in
 110  *                                      tcp_readable(), poll() after URG
 111  *                                      works now.
 112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
 113  *                                      BSD api.
 114  *              Alan Cox        :       Changed the semantics of sk->socket to
 115  *                                      fix a race and a signal problem with
 116  *                                      accept() and async I/O.
 117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120  *                                      clients/servers which listen in on
 121  *                                      fixed ports.
 122  *              Alan Cox        :       Cleaned the above up and shrank it to
 123  *                                      a sensible code size.
 124  *              Alan Cox        :       Self connect lockup fix.
 125  *              Alan Cox        :       No connect to multicast.
 126  *              Ross Biro       :       Close unaccepted children on master
 127  *                                      socket close.
 128  *              Alan Cox        :       Reset tracing code.
 129  *              Alan Cox        :       Spurious resets on shutdown.
 130  *              Alan Cox        :       Giant 15 minute/60 second timer error
 131  *              Alan Cox        :       Small whoops in polling before an
 132  *                                      accept.
 133  *              Alan Cox        :       Kept the state trace facility since
 134  *                                      it's handy for debugging.
 135  *              Alan Cox        :       More reset handler fixes.
 136  *              Alan Cox        :       Started rewriting the code based on
 137  *                                      the RFC's for other useful protocol
 138  *                                      references see: Comer, KA9Q NOS, and
 139  *                                      for a reference on the difference
 140  *                                      between specifications and how BSD
 141  *                                      works see the 4.4lite source.
 142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
 143  *                                      close.
 144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146  *              Alan Cox        :       Reimplemented timers as per the RFC
 147  *                                      and using multiple timers for sanity.
 148  *              Alan Cox        :       Small bug fixes, and a lot of new
 149  *                                      comments.
 150  *              Alan Cox        :       Fixed dual reader crash by locking
 151  *                                      the buffers (much like datagram.c)
 152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153  *                                      now gets fed up of retrying without
 154  *                                      (even a no space) answer.
 155  *              Alan Cox        :       Extracted closing code better
 156  *              Alan Cox        :       Fixed the closing state machine to
 157  *                                      resemble the RFC.
 158  *              Alan Cox        :       More 'per spec' fixes.
 159  *              Jorge Cwik      :       Even faster checksumming.
 160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161  *                                      only frames. At least one pc tcp stack
 162  *                                      generates them.
 163  *              Alan Cox        :       Cache last socket.
 164  *              Alan Cox        :       Per route irtt.
 165  *              Matt Day        :       poll()->select() match BSD precisely on error
 166  *              Alan Cox        :       New buffers
 167  *              Marc Tamsky     :       Various sk->prot->retransmits and
 168  *                                      sk->retransmits misupdating fixed.
 169  *                                      Fixed tcp_write_timeout: stuck close,
 170  *                                      and TCP syn retries gets used now.
 171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172  *                                      ack if state is TCP_CLOSED.
 173  *              Alan Cox        :       Look up device on a retransmit - routes may
 174  *                                      change. Doesn't yet cope with MSS shrink right
 175  *                                      but its a start!
 176  *              Marc Tamsky     :       Closing in closing fixes.
 177  *              Mike Shaver     :       RFC1122 verifications.
 178  *              Alan Cox        :       rcv_saddr errors.
 179  *              Alan Cox        :       Block double connect().
 180  *              Alan Cox        :       Small hooks for enSKIP.
 181  *              Alexey Kuznetsov:       Path MTU discovery.
 182  *              Alan Cox        :       Support soft errors.
 183  *              Alan Cox        :       Fix MTU discovery pathological case
 184  *                                      when the remote claims no mtu!
 185  *              Marc Tamsky     :       TCP_CLOSE fix.
 186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
 187  *                                      window but wrong (fixes NT lpd problems)
 188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
 189  *              Joerg Reuter    :       No modification of locked buffers in
 190  *                                      tcp_do_retransmit()
 191  *              Eric Schenk     :       Changed receiver side silly window
 192  *                                      avoidance algorithm to BSD style
 193  *                                      algorithm. This doubles throughput
 194  *                                      against machines running Solaris,
 195  *                                      and seems to result in general
 196  *                                      improvement.
 197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
 198  *      Willy Konynenberg       :       Transparent proxying support.
 199  *      Mike McLagan            :       Routing by source
 200  *              Keith Owens     :       Do proper merging with partial SKB's in
 201  *                                      tcp_do_sendmsg to avoid burstiness.
 202  *              Eric Schenk     :       Fix fast close down bug with
 203  *                                      shutdown() followed by close().
 204  *              Andi Kleen :    Make poll agree with SIGIO
 205  *
 206  *              This program is free software; you can redistribute it and/or
 207  *              modify it under the terms of the GNU General Public License
 208  *              as published by the Free Software Foundation; either version
 209  *              2 of the License, or(at your option) any later version.
 210  *
 211  * Description of States:
 212  *
 213  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 214  *
 215  *      TCP_SYN_RECV            received a connection request, sent ack,
 216  *                              waiting for final ack in three-way handshake.
 217  *
 218  *      TCP_ESTABLISHED         connection established
 219  *
 220  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 221  *                              transmission of remaining buffered data
 222  *
 223  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 224  *                              to shutdown
 225  *
 226  *      TCP_CLOSING             both sides have shutdown but we still have
 227  *                              data we have to finish sending
 228  *
 229  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 230  *                              closed, can only be entered from FIN_WAIT2
 231  *                              or CLOSING.  Required because the other end
 232  *                              may not have gotten our last ACK causing it
 233  *                              to retransmit the data packet (which we ignore)
 234  *
 235  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 236  *                              us to finish writing our data and to shutdown
 237  *                              (we have to close() to move on to LAST_ACK)
 238  *
 239  *      TCP_LAST_ACK            out side has shutdown after remote has
 240  *                              shutdown.  There may still be data in our
 241  *                              buffer that we have to finish sending
 242  *
 243  *      TCP_CLOSE               socket is finished
 244  */
 245
 246 /*
 247  * RFC1122 status:
 248  * NOTE: I'm not going to be doing comments in the code for this one except
 249  * for violations and the like.  tcp.c is just too big... If I say something
 250  * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 251  * with Alan. -- MS 950903
 252  * [Note: Most of the TCP code has been rewriten/redesigned since this
 253  *  RFC1122 check. It is probably not correct anymore. It should be redone
 254  *  before 2.2. -AK]
 255  *
 256  * Use of PSH (4.2.2.2)
 257  *   MAY aggregate data sent without the PSH flag. (does)
 258  *   MAY queue data received without the PSH flag. (does)
 259  *   SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 260  *   MAY implement PSH on send calls. (doesn't, thus:)
 261  *     MUST NOT buffer data indefinitely (doesn't [1 second])
 262  *     MUST set PSH on last segment (does)
 263  *   MAY pass received PSH to application layer (doesn't)
 264  *   SHOULD send maximum-sized segment whenever possible. (almost always does)
 265  *
 266  * Window Size (4.2.2.3, 4.2.2.16)
 267  *   MUST treat window size as an unsigned number (does)
 268  *   SHOULD treat window size as a 32-bit number (does not)
 269  *   MUST NOT shrink window once it is offered (does not normally)
 270  *
 271  * Urgent Pointer (4.2.2.4)
 272  * **MUST point urgent pointer to last byte of urgent data (not right
 273  *     after). (doesn't, to be like BSD. That's configurable, but defaults
 274  *      to off)
 275  *   MUST inform application layer asynchronously of incoming urgent
 276  *     data. (does)
 277  *   MUST provide application with means of determining the amount of
 278  *     urgent data pending. (does)
 279  * **MUST support urgent data sequence of arbitrary length. (doesn't, but
 280  *   it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 281  *      [Follows BSD 1 byte of urgent data]
 282  *
 283  * TCP Options (4.2.2.5)
 284  *   MUST be able to receive TCP options in any segment. (does)
 285  *   MUST ignore unsupported options (does)
 286  *
 287  * Maximum Segment Size Option (4.2.2.6)
 288  *   MUST implement both sending and receiving MSS. (does, but currently
 289  *      only uses the smaller of both of them)
 290  *   SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
 291  *     it always). (does, even when MSS == 536, which is legal)
 292  *   MUST assume MSS == 536 if no MSS received at connection setup (does)
 293  *   MUST calculate "effective send MSS" correctly:
 294  *     min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 295  *     (does - but allows operator override)
 296  *
 297  * TCP Checksum (4.2.2.7)
 298  *   MUST generate and check TCP checksum. (does)
 299  *
 300  * Initial Sequence Number Selection (4.2.2.8)
 301  *   MUST use the RFC 793 clock selection mechanism.  (doesn't, but it's
 302  *     OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 303  *     necessary for 10Mbps networks - and harder than BSD to spoof!
 304  *     With syncookies we don't)
 305  *
 306  * Simultaneous Open Attempts (4.2.2.10)
 307  *   MUST support simultaneous open attempts (does)
 308  *
 309  * Recovery from Old Duplicate SYN (4.2.2.11)
 310  *   MUST keep track of active vs. passive open (does)
 311  *
 312  * RST segment (4.2.2.12)
 313  *   SHOULD allow an RST segment to contain data (does, but doesn't do
 314  *     anything with it, which is standard)
 315  *
 316  * Closing a Connection (4.2.2.13)
 317  *   MUST inform application of whether connection was closed by RST or
 318  *     normal close. (does)
 319  *   MAY allow "half-duplex" close (treat connection as closed for the
 320  *     local app, even before handshake is done). (does)
 321  *   MUST linger in TIME_WAIT for 2 * MSL (does)
 322  *
 323  * Retransmission Timeout (4.2.2.15)
 324  *   MUST implement Jacobson's slow start and congestion avoidance
 325  *     stuff. (does)
 326  *
 327  * Probing Zero Windows (4.2.2.17)
 328  *   MUST support probing of zero windows. (does)
 329  *   MAY keep offered window closed indefinitely. (does)
 330  *   MUST allow remote window to stay closed indefinitely. (does)
 331  *
 332  * Passive Open Calls (4.2.2.18)
 333  *   MUST NOT let new passive open affect other connections. (doesn't)
 334  *   MUST support passive opens (LISTENs) concurrently. (does)
 335  *
 336  * Time to Live (4.2.2.19)
 337  *   MUST make TCP TTL configurable. (does - IP_TTL option)
 338  *
 339  * Event Processing (4.2.2.20)
 340  *   SHOULD queue out-of-order segments. (does)
 341  *   MUST aggregate ACK segments whenever possible. (does but badly)
 342  *
 343  * Retransmission Timeout Calculation (4.2.3.1)
 344  *   MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 345  *     calculation. (does, or at least explains them in the comments 8*b)
 346  *  SHOULD initialize RTO to 0 and RTT to 3. (does)
 347  *
 348  * When to Send an ACK Segment (4.2.3.2)
 349  *   SHOULD implement delayed ACK. (does)
 350  *   MUST keep ACK delay < 0.5 sec. (does)
 351  *
 352  * When to Send a Window Update (4.2.3.3)
 353  *   MUST implement receiver-side SWS. (does)
 354  *
 355  * When to Send Data (4.2.3.4)
 356  *   MUST implement sender-side SWS. (does)
 357  *   SHOULD implement Nagle algorithm. (does)
 358  *
 359  * TCP Connection Failures (4.2.3.5)
 360  *  MUST handle excessive retransmissions "properly" (see the RFC). (does)
 361  *   SHOULD inform application layer of soft errors. (does)
 362  *
 363  * TCP Keep-Alives (4.2.3.6)
 364  *   MAY provide keep-alives. (does)
 365  *   MUST make keep-alives configurable on a per-connection basis. (does)
 366  *   MUST default to no keep-alives. (does)
 367  *   MUST make keep-alive interval configurable. (does)
 368  *   MUST make default keep-alive interval > 2 hours. (does)
 369  *   MUST NOT interpret failure to ACK keep-alive packet as dead
 370  *     connection. (doesn't)
 371  *   SHOULD send keep-alive with no data. (does)
 372  *
 373  * TCP Multihoming (4.2.3.7)
 374  *   MUST get source address from IP layer before sending first
 375  *     SYN. (does)
 376  *   MUST use same local address for all segments of a connection. (does)
 377  *
 378  * IP Options (4.2.3.8)
 379  *   MUST ignore unsupported IP options. (does)
 380  *   MAY support Time Stamp and Record Route. (does)
 381  *   MUST allow application to specify a source route. (does)
 382  *   MUST allow received Source Route option to set route for all future
 383  *     segments on this connection. (does not (security issues))
 384  *
 385  * ICMP messages (4.2.3.9)
 386  *   MUST act on ICMP errors. (does)
 387  *   MUST slow transmission upon receipt of a Source Quench. (doesn't anymore
 388  *   because that is deprecated now by the IETF, can be turned on)
 389  *   MUST NOT abort connection upon receipt of soft Destination
 390  *     Unreachables (0, 1, 5), Time Exceededs and Parameter
 391  *     Problems. (doesn't)
 392  *   SHOULD report soft Destination Unreachables etc. to the
 393  *     application. (does, except during SYN_RECV and may drop messages
 394  *     in some rare cases before accept() - ICMP is unreliable)
 395  *   SHOULD abort connection upon receipt of hard Destination Unreachable
 396  *     messages (2, 3, 4). (does, but see above)
 397  *
 398  * Remote Address Validation (4.2.3.10)
 399  *   MUST reject as an error OPEN for invalid remote IP address. (does)
 400  *   MUST ignore SYN with invalid source address. (does)
 401  *   MUST silently discard incoming SYN for broadcast/multicast
 402  *     address. (does)
 403  *
 404  * Asynchronous Reports (4.2.4.1)
 405  * MUST provide mechanism for reporting soft errors to application
 406  *     layer. (does)
 407  *
 408  * Type of Service (4.2.4.2)
 409  *   MUST allow application layer to set Type of Service. (does IP_TOS)
 410  *
 411  * (Whew. -- MS 950903)
 412  * (Updated by AK, but not complete yet.)
 413  **/
 414
 415 #include <linux/config.h>
 416 #include <linux/types.h>
 417 #include <linux/fcntl.h>
 418 #include <linux/poll.h>
 419 #include <linux/init.h>
 420 #include <linux/smp_lock.h>
 421
 422 #include <net/icmp.h>
 423 #include <net/tcp.h>
 424
 425 #include <asm/uaccess.h>
 426
 427 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 428
 429 struct tcp_mib  tcp_statistics;
 430
 431 kmem_cache_t *tcp_openreq_cachep;
 432 kmem_cache_t *tcp_bucket_cachep;
 433 kmem_cache_t *tcp_timewait_cachep;
 434
 435 /*
 436  *      Find someone to 'accept'. Must be called with
 437  *      the listening socket locked.
 438  */
 439
 440 static struct open_request *tcp_find_established(struct tcp_opt *tp,
 441                                                  struct open_request **prevp)
 442 {
 443         struct open_request *req = tp->syn_wait_queue;
 444         struct open_request *prev = (struct open_request *)&tp->syn_wait_queue;
 445         while(req) {
 446                 if (req->sk) {
 447                         if((1 << req->sk->state) &
 448                            ~(TCPF_SYN_SENT|TCPF_SYN_RECV))
 449                                 break;
 450                 }
 451                 prev = req;
 452                 req = req->dl_next;
 453         }
 454         *prevp = prev;
 455         return req;
 456 }
 457
 458 /*
 459  *      Walk down the receive queue counting readable data.
 460  *
 461  *      Must be called with the socket lock held.
 462  */
 463
 464 static int tcp_readable(struct sock *sk)
 465 {
 466         unsigned long counted;
 467         unsigned long amount;
 468         struct sk_buff *skb;
 469         int sum;
 470
 471         SOCK_DEBUG(sk, "tcp_readable: %p - ",sk);
 472
 473         skb = skb_peek(&sk->receive_queue);
 474         if (skb == NULL) {
 475                 SOCK_DEBUG(sk, "empty\n");
 476                 return(0);
 477         }
 478
 479         counted = sk->tp_pinfo.af_tcp.copied_seq;       /* Where we are at the moment */
 480         amount = 0;
 481
 482         /* Do until a push or until we are out of data. */
 483         do {
 484                 /* Found a hole so stops here. */
 485                 if (before(counted, TCP_SKB_CB(skb)->seq))      /* should not happen */
 486                         break;
 487
 488                 /* Length - header but start from where we are up to
 489                  * avoid overlaps.
 490                  */
 491                 sum = skb->len - (counted - TCP_SKB_CB(skb)->seq);
 492                 if (sum >= 0) {
 493                         /* Add it up, move on. */
 494                         amount += sum;
 495                         counted += sum;
 496                         if (skb->h.th->syn)
 497                                 counted++;
 498                 }
 499
 500                 /* Don't count urg data ... but do it in the right place!
 501                  * Consider: "old_data (ptr is here) URG PUSH data"
 502                  * The old code would stop at the first push because
 503                  * it counted the urg (amount==1) and then does amount--
 504                  * *after* the loop.  This means tcp_readable() always
 505                  * returned zero if any URG PUSH was in the queue, even
 506                  * though there was normal data available. If we subtract
 507                  * the urg data right here, we even get it to work for more
 508                  * than one URG PUSH skb without normal data.
 509                  * This means that poll() finally works now with urg data
 510                  * in the queue.  Note that rlogin was never affected
 511                  * because it doesn't use poll(); it uses two processes
 512                  * and a blocking read().  And the queue scan in tcp_read()
 513                  * was correct.  Mike <pall@rz.uni-karlsruhe.de>
 514                  */
 515
 516                 /* Don't count urg data. */
 517                 if (skb->h.th->urg)
 518                         amount--;
 519 #if 0
 520                 if (amount && skb->h.th->psh) break;
 521 #endif
 522                 skb = skb->next;
 523         } while(skb != (struct sk_buff *)&sk->receive_queue);
 524
 525         SOCK_DEBUG(sk, "got %lu bytes.\n",amount);
 526         return(amount);
 527 }
 528
 529 /*
 530  * LISTEN is a special case for poll..
 531  */
 532 static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
 533 {
 534         struct open_request *req, *dummy;
 535
 536         lock_sock(sk);
 537         req = tcp_find_established(&sk->tp_pinfo.af_tcp, &dummy);
 538         release_sock(sk);
 539         if (req)
 540                 return POLLIN | POLLRDNORM;
 541         return 0;
 542 }
 543
 544 /*
 545  *      Compute minimal free write space needed to queue new packets.
 546  */
 547 #define tcp_min_write_space(__sk) \
 548         (atomic_read(&(__sk)->wmem_alloc) / 2)
 549
 550 /*
 551  *      Wait for a TCP event.
 552  *
 553  *      Note that we don't need to lock the socket, as the upper poll layers
 554  *      take care of normal races (between the test and the event) and we don't
 555  *      go look at any of the socket buffers directly.
 556  */
 557 unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
 558 {
 559         unsigned int mask;
 560         struct sock *sk = sock->sk;
 561         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 562
 563         poll_wait(file, sk->sleep, wait);
 564         if (sk->state == TCP_LISTEN)
 565                 return tcp_listen_poll(sk, wait);
 566
 567         /* Socket is not locked. We are protected from async events
 568            by poll logic and correct handling of state changes
 569            made by another threads is impossible in any case.
 570          */
 571
 572         mask = 0;
 573         if (sk->err)
 574                 mask = POLLERR;
 575
 576         /*
 577          * POLLHUP is certainly not done right. But poll() doesn't
 578          * have a notion of HUP in just one direction, and for a
 579          * socket the read side is more interesting.
 580          *
 581          * Some poll() documentation says that POLLHUP is incompatible
 582          * with the POLLOUT/POLLWR flags, so somebody should check this
 583          * all. But careful, it tends to be safer to return too many
 584          * bits than too few, and you can easily break real applications
 585          * if you don't tell them that something has hung up!
 586          *
 587          * Check-me.
 588          */
 589         if (sk->shutdown & RCV_SHUTDOWN)
 590                 mask |= POLLHUP;
 591
 592         /* Connected? */
 593         if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
 594                 if ((tp->rcv_nxt != tp->copied_seq) &&
 595                     (tp->urg_seq != tp->copied_seq ||
 596                      tp->rcv_nxt != tp->copied_seq+1 ||
 597                      sk->urginline || !tp->urg_data))
 598                         mask |= POLLIN | POLLRDNORM;
 599
 600                 if (!(sk->shutdown & SEND_SHUTDOWN)) {
 601                         if (sock_wspace(sk) >= tcp_min_write_space(sk)) {
 602                                 mask |= POLLOUT | POLLWRNORM;
 603                         } else {  /* send SIGIO later */
 604                                 sk->socket->flags |= SO_NOSPACE;
 605                         }
 606                 }
 607
 608                 if (tp->urg_data & URG_VALID)
 609                         mask |= POLLPRI;
 610         }
 611         return mask;
 612 }
 613
 614 /*
 615  *      Socket write_space callback.
 616  *      This (or rather the sock_wake_async) should agree with poll.
 617  *
 618  *      WARNING. This callback is called from any context (process,
 619  *      bh or irq). Do not make anything more smart from it.
 620  */
 621 void tcp_write_space(struct sock *sk)
 622 {
 623         read_lock(&sk->callback_lock);
 624         if (!sk->dead) {
 625                 /* Why??!! Does it really not overshedule? --ANK */
 626                 wake_up_interruptible(sk->sleep);
 627
 628                 if (sock_wspace(sk) >= tcp_min_write_space(sk))
 629                         sock_wake_async(sk->socket, 2);
 630         }
 631         read_unlock(&sk->callback_lock);
 632 }
 633
 634
 635 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 636 {
 637         int answ;
 638
 639         switch(cmd) {
 640         case TIOCINQ:
 641 #ifdef FIXME    /* FIXME: */
 642         case FIONREAD:
 643 #endif
 644                 if (sk->state == TCP_LISTEN)
 645                         return(-EINVAL);
 646                 lock_sock(sk);
 647                 answ = tcp_readable(sk);
 648                 release_sock(sk);
 649                 break;
 650         case SIOCATMARK:
 651                 {
 652                         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 653                         answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
 654                         break;
 655                 }
 656         case TIOCOUTQ:
 657                 if (sk->state == TCP_LISTEN)
 658                         return(-EINVAL);
 659                 answ = sock_wspace(sk);
 660                 break;
 661         default:
 662                 return(-ENOIOCTLCMD);
 663         };
 664
 665         return put_user(answ, (int *)arg);
 666 }
 667
 668 /*
 669  *      Wait for a socket to get into the connected state
 670  *
 671  *      Note: Must be called with the socket locked.
 672  */
 673 static int wait_for_tcp_connect(struct sock * sk, int flags)
 674 {
 675         struct task_struct *tsk = current;
 676         DECLARE_WAITQUEUE(wait, tsk);
 677
 678         while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
 679                 if(sk->err)
 680                         return sock_error(sk);
 681                 if((1 << sk->state) &
 682                    ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 683                         if(sk->keepopen && !(flags&MSG_NOSIGNAL))
 684                                 send_sig(SIGPIPE, tsk, 0);
 685                         return -EPIPE;
 686                 }
 687                 if(flags & MSG_DONTWAIT)
 688                         return -EAGAIN;
 689                 if(signal_pending(tsk))
 690                         return -ERESTARTSYS;
 691
 692                 __set_task_state(tsk, TASK_INTERRUPTIBLE);
 693                 add_wait_queue(sk->sleep, &wait);
 694                 sk->tp_pinfo.af_tcp.write_pending++;
 695
 696                 release_sock(sk);
 697                 schedule();
 698                 lock_sock(sk);
 699
 700                 __set_task_state(tsk, TASK_RUNNING);
 701                 remove_wait_queue(sk->sleep, &wait);
 702                 sk->tp_pinfo.af_tcp.write_pending--;
 703         }
 704         return 0;
 705 }
 706
 707 static inline int tcp_memory_free(struct sock *sk)
 708 {
 709         return atomic_read(&sk->wmem_alloc) < sk->sndbuf;
 710 }
 711
 712 /*
 713  *      Wait for more memory for a socket
 714  */
 715 static void wait_for_tcp_memory(struct sock * sk)
 716 {
 717         if (!tcp_memory_free(sk)) {
 718                 DECLARE_WAITQUEUE(wait, current);
 719
 720                 sk->socket->flags &= ~SO_NOSPACE;
 721                 add_wait_queue(sk->sleep, &wait);
 722                 for (;;) {
 723                         set_current_state(TASK_INTERRUPTIBLE);
 724
 725                         if (signal_pending(current))
 726                                 break;
 727                         if (tcp_memory_free(sk))
 728                                 break;
 729                         if (sk->shutdown & SEND_SHUTDOWN)
 730                                 break;
 731                         if (sk->err)
 732                                 break;
 733                         release_sock(sk);
 734                         if (!tcp_memory_free(sk))
 735                                 schedule();
 736                         lock_sock(sk);
 737                 }
 738                 current->state = TASK_RUNNING;
 739                 remove_wait_queue(sk->sleep, &wait);
 740         }
 741 }
 742
 743 /* When all user supplied data has been queued set the PSH bit */
 744 #define PSH_NEEDED (seglen == 0 && iovlen == 0)
 745
 746 /*
 747  *      This routine copies from a user buffer into a socket,
 748  *      and starts the transmit system.
 749  *
 750  *      Note: must be called with the socket locked.
 751  */
 752
 753 int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
 754 {
 755         struct iovec *iov;
 756         struct tcp_opt *tp;
 757         struct sk_buff *skb;
 758         int iovlen, flags;
 759         int mss_now;
 760         int err, copied;
 761
 762         err = 0;
 763         tp = &(sk->tp_pinfo.af_tcp);
 764
 765         /* Wait for a connection to finish. */
 766         flags = msg->msg_flags;
 767         if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 768                 if((err = wait_for_tcp_connect(sk, flags)) != 0)
 769                         goto out;
 770
 771         /* This should be in poll */
 772         sk->socket->flags &= ~SO_NOSPACE; /* clear SIGIO XXX */
 773
 774         mss_now = tcp_current_mss(sk);
 775
 776         /* Ok commence sending. */
 777         iovlen = msg->msg_iovlen;
 778         iov = msg->msg_iov;
 779         copied = 0;
 780
 781         while(--iovlen >= 0) {
 782                 int seglen=iov->iov_len;
 783                 unsigned char * from=iov->iov_base;
 784
 785                 iov++;
 786
 787                 while(seglen > 0) {
 788                         int copy, tmp, queue_it, psh;
 789
 790                         if (err)
 791                                 goto do_fault2;
 792
 793                         /* Stop on errors. */
 794                         if (sk->err)
 795                                 goto do_sock_err;
 796
 797                         /* Make sure that we are established. */
 798                         if (sk->shutdown & SEND_SHUTDOWN)
 799                                 goto do_shutdown;
 800
 801                         /* Now we need to check if we have a half
 802                          * built packet we can tack some data onto.
 803                          */
 804                         if (tp->send_head && !(flags & MSG_OOB)) {
 805                                 skb = sk->write_queue.prev;
 806                                 copy = skb->len;
 807                                 /* If the remote does SWS avoidance we should
 808                                  * queue the best we can if not we should in
 809                                  * fact send multiple packets...
 810                                  * A method for detecting this would be most
 811                                  * welcome.
 812                                  */
 813                                 if (skb_tailroom(skb) > 0 &&
 814                                     (mss_now - copy) > 0 &&
 815                                     tp->snd_nxt < TCP_SKB_CB(skb)->end_seq) {
 816                                         int last_byte_was_odd = (copy % 4);
 817
 818                                         copy = mss_now - copy;
 819                                         if(copy > skb_tailroom(skb))
 820                                                 copy = skb_tailroom(skb);
 821                                         if(copy > seglen)
 822                                                 copy = seglen;
 823                                         if(last_byte_was_odd) {
 824                                                 if(copy_from_user(skb_put(skb, copy),
 825                                                                   from, copy))
 826                                                         err = -EFAULT;
 827                                                 skb->csum = csum_partial(skb->data,
 828                                                                          skb->len, 0);
 829                                         } else {
 830                                                 skb->csum =
 831                                                         csum_and_copy_from_user(
 832                                                         from, skb_put(skb, copy),
 833                                                         copy, skb->csum, &err);
 834                                         }
 835                                         /*
 836                                          * FIXME: the *_user functions should
 837                                          *        return how much data was
 838                                          *        copied before the fault
 839                                          *        occurred and then a partial
 840                                          *        packet with this data should
 841                                          *        be sent.  Unfortunately
 842                                          *        csum_and_copy_from_user doesn't
 843                                          *        return this information.
 844                                          *        ATM it might send partly zeroed
 845                                          *        data in this case.
 846                                          */
 847                                         tp->write_seq += copy;
 848                                         TCP_SKB_CB(skb)->end_seq += copy;
 849                                         from += copy;
 850                                         copied += copy;
 851                                         seglen -= copy;
 852                                         if (PSH_NEEDED)
 853                                                 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 854                                         continue;
 855                                 }
 856                         }
 857
 858                         /* We also need to worry about the window.  If
 859                          * window < 1/2 the maximum window we've seen
 860                          * from this host, don't use it.  This is
 861                          * sender side silly window prevention, as
 862                          * specified in RFC1122.  (Note that this is
 863                          * different than earlier versions of SWS
 864                          * prevention, e.g. RFC813.).  What we
 865                          * actually do is use the whole MSS.  Since
 866                          * the results in the right edge of the packet
 867                          * being outside the window, it will be queued
 868                          * for later rather than sent.
 869                          */
 870                         psh = 0;
 871                         copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
 872                         if(copy > (tp->max_window >> 1)) {
 873                                 copy = min(copy, mss_now);
 874                                 psh = 1;
 875                         } else {
 876                                 copy = mss_now;
 877                         }
 878                         if(copy > seglen)
 879                                 copy = seglen;
 880
 881                         /* Determine how large of a buffer to allocate.  */
 882                         tmp = MAX_HEADER + sk->prot->max_header;
 883                         if (copy < min(mss_now, tp->max_window >> 1) &&
 884                             !(flags & MSG_OOB)) {
 885                                 tmp += min(mss_now, tp->max_window);
 886
 887                                 /* What is happening here is that we want to
 888                                  * tack on later members of the users iovec
 889                                  * if possible into a single frame.  When we
 890                                  * leave this loop our caller checks to see if
 891                                  * we can send queued frames onto the wire.
 892                                  * See tcp_v[46]_sendmsg() for this.
 893                                  */
 894                                 queue_it = 1;
 895                         } else {
 896                                 tmp += copy;
 897                                 queue_it = 0;
 898                         }
 899                         skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL);
 900
 901                         /* If we didn't get any memory, we need to sleep. */
 902                         if (skb == NULL) {
 903                                 sk->socket->flags |= SO_NOSPACE;
 904                                 if (flags&MSG_DONTWAIT) {
 905                                         err = -EAGAIN;
 906                                         goto do_interrupted;
 907                                 }
 908                                 if (signal_pending(current)) {
 909                                         err = -ERESTARTSYS;
 910                                         goto do_interrupted;
 911                                 }
 912                                 tcp_push_pending_frames(sk, tp);
 913                                 wait_for_tcp_memory(sk);
 914
 915                                 /* If SACK's were formed or PMTU events happened,
 916                                  * we must find out about it.
 917                                  */
 918                                 mss_now = tcp_current_mss(sk);
 919                                 continue;
 920                         }
 921
 922                         seglen -= copy;
 923
 924                         /* Prepare control bits for TCP header creation engine. */
 925                         TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK |
 926                                                   ((PSH_NEEDED || psh) ?
 927                                                    TCPCB_FLAG_PSH : 0));
 928                         TCP_SKB_CB(skb)->sacked = 0;
 929                         if (flags & MSG_OOB) {
 930                                 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_URG;
 931                                 TCP_SKB_CB(skb)->urg_ptr = copy;
 932                         } else
 933                                 TCP_SKB_CB(skb)->urg_ptr = 0;
 934
 935                         /* TCP data bytes are SKB_PUT() on top, later
 936                          * TCP+IP+DEV headers are SKB_PUSH()'d beneath.
 937                          * Reserve header space and checksum the data.
 938                          */
 939                         skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
 940                         skb->csum = csum_and_copy_from_user(from,
 941                                         skb_put(skb, copy), copy, 0, &err);
 942
 943                         if (err)
 944                                 goto do_fault;
 945
 946                         from += copy;
 947                         copied += copy;
 948
 949                         TCP_SKB_CB(skb)->seq = tp->write_seq;
 950                         TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy;
 951
 952                         /* This advances tp->write_seq for us. */
 953                         tcp_send_skb(sk, skb, queue_it);
 954                 }
 955         }
 956         sk->err = 0;
 957         err = copied;
 958         goto out;
 959
 960 do_sock_err:
 961         if(copied)
 962                 err = copied;
 963         else
 964                 err = sock_error(sk);
 965         goto out;
 966 do_shutdown:
 967         if(copied)
 968                 err = copied;
 969         else {
 970                 if (!(flags&MSG_NOSIGNAL))
 971                         send_sig(SIGPIPE, current, 0);
 972                 err = -EPIPE;
 973         }
 974         goto out;
 975 do_interrupted:
 976         if(copied)
 977                 err = copied;
 978         goto out;
 979 do_fault:
 980         kfree_skb(skb);
 981 do_fault2:
 982         err = -EFAULT;
 983 out:
 984         tcp_push_pending_frames(sk, tp);
 985         return err;
 986 }
 987
 988 #undef PSH_NEEDED
 989
 990 /*
 991  *      Send an ack if one is backlogged at this point. Ought to merge
 992  *      this with tcp_send_ack().
 993  *      This is called for delayed acks also.
 994  */
 995
 996 void tcp_read_wakeup(struct sock *sk)
 997 {
 998         /* If we're closed, don't send an ack, or we'll get a RST
 999          * from the closed destination.
1000          */
1001         if (sk->state != TCP_CLOSE)
1002                 tcp_send_ack(sk);
1003 }
1004
1005 /*
1006  *      Handle reading urgent data. BSD has very simple semantics for
1007  *      this, no blocking and very strange errors 8)
1008  */
1009
1010 static int tcp_recv_urg(struct sock * sk, int nonblock,
1011                         struct msghdr *msg, int len, int flags,
1012                         int *addr_len)
1013 {
1014         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1015
1016         /* No URG data to read. */
1017         if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ)
1018                 return -EINVAL; /* Yes this is right ! */
1019
1020         if (sk->done)
1021                 return -ENOTCONN;
1022
1023         if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) {
1024                 sk->done = 1;
1025                 return 0;
1026         }
1027
1028         if (tp->urg_data & URG_VALID) {
1029                 int err = 0;
1030                 char c = tp->urg_data;
1031
1032                 if (!(flags & MSG_PEEK))
1033                         tp->urg_data = URG_READ;
1034
1035                 if(msg->msg_name)
1036                         tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
1037                                                        msg->msg_name);
1038
1039                 if(addr_len)
1040                         *addr_len = tp->af_specific->sockaddr_len;
1041
1042                 /* Read urgent data. */
1043                 msg->msg_flags|=MSG_OOB;
1044
1045                 if(len>0) {
1046                         err = memcpy_toiovec(msg->msg_iov, &c, 1);
1047                         len = 1;
1048                 } else
1049                         msg->msg_flags|=MSG_TRUNC;
1050
1051                 return err ? -EFAULT : len;
1052         }
1053
1054         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1055          * the available implementations agree in this case:
1056          * this call should never block, independent of the
1057          * blocking state of the socket.
1058          * Mike <pall@rz.uni-karlsruhe.de>
1059          */
1060         return -EAGAIN;
1061 }
1062
1063 /*
1064  *      Release a skb if it is no longer needed. This routine
1065  *      must be called with interrupts disabled or with the
1066  *      socket locked so that the sk_buff queue operation is ok.
1067  */
1068
1069 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
1070 {
1071         __skb_unlink(skb, &sk->receive_queue);
1072         __kfree_skb(skb);
1073 }
1074
1075 /* Clean up the receive buffer for full frames taken by the user,
1076  * then send an ACK if necessary.  COPIED is the number of bytes
1077  * tcp_recvmsg has given to the user so far, it speeds up the
1078  * calculation of whether or not we must ACK for the sake of
1079  * a window update.
1080  */
1081 static void cleanup_rbuf(struct sock *sk, int copied)
1082 {
1083         struct sk_buff *skb;
1084
1085         /* NOTE! The socket must be locked, so that we don't get
1086          * a messed-up receive queue.
1087          */
1088         while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
1089                 if (!skb->used || atomic_read(&skb->users) > 1)
1090                         break;
1091                 tcp_eat_skb(sk, skb);
1092         }
1093
1094         /* We send an ACK if we can now advertise a non-zero window
1095          * which has been raised "significantly".
1096          */
1097         if(copied > 0) {
1098                 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1099                 __u32 rcv_window_now = tcp_receive_window(tp);
1100                 __u32 new_window = __tcp_select_window(sk);
1101
1102                 /* We won't be raising the window any further than
1103                  * the window-clamp allows.  Our window selection
1104                  * also keeps things a nice multiple of MSS.  These
1105                  * checks are necessary to prevent spurious ACKs
1106                  * which don't advertize a larger window.
1107                  */
1108                 if((new_window && (new_window >= rcv_window_now * 2)) &&
1109                    ((rcv_window_now + tp->mss_cache) <= tp->window_clamp))
1110                         tcp_read_wakeup(sk);
1111         }
1112 }
1113
1114 /* Now socket state including sk->err is changed only under lock,
1115    hence we should check only pending signals.
1116  */
1117
1118 static void tcp_data_wait(struct sock *sk)
1119 {
1120         DECLARE_WAITQUEUE(wait, current);
1121
1122         add_wait_queue(sk->sleep, &wait);
1123
1124         __set_current_state(TASK_INTERRUPTIBLE);
1125
1126         sk->socket->flags |= SO_WAITDATA;
1127         release_sock(sk);
1128
1129         if (skb_queue_empty(&sk->receive_queue))
1130                 schedule();
1131
1132         lock_sock(sk);
1133         sk->socket->flags &= ~SO_WAITDATA;
1134
1135         remove_wait_queue(sk->sleep, &wait);
1136         __set_current_state(TASK_RUNNING);
1137 }
1138
1139 /*
1140  *      This routine copies from a sock struct into the user buffer.
1141  */
1142
1143 int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
1144                 int len, int nonblock, int flags, int *addr_len)
1145 {
1146         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1147         int copied = 0;
1148         u32 peek_seq;
1149         volatile u32 *seq;      /* So gcc doesn't overoptimise */
1150         unsigned long used;
1151         int err;
1152         int target = 1;         /* Read at least this many bytes */
1153
1154         lock_sock(sk);
1155
1156         if (sk->err)
1157                 goto out_err;
1158
1159         err = -ENOTCONN;
1160         if (sk->state == TCP_LISTEN)
1161                 goto out;
1162
1163         /* Urgent data needs to be handled specially. */
1164         if (flags & MSG_OOB)
1165                 goto recv_urg;
1166
1167         /*      Copying sequence to update. This is volatile to handle
1168          *      the multi-reader case neatly (memcpy_to/fromfs might be
1169          *      inline and thus not flush cached variables otherwise).
1170          */
1171         peek_seq = tp->copied_seq;
1172         seq = &tp->copied_seq;
1173         if (flags & MSG_PEEK)
1174                 seq = &peek_seq;
1175
1176         /* Handle the POSIX bogosity MSG_WAITALL. */
1177         if (flags & MSG_WAITALL)
1178                 target=len;
1179
1180
1181         /*
1182          *      BUG BUG BUG
1183          *      This violates 1003.1g compliance. We must wait for
1184          *      data to exist even if we read none!
1185          */
1186
1187         while (len > 0) {
1188                 struct sk_buff * skb;
1189                 u32 offset;
1190
1191                 /* Are we at urgent data? Stop if we have read anything. */
1192                 if (copied && tp->urg_data && tp->urg_seq == *seq)
1193                         break;
1194
1195                 /* We need to check signals first, to get correct SIGURG
1196                  * handling. FIXME: Need to check this doesnt impact 1003.1g
1197                  * and move it down to the bottom of the loop
1198                  */
1199                 if (signal_pending(current)) {
1200                         if (copied)
1201                                 break;
1202                         copied = -ERESTARTSYS;
1203                         if (nonblock)
1204                                 copied = -EAGAIN;
1205                         break;
1206                 }
1207
1208                 /* Next get a buffer. */
1209
1210                 skb = skb_peek(&sk->receive_queue);
1211                 do {
1212                         if (!skb)
1213                                 break;
1214
1215                         /* Now that we have two receive queues this
1216                          * shouldn't happen.
1217                          */
1218                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1219                                 printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
1220                                        *seq, TCP_SKB_CB(skb)->seq);
1221                                 break;
1222                         }
1223                         offset = *seq - TCP_SKB_CB(skb)->seq;
1224                         if (skb->h.th->syn)
1225                                 offset--;
1226                         if (offset < skb->len)
1227                                 goto found_ok_skb;
1228                         if (skb->h.th->fin)
1229                                 goto found_fin_ok;
1230                         if (!(flags & MSG_PEEK))
1231                                 skb->used = 1;
1232                         skb = skb->next;
1233                 } while (skb != (struct sk_buff *)&sk->receive_queue);
1234
1235                 if (copied >= target)
1236                         break;
1237
1238                 if (sk->err && !(flags&MSG_PEEK)) {
1239                         if (!copied)
1240                                 copied = sock_error(sk);
1241                         break;
1242                 }
1243
1244                 if (sk->shutdown & RCV_SHUTDOWN) {
1245                         sk->done = 1;
1246                         break;
1247                 }
1248
1249                 if (sk->state == TCP_CLOSE) {
1250                         if (!sk->done) {
1251                                 sk->done = 1;
1252                                 break;
1253                         }
1254                         if (!copied)
1255                                 copied = -ENOTCONN;
1256                         break;
1257                 }
1258
1259                 if (nonblock) {
1260                         copied = -EAGAIN;
1261                         break;
1262                 }
1263
1264                 cleanup_rbuf(sk, copied);
1265                 tcp_data_wait(sk);
1266                 continue;
1267
1268         found_ok_skb:
1269                 /*      Lock the buffer. We can be fairly relaxed as
1270                  *      an interrupt will never steal a buffer we are
1271                  *      using unless I've missed something serious in
1272                  *      tcp_data.
1273                  */
1274                 atomic_inc(&skb->users);
1275
1276                 /* Ok so how much can we use? */
1277                 used = skb->len - offset;
1278                 if (len < used)
1279                         used = len;
1280
1281                 /* Do we have urgent data here? */
1282                 if (tp->urg_data) {
1283                         u32 urg_offset = tp->urg_seq - *seq;
1284                         if (urg_offset < used) {
1285                                 if (!urg_offset) {
1286                                         if (!sk->urginline) {
1287                                                 ++*seq;
1288                                                 offset++;
1289                                                 used--;
1290                                         }
1291                                 } else
1292                                         used = urg_offset;
1293                         }
1294                 }
1295
1296                 /*      Copy it - We _MUST_ update *seq first so that we
1297                  *      don't ever double read when we have dual readers
1298                  */
1299                 *seq += used;
1300
1301                 /*      This memcpy_toiovec can sleep. If it sleeps and we
1302                  *      do a second read it relies on the skb->users to avoid
1303                  *      a crash when cleanup_rbuf() gets called.
1304                  */
1305                 err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used);
1306                 if (err) {
1307                         /* Exception. Bailout! */
1308                         atomic_dec(&skb->users);
1309                         copied = -EFAULT;
1310                         break;
1311                 }
1312
1313                 copied += used;
1314                 len -= used;
1315
1316                 /*      We now will not sleep again until we are finished
1317                  *      with skb. Sorry if you are doing the SMP port
1318                  *      but you'll just have to fix it neatly ;)
1319                  *
1320                  *      Very funny Alan... -DaveM
1321                  */
1322                 atomic_dec(&skb->users);
1323
1324                 if (after(tp->copied_seq,tp->urg_seq))
1325                         tp->urg_data = 0;
1326                 if (used + offset < skb->len)
1327                         continue;
1328
1329                 /*      Process the FIN. We may also need to handle PSH
1330                  *      here and make it break out of MSG_WAITALL.
1331                  */
1332                 if (skb->h.th->fin)
1333                         goto found_fin_ok;
1334                 if (flags & MSG_PEEK)
1335                         continue;
1336                 skb->used = 1;
1337                 if (atomic_read(&skb->users) == 1)
1338                         tcp_eat_skb(sk, skb);
1339                 continue;
1340
1341         found_fin_ok:
1342                 ++*seq;
1343                 if (flags & MSG_PEEK)
1344                         break;
1345
1346                 /* All is done. */
1347                 skb->used = 1;
1348                 sk->shutdown |= RCV_SHUTDOWN;
1349                 break;
1350         }
1351
1352         if (copied >= 0 && msg->msg_name)
1353                 tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
1354                                                msg->msg_name);
1355
1356         if(addr_len)
1357                 *addr_len = tp->af_specific->sockaddr_len;
1358
1359         /* Clean up data we have read: This will do ACK frames. */
1360         cleanup_rbuf(sk, copied);
1361         release_sock(sk);
1362         return copied;
1363
1364 out_err:
1365         err = sock_error(sk);
1366
1367 out:
1368         release_sock(sk);
1369         return err;
1370
1371 recv_urg:
1372         err = tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
1373         goto out;
1374 }
1375
1376 /*
1377  * Check whether to renew the timer.
1378  */
1379 static inline void tcp_check_fin_timer(struct sock *sk)
1380 {
1381         if (sk->state == TCP_FIN_WAIT2)
1382                 tcp_reset_keepalive_timer(sk, sysctl_tcp_fin_timeout);
1383 }
1384
1385 /*
1386  *      State processing on a close. This implements the state shift for
1387  *      sending our FIN frame. Note that we only send a FIN for some
1388  *      states. A shutdown() may have already sent the FIN, or we may be
1389  *      closed.
1390  */
1391
1392 static unsigned char new_state[16] = {
1393   /* current state:        new state:      action:      */
1394   /* (Invalid)          */ TCP_CLOSE,
1395   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1396   /* TCP_SYN_SENT       */ TCP_CLOSE,
1397   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1398   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1399   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1400   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1401   /* TCP_CLOSE          */ TCP_CLOSE,
1402   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1403   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1404   /* TCP_LISTEN         */ TCP_CLOSE,
1405   /* TCP_CLOSING        */ TCP_CLOSING,
1406 };
1407
1408 static int tcp_close_state(struct sock *sk, int dead)
1409 {
1410         int next = (int) new_state[sk->state];
1411         int ns = (next & TCP_STATE_MASK);
1412
1413         tcp_set_state(sk, ns);
1414
1415         /*      This is a (useful) BSD violating of the RFC. There is a
1416          *      problem with TCP as specified in that the other end could
1417          *      keep a socket open forever with no application left this end.
1418          *      We use a 3 minute timeout (about the same as BSD) then kill
1419          *      our end. If they send after that then tough - BUT: long enough
1420          *      that we won't make the old 4*rto = almost no time - whoops
1421          *      reset mistake.
1422          */
1423         if (dead)
1424                 tcp_check_fin_timer(sk);
1425
1426         return (next & TCP_ACTION_FIN);
1427 }
1428
1429 /*
1430  *      Shutdown the sending side of a connection. Much like close except
1431  *      that we don't receive shut down or set sk->dead.
1432  */
1433
1434 void tcp_shutdown(struct sock *sk, int how)
1435 {
1436         /*      We need to grab some memory, and put together a FIN,
1437          *      and then put it into the queue to be sent.
1438          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1439          */
1440         if (!(how & SEND_SHUTDOWN))
1441                 return;
1442
1443         /* If we've already sent a FIN, or it's a closed state, skip this. */
1444         if ((1 << sk->state) &
1445             (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
1446
1447                 /* Clear out any half completed packets.  FIN if needed. */
1448                 if (tcp_close_state(sk,0))
1449                         tcp_send_fin(sk);
1450         }
1451 }
1452
1453
1454 /*
1455  *      Return 1 if we still have things to send in our buffers.
1456  */
1457
1458 static inline int closing(struct sock * sk)
1459 {
1460         return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
1461 }
1462
1463 /*
1464  *      This routine closes sockets which have been at least partially
1465  *      opened, but not yet accepted. Currently it is only called by
1466  *      tcp_close.
1467  */
1468
1469 static void tcp_close_pending (struct sock *sk)
1470 {
1471         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1472         struct open_request *req = tp->syn_wait_queue;
1473
1474         while(req) {
1475                 struct open_request *iter;
1476
1477                 if (req->sk)
1478                         tcp_close(req->sk, 0);
1479
1480                 iter = req;
1481                 req = req->dl_next;
1482
1483                 if (iter->sk) {
1484                         sk->ack_backlog--;
1485                 } else {
1486                         tcp_dec_slow_timer(TCP_SLT_SYNACK);
1487                         tp->syn_backlog--;
1488                 }
1489                 (*iter->class->destructor)(iter);
1490                 tcp_openreq_free(iter);
1491         }
1492         BUG_TRAP(tp->syn_backlog == 0);
1493         BUG_TRAP(sk->ack_backlog == 0);
1494         tcp_synq_init(tp);
1495 }
1496
1497 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1498 {
1499         /* First the read buffer. */
1500         skb_queue_purge(&sk->receive_queue);
1501
1502         /* Next, the error queue. */
1503         skb_queue_purge(&sk->error_queue);
1504
1505         /* Next, the write queue. */
1506         BUG_TRAP(skb_queue_empty(&sk->write_queue));
1507
1508         /* It is _impossible_ for the backlog to contain anything
1509          * when we get here.  All user references to this socket
1510          * have gone away, only the net layer knows can touch it.
1511          */
1512 }
1513
1514 /*
1515  * At this point, there should be no process reference to this
1516  * socket, and thus no user references at all.  Therefore we
1517  * can assume the socket waitqueue is inactive and nobody will
1518  * try to jump onto it.
1519  */
1520 void tcp_destroy_sock(struct sock *sk)
1521 {
1522         BUG_TRAP(sk->state==TCP_CLOSE);
1523         BUG_TRAP(sk->dead);
1524
1525         /* It cannot be in hash table! */
1526         BUG_TRAP(sk->pprev==NULL);
1527
1528         /* It it has not 0 sk->num, it must be bound */
1529         BUG_TRAP(!sk->num || sk->prev!=NULL);
1530
1531         sk->prot->destroy(sk);
1532
1533         tcp_kill_sk_queues(sk);
1534
1535 #ifdef INET_REFCNT_DEBUG
1536         if (atomic_read(&sk->refcnt) != 1) {
1537                 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt));
1538         }
1539 #endif
1540
1541         sock_put(sk);
1542 }
1543
1544 void tcp_close(struct sock *sk, long timeout)
1545 {
1546         struct sk_buff *skb;
1547         int data_was_unread = 0;
1548
1549         lock_sock(sk);
1550         if(sk->state == TCP_LISTEN) {
1551                 tcp_set_state(sk, TCP_CLOSE);
1552
1553                 /* Special case. */
1554                 tcp_close_pending(sk);
1555
1556                 goto adjudge_to_death;
1557         }
1558
1559         sk->shutdown = SHUTDOWN_MASK;
1560
1561         /*  We need to flush the recv. buffs.  We do this only on the
1562          *  descriptor close, not protocol-sourced closes, because the
1563          *  reader process may not have drained the data yet!
1564          */
1565         while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
1566                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin;
1567                 data_was_unread += len;
1568                 kfree_skb(skb);
1569         }
1570
1571         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1572          * 3.10, we send a RST here because data was lost.  To
1573          * witness the awful effects of the old behavior of always
1574          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1575          * a bulk GET in an FTP client, suspend the process, wait
1576          * for the client to advertise a zero window, then kill -9
1577          * the FTP client, wheee...  Note: timeout is always zero
1578          * in such a case.
1579          */
1580         if(data_was_unread != 0) {
1581                 /* Unread data was tossed, zap the connection. */
1582                 tcp_set_state(sk, TCP_CLOSE);
1583                 tcp_send_active_reset(sk, GFP_KERNEL);
1584         } else if (tcp_close_state(sk,1)) {
1585                 /* We FIN if the application ate all the data before
1586                  * zapping the connection.
1587                  */
1588                 tcp_send_fin(sk);
1589         }
1590
1591         if (timeout) {
1592                 struct task_struct *tsk = current;
1593                 DECLARE_WAITQUEUE(wait, current);
1594
1595                 add_wait_queue(sk->sleep, &wait);
1596
1597                 while (1) {
1598                         set_current_state(TASK_INTERRUPTIBLE);
1599                         if (!closing(sk))
1600                                 break;
1601                         release_sock(sk);
1602                         timeout = schedule_timeout(timeout);
1603                         lock_sock(sk);
1604                         if (!signal_pending(tsk) || timeout)
1605                                 break;
1606                 }
1607
1608                 tsk->state = TASK_RUNNING;
1609                 remove_wait_queue(sk->sleep, &wait);
1610         }
1611
1612         /* Now that the socket is dead, if we are in the FIN_WAIT2 state
1613          * we may need to set up a timer.
1614          */
1615         tcp_check_fin_timer(sk);
1616
1617 adjudge_to_death:
1618         /* It is the last release_sock in its life. It will remove backlog. */
1619         release_sock(sk);
1620
1621
1622         /* Now socket is owned by kernel and we acquire BH lock
1623            to finish close. No need to check for user refs.
1624          */
1625         local_bh_disable();
1626         bh_lock_sock(sk);
1627         BUG_TRAP(sk->lock.users==0);
1628
1629         sock_hold(sk);
1630
1631         /* Announce socket dead, detach it from wait queue and inode. */
1632         write_lock_irq(&sk->callback_lock);
1633         sk->dead = 1;
1634         sk->socket = NULL;
1635         sk->sleep = NULL;
1636         write_unlock_irq(&sk->callback_lock);
1637
1638         if (sk->state == TCP_CLOSE)
1639                 tcp_destroy_sock(sk);
1640         /* Otherwise, socket is reprieved until protocol close. */
1641
1642         bh_unlock_sock(sk);
1643         local_bh_enable();
1644         sock_put(sk);
1645 }
1646
1647 int tcp_disconnect(struct sock *sk, int flags)
1648 {
1649         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1650         int old_state;
1651         int err = 0;
1652
1653         old_state = sk->state;
1654         if (old_state != TCP_CLOSE)
1655                 tcp_set_state(sk, TCP_CLOSE);
1656
1657         /* ABORT function of RFC793 */
1658         if (old_state == TCP_LISTEN) {
1659                 tcp_close_pending(sk);
1660         } else if (tcp_connected(old_state)) {
1661                 tcp_send_active_reset(sk, GFP_KERNEL);
1662                 sk->err = ECONNRESET;
1663         } else if (old_state == TCP_SYN_SENT)
1664                 sk->err = ECONNRESET;
1665
1666         tcp_clear_xmit_timers(sk);
1667         __skb_queue_purge(&sk->receive_queue);
1668         __skb_queue_purge(&sk->write_queue);
1669         __skb_queue_purge(&tp->out_of_order_queue);
1670
1671         sk->dport = 0;
1672
1673         sk->rcv_saddr = 0;
1674         sk->saddr = 0;
1675 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1676         memset(&sk->net_pinfo.af_inet6.saddr, 0, 16);
1677         memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16);
1678 #endif
1679
1680         sk->zapped = 0;
1681         sk->shutdown = 0;
1682         sk->done = 0;
1683         sk->write_space = tcp_write_space;
1684         tp->srtt = 0;
1685 #ifdef CONFIG_TCP_TW_RECYCLE
1686         if ((tp->write_seq += 2) == 0)
1687                 tp->write_seq = 1;
1688 #else
1689         tp->write_seq = 0;
1690 #endif
1691         tp->ato = 0;
1692         tp->backoff = 0;
1693         tp->snd_cwnd = 2;
1694         tp->probes_out = 0;
1695         tp->high_seq = 0;
1696         tp->snd_ssthresh = 0x7fffffff;
1697         tp->snd_cwnd_cnt = 0;
1698         tp->dup_acks = 0;
1699         tp->delayed_acks = 0;
1700         tp->send_head = tp->retrans_head = NULL;
1701         tp->saw_tstamp = 0;
1702         __sk_dst_reset(sk);
1703
1704         BUG_TRAP(!sk->num || sk->prev);
1705
1706         sk->error_report(sk);
1707         return err;
1708 }
1709
1710 /*
1711  *      Wait for an incoming connection, avoid race
1712  *      conditions. This must be called with the socket locked,
1713  *      and without the kernel lock held.
1714  */
1715 static struct open_request * wait_for_connect(struct sock * sk,
1716                                               struct open_request **pprev)
1717 {
1718         DECLARE_WAITQUEUE(wait, current);
1719         struct open_request *req;
1720
1721         /*
1722          * True wake-one mechanism for incoming connections: only
1723          * one process gets woken up, not the 'whole herd'.
1724          * Since we do not 'race & poll' for established sockets
1725          * anymore, the common case will execute the loop only once.
1726          *
1727          * Subtle issue: "add_wait_queue_exclusive()" will be added
1728          * after any current non-exclusive waiters, and we know that
1729          * it will always _stay_ after any new non-exclusive waiters
1730          * because all non-exclusive waiters are added at the
1731          * beginning of the wait-queue. As such, it's ok to "drop"
1732          * our exclusiveness temporarily when we get woken up without
1733          * having to remove and re-insert us on the wait queue.
1734          */
1735         add_wait_queue_exclusive(sk->sleep, &wait);
1736         for (;;) {
1737                 current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE;
1738                 release_sock(sk);
1739                 schedule();
1740                 lock_sock(sk);
1741                 req = tcp_find_established(&(sk->tp_pinfo.af_tcp), pprev);
1742                 if (req)
1743                         break;
1744                 if (signal_pending(current))
1745                         break;
1746         }
1747         current->state = TASK_RUNNING;
1748         remove_wait_queue(sk->sleep, &wait);
1749         return req;
1750 }
1751
1752 /*
1753  *      This will accept the next outstanding connection.
1754  *
1755  *      Be careful about race conditions here - this is subtle.
1756  */
1757
1758 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
1759 {
1760         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1761         struct open_request *req, *prev;
1762         struct sock *newsk;
1763         int error;
1764
1765         lock_sock(sk);
1766
1767         /* We need to make sure that this socket is listening,
1768          * and that it has something pending.
1769          */
1770         error = -EINVAL;
1771         if (sk->state != TCP_LISTEN)
1772                 goto out;
1773
1774         /* Find already established connection */
1775         req = tcp_find_established(tp, &prev);
1776         if (!req) {
1777                 /* If this is a non blocking socket don't sleep */
1778                 error = -EAGAIN;
1779                 if (flags & O_NONBLOCK)
1780                         goto out;
1781
1782                 error = -ERESTARTSYS;
1783                 req = wait_for_connect(sk, &prev);
1784                 if (!req)
1785                         goto out;
1786         }
1787
1788         tcp_synq_unlink(tp, req, prev);
1789         newsk = req->sk;
1790         req->class->destructor(req);
1791         tcp_openreq_free(req);
1792         sk->ack_backlog--;
1793         release_sock(sk);
1794         return newsk;
1795
1796 out:
1797         release_sock(sk);
1798         *err = error;
1799         return NULL;
1800 }
1801
1802 /*
1803  *      Socket option code for TCP.
1804  */
1805
1806 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
1807                    int optlen)
1808 {
1809         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1810         int val;
1811         int err = 0;
1812
1813         if (level != SOL_TCP)
1814                 return tp->af_specific->setsockopt(sk, level, optname,
1815                                                    optval, optlen);
1816
1817         if(optlen<sizeof(int))
1818                 return -EINVAL;
1819
1820         if (get_user(val, (int *)optval))
1821                 return -EFAULT;
1822
1823         lock_sock(sk);
1824
1825         switch(optname) {
1826         case TCP_MAXSEG:
1827                 /* values greater than interface MTU won't take effect.  however at
1828                  * the point when this call is done we typically don't yet know
1829                  * which interface is going to be used
1830                  */
1831                 if(val < 1 || val > MAX_WINDOW) {
1832                         err = -EINVAL;
1833                         break;
1834                 }
1835                 tp->user_mss = val;
1836                 break;
1837
1838         case TCP_NODELAY:
1839                 /* You cannot try to use this and TCP_CORK in
1840                  * tandem, so let the user know.
1841                  */
1842                 if (sk->nonagle == 2) {
1843                         err = -EINVAL;
1844                         break;
1845                 }
1846                 sk->nonagle = (val == 0) ? 0 : 1;
1847                 break;
1848
1849         case TCP_CORK:
1850                 /* When set indicates to always queue non-full frames.
1851                  * Later the user clears this option and we transmit
1852                  * any pending partial frames in the queue.  This is
1853                  * meant to be used alongside sendfile() to get properly
1854                  * filled frames when the user (for example) must write
1855                  * out headers with a write() call first and then use
1856                  * sendfile to send out the data parts.
1857                  *
1858                  * You cannot try to use TCP_NODELAY and this mechanism
1859                  * at the same time, so let the user know.
1860                  */
1861                 if (sk->nonagle == 1) {
1862                         err = -EINVAL;
1863                         break;
1864                 }
1865                 if (val != 0) {
1866                         sk->nonagle = 2;
1867                 } else {
1868                         sk->nonagle = 0;
1869
1870                         tcp_push_pending_frames(sk, tp);
1871                 }
1872                 break;
1873
1874         case TCP_KEEPIDLE:
1875                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
1876                         err = -EINVAL;
1877                 else {
1878                         tp->keepalive_time = val * HZ;
1879                         if (sk->keepopen) {
1880                                 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
1881                                 if (tp->keepalive_time > elapsed)
1882                                         elapsed = tp->keepalive_time - elapsed;
1883                                 else
1884                                         elapsed = 0;
1885                                 tcp_reset_keepalive_timer(sk, elapsed);
1886                         }
1887                 }
1888                 break;
1889         case TCP_KEEPINTVL:
1890                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
1891                         err = -EINVAL;
1892                 else
1893                         tp->keepalive_intvl = val * HZ;
1894                 break;
1895         case TCP_KEEPCNT:
1896                 if (val < 1 || val > MAX_TCP_KEEPCNT)
1897                         err = -EINVAL;
1898                 else
1899                         tp->keepalive_probes = val;
1900                 break;
1901         case TCP_SYNCNT:
1902                 if (val < 1 || val > MAX_TCP_SYNCNT)
1903                         err = -EINVAL;
1904                 else
1905                         tp->syn_retries = val;
1906                 break;
1907
1908         default:
1909                 err = -ENOPROTOOPT;
1910                 break;
1911         };
1912         release_sock(sk);
1913         return err;
1914 }
1915
1916 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
1917                    int *optlen)
1918 {
1919         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1920         int val, len;
1921
1922         if(level != SOL_TCP)
1923                 return tp->af_specific->getsockopt(sk, level, optname,
1924                                                    optval, optlen);
1925
1926         if(get_user(len,optlen))
1927                 return -EFAULT;
1928
1929         len = min(len, sizeof(int));
1930
1931         switch(optname) {
1932         case TCP_MAXSEG:
1933                 val = tp->user_mss;
1934                 break;
1935         case TCP_NODELAY:
1936                 val = (sk->nonagle == 1);
1937                 break;
1938         case TCP_CORK:
1939                 val = (sk->nonagle == 2);
1940                 break;
1941         case TCP_KEEPIDLE:
1942                 if (tp->keepalive_time)
1943                         val = tp->keepalive_time / HZ;
1944                 else
1945                         val = sysctl_tcp_keepalive_time / HZ;
1946                 break;
1947         case TCP_KEEPINTVL:
1948                 if (tp->keepalive_intvl)
1949                         val = tp->keepalive_intvl / HZ;
1950                 else
1951                         val = sysctl_tcp_keepalive_intvl / HZ;
1952                 break;
1953         case TCP_KEEPCNT:
1954                 if (tp->keepalive_probes)
1955                         val = tp->keepalive_probes;
1956                 else
1957                         val = sysctl_tcp_keepalive_probes;
1958                 break;
1959         case TCP_SYNCNT:
1960                 if (tp->syn_retries)
1961                         val = tp->syn_retries;
1962                 else
1963                         val = sysctl_tcp_syn_retries;
1964                 break;
1965         default:
1966                 return -ENOPROTOOPT;
1967         };
1968
1969         if(put_user(len, optlen))
1970                 return -EFAULT;
1971         if(copy_to_user(optval, &val,len))
1972                 return -EFAULT;
1973         return 0;
1974 }
1975
1976
1977 extern void __skb_cb_too_small_for_tcp(int, int);
1978
1979 void __init tcp_init(void)
1980 {
1981         struct sk_buff *skb = NULL;
1982         unsigned long goal;
1983         int order, i;
1984
1985         if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
1986                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
1987                                            sizeof(skb->cb));
1988
1989         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
1990                                                    sizeof(struct open_request),
1991                                                0, SLAB_HWCACHE_ALIGN,
1992                                                NULL, NULL);
1993         if(!tcp_openreq_cachep)
1994                 panic("tcp_init: Cannot alloc open_request cache.");
1995
1996         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
1997                                               sizeof(struct tcp_bind_bucket),
1998                                               0, SLAB_HWCACHE_ALIGN,
1999                                               NULL, NULL);
2000         if(!tcp_bucket_cachep)
2001                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2002
2003         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2004                                                 sizeof(struct tcp_tw_bucket),
2005                                                 0, SLAB_HWCACHE_ALIGN,
2006                                                 NULL, NULL);
2007         if(!tcp_timewait_cachep)
2008                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2009
2010         /* Size and allocate the main established and bind bucket
2011          * hash tables.
2012          *
2013          * The methodology is similar to that of the buffer cache.
2014          */
2015         goal = num_physpages >> (23 - PAGE_SHIFT);
2016
2017         for(order = 0; (1UL << order) < goal; order++)
2018                 ;
2019         do {
2020                 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2021                         sizeof(struct tcp_ehash_bucket);
2022                 tcp_ehash_size >>= 1;
2023                 while (tcp_ehash_size & (tcp_ehash_size-1))
2024                         tcp_ehash_size--;
2025                 tcp_ehash = (struct tcp_ehash_bucket *)
2026                         __get_free_pages(GFP_ATOMIC, order);
2027         } while (tcp_ehash == NULL && --order > 0);
2028
2029         if (!tcp_ehash)
2030                 panic("Failed to allocate TCP established hash table\n");
2031         for (i = 0; i < (tcp_ehash_size<<1); i++) {
2032                 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2033                 tcp_ehash[i].chain = NULL;
2034         }
2035
2036         do {
2037                 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2038                         sizeof(struct tcp_bind_hashbucket);
2039                 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2040                         continue;
2041                 tcp_bhash = (struct tcp_bind_hashbucket *)
2042                         __get_free_pages(GFP_ATOMIC, order);
2043         } while (tcp_bhash == NULL && --order >= 0);
2044
2045         if (!tcp_bhash)
2046                 panic("Failed to allocate TCP bind hash table\n");
2047         for (i = 0; i < tcp_bhash_size; i++) {
2048                 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2049                 tcp_bhash[i].chain = NULL;
2050         }
2051
2052         if (order > 4) {
2053                 sysctl_local_port_range[0] = 32768;
2054                 sysctl_local_port_range[1] = 61000;
2055         } else if (order < 3) {
2056                 sysctl_local_port_range[0] = 1024*(3-order);
2057         }
2058         tcp_port_rover = sysctl_local_port_range[0] - 1;
2059
2060         printk("TCP: Hash tables configured (established %d bind %d)\n",
2061                tcp_ehash_size<<1, tcp_bhash_size);
2062 }