net/ipv4/tcp.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp.c,v 1.141 1999/05/12 11:24:40 davem Exp $
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while
  26  *                                      sk->inuse=1 and was trying to connect
  27  *                                      (tcp_err()).
  28  *              Alan Cox        :       All icmp error handling was broken
  29  *                                      pointers passed were wrong and the
  30  *                                      socket was looked up backwards. Nobody
  31  *                                      tested any icmp error code obviously.
  32  *              Alan Cox        :       tcp_err() now handled properly. It
  33  *                                      wakes people on errors. poll
  34  *                                      behaves and the icmp error race
  35  *                                      has gone by moving it into sock.c
  36  *              Alan Cox        :       tcp_send_reset() fixed to work for
  37  *                                      everything not just packets for
  38  *                                      unknown sockets.
  39  *              Alan Cox        :       tcp option processing.
  40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
  41  *                                      syn rule wrong]
  42  *              Herp Rosmanith  :       More reset fixes
  43  *              Alan Cox        :       No longer acks invalid rst frames.
  44  *                                      Acking any kind of RST is right out.
  45  *              Alan Cox        :       Sets an ignore me flag on an rst
  46  *                                      receive otherwise odd bits of prattle
  47  *                                      escape still
  48  *              Alan Cox        :       Fixed another acking RST frame bug.
  49  *                                      Should stop LAN workplace lockups.
  50  *              Alan Cox        :       Some tidyups using the new skb list
  51  *                                      facilities
  52  *              Alan Cox        :       sk->keepopen now seems to work
  53  *              Alan Cox        :       Pulls options out correctly on accepts
  54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56  *                                      bit to skb ops.
  57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
  58  *                                      nasty.
  59  *              Alan Cox        :       Added some better commenting, as the
  60  *                                      tcp is hard to follow
  61  *              Alan Cox        :       Removed incorrect check for 20 * psh
  62  *      Michael O'Reilly        :       ack < copied bug fix.
  63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64  *              Alan Cox        :       FIN with no memory -> CRASH
  65  *              Alan Cox        :       Added socket option proto entries.
  66  *                                      Also added awareness of them to accept.
  67  *              Alan Cox        :       Added TCP options (SOL_TCP)
  68  *              Alan Cox        :       Switched wakeup calls to callbacks,
  69  *                                      so the kernel can layer network
  70  *                                      sockets.
  71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73  *              Alan Cox        :       RST frames sent on unsynchronised
  74  *                                      state ack error.
  75  *              Alan Cox        :       Put in missing check for SYN bit.
  76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
  77  *                                      window non shrink trick.
  78  *              Alan Cox        :       Added a couple of small NET2E timer
  79  *                                      fixes
  80  *              Charles Hedrick :       TCP fixes
  81  *              Toomas Tamm     :       TCP window fixes
  82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83  *              Charles Hedrick :       Rewrote most of it to actually work
  84  *              Linus           :       Rewrote tcp_read() and URG handling
  85  *                                      completely
  86  *              Gerhard Koerting:       Fixed some missing timer handling
  87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88  *              Gerhard Koerting:       PC/TCP workarounds
  89  *              Adam Caldwell   :       Assorted timer/timing errors
  90  *              Matthew Dillon  :       Fixed another RST bug
  91  *              Alan Cox        :       Move to kernel side addressing changes.
  92  *              Alan Cox        :       Beginning work on TCP fastpathing
  93  *                                      (not yet usable)
  94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95  *              Alan Cox        :       TCP fast path debugging
  96  *              Alan Cox        :       Window clamping
  97  *              Michael Riepe   :       Bug in tcp_check()
  98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  99  *              Matt Dillon     :       Yet more small nasties removed from the
 100  *                                      TCP code (Be very nice to this man if
 101  *                                      tcp finally works 100%) 8)
 102  *              Alan Cox        :       BSD accept semantics.
 103  *              Alan Cox        :       Reset on closedown bug.
 104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105  *              Michael Pall    :       Handle poll() after URG properly in
 106  *                                      all cases.
 107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
 108  *                                      (multi URG PUSH broke rlogin).
 109  *              Michael Pall    :       Fix the multi URG PUSH problem in
 110  *                                      tcp_readable(), poll() after URG
 111  *                                      works now.
 112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
 113  *                                      BSD api.
 114  *              Alan Cox        :       Changed the semantics of sk->socket to
 115  *                                      fix a race and a signal problem with
 116  *                                      accept() and async I/O.
 117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120  *                                      clients/servers which listen in on
 121  *                                      fixed ports.
 122  *              Alan Cox        :       Cleaned the above up and shrank it to
 123  *                                      a sensible code size.
 124  *              Alan Cox        :       Self connect lockup fix.
 125  *              Alan Cox        :       No connect to multicast.
 126  *              Ross Biro       :       Close unaccepted children on master
 127  *                                      socket close.
 128  *              Alan Cox        :       Reset tracing code.
 129  *              Alan Cox        :       Spurious resets on shutdown.
 130  *              Alan Cox        :       Giant 15 minute/60 second timer error
 131  *              Alan Cox        :       Small whoops in polling before an
 132  *                                      accept.
 133  *              Alan Cox        :       Kept the state trace facility since
 134  *                                      it's handy for debugging.
 135  *              Alan Cox        :       More reset handler fixes.
 136  *              Alan Cox        :       Started rewriting the code based on
 137  *                                      the RFC's for other useful protocol
 138  *                                      references see: Comer, KA9Q NOS, and
 139  *                                      for a reference on the difference
 140  *                                      between specifications and how BSD
 141  *                                      works see the 4.4lite source.
 142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
 143  *                                      close.
 144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146  *              Alan Cox        :       Reimplemented timers as per the RFC
 147  *                                      and using multiple timers for sanity.
 148  *              Alan Cox        :       Small bug fixes, and a lot of new
 149  *                                      comments.
 150  *              Alan Cox        :       Fixed dual reader crash by locking
 151  *                                      the buffers (much like datagram.c)
 152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153  *                                      now gets fed up of retrying without
 154  *                                      (even a no space) answer.
 155  *              Alan Cox        :       Extracted closing code better
 156  *              Alan Cox        :       Fixed the closing state machine to
 157  *                                      resemble the RFC.
 158  *              Alan Cox        :       More 'per spec' fixes.
 159  *              Jorge Cwik      :       Even faster checksumming.
 160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161  *                                      only frames. At least one pc tcp stack
 162  *                                      generates them.
 163  *              Alan Cox        :       Cache last socket.
 164  *              Alan Cox        :       Per route irtt.
 165  *              Matt Day        :       poll()->select() match BSD precisely on error
 166  *              Alan Cox        :       New buffers
 167  *              Marc Tamsky     :       Various sk->prot->retransmits and
 168  *                                      sk->retransmits misupdating fixed.
 169  *                                      Fixed tcp_write_timeout: stuck close,
 170  *                                      and TCP syn retries gets used now.
 171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172  *                                      ack if state is TCP_CLOSED.
 173  *              Alan Cox        :       Look up device on a retransmit - routes may
 174  *                                      change. Doesn't yet cope with MSS shrink right
 175  *                                      but it's a start!
 176  *              Marc Tamsky     :       Closing in closing fixes.
 177  *              Mike Shaver     :       RFC1122 verifications.
 178  *              Alan Cox        :       rcv_saddr errors.
 179  *              Alan Cox        :       Block double connect().
 180  *              Alan Cox        :       Small hooks for enSKIP.
 181  *              Alexey Kuznetsov:       Path MTU discovery.
 182  *              Alan Cox        :       Support soft errors.
 183  *              Alan Cox        :       Fix MTU discovery pathological case
 184  *                                      when the remote claims no mtu!
 185  *              Marc Tamsky     :       TCP_CLOSE fix.
 186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
 187  *                                      window but wrong (fixes NT lpd problems)
 188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
 189  *              Joerg Reuter    :       No modification of locked buffers in
 190  *                                      tcp_do_retransmit()
 191  *              Eric Schenk     :       Changed receiver side silly window
 192  *                                      avoidance algorithm to BSD style
 193  *                                      algorithm. This doubles throughput
 194  *                                      against machines running Solaris,
 195  *                                      and seems to result in general
 196  *                                      improvement.
 197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
 198  *      Willy Konynenberg       :       Transparent proxying support.
 199  *      Mike McLagan            :       Routing by source
 200  *              Keith Owens     :       Do proper merging with partial SKB's in
 201  *                                      tcp_do_sendmsg to avoid burstiness.
 202  *              Eric Schenk     :       Fix fast close down bug with
 203  *                                      shutdown() followed by close().
 204  *              Andi Kleen :    Make poll agree with SIGIO
 205  *
 206  *              This program is free software; you can redistribute it and/or
 207  *              modify it under the terms of the GNU General Public License
 208  *              as published by the Free Software Foundation; either version
 209  *              2 of the License, or(at your option) any later version.
 210  *
 211  * Description of States:
 212  *
 213  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 214  *
 215  *      TCP_SYN_RECV            received a connection request, sent ack,
 216  *                              waiting for final ack in three-way handshake.
 217  *
 218  *      TCP_ESTABLISHED         connection established
 219  *
 220  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 221  *                              transmission of remaining buffered data
 222  *
 223  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 224  *                              to shutdown
 225  *
 226  *      TCP_CLOSING             both sides have shutdown but we still have
 227  *                              data we have to finish sending
 228  *
 229  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 230  *                              closed, can only be entered from FIN_WAIT2
 231  *                              or CLOSING.  Required because the other end
 232  *                              may not have gotten our last ACK causing it
 233  *                              to retransmit the data packet (which we ignore)
 234  *
 235  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 236  *                              us to finish writing our data and to shutdown
 237  *                              (we have to close() to move on to LAST_ACK)
 238  *
 239  *      TCP_LAST_ACK            our side has shutdown after remote has
 240  *                              shutdown.  There may still be data in our
 241  *                              buffer that we have to finish sending
 242  *
 243  *      TCP_CLOSE               socket is finished
 244  */
 245
 246 /*
 247  * RFC1122 status:
 248  * NOTE: I'm not going to be doing comments in the code for this one except
 249  * for violations and the like.  tcp.c is just too big... If I say something
 250  * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 251  * with Alan. -- MS 950903
 252  * [Note: Most of the TCP code has been rewritten/redesigned since this
 253  *  RFC1122 check. It is probably not correct anymore. It should be redone
 254  *  before 2.2. -AK]
 255  *
 256  * Use of PSH (4.2.2.2)
 257  *   MAY aggregate data sent without the PSH flag. (does)
 258  *   MAY queue data received without the PSH flag. (does)
 259  *   SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 260  *   MAY implement PSH on send calls. (doesn't, thus:)
 261  *     MUST NOT buffer data indefinitely (doesn't [1 second])
 262  *     MUST set PSH on last segment (does)
 263  *   MAY pass received PSH to application layer (doesn't)
 264  *   SHOULD send maximum-sized segment whenever possible. (almost always does)
 265  *
 266  * Window Size (4.2.2.3, 4.2.2.16)
 267  *   MUST treat window size as an unsigned number (does)
 268  *   SHOULD treat window size as a 32-bit number (does not)
 269  *   MUST NOT shrink window once it is offered (does not normally)
 270  *
 271  * Urgent Pointer (4.2.2.4)
 272  * **MUST point urgent pointer to last byte of urgent data (not right
 273  *     after). (doesn't, to be like BSD. That's configurable, but defaults
 274  *      to off)
 275  *   MUST inform application layer asynchronously of incoming urgent
 276  *     data. (does)
 277  *   MUST provide application with means of determining the amount of
 278  *     urgent data pending. (does)
 279  * **MUST support urgent data sequence of arbitrary length. (doesn't, but
 280  *   it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 281  *      [Follows BSD 1 byte of urgent data]
 282  *
 283  * TCP Options (4.2.2.5)
 284  *   MUST be able to receive TCP options in any segment. (does)
 285  *   MUST ignore unsupported options (does)
 286  *
 287  * Maximum Segment Size Option (4.2.2.6)
 288  *   MUST implement both sending and receiving MSS. (does, but currently
 289  *      only uses the smaller of both of them)
 290  *   SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
 291  *     it always). (does, even when MSS == 536, which is legal)
 292  *   MUST assume MSS == 536 if no MSS received at connection setup (does)
 293  *   MUST calculate "effective send MSS" correctly:
 294  *     min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 295  *     (does - but allows operator override)
 296  *
 297  * TCP Checksum (4.2.2.7)
 298  *   MUST generate and check TCP checksum. (does)
 299  *
 300  * Initial Sequence Number Selection (4.2.2.8)
 301  *   MUST use the RFC 793 clock selection mechanism.  (doesn't, but it's
 302  *     OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 303  *     necessary for 10Mbps networks - and harder than BSD to spoof!
 304  *     With syncookies we don't)
 305  *
 306  * Simultaneous Open Attempts (4.2.2.10)
 307  *   MUST support simultaneous open attempts (does)
 308  *
 309  * Recovery from Old Duplicate SYN (4.2.2.11)
 310  *   MUST keep track of active vs. passive open (does)
 311  *
 312  * RST segment (4.2.2.12)
 313  *   SHOULD allow an RST segment to contain data (does, but doesn't do
 314  *     anything with it, which is standard)
 315  *
 316  * Closing a Connection (4.2.2.13)
 317  *   MUST inform application of whether connection was closed by RST or
 318  *     normal close. (does)
 319  *   MAY allow "half-duplex" close (treat connection as closed for the
 320  *     local app, even before handshake is done). (does)
 321  *   MUST linger in TIME_WAIT for 2 * MSL (does)
 322  *
 323  * Retransmission Timeout (4.2.2.15)
 324  *   MUST implement Jacobson's slow start and congestion avoidance
 325  *     stuff. (does)
 326  *
 327  * Probing Zero Windows (4.2.2.17)
 328  *   MUST support probing of zero windows. (does)
 329  *   MAY keep offered window closed indefinitely. (does)
 330  *   MUST allow remote window to stay closed indefinitely. (does)
 331  *
 332  * Passive Open Calls (4.2.2.18)
 333  *   MUST NOT let new passive open affect other connections. (doesn't)
 334  *   MUST support passive opens (LISTENs) concurrently. (does)
 335  *
 336  * Time to Live (4.2.2.19)
 337  *   MUST make TCP TTL configurable. (does - IP_TTL option)
 338  *
 339  * Event Processing (4.2.2.20)
 340  *   SHOULD queue out-of-order segments. (does)
 341  *   MUST aggregate ACK segments whenever possible. (does but badly)
 342  *
 343  * Retransmission Timeout Calculation (4.2.3.1)
 344  *   MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 345  *     calculation. (does, or at least explains them in the comments 8*b)
 346  *   SHOULD initialize RTT to 0 and RTO to 3 seconds. (does)
 347  *
 348  * When to Send an ACK Segment (4.2.3.2)
 349  *   SHOULD implement delayed ACK. (does)
 350  *   MUST keep ACK delay < 0.5 sec. (does)
 351  *
 352  * When to Send a Window Update (4.2.3.3)
 353  *   MUST implement receiver-side SWS. (does)
 354  *
 355  * When to Send Data (4.2.3.4)
 356  *   MUST implement sender-side SWS. (does)
 357  *   SHOULD implement Nagle algorithm. (does)
 358  *
 359  * TCP Connection Failures (4.2.3.5)
 360  *  MUST handle excessive retransmissions "properly" (see the RFC). (does)
 361  *   SHOULD inform application layer of soft errors. (does)
 362  *
 363  * TCP Keep-Alives (4.2.3.6)
 364  *   MAY provide keep-alives. (does)
 365  *   MUST make keep-alives configurable on a per-connection basis. (does)
 366  *   MUST default to no keep-alives. (does)
 367  *   MUST make keep-alive interval configurable. (does)
 368  *   MUST make default keep-alive interval > 2 hours. (does)
 369  *   MUST NOT interpret failure to ACK keep-alive packet as dead
 370  *     connection. (doesn't)
 371  *   SHOULD send keep-alive with no data. (does)
 372  *
 373  * TCP Multihoming (4.2.3.7)
 374  *   MUST get source address from IP layer before sending first
 375  *     SYN. (does)
 376  *   MUST use same local address for all segments of a connection. (does)
 377  *
 378  * IP Options (4.2.3.8)
 379  *   MUST ignore unsupported IP options. (does)
 380  *   MAY support Time Stamp and Record Route. (does)
 381  *   MUST allow application to specify a source route. (does)
 382  *   MUST allow received Source Route option to set route for all future
 383  *     segments on this connection. (does not (security issues))
 384  *
 385  * ICMP messages (4.2.3.9)
 386  *   MUST act on ICMP errors. (does)
 387  *   MUST slow transmission upon receipt of a Source Quench. (doesn't anymore
 388  *   because that is deprecated now by the IETF, can be turned on)
 389  *   MUST NOT abort connection upon receipt of soft Destination
 390  *     Unreachables (0, 1, 5), Time Exceededs and Parameter
 391  *     Problems. (doesn't)
 392  *   SHOULD report soft Destination Unreachables etc. to the
 393  *     application. (does, except during SYN_RECV and may drop messages
 394  *     in some rare cases before accept() - ICMP is unreliable)
 395  *   SHOULD abort connection upon receipt of hard Destination Unreachable
 396  *     messages (2, 3, 4). (does, but see above)
 397  *
 398  * Remote Address Validation (4.2.3.10)
 399  *   MUST reject as an error OPEN for invalid remote IP address. (does)
 400  *   MUST ignore SYN with invalid source address. (does)
 401  *   MUST silently discard incoming SYN for broadcast/multicast
 402  *     address. (does)
 403  *
 404  * Asynchronous Reports (4.2.4.1)
 405  * MUST provide mechanism for reporting soft errors to application
 406  *     layer. (does)
 407  *
 408  * Type of Service (4.2.4.2)
 409  *   MUST allow application layer to set Type of Service. (does IP_TOS)
 410  *
 411  * (Whew. -- MS 950903)
 412  * (Updated by AK, but not complete yet.)
 413  **/
 414
 415 #include <linux/types.h>
 416 #include <linux/fcntl.h>
 417 #include <linux/poll.h>
 418 #include <linux/init.h>
 419 #include <linux/smp_lock.h>
 420
 421 #include <net/icmp.h>
 422 #include <net/tcp.h>
 423
 424 #include <asm/uaccess.h>
 425
/*
 * FIN timeout value; starts out at the compile-time TCP_FIN_TIMEOUT.
 * NOTE(review): the sysctl_ prefix suggests it is runtime-tunable --
 * the registration is not visible in this part of the file.
 */
int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;

/* File-scope TCP statistics block (struct tcp_mib). */
struct tcp_mib  tcp_statistics;

/*
 * kmem_cache_t handles used by the TCP code.
 * NOTE(review): judging by the names these presumably back open
 * requests, bind buckets and TIME_WAIT state respectively -- confirm
 * against the code that creates them.
 */
kmem_cache_t *tcp_openreq_cachep;
kmem_cache_t *tcp_bucket_cachep;
kmem_cache_t *tcp_timewait_cachep;
 433
 434 /*
 435  *      Find someone to 'accept'. Must be called with
 436  *      the socket locked or with interrupts disabled
 437  */
 438
 439 static struct open_request *tcp_find_established(struct tcp_opt *tp,
 440                                                  struct open_request **prevp)
 441 {
 442         struct open_request *req = tp->syn_wait_queue;
 443         struct open_request *prev = (struct open_request *)&tp->syn_wait_queue;
 444         while(req) {
 445                 if (req->sk &&
 446                     ((1 << req->sk->state) &
 447                      ~(TCPF_SYN_SENT|TCPF_SYN_RECV)))
 448                         break;
 449                 prev = req;
 450                 req = req->dl_next;
 451         }
 452         *prevp = prev;
 453         return req;
 454 }
 455
 456 /*
 457  *      Walk down the receive queue counting readable data.
 458  *
 459  *      Must be called with the socket lock held.
 460  */
 461
 462 static int tcp_readable(struct sock *sk)
 463 {
 464         unsigned long counted;
 465         unsigned long amount;
 466         struct sk_buff *skb;
 467         int sum;
 468
 469         SOCK_DEBUG(sk, "tcp_readable: %p - ",sk);
 470
 471         skb = skb_peek(&sk->receive_queue);
 472         if (skb == NULL) {
 473                 SOCK_DEBUG(sk, "empty\n");
 474                 return(0);
 475         }
 476
 477         counted = sk->tp_pinfo.af_tcp.copied_seq;       /* Where we are at the moment */
 478         amount = 0;
 479
 480         /* Do until a push or until we are out of data. */
 481         do {
 482                 /* Found a hole so stops here. */
 483                 if (before(counted, TCP_SKB_CB(skb)->seq))      /* should not happen */
 484                         break;
 485
 486                 /* Length - header but start from where we are up to
 487                  * avoid overlaps.
 488                  */
 489                 sum = skb->len - (counted - TCP_SKB_CB(skb)->seq);
 490                 if (sum >= 0) {
 491                         /* Add it up, move on. */
 492                         amount += sum;
 493                         counted += sum;
 494                         if (skb->h.th->syn)
 495                                 counted++;
 496                 }
 497
 498                 /* Don't count urg data ... but do it in the right place!
 499                  * Consider: "old_data (ptr is here) URG PUSH data"
 500                  * The old code would stop at the first push because
 501                  * it counted the urg (amount==1) and then does amount--
 502                  * *after* the loop.  This means tcp_readable() always
 503                  * returned zero if any URG PUSH was in the queue, even
 504                  * though there was normal data available. If we subtract
 505                  * the urg data right here, we even get it to work for more
 506                  * than one URG PUSH skb without normal data.
 507                  * This means that poll() finally works now with urg data
 508                  * in the queue.  Note that rlogin was never affected
 509                  * because it doesn't use poll(); it uses two processes
 510                  * and a blocking read().  And the queue scan in tcp_read()
 511                  * was correct.  Mike <pall@rz.uni-karlsruhe.de>
 512                  */
 513
 514                 /* Don't count urg data. */
 515                 if (skb->h.th->urg)
 516                         amount--;
 517 #if 0
 518                 if (amount && skb->h.th->psh) break;
 519 #endif
 520                 skb = skb->next;
 521         } while(skb != (struct sk_buff *)&sk->receive_queue);
 522
 523         SOCK_DEBUG(sk, "got %lu bytes.\n",amount);
 524         return(amount);
 525 }
 526
 527 /*
 528  * LISTEN is a special case for poll..
 529  */
 530 static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
 531 {
 532         struct open_request *req, *dummy;
 533
 534         lock_sock(sk);
 535         req = tcp_find_established(&sk->tp_pinfo.af_tcp, &dummy);
 536         release_sock(sk);
 537         if (req)
 538                 return POLLIN | POLLRDNORM;
 539         return 0;
 540 }
 541
/*
 *      Compute minimal free write space needed to queue new packets:
 *      half of the socket's current wmem_alloc count.  tcp_poll() and
 *      tcp_write_space() compare sock_wspace() against this value to
 *      decide whether the socket counts as writable.
 */
#define tcp_min_write_space(__sk) \
        (atomic_read(&(__sk)->wmem_alloc) / 2)
 547
 548 /*
 549  *      Wait for a TCP event.
 550  *
 551  *      Note that we don't need to lock the socket, as the upper poll layers
 552  *      take care of normal races (between the test and the event) and we don't
 553  *      go look at any of the socket buffers directly.
 554  */
 555 unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
 556 {
 557         unsigned int mask;
 558         struct sock *sk = sock->sk;
 559         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 560
 561         poll_wait(file, sk->sleep, wait);
 562         if (sk->state == TCP_LISTEN)
 563                 return tcp_listen_poll(sk, wait);
 564
 565         mask = 0;
 566         if (sk->err)
 567                 mask = POLLERR;
 568
 569         /*
 570          * POLLHUP is certainly not done right. But poll() doesn't
 571          * have a notion of HUP in just one direction, and for a
 572          * socket the read side is more interesting.
 573          *
 574          * Some poll() documentation says that POLLHUP is incompatible
 575          * with the POLLOUT/POLLWR flags, so somebody should check this
 576          * all. But careful, it tends to be safer to return too many
 577          * bits than too few, and you can easily break real applications
 578          * if you don't tell them that something has hung up!
 579          *
 580          * Check-me.
 581          */
 582         if (sk->shutdown & RCV_SHUTDOWN)
 583                 mask |= POLLHUP;
 584
 585         /* Connected? */
 586         if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
 587                 if ((tp->rcv_nxt != tp->copied_seq) &&
 588                     (tp->urg_seq != tp->copied_seq ||
 589                      tp->rcv_nxt != tp->copied_seq+1 ||
 590                      sk->urginline || !tp->urg_data))
 591                         mask |= POLLIN | POLLRDNORM;
 592
 593                 if (!(sk->shutdown & SEND_SHUTDOWN)) {
 594                         if (sock_wspace(sk) >= tcp_min_write_space(sk)) {
 595                                 mask |= POLLOUT | POLLWRNORM;
 596                         } else {  /* send SIGIO later */
 597                                 sk->socket->flags |= SO_NOSPACE;
 598                         }
 599                 }
 600
 601                 if (tp->urg_data & URG_VALID)
 602                         mask |= POLLPRI;
 603         }
 604         return mask;
 605 }
 606
 607 /*
 608  *      Socket write_space callback.
 609  *      This (or rather the sock_wake_async) should agree with poll.
 610  */
 611 void tcp_write_space(struct sock *sk)
 612 {
 613         if (sk->dead)
 614                 return;
 615
 616         wake_up_interruptible(sk->sleep);
 617         if (sock_wspace(sk) >=
 618             tcp_min_write_space(sk))
 619                 sock_wake_async(sk->socket, 2);
 620 }
 621
 622
 623 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 624 {
 625         int answ;
 626
 627         switch(cmd) {
 628         case TIOCINQ:
 629 #ifdef FIXME    /* FIXME: */
 630         case FIONREAD:
 631 #endif
 632                 if (sk->state == TCP_LISTEN)
 633                         return(-EINVAL);
 634                 lock_sock(sk);
 635                 answ = tcp_readable(sk);
 636                 release_sock(sk);
 637                 break;
 638         case SIOCATMARK:
 639                 {
 640                         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 641                         answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
 642                         break;
 643                 }
 644         case TIOCOUTQ:
 645                 if (sk->state == TCP_LISTEN)
 646                         return(-EINVAL);
 647                 answ = sock_wspace(sk);
 648                 break;
 649         default:
 650                 return(-ENOIOCTLCMD);
 651         };
 652
 653         return put_user(answ, (int *)arg);
 654 }
 655
 656 /*
 657  *      Wait for a socket to get into the connected state
 658  *
 659  *      Note: Must be called with the socket locked, and it
 660  *            runs with the kernel fully unlocked.
 661  */
 662 static int wait_for_tcp_connect(struct sock * sk, int flags)
 663 {
 664         struct task_struct *tsk = current;
 665         DECLARE_WAITQUEUE(wait, tsk);
 666
 667         while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
 668                 if(sk->err)
 669                         return sock_error(sk);
 670                 if((1 << sk->state) &
 671                    ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 672                         if(sk->keepopen && !(flags&MSG_NOSIGNAL))
 673                                 send_sig(SIGPIPE, tsk, 0);
 674                         return -EPIPE;
 675                 }
 676                 if(flags & MSG_DONTWAIT)
 677                         return -EAGAIN;
 678                 if(signal_pending(tsk))
 679                         return -ERESTARTSYS;
 680
 681                 tsk->state = TASK_INTERRUPTIBLE;
 682                 add_wait_queue(sk->sleep, &wait);
 683                 release_sock(sk);
 684
 685                 if (((1 << sk->state) & ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT)) &&
 686                     sk->err == 0)
 687                         schedule();
 688
 689                 tsk->state = TASK_RUNNING;
 690                 remove_wait_queue(sk->sleep, &wait);
 691                 lock_sock(sk);
 692         }
 693         return 0;
 694 }
 695
 696 static inline int tcp_memory_free(struct sock *sk)
 697 {
 698         return atomic_read(&sk->wmem_alloc) < sk->sndbuf;
 699 }
 700
 701 /*
 702  *      Wait for more memory for a socket
 703  *
 704  * NOTE: This runs with the kernel fully unlocked.
 705  */
 706 static void wait_for_tcp_memory(struct sock * sk)
 707 {
 708         release_sock(sk);
 709         if (!tcp_memory_free(sk)) {
 710                 DECLARE_WAITQUEUE(wait, current);
 711
 712                 sk->socket->flags &= ~SO_NOSPACE;
 713                 add_wait_queue(sk->sleep, &wait);
 714                 for (;;) {
 715                         if (signal_pending(current))
 716                                 break;
 717                         current->state = TASK_INTERRUPTIBLE;
 718                         if (tcp_memory_free(sk))
 719                                 break;
 720                         if (sk->shutdown & SEND_SHUTDOWN)
 721                                 break;
 722                         if (sk->err)
 723                                 break;
 724                         schedule();
 725                 }
 726                 current->state = TASK_RUNNING;
 727                 remove_wait_queue(sk->sleep, &wait);
 728         }
 729         lock_sock(sk);
 730 }
 731
 732 /* When all user supplied data has been queued set the PSH bit */
 733 #define PSH_NEEDED (seglen == 0 && iovlen == 0)
 734
 735 /*
 736  *      This routine copies from a user buffer into a socket,
 737  *      and starts the transmit system.
 738  *
 739  *      Note: acquires the socket lock itself and releases it on return.
 740  */
 741
 742 int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
 743 {
     /* Copies the caller's iovec (msg->msg_iov) into socket buffers and
      * hands them to tcp_send_skb().
      *
      * Locking: drops the kernel lock and takes the socket lock on entry;
      * the "out" label undoes both before returning.
      *
      * Returns the number of bytes queued when the whole iovec was
      * consumed.  On a socket error, send-side shutdown, or failed
      * allocation (MSG_DONTWAIT / pending signal) it returns the partial
      * count if anything was queued, otherwise a negative errno.  A fault
      * caught by the err checks below returns -EFAULT.
      */
 744         struct iovec *iov;
 745         struct tcp_opt *tp;
 746         struct sk_buff *skb;
 747         int iovlen, flags;
 748         int mss_now;
 749         int err, copied;
 750
 751         unlock_kernel();
 752         lock_sock(sk);
 753
 754         err = 0;
 755         tp = &(sk->tp_pinfo.af_tcp);
 756
 757         /* Wait for a connection to finish. */
 758         flags = msg->msg_flags;
 759         if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 760                 if((err = wait_for_tcp_connect(sk, flags)) != 0)
 761                         goto out;
 762
 763         /* This should be in poll */
 764         sk->socket->flags &= ~SO_NOSPACE; /* clear SIGIO XXX */
 765
 766         mss_now = tcp_current_mss(sk);
 767
 768         /* Ok commence sending. */
 769         iovlen = msg->msg_iovlen;
 770         iov = msg->msg_iov;
 771         copied = 0;
 772
             /* Outer loop: one pass per iovec entry.  iovlen counts the
              * entries still to come, which is what PSH_NEEDED tests.
              */
 773         while(--iovlen >= 0) {
 774                 int seglen=iov->iov_len;
 775                 unsigned char * from=iov->iov_base;
 776
 777                 iov++;
 778
                     /* Inner loop: runs until this entry is fully queued. */
 779                 while(seglen > 0) {
 780                         int copy, tmp, queue_it, psh;
 781
                             /* err can only be nonzero here if the append
                              * path below recorded a fault on the previous
                              * pass and then hit its "continue".
                              * NOTE(review): if that append consumed the
                              * last bytes of the last iovec entry, both
                              * loops end without reaching this check and
                              * err is overwritten with copied -- confirm.
                              */
 782                         if (err)
 783                                 goto do_fault2;
 784
 785                         /* Stop on errors. */
 786                         if (sk->err)
 787                                 goto do_sock_err;
 788
 789                         /* Make sure that we are established. */
 790                         if (sk->shutdown & SEND_SHUTDOWN)
 791                                 goto do_shutdown;
 792
 793                         /* Now we need to check if we have a half
 794                          * built packet we can tack some data onto.
 795                          */
 796                         if (tp->send_head && !(flags & MSG_OOB)) {
                                     /* Candidate is the tail of the write queue. */
 797                                 skb = sk->write_queue.prev;
 798                                 copy = skb->len;
 799                                 /* If the remote does SWS avoidance we should
 800                                  * queue the best we can if not we should in
 801                                  * fact send multiple packets...
 802                                  * A method for detecting this would be most
 803                                  * welcome.
 804                                  */
                                     /* NOTE(review): the last test compares
                                      * sequence numbers with a plain '<', which
                                      * is not wrap-safe; tcp_recvmsg() uses
                                      * before() for this kind of comparison --
                                      * confirm whether that is wanted here.
                                      */
 805                                 if (skb_tailroom(skb) > 0 &&
 806                                     (mss_now - copy) > 0 &&
 807                                     tp->snd_nxt < TCP_SKB_CB(skb)->end_seq) {
                                             /* Despite the name this is nonzero
                                              * whenever the existing length is
                                              * not a multiple of 4.  In that case
                                              * the data is copied plainly and the
                                              * checksum recomputed over the whole
                                              * buffer instead of being extended.
                                              */
 808                                         int last_byte_was_odd = (copy % 4);
 809
                                             /* Room left up to mss_now, limited by
                                              * the skb tailroom and the bytes left
                                              * in this iovec entry.
                                              */
 810                                         copy = mss_now - copy;
 811                                         if(copy > skb_tailroom(skb))
 812                                                 copy = skb_tailroom(skb);
 813                                         if(copy > seglen)
 814                                                 copy = seglen;
 815                                         if(last_byte_was_odd) {
 816                                                 if(copy_from_user(skb_put(skb, copy),
 817                                                                   from, copy))
 818                                                         err = -EFAULT;
 819                                                 skb->csum = csum_partial(skb->data,
 820                                                                          skb->len, 0);
 821                                         } else {
 822                                                 skb->csum =
 823                                                         csum_and_copy_from_user(
 824                                                         from, skb_put(skb, copy),
 825                                                         copy, skb->csum, &err);
 826                                         }
 827                                         /*
 828                                          * FIXME: the *_user functions should
 829                                          *        return how much data was
 830                                          *        copied before the fault
 831                                          *        occurred and then a partial
 832                                          *        packet with this data should
 833                                          *        be sent.  Unfortunately
 834                                          *        csum_and_copy_from_user doesn't
 835                                          *        return this information.
 836                                          *        ATM it might send partly zeroed
 837                                          *        data in this case.
 838                                          */
                                             /* Accounting advances even when err
                                              * was just set; the fault is acted on
                                              * at the top of the next pass.
                                              */
 839                                         tp->write_seq += copy;
 840                                         TCP_SKB_CB(skb)->end_seq += copy;
 841                                         from += copy;
 842                                         copied += copy;
 843                                         seglen -= copy;
 844                                         if (PSH_NEEDED)
 845                                                 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 846                                         continue;
 847                                 }
 848                         }
 849
 850                         /* We also need to worry about the window.  If
 851                          * window < 1/2 the maximum window we've seen
 852                          * from this host, don't use it.  This is
 853                          * sender side silly window prevention, as
 854                          * specified in RFC1122.  (Note that this is
 855                          * different than earlier versions of SWS
 856                          * prevention, e.g. RFC813.).  What we
 857                          * actually do is use the whole MSS.  Since
 858                          * the results in the right edge of the packet
 859                          * being outside the window, it will be queued
 860                          * for later rather than sent.
 861                          */
 862                         psh = 0;
 863                         copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
 864                         if(copy > (tp->max_window >> 1)) {
 865                                 copy = min(copy, mss_now);
 866                                 psh = 1;
 867                         } else {
 868                                 copy = mss_now;
 869                         }
 870                         if(copy > seglen)
 871                                 copy = seglen;
 872
 873                         /* Determine how large of a buffer to allocate.  */
 874                         tmp = MAX_HEADER + sk->prot->max_header;
 875                         if (copy < min(mss_now, tp->max_window >> 1) &&
 876                             !(flags & MSG_OOB)) {
 877                                 tmp += min(mss_now, tp->max_window);
 878
 879                                 /* What is happening here is that we want to
 880                                  * tack on later members of the users iovec
 881                                  * if possible into a single frame.  When we
 882                                  * leave this loop our caller checks to see if
 883                                  * we can send queued frames onto the wire.
 884                                  * See tcp_v[46]_sendmsg() for this.
 885                                  */
 886                                 queue_it = 1;
 887                         } else {
 888                                 tmp += copy;
 889                                 queue_it = 0;
 890                         }
 891                         skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL);
 892
 893                         /* If we didn't get any memory, we need to sleep. */
 894                         if (skb == NULL) {
 895                                 sk->socket->flags |= SO_NOSPACE;
 896                                 if (flags&MSG_DONTWAIT) {
 897                                         err = -EAGAIN;
 898                                         goto do_interrupted;
 899                                 }
 900                                 if (signal_pending(current)) {
 901                                         err = -ERESTARTSYS;
 902                                         goto do_interrupted;
 903                                 }
                                     /* Push out what is already queued, then
                                      * wait; wait_for_tcp_memory() releases and
                                      * retakes the socket lock around the sleep.
                                      */
 904                                 tcp_push_pending_frames(sk, tp);
 905                                 wait_for_tcp_memory(sk);
 906
 907                                 /* If SACK's were formed or PMTU events happened,
 908                                  * we must find out about it.
 909                                  */
 910                                 mss_now = tcp_current_mss(sk);
 911                                 continue;
 912                         }
 913
                             /* seglen is reduced before the flags are built so
                              * that PSH_NEEDED sees the post-copy remainder.
                              */
 914                         seglen -= copy;
 915
 916                         /* Prepare control bits for TCP header creation engine. */
 917                         TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK |
 918                                                   ((PSH_NEEDED || psh) ?
 919                                                    TCPCB_FLAG_PSH : 0));
 920                         TCP_SKB_CB(skb)->sacked = 0;
 921                         if (flags & MSG_OOB) {
 922                                 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_URG;
 923                                 TCP_SKB_CB(skb)->urg_ptr = copy;
 924                         } else
 925                                 TCP_SKB_CB(skb)->urg_ptr = 0;
 926
 927                         /* TCP data bytes are SKB_PUT() on top, later
 928                          * TCP+IP+DEV headers are SKB_PUSH()'d beneath.
 929                          * Reserve header space and checksum the data.
 930                          */
 931                         skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
 932                         skb->csum = csum_and_copy_from_user(from,
 933                                         skb_put(skb, copy), copy, 0, &err);
 934
 935                         if (err)
 936                                 goto do_fault;
 937
 938                         from += copy;
 939                         copied += copy;
 940
 941                         TCP_SKB_CB(skb)->seq = tp->write_seq;
 942                         TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy;
 943
 944                         /* This advances tp->write_seq for us. */
 945                         tcp_send_skb(sk, skb, queue_it);
 946                 }
 947         }
             /* Whole iovec consumed: clear sk->err and report the byte count. */
 948         sk->err = 0;
 949         err = copied;
 950         goto out;
 951
     /* The next three exits prefer a partial byte count over an error code. */
 952 do_sock_err:
 953         if(copied)
 954                 err = copied;
 955         else
 956                 err = sock_error(sk);
 957         goto out;
 958 do_shutdown:
 959         if(copied)
 960                 err = copied;
 961         else {
 962                 if (!(flags&MSG_NOSIGNAL))
 963                         send_sig(SIGPIPE, current, 0);
 964                 err = -EPIPE;
 965         }
 966         goto out;
 967 do_interrupted:
 968         if(copied)
 969                 err = copied;
 970         goto out;
     /* do_fault is reached only after a fresh skb was allocated but before it
      * was handed to tcp_send_skb(), so it is freed here.
      */
 971 do_fault:
 972         kfree_skb(skb);
 973 do_fault2:
 974         err = -EFAULT;
     /* Common exit: push queued frames, release the socket lock and retake
      * the kernel lock that was dropped on entry.
      */
 975 out:
 976         tcp_push_pending_frames(sk, tp);
 977         release_sock(sk);
 978         lock_kernel();
 979         return err;
 980 }
 981
 982 #undef PSH_NEEDED
 983
 984 /*
 985  *      Send an ack if one is backlogged at this point. Ought to merge
 986  *      this with tcp_send_ack().
 987  *      This is called for delayed acks also.
 988  */
 989
 990 void tcp_read_wakeup(struct sock *sk)
 991 {
 992         /* If we're closed, don't send an ack, or we'll get a RST
 993          * from the closed destination.
 994          */
 995         if (sk->state != TCP_CLOSE)
 996                 tcp_send_ack(sk);
 997 }
 998
 999 /*
1000  *      Handle reading urgent data. BSD has very simple semantics for
1001  *      this, no blocking and very strange errors 8)
1002  */
1003
1004 static int tcp_recv_urg(struct sock * sk, int nonblock,
1005                         struct msghdr *msg, int len, int flags,
1006                         int *addr_len)
1007 {
1008         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1009
1010         /* No URG data to read. */
1011         if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ)
1012                 return -EINVAL; /* Yes this is right ! */
1013
1014         if (sk->err)
1015                 return sock_error(sk);
1016
1017         if (sk->done)
1018                 return -ENOTCONN;
1019
1020         if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) {
1021                 sk->done = 1;
1022                 return 0;
1023         }
1024
1025         lock_sock(sk);
1026         if (tp->urg_data & URG_VALID) {
1027                 int err = 0;
1028                 char c = tp->urg_data;
1029
1030                 if (!(flags & MSG_PEEK))
1031                         tp->urg_data = URG_READ;
1032
1033                 if(msg->msg_name)
1034                         tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
1035                                                        msg->msg_name);
1036
1037                 if(addr_len)
1038                         *addr_len = tp->af_specific->sockaddr_len;
1039
1040                 /* Read urgent data. */
1041                 msg->msg_flags|=MSG_OOB;
1042                 release_sock(sk);
1043
1044                 if(len>0)
1045                 {
1046                         err = memcpy_toiovec(msg->msg_iov, &c, 1);
1047                         /* N.B. already set above ... */
1048                         msg->msg_flags|=MSG_OOB;
1049                 }
1050                 else
1051                         msg->msg_flags|=MSG_TRUNC;
1052
1053                 /* N.B. Is this right?? If len == 0 we didn't read any data */
1054                 return err ? -EFAULT : 1;
1055         }
1056         release_sock(sk);
1057
1058         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1059          * the available implementations agree in this case:
1060          * this call should never block, independent of the
1061          * blocking state of the socket.
1062          * Mike <pall@rz.uni-karlsruhe.de>
1063          */
1064         return -EAGAIN;
1065 }
1066
1067 /*
1068  *      Release a skb if it is no longer needed. This routine
1069  *      must be called with interrupts disabled or with the
1070  *      socket locked so that the sk_buff queue operation is ok.
1071  */
1072
1073 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
1074 {
1075         __skb_unlink(skb, &sk->receive_queue);
1076         kfree_skb(skb);
1077 }
1078
1079 /* Clean up the receive buffer for full frames taken by the user,
1080  * then send an ACK if necessary.  COPIED is the number of bytes
1081  * tcp_recvmsg has given to the user so far, it speeds up the
1082  * calculation of whether or not we must ACK for the sake of
1083  * a window update.
1084  */
1085 static void cleanup_rbuf(struct sock *sk, int copied)
1086 {
1087         struct sk_buff *skb;
1088
1089         /* NOTE! The socket must be locked, so that we don't get
1090          * a messed-up receive queue.
1091          */
1092         while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
1093                 if (!skb->used || atomic_read(&skb->users) > 1)
1094                         break;
1095                 tcp_eat_skb(sk, skb);
1096         }
1097
1098         /* We send an ACK if we can now advertise a non-zero window
1099          * which has been raised "significantly".
1100          */
1101         if(copied > 0) {
1102                 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1103                 __u32 rcv_window_now = tcp_receive_window(tp);
1104                 __u32 new_window = __tcp_select_window(sk);
1105
1106                 /* We won't be raising the window any further than
1107                  * the window-clamp allows.  Our window selection
1108                  * also keeps things a nice multiple of MSS.  These
1109                  * checks are necessary to prevent spurious ACKs
1110                  * which don't advertize a larger window.
1111                  */
1112                 if((new_window && (new_window >= rcv_window_now * 2)) &&
1113                    ((rcv_window_now + tp->mss_cache) <= tp->window_clamp))
1114                         tcp_read_wakeup(sk);
1115         }
1116 }
1117
1118
1119 /*
1120  *      This routine copies from a sock struct into the user buffer.
1121  */
1122
1123 int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
1124                 int len, int nonblock, int flags, int *addr_len)
1125 {
     /* Copies up to len bytes of received data from sk->receive_queue into
      * msg->msg_iov.  Returns the number of bytes copied, or a negative
      * errno.
      *
      * MSG_OOB is delegated to tcp_recv_urg().  With MSG_PEEK a private
      * cursor (peek_seq) is advanced instead of tp->copied_seq.  With
      * MSG_WAITALL the loop aims for len bytes rather than at least one.
      *
      * The kernel lock is dropped and the socket locked for the main loop;
      * both are restored before the final return.
      */
1126         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1127         DECLARE_WAITQUEUE(wait, current);
1128         int copied = 0;
1129         u32 peek_seq;
1130         volatile u32 *seq;      /* So gcc doesn't overoptimise */
1131         unsigned long used;
1132         int err = 0;
1133         int target = 1;         /* Read at least this many bytes */
1134
             /* The early returns below happen before any lock or wait-queue
              * state is changed, so they need no cleanup.
              */
1135         if (sk->err)
1136                 return sock_error(sk);
1137
1138         if (sk->state == TCP_LISTEN)
1139                 return -ENOTCONN;
1140
1141         /* Urgent data needs to be handled specially. */
1142         if (flags & MSG_OOB)
1143                 return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
1144
1145         /*      Copying sequence to update. This is volatile to handle
1146          *      the multi-reader case neatly (memcpy_to/fromfs might be
1147          *      inline and thus not flush cached variables otherwise).
1148          */
1149         peek_seq = tp->copied_seq;
1150         seq = &tp->copied_seq;
1151         if (flags & MSG_PEEK)
1152                 seq = &peek_seq;
1153
1154         /* Handle the POSIX bogosity MSG_WAITALL. */
1155         if (flags & MSG_WAITALL)
1156                 target=len;
1157
1158         unlock_kernel();
1159         add_wait_queue(sk->sleep, &wait);
1160         lock_sock(sk);
1161
1162         /*
1163          *      BUG BUG BUG
1164          *      This violates 1003.1g compliance. We must wait for
1165          *      data to exist even if we read none!
1166          */
1167
1168         while (len > 0) {
1169                 struct sk_buff * skb;
1170                 u32 offset;
1171
1172                 /* Are we at urgent data? Stop if we have read anything. */
1173                 if (copied && tp->urg_data && tp->urg_seq == *seq)
1174                         break;
1175
1176                 /* We need to check signals first, to get correct SIGURG
1177                  * handling. FIXME: Need to check this doesnt impact 1003.1g
1178                  * and move it down to the bottom of the loop
1179                  */
1180                 if (signal_pending(current)) {
1181                         if (copied)
1182                                 break;
1183                         copied = -ERESTARTSYS;
1184                         if (nonblock)
1185                                 copied = -EAGAIN;
1186                         break;
1187                 }
1188
1189                 /* Next get a buffer. */
                     /* The task state is set before the queue is scanned;
                      * presumably so that a wakeup arriving during the scan
                      * is not lost before schedule() -- TODO confirm.
                      */
1190                 current->state = TASK_INTERRUPTIBLE;
1191
                     /* Walk the receive queue for the buffer containing *seq. */
1192                 skb = skb_peek(&sk->receive_queue);
1193                 do {
1194                         if (!skb)
1195                                 break;
1196
1197                         /* Now that we have two receive queues this
1198                          * shouldn't happen.
1199                          */
1200                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1201                                 printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
1202                                        *seq, TCP_SKB_CB(skb)->seq);
1203                                 break;
1204                         }
                             /* offset is the position of *seq inside this
                              * buffer's data; a SYN takes one sequence number
                              * without a data byte, hence the adjustment.
                              */
1205                         offset = *seq - TCP_SKB_CB(skb)->seq;
1206                         if (skb->h.th->syn)
1207                                 offset--;
1208                         if (offset < skb->len)
1209                                 goto found_ok_skb;
1210                         if (skb->h.th->fin)
1211                                 goto found_fin_ok;
                             /* Fully consumed buffer: mark it so that
                              * cleanup_rbuf() may free it (not when peeking).
                              */
1212                         if (!(flags & MSG_PEEK))
1213                                 skb->used = 1;
1214                         skb = skb->next;
1215                 } while (skb != (struct sk_buff *)&sk->receive_queue);
1216
                     /* No usable buffer was found.  Stop if enough was read. */
1217                 if (copied >= target)
1218                         break;
1219
1220                 /*
1221                    These three lines and clause if (sk->state == TCP_CLOSE)
1222                    are unlikely to be correct, if target > 1.
1223                    I DO NOT FIX IT, because I have no idea, what
1224                    POSIX prescribes to make here. Probably, it really
1225                    wants to lose data 8), if not all target is received.
1226                                                                  --ANK
1227                  */
1228                 if (sk->err && !(flags&MSG_PEEK)) {
1229                         copied = sock_error(sk);
1230                         break;
1231                 }
1232
1233                 if (sk->shutdown & RCV_SHUTDOWN) {
1234                         sk->done = 1;
1235                         break;
1236                 }
1237
                     /* On a closed socket the first read ends without an
                      * error and sets sk->done; later reads get -ENOTCONN.
                      */
1238                 if (sk->state == TCP_CLOSE) {
1239                         if (!sk->done) {
1240                                 sk->done = 1;
1241                                 break;
1242                         }
1243                         copied = -ENOTCONN;
1244                         break;
1245                 }
1246
1247                 if (nonblock) {
1248                         copied = -EAGAIN;
1249                         break;
1250                 }
1251
                     /* Free consumed buffers (possibly sending an ACK), drop
                      * the socket lock and sleep; the lock is retaken before
                      * the loop restarts.
                      */
1252                 cleanup_rbuf(sk, copied);
1253                 release_sock(sk);
1254                 sk->socket->flags |= SO_WAITDATA;
1255                 schedule();
1256                 sk->socket->flags &= ~SO_WAITDATA;
1257                 lock_sock(sk);
1258                 continue;
1259
1260         found_ok_skb:
1261                 /*      Lock the buffer. We can be fairly relaxed as
1262                  *      an interrupt will never steal a buffer we are
1263                  *      using unless I've missed something serious in
1264                  *      tcp_data.
1265                  */
1266                 atomic_inc(&skb->users);
1267
1268                 /* Ok so how much can we use? */
1269                 used = skb->len - offset;
1270                 if (len < used)
1271                         used = len;
1272
1273                 /* Do we have urgent data here? */
1274                 if (tp->urg_data) {
1275                         u32 urg_offset = tp->urg_seq - *seq;
1276                         if (urg_offset < used) {
                                     /* urg_offset == 0: the urgent byte is next.
                                      * Unless it is delivered inline, step over
                                      * it.  Otherwise end this copy just before
                                      * the urgent byte.
                                      */
1277                                 if (!urg_offset) {
1278                                         if (!sk->urginline) {
1279                                                 ++*seq;
1280                                                 offset++;
1281                                                 used--;
1282                                         }
1283                                 } else
1284                                         used = urg_offset;
1285                         }
1286                 }
1287
1288                 /*      Copy it - We _MUST_ update *seq first so that we
1289                  *      don't ever double read when we have dual readers
1290                  */
1291                 *seq += used;
1292
1293                 /*      This memcpy_toiovec can sleep. If it sleeps and we
1294                  *      do a second read it relies on the skb->users to avoid
1295                  *      a crash when cleanup_rbuf() gets called.
1296                  */
1297                 err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used);
1298                 if (err) {
1299                         /* Exception. Bailout! */
                             /* NOTE(review): *seq was already advanced past
                              * the bytes that failed to copy, and any count
                              * accumulated so far is replaced by -EFAULT.
                              */
1300                         atomic_dec(&skb->users);
1301                         copied = -EFAULT;
1302                         break;
1303                 }
1304
1305                 copied += used;
1306                 len -= used;
1307
1308                 /*      We now will not sleep again until we are finished
1309                  *      with skb. Sorry if you are doing the SMP port
1310                  *      but you'll just have to fix it neatly ;)
1311                  *
1312                  *      Very funny Alan... -DaveM
1313                  */
1314                 atomic_dec(&skb->users);
1315
                     /* Once the read position has passed the urgent sequence
                      * number the urgent state is cleared.
                      */
1316                 if (after(tp->copied_seq,tp->urg_seq))
1317                         tp->urg_data = 0;
                     /* Buffer not exhausted yet: go round again for the rest. */
1318                 if (used + offset < skb->len)
1319                         continue;
1320
1321                 /*      Process the FIN. We may also need to handle PSH
1322                  *      here and make it break out of MSG_WAITALL.
1323                  */
1324                 if (skb->h.th->fin)
1325                         goto found_fin_ok;
1326                 if (flags & MSG_PEEK)
1327                         continue;
                     /* Buffer fully read: mark it used and free it now if no
                      * other holder remains.
                      */
1328                 skb->used = 1;
1329                 if (atomic_read(&skb->users) == 1)
1330                         tcp_eat_skb(sk, skb);
1331                 continue;
1332
1333         found_fin_ok:
                     /* Advance the cursor by one for the FIN.  When not
                      * peeking, also mark the buffer used and record the
                      * receive-side shutdown.
                      */
1334                 ++*seq;
1335                 if (flags & MSG_PEEK)
1336                         break;
1337
1338                 /* All is done. */
1339                 skb->used = 1;
1340                 sk->shutdown |= RCV_SHUTDOWN;
1341                 break;
1342         }
1343
             /* The peer address is filled in only when data was returned;
              * the address length is reported whenever addr_len is given.
              */
1344         if(copied > 0 && msg->msg_name)
1345                 tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
1346                                                msg->msg_name);
1347
1348         if(addr_len)
1349                 *addr_len = tp->af_specific->sockaddr_len;
1350
1351         remove_wait_queue(sk->sleep, &wait);
1352         current->state = TASK_RUNNING;
1353
1354         /* Clean up data we have read: This will do ACK frames. */
1355         cleanup_rbuf(sk, copied);
1356         release_sock(sk);
1357         lock_kernel();
1358         return copied;
1359 }
1360
1361 /*
1362  * Check whether to renew the timer.
1363  */
1364 static inline void tcp_check_fin_timer(struct sock *sk)
1365 {
1366         if (sk->state == TCP_FIN_WAIT2 && !sk->timer.prev)
1367                 tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
1368 }
1369
1370 /*
1371  *      State processing on a close. This implements the state shift for
1372  *      sending our FIN frame. Note that we only send a FIN for some
1373  *      states. A shutdown() may have already sent the FIN, or we may be
1374  *      closed.
1375  */
1376
1377 static unsigned char new_state[16] = {
1378   /* current state:        new state:      action:      */
1379   /* (Invalid)          */ TCP_CLOSE,
1380   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1381   /* TCP_SYN_SENT       */ TCP_CLOSE,
1382   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1383   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1384   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1385   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1386   /* TCP_CLOSE          */ TCP_CLOSE,
1387   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1388   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1389   /* TCP_LISTEN         */ TCP_CLOSE,
1390   /* TCP_CLOSING        */ TCP_CLOSING,
1391 };
1392
1393 static int tcp_close_state(struct sock *sk, int dead)
1394 {
1395         int next = (int) new_state[sk->state];
1396         int ns = (next & TCP_STATE_MASK);
1397
1398         tcp_set_state(sk, ns);
1399
1400         /*      This is a (useful) BSD violating of the RFC. There is a
1401          *      problem with TCP as specified in that the other end could
1402          *      keep a socket open forever with no application left this end.
1403          *      We use a 3 minute timeout (about the same as BSD) then kill
1404          *      our end. If they send after that then tough - BUT: long enough
1405          *      that we won't make the old 4*rto = almost no time - whoops
1406          *      reset mistake.
1407          */
1408         if (dead)
1409                 tcp_check_fin_timer(sk);
1410
1411         return (next & TCP_ACTION_FIN);
1412 }
1413
1414 /*
1415  *      Shutdown the sending side of a connection. Much like close except
1416  *      that we don't receive shut down or set sk->dead.
1417  */
1418
1419 void tcp_shutdown(struct sock *sk, int how)
1420 {
1421         /*      We need to grab some memory, and put together a FIN,
1422          *      and then put it into the queue to be sent.
1423          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1424          */
1425         if (!(how & SEND_SHUTDOWN))
1426                 return;
1427
1428         /* If we've already sent a FIN, or it's a closed state, skip this. */
1429         if ((1 << sk->state) &
1430             (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
1431                 lock_sock(sk);
1432
1433                 /* Clear out any half completed packets.  FIN if needed. */
1434                 if (tcp_close_state(sk,0))
1435                         tcp_send_fin(sk);
1436
1437                 release_sock(sk);
1438         }
1439 }
1440
1441
1442 /*
1443  *      Return 1 if we still have things to send in our buffers.
1444  */
1445
1446 static inline int closing(struct sock * sk)
1447 {
1448         return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
1449 }
1450
1451 /*
1452  *      This routine closes sockets which have been at least partially
1453  *      opened, but not yet accepted. Currently it is only called by
1454  *      tcp_close, and timeout mirrors the value there.
1455  */
1456
1457 static void tcp_close_pending (struct sock *sk)
1458 {
1459         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1460         struct open_request *req = tp->syn_wait_queue;
1461
1462         while(req) {
1463                 struct open_request *iter;
1464
1465                 if (req->sk)
1466                         tcp_close(req->sk, 0);
1467
1468                 iter = req;
1469                 req = req->dl_next;
1470
1471                 (*iter->class->destructor)(iter);
1472                 tcp_dec_slow_timer(TCP_SLT_SYNACK);
1473                 sk->ack_backlog--;
1474                 tcp_openreq_free(iter);
1475         }
1476
1477         tcp_synq_init(tp);
1478 }
1479
1480 void tcp_close(struct sock *sk, long timeout)
1481 {
1482         struct sk_buff *skb;
1483         int data_was_unread = 0;
1484
1485         /* We need to grab some memory, and put together a FIN,
1486          * and then put it into the queue to be sent.
1487          */
1488         unlock_kernel();
1489         lock_sock(sk);
1490         if(sk->state == TCP_LISTEN) {
1491                 /* Special case. */
1492                 tcp_set_state(sk, TCP_CLOSE);
1493                 tcp_close_pending(sk);
1494                 release_sock(sk);
1495                 lock_kernel();
1496                 sk->dead = 1;
1497                 return;
1498         }
1499
1500         /* It is questionable, what the role of this is now.
1501          * In any event either it should be removed, or
1502          * increment of SLT_KEEPALIVE be done, this is causing
1503          * big problems.  For now I comment it out.  -DaveM
1504          */
1505         /* sk->keepopen = 1; */
1506         sk->shutdown = SHUTDOWN_MASK;
1507
1508         if (!sk->dead)
1509                 sk->state_change(sk);
1510
1511         /*  We need to flush the recv. buffs.  We do this only on the
1512          *  descriptor close, not protocol-sourced closes, because the
1513          *  reader process may not have drained the data yet!
1514          */
1515         while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
1516                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin;
1517                 data_was_unread += len;
1518                 kfree_skb(skb);
1519         }
1520
1521         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1522          * 3.10, we send a RST here because data was lost.  To
1523          * witness the awful effects of the old behavior of always
1524          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1525          * a bulk GET in an FTP client, suspend the process, wait
1526          * for the client to advertise a zero window, then kill -9
1527          * the FTP client, wheee...  Note: timeout is always zero
1528          * in such a case.
1529          */
1530         if(data_was_unread != 0) {
1531                 /* Unread data was tossed, zap the connection. */
1532                 tcp_set_state(sk, TCP_CLOSE);
1533                 tcp_send_active_reset(sk);
1534         } else if (tcp_close_state(sk,1)) {
1535                 /* We FIN if the application ate all the data before
1536                  * zapping the connection.
1537                  */
1538                 tcp_send_fin(sk);
1539         }
1540
1541         if (timeout) {
1542                 struct task_struct *tsk = current;
1543                 DECLARE_WAITQUEUE(wait, current);
1544
1545                 add_wait_queue(sk->sleep, &wait);
1546                 release_sock(sk);
1547
1548                 while (1) {
1549                         tsk->state = TASK_INTERRUPTIBLE;
1550                         if (!closing(sk))
1551                                 break;
1552                         timeout = schedule_timeout(timeout);
1553                         if (signal_pending(tsk) || !timeout)
1554                                 break;
1555                 }
1556
1557                 tsk->state = TASK_RUNNING;
1558                 remove_wait_queue(sk->sleep, &wait);
1559
1560                 lock_sock(sk);
1561         }
1562
1563         /* Now that the socket is dead, if we are in the FIN_WAIT2 state
1564          * we may need to set up a timer.
1565          */
1566         tcp_check_fin_timer(sk);
1567
1568         release_sock(sk);
1569         lock_kernel();
1570         sk->dead = 1;
1571 }
1572
1573 /*
1574  *      Wait for an incoming connection, avoid race
1575  *      conditions. This must be called with the socket locked,
1576  *      and without the kernel lock held.
1577  */
1578 static struct open_request * wait_for_connect(struct sock * sk,
1579                                               struct open_request **pprev)
1580 {
1581         DECLARE_WAITQUEUE(wait, current);
1582         struct open_request *req;
1583
1584         /*
1585          * True wake-one mechanism for incoming connections: only
1586          * one process gets woken up, not the 'whole herd'.
1587          * Since we do not 'race & poll' for established sockets
1588          * anymore, the common case will execute the loop only once.
1589          *
1590          * Or rather, it _would_ execute only once if it wasn't for
1591          * some extraneous wakeups that currently happen.
1592          *
1593          * Subtle issue: "add_wait_queue_exclusive()" will be added
1594          * after any current non-exclusive waiters, and we know that
1595          * it will always _stay_ after any new non-exclusive waiters
1596          * because all non-exclusive waiters are added at the
1597          * beginning of the wait-queue. As such, it's ok to "drop"
1598          * our exclusiveness temporarily when we get woken up without
1599          * having to remove and re-insert us on the wait queue.
1600          */
1601         add_wait_queue_exclusive(sk->sleep, &wait);
1602         for (;;) {
1603                 current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE;
1604                 release_sock(sk);
1605                 schedule();
1606                 lock_sock(sk);
1607                 req = tcp_find_established(&(sk->tp_pinfo.af_tcp), pprev);
1608                 if (req)
1609                         break;
1610                 if (signal_pending(current))
1611                         break;
1612         }
1613         current->state = TASK_RUNNING;
1614         remove_wait_queue(sk->sleep, &wait);
1615         return req;
1616 }
1617
1618 /*
1619  *      This will accept the next outstanding connection.
1620  *
1621  *      Be careful about race conditions here - this is subtle.
1622  */
1623
1624 struct sock *tcp_accept(struct sock *sk, int flags)
1625 {
1626         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1627         struct open_request *req, *prev;
1628         struct sock *newsk = NULL;
1629         int error;
1630
1631         unlock_kernel();
1632         lock_sock(sk);
1633
1634         /* We need to make sure that this socket is listening,
1635          * and that it has something pending.
1636          */
1637         error = EINVAL;
1638         if (sk->state != TCP_LISTEN)
1639                 goto out;
1640
1641         /* Find already established connection */
1642         req = tcp_find_established(tp, &prev);
1643         if (!req) {
1644                 /* If this is a non blocking socket don't sleep */
1645                 error = EAGAIN;
1646                 if (flags & O_NONBLOCK)
1647                         goto out;
1648
1649                 error = ERESTARTSYS;
1650                 req = wait_for_connect(sk, &prev);
1651                 if (!req)
1652                         goto out;
1653         }
1654
1655         tcp_synq_unlink(tp, req, prev);
1656         newsk = req->sk;
1657         req->class->destructor(req);
1658         tcp_openreq_free(req);
1659         sk->ack_backlog--;
1660         if(sk->keepopen)
1661                 tcp_inc_slow_timer(TCP_SLT_KEEPALIVE);
1662
1663         release_sock(sk);
1664         lock_kernel();
1665         return newsk;
1666
1667 out:
1668         /* sk should be in LISTEN state, thus accept can use sk->err for
1669          * internal purposes without stomping one anyone's feed.
1670          */
1671         sk->err = error;
1672         release_sock(sk);
1673         lock_kernel();
1674         return newsk;
1675 }
1676
1677 /*
1678  *      Socket option code for TCP.
1679  */
1680
1681 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
1682                    int optlen)
1683 {
1684         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1685         int val;
1686
1687         if (level != SOL_TCP)
1688                 return tp->af_specific->setsockopt(sk, level, optname,
1689                                                    optval, optlen);
1690
1691         if(optlen<sizeof(int))
1692                 return -EINVAL;
1693
1694         if (get_user(val, (int *)optval))
1695                 return -EFAULT;
1696
1697         switch(optname) {
1698         case TCP_MAXSEG:
1699                 /* values greater than interface MTU won't take effect.  however at
1700                  * the point when this call is done we typically don't yet know
1701                  * which interface is going to be used
1702                  */
1703                 if(val < 1 || val > MAX_WINDOW)
1704                         return -EINVAL;
1705                 tp->user_mss = val;
1706                 return 0;
1707
1708         case TCP_NODELAY:
1709                 /* You cannot try to use this and TCP_CORK in
1710                  * tandem, so let the user know.
1711                  */
1712                 if (sk->nonagle == 2)
1713                         return -EINVAL;
1714                 sk->nonagle = (val == 0) ? 0 : 1;
1715                 return 0;
1716
1717         case TCP_CORK:
1718                 /* When set indicates to always queue non-full frames.
1719                  * Later the user clears this option and we transmit
1720                  * any pending partial frames in the queue.  This is
1721                  * meant to be used alongside sendfile() to get properly
1722                  * filled frames when the user (for example) must write
1723                  * out headers with a write() call first and then use
1724                  * sendfile to send out the data parts.
1725                  *
1726                  * You cannot try to use TCP_NODELAY and this mechanism
1727                  * at the same time, so let the user know.
1728                  */
1729                 if (sk->nonagle == 1)
1730                         return -EINVAL;
1731                 if (val != 0) {
1732                         sk->nonagle = 2;
1733                 } else {
1734                         sk->nonagle = 0;
1735
1736                         lock_sock(sk);
1737                         tcp_push_pending_frames(sk, tp);
1738                         release_sock(sk);
1739                 }
1740                 return 0;
1741
1742         default:
1743                 return -ENOPROTOOPT;
1744         };
1745 }
1746
1747 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
1748                    int *optlen)
1749 {
1750         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1751         int val, len;
1752
1753         if(level != SOL_TCP)
1754                 return tp->af_specific->getsockopt(sk, level, optname,
1755                                                    optval, optlen);
1756
1757         if(get_user(len,optlen))
1758                 return -EFAULT;
1759
1760         len = min(len, sizeof(int));
1761
1762         switch(optname) {
1763         case TCP_MAXSEG:
1764                 val = tp->user_mss;
1765                 break;
1766         case TCP_NODELAY:
1767                 val = (sk->nonagle == 1);
1768                 break;
1769         case TCP_CORK:
1770                 val = (sk->nonagle == 2);
1771                 break;
1772         default:
1773                 return -ENOPROTOOPT;
1774         };
1775
1776         if(put_user(len, optlen))
1777                 return -EFAULT;
1778         if(copy_to_user(optval, &val,len))
1779                 return -EFAULT;
1780         return 0;
1781 }
1782
1783 void tcp_set_keepalive(struct sock *sk, int val)
1784 {
1785         if (!sk->keepopen && val)
1786                 tcp_inc_slow_timer(TCP_SLT_KEEPALIVE);
1787         else if (sk->keepopen && !val)
1788                 tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
1789 }
1790
1791 extern void __skb_cb_too_small_for_tcp(int, int);
1792
1793 void __init tcp_init(void)
1794 {
1795         struct sk_buff *skb = NULL;
1796         unsigned long goal;
1797         int order;
1798
1799         if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
1800                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
1801                                            sizeof(skb->cb));
1802
1803         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
1804                                                    sizeof(struct open_request),
1805                                                0, SLAB_HWCACHE_ALIGN,
1806                                                NULL, NULL);
1807         if(!tcp_openreq_cachep)
1808                 panic("tcp_init: Cannot alloc open_request cache.");
1809
1810         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
1811                                               sizeof(struct tcp_bind_bucket),
1812                                               0, SLAB_HWCACHE_ALIGN,
1813                                               NULL, NULL);
1814         if(!tcp_bucket_cachep)
1815                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
1816
1817         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
1818                                                 sizeof(struct tcp_tw_bucket),
1819                                                 0, SLAB_HWCACHE_ALIGN,
1820                                                 NULL, NULL);
1821         if(!tcp_timewait_cachep)
1822                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
1823
1824         /* Size and allocate the main established and bind bucket
1825          * hash tables.
1826          *
1827          * The methodology is similar to that of the buffer cache.
1828          */
1829         goal = num_physpages >> (20 - PAGE_SHIFT);
1830         for(order = 5; (1UL << order) < goal; order++)
1831                 ;
1832         do {
1833                 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
1834                         sizeof(struct sock *);
1835                 tcp_ehash = (struct sock **)
1836                         __get_free_pages(GFP_ATOMIC, order);
1837         } while (tcp_ehash == NULL && --order > 4);
1838
1839         if (!tcp_ehash)
1840                 panic("Failed to allocate TCP established hash table\n");
1841         memset(tcp_ehash, 0, tcp_ehash_size * sizeof(struct sock *));
1842
1843         do {
1844                 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
1845                         sizeof(struct tcp_bind_bucket *);
1846                 tcp_bhash = (struct tcp_bind_bucket **)
1847                         __get_free_pages(GFP_ATOMIC, order);
1848         } while (tcp_bhash == NULL && --order > 4);
1849
1850         if (!tcp_bhash)
1851                 panic("Failed to allocate TCP bind hash table\n");
1852         memset(tcp_bhash, 0, tcp_bhash_size * sizeof(struct tcp_bind_bucket *));
1853
1854         printk("TCP: Hash tables configured (established %d bind %d)\n",
1855                tcp_ehash_size, tcp_bhash_size);
1856 }