net/ipv4/tcp.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp.c,v 1.145 1999/06/29 12:35:56 davem Exp $
   9  *
  10  * Authors:     Ross Biro, <bir7@leland.Stanford.Edu>
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while
  26  *                                      sk->inuse=1 and was trying to connect
  27  *                                      (tcp_err()).
  28  *              Alan Cox        :       All icmp error handling was broken
   29  *                                      pointers passed were wrong and the
  30  *                                      socket was looked up backwards. Nobody
  31  *                                      tested any icmp error code obviously.
  32  *              Alan Cox        :       tcp_err() now handled properly. It
  33  *                                      wakes people on errors. poll
  34  *                                      behaves and the icmp error race
  35  *                                      has gone by moving it into sock.c
  36  *              Alan Cox        :       tcp_send_reset() fixed to work for
  37  *                                      everything not just packets for
  38  *                                      unknown sockets.
  39  *              Alan Cox        :       tcp option processing.
  40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
  41  *                                      syn rule wrong]
  42  *              Herp Rosmanith  :       More reset fixes
  43  *              Alan Cox        :       No longer acks invalid rst frames.
  44  *                                      Acking any kind of RST is right out.
  45  *              Alan Cox        :       Sets an ignore me flag on an rst
  46  *                                      receive otherwise odd bits of prattle
  47  *                                      escape still
  48  *              Alan Cox        :       Fixed another acking RST frame bug.
  49  *                                      Should stop LAN workplace lockups.
  50  *              Alan Cox        :       Some tidyups using the new skb list
  51  *                                      facilities
  52  *              Alan Cox        :       sk->keepopen now seems to work
  53  *              Alan Cox        :       Pulls options out correctly on accepts
  54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56  *                                      bit to skb ops.
  57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
  58  *                                      nasty.
  59  *              Alan Cox        :       Added some better commenting, as the
  60  *                                      tcp is hard to follow
  61  *              Alan Cox        :       Removed incorrect check for 20 * psh
  62  *      Michael O'Reilly        :       ack < copied bug fix.
  63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64  *              Alan Cox        :       FIN with no memory -> CRASH
  65  *              Alan Cox        :       Added socket option proto entries.
  66  *                                      Also added awareness of them to accept.
  67  *              Alan Cox        :       Added TCP options (SOL_TCP)
  68  *              Alan Cox        :       Switched wakeup calls to callbacks,
  69  *                                      so the kernel can layer network
  70  *                                      sockets.
  71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73  *              Alan Cox        :       RST frames sent on unsynchronised
  74  *                                      state ack error.
  75  *              Alan Cox        :       Put in missing check for SYN bit.
  76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
  77  *                                      window non shrink trick.
  78  *              Alan Cox        :       Added a couple of small NET2E timer
  79  *                                      fixes
  80  *              Charles Hedrick :       TCP fixes
  81  *              Toomas Tamm     :       TCP window fixes
  82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83  *              Charles Hedrick :       Rewrote most of it to actually work
  84  *              Linus           :       Rewrote tcp_read() and URG handling
  85  *                                      completely
  86  *              Gerhard Koerting:       Fixed some missing timer handling
  87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88  *              Gerhard Koerting:       PC/TCP workarounds
  89  *              Adam Caldwell   :       Assorted timer/timing errors
  90  *              Matthew Dillon  :       Fixed another RST bug
  91  *              Alan Cox        :       Move to kernel side addressing changes.
  92  *              Alan Cox        :       Beginning work on TCP fastpathing
  93  *                                      (not yet usable)
  94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95  *              Alan Cox        :       TCP fast path debugging
  96  *              Alan Cox        :       Window clamping
  97  *              Michael Riepe   :       Bug in tcp_check()
  98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
   99  *              Matt Dillon     :       Yet more small nasties removed from the
 100  *                                      TCP code (Be very nice to this man if
 101  *                                      tcp finally works 100%) 8)
 102  *              Alan Cox        :       BSD accept semantics.
 103  *              Alan Cox        :       Reset on closedown bug.
 104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105  *              Michael Pall    :       Handle poll() after URG properly in
 106  *                                      all cases.
 107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
 108  *                                      (multi URG PUSH broke rlogin).
 109  *              Michael Pall    :       Fix the multi URG PUSH problem in
 110  *                                      tcp_readable(), poll() after URG
 111  *                                      works now.
 112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
 113  *                                      BSD api.
 114  *              Alan Cox        :       Changed the semantics of sk->socket to
 115  *                                      fix a race and a signal problem with
 116  *                                      accept() and async I/O.
 117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120  *                                      clients/servers which listen in on
 121  *                                      fixed ports.
 122  *              Alan Cox        :       Cleaned the above up and shrank it to
 123  *                                      a sensible code size.
 124  *              Alan Cox        :       Self connect lockup fix.
 125  *              Alan Cox        :       No connect to multicast.
 126  *              Ross Biro       :       Close unaccepted children on master
 127  *                                      socket close.
 128  *              Alan Cox        :       Reset tracing code.
 129  *              Alan Cox        :       Spurious resets on shutdown.
 130  *              Alan Cox        :       Giant 15 minute/60 second timer error
 131  *              Alan Cox        :       Small whoops in polling before an
 132  *                                      accept.
 133  *              Alan Cox        :       Kept the state trace facility since
 134  *                                      it's handy for debugging.
 135  *              Alan Cox        :       More reset handler fixes.
 136  *              Alan Cox        :       Started rewriting the code based on
 137  *                                      the RFC's for other useful protocol
 138  *                                      references see: Comer, KA9Q NOS, and
 139  *                                      for a reference on the difference
 140  *                                      between specifications and how BSD
 141  *                                      works see the 4.4lite source.
 142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
 143  *                                      close.
 144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146  *              Alan Cox        :       Reimplemented timers as per the RFC
 147  *                                      and using multiple timers for sanity.
 148  *              Alan Cox        :       Small bug fixes, and a lot of new
 149  *                                      comments.
 150  *              Alan Cox        :       Fixed dual reader crash by locking
 151  *                                      the buffers (much like datagram.c)
 152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153  *                                      now gets fed up of retrying without
 154  *                                      (even a no space) answer.
 155  *              Alan Cox        :       Extracted closing code better
 156  *              Alan Cox        :       Fixed the closing state machine to
 157  *                                      resemble the RFC.
 158  *              Alan Cox        :       More 'per spec' fixes.
 159  *              Jorge Cwik      :       Even faster checksumming.
 160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161  *                                      only frames. At least one pc tcp stack
 162  *                                      generates them.
 163  *              Alan Cox        :       Cache last socket.
 164  *              Alan Cox        :       Per route irtt.
 165  *              Matt Day        :       poll()->select() match BSD precisely on error
 166  *              Alan Cox        :       New buffers
 167  *              Marc Tamsky     :       Various sk->prot->retransmits and
 168  *                                      sk->retransmits misupdating fixed.
 169  *                                      Fixed tcp_write_timeout: stuck close,
 170  *                                      and TCP syn retries gets used now.
 171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172  *                                      ack if state is TCP_CLOSED.
 173  *              Alan Cox        :       Look up device on a retransmit - routes may
 174  *                                      change. Doesn't yet cope with MSS shrink right
 175  *                                      but its a start!
 176  *              Marc Tamsky     :       Closing in closing fixes.
 177  *              Mike Shaver     :       RFC1122 verifications.
 178  *              Alan Cox        :       rcv_saddr errors.
 179  *              Alan Cox        :       Block double connect().
 180  *              Alan Cox        :       Small hooks for enSKIP.
 181  *              Alexey Kuznetsov:       Path MTU discovery.
 182  *              Alan Cox        :       Support soft errors.
 183  *              Alan Cox        :       Fix MTU discovery pathological case
 184  *                                      when the remote claims no mtu!
 185  *              Marc Tamsky     :       TCP_CLOSE fix.
 186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
 187  *                                      window but wrong (fixes NT lpd problems)
 188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
 189  *              Joerg Reuter    :       No modification of locked buffers in
 190  *                                      tcp_do_retransmit()
 191  *              Eric Schenk     :       Changed receiver side silly window
 192  *                                      avoidance algorithm to BSD style
 193  *                                      algorithm. This doubles throughput
 194  *                                      against machines running Solaris,
 195  *                                      and seems to result in general
 196  *                                      improvement.
 197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
 198  *      Willy Konynenberg       :       Transparent proxying support.
 199  *      Mike McLagan            :       Routing by source
 200  *              Keith Owens     :       Do proper merging with partial SKB's in
 201  *                                      tcp_do_sendmsg to avoid burstiness.
 202  *              Eric Schenk     :       Fix fast close down bug with
 203  *                                      shutdown() followed by close().
 204  *              Andi Kleen :    Make poll agree with SIGIO
 205  *
 206  *              This program is free software; you can redistribute it and/or
 207  *              modify it under the terms of the GNU General Public License
 208  *              as published by the Free Software Foundation; either version
 209  *              2 of the License, or(at your option) any later version.
 210  *
 211  * Description of States:
 212  *
 213  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 214  *
 215  *      TCP_SYN_RECV            received a connection request, sent ack,
 216  *                              waiting for final ack in three-way handshake.
 217  *
 218  *      TCP_ESTABLISHED         connection established
 219  *
 220  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 221  *                              transmission of remaining buffered data
 222  *
 223  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 224  *                              to shutdown
 225  *
 226  *      TCP_CLOSING             both sides have shutdown but we still have
 227  *                              data we have to finish sending
 228  *
 229  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 230  *                              closed, can only be entered from FIN_WAIT2
 231  *                              or CLOSING.  Required because the other end
 232  *                              may not have gotten our last ACK causing it
 233  *                              to retransmit the data packet (which we ignore)
 234  *
 235  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 236  *                              us to finish writing our data and to shutdown
 237  *                              (we have to close() to move on to LAST_ACK)
 238  *
  239  *      TCP_LAST_ACK            our side has shutdown after remote has
 240  *                              shutdown.  There may still be data in our
 241  *                              buffer that we have to finish sending
 242  *
 243  *      TCP_CLOSE               socket is finished
 244  */
 245
 246 /*
 247  * RFC1122 status:
 248  * NOTE: I'm not going to be doing comments in the code for this one except
 249  * for violations and the like.  tcp.c is just too big... If I say something
 250  * "does?" or "doesn't?", it means I'm not sure, and will have to hash it out
 251  * with Alan. -- MS 950903
  252  * [Note: Most of the TCP code has been rewritten/redesigned since this
 253  *  RFC1122 check. It is probably not correct anymore. It should be redone
 254  *  before 2.2. -AK]
 255  *
 256  * Use of PSH (4.2.2.2)
 257  *   MAY aggregate data sent without the PSH flag. (does)
 258  *   MAY queue data received without the PSH flag. (does)
 259  *   SHOULD collapse successive PSH flags when it packetizes data. (doesn't)
 260  *   MAY implement PSH on send calls. (doesn't, thus:)
 261  *     MUST NOT buffer data indefinitely (doesn't [1 second])
 262  *     MUST set PSH on last segment (does)
 263  *   MAY pass received PSH to application layer (doesn't)
 264  *   SHOULD send maximum-sized segment whenever possible. (almost always does)
 265  *
 266  * Window Size (4.2.2.3, 4.2.2.16)
 267  *   MUST treat window size as an unsigned number (does)
 268  *   SHOULD treat window size as a 32-bit number (does not)
 269  *   MUST NOT shrink window once it is offered (does not normally)
 270  *
 271  * Urgent Pointer (4.2.2.4)
 272  * **MUST point urgent pointer to last byte of urgent data (not right
 273  *     after). (doesn't, to be like BSD. That's configurable, but defaults
 274  *      to off)
 275  *   MUST inform application layer asynchronously of incoming urgent
 276  *     data. (does)
 277  *   MUST provide application with means of determining the amount of
 278  *     urgent data pending. (does)
 279  * **MUST support urgent data sequence of arbitrary length. (doesn't, but
 280  *   it's sort of tricky to fix, as urg_ptr is a 16-bit quantity)
 281  *      [Follows BSD 1 byte of urgent data]
 282  *
 283  * TCP Options (4.2.2.5)
 284  *   MUST be able to receive TCP options in any segment. (does)
 285  *   MUST ignore unsupported options (does)
 286  *
 287  * Maximum Segment Size Option (4.2.2.6)
 288  *   MUST implement both sending and receiving MSS. (does, but currently
 289  *      only uses the smaller of both of them)
 290  *   SHOULD send an MSS with every SYN where receive MSS != 536 (MAY send
 291  *     it always). (does, even when MSS == 536, which is legal)
 292  *   MUST assume MSS == 536 if no MSS received at connection setup (does)
 293  *   MUST calculate "effective send MSS" correctly:
 294  *     min(physical_MTU, remote_MSS+20) - sizeof(tcphdr) - sizeof(ipopts)
 295  *     (does - but allows operator override)
 296  *
 297  * TCP Checksum (4.2.2.7)
 298  *   MUST generate and check TCP checksum. (does)
 299  *
 300  * Initial Sequence Number Selection (4.2.2.8)
 301  *   MUST use the RFC 793 clock selection mechanism.  (doesn't, but it's
 302  *     OK: RFC 793 specifies a 250KHz clock, while we use 1MHz, which is
 303  *     necessary for 10Mbps networks - and harder than BSD to spoof!
 304  *     With syncookies we don't)
 305  *
 306  * Simultaneous Open Attempts (4.2.2.10)
 307  *   MUST support simultaneous open attempts (does)
 308  *
 309  * Recovery from Old Duplicate SYN (4.2.2.11)
 310  *   MUST keep track of active vs. passive open (does)
 311  *
 312  * RST segment (4.2.2.12)
 313  *   SHOULD allow an RST segment to contain data (does, but doesn't do
 314  *     anything with it, which is standard)
 315  *
 316  * Closing a Connection (4.2.2.13)
 317  *   MUST inform application of whether connection was closed by RST or
 318  *     normal close. (does)
 319  *   MAY allow "half-duplex" close (treat connection as closed for the
 320  *     local app, even before handshake is done). (does)
 321  *   MUST linger in TIME_WAIT for 2 * MSL (does)
 322  *
 323  * Retransmission Timeout (4.2.2.15)
 324  *   MUST implement Jacobson's slow start and congestion avoidance
 325  *     stuff. (does)
 326  *
 327  * Probing Zero Windows (4.2.2.17)
 328  *   MUST support probing of zero windows. (does)
 329  *   MAY keep offered window closed indefinitely. (does)
 330  *   MUST allow remote window to stay closed indefinitely. (does)
 331  *
 332  * Passive Open Calls (4.2.2.18)
 333  *   MUST NOT let new passive open affect other connections. (doesn't)
 334  *   MUST support passive opens (LISTENs) concurrently. (does)
 335  *
 336  * Time to Live (4.2.2.19)
 337  *   MUST make TCP TTL configurable. (does - IP_TTL option)
 338  *
 339  * Event Processing (4.2.2.20)
 340  *   SHOULD queue out-of-order segments. (does)
 341  *   MUST aggregate ACK segments whenever possible. (does but badly)
 342  *
 343  * Retransmission Timeout Calculation (4.2.3.1)
 344  *   MUST implement Karn's algorithm and Jacobson's algorithm for RTO
 345  *     calculation. (does, or at least explains them in the comments 8*b)
  346  *  SHOULD initialize RTT to 0 and RTO to 3 seconds. (does)
 347  *
 348  * When to Send an ACK Segment (4.2.3.2)
 349  *   SHOULD implement delayed ACK. (does)
 350  *   MUST keep ACK delay < 0.5 sec. (does)
 351  *
 352  * When to Send a Window Update (4.2.3.3)
 353  *   MUST implement receiver-side SWS. (does)
 354  *
 355  * When to Send Data (4.2.3.4)
 356  *   MUST implement sender-side SWS. (does)
 357  *   SHOULD implement Nagle algorithm. (does)
 358  *
 359  * TCP Connection Failures (4.2.3.5)
 360  *  MUST handle excessive retransmissions "properly" (see the RFC). (does)
 361  *   SHOULD inform application layer of soft errors. (does)
 362  *
 363  * TCP Keep-Alives (4.2.3.6)
 364  *   MAY provide keep-alives. (does)
 365  *   MUST make keep-alives configurable on a per-connection basis. (does)
 366  *   MUST default to no keep-alives. (does)
 367  *   MUST make keep-alive interval configurable. (does)
  368  *   MUST make default keep-alive interval >= 2 hours. (does)
 369  *   MUST NOT interpret failure to ACK keep-alive packet as dead
 370  *     connection. (doesn't)
 371  *   SHOULD send keep-alive with no data. (does)
 372  *
 373  * TCP Multihoming (4.2.3.7)
 374  *   MUST get source address from IP layer before sending first
 375  *     SYN. (does)
 376  *   MUST use same local address for all segments of a connection. (does)
 377  *
 378  * IP Options (4.2.3.8)
 379  *   MUST ignore unsupported IP options. (does)
 380  *   MAY support Time Stamp and Record Route. (does)
 381  *   MUST allow application to specify a source route. (does)
 382  *   MUST allow received Source Route option to set route for all future
 383  *     segments on this connection. (does not (security issues))
 384  *
 385  * ICMP messages (4.2.3.9)
 386  *   MUST act on ICMP errors. (does)
 387  *   MUST slow transmission upon receipt of a Source Quench. (doesn't anymore
 388  *   because that is deprecated now by the IETF, can be turned on)
 389  *   MUST NOT abort connection upon receipt of soft Destination
 390  *     Unreachables (0, 1, 5), Time Exceededs and Parameter
 391  *     Problems. (doesn't)
 392  *   SHOULD report soft Destination Unreachables etc. to the
 393  *     application. (does, except during SYN_RECV and may drop messages
 394  *     in some rare cases before accept() - ICMP is unreliable)
 395  *   SHOULD abort connection upon receipt of hard Destination Unreachable
 396  *     messages (2, 3, 4). (does, but see above)
 397  *
 398  * Remote Address Validation (4.2.3.10)
 399  *   MUST reject as an error OPEN for invalid remote IP address. (does)
 400  *   MUST ignore SYN with invalid source address. (does)
 401  *   MUST silently discard incoming SYN for broadcast/multicast
 402  *     address. (does)
 403  *
 404  * Asynchronous Reports (4.2.4.1)
 405  * MUST provide mechanism for reporting soft errors to application
 406  *     layer. (does)
 407  *
 408  * Type of Service (4.2.4.2)
 409  *   MUST allow application layer to set Type of Service. (does IP_TOS)
 410  *
 411  * (Whew. -- MS 950903)
 412  * (Updated by AK, but not complete yet.)
 413  **/
 414
 415 #include <linux/types.h>
 416 #include <linux/fcntl.h>
 417 #include <linux/poll.h>
 418 #include <linux/init.h>
 419 #include <linux/smp_lock.h>
 420
 421 #include <net/icmp.h>
 422 #include <net/tcp.h>
 423
 424 #include <asm/uaccess.h>
 425
  /* FIN timeout setting, initialised to TCP_FIN_TIMEOUT.  The sysctl_ prefix
   * suggests it is adjustable at run time; the registration is not visible
   * in this part of the file -- TODO confirm. */
  426 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
  427
  /* File-wide instance of struct tcp_mib (TCP statistics counters). */
  428 struct tcp_mib  tcp_statistics;
  429
  /* Pointers to kmem caches.  Going by the names these back open_request,
   * bind-bucket and time-wait objects respectively; they are neither created
   * nor used in this part of the file -- verify at the allocation sites. */
  430 kmem_cache_t *tcp_openreq_cachep;
  431 kmem_cache_t *tcp_bucket_cachep;
  432 kmem_cache_t *tcp_timewait_cachep;
 433
 434 /*
 435  *      Find someone to 'accept'. Must be called with
 436  *      the listening socket locked.
 437  */
 438
 439 static struct open_request *tcp_find_established(struct tcp_opt *tp,
 440                                                  struct open_request **prevp)
 441 {
 442         struct open_request *req = tp->syn_wait_queue;
 443         struct open_request *prev = (struct open_request *)&tp->syn_wait_queue;
 444         while(req) {
 445                 if (req->sk) {
 446                         if((1 << req->sk->state) &
 447                            ~(TCPF_SYN_SENT|TCPF_SYN_RECV))
 448                                 break;
 449                 }
 450                 prev = req;
 451                 req = req->dl_next;
 452         }
 453         *prevp = prev;
 454         return req;
 455 }
 456
 457 /*
 458  *      Walk down the receive queue counting readable data.
 459  *
 460  *      Must be called with the socket lock held.
 461  */
 462
 463 static int tcp_readable(struct sock *sk)
 464 {
 465         unsigned long counted;
 466         unsigned long amount;
 467         struct sk_buff *skb;
 468         int sum;
 469
 470         SOCK_DEBUG(sk, "tcp_readable: %p - ",sk);
 471
 472         skb = skb_peek(&sk->receive_queue);
 473         if (skb == NULL) {
 474                 SOCK_DEBUG(sk, "empty\n");
 475                 return(0);
 476         }
 477
 478         counted = sk->tp_pinfo.af_tcp.copied_seq;       /* Where we are at the moment */
 479         amount = 0;
 480
 481         /* Do until a push or until we are out of data. */
 482         do {
 483                 /* Found a hole so stops here. */
 484                 if (before(counted, TCP_SKB_CB(skb)->seq))      /* should not happen */
 485                         break;
 486
 487                 /* Length - header but start from where we are up to
 488                  * avoid overlaps.
 489                  */
 490                 sum = skb->len - (counted - TCP_SKB_CB(skb)->seq);
 491                 if (sum >= 0) {
 492                         /* Add it up, move on. */
 493                         amount += sum;
 494                         counted += sum;
 495                         if (skb->h.th->syn)
 496                                 counted++;
 497                 }
 498
 499                 /* Don't count urg data ... but do it in the right place!
 500                  * Consider: "old_data (ptr is here) URG PUSH data"
 501                  * The old code would stop at the first push because
 502                  * it counted the urg (amount==1) and then does amount--
 503                  * *after* the loop.  This means tcp_readable() always
 504                  * returned zero if any URG PUSH was in the queue, even
 505                  * though there was normal data available. If we subtract
 506                  * the urg data right here, we even get it to work for more
 507                  * than one URG PUSH skb without normal data.
 508                  * This means that poll() finally works now with urg data
 509                  * in the queue.  Note that rlogin was never affected
 510                  * because it doesn't use poll(); it uses two processes
 511                  * and a blocking read().  And the queue scan in tcp_read()
 512                  * was correct.  Mike <pall@rz.uni-karlsruhe.de>
 513                  */
 514
 515                 /* Don't count urg data. */
 516                 if (skb->h.th->urg)
 517                         amount--;
 518 #if 0
 519                 if (amount && skb->h.th->psh) break;
 520 #endif
 521                 skb = skb->next;
 522         } while(skb != (struct sk_buff *)&sk->receive_queue);
 523
 524         SOCK_DEBUG(sk, "got %lu bytes.\n",amount);
 525         return(amount);
 526 }
 527
 528 /*
 529  * LISTEN is a special case for poll..
 530  */
 531 static unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
 532 {
 533         struct open_request *req, *dummy;
 534
 535         lock_sock(sk);
 536         req = tcp_find_established(&sk->tp_pinfo.af_tcp, &dummy);
 537         release_sock(sk);
 538         if (req)
 539                 return POLLIN | POLLRDNORM;
 540         return 0;
 541 }
 542
  543 /*
  544  *      Compute minimal free write space needed to queue new packets.
       *      Evaluates to half of the value read from the socket's
       *      wmem_alloc atomic counter.  tcp_poll() and tcp_write_space()
       *      compare sock_wspace() against this threshold.  The argument
       *      is expanded exactly once.
  545  */
  546 #define tcp_min_write_space(__sk) \
  547         (atomic_read(&(__sk)->wmem_alloc) / 2)
 548
 549 /*
 550  *      Wait for a TCP event.
 551  *
 552  *      Note that we don't need to lock the socket, as the upper poll layers
 553  *      take care of normal races (between the test and the event) and we don't
 554  *      go look at any of the socket buffers directly.
 555  */
 556 unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
 557 {
 558         unsigned int mask;
 559         struct sock *sk = sock->sk;
 560         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 561
 562         poll_wait(file, sk->sleep, wait);
 563         if (sk->state == TCP_LISTEN)
 564                 return tcp_listen_poll(sk, wait);
 565
 566         mask = 0;
 567         if (sk->err)
 568                 mask = POLLERR;
 569
 570         /*
 571          * POLLHUP is certainly not done right. But poll() doesn't
 572          * have a notion of HUP in just one direction, and for a
 573          * socket the read side is more interesting.
 574          *
 575          * Some poll() documentation says that POLLHUP is incompatible
 576          * with the POLLOUT/POLLWR flags, so somebody should check this
 577          * all. But careful, it tends to be safer to return too many
 578          * bits than too few, and you can easily break real applications
 579          * if you don't tell them that something has hung up!
 580          *
 581          * Check-me.
 582          */
 583         if (sk->shutdown & RCV_SHUTDOWN)
 584                 mask |= POLLHUP;
 585
 586         /* Connected? */
 587         if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
 588                 if ((tp->rcv_nxt != tp->copied_seq) &&
 589                     (tp->urg_seq != tp->copied_seq ||
 590                      tp->rcv_nxt != tp->copied_seq+1 ||
 591                      sk->urginline || !tp->urg_data))
 592                         mask |= POLLIN | POLLRDNORM;
 593
 594                 if (!(sk->shutdown & SEND_SHUTDOWN)) {
 595                         if (sock_wspace(sk) >= tcp_min_write_space(sk)) {
 596                                 mask |= POLLOUT | POLLWRNORM;
 597                         } else {  /* send SIGIO later */
 598                                 sk->socket->flags |= SO_NOSPACE;
 599                         }
 600                 }
 601
 602                 if (tp->urg_data & URG_VALID)
 603                         mask |= POLLPRI;
 604         }
 605         return mask;
 606 }
 607
 608 /*
 609  *      Socket write_space callback.
 610  *      This (or rather the sock_wake_async) should agree with poll.
 611  */
 612 void tcp_write_space(struct sock *sk)
 613 {
 614         if (sk->dead)
 615                 return;
 616
 617         wake_up_interruptible(sk->sleep);
 618         if (sock_wspace(sk) >=
 619             tcp_min_write_space(sk))
 620                 sock_wake_async(sk->socket, 2);
 621 }
 622
 623
 624 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 625 {
 626         int answ;
 627
 628         switch(cmd) {
 629         case TIOCINQ:
 630 #ifdef FIXME    /* FIXME: */
 631         case FIONREAD:
 632 #endif
 633                 if (sk->state == TCP_LISTEN)
 634                         return(-EINVAL);
 635                 lock_sock(sk);
 636                 answ = tcp_readable(sk);
 637                 release_sock(sk);
 638                 break;
 639         case SIOCATMARK:
 640                 {
 641                         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 642                         answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
 643                         break;
 644                 }
 645         case TIOCOUTQ:
 646                 if (sk->state == TCP_LISTEN)
 647                         return(-EINVAL);
 648                 answ = sock_wspace(sk);
 649                 break;
 650         default:
 651                 return(-ENOIOCTLCMD);
 652         };
 653
 654         return put_user(answ, (int *)arg);
 655 }
 656
 657 /*
 658  *      Wait for a socket to get into the connected state
 659  *
 660  *      Note: Must be called with the socket locked, and it
 661  *            runs with the kernel fully unlocked.
 662  */
 663 static int wait_for_tcp_connect(struct sock * sk, int flags)
 664 {
 665         struct task_struct *tsk = current;
 666         DECLARE_WAITQUEUE(wait, tsk);
 667
 668         while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
 669                 if(sk->err)
 670                         return sock_error(sk);
 671                 if((1 << sk->state) &
 672                    ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 673                         if(sk->keepopen && !(flags&MSG_NOSIGNAL))
 674                                 send_sig(SIGPIPE, tsk, 0);
 675                         return -EPIPE;
 676                 }
 677                 if(flags & MSG_DONTWAIT)
 678                         return -EAGAIN;
 679                 if(signal_pending(tsk))
 680                         return -ERESTARTSYS;
 681
 682                 tsk->state = TASK_INTERRUPTIBLE;
 683                 add_wait_queue(sk->sleep, &wait);
 684                 release_sock(sk);
 685
 686                 if (((1 << sk->state) & ~(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT)) &&
 687                     sk->err == 0)
 688                         schedule();
 689
 690                 tsk->state = TASK_RUNNING;
 691                 remove_wait_queue(sk->sleep, &wait);
 692                 lock_sock(sk);
 693         }
 694         return 0;
 695 }
 696
 697 static inline int tcp_memory_free(struct sock *sk)
 698 {
 699         return atomic_read(&sk->wmem_alloc) < sk->sndbuf;
 700 }
 701
 702 /*
 703  *      Wait for more memory for a socket
 704  *
 705  * NOTE: This runs with the kernel fully unlocked.
 706  */
 707 static void wait_for_tcp_memory(struct sock * sk)
 708 {
 709         release_sock(sk);
 710         if (!tcp_memory_free(sk)) {
 711                 DECLARE_WAITQUEUE(wait, current);
 712
 713                 sk->socket->flags &= ~SO_NOSPACE;
 714                 add_wait_queue(sk->sleep, &wait);
 715                 for (;;) {
 716                         if (signal_pending(current))
 717                                 break;
 718                         current->state = TASK_INTERRUPTIBLE;
 719                         if (tcp_memory_free(sk))
 720                                 break;
 721                         if (sk->shutdown & SEND_SHUTDOWN)
 722                                 break;
 723                         if (sk->err)
 724                                 break;
 725                         schedule();
 726                 }
 727                 current->state = TASK_RUNNING;
 728                 remove_wait_queue(sk->sleep, &wait);
 729         }
 730         lock_sock(sk);
 731 }
 732
 733 /* When all user supplied data has been queued set the PSH bit */
 734 #define PSH_NEEDED (seglen == 0 && iovlen == 0)
 735
 736 /*
 737  *      This routine copies from a user buffer into a socket,
 738  *      and starts the transmit system.
 739  *
 740  *      Note: must be called with the socket locked.
 741  */
 742
 743 int tcp_do_sendmsg(struct sock *sk, struct msghdr *msg)
 744 {
 745         struct iovec *iov;
 746         struct tcp_opt *tp;
 747         struct sk_buff *skb;
 748         int iovlen, flags;
 749         int mss_now;
 750         int err, copied;
 751
 752         lock_sock(sk);
 753
 754         err = 0;
 755         tp = &(sk->tp_pinfo.af_tcp);
 756
 757         /* Wait for a connection to finish. */
 758         flags = msg->msg_flags;
 759         if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 760                 if((err = wait_for_tcp_connect(sk, flags)) != 0)
 761                         goto out;
 762
 763         /* This should be in poll */
 764         sk->socket->flags &= ~SO_NOSPACE; /* clear SIGIO XXX */
 765
 766         mss_now = tcp_current_mss(sk);
 767
 768         /* Ok commence sending. */
 769         iovlen = msg->msg_iovlen;
 770         iov = msg->msg_iov;
 771         copied = 0;
 772
 773         while(--iovlen >= 0) {
 774                 int seglen=iov->iov_len;
 775                 unsigned char * from=iov->iov_base;
 776
 777                 iov++;
 778
 779                 while(seglen > 0) {
 780                         int copy, tmp, queue_it, psh;
 781
 782                         if (err)
 783                                 goto do_fault2;
 784
 785                         /* Stop on errors. */
 786                         if (sk->err)
 787                                 goto do_sock_err;
 788
 789                         /* Make sure that we are established. */
 790                         if (sk->shutdown & SEND_SHUTDOWN)
 791                                 goto do_shutdown;
 792
 793                         /* Now we need to check if we have a half
 794                          * built packet we can tack some data onto.
 795                          */
 796                         if (tp->send_head && !(flags & MSG_OOB)) {
 797                                 skb = sk->write_queue.prev;
 798                                 copy = skb->len;
 799                                 /* If the remote does SWS avoidance we should
 800                                  * queue the best we can if not we should in
 801                                  * fact send multiple packets...
 802                                  * A method for detecting this would be most
 803                                  * welcome.
 804                                  */
 805                                 if (skb_tailroom(skb) > 0 &&
 806                                     (mss_now - copy) > 0 &&
 807                                     tp->snd_nxt < TCP_SKB_CB(skb)->end_seq) {
 808                                         int last_byte_was_odd = (copy % 4);
 809
 810                                         copy = mss_now - copy;
 811                                         if(copy > skb_tailroom(skb))
 812                                                 copy = skb_tailroom(skb);
 813                                         if(copy > seglen)
 814                                                 copy = seglen;
 815                                         if(last_byte_was_odd) {
 816                                                 if(copy_from_user(skb_put(skb, copy),
 817                                                                   from, copy))
 818                                                         err = -EFAULT;
 819                                                 skb->csum = csum_partial(skb->data,
 820                                                                          skb->len, 0);
 821                                         } else {
 822                                                 skb->csum =
 823                                                         csum_and_copy_from_user(
 824                                                         from, skb_put(skb, copy),
 825                                                         copy, skb->csum, &err);
 826                                         }
 827                                         /*
 828                                          * FIXME: the *_user functions should
 829                                          *        return how much data was
 830                                          *        copied before the fault
 831                                          *        occurred and then a partial
 832                                          *        packet with this data should
 833                                          *        be sent.  Unfortunately
 834                                          *        csum_and_copy_from_user doesn't
 835                                          *        return this information.
 836                                          *        ATM it might send partly zeroed
 837                                          *        data in this case.
 838                                          */
 839                                         tp->write_seq += copy;
 840                                         TCP_SKB_CB(skb)->end_seq += copy;
 841                                         from += copy;
 842                                         copied += copy;
 843                                         seglen -= copy;
 844                                         if (PSH_NEEDED)
 845                                                 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 846                                         continue;
 847                                 }
 848                         }
 849
 850                         /* We also need to worry about the window.  If
 851                          * window < 1/2 the maximum window we've seen
 852                          * from this host, don't use it.  This is
 853                          * sender side silly window prevention, as
 854                          * specified in RFC1122.  (Note that this is
 855                          * different than earlier versions of SWS
 856                          * prevention, e.g. RFC813.).  What we
 857                          * actually do is use the whole MSS.  Since
 858                          * the results in the right edge of the packet
 859                          * being outside the window, it will be queued
 860                          * for later rather than sent.
 861                          */
 862                         psh = 0;
 863                         copy = tp->snd_wnd - (tp->snd_nxt - tp->snd_una);
 864                         if(copy > (tp->max_window >> 1)) {
 865                                 copy = min(copy, mss_now);
 866                                 psh = 1;
 867                         } else {
 868                                 copy = mss_now;
 869                         }
 870                         if(copy > seglen)
 871                                 copy = seglen;
 872
 873                         /* Determine how large of a buffer to allocate.  */
 874                         tmp = MAX_HEADER + sk->prot->max_header;
 875                         if (copy < min(mss_now, tp->max_window >> 1) &&
 876                             !(flags & MSG_OOB)) {
 877                                 tmp += min(mss_now, tp->max_window);
 878
 879                                 /* What is happening here is that we want to
 880                                  * tack on later members of the users iovec
 881                                  * if possible into a single frame.  When we
 882                                  * leave this loop our caller checks to see if
 883                                  * we can send queued frames onto the wire.
 884                                  * See tcp_v[46]_sendmsg() for this.
 885                                  */
 886                                 queue_it = 1;
 887                         } else {
 888                                 tmp += copy;
 889                                 queue_it = 0;
 890                         }
 891                         skb = sock_wmalloc(sk, tmp, 0, GFP_KERNEL);
 892
 893                         /* If we didn't get any memory, we need to sleep. */
 894                         if (skb == NULL) {
 895                                 sk->socket->flags |= SO_NOSPACE;
 896                                 if (flags&MSG_DONTWAIT) {
 897                                         err = -EAGAIN;
 898                                         goto do_interrupted;
 899                                 }
 900                                 if (signal_pending(current)) {
 901                                         err = -ERESTARTSYS;
 902                                         goto do_interrupted;
 903                                 }
 904                                 tcp_push_pending_frames(sk, tp);
 905                                 wait_for_tcp_memory(sk);
 906
 907                                 /* If SACK's were formed or PMTU events happened,
 908                                  * we must find out about it.
 909                                  */
 910                                 mss_now = tcp_current_mss(sk);
 911                                 continue;
 912                         }
 913
 914                         seglen -= copy;
 915
 916                         /* Prepare control bits for TCP header creation engine. */
 917                         TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK |
 918                                                   ((PSH_NEEDED || psh) ?
 919                                                    TCPCB_FLAG_PSH : 0));
 920                         TCP_SKB_CB(skb)->sacked = 0;
 921                         if (flags & MSG_OOB) {
 922                                 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_URG;
 923                                 TCP_SKB_CB(skb)->urg_ptr = copy;
 924                         } else
 925                                 TCP_SKB_CB(skb)->urg_ptr = 0;
 926
 927                         /* TCP data bytes are SKB_PUT() on top, later
 928                          * TCP+IP+DEV headers are SKB_PUSH()'d beneath.
 929                          * Reserve header space and checksum the data.
 930                          */
 931                         skb_reserve(skb, MAX_HEADER + sk->prot->max_header);
 932                         skb->csum = csum_and_copy_from_user(from,
 933                                         skb_put(skb, copy), copy, 0, &err);
 934
 935                         if (err)
 936                                 goto do_fault;
 937
 938                         from += copy;
 939                         copied += copy;
 940
 941                         TCP_SKB_CB(skb)->seq = tp->write_seq;
 942                         TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + copy;
 943
 944                         /* This advances tp->write_seq for us. */
 945                         tcp_send_skb(sk, skb, queue_it);
 946                 }
 947         }
 948         sk->err = 0;
 949         err = copied;
 950         goto out;
 951
 952 do_sock_err:
 953         if(copied)
 954                 err = copied;
 955         else
 956                 err = sock_error(sk);
 957         goto out;
 958 do_shutdown:
 959         if(copied)
 960                 err = copied;
 961         else {
 962                 if (!(flags&MSG_NOSIGNAL))
 963                         send_sig(SIGPIPE, current, 0);
 964                 err = -EPIPE;
 965         }
 966         goto out;
 967 do_interrupted:
 968         if(copied)
 969                 err = copied;
 970         goto out;
 971 do_fault:
 972         kfree_skb(skb);
 973 do_fault2:
 974         err = -EFAULT;
 975 out:
 976         tcp_push_pending_frames(sk, tp);
 977         release_sock(sk);
 978         return err;
 979 }
 980
 981 #undef PSH_NEEDED
 982
 983 /*
 984  *      Send an ack if one is backlogged at this point. Ought to merge
 985  *      this with tcp_send_ack().
 986  *      This is called for delayed acks also.
 987  */
 988
 989 void tcp_read_wakeup(struct sock *sk)
 990 {
 991         /* If we're closed, don't send an ack, or we'll get a RST
 992          * from the closed destination.
 993          */
 994         if (sk->state != TCP_CLOSE)
 995                 tcp_send_ack(sk);
 996 }
 997
 998 /*
 999  *      Handle reading urgent data. BSD has very simple semantics for
1000  *      this, no blocking and very strange errors 8)
1001  */
1002
1003 static int tcp_recv_urg(struct sock * sk, int nonblock,
1004                         struct msghdr *msg, int len, int flags,
1005                         int *addr_len)
1006 {
1007         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1008
1009         /* No URG data to read. */
1010         if (sk->urginline || !tp->urg_data || tp->urg_data == URG_READ)
1011                 return -EINVAL; /* Yes this is right ! */
1012
1013         if (sk->err)
1014                 return sock_error(sk);
1015
1016         if (sk->done)
1017                 return -ENOTCONN;
1018
1019         if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN)) {
1020                 sk->done = 1;
1021                 return 0;
1022         }
1023
1024         lock_sock(sk);
1025         if (tp->urg_data & URG_VALID) {
1026                 int err = 0;
1027                 char c = tp->urg_data;
1028
1029                 if (!(flags & MSG_PEEK))
1030                         tp->urg_data = URG_READ;
1031
1032                 if(msg->msg_name)
1033                         tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
1034                                                        msg->msg_name);
1035
1036                 if(addr_len)
1037                         *addr_len = tp->af_specific->sockaddr_len;
1038
1039                 /* Read urgent data. */
1040                 msg->msg_flags|=MSG_OOB;
1041                 release_sock(sk);
1042
1043                 if(len>0)
1044                 {
1045                         err = memcpy_toiovec(msg->msg_iov, &c, 1);
1046                         /* N.B. already set above ... */
1047                         msg->msg_flags|=MSG_OOB;
1048                 }
1049                 else
1050                         msg->msg_flags|=MSG_TRUNC;
1051
1052                 /* N.B. Is this right?? If len == 0 we didn't read any data */
1053                 return err ? -EFAULT : 1;
1054         }
1055         release_sock(sk);
1056
1057         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1058          * the available implementations agree in this case:
1059          * this call should never block, independent of the
1060          * blocking state of the socket.
1061          * Mike <pall@rz.uni-karlsruhe.de>
1062          */
1063         return -EAGAIN;
1064 }
1065
1066 /*
1067  *      Release a skb if it is no longer needed. This routine
1068  *      must be called with interrupts disabled or with the
1069  *      socket locked so that the sk_buff queue operation is ok.
1070  */
1071
1072 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
1073 {
1074         __skb_unlink(skb, &sk->receive_queue);
1075         kfree_skb(skb);
1076 }
1077
1078 /* Clean up the receive buffer for full frames taken by the user,
1079  * then send an ACK if necessary.  COPIED is the number of bytes
1080  * tcp_recvmsg has given to the user so far, it speeds up the
1081  * calculation of whether or not we must ACK for the sake of
1082  * a window update.
1083  */
1084 static void cleanup_rbuf(struct sock *sk, int copied)
1085 {
1086         struct sk_buff *skb;
1087
1088         /* NOTE! The socket must be locked, so that we don't get
1089          * a messed-up receive queue.
1090          */
1091         while ((skb=skb_peek(&sk->receive_queue)) != NULL) {
1092                 if (!skb->used || atomic_read(&skb->users) > 1)
1093                         break;
1094                 tcp_eat_skb(sk, skb);
1095         }
1096
1097         /* We send an ACK if we can now advertise a non-zero window
1098          * which has been raised "significantly".
1099          */
1100         if(copied > 0) {
1101                 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1102                 __u32 rcv_window_now = tcp_receive_window(tp);
1103                 __u32 new_window = __tcp_select_window(sk);
1104
1105                 /* We won't be raising the window any further than
1106                  * the window-clamp allows.  Our window selection
1107                  * also keeps things a nice multiple of MSS.  These
1108                  * checks are necessary to prevent spurious ACKs
1109                  * which don't advertize a larger window.
1110                  */
1111                 if((new_window && (new_window >= rcv_window_now * 2)) &&
1112                    ((rcv_window_now + tp->mss_cache) <= tp->window_clamp))
1113                         tcp_read_wakeup(sk);
1114         }
1115 }
1116
1117
1118 /*
1119  *      This routine copies from a sock struct into the user buffer.
1120  */
1121
1122 int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
1123                 int len, int nonblock, int flags, int *addr_len)
1124 {
1125         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1126         DECLARE_WAITQUEUE(wait, current);
1127         int copied = 0;
1128         u32 peek_seq;
1129         volatile u32 *seq;      /* So gcc doesn't overoptimise */
1130         unsigned long used;
1131         int err = 0;
1132         int target = 1;         /* Read at least this many bytes */
1133
1134         if (sk->err)
1135                 return sock_error(sk);
1136
1137         if (sk->state == TCP_LISTEN)
1138                 return -ENOTCONN;
1139
1140         /* Urgent data needs to be handled specially. */
1141         if (flags & MSG_OOB)
1142                 return tcp_recv_urg(sk, nonblock, msg, len, flags, addr_len);
1143
1144         /*      Copying sequence to update. This is volatile to handle
1145          *      the multi-reader case neatly (memcpy_to/fromfs might be
1146          *      inline and thus not flush cached variables otherwise).
1147          */
1148         peek_seq = tp->copied_seq;
1149         seq = &tp->copied_seq;
1150         if (flags & MSG_PEEK)
1151                 seq = &peek_seq;
1152
1153         /* Handle the POSIX bogosity MSG_WAITALL. */
1154         if (flags & MSG_WAITALL)
1155                 target=len;
1156
1157         add_wait_queue(sk->sleep, &wait);
1158         lock_sock(sk);
1159
1160         /*
1161          *      BUG BUG BUG
1162          *      This violates 1003.1g compliance. We must wait for
1163          *      data to exist even if we read none!
1164          */
1165
1166         while (len > 0) {
1167                 struct sk_buff * skb;
1168                 u32 offset;
1169
1170                 /* Are we at urgent data? Stop if we have read anything. */
1171                 if (copied && tp->urg_data && tp->urg_seq == *seq)
1172                         break;
1173
1174                 /* We need to check signals first, to get correct SIGURG
1175                  * handling. FIXME: Need to check this doesnt impact 1003.1g
1176                  * and move it down to the bottom of the loop
1177                  */
1178                 if (signal_pending(current)) {
1179                         if (copied)
1180                                 break;
1181                         copied = -ERESTARTSYS;
1182                         if (nonblock)
1183                                 copied = -EAGAIN;
1184                         break;
1185                 }
1186
1187                 /* Next get a buffer. */
1188                 current->state = TASK_INTERRUPTIBLE;
1189
1190                 skb = skb_peek(&sk->receive_queue);
1191                 do {
1192                         if (!skb)
1193                                 break;
1194
1195                         /* Now that we have two receive queues this
1196                          * shouldn't happen.
1197                          */
1198                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1199                                 printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
1200                                        *seq, TCP_SKB_CB(skb)->seq);
1201                                 break;
1202                         }
1203                         offset = *seq - TCP_SKB_CB(skb)->seq;
1204                         if (skb->h.th->syn)
1205                                 offset--;
1206                         if (offset < skb->len)
1207                                 goto found_ok_skb;
1208                         if (skb->h.th->fin)
1209                                 goto found_fin_ok;
1210                         if (!(flags & MSG_PEEK))
1211                                 skb->used = 1;
1212                         skb = skb->next;
1213                 } while (skb != (struct sk_buff *)&sk->receive_queue);
1214
1215                 if (copied >= target)
1216                         break;
1217
1218                 /*
1219                    These three lines and clause if (sk->state == TCP_CLOSE)
1220                    are unlikely to be correct, if target > 1.
1221                    I DO NOT FIX IT, because I have no idea, what
1222                    POSIX prescribes to make here. Probably, it really
1223                    wants to lose data 8), if not all target is received.
1224                                                                  --ANK
1225                  */
1226                 if (sk->err && !(flags&MSG_PEEK)) {
1227                         copied = sock_error(sk);
1228                         break;
1229                 }
1230
1231                 if (sk->shutdown & RCV_SHUTDOWN) {
1232                         sk->done = 1;
1233                         break;
1234                 }
1235
1236                 if (sk->state == TCP_CLOSE) {
1237                         if (!sk->done) {
1238                                 sk->done = 1;
1239                                 break;
1240                         }
1241                         copied = -ENOTCONN;
1242                         break;
1243                 }
1244
1245                 if (nonblock) {
1246                         copied = -EAGAIN;
1247                         break;
1248                 }
1249
1250                 cleanup_rbuf(sk, copied);
1251                 release_sock(sk);
1252                 sk->socket->flags |= SO_WAITDATA;
1253                 schedule();
1254                 sk->socket->flags &= ~SO_WAITDATA;
1255                 lock_sock(sk);
1256                 continue;
1257
1258         found_ok_skb:
1259                 /*      Lock the buffer. We can be fairly relaxed as
1260                  *      an interrupt will never steal a buffer we are
1261                  *      using unless I've missed something serious in
1262                  *      tcp_data.
1263                  */
1264                 atomic_inc(&skb->users);
1265
1266                 /* Ok so how much can we use? */
1267                 used = skb->len - offset;
1268                 if (len < used)
1269                         used = len;
1270
1271                 /* Do we have urgent data here? */
1272                 if (tp->urg_data) {
1273                         u32 urg_offset = tp->urg_seq - *seq;
1274                         if (urg_offset < used) {
1275                                 if (!urg_offset) {
1276                                         if (!sk->urginline) {
1277                                                 ++*seq;
1278                                                 offset++;
1279                                                 used--;
1280                                         }
1281                                 } else
1282                                         used = urg_offset;
1283                         }
1284                 }
1285
1286                 /*      Copy it - We _MUST_ update *seq first so that we
1287                  *      don't ever double read when we have dual readers
1288                  */
1289                 *seq += used;
1290
1291                 /*      This memcpy_toiovec can sleep. If it sleeps and we
1292                  *      do a second read it relies on the skb->users to avoid
1293                  *      a crash when cleanup_rbuf() gets called.
1294                  */
1295                 err = memcpy_toiovec(msg->msg_iov, ((unsigned char *)skb->h.th) + skb->h.th->doff*4 + offset, used);
1296                 if (err) {
1297                         /* Exception. Bailout! */
1298                         atomic_dec(&skb->users);
1299                         copied = -EFAULT;
1300                         break;
1301                 }
1302
1303                 copied += used;
1304                 len -= used;
1305
1306                 /*      We now will not sleep again until we are finished
1307                  *      with skb. Sorry if you are doing the SMP port
1308                  *      but you'll just have to fix it neatly ;)
1309                  *
1310                  *      Very funny Alan... -DaveM
1311                  */
1312                 atomic_dec(&skb->users);
1313
1314                 if (after(tp->copied_seq,tp->urg_seq))
1315                         tp->urg_data = 0;
1316                 if (used + offset < skb->len)
1317                         continue;
1318
1319                 /*      Process the FIN. We may also need to handle PSH
1320                  *      here and make it break out of MSG_WAITALL.
1321                  */
1322                 if (skb->h.th->fin)
1323                         goto found_fin_ok;
1324                 if (flags & MSG_PEEK)
1325                         continue;
1326                 skb->used = 1;
1327                 if (atomic_read(&skb->users) == 1)
1328                         tcp_eat_skb(sk, skb);
1329                 continue;
1330
1331         found_fin_ok:
1332                 ++*seq;
1333                 if (flags & MSG_PEEK)
1334                         break;
1335
1336                 /* All is done. */
1337                 skb->used = 1;
1338                 sk->shutdown |= RCV_SHUTDOWN;
1339                 break;
1340         }
1341
1342         if (copied > 0 && msg->msg_name)
1343                 tp->af_specific->addr2sockaddr(sk, (struct sockaddr *)
1344                                                msg->msg_name);
1345
1346         if(addr_len)
1347                 *addr_len = tp->af_specific->sockaddr_len;
1348
1349         remove_wait_queue(sk->sleep, &wait);
1350         current->state = TASK_RUNNING;
1351
1352         /* Clean up data we have read: This will do ACK frames. */
1353         cleanup_rbuf(sk, copied);
1354         release_sock(sk);
1355         return copied;
1356 }
1357
1358 /*
1359  * Check whether to renew the timer.
1360  */
1361 static inline void tcp_check_fin_timer(struct sock *sk)
1362 {
1363         if (sk->state == TCP_FIN_WAIT2 && !sk->timer.prev)
1364                 tcp_reset_msl_timer(sk, TIME_CLOSE, sysctl_tcp_fin_timeout);
1365 }
1366
1367 /*
1368  *      State processing on a close. This implements the state shift for
1369  *      sending our FIN frame. Note that we only send a FIN for some
1370  *      states. A shutdown() may have already sent the FIN, or we may be
1371  *      closed.
1372  */
1373
1374 static unsigned char new_state[16] = {
1375   /* current state:        new state:      action:      */
1376   /* (Invalid)          */ TCP_CLOSE,
1377   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1378   /* TCP_SYN_SENT       */ TCP_CLOSE,
1379   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1380   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1381   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1382   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1383   /* TCP_CLOSE          */ TCP_CLOSE,
1384   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1385   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1386   /* TCP_LISTEN         */ TCP_CLOSE,
1387   /* TCP_CLOSING        */ TCP_CLOSING,
1388 };
1389
1390 static int tcp_close_state(struct sock *sk, int dead)
1391 {
1392         int next = (int) new_state[sk->state];
1393         int ns = (next & TCP_STATE_MASK);
1394
1395         tcp_set_state(sk, ns);
1396
1397         /*      This is a (useful) BSD violating of the RFC. There is a
1398          *      problem with TCP as specified in that the other end could
1399          *      keep a socket open forever with no application left this end.
1400          *      We use a 3 minute timeout (about the same as BSD) then kill
1401          *      our end. If they send after that then tough - BUT: long enough
1402          *      that we won't make the old 4*rto = almost no time - whoops
1403          *      reset mistake.
1404          */
1405         if (dead)
1406                 tcp_check_fin_timer(sk);
1407
1408         return (next & TCP_ACTION_FIN);
1409 }
1410
1411 /*
1412  *      Shutdown the sending side of a connection. Much like close except
1413  *      that we don't receive shut down or set sk->dead.
1414  */
1415
1416 void tcp_shutdown(struct sock *sk, int how)
1417 {
1418         /*      We need to grab some memory, and put together a FIN,
1419          *      and then put it into the queue to be sent.
1420          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1421          */
1422         if (!(how & SEND_SHUTDOWN))
1423                 return;
1424
1425         /* If we've already sent a FIN, or it's a closed state, skip this. */
1426         lock_sock(sk);
1427         if ((1 << sk->state) &
1428             (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
1429
1430                 /* Clear out any half completed packets.  FIN if needed. */
1431                 if (tcp_close_state(sk,0))
1432                         tcp_send_fin(sk);
1433         }
1434         release_sock(sk);
1435 }
1436
1437
1438 /*
1439  *      Return 1 if we still have things to send in our buffers.
1440  */
1441
1442 static inline int closing(struct sock * sk)
1443 {
1444         return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
1445 }
1446
1447 /*
1448  *      This routine closes sockets which have been at least partially
1449  *      opened, but not yet accepted. Currently it is only called by
1450  *      tcp_close, and timeout mirrors the value there.
1451  */
1452
1453 static void tcp_close_pending (struct sock *sk)
1454 {
1455         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1456         struct open_request *req = tp->syn_wait_queue;
1457
1458         while(req) {
1459                 struct open_request *iter;
1460
1461                 if (req->sk)
1462                         tcp_close(req->sk, 0);
1463
1464                 iter = req;
1465                 req = req->dl_next;
1466
1467                 (*iter->class->destructor)(iter);
1468                 tcp_dec_slow_timer(TCP_SLT_SYNACK);
1469                 sk->ack_backlog--;
1470                 tcp_openreq_free(iter);
1471         }
1472
1473         tcp_synq_init(tp);
1474 }
1475
1476 void tcp_close(struct sock *sk, long timeout)
1477 {
1478         struct sk_buff *skb;
1479         int data_was_unread = 0;
1480
1481         /* We need to grab some memory, and put together a FIN,
1482          * and then put it into the queue to be sent.
1483          */
1484         lock_sock(sk);
1485         if(sk->state == TCP_LISTEN) {
1486                 /* Special case. */
1487                 tcp_set_state(sk, TCP_CLOSE);
1488                 tcp_close_pending(sk);
1489                 release_sock(sk);
1490                 sk->dead = 1;
1491                 return;
1492         }
1493
1494         unlock_kernel();
1495
1496         /* It is questionable, what the role of this is now.
1497          * In any event either it should be removed, or
1498          * increment of SLT_KEEPALIVE be done, this is causing
1499          * big problems.  For now I comment it out.  -DaveM
1500          */
1501         /* sk->keepopen = 1; */
1502         sk->shutdown = SHUTDOWN_MASK;
1503
1504         if (!sk->dead)
1505                 sk->state_change(sk);
1506
1507         /*  We need to flush the recv. buffs.  We do this only on the
1508          *  descriptor close, not protocol-sourced closes, because the
1509          *  reader process may not have drained the data yet!
1510          */
1511         while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
1512                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin;
1513                 data_was_unread += len;
1514                 kfree_skb(skb);
1515         }
1516
1517         /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1518          * 3.10, we send a RST here because data was lost.  To
1519          * witness the awful effects of the old behavior of always
1520          * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1521          * a bulk GET in an FTP client, suspend the process, wait
1522          * for the client to advertise a zero window, then kill -9
1523          * the FTP client, wheee...  Note: timeout is always zero
1524          * in such a case.
1525          */
1526         if(data_was_unread != 0) {
1527                 /* Unread data was tossed, zap the connection. */
1528                 tcp_set_state(sk, TCP_CLOSE);
1529                 tcp_send_active_reset(sk);
1530         } else if (tcp_close_state(sk,1)) {
1531                 /* We FIN if the application ate all the data before
1532                  * zapping the connection.
1533                  */
1534                 tcp_send_fin(sk);
1535         }
1536
1537         if (timeout) {
1538                 struct task_struct *tsk = current;
1539                 DECLARE_WAITQUEUE(wait, current);
1540
1541                 add_wait_queue(sk->sleep, &wait);
1542
1543                 while (1) {
1544                         tsk->state = TASK_INTERRUPTIBLE;
1545                         if (!closing(sk))
1546                                 break;
1547                         release_sock(sk);
1548                         timeout = schedule_timeout(timeout);
1549                         lock_sock(sk);
1550                         if (signal_pending(tsk) || !timeout)
1551                                 break;
1552                 }
1553
1554                 tsk->state = TASK_RUNNING;
1555                 remove_wait_queue(sk->sleep, &wait);
1556         }
1557
1558         /* Now that the socket is dead, if we are in the FIN_WAIT2 state
1559          * we may need to set up a timer.
1560          */
1561         tcp_check_fin_timer(sk);
1562
1563         sk->dead = 1;
1564
1565         release_sock(sk);
1566         lock_kernel();
1567 }
1568
1569 /*
1570  *      Wait for an incoming connection, avoid race
1571  *      conditions. This must be called with the socket locked,
1572  *      and without the kernel lock held.
1573  */
1574 static struct open_request * wait_for_connect(struct sock * sk,
1575                                               struct open_request **pprev)
1576 {
1577         DECLARE_WAITQUEUE(wait, current);
1578         struct open_request *req;
1579
1580         /*
1581          * True wake-one mechanism for incoming connections: only
1582          * one process gets woken up, not the 'whole herd'.
1583          * Since we do not 'race & poll' for established sockets
1584          * anymore, the common case will execute the loop only once.
1585          *
1586          * Subtle issue: "add_wait_queue_exclusive()" will be added
1587          * after any current non-exclusive waiters, and we know that
1588          * it will always _stay_ after any new non-exclusive waiters
1589          * because all non-exclusive waiters are added at the
1590          * beginning of the wait-queue. As such, it's ok to "drop"
1591          * our exclusiveness temporarily when we get woken up without
1592          * having to remove and re-insert us on the wait queue.
1593          */
1594         add_wait_queue_exclusive(sk->sleep, &wait);
1595         for (;;) {
1596                 current->state = TASK_EXCLUSIVE | TASK_INTERRUPTIBLE;
1597                 release_sock(sk);
1598                 schedule();
1599                 lock_sock(sk);
1600                 req = tcp_find_established(&(sk->tp_pinfo.af_tcp), pprev);
1601                 if (req)
1602                         break;
1603                 if (signal_pending(current))
1604                         break;
1605         }
1606         current->state = TASK_RUNNING;
1607         remove_wait_queue(sk->sleep, &wait);
1608         return req;
1609 }
1610
1611 /*
1612  *      This will accept the next outstanding connection.
1613  *
1614  *      Be careful about race conditions here - this is subtle.
1615  */
1616
1617 struct sock *tcp_accept(struct sock *sk, int flags)
1618 {
1619         struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1620         struct open_request *req, *prev;
1621         struct sock *newsk = NULL;
1622         int error;
1623
1624         unlock_kernel();
1625         lock_sock(sk);
1626
1627         /* We need to make sure that this socket is listening,
1628          * and that it has something pending.
1629          */
1630         error = EINVAL;
1631         if (sk->state != TCP_LISTEN)
1632                 goto out;
1633
1634         /* Find already established connection */
1635         req = tcp_find_established(tp, &prev);
1636         if (!req) {
1637                 /* If this is a non blocking socket don't sleep */
1638                 error = EAGAIN;
1639                 if (flags & O_NONBLOCK)
1640                         goto out;
1641
1642                 error = ERESTARTSYS;
1643                 req = wait_for_connect(sk, &prev);
1644                 if (!req)
1645                         goto out;
1646         }
1647
1648         tcp_synq_unlink(tp, req, prev);
1649         newsk = req->sk;
1650         req->class->destructor(req);
1651         tcp_openreq_free(req);
1652         sk->ack_backlog--;
1653         if(sk->keepopen)
1654                 tcp_inc_slow_timer(TCP_SLT_KEEPALIVE);
1655         release_sock(sk);
1656         lock_kernel();
1657         return newsk;
1658
1659 out:
1660         /* sk should be in LISTEN state, thus accept can use sk->err for
1661          * internal purposes without stomping on anyone's feed.
1662          */
1663         sk->err = error;
1664         release_sock(sk);
1665         lock_kernel();
1666         return newsk;
1667 }
1668
1669 /*
1670  *      Socket option code for TCP.
1671  */
1672
1673 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
1674                    int optlen)
1675 {
1676         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1677         int val;
1678
1679         if (level != SOL_TCP)
1680                 return tp->af_specific->setsockopt(sk, level, optname,
1681                                                    optval, optlen);
1682
1683         if(optlen<sizeof(int))
1684                 return -EINVAL;
1685
1686         if (get_user(val, (int *)optval))
1687                 return -EFAULT;
1688
1689         switch(optname) {
1690         case TCP_MAXSEG:
1691                 /* values greater than interface MTU won't take effect.  however at
1692                  * the point when this call is done we typically don't yet know
1693                  * which interface is going to be used
1694                  */
1695                 if(val < 1 || val > MAX_WINDOW)
1696                         return -EINVAL;
1697                 tp->user_mss = val;
1698                 return 0;
1699
1700         case TCP_NODELAY:
1701                 /* You cannot try to use this and TCP_CORK in
1702                  * tandem, so let the user know.
1703                  */
1704                 if (sk->nonagle == 2)
1705                         return -EINVAL;
1706                 sk->nonagle = (val == 0) ? 0 : 1;
1707                 return 0;
1708
1709         case TCP_CORK:
1710                 /* When set indicates to always queue non-full frames.
1711                  * Later the user clears this option and we transmit
1712                  * any pending partial frames in the queue.  This is
1713                  * meant to be used alongside sendfile() to get properly
1714                  * filled frames when the user (for example) must write
1715                  * out headers with a write() call first and then use
1716                  * sendfile to send out the data parts.
1717                  *
1718                  * You cannot try to use TCP_NODELAY and this mechanism
1719                  * at the same time, so let the user know.
1720                  */
1721                 if (sk->nonagle == 1)
1722                         return -EINVAL;
1723                 if (val != 0) {
1724                         sk->nonagle = 2;
1725                 } else {
1726                         sk->nonagle = 0;
1727
1728                         lock_sock(sk);
1729                         tcp_push_pending_frames(sk, tp);
1730                         release_sock(sk);
1731                 }
1732                 return 0;
1733
1734         default:
1735                 return -ENOPROTOOPT;
1736         };
1737 }
1738
1739 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
1740                    int *optlen)
1741 {
1742         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1743         int val, len;
1744
1745         if(level != SOL_TCP)
1746                 return tp->af_specific->getsockopt(sk, level, optname,
1747                                                    optval, optlen);
1748
1749         if(get_user(len,optlen))
1750                 return -EFAULT;
1751
1752         len = min(len, sizeof(int));
1753
1754         switch(optname) {
1755         case TCP_MAXSEG:
1756                 val = tp->user_mss;
1757                 break;
1758         case TCP_NODELAY:
1759                 val = (sk->nonagle == 1);
1760                 break;
1761         case TCP_CORK:
1762                 val = (sk->nonagle == 2);
1763                 break;
1764         default:
1765                 return -ENOPROTOOPT;
1766         };
1767
1768         if(put_user(len, optlen))
1769                 return -EFAULT;
1770         if(copy_to_user(optval, &val,len))
1771                 return -EFAULT;
1772         return 0;
1773 }
1774
1775 void tcp_set_keepalive(struct sock *sk, int val)
1776 {
1777         if (!sk->keepopen && val)
1778                 tcp_inc_slow_timer(TCP_SLT_KEEPALIVE);
1779         else if (sk->keepopen && !val)
1780                 tcp_dec_slow_timer(TCP_SLT_KEEPALIVE);
1781 }
1782
1783 extern void __skb_cb_too_small_for_tcp(int, int);
1784
1785 void __init tcp_init(void)
1786 {
1787         struct sk_buff *skb = NULL;
1788         unsigned long goal;
1789         int order;
1790
1791         if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
1792                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
1793                                            sizeof(skb->cb));
1794
1795         tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
1796                                                    sizeof(struct open_request),
1797                                                0, SLAB_HWCACHE_ALIGN,
1798                                                NULL, NULL);
1799         if(!tcp_openreq_cachep)
1800                 panic("tcp_init: Cannot alloc open_request cache.");
1801
1802         tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
1803                                               sizeof(struct tcp_bind_bucket),
1804                                               0, SLAB_HWCACHE_ALIGN,
1805                                               NULL, NULL);
1806         if(!tcp_bucket_cachep)
1807                 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
1808
1809         tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
1810                                                 sizeof(struct tcp_tw_bucket),
1811                                                 0, SLAB_HWCACHE_ALIGN,
1812                                                 NULL, NULL);
1813         if(!tcp_timewait_cachep)
1814                 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
1815
1816         /* Size and allocate the main established and bind bucket
1817          * hash tables.
1818          *
1819          * The methodology is similar to that of the buffer cache.
1820          */
1821         goal = num_physpages >> (20 - PAGE_SHIFT);
1822         for(order = 5; (1UL << order) < goal; order++)
1823                 ;
1824         do {
1825                 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
1826                         sizeof(struct sock *);
1827                 tcp_ehash = (struct sock **)
1828                         __get_free_pages(GFP_ATOMIC, order);
1829         } while (tcp_ehash == NULL && --order > 4);
1830
1831         if (!tcp_ehash)
1832                 panic("Failed to allocate TCP established hash table\n");
1833         memset(tcp_ehash, 0, tcp_ehash_size * sizeof(struct sock *));
1834
1835         do {
1836                 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
1837                         sizeof(struct tcp_bind_bucket *);
1838                 tcp_bhash = (struct tcp_bind_bucket **)
1839                         __get_free_pages(GFP_ATOMIC, order);
1840         } while (tcp_bhash == NULL && --order > 4);
1841
1842         if (!tcp_bhash)
1843                 panic("Failed to allocate TCP bind hash table\n");
1844         memset(tcp_bhash, 0, tcp_bhash_size * sizeof(struct tcp_bind_bucket *));
1845
1846         printk("TCP: Hash tables configured (established %d bind %d)\n",
1847                tcp_ehash_size, tcp_bhash_size);
1848 }