net/ipv4/tcp.c
1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
22 * Fixes:
23 * Alan Cox : Numerous verify_area() calls
24 * Alan Cox : Set the ACK bit on a reset
25 * Alan Cox : Stopped it crashing if it closed while
26 * sk->inuse=1 and was trying to connect
27 * (tcp_err()).
28 * Alan Cox : All icmp error handling was broken
29 * pointers passed where wrong and the
30 * socket was looked up backwards. Nobody
31 * tested any icmp error code obviously.
32 * Alan Cox : tcp_err() now handled properly. It
33 * wakes people on errors. poll
34 * behaves and the icmp error race
35 * has gone by moving it into sock.c
36 * Alan Cox : tcp_send_reset() fixed to work for
37 * everything not just packets for
38 * unknown sockets.
39 * Alan Cox : tcp option processing.
40 * Alan Cox : Reset tweaked (still not 100%) [Had
41 * syn rule wrong]
42 * Herp Rosmanith : More reset fixes
43 * Alan Cox : No longer acks invalid rst frames.
44 * Acking any kind of RST is right out.
45 * Alan Cox : Sets an ignore me flag on an rst
46 * receive otherwise odd bits of prattle
47 * escape still
48 * Alan Cox : Fixed another acking RST frame bug.
49 * Should stop LAN workplace lockups.
50 * Alan Cox : Some tidyups using the new skb list
51 * facilities
52 * Alan Cox : sk->keepopen now seems to work
53 * Alan Cox : Pulls options out correctly on accepts
54 * Alan Cox : Fixed assorted sk->rqueue->next errors
55 * Alan Cox : PSH doesn't end a TCP read. Switched a
56 * bit to skb ops.
57 * Alan Cox : Tidied tcp_data to avoid a potential
58 * nasty.
59 * Alan Cox : Added some better commenting, as the
60 * tcp is hard to follow
61 * Alan Cox : Removed incorrect check for 20 * psh
62 * Michael O'Reilly : ack < copied bug fix.
63 * Johannes Stille : Misc tcp fixes (not all in yet).
64 * Alan Cox : FIN with no memory -> CRASH
65 * Alan Cox : Added socket option proto entries.
66 * Also added awareness of them to accept.
67 * Alan Cox : Added TCP options (SOL_TCP)
68 * Alan Cox : Switched wakeup calls to callbacks,
69 * so the kernel can layer network
70 * sockets.
71 * Alan Cox : Use ip_tos/ip_ttl settings.
72 * Alan Cox : Handle FIN (more) properly (we hope).
73 * Alan Cox : RST frames sent on unsynchronised
74 * state ack error.
75 * Alan Cox : Put in missing check for SYN bit.
76 * Alan Cox : Added tcp_select_window() aka NET2E
77 * window non shrink trick.
78 * Alan Cox : Added a couple of small NET2E timer
79 * fixes
80 * Charles Hedrick : TCP fixes
81 * Toomas Tamm : TCP window fixes
82 * Alan Cox : Small URG fix to rlogin ^C ack fight
83 * Charles Hedrick : Rewrote most of it to actually work
84 * Linus : Rewrote tcp_read() and URG handling
85 * completely
86 * Gerhard Koerting: Fixed some missing timer handling
87 * Matthew Dillon : Reworked TCP machine states as per RFC
88 * Gerhard Koerting: PC/TCP workarounds
89 * Adam Caldwell : Assorted timer/timing errors
90 * Matthew Dillon : Fixed another RST bug
91 * Alan Cox : Move to kernel side addressing changes.
92 * Alan Cox : Beginning work on TCP fastpathing
93 * (not yet usable)
94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
95 * Alan Cox : TCP fast path debugging
96 * Alan Cox : Window clamping
97 * Michael Riepe : Bug in tcp_check()
98 * Matt Dillon : More TCP improvements and RST bug fixes
 99 * Matt Dillon : Yet more small nasties removed from the
100 * TCP code (Be very nice to this man if
101 * tcp finally works 100%) 8)
102 * Alan Cox : BSD accept semantics.
103 * Alan Cox : Reset on closedown bug.
104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
105 * Michael Pall : Handle poll() after URG properly in
106 * all cases.
107 * Michael Pall : Undo the last fix in tcp_read_urg()
108 * (multi URG PUSH broke rlogin).
109 * Michael Pall : Fix the multi URG PUSH problem in
110 * tcp_readable(), poll() after URG
111 * works now.
112 * Michael Pall : recv(...,MSG_OOB) never blocks in the
113 * BSD api.
114 * Alan Cox : Changed the semantics of sk->socket to
115 * fix a race and a signal problem with
116 * accept() and async I/O.
117 * Alan Cox : Relaxed the rules on tcp_sendto().
118 * Yury Shevchuk : Really fixed accept() blocking problem.
119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
120 * clients/servers which listen in on
121 * fixed ports.
122 * Alan Cox : Cleaned the above up and shrank it to
123 * a sensible code size.
124 * Alan Cox : Self connect lockup fix.
125 * Alan Cox : No connect to multicast.
126 * Ross Biro : Close unaccepted children on master
127 * socket close.
128 * Alan Cox : Reset tracing code.
129 * Alan Cox : Spurious resets on shutdown.
130 * Alan Cox : Giant 15 minute/60 second timer error
131 * Alan Cox : Small whoops in polling before an
132 * accept.
133 * Alan Cox : Kept the state trace facility since
134 * it's handy for debugging.
135 * Alan Cox : More reset handler fixes.
136 * Alan Cox : Started rewriting the code based on
137 * the RFC's for other useful protocol
138 * references see: Comer, KA9Q NOS, and
139 * for a reference on the difference
140 * between specifications and how BSD
141 * works see the 4.4lite source.
142 * A.N.Kuznetsov : Don't time wait on completion of tidy
143 * close.
144 * Linus Torvalds : Fin/Shutdown & copied_seq changes.
145 * Linus Torvalds : Fixed BSD port reuse to work first syn
146 * Alan Cox : Reimplemented timers as per the RFC
147 * and using multiple timers for sanity.
148 * Alan Cox : Small bug fixes, and a lot of new
149 * comments.
150 * Alan Cox : Fixed dual reader crash by locking
151 * the buffers (much like datagram.c)
152 * Alan Cox : Fixed stuck sockets in probe. A probe
153 * now gets fed up of retrying without
154 * (even a no space) answer.
155 * Alan Cox : Extracted closing code better
156 * Alan Cox : Fixed the closing state machine to
157 * resemble the RFC.
158 * Alan Cox : More 'per spec' fixes.
159 * Jorge Cwik : Even faster checksumming.
160 * Alan Cox : tcp_data() doesn't ack illegal PSH
161 * only frames. At least one pc tcp stack
162 * generates them.
163 * Alan Cox : Cache last socket.
164 * Alan Cox : Per route irtt.
165 * Matt Day : poll()->select() match BSD precisely on error
166 * Alan Cox : New buffers
167 * Marc Tamsky : Various sk->prot->retransmits and
168 * sk->retransmits misupdating fixed.
169 * Fixed tcp_write_timeout: stuck close,
170 * and TCP syn retries gets used now.
171 * Mark Yarvis : In tcp_read_wakeup(), don't send an
172 * ack if state is TCP_CLOSED.
173 * Alan Cox : Look up device on a retransmit - routes may
174 * change. Doesn't yet cope with MSS shrink right
175 * but it's a start!
176 * Marc Tamsky : Closing in closing fixes.
177 * Mike Shaver : RFC1122 verifications.
178 * Alan Cox : rcv_saddr errors.
179 * Alan Cox : Block double connect().
180 * Alan Cox : Small hooks for enSKIP.
181 * Alexey Kuznetsov: Path MTU discovery.
182 * Alan Cox : Support soft errors.
183 * Alan Cox : Fix MTU discovery pathological case
184 * when the remote claims no mtu!
185 * Marc Tamsky : TCP_CLOSE fix.
186 * Colin (G3TNE) : Send a reset on syn ack replies in
187 * window but wrong (fixes NT lpd problems)
188 * Pedro Roque : Better TCP window handling, delayed ack.
189 * Joerg Reuter : No modification of locked buffers in
190 * tcp_do_retransmit()
191 * Eric Schenk : Changed receiver side silly window
192 * avoidance algorithm to BSD style
193 * algorithm. This doubles throughput
194 * against machines running Solaris,
195 * and seems to result in general
196 * improvement.
197 * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
198 * Willy Konynenberg : Transparent proxying support.
199 * Mike McLagan : Routing by source
200 * Keith Owens : Do proper merging with partial SKB's in
201 * tcp_do_sendmsg to avoid burstiness.
202 * Eric Schenk : Fix fast close down bug with
203 * shutdown() followed by close().
204 * Andi Kleen : Make poll agree with SIGIO
205 * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
206 * lingertime == 0 (RFC 793 ABORT Call)
207 * Hirokazu Takahashi : Use copy_from_user() instead of
208 * csum_and_copy_from_user() if possible.
210 * This program is free software; you can redistribute it and/or
211 * modify it under the terms of the GNU General Public License
212 * as published by the Free Software Foundation; either version
 213 * 2 of the License, or (at your option) any later version.
215 * Description of States:
217 * TCP_SYN_SENT sent a connection request, waiting for ack
219 * TCP_SYN_RECV received a connection request, sent ack,
220 * waiting for final ack in three-way handshake.
222 * TCP_ESTABLISHED connection established
224 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
225 * transmission of remaining buffered data
227 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
228 * to shutdown
230 * TCP_CLOSING both sides have shutdown but we still have
231 * data we have to finish sending
233 * TCP_TIME_WAIT timeout to catch resent junk before entering
234 * closed, can only be entered from FIN_WAIT2
235 * or CLOSING. Required because the other end
236 * may not have gotten our last ACK causing it
237 * to retransmit the data packet (which we ignore)
239 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
240 * us to finish writing our data and to shutdown
241 * (we have to close() to move on to LAST_ACK)
 243 * TCP_LAST_ACK our side has shutdown after remote has
244 * shutdown. There may still be data in our
245 * buffer that we have to finish sending
247 * TCP_CLOSE socket is finished
250 #include <linux/config.h>
251 #include <linux/types.h>
252 #include <linux/fcntl.h>
253 #include <linux/poll.h>
254 #include <linux/init.h>
255 #include <linux/smp_lock.h>
256 #include <linux/fs.h>
257 #include <linux/random.h>
259 #include <net/icmp.h>
260 #include <net/tcp.h>
261 #include <net/xfrm.h>
262 #include <net/ip.h>
265 #include <asm/uaccess.h>
266 #include <asm/ioctls.h>
268 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
270 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
272 kmem_cache_t *tcp_openreq_cachep;
273 kmem_cache_t *tcp_bucket_cachep;
274 kmem_cache_t *tcp_timewait_cachep;
276 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
278 int sysctl_tcp_mem[3];
279 int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
280 int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
282 atomic_t tcp_memory_allocated; /* Current allocated memory. */
283 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
285 /* Pressure flag: try to collapse.
286 * Technical note: it is used by multiple contexts non atomically.
287 * All the tcp_mem_schedule() is of this nature: accounting
288 * is strict, actions are advisory and have some latency. */
289 int tcp_memory_pressure;
291 #define TCP_PAGES(amt) (((amt) + TCP_MEM_QUANTUM - 1) / TCP_MEM_QUANTUM)
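/*
 * A minimal worked example of the accounting above, assuming
 * TCP_MEM_QUANTUM is one page (4096 bytes on many architectures):
 *
 *     charging an skb of size 1500 gives
 *         amt = TCP_PAGES(1500) = (1500 + 4095) / 4096 = 1 quantum,
 *     so tcp_memory_allocated grows by 1 and sk_forward_alloc by 4096;
 *     the unused 4096 - 1500 = 2596 bytes stay in sk_forward_alloc and
 *     can cover later small charges without touching the global counter.
 */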
293 int tcp_mem_schedule(struct sock *sk, int size, int kind)
295 int amt = TCP_PAGES(size);
297 sk->sk_forward_alloc += amt * TCP_MEM_QUANTUM;
298 atomic_add(amt, &tcp_memory_allocated);
300 /* Under limit. */
301 if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
302 if (tcp_memory_pressure)
303 tcp_memory_pressure = 0;
304 return 1;
307 /* Over hard limit. */
308 if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
309 tcp_enter_memory_pressure();
310 goto suppress_allocation;
313 /* Under pressure. */
314 if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
315 tcp_enter_memory_pressure();
317 if (kind) {
318 if (atomic_read(&sk->sk_rmem_alloc) < sysctl_tcp_rmem[0])
319 return 1;
320 } else if (sk->sk_wmem_queued < sysctl_tcp_wmem[0])
321 return 1;
323 if (!tcp_memory_pressure ||
324 sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated) *
325 TCP_PAGES(sk->sk_wmem_queued +
326 atomic_read(&sk->sk_rmem_alloc) +
327 sk->sk_forward_alloc))
328 return 1;
330 suppress_allocation:
332 if (!kind) {
333 tcp_moderate_sndbuf(sk);
335 /* Fail only if socket is _under_ its sndbuf.
336 * In this case we cannot block, so that we have to fail.
338 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
339 return 1;
342 /* Alas. Undo changes. */
343 sk->sk_forward_alloc -= amt * TCP_MEM_QUANTUM;
344 atomic_sub(amt, &tcp_memory_allocated);
345 return 0;
348 void __tcp_mem_reclaim(struct sock *sk)
350 if (sk->sk_forward_alloc >= TCP_MEM_QUANTUM) {
351 atomic_sub(sk->sk_forward_alloc / TCP_MEM_QUANTUM,
352 &tcp_memory_allocated);
353 sk->sk_forward_alloc &= TCP_MEM_QUANTUM - 1;
354 if (tcp_memory_pressure &&
355 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
356 tcp_memory_pressure = 0;
360 void tcp_rfree(struct sk_buff *skb)
362 struct sock *sk = skb->sk;
364 atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
365 sk->sk_forward_alloc += skb->truesize;
369 * LISTEN is a special case for poll..
371 static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
372 poll_table *wait)
374 return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
378 * Wait for a TCP event.
380 * Note that we don't need to lock the socket, as the upper poll layers
381 * take care of normal races (between the test and the event) and we don't
382 * go look at any of the socket buffers directly.
384 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
386 unsigned int mask;
387 struct sock *sk = sock->sk;
388 struct tcp_opt *tp = tcp_sk(sk);
390 poll_wait(file, sk->sk_sleep, wait);
391 if (sk->sk_state == TCP_LISTEN)
392 return tcp_listen_poll(sk, wait);
394 /* Socket is not locked. We are protected from async events
395 by poll logic and correct handling of state changes
 396 made by other threads is impossible in any case.
399 mask = 0;
400 if (sk->sk_err)
401 mask = POLLERR;
404 * POLLHUP is certainly not done right. But poll() doesn't
405 * have a notion of HUP in just one direction, and for a
406 * socket the read side is more interesting.
408 * Some poll() documentation says that POLLHUP is incompatible
409 * with the POLLOUT/POLLWR flags, so somebody should check this
410 * all. But careful, it tends to be safer to return too many
411 * bits than too few, and you can easily break real applications
412 * if you don't tell them that something has hung up!
414 * Check-me.
 416 * Check number 1. POLLHUP is an _UNMASKABLE_ event (see UNIX98 and
 417 * our fs/select.c). It means that after we received EOF,
 418 * poll always returns immediately, making poll() on write() impossible
419 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
420 * if and only if shutdown has been made in both directions.
421 * Actually, it is interesting to look how Solaris and DUX
 422 * solve this dilemma. I would prefer, if POLLHUP were maskable,
423 * then we could set it on SND_SHUTDOWN. BTW examples given
424 * in Stevens' books assume exactly this behaviour, it explains
 425 * why POLLHUP is incompatible with POLLOUT. --ANK
427 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
428 * blocking on fresh not-connected or disconnected socket. --ANK
430 if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
431 mask |= POLLHUP;
432 if (sk->sk_shutdown & RCV_SHUTDOWN)
433 mask |= POLLIN | POLLRDNORM;
435 /* Connected? */
436 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 437 /* Potential race condition. If the read of tp below
 438 * escapes above sk->sk_state, we can be illegally awakened
439 * in SYN_* states. */
440 if ((tp->rcv_nxt != tp->copied_seq) &&
441 (tp->urg_seq != tp->copied_seq ||
442 tp->rcv_nxt != tp->copied_seq + 1 ||
443 sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
444 mask |= POLLIN | POLLRDNORM;
446 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
447 if (tcp_wspace(sk) >= tcp_min_write_space(sk)) {
448 mask |= POLLOUT | POLLWRNORM;
449 } else { /* send SIGIO later */
450 set_bit(SOCK_ASYNC_NOSPACE,
451 &sk->sk_socket->flags);
452 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
454 /* Race breaker. If space is freed after
455 * wspace test but before the flags are set,
456 * IO signal will be lost.
458 if (tcp_wspace(sk) >= tcp_min_write_space(sk))
459 mask |= POLLOUT | POLLWRNORM;
463 if (tp->urg_data & TCP_URG_VALID)
464 mask |= POLLPRI;
466 return mask;
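/*
 * A small userspace sketch (illustrative only, not part of this file)
 * of how the mask computed above is typically consumed; `tcp_fd` is an
 * assumed, already-connected TCP socket and the handlers are placeholders:
 *
 *     struct pollfd pfd = { .fd = tcp_fd, .events = POLLIN | POLLOUT | POLLPRI };
 *     if (poll(&pfd, 1, -1) > 0) {
 *             if (pfd.revents & POLLPRI)   // urgent data pending (TCP_URG_VALID)
 *                     handle_oob(tcp_fd);
 *             if (pfd.revents & POLLIN)    // data to read, or RCV_SHUTDOWN seen
 *                     drain(tcp_fd);
 *             if (pfd.revents & POLLHUP)   // both directions shut down / TCP_CLOSE
 *                     teardown(tcp_fd);
 *     }
 */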
470 * TCP socket write_space callback.
472 void tcp_write_space(struct sock *sk)
474 struct socket *sock = sk->sk_socket;
476 if (tcp_wspace(sk) >= tcp_min_write_space(sk) && sock) {
477 clear_bit(SOCK_NOSPACE, &sock->flags);
479 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
480 wake_up_interruptible(sk->sk_sleep);
482 if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
483 sock_wake_async(sock, 2, POLL_OUT);
487 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
489 struct tcp_opt *tp = tcp_sk(sk);
490 int answ;
492 switch (cmd) {
493 case SIOCINQ:
494 if (sk->sk_state == TCP_LISTEN)
495 return -EINVAL;
497 lock_sock(sk);
498 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
499 answ = 0;
500 else if (sock_flag(sk, SOCK_URGINLINE) ||
501 !tp->urg_data ||
502 before(tp->urg_seq, tp->copied_seq) ||
503 !before(tp->urg_seq, tp->rcv_nxt)) {
504 answ = tp->rcv_nxt - tp->copied_seq;
506 /* Subtract 1, if FIN is in queue. */
507 if (answ && !skb_queue_empty(&sk->sk_receive_queue))
508 answ -=
509 ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
510 } else
511 answ = tp->urg_seq - tp->copied_seq;
512 release_sock(sk);
513 break;
514 case SIOCATMARK:
515 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
516 break;
517 case SIOCOUTQ:
518 if (sk->sk_state == TCP_LISTEN)
519 return -EINVAL;
521 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
522 answ = 0;
523 else
524 answ = tp->write_seq - tp->snd_una;
525 break;
526 default:
527 return -ENOIOCTLCMD;
530 return put_user(answ, (int *)arg);
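/*
 * Userspace view of the three ioctls handled above (a sketch under the
 * assumption of a connected TCP socket `tcp_fd`; not part of this file):
 *
 *     int unread, unsent, at_mark;
 *     ioctl(tcp_fd, SIOCINQ,   &unread);    // rcv_nxt - copied_seq, minus a queued FIN
 *     ioctl(tcp_fd, SIOCOUTQ,  &unsent);    // write_seq - snd_una
 *     ioctl(tcp_fd, SIOCATMARK, &at_mark);  // non-zero when the read pointer is at the urgent mark
 */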
534 int tcp_listen_start(struct sock *sk)
536 struct inet_opt *inet = inet_sk(sk);
537 struct tcp_opt *tp = tcp_sk(sk);
538 struct tcp_listen_opt *lopt;
540 sk->sk_max_ack_backlog = 0;
541 sk->sk_ack_backlog = 0;
542 tp->accept_queue = tp->accept_queue_tail = NULL;
543 tp->syn_wait_lock = RW_LOCK_UNLOCKED;
544 tcp_delack_init(tp);
546 lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
547 if (!lopt)
548 return -ENOMEM;
550 memset(lopt, 0, sizeof(struct tcp_listen_opt));
551 for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
552 if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
553 break;
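	/*
	 * Example of the loop above: with a hypothetical sysctl_max_syn_backlog
	 * of 256, the loop stops at max_qlen_log = 8 (1 << 8 = 256), i.e. the
	 * SYN queue is sized to the next power of two >= the sysctl, with a
	 * floor of 1 << 6 = 64 entries.
	 */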
554 get_random_bytes(&lopt->hash_rnd, 4);
556 write_lock_bh(&tp->syn_wait_lock);
557 tp->listen_opt = lopt;
558 write_unlock_bh(&tp->syn_wait_lock);
 560 /* There is a race window here: we announce ourselves listening,
 561 * but this transition is still not validated by get_port().
 562 * It is OK, because this socket enters the hash table only
563 * after validation is complete.
565 sk->sk_state = TCP_LISTEN;
566 if (!sk->sk_prot->get_port(sk, inet->num)) {
567 inet->sport = htons(inet->num);
569 sk_dst_reset(sk);
570 sk->sk_prot->hash(sk);
572 return 0;
575 sk->sk_state = TCP_CLOSE;
576 write_lock_bh(&tp->syn_wait_lock);
577 tp->listen_opt = NULL;
578 write_unlock_bh(&tp->syn_wait_lock);
579 kfree(lopt);
580 return -EADDRINUSE;
584 * This routine closes sockets which have been at least partially
585 * opened, but not yet accepted.
588 static void tcp_listen_stop (struct sock *sk)
590 struct tcp_opt *tp = tcp_sk(sk);
591 struct tcp_listen_opt *lopt = tp->listen_opt;
592 struct open_request *acc_req = tp->accept_queue;
593 struct open_request *req;
594 int i;
596 tcp_delete_keepalive_timer(sk);
598 /* make all the listen_opt local to us */
599 write_lock_bh(&tp->syn_wait_lock);
600 tp->listen_opt = NULL;
601 write_unlock_bh(&tp->syn_wait_lock);
602 tp->accept_queue = tp->accept_queue_tail = NULL;
604 if (lopt->qlen) {
605 for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
606 while ((req = lopt->syn_table[i]) != NULL) {
607 lopt->syn_table[i] = req->dl_next;
608 lopt->qlen--;
609 tcp_openreq_free(req);
611 /* Following specs, it would be better either to send FIN
612 * (and enter FIN-WAIT-1, it is normal close)
613 * or to send active reset (abort).
 614 * Certainly, it is pretty dangerous during a synflood, but it is
 615 * a bad justification for our negligence 8)
616 * To be honest, we are not able to make either
617 * of the variants now. --ANK
622 BUG_TRAP(!lopt->qlen);
624 kfree(lopt);
626 while ((req = acc_req) != NULL) {
627 struct sock *child = req->sk;
629 acc_req = req->dl_next;
631 local_bh_disable();
632 bh_lock_sock(child);
633 BUG_TRAP(!sock_owned_by_user(child));
634 sock_hold(child);
636 tcp_disconnect(child, O_NONBLOCK);
638 sock_orphan(child);
640 atomic_inc(&tcp_orphan_count);
642 tcp_destroy_sock(child);
644 bh_unlock_sock(child);
645 local_bh_enable();
646 sock_put(child);
648 tcp_acceptq_removed(sk);
649 tcp_openreq_fastfree(req);
651 BUG_TRAP(!sk->sk_ack_backlog);
655 * Wait for a socket to get into the connected state
657 * Note: Must be called with the socket locked.
659 static int wait_for_tcp_connect(struct sock *sk, int flags, long *timeo_p)
661 struct tcp_opt *tp = tcp_sk(sk);
662 struct task_struct *tsk = current;
663 DEFINE_WAIT(wait);
665 while ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
666 if (sk->sk_err)
667 return sock_error(sk);
668 if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
669 return -EPIPE;
670 if (!*timeo_p)
671 return -EAGAIN;
672 if (signal_pending(tsk))
673 return sock_intr_errno(*timeo_p);
675 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
676 tp->write_pending++;
678 release_sock(sk);
679 *timeo_p = schedule_timeout(*timeo_p);
680 lock_sock(sk);
682 finish_wait(sk->sk_sleep, &wait);
683 tp->write_pending--;
685 return 0;
688 static inline int tcp_memory_free(struct sock *sk)
690 return sk->sk_wmem_queued < sk->sk_sndbuf;
694 * Wait for more memory for a socket
696 static int wait_for_tcp_memory(struct sock *sk, long *timeo)
698 struct tcp_opt *tp = tcp_sk(sk);
699 int err = 0;
700 long vm_wait = 0;
701 long current_timeo = *timeo;
702 DEFINE_WAIT(wait);
704 if (tcp_memory_free(sk))
705 current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2;
707 for (;;) {
708 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
710 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
712 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
713 goto do_error;
714 if (!*timeo)
715 goto do_nonblock;
716 if (signal_pending(current))
717 goto do_interrupted;
718 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
719 if (tcp_memory_free(sk) && !vm_wait)
720 break;
722 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
723 tp->write_pending++;
724 release_sock(sk);
725 if (!tcp_memory_free(sk) || vm_wait)
726 current_timeo = schedule_timeout(current_timeo);
727 lock_sock(sk);
728 tp->write_pending--;
730 if (vm_wait) {
731 vm_wait -= current_timeo;
732 current_timeo = *timeo;
733 if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
734 (current_timeo -= vm_wait) < 0)
735 current_timeo = 0;
736 vm_wait = 0;
738 *timeo = current_timeo;
740 out:
741 finish_wait(sk->sk_sleep, &wait);
742 return err;
744 do_error:
745 err = -EPIPE;
746 goto out;
747 do_nonblock:
748 err = -EAGAIN;
749 goto out;
750 do_interrupted:
751 err = sock_intr_errno(*timeo);
752 goto out;
755 ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
756 size_t psize, int flags);
758 static inline int can_coalesce(struct sk_buff *skb, int i, struct page *page,
759 int off)
761 if (i) {
762 skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
763 return page == frag->page &&
764 off == frag->page_offset + frag->size;
766 return 0;
769 static inline void fill_page_desc(struct sk_buff *skb, int i,
770 struct page *page, int off, int size)
772 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
773 frag->page = page;
774 frag->page_offset = off;
775 frag->size = size;
776 skb_shinfo(skb)->nr_frags = i + 1;
779 static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
781 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
782 tp->pushed_seq = tp->write_seq;
785 static inline int forced_push(struct tcp_opt *tp)
787 return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
790 static inline void skb_entail(struct sock *sk, struct tcp_opt *tp,
791 struct sk_buff *skb)
793 skb->csum = 0;
794 TCP_SKB_CB(skb)->seq = tp->write_seq;
795 TCP_SKB_CB(skb)->end_seq = tp->write_seq;
796 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
797 TCP_SKB_CB(skb)->sacked = 0;
798 __skb_queue_tail(&sk->sk_write_queue, skb);
799 tcp_charge_skb(sk, skb);
800 if (!tp->send_head)
801 tp->send_head = skb;
802 else if (tp->nonagle&TCP_NAGLE_PUSH)
803 tp->nonagle &= ~TCP_NAGLE_PUSH;
806 static inline void tcp_mark_urg(struct tcp_opt *tp, int flags,
807 struct sk_buff *skb)
809 if (flags & MSG_OOB) {
810 tp->urg_mode = 1;
811 tp->snd_up = tp->write_seq;
812 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
816 static inline void tcp_push(struct sock *sk, struct tcp_opt *tp, int flags,
817 int mss_now, int nonagle)
819 if (tp->send_head) {
820 struct sk_buff *skb = sk->sk_write_queue.prev;
821 if (!(flags & MSG_MORE) || forced_push(tp))
822 tcp_mark_push(tp, skb);
823 tcp_mark_urg(tp, flags, skb);
824 __tcp_push_pending_frames(sk, tp, mss_now,
825 (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
829 static int tcp_error(struct sock *sk, int flags, int err)
831 if (err == -EPIPE)
832 err = sock_error(sk) ? : -EPIPE;
833 if (err == -EPIPE && !(flags & MSG_NOSIGNAL))
834 send_sig(SIGPIPE, current, 0);
835 return err;
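/*
 * From userspace, the SIGPIPE raised above can be suppressed per call;
 * a sketch (assumes a connected socket `tcp_fd` whose peer has gone away):
 *
 *     ssize_t n = send(tcp_fd, buf, len, MSG_NOSIGNAL);
 *     if (n < 0 && errno == EPIPE)
 *             ;  // handle the broken pipe without being signalled
 */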
838 ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
839 size_t psize, int flags)
841 struct tcp_opt *tp = tcp_sk(sk);
842 int mss_now;
843 int err;
844 ssize_t copied;
845 long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
847 /* Wait for a connection to finish. */
848 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
849 if ((err = wait_for_tcp_connect(sk, 0, &timeo)) != 0)
850 goto out_err;
852 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
854 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
855 copied = 0;
857 err = -EPIPE;
858 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
859 goto do_error;
861 while (psize > 0) {
862 struct sk_buff *skb = sk->sk_write_queue.prev;
863 struct page *page = pages[poffset / PAGE_SIZE];
864 int copy, i;
865 int offset = poffset % PAGE_SIZE;
866 int size = min_t(size_t, psize, PAGE_SIZE - offset);
868 if (!tp->send_head || (copy = mss_now - skb->len) <= 0) {
869 new_segment:
870 if (!tcp_memory_free(sk))
871 goto wait_for_sndbuf;
873 skb = tcp_alloc_pskb(sk, 0, tp->mss_cache,
874 sk->sk_allocation);
875 if (!skb)
876 goto wait_for_memory;
878 skb_entail(sk, tp, skb);
879 copy = mss_now;
882 if (copy > size)
883 copy = size;
885 i = skb_shinfo(skb)->nr_frags;
886 if (can_coalesce(skb, i, page, offset)) {
887 skb_shinfo(skb)->frags[i - 1].size += copy;
888 } else if (i < MAX_SKB_FRAGS) {
889 get_page(page);
890 fill_page_desc(skb, i, page, offset, copy);
891 } else {
892 tcp_mark_push(tp, skb);
893 goto new_segment;
896 skb->len += copy;
897 skb->data_len += copy;
898 skb->ip_summed = CHECKSUM_HW;
899 tp->write_seq += copy;
900 TCP_SKB_CB(skb)->end_seq += copy;
902 if (!copied)
903 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
905 copied += copy;
906 poffset += copy;
907 if (!(psize -= copy))
908 goto out;
910 if (skb->len != mss_now || (flags & MSG_OOB))
911 continue;
913 if (forced_push(tp)) {
914 tcp_mark_push(tp, skb);
915 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
916 } else if (skb == tp->send_head)
917 tcp_push_one(sk, mss_now);
918 continue;
920 wait_for_sndbuf:
921 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
922 wait_for_memory:
923 if (copied)
924 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
926 if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
927 goto do_error;
929 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
932 out:
933 if (copied)
934 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
935 return copied;
937 do_error:
938 if (copied)
939 goto out;
940 out_err:
941 return tcp_error(sk, flags, err);
944 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
945 size_t size, int flags)
947 ssize_t res;
948 struct sock *sk = sock->sk;
950 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
952 if (!(sk->sk_route_caps & NETIF_F_SG) ||
953 !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
954 return sock_no_sendpage(sock, page, offset, size, flags);
956 #undef TCP_ZC_CSUM_FLAGS
958 lock_sock(sk);
959 TCP_CHECK_TIMER(sk);
960 res = do_tcp_sendpages(sk, &page, offset, size, flags);
961 TCP_CHECK_TIMER(sk);
962 release_sock(sk);
963 return res;
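/*
 * One common way to reach do_tcp_sendpages() is sendfile(2) on a TCP
 * socket; a userspace sketch (illustrative, assumes an open `file_fd`
 * and a connected `tcp_fd`):
 *
 *     off_t off = 0;
 *     ssize_t sent = sendfile(tcp_fd, file_fd, &off, count);
 *
 * When the route lacks SG or hardware checksumming, tcp_sendpage() above
 * falls back to sock_no_sendpage(), which copies instead of mapping the
 * page into skb fragments.
 */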
966 #define TCP_PAGE(sk) (inet_sk(sk)->sndmsg_page)
967 #define TCP_OFF(sk) (inet_sk(sk)->sndmsg_off)
969 static inline int tcp_copy_to_page(struct sock *sk, char *from,
970 struct sk_buff *skb, struct page *page,
971 int off, int copy)
973 int err = 0;
974 unsigned int csum;
976 if (skb->ip_summed == CHECKSUM_NONE) {
977 csum = csum_and_copy_from_user(from, page_address(page) + off,
978 copy, 0, &err);
979 if (err) return err;
980 skb->csum = csum_block_add(skb->csum, csum, skb->len);
981 } else {
982 if (copy_from_user(page_address(page) + off, from, copy))
983 return -EFAULT;
986 skb->len += copy;
987 skb->data_len += copy;
988 skb->truesize += copy;
989 sk->sk_wmem_queued += copy;
990 sk->sk_forward_alloc -= copy;
991 return 0;
994 static inline int skb_add_data(struct sk_buff *skb, char *from, int copy)
996 int err = 0;
997 unsigned int csum;
998 int off = skb->len;
1000 if (skb->ip_summed == CHECKSUM_NONE) {
1001 csum = csum_and_copy_from_user(from, skb_put(skb, copy),
1002 copy, 0, &err);
1003 if (!err) {
1004 skb->csum = csum_block_add(skb->csum, csum, off);
1005 return 0;
1007 } else {
1008 if (!copy_from_user(skb_put(skb, copy), from, copy))
1009 return 0;
1012 __skb_trim(skb, off);
1013 return -EFAULT;
1016 static inline int select_size(struct sock *sk, struct tcp_opt *tp)
1018 int tmp = tp->mss_cache_std;
1020 if (sk->sk_route_caps & NETIF_F_SG) {
1021 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1023 if (tmp >= pgbreak &&
1024 tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
1025 tmp = pgbreak;
1027 return tmp;
1030 int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1031 int size)
1033 struct iovec *iov;
1034 struct tcp_opt *tp = tcp_sk(sk);
1035 struct sk_buff *skb;
1036 int iovlen, flags;
1037 int mss_now;
1038 int err, copied;
1039 long timeo;
1041 lock_sock(sk);
1042 TCP_CHECK_TIMER(sk);
1044 flags = msg->msg_flags;
1045 timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
1047 /* Wait for a connection to finish. */
1048 if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1049 if ((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
1050 goto out_err;
1052 /* This should be in poll */
1053 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1055 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1057 /* Ok commence sending. */
1058 iovlen = msg->msg_iovlen;
1059 iov = msg->msg_iov;
1060 copied = 0;
1062 err = -EPIPE;
1063 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1064 goto do_error;
1066 while (--iovlen >= 0) {
1067 int seglen = iov->iov_len;
1068 unsigned char *from = iov->iov_base;
1070 iov++;
1072 while (seglen > 0) {
1073 int copy;
1075 skb = sk->sk_write_queue.prev;
1077 if (!tp->send_head ||
1078 (copy = mss_now - skb->len) <= 0) {
1080 new_segment:
 1081 /* Allocate a new segment. If the interface is SG,
 1082 * allocate an skb that fits in a single page.
1084 if (!tcp_memory_free(sk))
1085 goto wait_for_sndbuf;
1087 skb = tcp_alloc_pskb(sk, select_size(sk, tp),
1088 0, sk->sk_allocation);
1089 if (!skb)
1090 goto wait_for_memory;
1093 * Check whether we can use HW checksum.
1095 if (sk->sk_route_caps &
1096 (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
1097 NETIF_F_HW_CSUM))
1098 skb->ip_summed = CHECKSUM_HW;
1100 skb_entail(sk, tp, skb);
1101 copy = mss_now;
1104 /* Try to append data to the end of skb. */
1105 if (copy > seglen)
1106 copy = seglen;
1108 /* Where to copy to? */
1109 if (skb_tailroom(skb) > 0) {
1110 /* We have some space in skb head. Superb! */
1111 if (copy > skb_tailroom(skb))
1112 copy = skb_tailroom(skb);
1113 if ((err = skb_add_data(skb, from, copy)) != 0)
1114 goto do_fault;
1115 } else {
1116 int merge = 0;
1117 int i = skb_shinfo(skb)->nr_frags;
1118 struct page *page = TCP_PAGE(sk);
1119 int off = TCP_OFF(sk);
1121 if (can_coalesce(skb, i, page, off) &&
1122 off != PAGE_SIZE) {
1123 /* We can extend the last page
1124 * fragment. */
1125 merge = 1;
1126 } else if (i == MAX_SKB_FRAGS ||
1127 (!i &&
1128 !(sk->sk_route_caps & NETIF_F_SG))) {
1129 /* Need to add new fragment and cannot
1130 * do this because interface is non-SG,
1131 * or because all the page slots are
1132 * busy. */
1133 tcp_mark_push(tp, skb);
1134 goto new_segment;
1135 } else if (page) {
1136 /* If page is cached, align
1137 * offset to L1 cache boundary
1139 off = (off + L1_CACHE_BYTES - 1) &
1140 ~(L1_CACHE_BYTES - 1);
1141 if (off == PAGE_SIZE) {
1142 put_page(page);
1143 TCP_PAGE(sk) = page = NULL;
1147 if (!page) {
1148 /* Allocate new cache page. */
1149 if (!(page = tcp_alloc_page(sk)))
1150 goto wait_for_memory;
1151 off = 0;
1154 if (copy > PAGE_SIZE - off)
1155 copy = PAGE_SIZE - off;
1157 /* Time to copy data. We are close to
1158 * the end! */
1159 err = tcp_copy_to_page(sk, from, skb, page,
1160 off, copy);
1161 if (err) {
1162 /* If this page was new, give it to the
1163 * socket so it does not get leaked.
1165 if (!TCP_PAGE(sk)) {
1166 TCP_PAGE(sk) = page;
1167 TCP_OFF(sk) = 0;
1169 goto do_error;
1172 /* Update the skb. */
1173 if (merge) {
1174 skb_shinfo(skb)->frags[i - 1].size +=
1175 copy;
1176 } else {
1177 fill_page_desc(skb, i, page, off, copy);
1178 if (TCP_PAGE(sk)) {
1179 get_page(page);
1180 } else if (off + copy < PAGE_SIZE) {
1181 get_page(page);
1182 TCP_PAGE(sk) = page;
1186 TCP_OFF(sk) = off + copy;
1189 if (!copied)
1190 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
1192 tp->write_seq += copy;
1193 TCP_SKB_CB(skb)->end_seq += copy;
1195 from += copy;
1196 copied += copy;
1197 if ((seglen -= copy) == 0 && iovlen == 0)
1198 goto out;
1200 if (skb->len != mss_now || (flags & MSG_OOB))
1201 continue;
1203 if (forced_push(tp)) {
1204 tcp_mark_push(tp, skb);
1205 __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
1206 } else if (skb == tp->send_head)
1207 tcp_push_one(sk, mss_now);
1208 continue;
1210 wait_for_sndbuf:
1211 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1212 wait_for_memory:
1213 if (copied)
1214 tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1216 if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
1217 goto do_error;
1219 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1223 out:
1224 if (copied)
1225 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
1226 TCP_CHECK_TIMER(sk);
1227 release_sock(sk);
1228 return copied;
1230 do_fault:
1231 if (!skb->len) {
1232 if (tp->send_head == skb)
1233 tp->send_head = NULL;
1234 __skb_unlink(skb, skb->list);
1235 tcp_free_skb(sk, skb);
1238 do_error:
1239 if (copied)
1240 goto out;
1241 out_err:
1242 err = tcp_error(sk, flags, err);
1243 TCP_CHECK_TIMER(sk);
1244 release_sock(sk);
1245 return err;
1249 * Handle reading urgent data. BSD has very simple semantics for
1250 * this, no blocking and very strange errors 8)
1253 static int tcp_recv_urg(struct sock *sk, long timeo,
1254 struct msghdr *msg, int len, int flags,
1255 int *addr_len)
1257 struct tcp_opt *tp = tcp_sk(sk);
1259 /* No URG data to read. */
1260 if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1261 tp->urg_data == TCP_URG_READ)
1262 return -EINVAL; /* Yes this is right ! */
1264 if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1265 return -ENOTCONN;
1267 if (tp->urg_data & TCP_URG_VALID) {
1268 int err = 0;
1269 char c = tp->urg_data;
1271 if (!(flags & MSG_PEEK))
1272 tp->urg_data = TCP_URG_READ;
1274 /* Read urgent data. */
1275 msg->msg_flags |= MSG_OOB;
1277 if (len > 0) {
1278 if (!(flags & MSG_TRUNC))
1279 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1280 len = 1;
1281 } else
1282 msg->msg_flags |= MSG_TRUNC;
1284 return err ? -EFAULT : len;
1287 if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1288 return 0;
1290 /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
1291 * the available implementations agree in this case:
1292 * this call should never block, independent of the
1293 * blocking state of the socket.
1294 * Mike <pall@rz.uni-karlsruhe.de>
1296 return -EAGAIN;
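/*
 * Userspace counterpart of the function above (a sketch, assuming a
 * connected TCP socket `tcp_fd` with SO_OOBINLINE left off):
 *
 *     char c;
 *     ssize_t n = recv(tcp_fd, &c, 1, MSG_OOB);
 *     // n == 1 : the single urgent byte was returned
 *     // EINVAL : no urgent data, it was already consumed, or SO_OOBINLINE is on
 *     // EAGAIN : the urgent pointer is known but the byte has not arrived (never blocks)
 */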
1300 * Release a skb if it is no longer needed. This routine
1301 * must be called with interrupts disabled or with the
1302 * socket locked so that the sk_buff queue operation is ok.
1305 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
1307 __skb_unlink(skb, &sk->sk_receive_queue);
1308 __kfree_skb(skb);
1311 /* Clean up the receive buffer for full frames taken by the user,
1312 * then send an ACK if necessary. COPIED is the number of bytes
1313 * tcp_recvmsg has given to the user so far, it speeds up the
1314 * calculation of whether or not we must ACK for the sake of
1315 * a window update.
1317 static void cleanup_rbuf(struct sock *sk, int copied)
1319 struct tcp_opt *tp = tcp_sk(sk);
1320 int time_to_ack = 0;
1322 #if TCP_DEBUG
1323 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1325 BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1326 #endif
1328 if (tcp_ack_scheduled(tp)) {
1329 /* Delayed ACKs frequently hit locked sockets during bulk
1330 * receive. */
1331 if (tp->ack.blocked ||
1332 /* Once-per-two-segments ACK was not sent by tcp_input.c */
1333 tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
 1335 * If this read emptied the read buffer, we send an ACK if
 1336 * the connection is not bidirectional, the user drained the
 1337 * receive buffer, and there was a small segment
 1338 * in the queue.
1340 (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
1341 !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
1342 time_to_ack = 1;
1345 /* We send an ACK if we can now advertise a non-zero window
1346 * which has been raised "significantly".
1348 * Even if window raised up to infinity, do not send window open ACK
1349 * in states, where we will not receive more. It is useless.
1351 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1352 __u32 rcv_window_now = tcp_receive_window(tp);
1354 /* Optimize, __tcp_select_window() is not cheap. */
1355 if (2*rcv_window_now <= tp->window_clamp) {
1356 __u32 new_window = __tcp_select_window(sk);
1358 /* Send ACK now, if this read freed lots of space
1359 * in our buffer. Certainly, new_window is new window.
1360 * We can advertise it now, if it is not less than current one.
1361 * "Lots" means "at least twice" here.
1363 if (new_window && new_window >= 2 * rcv_window_now)
1364 time_to_ack = 1;
1367 if (time_to_ack)
1368 tcp_send_ack(sk);
1371 /* Now socket state including sk->sk_err is changed only under lock,
1372 * hence we may omit checks after joining wait queue.
1373 * We check receive queue before schedule() only as optimization;
1374 * it is very likely that release_sock() added new data.
1377 static long tcp_data_wait(struct sock *sk, long timeo)
1379 DEFINE_WAIT(wait);
1381 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1383 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1384 release_sock(sk);
1386 if (skb_queue_empty(&sk->sk_receive_queue))
1387 timeo = schedule_timeout(timeo);
1389 lock_sock(sk);
1390 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1392 finish_wait(sk->sk_sleep, &wait);
1393 return timeo;
1396 static void tcp_prequeue_process(struct sock *sk)
1398 struct sk_buff *skb;
1399 struct tcp_opt *tp = tcp_sk(sk);
1401 NET_ADD_STATS_USER(TCPPrequeued, skb_queue_len(&tp->ucopy.prequeue));
1403 /* RX process wants to run with disabled BHs, though it is not
1404 * necessary */
1405 local_bh_disable();
1406 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1407 sk->sk_backlog_rcv(sk, skb);
1408 local_bh_enable();
1410 /* Clear memory counter. */
1411 tp->ucopy.memory = 0;
1414 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1416 struct sk_buff *skb;
1417 u32 offset;
1419 skb_queue_walk(&sk->sk_receive_queue, skb) {
1420 offset = seq - TCP_SKB_CB(skb)->seq;
1421 if (skb->h.th->syn)
1422 offset--;
1423 if (offset < skb->len || skb->h.th->fin) {
1424 *off = offset;
1425 return skb;
1428 return NULL;
1432 * This routine provides an alternative to tcp_recvmsg() for routines
1433 * that would like to handle copying from skbuffs directly in 'sendfile'
1434 * fashion.
1435 * Note:
1436 * - It is assumed that the socket was locked by the caller.
1437 * - The routine does not block.
1438 * - At present, there is no support for reading OOB data
1439 * or for 'peeking' the socket using this routine
1440 * (although both would be easy to implement).
1442 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1443 sk_read_actor_t recv_actor)
1445 struct sk_buff *skb;
1446 struct tcp_opt *tp = tcp_sk(sk);
1447 u32 seq = tp->copied_seq;
1448 u32 offset;
1449 int copied = 0;
1451 if (sk->sk_state == TCP_LISTEN)
1452 return -ENOTCONN;
1453 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1454 if (offset < skb->len) {
1455 size_t used, len;
1457 len = skb->len - offset;
1458 /* Stop reading if we hit a patch of urgent data */
1459 if (tp->urg_data) {
1460 u32 urg_offset = tp->urg_seq - seq;
1461 if (urg_offset < len)
1462 len = urg_offset;
1463 if (!len)
1464 break;
1466 used = recv_actor(desc, skb, offset, len);
1467 if (used <= len) {
1468 seq += used;
1469 copied += used;
1470 offset += used;
1472 if (offset != skb->len)
1473 break;
1475 if (skb->h.th->fin) {
1476 tcp_eat_skb(sk, skb);
1477 ++seq;
1478 break;
1480 tcp_eat_skb(sk, skb);
1481 if (!desc->count)
1482 break;
1484 tp->copied_seq = seq;
1485 /* Clean up data we have read: This will do ACK frames. */
1486 if (copied)
1487 cleanup_rbuf(sk, copied);
1488 return copied;
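/*
 * The recv_actor callback consumes data in place and reports how much it
 * used; a minimal in-kernel sketch (illustrative only, the names are
 * hypothetical) that simply counts bytes up to desc->count:
 *
 *     static int count_actor(read_descriptor_t *desc, struct sk_buff *skb,
 *                            unsigned int offset, size_t len)
 *     {
 *             size_t used = min(len, (size_t)desc->count);
 *             // a real actor would copy or hand off skb data at `offset` here
 *             desc->count -= used;
 *             return used;
 *     }
 *
 * tcp_read_sock() advances copied_seq by the return value and stops once
 * desc->count reaches zero.
 */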
1492 * This routine copies from a sock struct into the user buffer.
1494 * Technical note: in 2.3 we work on _locked_ socket, so that
1495 * tricks with *seq access order and skb->users are not required.
1496 * Probably, code can be easily improved even more.
1499 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1500 int len, int nonblock, int flags, int *addr_len)
1502 struct tcp_opt *tp = tcp_sk(sk);
1503 int copied = 0;
1504 u32 peek_seq;
1505 u32 *seq;
1506 unsigned long used;
1507 int err;
1508 int target; /* Read at least this many bytes */
1509 long timeo;
1510 struct task_struct *user_recv = NULL;
1512 lock_sock(sk);
1514 TCP_CHECK_TIMER(sk);
1516 err = -ENOTCONN;
1517 if (sk->sk_state == TCP_LISTEN)
1518 goto out;
1520 timeo = sock_rcvtimeo(sk, nonblock);
1522 /* Urgent data needs to be handled specially. */
1523 if (flags & MSG_OOB)
1524 goto recv_urg;
1526 seq = &tp->copied_seq;
1527 if (flags & MSG_PEEK) {
1528 peek_seq = tp->copied_seq;
1529 seq = &peek_seq;
1532 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1534 do {
1535 struct sk_buff *skb;
1536 u32 offset;
1538 /* Are we at urgent data? Stop if we have read anything. */
1539 if (copied && tp->urg_data && tp->urg_seq == *seq)
1540 break;
1542 /* We need to check signals first, to get correct SIGURG
1543 * handling. FIXME: Need to check this doesn't impact 1003.1g
1544 * and move it down to the bottom of the loop
1546 if (signal_pending(current)) {
1547 if (copied)
1548 break;
1549 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1550 break;
1553 /* Next get a buffer. */
1555 skb = skb_peek(&sk->sk_receive_queue);
1556 do {
1557 if (!skb)
1558 break;
1560 /* Now that we have two receive queues this
1561 * shouldn't happen.
1563 if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1564 printk(KERN_INFO "recvmsg bug: copied %X "
1565 "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1566 break;
1568 offset = *seq - TCP_SKB_CB(skb)->seq;
1569 if (skb->h.th->syn)
1570 offset--;
1571 if (offset < skb->len)
1572 goto found_ok_skb;
1573 if (skb->h.th->fin)
1574 goto found_fin_ok;
1575 BUG_TRAP(flags & MSG_PEEK);
1576 skb = skb->next;
1577 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
 1579 /* Well, if we have backlog, try to process it now. */
1581 if (copied >= target && !sk->sk_backlog.tail)
1582 break;
1584 if (copied) {
1585 if (sk->sk_err ||
1586 sk->sk_state == TCP_CLOSE ||
1587 (sk->sk_shutdown & RCV_SHUTDOWN) ||
1588 !timeo ||
1589 (flags & MSG_PEEK))
1590 break;
1591 } else {
1592 if (sock_flag(sk, SOCK_DONE))
1593 break;
1595 if (sk->sk_err) {
1596 copied = sock_error(sk);
1597 break;
1600 if (sk->sk_shutdown & RCV_SHUTDOWN)
1601 break;
1603 if (sk->sk_state == TCP_CLOSE) {
1604 if (!sock_flag(sk, SOCK_DONE)) {
1605 /* This occurs when user tries to read
1606 * from never connected socket.
1608 copied = -ENOTCONN;
1609 break;
1611 break;
1614 if (!timeo) {
1615 copied = -EAGAIN;
1616 break;
1620 cleanup_rbuf(sk, copied);
1622 if (tp->ucopy.task == user_recv) {
1623 /* Install new reader */
1624 if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1625 user_recv = current;
1626 tp->ucopy.task = user_recv;
1627 tp->ucopy.iov = msg->msg_iov;
1630 tp->ucopy.len = len;
1632 BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1633 (flags & (MSG_PEEK | MSG_TRUNC)));
1635 /* Ugly... If prequeue is not empty, we have to
1636 * process it before releasing socket, otherwise
1637 * order will be broken at second iteration.
1638 * More elegant solution is required!!!
1640 * Look: we have the following (pseudo)queues:
1642 * 1. packets in flight
1643 * 2. backlog
1644 * 3. prequeue
1645 * 4. receive_queue
1647 * Each queue can be processed only if the next ones
1648 * are empty. At this point we have empty receive_queue.
1649 * But prequeue _can_ be not empty after 2nd iteration,
1650 * when we jumped to start of loop because backlog
1651 * processing added something to receive_queue.
1652 * We cannot release_sock(), because backlog contains
1653 * packets arrived _after_ prequeued ones.
1655 * Shortly, algorithm is clear --- to process all
1656 * the queues in order. We could make it more directly,
1657 * requeueing packets from backlog to prequeue, if
 1658 * it is not empty. It is more elegant, but eats cycles,
1659 * unfortunately.
1661 if (skb_queue_len(&tp->ucopy.prequeue))
1662 goto do_prequeue;
1664 /* __ Set realtime policy in scheduler __ */
1667 if (copied >= target) {
1668 /* Do not sleep, just process backlog. */
1669 release_sock(sk);
1670 lock_sock(sk);
1671 } else {
1672 timeo = tcp_data_wait(sk, timeo);
1675 if (user_recv) {
1676 int chunk;
1678 /* __ Restore normal policy in scheduler __ */
1680 if ((chunk = len - tp->ucopy.len) != 0) {
1681 NET_ADD_STATS_USER(TCPDirectCopyFromBacklog, chunk);
1682 len -= chunk;
1683 copied += chunk;
1686 if (tp->rcv_nxt == tp->copied_seq &&
1687 skb_queue_len(&tp->ucopy.prequeue)) {
1688 do_prequeue:
1689 tcp_prequeue_process(sk);
1691 if ((chunk = len - tp->ucopy.len) != 0) {
1692 NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1693 len -= chunk;
1694 copied += chunk;
1698 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1699 if (net_ratelimit())
1700 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1701 current->comm, current->pid);
1702 peek_seq = tp->copied_seq;
1704 continue;
1706 found_ok_skb:
1707 /* Ok so how much can we use? */
1708 used = skb->len - offset;
1709 if (len < used)
1710 used = len;
1712 /* Do we have urgent data here? */
1713 if (tp->urg_data) {
1714 u32 urg_offset = tp->urg_seq - *seq;
1715 if (urg_offset < used) {
1716 if (!urg_offset) {
1717 if (!sock_flag(sk, SOCK_URGINLINE)) {
1718 ++*seq;
1719 offset++;
1720 used--;
1721 if (!used)
1722 goto skip_copy;
1724 } else
1725 used = urg_offset;
1729 if (!(flags & MSG_TRUNC)) {
1730 err = skb_copy_datagram_iovec(skb, offset,
1731 msg->msg_iov, used);
1732 if (err) {
1733 /* Exception. Bailout! */
1734 if (!copied)
1735 copied = -EFAULT;
1736 break;
1740 *seq += used;
1741 copied += used;
1742 len -= used;
1744 skip_copy:
1745 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1746 tp->urg_data = 0;
1747 tcp_fast_path_check(sk, tp);
1749 if (used + offset < skb->len)
1750 continue;
1752 if (skb->h.th->fin)
1753 goto found_fin_ok;
1754 if (!(flags & MSG_PEEK))
1755 tcp_eat_skb(sk, skb);
1756 continue;
1758 found_fin_ok:
1759 /* Process the FIN. */
1760 ++*seq;
1761 if (!(flags & MSG_PEEK))
1762 tcp_eat_skb(sk, skb);
1763 break;
1764 } while (len > 0);
1766 if (user_recv) {
1767 if (skb_queue_len(&tp->ucopy.prequeue)) {
1768 int chunk;
1770 tp->ucopy.len = copied > 0 ? len : 0;
1772 tcp_prequeue_process(sk);
1774 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1775 NET_ADD_STATS_USER(TCPDirectCopyFromPrequeue, chunk);
1776 len -= chunk;
1777 copied += chunk;
1781 tp->ucopy.task = NULL;
1782 tp->ucopy.len = 0;
1785 /* According to UNIX98, msg_name/msg_namelen are ignored
 1786 * on a connected socket. I was just happy when I found this 8) --ANK
1789 /* Clean up data we have read: This will do ACK frames. */
1790 cleanup_rbuf(sk, copied);
1792 TCP_CHECK_TIMER(sk);
1793 release_sock(sk);
1794 return copied;
1796 out:
1797 TCP_CHECK_TIMER(sk);
1798 release_sock(sk);
1799 return err;
1801 recv_urg:
1802 err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1803 goto out;
1807 * State processing on a close. This implements the state shift for
1808 * sending our FIN frame. Note that we only send a FIN for some
1809 * states. A shutdown() may have already sent the FIN, or we may be
1810 * closed.
1813 static unsigned char new_state[16] = {
1814 /* current state: new state: action: */
1815 /* (Invalid) */ TCP_CLOSE,
1816 /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1817 /* TCP_SYN_SENT */ TCP_CLOSE,
1818 /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1819 /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
1820 /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
1821 /* TCP_TIME_WAIT */ TCP_CLOSE,
1822 /* TCP_CLOSE */ TCP_CLOSE,
1823 /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN,
1824 /* TCP_LAST_ACK */ TCP_LAST_ACK,
1825 /* TCP_LISTEN */ TCP_CLOSE,
1826 /* TCP_CLOSING */ TCP_CLOSING,
1829 static int tcp_close_state(struct sock *sk)
1831 int next = (int)new_state[sk->sk_state];
1832 int ns = next & TCP_STATE_MASK;
1834 tcp_set_state(sk, ns);
1836 return next & TCP_ACTION_FIN;
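/*
 * Example of the table encoding above: new_state[TCP_ESTABLISHED] is
 * TCP_FIN_WAIT1 | TCP_ACTION_FIN, so tcp_close_state() moves the socket
 * to FIN_WAIT1 and tells the caller to emit a FIN, while
 * new_state[TCP_FIN_WAIT1] is plain TCP_FIN_WAIT1: the state is kept and
 * no second FIN is requested.
 */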
1840 * Shutdown the sending side of a connection. Much like close except
1841 * that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
1844 void tcp_shutdown(struct sock *sk, int how)
1846 /* We need to grab some memory, and put together a FIN,
1847 * and then put it into the queue to be sent.
1848 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1850 if (!(how & SEND_SHUTDOWN))
1851 return;
1853 /* If we've already sent a FIN, or it's a closed state, skip this. */
1854 if ((1 << sk->sk_state) &
1855 (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1856 TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1857 /* Clear out any half completed packets. FIN if needed. */
1858 if (tcp_close_state(sk))
1859 tcp_send_fin(sk);
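/*
 * A userspace half-close reaches this point via inet_shutdown(); a sketch
 * (assumes a connected TCP socket `tcp_fd`):
 *
 *     shutdown(tcp_fd, SHUT_WR);   // arrives here as SEND_SHUTDOWN: a FIN
 *                                  // is sent, the receive side stays usable
 */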
1865 * Return 1 if we still have things to send in our buffers.
1868 static inline int closing(struct sock *sk)
1870 return (1 << sk->sk_state) &
1871 (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
1874 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1876 /* First the read buffer. */
1877 __skb_queue_purge(&sk->sk_receive_queue);
1879 /* Next, the error queue. */
1880 __skb_queue_purge(&sk->sk_error_queue);
1882 /* Next, the write queue. */
1883 BUG_TRAP(skb_queue_empty(&sk->sk_write_queue));
1885 /* Account for returned memory. */
1886 tcp_mem_reclaim(sk);
1888 BUG_TRAP(!sk->sk_wmem_queued);
1889 BUG_TRAP(!sk->sk_forward_alloc);
1891 /* It is _impossible_ for the backlog to contain anything
1892 * when we get here. All user references to this socket
 1893 * have gone away, only the net layer can touch it.
1898 * At this point, there should be no process reference to this
1899 * socket, and thus no user references at all. Therefore we
1900 * can assume the socket waitqueue is inactive and nobody will
1901 * try to jump onto it.
1903 void tcp_destroy_sock(struct sock *sk)
1905 BUG_TRAP(sk->sk_state == TCP_CLOSE);
1906 BUG_TRAP(sock_flag(sk, SOCK_DEAD));
1908 /* It cannot be in hash table! */
1909 BUG_TRAP(sk_unhashed(sk));
 1911 /* If inet_sk(sk)->num is not 0, the socket must be bound */
1912 BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
1914 #ifdef TCP_DEBUG
1915 if (sk->sk_zapped) {
1916 printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
1917 sock_hold(sk);
1919 sk->sk_zapped = 1;
1920 #endif
1922 sk->sk_prot->destroy(sk);
1924 tcp_kill_sk_queues(sk);
1926 xfrm_sk_free_policy(sk);
1928 #ifdef INET_REFCNT_DEBUG
1929 if (atomic_read(&sk->sk_refcnt) != 1) {
1930 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
1931 sk, atomic_read(&sk->sk_refcnt));
1933 #endif
1935 atomic_dec(&tcp_orphan_count);
1936 sock_put(sk);
1939 void tcp_close(struct sock *sk, long timeout)
1941 struct sk_buff *skb;
1942 int data_was_unread = 0;
1944 lock_sock(sk);
1945 sk->sk_shutdown = SHUTDOWN_MASK;
1947 if (sk->sk_state == TCP_LISTEN) {
1948 tcp_set_state(sk, TCP_CLOSE);
1950 /* Special case. */
1951 tcp_listen_stop(sk);
1953 goto adjudge_to_death;
1956 /* We need to flush the recv. buffs. We do this only on the
1957 * descriptor close, not protocol-sourced closes, because the
1958 * reader process may not have drained the data yet!
1960 while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1961 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1962 skb->h.th->fin;
1963 data_was_unread += len;
1964 __kfree_skb(skb);
1967 tcp_mem_reclaim(sk);
1969 /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1970 * 3.10, we send a RST here because data was lost. To
1971 * witness the awful effects of the old behavior of always
1972 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1973 * a bulk GET in an FTP client, suspend the process, wait
1974 * for the client to advertise a zero window, then kill -9
1975 * the FTP client, wheee... Note: timeout is always zero
1976 * in such a case.
1978 if (data_was_unread) {
1979 /* Unread data was tossed, zap the connection. */
1980 NET_INC_STATS_USER(TCPAbortOnClose);
1981 tcp_set_state(sk, TCP_CLOSE);
1982 tcp_send_active_reset(sk, GFP_KERNEL);
1983 } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1984 /* Check zero linger _after_ checking for unread data. */
1985 sk->sk_prot->disconnect(sk, 0);
1986 NET_INC_STATS_USER(TCPAbortOnData);
1987 } else if (tcp_close_state(sk)) {
1988 /* We FIN if the application ate all the data before
1989 * zapping the connection.
1992 /* RED-PEN. Formally speaking, we have broken TCP state
1993 * machine. State transitions:
1995 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1996 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1997 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1999 * are legal only when FIN has been sent (i.e. in window),
2000 * rather than queued out of window. Purists blame.
2002 * F.e. "RFC state" is ESTABLISHED,
2003 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
 2005 * The visible deviations are that sometimes
 2006 * we enter the time-wait state when it is not really required
 2007 * (harmless), and do not send active resets when they are
 2008 * required by the specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
 2009 * they look like CLOSING or LAST_ACK to Linux)
2010 * Probably, I missed some more holelets.
2011 * --ANK
2013 tcp_send_fin(sk);
2016 if (timeout) {
2017 struct task_struct *tsk = current;
2018 DEFINE_WAIT(wait);
2020 do {
2021 prepare_to_wait(sk->sk_sleep, &wait,
2022 TASK_INTERRUPTIBLE);
2023 if (!closing(sk))
2024 break;
2025 release_sock(sk);
2026 timeout = schedule_timeout(timeout);
2027 lock_sock(sk);
2028 } while (!signal_pending(tsk) && timeout);
2030 finish_wait(sk->sk_sleep, &wait);
2033 adjudge_to_death:
2034 /* It is the last release_sock in its life. It will remove backlog. */
2035 release_sock(sk);
2038 /* Now socket is owned by kernel and we acquire BH lock
2039 to finish close. No need to check for user refs.
2041 local_bh_disable();
2042 bh_lock_sock(sk);
2043 BUG_TRAP(!sock_owned_by_user(sk));
2045 sock_hold(sk);
2046 sock_orphan(sk);
2048 /* This is a (useful) BSD violation of the RFC. There is a
2049 * problem with TCP as specified in that the other end could
2050 * keep a socket open forever with no application left at this end.
2051 * We use a 3 minute timeout (about the same as BSD) then kill
2052 * our end. If they send after that then tough - BUT: long enough
2053 * that we won't make the old 4*rto = almost no time - whoops
2054 * reset mistake.
2056 * Nope, it was not a mistake. It is really desired behaviour,
2057 * f.e. on http servers, where such sockets are useless but
2058 * consume significant resources. Let's handle it with the special
2059 * linger2 option. --ANK
2060 */
2062 if (sk->sk_state == TCP_FIN_WAIT2) {
2063 struct tcp_opt *tp = tcp_sk(sk);
2064 if (tp->linger2 < 0) {
2065 tcp_set_state(sk, TCP_CLOSE);
2066 tcp_send_active_reset(sk, GFP_ATOMIC);
2067 NET_INC_STATS_BH(TCPAbortOnLinger);
2068 } else {
2069 int tmo = tcp_fin_time(tp);
2071 if (tmo > TCP_TIMEWAIT_LEN) {
2072 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
2073 } else {
2074 atomic_inc(&tcp_orphan_count);
2075 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2076 goto out;
2077 }
2078 }
2079 }
2080 if (sk->sk_state != TCP_CLOSE) {
2081 tcp_mem_reclaim(sk);
2082 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
2083 (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
2084 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
2085 if (net_ratelimit())
2086 printk(KERN_INFO "TCP: too many orphaned "
2087 "sockets\n");
2088 tcp_set_state(sk, TCP_CLOSE);
2089 tcp_send_active_reset(sk, GFP_ATOMIC);
2090 NET_INC_STATS_BH(TCPAbortOnMemory);
2091 }
2092 }
2093 atomic_inc(&tcp_orphan_count);
2095 if (sk->sk_state == TCP_CLOSE)
2096 tcp_destroy_sock(sk);
2097 /* Otherwise, socket is reprieved until protocol close. */
2099 out:
2100 bh_unlock_sock(sk);
2101 local_bh_enable();
2102 sock_put(sk);
2103 }
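/*
 * Usage sketch (userspace, illustrative only): both abort paths handled
 * above are visible to applications. Calling close() while received data
 * is still unread takes the data_was_unread branch and sends a RST, and
 * SO_LINGER with l_linger == 0 takes the zero-linger branch. A minimal,
 * hypothetical helper forcing the latter:
 *
 *    #include <sys/socket.h>
 *    #include <unistd.h>
 *
 *    void abortive_close(int fd)
 *    {
 *        struct linger lg = { .l_onoff = 1, .l_linger = 0 };
 *
 *        setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 *        close(fd);
 *    }
 */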
2105 /* These states need RST on ABORT according to RFC793 */
2107 static inline int tcp_need_reset(int state)
2108 {
2109 return (1 << state) &
2110 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
2111 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
2112 }
2114 int tcp_disconnect(struct sock *sk, int flags)
2115 {
2116 struct inet_opt *inet = inet_sk(sk);
2117 struct tcp_opt *tp = tcp_sk(sk);
2118 int err = 0;
2119 int old_state = sk->sk_state;
2121 if (old_state != TCP_CLOSE)
2122 tcp_set_state(sk, TCP_CLOSE);
2124 /* ABORT function of RFC793 */
2125 if (old_state == TCP_LISTEN) {
2126 tcp_listen_stop(sk);
2127 } else if (tcp_need_reset(old_state) ||
2128 (tp->snd_nxt != tp->write_seq &&
2129 (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
2130 /* The last check adjusts for the discrepancy between Linux
2131 * and RFC states.
2132 */
2133 tcp_send_active_reset(sk, gfp_any());
2134 sk->sk_err = ECONNRESET;
2135 } else if (old_state == TCP_SYN_SENT)
2136 sk->sk_err = ECONNRESET;
2138 tcp_clear_xmit_timers(sk);
2139 __skb_queue_purge(&sk->sk_receive_queue);
2140 tcp_writequeue_purge(sk);
2141 __skb_queue_purge(&tp->out_of_order_queue);
2143 inet->dport = 0;
2145 if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2146 inet_reset_saddr(sk);
2148 sk->sk_shutdown = 0;
2149 sock_reset_flag(sk, SOCK_DONE);
2150 tp->srtt = 0;
2151 if ((tp->write_seq += tp->max_window + 2) == 0)
2152 tp->write_seq = 1;
2153 tp->backoff = 0;
2154 tp->snd_cwnd = 2;
2155 tp->probes_out = 0;
2156 tp->packets_out = 0;
2157 tp->snd_ssthresh = 0x7fffffff;
2158 tp->snd_cwnd_cnt = 0;
2159 tp->ca_state = TCP_CA_Open;
2160 tcp_clear_retrans(tp);
2161 tcp_delack_init(tp);
2162 tp->send_head = NULL;
2163 tp->saw_tstamp = 0;
2164 tcp_sack_reset(tp);
2165 __sk_dst_reset(sk);
2167 BUG_TRAP(!inet->num || tp->bind_hash);
2169 sk->sk_error_report(sk);
2170 return err;
2171 }
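/*
 * Background note (an assumption for illustration, not taken from this
 * file): in kernels of this vintage inet_stream_connect() maps connect()
 * with an AF_UNSPEC address onto sk->sk_prot->disconnect(), i.e. onto
 * tcp_disconnect() above. A hypothetical userspace sketch of that path:
 *
 *    #include <string.h>
 *    #include <sys/socket.h>
 *
 *    int tcp_dissolve(int fd)
 *    {
 *        struct sockaddr sa;
 *
 *        memset(&sa, 0, sizeof(sa));
 *        sa.sa_family = AF_UNSPEC;
 *        return connect(fd, &sa, sizeof(sa));
 *    }
 */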
2173 /*
2174 * Wait for an incoming connection, avoid race
2175 * conditions. This must be called with the socket locked.
2176 */
2177 static int wait_for_connect(struct sock *sk, long timeo)
2178 {
2179 struct tcp_opt *tp = tcp_sk(sk);
2180 DEFINE_WAIT(wait);
2181 int err;
2183 /*
2184 * True wake-one mechanism for incoming connections: only
2185 * one process gets woken up, not the 'whole herd'.
2186 * Since we do not 'race & poll' for established sockets
2187 * anymore, the common case will execute the loop only once.
2189 * Subtle issue: "add_wait_queue_exclusive()" will be added
2190 * after any current non-exclusive waiters, and we know that
2191 * it will always _stay_ after any new non-exclusive waiters
2192 * because all non-exclusive waiters are added at the
2193 * beginning of the wait-queue. As such, it's ok to "drop"
2194 * our exclusiveness temporarily when we get woken up without
2195 * having to remove and re-insert us on the wait queue.
2196 */
2197 for (;;) {
2198 prepare_to_wait_exclusive(sk->sk_sleep, &wait,
2199 TASK_INTERRUPTIBLE);
2200 release_sock(sk);
2201 if (!tp->accept_queue)
2202 timeo = schedule_timeout(timeo);
2203 lock_sock(sk);
2204 err = 0;
2205 if (tp->accept_queue)
2206 break;
2207 err = -EINVAL;
2208 if (sk->sk_state != TCP_LISTEN)
2209 break;
2210 err = sock_intr_errno(timeo);
2211 if (signal_pending(current))
2212 break;
2213 err = -EAGAIN;
2214 if (!timeo)
2215 break;
2216 }
2217 finish_wait(sk->sk_sleep, &wait);
2218 return err;
2219 }
2221 /*
2222 * This will accept the next outstanding connection.
2223 */
2225 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2226 {
2227 struct tcp_opt *tp = tcp_sk(sk);
2228 struct open_request *req;
2229 struct sock *newsk;
2230 int error;
2232 lock_sock(sk);
2234 /* We need to make sure that this socket is listening,
2235 * and that it has something pending.
2236 */
2237 error = -EINVAL;
2238 if (sk->sk_state != TCP_LISTEN)
2239 goto out;
2241 /* Find already established connection */
2242 if (!tp->accept_queue) {
2243 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2245 /* If this is a non blocking socket don't sleep */
2246 error = -EAGAIN;
2247 if (!timeo)
2248 goto out;
2250 error = wait_for_connect(sk, timeo);
2251 if (error)
2252 goto out;
2253 }
2255 req = tp->accept_queue;
2256 if ((tp->accept_queue = req->dl_next) == NULL)
2257 tp->accept_queue_tail = NULL;
2259 newsk = req->sk;
2260 tcp_acceptq_removed(sk);
2261 tcp_openreq_fastfree(req);
2262 BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
2263 release_sock(sk);
2264 return newsk;
2266 out:
2267 release_sock(sk);
2268 *err = error;
2269 return NULL;
2270 }
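/*
 * Usage sketch (userspace, illustrative only): the error paths above are
 * the ordinary accept() contract. With the listener marked O_NONBLOCK an
 * empty accept_queue surfaces as EAGAIN (a real program would poll()
 * before retrying); a blocking listener simply sleeps in
 * wait_for_connect(). Hypothetical helper:
 *
 *    #include <errno.h>
 *    #include <sys/socket.h>
 *
 *    int accept_one(int lfd)
 *    {
 *        int cfd;
 *
 *        do {
 *            cfd = accept(lfd, NULL, NULL);
 *        } while (cfd < 0 && errno == EINTR);
 *        return cfd;
 *    }
 */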
2272 /*
2273 * Socket option code for TCP.
2274 */
2275 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
2276 int optlen)
2277 {
2278 struct tcp_opt *tp = tcp_sk(sk);
2279 int val;
2280 int err = 0;
2282 if (level != SOL_TCP)
2283 return tp->af_specific->setsockopt(sk, level, optname,
2284 optval, optlen);
2286 if (optlen < sizeof(int))
2287 return -EINVAL;
2289 if (get_user(val, (int *)optval))
2290 return -EFAULT;
2292 lock_sock(sk);
2294 switch (optname) {
2295 case TCP_MAXSEG:
2296 /* Values greater than interface MTU won't take effect. However
2297 * at the point when this call is done we typically don't yet
2298 * know which interface is going to be used */
2299 if (val < 8 || val > MAX_TCP_WINDOW) {
2300 err = -EINVAL;
2301 break;
2302 }
2303 tp->user_mss = val;
2304 break;
2306 case TCP_NODELAY:
2307 if (val) {
2308 /* TCP_NODELAY is weaker than TCP_CORK, so that
2309 * this option on corked socket is remembered, but
2310 * it is not activated until cork is cleared.
2312 * However, when TCP_NODELAY is set we make
2313 * an explicit push, which overrides even TCP_CORK
2314 * for currently queued segments.
2315 */
2316 tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2317 tcp_push_pending_frames(sk, tp);
2318 } else {
2319 tp->nonagle &= ~TCP_NAGLE_OFF;
2320 }
2321 break;
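/*
 * Usage sketch (userspace, illustrative only): disabling Nagle for a
 * latency-sensitive connection maps onto the TCP_NAGLE_OFF path above.
 * Error handling is omitted.
 *
 *    #include <netinet/in.h>
 *    #include <netinet/tcp.h>
 *    #include <sys/socket.h>
 *
 *    int one = 1;
 *
 *    setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
 */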
2323 case TCP_CORK:
2324 /* When set indicates to always queue non-full frames.
2325 * Later the user clears this option and we transmit
2326 * any pending partial frames in the queue. This is
2327 * meant to be used alongside sendfile() to get properly
2328 * filled frames when the user (for example) must write
2329 * out headers with a write() call first and then use
2330 * sendfile to send out the data parts.
2332 * TCP_CORK can be set together with TCP_NODELAY and it is
2333 * stronger than TCP_NODELAY.
2334 */
2335 if (val) {
2336 tp->nonagle |= TCP_NAGLE_CORK;
2337 } else {
2338 tp->nonagle &= ~TCP_NAGLE_CORK;
2339 if (tp->nonagle&TCP_NAGLE_OFF)
2340 tp->nonagle |= TCP_NAGLE_PUSH;
2341 tcp_push_pending_frames(sk, tp);
2342 }
2343 break;
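/*
 * Usage sketch (userspace, illustrative only) of the write()-headers-
 * then-sendfile() pattern described above: cork, queue the headers,
 * stream the payload, then uncork to flush any remaining partial frame.
 * Descriptor and length names are hypothetical; error handling omitted.
 *
 *    #include <netinet/in.h>
 *    #include <netinet/tcp.h>
 *    #include <sys/sendfile.h>
 *    #include <sys/socket.h>
 *    #include <unistd.h>
 *
 *    void send_response(int sock, int filefd, const char *hdr,
 *                       size_t hdrlen, size_t filelen)
 *    {
 *        int on = 1, off = 0;
 *
 *        setsockopt(sock, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
 *        write(sock, hdr, hdrlen);
 *        sendfile(sock, filefd, NULL, filelen);
 *        setsockopt(sock, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
 *    }
 */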
2345 case TCP_KEEPIDLE:
2346 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2347 err = -EINVAL;
2348 else {
2349 tp->keepalive_time = val * HZ;
2350 if (sock_flag(sk, SOCK_KEEPOPEN) &&
2351 !((1 << sk->sk_state) &
2352 (TCPF_CLOSE | TCPF_LISTEN))) {
2353 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2354 if (tp->keepalive_time > elapsed)
2355 elapsed = tp->keepalive_time - elapsed;
2356 else
2357 elapsed = 0;
2358 tcp_reset_keepalive_timer(sk, elapsed);
2359 }
2360 }
2361 break;
2362 case TCP_KEEPINTVL:
2363 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2364 err = -EINVAL;
2365 else
2366 tp->keepalive_intvl = val * HZ;
2367 break;
2368 case TCP_KEEPCNT:
2369 if (val < 1 || val > MAX_TCP_KEEPCNT)
2370 err = -EINVAL;
2371 else
2372 tp->keepalive_probes = val;
2373 break;
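/*
 * Usage sketch (userspace, illustrative only): the three knobs above are
 * consulted once SO_KEEPALIVE is enabled on the socket; idle time and
 * probe interval are given in seconds, matching the val * HZ conversion
 * here. The numbers below are arbitrary.
 *
 *    #include <netinet/in.h>
 *    #include <netinet/tcp.h>
 *    #include <sys/socket.h>
 *
 *    int on = 1, idle = 60, intvl = 10, cnt = 5;
 *
 *    setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
 *    setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
 *    setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
 *    setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
 */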
2374 case TCP_SYNCNT:
2375 if (val < 1 || val > MAX_TCP_SYNCNT)
2376 err = -EINVAL;
2377 else
2378 tp->syn_retries = val;
2379 break;
2381 case TCP_LINGER2:
2382 if (val < 0)
2383 tp->linger2 = -1;
2384 else if (val > sysctl_tcp_fin_timeout / HZ)
2385 tp->linger2 = 0;
2386 else
2387 tp->linger2 = val * HZ;
2388 break;
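/*
 * Usage sketch (userspace, illustrative only): TCP_LINGER2 is the
 * "linger2 option" referred to in the FIN_WAIT2 comment in tcp_close()
 * above. A negative value makes an orphaned socket reset instead of
 * lingering in FIN_WAIT2, and values above sysctl_tcp_fin_timeout fall
 * back to that sysctl, as the code above shows.
 *
 *    #include <netinet/in.h>
 *    #include <netinet/tcp.h>
 *    #include <sys/socket.h>
 *
 *    int fin_wait_secs = 10;
 *
 *    setsockopt(fd, IPPROTO_TCP, TCP_LINGER2, &fin_wait_secs,
 *               sizeof(fin_wait_secs));
 */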
2390 case TCP_DEFER_ACCEPT:
2391 tp->defer_accept = 0;
2392 if (val > 0) {
2393 /* Translate value in seconds to number of
2394 * retransmits */
2395 while (tp->defer_accept < 32 &&
2396 val > ((TCP_TIMEOUT_INIT / HZ) <<
2397 tp->defer_accept))
2398 tp->defer_accept++;
2399 tp->defer_accept++;
2400 }
2401 break;
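/*
 * Worked example (illustrative, assuming TCP_TIMEOUT_INIT of 3 * HZ as
 * in kernels of this era): a request of val = 10 seconds walks 3 < 10,
 * 6 < 10, stops once 12 >= 10 with defer_accept == 2, and the trailing
 * increment stores 3 retransmissions; tcp_getsockopt() reports that back
 * as 3 << (3 - 1) = 12 seconds.
 *
 *    #include <netinet/in.h>
 *    #include <netinet/tcp.h>
 *    #include <sys/socket.h>
 *
 *    int defer_secs = 10;
 *
 *    setsockopt(lfd, IPPROTO_TCP, TCP_DEFER_ACCEPT, &defer_secs,
 *               sizeof(defer_secs));
 */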
2403 case TCP_WINDOW_CLAMP:
2404 if (!val) {
2405 if (sk->sk_state != TCP_CLOSE) {
2406 err = -EINVAL;
2407 break;
2408 }
2409 tp->window_clamp = 0;
2410 } else
2411 tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2412 SOCK_MIN_RCVBUF / 2 : val;
2413 break;
2415 case TCP_QUICKACK:
2416 if (!val) {
2417 tp->ack.pingpong = 1;
2418 } else {
2419 tp->ack.pingpong = 0;
2420 if ((1 << sk->sk_state) &
2421 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2422 tcp_ack_scheduled(tp)) {
2423 tp->ack.pending |= TCP_ACK_PUSHED;
2424 cleanup_rbuf(sk, 1);
2425 if (!(val & 1))
2426 tp->ack.pingpong = 1;
2427 }
2428 }
2429 break;
2431 default:
2432 err = -ENOPROTOOPT;
2433 break;
2434 }
2435 release_sock(sk);
2436 return err;
2437 }
2439 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
2440 int *optlen)
2441 {
2442 struct tcp_opt *tp = tcp_sk(sk);
2443 int val, len;
2445 if (level != SOL_TCP)
2446 return tp->af_specific->getsockopt(sk, level, optname,
2447 optval, optlen);
2449 if (get_user(len, optlen))
2450 return -EFAULT;
2452 len = min_t(unsigned int, len, sizeof(int));
2454 if (len < 0)
2455 return -EINVAL;
2457 switch (optname) {
2458 case TCP_MAXSEG:
2459 val = tp->mss_cache_std;
2460 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2461 val = tp->user_mss;
2462 break;
2463 case TCP_NODELAY:
2464 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2465 break;
2466 case TCP_CORK:
2467 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2468 break;
2469 case TCP_KEEPIDLE:
2470 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2471 break;
2472 case TCP_KEEPINTVL:
2473 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2474 break;
2475 case TCP_KEEPCNT:
2476 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2477 break;
2478 case TCP_SYNCNT:
2479 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2480 break;
2481 case TCP_LINGER2:
2482 val = tp->linger2;
2483 if (val >= 0)
2484 val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2485 break;
2486 case TCP_DEFER_ACCEPT:
2487 val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
2488 (tp->defer_accept - 1));
2489 break;
2490 case TCP_WINDOW_CLAMP:
2491 val = tp->window_clamp;
2492 break;
2493 case TCP_INFO: {
2494 struct tcp_info info;
2495 u32 now = tcp_time_stamp;
2497 if (get_user(len, optlen))
2498 return -EFAULT;
2499 info.tcpi_state = sk->sk_state;
2500 info.tcpi_ca_state = tp->ca_state;
2501 info.tcpi_retransmits = tp->retransmits;
2502 info.tcpi_probes = tp->probes_out;
2503 info.tcpi_backoff = tp->backoff;
2504 info.tcpi_options = 0;
2505 if (tp->tstamp_ok)
2506 info.tcpi_options |= TCPI_OPT_TIMESTAMPS;
2507 if (tp->sack_ok)
2508 info.tcpi_options |= TCPI_OPT_SACK;
2509 if (tp->wscale_ok) {
2510 info.tcpi_options |= TCPI_OPT_WSCALE;
2511 info.tcpi_snd_wscale = tp->snd_wscale;
2512 info.tcpi_rcv_wscale = tp->rcv_wscale;
2513 } else {
2514 info.tcpi_snd_wscale = 0;
2515 info.tcpi_rcv_wscale = 0;
2516 }
2517 if (tp->ecn_flags & TCP_ECN_OK)
2518 info.tcpi_options |= TCPI_OPT_ECN;
2520 info.tcpi_rto = (1000000 * tp->rto) / HZ;
2521 info.tcpi_ato = (1000000 * tp->ack.ato) / HZ;
2522 info.tcpi_snd_mss = tp->mss_cache_std;
2523 info.tcpi_rcv_mss = tp->ack.rcv_mss;
2525 info.tcpi_unacked = tp->packets_out;
2526 info.tcpi_sacked = tp->sacked_out;
2527 info.tcpi_lost = tp->lost_out;
2528 info.tcpi_retrans = tp->retrans_out;
2529 info.tcpi_fackets = tp->fackets_out;
2531 info.tcpi_last_data_sent = ((now - tp->lsndtime) * 1000) / HZ;
2532 info.tcpi_last_ack_sent = 0;
2533 info.tcpi_last_data_recv = ((now -
2534 tp->ack.lrcvtime) * 1000) / HZ;
2535 info.tcpi_last_ack_recv = ((now - tp->rcv_tstamp) * 1000) / HZ;
2537 info.tcpi_pmtu = tp->pmtu_cookie;
2538 info.tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2539 info.tcpi_rtt = ((1000000 * tp->srtt) / HZ) >> 3;
2540 info.tcpi_rttvar = ((1000000 * tp->mdev) / HZ) >> 2;
2541 info.tcpi_snd_ssthresh = tp->snd_ssthresh;
2542 info.tcpi_snd_cwnd = tp->snd_cwnd;
2543 info.tcpi_advmss = tp->advmss;
2544 info.tcpi_reordering = tp->reordering;
2546 len = min_t(unsigned int, len, sizeof(info));
2547 if (put_user(len, optlen))
2548 return -EFAULT;
2549 if (copy_to_user(optval, &info, len))
2550 return -EFAULT;
2551 return 0;
2552 }
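/*
 * Usage sketch (userspace, illustrative only) for reading the structure
 * filled in above. Whether struct tcp_info is exposed by <netinet/tcp.h>
 * or needs <linux/tcp.h> depends on the libc in use, so treat the include
 * as an assumption; tcpi_rtt is in microseconds, as the scaling above
 * shows.
 *
 *    #include <netinet/in.h>
 *    #include <netinet/tcp.h>
 *    #include <stdio.h>
 *    #include <sys/socket.h>
 *
 *    struct tcp_info ti;
 *    socklen_t len = sizeof(ti);
 *
 *    if (!getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len))
 *        printf("rtt %u us cwnd %u retrans %u\n",
 *               ti.tcpi_rtt, ti.tcpi_snd_cwnd, ti.tcpi_retrans);
 */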
2553 case TCP_QUICKACK:
2554 val = !tp->ack.pingpong;
2555 break;
2556 default:
2557 return -ENOPROTOOPT;
2558 }
2560 if (put_user(len, optlen))
2561 return -EFAULT;
2562 if (copy_to_user(optval, &val, len))
2563 return -EFAULT;
2564 return 0;
2565 }
2568 extern void __skb_cb_too_small_for_tcp(int, int);
2569 extern void tcpdiag_init(void);
2571 void __init tcp_init(void)
2572 {
2573 struct sk_buff *skb = NULL;
2574 unsigned long goal;
2575 int order, i;
2577 if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2578 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2579 sizeof(skb->cb));
2581 tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2582 sizeof(struct open_request),
2583 0, SLAB_HWCACHE_ALIGN,
2584 NULL, NULL);
2585 if (!tcp_openreq_cachep)
2586 panic("tcp_init: Cannot alloc open_request cache.");
2588 tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2589 sizeof(struct tcp_bind_bucket),
2590 0, SLAB_HWCACHE_ALIGN,
2591 NULL, NULL);
2592 if (!tcp_bucket_cachep)
2593 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2595 tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2596 sizeof(struct tcp_tw_bucket),
2597 0, SLAB_HWCACHE_ALIGN,
2598 NULL, NULL);
2599 if (!tcp_timewait_cachep)
2600 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2602 /* Size and allocate the main established and bind bucket
2603 * hash tables.
2605 * The methodology is similar to that of the buffer cache.
2606 */
2607 if (num_physpages >= (128 * 1024))
2608 goal = num_physpages >> (21 - PAGE_SHIFT);
2609 else
2610 goal = num_physpages >> (23 - PAGE_SHIFT);
2612 for (order = 0; (1UL << order) < goal; order++)
2613 ;
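/*
 * Worked example (illustrative, assuming 4 KB pages, PAGE_SHIFT == 12):
 * a 512 MB machine has num_physpages == 131072 == 128 * 1024, so
 * goal = 131072 >> (21 - 12) = 256 pages (1 MB of hash table), and the
 * order loop above stops at order == 8 since 1UL << 8 == 256. Smaller
 * machines use the (23 - PAGE_SHIFT) shift and get a quarter of that.
 */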
2614 do {
2615 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2616 sizeof(struct tcp_ehash_bucket);
2617 tcp_ehash_size >>= 1;
2618 while (tcp_ehash_size & (tcp_ehash_size - 1))
2619 tcp_ehash_size--;
2620 tcp_ehash = (struct tcp_ehash_bucket *)
2621 __get_free_pages(GFP_ATOMIC, order);
2622 } while (!tcp_ehash && --order > 0);
2624 if (!tcp_ehash)
2625 panic("Failed to allocate TCP established hash table\n");
2626 for (i = 0; i < (tcp_ehash_size << 1); i++) {
2627 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2628 INIT_HLIST_HEAD(&tcp_ehash[i].chain);
2629 }
2631 do {
2632 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2633 sizeof(struct tcp_bind_hashbucket);
2634 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2635 continue;
2636 tcp_bhash = (struct tcp_bind_hashbucket *)
2637 __get_free_pages(GFP_ATOMIC, order);
2638 } while (!tcp_bhash && --order >= 0);
2640 if (!tcp_bhash)
2641 panic("Failed to allocate TCP bind hash table\n");
2642 for (i = 0; i < tcp_bhash_size; i++) {
2643 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2644 INIT_HLIST_HEAD(&tcp_bhash[i].chain);
2645 }
2647 /* Try to be a bit smarter and adjust defaults depending
2648 * on available memory.
2649 */
2650 if (order > 4) {
2651 sysctl_local_port_range[0] = 32768;
2652 sysctl_local_port_range[1] = 61000;
2653 sysctl_tcp_max_tw_buckets = 180000;
2654 sysctl_tcp_max_orphans = 4096 << (order - 4);
2655 sysctl_max_syn_backlog = 1024;
2656 } else if (order < 3) {
2657 sysctl_local_port_range[0] = 1024 * (3 - order);
2658 sysctl_tcp_max_tw_buckets >>= (3 - order);
2659 sysctl_tcp_max_orphans >>= (3 - order);
2660 sysctl_max_syn_backlog = 128;
2661 }
2662 tcp_port_rover = sysctl_local_port_range[0] - 1;
2664 sysctl_tcp_mem[0] = 768 << order;
2665 sysctl_tcp_mem[1] = 1024 << order;
2666 sysctl_tcp_mem[2] = 1536 << order;
2667 if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 512)
2668 sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 512;
2669 if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 512)
2670 sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 512;
2672 if (order < 3) {
2673 sysctl_tcp_wmem[2] = 64 * 1024;
2674 sysctl_tcp_rmem[0] = PAGE_SIZE;
2675 sysctl_tcp_rmem[1] = 43689;
2676 sysctl_tcp_rmem[2] = 2 * 43689;
2677 }
2679 printk(KERN_INFO "TCP: Hash tables configured "
2680 "(established %d bind %d)\n",
2681 tcp_ehash_size << 1, tcp_bhash_size);
2683 tcpdiag_init();
2684 }