release/src-rt-6.x/linux/linux-2.6/net/ipv4/tcp.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  * Version:     $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *              Corey Minyard <wf-rch!minyard@relay.EU.net>
  14  *              Florian La Roche, <flla@stud.uni-sb.de>
  15  *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
  16  *              Linus Torvalds, <torvalds@cs.helsinki.fi>
  17  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  18  *              Matthew Dillon, <dillon@apollo.west.oic.com>
  19  *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
  20  *              Jorge Cwik, <jorge@laser.satlink.net>
  21  *
  22  * Fixes:
  23  *              Alan Cox        :       Numerous verify_area() calls
  24  *              Alan Cox        :       Set the ACK bit on a reset
  25  *              Alan Cox        :       Stopped it crashing if it closed while
  26  *                                      sk->inuse=1 and was trying to connect
  27  *                                      (tcp_err()).
  28  *              Alan Cox        :       All icmp error handling was broken
  29  *                                      pointers passed where wrong and the
  30  *                                      socket was looked up backwards. Nobody
  31  *                                      tested any icmp error code obviously.
  32  *              Alan Cox        :       tcp_err() now handled properly. It
  33  *                                      wakes people on errors. poll
  34  *                                      behaves and the icmp error race
  35  *                                      has gone by moving it into sock.c
  36  *              Alan Cox        :       tcp_send_reset() fixed to work for
  37  *                                      everything not just packets for
  38  *                                      unknown sockets.
  39  *              Alan Cox        :       tcp option processing.
  40  *              Alan Cox        :       Reset tweaked (still not 100%) [Had
  41  *                                      syn rule wrong]
  42  *              Herp Rosmanith  :       More reset fixes
  43  *              Alan Cox        :       No longer acks invalid rst frames.
  44  *                                      Acking any kind of RST is right out.
  45  *              Alan Cox        :       Sets an ignore me flag on an rst
  46  *                                      receive otherwise odd bits of prattle
  47  *                                      escape still
  48  *              Alan Cox        :       Fixed another acking RST frame bug.
  49  *                                      Should stop LAN workplace lockups.
  50  *              Alan Cox        :       Some tidyups using the new skb list
  51  *                                      facilities
  52  *              Alan Cox        :       sk->keepopen now seems to work
  53  *              Alan Cox        :       Pulls options out correctly on accepts
  54  *              Alan Cox        :       Fixed assorted sk->rqueue->next errors
  55  *              Alan Cox        :       PSH doesn't end a TCP read. Switched a
  56  *                                      bit to skb ops.
  57  *              Alan Cox        :       Tidied tcp_data to avoid a potential
  58  *                                      nasty.
  59  *              Alan Cox        :       Added some better commenting, as the
  60  *                                      tcp is hard to follow
  61  *              Alan Cox        :       Removed incorrect check for 20 * psh
  62  *      Michael O'Reilly        :       ack < copied bug fix.
  63  *      Johannes Stille         :       Misc tcp fixes (not all in yet).
  64  *              Alan Cox        :       FIN with no memory -> CRASH
  65  *              Alan Cox        :       Added socket option proto entries.
  66  *                                      Also added awareness of them to accept.
  67  *              Alan Cox        :       Added TCP options (SOL_TCP)
  68  *              Alan Cox        :       Switched wakeup calls to callbacks,
  69  *                                      so the kernel can layer network
  70  *                                      sockets.
  71  *              Alan Cox        :       Use ip_tos/ip_ttl settings.
  72  *              Alan Cox        :       Handle FIN (more) properly (we hope).
  73  *              Alan Cox        :       RST frames sent on unsynchronised
  74  *                                      state ack error.
  75  *              Alan Cox        :       Put in missing check for SYN bit.
  76  *              Alan Cox        :       Added tcp_select_window() aka NET2E
  77  *                                      window non shrink trick.
  78  *              Alan Cox        :       Added a couple of small NET2E timer
  79  *                                      fixes
  80  *              Charles Hedrick :       TCP fixes
  81  *              Toomas Tamm     :       TCP window fixes
  82  *              Alan Cox        :       Small URG fix to rlogin ^C ack fight
  83  *              Charles Hedrick :       Rewrote most of it to actually work
  84  *              Linus           :       Rewrote tcp_read() and URG handling
  85  *                                      completely
  86  *              Gerhard Koerting:       Fixed some missing timer handling
  87  *              Matthew Dillon  :       Reworked TCP machine states as per RFC
  88  *              Gerhard Koerting:       PC/TCP workarounds
  89  *              Adam Caldwell   :       Assorted timer/timing errors
  90  *              Matthew Dillon  :       Fixed another RST bug
  91  *              Alan Cox        :       Move to kernel side addressing changes.
  92  *              Alan Cox        :       Beginning work on TCP fastpathing
  93  *                                      (not yet usable)
  94  *              Arnt Gulbrandsen:       Turbocharged tcp_check() routine.
  95  *              Alan Cox        :       TCP fast path debugging
  96  *              Alan Cox        :       Window clamping
  97  *              Michael Riepe   :       Bug in tcp_check()
  98  *              Matt Dillon     :       More TCP improvements and RST bug fixes
  99  *              Matt Dillon     :       Yet more small nasties remove from the
 100  *                                      TCP code (Be very nice to this man if
 101  *                                      tcp finally works 100%) 8)
 102  *              Alan Cox        :       BSD accept semantics.
 103  *              Alan Cox        :       Reset on closedown bug.
 104  *      Peter De Schrijver      :       ENOTCONN check missing in tcp_sendto().
 105  *              Michael Pall    :       Handle poll() after URG properly in
 106  *                                      all cases.
 107  *              Michael Pall    :       Undo the last fix in tcp_read_urg()
 108  *                                      (multi URG PUSH broke rlogin).
 109  *              Michael Pall    :       Fix the multi URG PUSH problem in
 110  *                                      tcp_readable(), poll() after URG
 111  *                                      works now.
 112  *              Michael Pall    :       recv(...,MSG_OOB) never blocks in the
 113  *                                      BSD api.
 114  *              Alan Cox        :       Changed the semantics of sk->socket to
 115  *                                      fix a race and a signal problem with
 116  *                                      accept() and async I/O.
 117  *              Alan Cox        :       Relaxed the rules on tcp_sendto().
 118  *              Yury Shevchuk   :       Really fixed accept() blocking problem.
 119  *              Craig I. Hagan  :       Allow for BSD compatible TIME_WAIT for
 120  *                                      clients/servers which listen in on
 121  *                                      fixed ports.
 122  *              Alan Cox        :       Cleaned the above up and shrank it to
 123  *                                      a sensible code size.
 124  *              Alan Cox        :       Self connect lockup fix.
 125  *              Alan Cox        :       No connect to multicast.
 126  *              Ross Biro       :       Close unaccepted children on master
 127  *                                      socket close.
 128  *              Alan Cox        :       Reset tracing code.
 129  *              Alan Cox        :       Spurious resets on shutdown.
 130  *              Alan Cox        :       Giant 15 minute/60 second timer error
 131  *              Alan Cox        :       Small whoops in polling before an
 132  *                                      accept.
 133  *              Alan Cox        :       Kept the state trace facility since
 134  *                                      it's handy for debugging.
 135  *              Alan Cox        :       More reset handler fixes.
 136  *              Alan Cox        :       Started rewriting the code based on
 137  *                                      the RFC's for other useful protocol
 138  *                                      references see: Comer, KA9Q NOS, and
 139  *                                      for a reference on the difference
 140  *                                      between specifications and how BSD
 141  *                                      works see the 4.4lite source.
 142  *              A.N.Kuznetsov   :       Don't time wait on completion of tidy
 143  *                                      close.
 144  *              Linus Torvalds  :       Fin/Shutdown & copied_seq changes.
 145  *              Linus Torvalds  :       Fixed BSD port reuse to work first syn
 146  *              Alan Cox        :       Reimplemented timers as per the RFC
 147  *                                      and using multiple timers for sanity.
 148  *              Alan Cox        :       Small bug fixes, and a lot of new
 149  *                                      comments.
 150  *              Alan Cox        :       Fixed dual reader crash by locking
 151  *                                      the buffers (much like datagram.c)
 152  *              Alan Cox        :       Fixed stuck sockets in probe. A probe
 153  *                                      now gets fed up of retrying without
 154  *                                      (even a no space) answer.
 155  *              Alan Cox        :       Extracted closing code better
 156  *              Alan Cox        :       Fixed the closing state machine to
 157  *                                      resemble the RFC.
 158  *              Alan Cox        :       More 'per spec' fixes.
 159  *              Jorge Cwik      :       Even faster checksumming.
 160  *              Alan Cox        :       tcp_data() doesn't ack illegal PSH
 161  *                                      only frames. At least one pc tcp stack
 162  *                                      generates them.
 163  *              Alan Cox        :       Cache last socket.
 164  *              Alan Cox        :       Per route irtt.
 165  *              Matt Day        :       poll()->select() match BSD precisely on error
 166  *              Alan Cox        :       New buffers
 167  *              Marc Tamsky     :       Various sk->prot->retransmits and
 168  *                                      sk->retransmits misupdating fixed.
 169  *                                      Fixed tcp_write_timeout: stuck close,
 170  *                                      and TCP syn retries gets used now.
 171  *              Mark Yarvis     :       In tcp_read_wakeup(), don't send an
 172  *                                      ack if state is TCP_CLOSED.
 173  *              Alan Cox        :       Look up device on a retransmit - routes may
 174  *                                      change. Doesn't yet cope with MSS shrink right
 175  *                                      but it's a start!
 176  *              Marc Tamsky     :       Closing in closing fixes.
 177  *              Mike Shaver     :       RFC1122 verifications.
 178  *              Alan Cox        :       rcv_saddr errors.
 179  *              Alan Cox        :       Block double connect().
 180  *              Alan Cox        :       Small hooks for enSKIP.
 181  *              Alexey Kuznetsov:       Path MTU discovery.
 182  *              Alan Cox        :       Support soft errors.
 183  *              Alan Cox        :       Fix MTU discovery pathological case
 184  *                                      when the remote claims no mtu!
 185  *              Marc Tamsky     :       TCP_CLOSE fix.
 186  *              Colin (G3TNE)   :       Send a reset on syn ack replies in
 187  *                                      window but wrong (fixes NT lpd problems)
 188  *              Pedro Roque     :       Better TCP window handling, delayed ack.
 189  *              Joerg Reuter    :       No modification of locked buffers in
 190  *                                      tcp_do_retransmit()
 191  *              Eric Schenk     :       Changed receiver side silly window
 192  *                                      avoidance algorithm to BSD style
 193  *                                      algorithm. This doubles throughput
 194  *                                      against machines running Solaris,
 195  *                                      and seems to result in general
 196  *                                      improvement.
 197  *      Stefan Magdalinski      :       adjusted tcp_readable() to fix FIONREAD
 198  *      Willy Konynenberg       :       Transparent proxying support.
 199  *      Mike McLagan            :       Routing by source
 200  *              Keith Owens     :       Do proper merging with partial SKB's in
 201  *                                      tcp_do_sendmsg to avoid burstiness.
 202  *              Eric Schenk     :       Fix fast close down bug with
 203  *                                      shutdown() followed by close().
 204  *              Andi Kleen      :       Make poll agree with SIGIO
 205  *      Salvatore Sanfilippo    :       Support SO_LINGER with linger == 1 and
 206  *                                      lingertime == 0 (RFC 793 ABORT Call)
 207  *      Hirokazu Takahashi      :       Use copy_from_user() instead of
 208  *                                      csum_and_copy_from_user() if possible.
 209  *
 210  *              This program is free software; you can redistribute it and/or
 211  *              modify it under the terms of the GNU General Public License
 212  *              as published by the Free Software Foundation; either version
 213  *              2 of the License, or(at your option) any later version.
 214  *
 215  * Description of States:
 216  *
 217  *      TCP_SYN_SENT            sent a connection request, waiting for ack
 218  *
 219  *      TCP_SYN_RECV            received a connection request, sent ack,
 220  *                              waiting for final ack in three-way handshake.
 221  *
 222  *      TCP_ESTABLISHED         connection established
 223  *
 224  *      TCP_FIN_WAIT1           our side has shutdown, waiting to complete
 225  *                              transmission of remaining buffered data
 226  *
 227  *      TCP_FIN_WAIT2           all buffered data sent, waiting for remote
 228  *                              to shutdown
 229  *
 230  *      TCP_CLOSING             both sides have shutdown but we still have
 231  *                              data we have to finish sending
 232  *
 233  *      TCP_TIME_WAIT           timeout to catch resent junk before entering
 234  *                              closed, can only be entered from FIN_WAIT2
 235  *                              or CLOSING.  Required because the other end
 236  *                              may not have gotten our last ACK causing it
 237  *                              to retransmit the data packet (which we ignore)
 238  *
 239  *      TCP_CLOSE_WAIT          remote side has shutdown and is waiting for
 240  *                              us to finish writing our data and to shutdown
 241  *                              (we have to close() to move on to LAST_ACK)
 242  *
 243  *      TCP_LAST_ACK            out side has shutdown after remote has
 244  *                              shutdown.  There may still be data in our
 245  *                              buffer that we have to finish sending
 246  *
 247  *      TCP_CLOSE               socket is finished
 248  */
 249
 250 #include <linux/module.h>
 251 #include <linux/types.h>
 252 #include <linux/fcntl.h>
 253 #include <linux/poll.h>
 254 #include <linux/init.h>
 255 #include <linux/fs.h>
 256 #include <linux/skbuff.h>
 257 #include <linux/splice.h>
 258 #include <linux/net.h>
 259 #include <linux/socket.h>
 260 #include <linux/random.h>
 261 #include <linux/bootmem.h>
 262 #include <linux/cache.h>
 263 #include <linux/err.h>
 264 #include <linux/crypto.h>
 265
 266 #include <net/icmp.h>
 267 #include <net/tcp.h>
 268 #include <net/xfrm.h>
 269 #include <net/ip.h>
 270 #include <net/netdma.h>
 271 #include <net/sock.h>
 272
 273 #include <asm/uaccess.h>
 274 #include <asm/ioctls.h>
 275
 276 int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
 277
 278 DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;
 279
 280 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
 281
 282 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 283
 284 int sysctl_tcp_mem[3] __read_mostly;
 285 int sysctl_tcp_wmem[3] __read_mostly;
 286 int sysctl_tcp_rmem[3] __read_mostly;
 287
 288 EXPORT_SYMBOL(sysctl_tcp_mem);
 289 EXPORT_SYMBOL(sysctl_tcp_rmem);
 290 EXPORT_SYMBOL(sysctl_tcp_wmem);
 291
 292 atomic_t tcp_memory_allocated;  /* Current allocated memory. */
 293 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
 294
 295 EXPORT_SYMBOL(tcp_memory_allocated);
 296 EXPORT_SYMBOL(tcp_sockets_allocated);
 297
 298 /*
 299  * TCP splice context
 300  */
 301 struct tcp_splice_state {
 302         struct pipe_inode_info *pipe;
 303         size_t len;
 304         unsigned int flags;
 305 };
 306
 307 /*
 308  * Pressure flag: try to collapse.
 309  * Technical note: it is used by multiple contexts non atomically.
 310  * All the sk_stream_mem_schedule() is of this nature: accounting
 311  * is strict, actions are advisory and have some latency.
 312  */
 313 int tcp_memory_pressure __read_mostly;
 314
 315 EXPORT_SYMBOL(tcp_memory_pressure);
 316
 317 void tcp_enter_memory_pressure(void)
 318 {
 319         if (!tcp_memory_pressure) {
 320                 NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
 321                 tcp_memory_pressure = 1;
 322         }
 323 }
 324
 325 EXPORT_SYMBOL(tcp_enter_memory_pressure);
 326
 327 /* Convert seconds to retransmits based on initial and max timeout */
 328 static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
 329 {
 330         u8 res = 0;
 331
 332         if (seconds > 0) {
 333                 int period = timeout;
 334
 335                 res = 1;
 336                 while (seconds > period && res < 255) {
 337                         res++;
 338                         timeout <<= 1;
 339                         if (timeout > rto_max)
 340                                 timeout = rto_max;
 341                         period += timeout;
 342                 }
 343         }
 344         return res;
 345 }
 346
 347 /* Convert retransmits to seconds based on initial and max timeout */
 348 static int retrans_to_secs(u8 retrans, int timeout, int rto_max)
 349 {
 350         int period = 0;
 351
 352         if (retrans > 0) {
 353                 period = timeout;
 354                 while (--retrans) {
 355                         timeout <<= 1;
 356                         if (timeout > rto_max)
 357                                 timeout = rto_max;
 358                         period += timeout;
 359                 }
 360         }
 361         return period;
 362 }
 363
 364 /*
 365  *      Wait for a TCP event.
 366  *
 367  *      Note that we don't need to lock the socket, as the upper poll layers
 368  *      take care of normal races (between the test and the event) and we don't
 369  *      go look at any of the socket buffers directly.
 370  */
 371 unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
 372 {
 373         unsigned int mask;
 374         struct sock *sk = sock->sk;
 375         struct tcp_sock *tp = tcp_sk(sk);
 376
 377         poll_wait(file, sk->sk_sleep, wait);
 378         if (sk->sk_state == TCP_LISTEN)
 379                 return inet_csk_listen_poll(sk);
 380
 381         /* Socket is not locked. We are protected from async events
 382            by poll logic and correct handling of state changes
 383            made by another threads is impossible in any case.
 384          */
 385
 386         mask = 0;
 387         if (sk->sk_err)
 388                 mask = POLLERR;
 389
 390         /*
 391          * POLLHUP is certainly not done right. But poll() doesn't
 392          * have a notion of HUP in just one direction, and for a
 393          * socket the read side is more interesting.
 394          *
 395          * Some poll() documentation says that POLLHUP is incompatible
 396          * with the POLLOUT/POLLWR flags, so somebody should check this
 397          * all. But careful, it tends to be safer to return too many
 398          * bits than too few, and you can easily break real applications
 399          * if you don't tell them that something has hung up!
 400          *
 401          * Check-me.
 402          *
 403          * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
 404          * our fs/select.c). It means that after we received EOF,
 405          * poll always returns immediately, making impossible poll() on write()
 406          * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
 407          * if and only if shutdown has been made in both directions.
 408          * Actually, it is interesting to look how Solaris and DUX
 409          * solve this dilemma. I would prefer, if PULLHUP were maskable,
 410          * then we could set it on SND_SHUTDOWN. BTW examples given
 411          * in Stevens' books assume exactly this behaviour, it explains
 412          * why PULLHUP is incompatible with POLLOUT.    --ANK
 413          *
 414          * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
 415          * blocking on fresh not-connected or disconnected socket. --ANK
 416          */
 417         if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
 418                 mask |= POLLHUP;
 419         if (sk->sk_shutdown & RCV_SHUTDOWN)
 420                 mask |= POLLIN | POLLRDNORM | POLLRDHUP;
 421
 422         /* Connected? */
 423         if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
 424                 /* Potential race condition. If read of tp below will
 425                  * escape above sk->sk_state, we can be illegally awaken
 426                  * in SYN_* states. */
 427                 if ((tp->rcv_nxt != tp->copied_seq) &&
 428                     (tp->urg_seq != tp->copied_seq ||
 429                      tp->rcv_nxt != tp->copied_seq + 1 ||
 430                      sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
 431                         mask |= POLLIN | POLLRDNORM;
 432
 433                 if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
 434                         if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
 435                                 mask |= POLLOUT | POLLWRNORM;
 436                         } else {  /* send SIGIO later */
 437                                 set_bit(SOCK_ASYNC_NOSPACE,
 438                                         &sk->sk_socket->flags);
 439                                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 440
 441                                 /* Race breaker. If space is freed after
 442                                  * wspace test but before the flags are set,
 443                                  * IO signal will be lost.
 444                                  */
 445                                 if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
 446                                         mask |= POLLOUT | POLLWRNORM;
 447                         }
 448                 } else
 449                         mask |= POLLOUT | POLLWRNORM;
 450
 451                 if (tp->urg_data & TCP_URG_VALID)
 452                         mask |= POLLPRI;
 453         }
 454         return mask;
 455 }
 456
 457 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 458 {
 459         struct tcp_sock *tp = tcp_sk(sk);
 460         int answ;
 461
 462         switch (cmd) {
 463         case SIOCINQ:
 464                 if (sk->sk_state == TCP_LISTEN)
 465                         return -EINVAL;
 466
 467                 lock_sock(sk);
 468                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 469                         answ = 0;
 470                 else if (sock_flag(sk, SOCK_URGINLINE) ||
 471                          !tp->urg_data ||
 472                          before(tp->urg_seq, tp->copied_seq) ||
 473                          !before(tp->urg_seq, tp->rcv_nxt)) {
 474                         answ = tp->rcv_nxt - tp->copied_seq;
 475
 476                         /* Subtract 1, if FIN is in queue. */
 477                         if (answ && !skb_queue_empty(&sk->sk_receive_queue))
 478                                 answ -=
 479                        tcp_hdr((struct sk_buff *)sk->sk_receive_queue.prev)->fin;
 480                 } else
 481                         answ = tp->urg_seq - tp->copied_seq;
 482                 release_sock(sk);
 483                 break;
 484         case SIOCATMARK:
 485                 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
 486                 break;
 487         case SIOCOUTQ:
 488                 if (sk->sk_state == TCP_LISTEN)
 489                         return -EINVAL;
 490
 491                 if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
 492                         answ = 0;
 493                 else
 494                         answ = tp->write_seq - tp->snd_una;
 495                 break;
 496         default:
 497                 return -ENOIOCTLCMD;
 498         }
 499
 500         return put_user(answ, (int __user *)arg);
 501 }
 502
 503 static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
 504 {
 505         TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
 506         tp->pushed_seq = tp->write_seq;
 507 }
 508
 509 static inline int forced_push(struct tcp_sock *tp)
 510 {
 511         return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
 512 }
 513
 514 static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
 515 {
 516         struct tcp_sock *tp = tcp_sk(sk);
 517         struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
 518
 519         skb->csum    = 0;
 520         tcb->seq     = tcb->end_seq = tp->write_seq;
 521         tcb->flags   = TCPCB_FLAG_ACK;
 522         tcb->sacked  = 0;
 523         skb_header_release(skb);
 524         tcp_add_write_queue_tail(sk, skb);
 525         sk_charge_skb(sk, skb);
 526         if (tp->nonagle & TCP_NAGLE_PUSH)
 527                 tp->nonagle &= ~TCP_NAGLE_PUSH;
 528 }
 529
 530 static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
 531                                 struct sk_buff *skb)
 532 {
 533         if (flags & MSG_OOB) {
 534                 tp->urg_mode = 1;
 535                 tp->snd_up = tp->write_seq;
 536                 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
 537         }
 538 }
 539
 540 static inline void tcp_push(struct sock *sk, int flags, int mss_now,
 541                             int nonagle)
 542 {
 543         struct tcp_sock *tp = tcp_sk(sk);
 544
 545         if (tcp_send_head(sk)) {
 546                 struct sk_buff *skb = tcp_write_queue_tail(sk);
 547                 if (!(flags & MSG_MORE) || forced_push(tp))
 548                         tcp_mark_push(tp, skb);
 549                 tcp_mark_urg(tp, flags, skb);
 550                 __tcp_push_pending_frames(sk, mss_now,
 551                                           (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
 552         }
 553 }
 554
 555 int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
 556                          unsigned int offset, size_t len)
 557 {
 558         struct tcp_splice_state *tss = rd_desc->arg.data;
 559         int ret;
 560
 561         ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len),
 562                               tss->flags);
 563         if (ret > 0)
 564                 rd_desc->count -= ret;
 565         return ret;
 566 }
 567
 568 static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
 569 {
 570         /* Store TCP splice context information in read_descriptor_t. */
 571         read_descriptor_t rd_desc = {
 572                 .arg.data = tss,
 573                 .count    = tss->len,
 574         };
 575
 576         return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
 577 }
 578
 579 /**
 580  *  tcp_splice_read - splice data from TCP socket to a pipe
 581  * @sock:       socket to splice from
 582  * @ppos:       position (not valid)
 583  * @pipe:       pipe to splice to
 584  * @len:        number of bytes to splice
 585  * @flags:      splice modifier flags
 586  *
 587  * Description:
 588  *    Will read pages from given socket and fill them into a pipe.
 589  *
 590  **/
 591 ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
 592                         struct pipe_inode_info *pipe, size_t len,
 593                         unsigned int flags)
 594 {
 595         struct sock *sk = sock->sk;
 596         struct tcp_splice_state tss = {
 597                 .pipe = pipe,
 598                 .len = len,
 599                 .flags = flags,
 600         };
 601         long timeo;
 602         ssize_t spliced;
 603         int ret;
 604
 605         /*
 606          * We can't seek on a socket input
 607          */
 608         if (unlikely(*ppos))
 609                 return -ESPIPE;
 610
 611         ret = spliced = 0;
 612
 613         lock_sock(sk);
 614
 615         timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK);
 616         while (tss.len) {
 617                 ret = __tcp_splice_read(sk, &tss);
 618                 if (ret < 0)
 619                         break;
 620                 else if (!ret) {
 621                         if (spliced)
 622                                 break;
 623                         if (flags & SPLICE_F_NONBLOCK) {
 624                                 ret = -EAGAIN;
 625                                 break;
 626                         }
 627                         if (sock_flag(sk, SOCK_DONE))
 628                                 break;
 629                         if (sk->sk_err) {
 630                                 ret = sock_error(sk);
 631                                 break;
 632                         }
 633                         if (sk->sk_shutdown & RCV_SHUTDOWN)
 634                                 break;
 635                         if (sk->sk_state == TCP_CLOSE) {
 636                                 /*
 637                                  * This occurs when user tries to read
 638                                  * from never connected socket.
 639                                  */
 640                                 if (!sock_flag(sk, SOCK_DONE))
 641                                         ret = -ENOTCONN;
 642                                 break;
 643                         }
 644                         if (!timeo) {
 645                                 ret = -EAGAIN;
 646                                 break;
 647                         }
 648                         sk_wait_data(sk, &timeo);
 649                         if (signal_pending(current)) {
 650                                 ret = sock_intr_errno(timeo);
 651                                 break;
 652                         }
 653                         continue;
 654                 }
 655                 tss.len -= ret;
 656                 spliced += ret;
 657
 658                 if (!timeo)
 659                         break;
 660                 release_sock(sk);
 661                 lock_sock(sk);
 662
 663                 if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
 664                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
 665                     signal_pending(current))
 666                         break;
 667         }
 668
 669         release_sock(sk);
 670
 671         if (spliced)
 672                 return spliced;
 673
 674         return ret;
 675 }
 676
 677 struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
 678 {
 679         struct sk_buff *skb;
 680
 681         /* The TCP header must be at least 32-bit aligned.  */
 682         size = ALIGN(size, 4);
 683
 684         skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
 685         if (skb) {
 686                 if (sk_stream_wmem_schedule(sk, skb->truesize)) {
 687                         /*
 688                          * Make sure that we have exactly size bytes
 689                          * available to the caller, no more, no less.
 690                          */
 691                         skb_reserve(skb, skb_tailroom(skb) - size);
 692                         return skb;
 693                 }
 694                 __kfree_skb(skb);
 695         } else {
 696                 sk->sk_prot->enter_memory_pressure();
 697                 sk_stream_moderate_sndbuf(sk);
 698         }
 699         return NULL;
 700 }
 701
 702 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
 703                          size_t psize, int flags)
 704 {
 705         struct tcp_sock *tp = tcp_sk(sk);
 706         int mss_now, size_goal;
 707         int err;
 708         ssize_t copied;
 709         long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 710
 711         /* Wait for a connection to finish. */
 712         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 713                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 714                         goto out_err;
 715
 716         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 717
 718         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 719         size_goal = tp->xmit_size_goal;
 720         copied = 0;
 721
 722         err = -EPIPE;
 723         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 724                 goto do_error;
 725
 726         while (psize > 0) {
 727                 struct sk_buff *skb = tcp_write_queue_tail(sk);
 728                 struct page *page = pages[poffset / PAGE_SIZE];
 729                 int copy, i, can_coalesce;
 730                 int offset = poffset % PAGE_SIZE;
 731                 int size = min_t(size_t, psize, PAGE_SIZE - offset);
 732
 733                 if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) {
 734 new_segment:
 735                         if (!sk_stream_memory_free(sk))
 736                                 goto wait_for_sndbuf;
 737
 738                         skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
 739                         if (!skb)
 740                                 goto wait_for_memory;
 741
 742                         skb_entail(sk, skb);
 743                         copy = size_goal;
 744                 }
 745
 746                 if (copy > size)
 747                         copy = size;
 748
 749                 i = skb_shinfo(skb)->nr_frags;
 750                 can_coalesce = skb_can_coalesce(skb, i, page, offset);
 751                 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
 752                         tcp_mark_push(tp, skb);
 753                         goto new_segment;
 754                 }
 755                 if (!sk_stream_wmem_schedule(sk, copy))
 756                         goto wait_for_memory;
 757
 758                 if (can_coalesce) {
 759                         skb_shinfo(skb)->frags[i - 1].size += copy;
 760                 } else {
 761                         get_page(page);
 762                         skb_fill_page_desc(skb, i, page, offset, copy);
 763                 }
 764
 765                 skb->len += copy;
 766                 skb->data_len += copy;
 767                 skb->truesize += copy;
 768                 sk->sk_wmem_queued += copy;
 769                 sk->sk_forward_alloc -= copy;
 770                 skb->ip_summed = CHECKSUM_PARTIAL;
 771                 tp->write_seq += copy;
 772                 TCP_SKB_CB(skb)->end_seq += copy;
 773                 skb_shinfo(skb)->gso_segs = 0;
 774
 775                 if (!copied)
 776                         TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
 777
 778                 copied += copy;
 779                 poffset += copy;
 780                 if (!(psize -= copy))
 781                         goto out;
 782
 783                 if (skb->len < size_goal || (flags & MSG_OOB))
 784                         continue;
 785
 786                 if (forced_push(tp)) {
 787                         tcp_mark_push(tp, skb);
 788                         __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
 789                 } else if (skb == tcp_send_head(sk))
 790                         tcp_push_one(sk, mss_now);
 791                 continue;
 792
 793 wait_for_sndbuf:
 794                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 795 wait_for_memory:
 796                 if (copied)
 797                         tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
 798
 799                 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
 800                         goto do_error;
 801
 802                 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 803                 size_goal = tp->xmit_size_goal;
 804         }
 805
 806 out:
 807         if (copied)
 808                 tcp_push(sk, flags, mss_now, tp->nonagle);
 809         return copied;
 810
 811 do_error:
 812         if (copied)
 813                 goto out;
 814 out_err:
 815         return sk_stream_error(sk, flags, err);
 816 }
 817
 818 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
 819                      size_t size, int flags)
 820 {
 821         ssize_t res;
 822         struct sock *sk = sock->sk;
 823
 824         if (!(sk->sk_route_caps & NETIF_F_SG) ||
 825             !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
 826                 return sock_no_sendpage(sock, page, offset, size, flags);
 827
 828         lock_sock(sk);
 829         TCP_CHECK_TIMER(sk);
 830         res = do_tcp_sendpages(sk, &page, offset, size, flags);
 831         TCP_CHECK_TIMER(sk);
 832         release_sock(sk);
 833         return res;
 834 }
 835
 836 #define TCP_PAGE(sk)    (sk->sk_sndmsg_page)
 837 #define TCP_OFF(sk)     (sk->sk_sndmsg_off)
 838
 839 static inline int select_size(struct sock *sk)
 840 {
 841         struct tcp_sock *tp = tcp_sk(sk);
 842         int tmp = tp->mss_cache;
 843
 844         if (sk->sk_route_caps & NETIF_F_SG) {
 845                 if (sk_can_gso(sk))
 846                         tmp = 0;
 847                 else {
 848                         int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
 849
 850                         if (tmp >= pgbreak &&
 851                             tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
 852                                 tmp = pgbreak;
 853                 }
 854         }
 855
 856         return tmp;
 857 }
 858
 859 int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 860                 size_t size)
 861 {
 862         struct sock *sk = sock->sk;
 863         struct iovec *iov;
 864         struct tcp_sock *tp = tcp_sk(sk);
 865         struct sk_buff *skb;
 866         int iovlen, flags;
 867         int mss_now, size_goal;
 868         int err, copied;
 869         long timeo;
 870
 871         lock_sock(sk);
 872         TCP_CHECK_TIMER(sk);
 873
 874         flags = msg->msg_flags;
 875         timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 876
 877         /* Wait for a connection to finish. */
 878         if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 879                 if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
 880                         goto out_err;
 881
 882         /* This should be in poll */
 883         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 884
 885         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
 886         size_goal = tp->xmit_size_goal;
 887
 888         /* Ok commence sending. */
 889         iovlen = msg->msg_iovlen;
 890         iov = msg->msg_iov;
 891         copied = 0;
 892
 893         err = -EPIPE;
 894         if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
 895                 goto do_error;
 896
 897         while (--iovlen >= 0) {
 898                 int seglen = iov->iov_len;
 899                 unsigned char __user *from = iov->iov_base;
 900
 901                 iov++;
 902
 903                 while (seglen > 0) {
 904                         int copy = 0;
 905                         int max = size_goal;
 906
 907                         skb = tcp_write_queue_tail(sk);
 908                         if (tcp_send_head(sk)) {
 909                                 if (skb->ip_summed == CHECKSUM_NONE)
 910                                         max = mss_now;
 911                                 copy = max - skb->len;
 912                         }
 913
 914                         if (copy <= 0) {
 915 new_segment:
 916                                 /* Allocate new segment. If the interface is SG,
 917                                  * allocate skb fitting to single page.
 918                                  */
 919                                 if (!sk_stream_memory_free(sk))
 920                                         goto wait_for_sndbuf;
 921
 922                                 skb = sk_stream_alloc_skb(sk, select_size(sk),
 923                                                 sk->sk_allocation);
 924                                 if (!skb)
 925                                         goto wait_for_memory;
 926
 927                                 /*
 928                                  * Check whether we can use HW checksum.
 929                                  */
 930                                 if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
 931                                         skb->ip_summed = CHECKSUM_PARTIAL;
 932
 933                                 skb_entail(sk, skb);
 934                                 copy = size_goal;
 935                                 max = size_goal;
 936                         }
 937
 938                         /* Try to append data to the end of skb. */
 939                         if (copy > seglen)
 940                                 copy = seglen;
 941
 942                         /* Where to copy to? */
 943                         if (skb_tailroom(skb) > 0) {
 944                                 /* We have some space in skb head. Superb! */
 945                                 if (copy > skb_tailroom(skb))
 946                                         copy = skb_tailroom(skb);
 947                                 if ((err = skb_add_data(skb, from, copy)) != 0)
 948                                         goto do_fault;
 949                         } else {
 950                                 int merge = 0;
 951                                 int i = skb_shinfo(skb)->nr_frags;
 952                                 struct page *page = TCP_PAGE(sk);
 953                                 int off = TCP_OFF(sk);
 954
 955                                 if (skb_can_coalesce(skb, i, page, off) &&
 956                                     off != PAGE_SIZE) {
 957                                         /* We can extend the last page
 958                                          * fragment. */
 959                                         merge = 1;
 960                                 } else if (i == MAX_SKB_FRAGS ||
 961                                            (!i &&
 962                                            !(sk->sk_route_caps & NETIF_F_SG))) {
 963                                         /* Need to add new fragment and cannot
 964                                          * do this because interface is non-SG,
 965                                          * or because all the page slots are
 966                                          * busy. */
 967                                         tcp_mark_push(tp, skb);
 968                                         goto new_segment;
 969                                 } else if (page) {
 970                                         if (off == PAGE_SIZE) {
 971                                                 put_page(page);
 972                                                 TCP_PAGE(sk) = page = NULL;
 973                                                 off = 0;
 974                                         }
 975                                 } else
 976                                         off = 0;
 977
 978                                 if (copy > PAGE_SIZE - off)
 979                                         copy = PAGE_SIZE - off;
 980
 981                                 if (!sk_stream_wmem_schedule(sk, copy))
 982                                         goto wait_for_memory;
 983
 984                                 if (!page) {
 985                                         /* Allocate new cache page. */
 986                                         if (!(page = sk_stream_alloc_page(sk)))
 987                                                 goto wait_for_memory;
 988                                 }
 989
 990                                 /* Time to copy data. We are close to
 991                                  * the end! */
 992                                 err = skb_copy_to_page(sk, from, skb, page,
 993                                                        off, copy);
 994                                 if (err) {
 995                                         /* If this page was new, give it to the
 996                                          * socket so it does not get leaked.
 997                                          */
 998                                         if (!TCP_PAGE(sk)) {
 999                                                 TCP_PAGE(sk) = page;
1000                                                 TCP_OFF(sk) = 0;
1001                                         }
1002                                         goto do_error;
1003                                 }
1004
1005                                 /* Update the skb. */
1006                                 if (merge) {
1007                                         skb_shinfo(skb)->frags[i - 1].size +=
1008                                                                         copy;
1009                                 } else {
1010                                         skb_fill_page_desc(skb, i, page, off, copy);
1011                                         if (TCP_PAGE(sk)) {
1012                                                 get_page(page);
1013                                         } else if (off + copy < PAGE_SIZE) {
1014                                                 get_page(page);
1015                                                 TCP_PAGE(sk) = page;
1016                                         }
1017                                 }
1018
1019                                 TCP_OFF(sk) = off + copy;
1020                         }
1021
1022                         if (!copied)
1023                                 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
1024
1025                         tp->write_seq += copy;
1026                         TCP_SKB_CB(skb)->end_seq += copy;
1027                         skb_shinfo(skb)->gso_segs = 0;
1028
1029                         from += copy;
1030                         copied += copy;
1031                         if ((seglen -= copy) == 0 && iovlen == 0)
1032                                 goto out;
1033
1034                         if (skb->len < max || (flags & MSG_OOB))
1035                                 continue;
1036
1037 #ifdef CONFIG_INET_GSO
1038                         if (iov->iov_len > PAGE_SIZE)
1039                                 continue;
1040 #endif /* CONFIG_INET_GSO */
1041
1042                         if (forced_push(tp)) {
1043                                 tcp_mark_push(tp, skb);
1044                                 __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
1045                         } else if (skb == tcp_send_head(sk))
1046                                 tcp_push_one(sk, mss_now);
1047                         continue;
1048
1049 wait_for_sndbuf:
1050                         set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1051 wait_for_memory:
1052                         if (copied)
1053                                 tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
1054
1055                         if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
1056                                 goto do_error;
1057
1058                         mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
1059                         size_goal = tp->xmit_size_goal;
1060                 }
1061         }
1062
1063 out:
1064         if (copied)
1065                 tcp_push(sk, flags, mss_now, tp->nonagle);
1066         TCP_CHECK_TIMER(sk);
1067         release_sock(sk);
1068         return copied;
1069
1070 do_fault:
1071         if (!skb->len) {
1072                 tcp_unlink_write_queue(skb, sk);
1073                 /* It is the one place in all of TCP, except connection
1074                  * reset, where we can be unlinking the send_head.
1075                  */
1076                 tcp_check_send_head(sk, skb);
1077                 sk_stream_free_skb(sk, skb);
1078         }
1079
1080 do_error:
1081         if (copied)
1082                 goto out;
1083 out_err:
1084         err = sk_stream_error(sk, flags, err);
1085         TCP_CHECK_TIMER(sk);
1086         release_sock(sk);
1087         return err;
1088 }
1089
1090 /*
1091  *      Handle reading urgent data. BSD has very simple semantics for
1092  *      this, no blocking and very strange errors 8)
1093  */
1094
1095 static int tcp_recv_urg(struct sock *sk, long timeo,
1096                         struct msghdr *msg, int len, int flags,
1097                         int *addr_len)
1098 {
1099         struct tcp_sock *tp = tcp_sk(sk);
1100
1101         /* No URG data to read. */
1102         if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
1103             tp->urg_data == TCP_URG_READ)
1104                 return -EINVAL; /* Yes this is right ! */
1105
1106         if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
1107                 return -ENOTCONN;
1108
1109         if (tp->urg_data & TCP_URG_VALID) {
1110                 int err = 0;
1111                 char c = tp->urg_data;
1112
1113                 if (!(flags & MSG_PEEK))
1114                         tp->urg_data = TCP_URG_READ;
1115
1116                 /* Read urgent data. */
1117                 msg->msg_flags |= MSG_OOB;
1118
1119                 if (len > 0) {
1120                         if (!(flags & MSG_TRUNC))
1121                                 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1122                         len = 1;
1123                 } else
1124                         msg->msg_flags |= MSG_TRUNC;
1125
1126                 return err ? -EFAULT : len;
1127         }
1128
1129         if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
1130                 return 0;
1131
1132         /* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1133          * the available implementations agree in this case:
1134          * this call should never block, independent of the
1135          * blocking state of the socket.
1136          * Mike <pall@rz.uni-karlsruhe.de>
1137          */
1138         return -EAGAIN;
1139 }
1140
1141 /* Clean up the receive buffer for full frames taken by the user,
1142  * then send an ACK if necessary.  COPIED is the number of bytes
1143  * tcp_recvmsg has given to the user so far, it speeds up the
1144  * calculation of whether or not we must ACK for the sake of
1145  * a window update.
1146  */
1147 void tcp_cleanup_rbuf(struct sock *sk, int copied)
1148 {
1149         struct tcp_sock *tp = tcp_sk(sk);
1150         int time_to_ack = 0;
1151
1152 #if TCP_DEBUG
1153         struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1154
1155         BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1156 #endif
1157
1158         if (inet_csk_ack_scheduled(sk)) {
1159                 const struct inet_connection_sock *icsk = inet_csk(sk);
1160                    /* Delayed ACKs frequently hit locked sockets during bulk
1161                     * receive. */
1162                 if (icsk->icsk_ack.blocked ||
1163                     /* Once-per-two-segments ACK was not sent by tcp_input.c */
1164                     tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1165                     /*
1166                      * If this read emptied read buffer, we send ACK, if
1167                      * connection is not bidirectional, user drained
1168                      * receive buffer and there was a small segment
1169                      * in queue.
1170                      */
1171                     (copied > 0 &&
1172                      ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1173                       ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1174                        !icsk->icsk_ack.pingpong)) &&
1175                       !atomic_read(&sk->sk_rmem_alloc)))
1176                         time_to_ack = 1;
1177         }
1178
1179         /* We send an ACK if we can now advertise a non-zero window
1180          * which has been raised "significantly".
1181          *
1182          * Even if window raised up to infinity, do not send window open ACK
1183          * in states, where we will not receive more. It is useless.
1184          */
1185         if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1186                 __u32 rcv_window_now = tcp_receive_window(tp);
1187
1188                 /* Optimize, __tcp_select_window() is not cheap. */
1189                 if (2*rcv_window_now <= tp->window_clamp) {
1190                         __u32 new_window = __tcp_select_window(sk);
1191
1192                         /* Send ACK now, if this read freed lots of space
1193                          * in our buffer. Certainly, new_window is new window.
1194                          * We can advertise it now, if it is not less than current one.
1195                          * "Lots" means "at least twice" here.
1196                          */
1197                         if (new_window && new_window >= 2 * rcv_window_now)
1198                                 time_to_ack = 1;
1199                 }
1200         }
1201         if (time_to_ack)
1202                 tcp_send_ack(sk);
1203 }
1204
1205 static void tcp_prequeue_process(struct sock *sk)
1206 {
1207         struct sk_buff *skb;
1208         struct tcp_sock *tp = tcp_sk(sk);
1209
1210         NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);
1211
1212         /* RX process wants to run with disabled BHs, though it is not
1213          * necessary */
1214         local_bh_disable();
1215         while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1216                 sk->sk_backlog_rcv(sk, skb);
1217         local_bh_enable();
1218
1219         /* Clear memory counter. */
1220         tp->ucopy.memory = 0;
1221 }
1222
1223 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1224 {
1225         struct sk_buff *skb;
1226         u32 offset;
1227
1228         skb_queue_walk(&sk->sk_receive_queue, skb) {
1229                 offset = seq - TCP_SKB_CB(skb)->seq;
1230                 if (tcp_hdr(skb)->syn)
1231                         offset--;
1232                 if (offset < skb->len || tcp_hdr(skb)->fin) {
1233                         *off = offset;
1234                         return skb;
1235                 }
1236         }
1237         return NULL;
1238 }
1239
1240 /*
1241  * This routine provides an alternative to tcp_recvmsg() for routines
1242  * that would like to handle copying from skbuffs directly in 'sendfile'
1243  * fashion.
1244  * Note:
1245  *      - It is assumed that the socket was locked by the caller.
1246  *      - The routine does not block.
1247  *      - At present, there is no support for reading OOB data
1248  *        or for 'peeking' the socket using this routine
1249  *        (although both would be easy to implement).
1250  */
1251 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1252                   sk_read_actor_t recv_actor)
1253 {
1254         struct sk_buff *skb;
1255         struct tcp_sock *tp = tcp_sk(sk);
1256         u32 seq = tp->copied_seq;
1257         u32 offset;
1258         int copied = 0;
1259
1260         if (sk->sk_state == TCP_LISTEN)
1261                 return -ENOTCONN;
1262         while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1263                 if (offset < skb->len) {
1264                         size_t used, len;
1265
1266                         len = skb->len - offset;
1267                         /* Stop reading if we hit a patch of urgent data */
1268                         if (tp->urg_data) {
1269                                 u32 urg_offset = tp->urg_seq - seq;
1270                                 if (urg_offset < len)
1271                                         len = urg_offset;
1272                                 if (!len)
1273                                         break;
1274                         }
1275                         used = recv_actor(desc, skb, offset, len);
1276                         if (used < 0) {
1277                                 if (!copied)
1278                                         copied = used;
1279                                 break;
1280                         } else if (used <= len) {
1281                                 seq += used;
1282                                 copied += used;
1283                                 offset += used;
1284                         }
1285                         /*
1286                          * If recv_actor drops the lock (e.g. TCP splice
1287                          * receive) the skb pointer might be invalid when
1288                          * getting here: tcp_collapse might have deleted it
1289                          * while aggregating skbs from the socket queue.
1290                          */
1291                         skb = tcp_recv_skb(sk, seq-1, &offset);
1292                         if (!skb || (offset+1 != skb->len))
1293                                 break;
1294                 }
1295                 if (tcp_hdr(skb)->fin) {
1296                         sk_eat_skb(sk, skb, 0);
1297                         ++seq;
1298                         break;
1299                 }
1300                 sk_eat_skb(sk, skb, 0);
1301                 if (!desc->count)
1302                         break;
1303                 tp->copied_seq = seq;
1304         }
1305         tp->copied_seq = seq;
1306
1307         tcp_rcv_space_adjust(sk);
1308
1309         /* Clean up data we have read: This will do ACK frames. */
1310         if (copied > 0)
1311                 tcp_cleanup_rbuf(sk, copied);
1312         return copied;
1313 }
1314
1315 /*
1316  *      This routine copies from a sock struct into the user buffer.
1317  *
1318  *      Technical note: in 2.3 we work on _locked_ socket, so that
1319  *      tricks with *seq access order and skb->users are not required.
1320  *      Probably, code can be easily improved even more.
1321  */
1322
1323 int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
1324                 size_t len, int nonblock, int flags, int *addr_len)
1325 {
1326         struct tcp_sock *tp = tcp_sk(sk);
1327         int copied = 0;
1328         u32 peek_seq;
1329         u32 *seq;
1330         unsigned long used;
1331         int err;
1332         int target;             /* Read at least this many bytes */
1333         long timeo;
1334         struct task_struct *user_recv = NULL;
1335         int copied_early = 0;
1336         u32 urg_hole = 0;
1337
1338         lock_sock(sk);
1339
1340         TCP_CHECK_TIMER(sk);
1341
1342         err = -ENOTCONN;
1343         if (sk->sk_state == TCP_LISTEN)
1344                 goto out;
1345
1346         timeo = sock_rcvtimeo(sk, nonblock);
1347
1348         /* Urgent data needs to be handled specially. */
1349         if (flags & MSG_OOB)
1350                 goto recv_urg;
1351
1352         seq = &tp->copied_seq;
1353         if (flags & MSG_PEEK) {
1354                 peek_seq = tp->copied_seq;
1355                 seq = &peek_seq;
1356         }
1357
1358         target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1359
1360 #ifdef CONFIG_NET_DMA
1361         tp->ucopy.dma_chan = NULL;
1362         preempt_disable();
1363         if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) &&
1364             !sysctl_tcp_low_latency && __get_cpu_var(softnet_data).net_dma) {
1365                 preempt_enable_no_resched();
1366                 tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len);
1367         } else
1368                 preempt_enable_no_resched();
1369 #endif
1370
1371         do {
1372                 struct sk_buff *skb;
1373                 u32 offset;
1374
1375                 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1376                 if (tp->urg_data && tp->urg_seq == *seq) {
1377                         if (copied)
1378                                 break;
1379                         if (signal_pending(current)) {
1380                                 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1381                                 break;
1382                         }
1383                 }
1384
1385                 /* Next get a buffer. */
1386
1387                 skb = skb_peek(&sk->sk_receive_queue);
1388                 do {
1389                         if (!skb)
1390                                 break;
1391
1392                         /* Now that we have two receive queues this
1393                          * shouldn't happen.
1394                          */
1395                         if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1396                                 printk(KERN_INFO "recvmsg bug: copied %X "
1397                                        "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
1398                                 break;
1399                         }
1400                         offset = *seq - TCP_SKB_CB(skb)->seq;
1401                         if (tcp_hdr(skb)->syn)
1402                                 offset--;
1403                         if (offset < skb->len)
1404                                 goto found_ok_skb;
1405                         if (tcp_hdr(skb)->fin)
1406                                 goto found_fin_ok;
1407                         BUG_TRAP(flags & MSG_PEEK);
1408                         skb = skb->next;
1409                 } while (skb != (struct sk_buff *)&sk->sk_receive_queue);
1410
1411                 /* Well, if we have backlog, try to process it now yet. */
1412
1413                 if (copied >= target && !sk->sk_backlog.tail)
1414                         break;
1415
1416                 if (copied) {
1417                         if (sk->sk_err ||
1418                             sk->sk_state == TCP_CLOSE ||
1419                             (sk->sk_shutdown & RCV_SHUTDOWN) ||
1420                             !timeo ||
1421                             signal_pending(current))
1422                                 break;
1423                 } else {
1424                         if (sock_flag(sk, SOCK_DONE))
1425                                 break;
1426
1427                         if (sk->sk_err) {
1428                                 copied = sock_error(sk);
1429                                 break;
1430                         }
1431
1432                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1433                                 break;
1434
1435                         if (sk->sk_state == TCP_CLOSE) {
1436                                 if (!sock_flag(sk, SOCK_DONE)) {
1437                                         /* This occurs when user tries to read
1438                                          * from never connected socket.
1439                                          */
1440                                         copied = -ENOTCONN;
1441                                         break;
1442                                 }
1443                                 break;
1444                         }
1445
1446                         if (!timeo) {
1447                                 copied = -EAGAIN;
1448                                 break;
1449                         }
1450
1451                         if (signal_pending(current)) {
1452                                 copied = sock_intr_errno(timeo);
1453                                 break;
1454                         }
1455                 }
1456
1457                 tcp_cleanup_rbuf(sk, copied);
1458
1459                 if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
1460                         /* Install new reader */
1461                         if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
1462                                 user_recv = current;
1463                                 tp->ucopy.task = user_recv;
1464                                 tp->ucopy.iov = msg->msg_iov;
1465                         }
1466
1467                         tp->ucopy.len = len;
1468
1469                         BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
1470                                  (flags & (MSG_PEEK | MSG_TRUNC)));
1471
1472                         /* Ugly... If prequeue is not empty, we have to
1473                          * process it before releasing socket, otherwise
1474                          * order will be broken at second iteration.
1475                          * More elegant solution is required!!!
1476                          *
1477                          * Look: we have the following (pseudo)queues:
1478                          *
1479                          * 1. packets in flight
1480                          * 2. backlog
1481                          * 3. prequeue
1482                          * 4. receive_queue
1483                          *
1484                          * Each queue can be processed only if the next ones
1485                          * are empty. At this point we have empty receive_queue.
1486                          * But prequeue _can_ be not empty after 2nd iteration,
1487                          * when we jumped to start of loop because backlog
1488                          * processing added something to receive_queue.
1489                          * We cannot release_sock(), because backlog contains
1490                          * packets arrived _after_ prequeued ones.
1491                          *
1492                          * Shortly, algorithm is clear --- to process all
1493                          * the queues in order. We could make it more directly,
1494                          * requeueing packets from backlog to prequeue, if
1495                          * is not empty. It is more elegant, but eats cycles,
1496                          * unfortunately.
1497                          */
1498                         if (!skb_queue_empty(&tp->ucopy.prequeue))
1499                                 goto do_prequeue;
1500
1501                         /* __ Set realtime policy in scheduler __ */
1502                 }
1503
1504                 if (copied >= target) {
1505                         /* Do not sleep, just process backlog. */
1506                         release_sock(sk);
1507                         lock_sock(sk);
1508                 } else
1509                         sk_wait_data(sk, &timeo);
1510
1511 #ifdef CONFIG_NET_DMA
1512                 tp->ucopy.wakeup = 0;
1513 #endif
1514
1515                 if (user_recv) {
1516                         int chunk;
1517
1518                         /* __ Restore normal policy in scheduler __ */
1519
1520                         if ((chunk = len - tp->ucopy.len) != 0) {
1521                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
1522                                 len -= chunk;
1523                                 copied += chunk;
1524                         }
1525
1526                         if (tp->rcv_nxt == tp->copied_seq &&
1527                             !skb_queue_empty(&tp->ucopy.prequeue)) {
1528 do_prequeue:
1529                                 tcp_prequeue_process(sk);
1530
1531                                 if ((chunk = len - tp->ucopy.len) != 0) {
1532                                         NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1533                                         len -= chunk;
1534                                         copied += chunk;
1535                                 }
1536                         }
1537                 }
1538                 if ((flags & MSG_PEEK) &&
1539                     (peek_seq - copied - urg_hole != tp->copied_seq)) {
1540                         if (net_ratelimit())
1541                                 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1542                                        current->comm, current->pid);
1543                         peek_seq = tp->copied_seq;
1544                 }
1545                 continue;
1546
1547         found_ok_skb:
1548                 /* Ok so how much can we use? */
1549                 used = skb->len - offset;
1550                 if (len < used)
1551                         used = len;
1552
1553                 /* Do we have urgent data here? */
1554                 if (tp->urg_data) {
1555                         u32 urg_offset = tp->urg_seq - *seq;
1556                         if (urg_offset < used) {
1557                                 if (!urg_offset) {
1558                                         if (!sock_flag(sk, SOCK_URGINLINE)) {
1559                                                 ++*seq;
1560                                                 urg_hole++;
1561                                                 offset++;
1562                                                 used--;
1563                                                 if (!used)
1564                                                         goto skip_copy;
1565                                         }
1566                                 } else
1567                                         used = urg_offset;
1568                         }
1569                 }
1570
1571                 if (!(flags & MSG_TRUNC)) {
1572 #ifdef CONFIG_NET_DMA
1573                         if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1574                                 tp->ucopy.dma_chan = get_softnet_dma();
1575
1576                         if (tp->ucopy.dma_chan) {
1577                                 tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
1578                                         tp->ucopy.dma_chan, skb, offset,
1579                                         msg->msg_iov, used,
1580                                         tp->ucopy.pinned_list);
1581
1582                                 if (tp->ucopy.dma_cookie < 0) {
1583
1584                                         printk(KERN_ALERT "dma_cookie < 0\n");
1585
1586                                         /* Exception. Bailout! */
1587                                         if (!copied)
1588                                                 copied = -EFAULT;
1589                                         break;
1590                                 }
1591                                 if ((offset + used) == skb->len)
1592                                         copied_early = 1;
1593
1594                         } else
1595 #endif
1596                         {
1597                                 err = skb_copy_datagram_iovec(skb, offset,
1598                                                 msg->msg_iov, used);
1599                                 if (err) {
1600                                         /* Exception. Bailout! */
1601                                         if (!copied)
1602                                                 copied = -EFAULT;
1603                                         break;
1604                                 }
1605                         }
1606                 }
1607
1608                 *seq += used;
1609                 copied += used;
1610                 len -= used;
1611
1612                 tcp_rcv_space_adjust(sk);
1613
1614 skip_copy:
1615                 if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
1616                         tp->urg_data = 0;
1617                         tcp_fast_path_check(sk);
1618                 }
1619                 if (used + offset < skb->len)
1620                         continue;
1621
1622                 if (tcp_hdr(skb)->fin)
1623                         goto found_fin_ok;
1624                 if (!(flags & MSG_PEEK)) {
1625                         sk_eat_skb(sk, skb, copied_early);
1626                         copied_early = 0;
1627                 }
1628                 continue;
1629
1630         found_fin_ok:
1631                 /* Process the FIN. */
1632                 ++*seq;
1633                 if (!(flags & MSG_PEEK)) {
1634                         sk_eat_skb(sk, skb, copied_early);
1635                         copied_early = 0;
1636                 }
1637                 break;
1638         } while (len > 0);
1639
1640         if (user_recv) {
1641                 if (!skb_queue_empty(&tp->ucopy.prequeue)) {
1642                         int chunk;
1643
1644                         tp->ucopy.len = copied > 0 ? len : 0;
1645
1646                         tcp_prequeue_process(sk);
1647
1648                         if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1649                                 NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
1650                                 len -= chunk;
1651                                 copied += chunk;
1652                         }
1653                 }
1654
1655                 tp->ucopy.task = NULL;
1656                 tp->ucopy.len = 0;
1657         }
1658
1659 #ifdef CONFIG_NET_DMA
1660         if (tp->ucopy.dma_chan) {
1661                 struct sk_buff *skb;
1662                 dma_cookie_t done, used;
1663
1664                 dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
1665
1666                 while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
1667                                                  tp->ucopy.dma_cookie, &done,
1668                                                  &used) == DMA_IN_PROGRESS) {
1669                         /* do partial cleanup of sk_async_wait_queue */
1670                         while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
1671                                (dma_async_is_complete(skb->dma_cookie, done,
1672                                                       used) == DMA_SUCCESS)) {
1673                                 __skb_dequeue(&sk->sk_async_wait_queue);
1674                                 kfree_skb(skb);
1675                         }
1676                 }
1677
1678                 /* Safe to free early-copied skbs now */
1679                 __skb_queue_purge(&sk->sk_async_wait_queue);
1680                 dma_chan_put(tp->ucopy.dma_chan);
1681                 tp->ucopy.dma_chan = NULL;
1682         }
1683         if (tp->ucopy.pinned_list) {
1684                 dma_unpin_iovec_pages(tp->ucopy.pinned_list);
1685                 tp->ucopy.pinned_list = NULL;
1686         }
1687 #endif
1688
1689         /* According to UNIX98, msg_name/msg_namelen are ignored
1690          * on connected socket. I was just happy when found this 8) --ANK
1691          */
1692
1693         /* Clean up data we have read: This will do ACK frames. */
1694         tcp_cleanup_rbuf(sk, copied);
1695
1696         TCP_CHECK_TIMER(sk);
1697         release_sock(sk);
1698         return copied;
1699
1700 out:
1701         TCP_CHECK_TIMER(sk);
1702         release_sock(sk);
1703         return err;
1704
1705 recv_urg:
1706         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1707         goto out;
1708 }
1709
1710 void tcp_set_state(struct sock *sk, int state)
1711 {
1712         int oldstate = sk->sk_state;
1713
1714         switch (state) {
1715         case TCP_ESTABLISHED:
1716                 if (oldstate != TCP_ESTABLISHED)
1717                         TCP_INC_STATS(TCP_MIB_CURRESTAB);
1718                 break;
1719
1720         case TCP_CLOSE:
1721                 if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
1722                         TCP_INC_STATS(TCP_MIB_ESTABRESETS);
1723
1724                 sk->sk_prot->unhash(sk);
1725                 if (inet_csk(sk)->icsk_bind_hash &&
1726                     !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
1727                         inet_put_port(&tcp_hashinfo, sk);
1728                 /* fall through */
1729         default:
1730                 if (oldstate==TCP_ESTABLISHED)
1731                         TCP_DEC_STATS(TCP_MIB_CURRESTAB);
1732         }
1733
1734         /* Change state AFTER socket is unhashed to avoid closed
1735          * socket sitting in hash tables.
1736          */
1737         sk->sk_state = state;
1738
1739 #ifdef STATE_TRACE
1740         SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n",sk, statename[oldstate],statename[state]);
1741 #endif
1742 }
1743 EXPORT_SYMBOL_GPL(tcp_set_state);
1744
1745 /*
1746  *      State processing on a close. This implements the state shift for
1747  *      sending our FIN frame. Note that we only send a FIN for some
1748  *      states. A shutdown() may have already sent the FIN, or we may be
1749  *      closed.
1750  */
1751
1752 static const unsigned char new_state[16] = {
1753   /* current state:        new state:      action:      */
1754   /* (Invalid)          */ TCP_CLOSE,
1755   /* TCP_ESTABLISHED    */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1756   /* TCP_SYN_SENT       */ TCP_CLOSE,
1757   /* TCP_SYN_RECV       */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1758   /* TCP_FIN_WAIT1      */ TCP_FIN_WAIT1,
1759   /* TCP_FIN_WAIT2      */ TCP_FIN_WAIT2,
1760   /* TCP_TIME_WAIT      */ TCP_CLOSE,
1761   /* TCP_CLOSE          */ TCP_CLOSE,
1762   /* TCP_CLOSE_WAIT     */ TCP_LAST_ACK  | TCP_ACTION_FIN,
1763   /* TCP_LAST_ACK       */ TCP_LAST_ACK,
1764   /* TCP_LISTEN         */ TCP_CLOSE,
1765   /* TCP_CLOSING        */ TCP_CLOSING,
1766 };
1767
1768 static int tcp_close_state(struct sock *sk)
1769 {
1770         int next = (int)new_state[sk->sk_state];
1771         int ns = next & TCP_STATE_MASK;
1772
1773         tcp_set_state(sk, ns);
1774
1775         return next & TCP_ACTION_FIN;
1776 }
1777
1778 /*
1779  *      Shutdown the sending side of a connection. Much like close except
1780  *      that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
1781  */
1782
1783 void tcp_shutdown(struct sock *sk, int how)
1784 {
1785         /*      We need to grab some memory, and put together a FIN,
1786          *      and then put it into the queue to be sent.
1787          *              Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1788          */
1789         if (!(how & SEND_SHUTDOWN))
1790                 return;
1791
1792         /* If we've already sent a FIN, or it's a closed state, skip this. */
1793         if ((1 << sk->sk_state) &
1794             (TCPF_ESTABLISHED | TCPF_SYN_SENT |
1795              TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
1796                 /* Clear out any half completed packets.  FIN if needed. */
1797                 if (tcp_close_state(sk))
1798                         tcp_send_fin(sk);
1799         }
1800 }
1801
1802 void tcp_close(struct sock *sk, long timeout)
1803 {
1804         struct sk_buff *skb;
1805         int data_was_unread = 0;
1806         int state;
1807
1808         lock_sock(sk);
1809         sk->sk_shutdown = SHUTDOWN_MASK;
1810
1811         if (sk->sk_state == TCP_LISTEN) {
1812                 tcp_set_state(sk, TCP_CLOSE);
1813
1814                 /* Special case. */
1815                 inet_csk_listen_stop(sk);
1816
1817                 goto adjudge_to_death;
1818         }
1819
1820         /*  We need to flush the recv. buffs.  We do this only on the
1821          *  descriptor close, not protocol-sourced closes, because the
1822          *  reader process may not have drained the data yet!
1823          */
1824         while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
1825                 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
1826                           tcp_hdr(skb)->fin;
1827                 data_was_unread += len;
1828                 __kfree_skb(skb);
1829         }
1830
1831         sk_stream_mem_reclaim(sk);
1832
1833         /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
1834         if (sk->sk_state == TCP_CLOSE)
1835                 goto adjudge_to_death;
1836
1837         /* As outlined in RFC 2525, section 2.17, we send a RST here because
1838          * data was lost. To witness the awful effects of the old behavior of
1839          * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
1840          * GET in an FTP client, suspend the process, wait for the client to
1841          * advertise a zero window, then kill -9 the FTP client, wheee...
1842          * Note: timeout is always zero in such a case.
1843          */
1844         if (data_was_unread) {
1845                 /* Unread data was tossed, zap the connection. */
1846                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
1847                 tcp_set_state(sk, TCP_CLOSE);
1848                 tcp_send_active_reset(sk, GFP_KERNEL);
1849         } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
1850                 /* Check zero linger _after_ checking for unread data. */
1851                 sk->sk_prot->disconnect(sk, 0);
1852                 NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
1853         } else if (tcp_close_state(sk)) {
1854                 /* We FIN if the application ate all the data before
1855                  * zapping the connection.
1856                  */
1857
1858                 /* RED-PEN. Formally speaking, we have broken TCP state
1859                  * machine. State transitions:
1860                  *
1861                  * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1862                  * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1863                  * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1864                  *
1865                  * are legal only when FIN has been sent (i.e. in window),
1866                  * rather than queued out of window. Purists blame.
1867                  *
1868                  * F.e. "RFC state" is ESTABLISHED,
1869                  * if Linux state is FIN-WAIT-1, but FIN is still not sent.
1870                  *
1871                  * The visible declinations are that sometimes
1872                  * we enter time-wait state, when it is not required really
1873                  * (harmless), do not send active resets, when they are
1874                  * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
1875                  * they look as CLOSING or LAST_ACK for Linux)
1876                  * Probably, I missed some more holelets.
1877                  *                                              --ANK
1878                  */
1879                 tcp_send_fin(sk);
1880         }
1881
1882         sk_stream_wait_close(sk, timeout);
1883
1884 adjudge_to_death:
1885         state = sk->sk_state;
1886         sock_hold(sk);
1887         sock_orphan(sk);
1888         atomic_inc(sk->sk_prot->orphan_count);
1889
1890         /* It is the last release_sock in its life. It will remove backlog. */
1891         release_sock(sk);
1892
1893
1894         /* Now socket is owned by kernel and we acquire BH lock
1895            to finish close. No need to check for user refs.
1896          */
1897         local_bh_disable();
1898         bh_lock_sock(sk);
1899         BUG_TRAP(!sock_owned_by_user(sk));
1900
1901         /* Have we already been destroyed by a softirq or backlog? */
1902         if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
1903                 goto out;
1904
1905         /*      This is a (useful) BSD violating of the RFC. There is a
1906          *      problem with TCP as specified in that the other end could
1907          *      keep a socket open forever with no application left this end.
1908          *      We use a 3 minute timeout (about the same as BSD) then kill
1909          *      our end. If they send after that then tough - BUT: long enough
1910          *      that we won't make the old 4*rto = almost no time - whoops
1911          *      reset mistake.
1912          *
1913          *      Nope, it was not mistake. It is really desired behaviour
1914          *      f.e. on http servers, when such sockets are useless, but
1915          *      consume significant resources. Let's do it with special
1916          *      linger2 option.                                 --ANK
1917          */
1918
1919         if (sk->sk_state == TCP_FIN_WAIT2) {
1920                 struct tcp_sock *tp = tcp_sk(sk);
1921                 if (tp->linger2 < 0) {
1922                         tcp_set_state(sk, TCP_CLOSE);
1923                         tcp_send_active_reset(sk, GFP_ATOMIC);
1924                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
1925                 } else {
1926                         const int tmo = tcp_fin_time(sk);
1927
1928                         if (tmo > TCP_TIMEWAIT_LEN) {
1929                                 inet_csk_reset_keepalive_timer(sk,
1930                                                 tmo - TCP_TIMEWAIT_LEN);
1931                         } else {
1932                                 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
1933                                 goto out;
1934                         }
1935                 }
1936         }
1937         if (sk->sk_state != TCP_CLOSE) {
1938                 sk_stream_mem_reclaim(sk);
1939                 if (tcp_too_many_orphans(sk,
1940                                 atomic_read(sk->sk_prot->orphan_count))) {
1941                         if (net_ratelimit())
1942                                 printk(KERN_INFO "TCP: too many of orphaned "
1943                                        "sockets\n");
1944                         tcp_set_state(sk, TCP_CLOSE);
1945                         tcp_send_active_reset(sk, GFP_ATOMIC);
1946                         NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
1947                 }
1948         }
1949
1950         if (sk->sk_state == TCP_CLOSE)
1951                 inet_csk_destroy_sock(sk);
1952         /* Otherwise, socket is reprieved until protocol close. */
1953
1954 out:
1955         bh_unlock_sock(sk);
1956         local_bh_enable();
1957         sock_put(sk);
1958 }
1959
1960 /* These states need RST on ABORT according to RFC793 */
1961
1962 static inline int tcp_need_reset(int state)
1963 {
1964         return (1 << state) &
1965                (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
1966                 TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
1967 }
1968
1969 int tcp_disconnect(struct sock *sk, int flags)
1970 {
1971         struct inet_sock *inet = inet_sk(sk);
1972         struct inet_connection_sock *icsk = inet_csk(sk);
1973         struct tcp_sock *tp = tcp_sk(sk);
1974         int err = 0;
1975         int old_state = sk->sk_state;
1976
1977         if (old_state != TCP_CLOSE)
1978                 tcp_set_state(sk, TCP_CLOSE);
1979
1980         /* ABORT function of RFC793 */
1981         if (old_state == TCP_LISTEN) {
1982                 inet_csk_listen_stop(sk);
1983         } else if (tcp_need_reset(old_state) ||
1984                    (tp->snd_nxt != tp->write_seq &&
1985                     (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
1986                 /* The last check adjusts for discrepancy of Linux wrt. RFC
1987                  * states
1988                  */
1989                 tcp_send_active_reset(sk, gfp_any());
1990                 sk->sk_err = ECONNRESET;
1991         } else if (old_state == TCP_SYN_SENT)
1992                 sk->sk_err = ECONNRESET;
1993
1994         tcp_clear_xmit_timers(sk);
1995         __skb_queue_purge(&sk->sk_receive_queue);
1996         tcp_write_queue_purge(sk);
1997         __skb_queue_purge(&tp->out_of_order_queue);
1998 #ifdef CONFIG_NET_DMA
1999         __skb_queue_purge(&sk->sk_async_wait_queue);
2000 #endif
2001
2002         inet->dport = 0;
2003
2004         if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
2005                 inet_reset_saddr(sk);
2006
2007         sk->sk_shutdown = 0;
2008         sock_reset_flag(sk, SOCK_DONE);
2009         tp->srtt = 0;
2010         if ((tp->write_seq += tp->max_window + 2) == 0)
2011                 tp->write_seq = 1;
2012         icsk->icsk_backoff = 0;
2013         tp->snd_cwnd = 2;
2014         icsk->icsk_probes_out = 0;
2015         tp->packets_out = 0;
2016         tp->snd_ssthresh = 0x7fffffff;
2017         tp->snd_cwnd_cnt = 0;
2018         tp->bytes_acked = 0;
2019         tcp_set_ca_state(sk, TCP_CA_Open);
2020         tcp_clear_retrans(tp);
2021         inet_csk_delack_init(sk);
2022         tcp_init_send_head(sk);
2023         memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
2024         __sk_dst_reset(sk);
2025
2026         BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
2027
2028         sk->sk_error_report(sk);
2029         return err;
2030 }
2031
2032 /*
2033  *      Socket option code for TCP.
2034  */
2035 static int do_tcp_setsockopt(struct sock *sk, int level,
2036                 int optname, char __user *optval, int optlen)
2037 {
2038         struct tcp_sock *tp = tcp_sk(sk);
2039         struct inet_connection_sock *icsk = inet_csk(sk);
2040         int val;
2041         int err = 0;
2042
2043         /* This is a string value all the others are int's */
2044         if (optname == TCP_CONGESTION) {
2045                 char name[TCP_CA_NAME_MAX];
2046
2047                 if (optlen < 1)
2048                         return -EINVAL;
2049
2050                 val = strncpy_from_user(name, optval,
2051                                         min(TCP_CA_NAME_MAX-1, optlen));
2052                 if (val < 0)
2053                         return -EFAULT;
2054                 name[val] = 0;
2055
2056                 lock_sock(sk);
2057                 err = tcp_set_congestion_control(sk, name);
2058                 release_sock(sk);
2059                 return err;
2060         }
2061
2062         if (optlen < sizeof(int))
2063                 return -EINVAL;
2064
2065         if (get_user(val, (int __user *)optval))
2066                 return -EFAULT;
2067
2068         lock_sock(sk);
2069
2070         switch (optname) {
2071         case TCP_MAXSEG:
2072                 /* Values greater than interface MTU won't take effect. However
2073                  * at the point when this call is done we typically don't yet
2074                  * know which interface is going to be used */
2075                 if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) {
2076                         err = -EINVAL;
2077                         break;
2078                 }
2079                 tp->rx_opt.user_mss = val;
2080                 break;
2081
2082         case TCP_NODELAY:
2083                 if (val) {
2084                         /* TCP_NODELAY is weaker than TCP_CORK, so that
2085                          * this option on corked socket is remembered, but
2086                          * it is not activated until cork is cleared.
2087                          *
2088                          * However, when TCP_NODELAY is set we make
2089                          * an explicit push, which overrides even TCP_CORK
2090                          * for currently queued segments.
2091                          */
2092                         tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
2093                         tcp_push_pending_frames(sk);
2094                 } else {
2095                         tp->nonagle &= ~TCP_NAGLE_OFF;
2096                 }
2097                 break;
2098
2099         case TCP_CORK:
2100                 /* When set indicates to always queue non-full frames.
2101                  * Later the user clears this option and we transmit
2102                  * any pending partial frames in the queue.  This is
2103                  * meant to be used alongside sendfile() to get properly
2104                  * filled frames when the user (for example) must write
2105                  * out headers with a write() call first and then use
2106                  * sendfile to send out the data parts.
2107                  *
2108                  * TCP_CORK can be set together with TCP_NODELAY and it is
2109                  * stronger than TCP_NODELAY.
2110                  */
2111                 if (val) {
2112                         tp->nonagle |= TCP_NAGLE_CORK;
2113                 } else {
2114                         tp->nonagle &= ~TCP_NAGLE_CORK;
2115                         if (tp->nonagle&TCP_NAGLE_OFF)
2116                                 tp->nonagle |= TCP_NAGLE_PUSH;
2117                         tcp_push_pending_frames(sk);
2118                 }
2119                 break;
2120
2121         case TCP_KEEPIDLE:
2122                 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2123                         err = -EINVAL;
2124                 else {
2125                         tp->keepalive_time = val * HZ;
2126                         if (sock_flag(sk, SOCK_KEEPOPEN) &&
2127                             !((1 << sk->sk_state) &
2128                               (TCPF_CLOSE | TCPF_LISTEN))) {
2129                                 u32 elapsed = keepalive_time_elapsed(tp);
2130                                 if (tp->keepalive_time > elapsed)
2131                                         elapsed = tp->keepalive_time - elapsed;
2132                                 else
2133                                         elapsed = 0;
2134                                 inet_csk_reset_keepalive_timer(sk, elapsed);
2135                         }
2136                 }
2137                 break;
2138         case TCP_KEEPINTVL:
2139                 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2140                         err = -EINVAL;
2141                 else
2142                         tp->keepalive_intvl = val * HZ;
2143                 break;
2144         case TCP_KEEPCNT:
2145                 if (val < 1 || val > MAX_TCP_KEEPCNT)
2146                         err = -EINVAL;
2147                 else
2148                         tp->keepalive_probes = val;
2149                 break;
2150         case TCP_SYNCNT:
2151                 if (val < 1 || val > MAX_TCP_SYNCNT)
2152                         err = -EINVAL;
2153                 else
2154                         icsk->icsk_syn_retries = val;
2155                 break;
2156
2157         case TCP_LINGER2:
2158                 if (val < 0)
2159                         tp->linger2 = -1;
2160                 else if (val > sysctl_tcp_fin_timeout / HZ)
2161                         tp->linger2 = 0;
2162                 else
2163                         tp->linger2 = val * HZ;
2164                 break;
2165
2166         case TCP_DEFER_ACCEPT:
2167                 /* Translate value in seconds to number of retransmits */
2168                 icsk->icsk_accept_queue.rskq_defer_accept =
2169                         secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
2170                                         TCP_RTO_MAX / HZ);
2171                 break;
2172
2173         case TCP_WINDOW_CLAMP:
2174                 if (!val) {
2175                         if (sk->sk_state != TCP_CLOSE) {
2176                                 err = -EINVAL;
2177                                 break;
2178                         }
2179                         tp->window_clamp = 0;
2180                 } else
2181                         tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
2182                                                 SOCK_MIN_RCVBUF / 2 : val;
2183                 break;
2184
2185         case TCP_QUICKACK:
2186                 if (!val) {
2187                         icsk->icsk_ack.pingpong = 1;
2188                 } else {
2189                         icsk->icsk_ack.pingpong = 0;
2190                         if ((1 << sk->sk_state) &
2191                             (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
2192                             inet_csk_ack_scheduled(sk)) {
2193                                 icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
2194                                 tcp_cleanup_rbuf(sk, 1);
2195                                 if (!(val & 1))
2196                                         icsk->icsk_ack.pingpong = 1;
2197                         }
2198                 }
2199                 break;
2200
2201 #ifdef CONFIG_TCP_MD5SIG
2202         case TCP_MD5SIG:
2203                 /* Read the IP->Key mappings from userspace */
2204                 err = tp->af_specific->md5_parse(sk, optval, optlen);
2205                 break;
2206 #endif
2207
2208         default:
2209                 err = -ENOPROTOOPT;
2210                 break;
2211         }
2212
2213         release_sock(sk);
2214         return err;
2215 }
2216
2217 int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
2218                    int optlen)
2219 {
2220         struct inet_connection_sock *icsk = inet_csk(sk);
2221
2222         if (level != SOL_TCP)
2223                 return icsk->icsk_af_ops->setsockopt(sk, level, optname,
2224                                                      optval, optlen);
2225         return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2226 }
2227
2228 #ifdef CONFIG_COMPAT
2229 int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
2230                           char __user *optval, int optlen)
2231 {
2232         if (level != SOL_TCP)
2233                 return inet_csk_compat_setsockopt(sk, level, optname,
2234                                                   optval, optlen);
2235         return do_tcp_setsockopt(sk, level, optname, optval, optlen);
2236 }
2237
2238 EXPORT_SYMBOL(compat_tcp_setsockopt);
2239 #endif
2240
2241 /* Return information about state of tcp endpoint in API format. */
2242 void tcp_get_info(struct sock *sk, struct tcp_info *info)
2243 {
2244         struct tcp_sock *tp = tcp_sk(sk);
2245         const struct inet_connection_sock *icsk = inet_csk(sk);
2246         u32 now = tcp_time_stamp;
2247
2248         memset(info, 0, sizeof(*info));
2249
2250         info->tcpi_state = sk->sk_state;
2251         info->tcpi_ca_state = icsk->icsk_ca_state;
2252         info->tcpi_retransmits = icsk->icsk_retransmits;
2253         info->tcpi_probes = icsk->icsk_probes_out;
2254         info->tcpi_backoff = icsk->icsk_backoff;
2255
2256         if (tp->rx_opt.tstamp_ok)
2257                 info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
2258         if (tp->rx_opt.sack_ok)
2259                 info->tcpi_options |= TCPI_OPT_SACK;
2260         if (tp->rx_opt.wscale_ok) {
2261                 info->tcpi_options |= TCPI_OPT_WSCALE;
2262                 info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
2263                 info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
2264         }
2265
2266         if (tp->ecn_flags&TCP_ECN_OK)
2267                 info->tcpi_options |= TCPI_OPT_ECN;
2268
2269         info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
2270         info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
2271         info->tcpi_snd_mss = tp->mss_cache;
2272         info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
2273
2274         info->tcpi_unacked = tp->packets_out;
2275         info->tcpi_sacked = tp->sacked_out;
2276         info->tcpi_lost = tp->lost_out;
2277         info->tcpi_retrans = tp->retrans_out;
2278         info->tcpi_fackets = tp->fackets_out;
2279
2280         info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
2281         info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
2282         info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
2283
2284         info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
2285         info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2286         info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
2287         info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
2288         info->tcpi_snd_ssthresh = tp->snd_ssthresh;
2289         info->tcpi_snd_cwnd = tp->snd_cwnd;
2290         info->tcpi_advmss = tp->advmss;
2291         info->tcpi_reordering = tp->reordering;
2292
2293         info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
2294         info->tcpi_rcv_space = tp->rcvq_space.space;
2295
2296         info->tcpi_total_retrans = tp->total_retrans;
2297 }
2298
2299 EXPORT_SYMBOL_GPL(tcp_get_info);
2300
2301 static int do_tcp_getsockopt(struct sock *sk, int level,
2302                 int optname, char __user *optval, int __user *optlen)
2303 {
2304         struct inet_connection_sock *icsk = inet_csk(sk);
2305         struct tcp_sock *tp = tcp_sk(sk);
2306         int val, len;
2307
2308         if (get_user(len, optlen))
2309                 return -EFAULT;
2310
2311         len = min_t(unsigned int, len, sizeof(int));
2312
2313         if (len < 0)
2314                 return -EINVAL;
2315
2316         switch (optname) {
2317         case TCP_MAXSEG:
2318                 val = tp->mss_cache;
2319                 if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
2320                         val = tp->rx_opt.user_mss;
2321                 break;
2322         case TCP_NODELAY:
2323                 val = !!(tp->nonagle&TCP_NAGLE_OFF);
2324                 break;
2325         case TCP_CORK:
2326                 val = !!(tp->nonagle&TCP_NAGLE_CORK);
2327                 break;
2328         case TCP_KEEPIDLE:
2329                 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
2330                 break;
2331         case TCP_KEEPINTVL:
2332                 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
2333                 break;
2334         case TCP_KEEPCNT:
2335                 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2336                 break;
2337         case TCP_SYNCNT:
2338                 val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
2339                 break;
2340         case TCP_LINGER2:
2341                 val = tp->linger2;
2342                 if (val >= 0)
2343                         val = (val ? : sysctl_tcp_fin_timeout) / HZ;
2344                 break;
2345         case TCP_DEFER_ACCEPT:
2346                 val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
2347                                       TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
2348                 break;
2349         case TCP_WINDOW_CLAMP:
2350                 val = tp->window_clamp;
2351                 break;
2352         case TCP_INFO: {
2353                 struct tcp_info info;
2354
2355                 if (get_user(len, optlen))
2356                         return -EFAULT;
2357
2358                 tcp_get_info(sk, &info);
2359
2360                 len = min_t(unsigned int, len, sizeof(info));
2361                 if (put_user(len, optlen))
2362                         return -EFAULT;
2363                 if (copy_to_user(optval, &info, len))
2364                         return -EFAULT;
2365                 return 0;
2366         }
2367         case TCP_QUICKACK:
2368                 val = !icsk->icsk_ack.pingpong;
2369                 break;
2370
2371         case TCP_CONGESTION:
2372                 if (get_user(len, optlen))
2373                         return -EFAULT;
2374                 len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
2375                 if (put_user(len, optlen))
2376                         return -EFAULT;
2377                 if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
2378                         return -EFAULT;
2379                 return 0;
2380         default:
2381                 return -ENOPROTOOPT;
2382         }
2383
2384         if (put_user(len, optlen))
2385                 return -EFAULT;
2386         if (copy_to_user(optval, &val, len))
2387                 return -EFAULT;
2388         return 0;
2389 }
2390
2391 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
2392                    int __user *optlen)
2393 {
2394         struct inet_connection_sock *icsk = inet_csk(sk);
2395
2396         if (level != SOL_TCP)
2397                 return icsk->icsk_af_ops->getsockopt(sk, level, optname,
2398                                                      optval, optlen);
2399         return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2400 }
2401
2402 #ifdef CONFIG_COMPAT
2403 int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2404                           char __user *optval, int __user *optlen)
2405 {
2406         if (level != SOL_TCP)
2407                 return inet_csk_compat_getsockopt(sk, level, optname,
2408                                                   optval, optlen);
2409         return do_tcp_getsockopt(sk, level, optname, optval, optlen);
2410 }
2411
2412 EXPORT_SYMBOL(compat_tcp_getsockopt);
2413 #endif
2414
2415 struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features)
2416 {
2417         struct sk_buff *segs = ERR_PTR(-EINVAL);
2418         struct tcphdr *th;
2419         unsigned thlen;
2420         unsigned int seq;
2421         __be32 delta;
2422         unsigned int oldlen;
2423         unsigned int len;
2424
2425         if (!pskb_may_pull(skb, sizeof(*th)))
2426                 goto out;
2427
2428         th = tcp_hdr(skb);
2429         thlen = th->doff * 4;
2430         if (thlen < sizeof(*th))
2431                 goto out;
2432
2433         if (!pskb_may_pull(skb, thlen))
2434                 goto out;
2435
2436         oldlen = (u16)~skb->len;
2437         __skb_pull(skb, thlen);
2438
2439         if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
2440                 /* Packet is from an untrusted source, reset gso_segs. */
2441                 int type = skb_shinfo(skb)->gso_type;
2442                 int mss;
2443
2444                 if (unlikely(type &
2445                              ~(SKB_GSO_TCPV4 |
2446                                SKB_GSO_DODGY |
2447                                SKB_GSO_TCP_ECN |
2448                                SKB_GSO_TCPV6 |
2449                                0) ||
2450                              !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))))
2451                         goto out;
2452
2453                 mss = skb_shinfo(skb)->gso_size;
2454                 skb_shinfo(skb)->gso_segs = (skb->len + mss - 1) / mss;
2455
2456                 segs = NULL;
2457                 goto out;
2458         }
2459
2460         segs = skb_segment(skb, features);
2461         if (IS_ERR(segs))
2462                 goto out;
2463
2464         len = skb_shinfo(skb)->gso_size;
2465         delta = htonl(oldlen + (thlen + len));
2466
2467         skb = segs;
2468         th = tcp_hdr(skb);
2469         seq = ntohl(th->seq);
2470
2471         do {
2472                 th->fin = th->psh = 0;
2473
2474                 th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
2475                                        (__force u32)delta));
2476                 if (skb->ip_summed != CHECKSUM_PARTIAL)
2477                         th->check =
2478                              csum_fold(csum_partial(skb_transport_header(skb),
2479                                                     thlen, skb->csum));
2480
2481                 seq += len;
2482                 skb = skb->next;
2483                 th = tcp_hdr(skb);
2484
2485                 th->seq = htonl(seq);
2486                 th->cwr = 0;
2487         } while (skb->next);
2488
2489         delta = htonl(oldlen + (skb->tail - skb->transport_header) +
2490                       skb->data_len);
2491         th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
2492                                 (__force u32)delta));
2493         if (skb->ip_summed != CHECKSUM_PARTIAL)
2494                 th->check = csum_fold(csum_partial(skb_transport_header(skb),
2495                                                    thlen, skb->csum));
2496
2497 out:
2498         return segs;
2499 }
2500 EXPORT_SYMBOL(tcp_tso_segment);
2501
2502 #ifdef CONFIG_TCP_MD5SIG
2503 static unsigned long tcp_md5sig_users;
2504 static struct tcp_md5sig_pool **tcp_md5sig_pool;
2505 static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
2506
2507 static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
2508 {
2509         int cpu;
2510         for_each_possible_cpu(cpu) {
2511                 struct tcp_md5sig_pool *p = *per_cpu_ptr(pool, cpu);
2512                 if (p) {
2513                         if (p->md5_desc.tfm)
2514                                 crypto_free_hash(p->md5_desc.tfm);
2515                         kfree(p);
2516                         p = NULL;
2517                 }
2518         }
2519         free_percpu(pool);
2520 }
2521
2522 void tcp_free_md5sig_pool(void)
2523 {
2524         struct tcp_md5sig_pool **pool = NULL;
2525
2526         spin_lock_bh(&tcp_md5sig_pool_lock);
2527         if (--tcp_md5sig_users == 0) {
2528                 pool = tcp_md5sig_pool;
2529                 tcp_md5sig_pool = NULL;
2530         }
2531         spin_unlock_bh(&tcp_md5sig_pool_lock);
2532         if (pool)
2533                 __tcp_free_md5sig_pool(pool);
2534 }
2535
2536 EXPORT_SYMBOL(tcp_free_md5sig_pool);
2537
2538 static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(void)
2539 {
2540         int cpu;
2541         struct tcp_md5sig_pool **pool;
2542
2543         pool = alloc_percpu(struct tcp_md5sig_pool *);
2544         if (!pool)
2545                 return NULL;
2546
2547         for_each_possible_cpu(cpu) {
2548                 struct tcp_md5sig_pool *p;
2549                 struct crypto_hash *hash;
2550
2551                 p = kzalloc(sizeof(*p), GFP_KERNEL);
2552                 if (!p)
2553                         goto out_free;
2554                 *per_cpu_ptr(pool, cpu) = p;
2555
2556                 hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
2557                 if (!hash || IS_ERR(hash))
2558                         goto out_free;
2559
2560                 p->md5_desc.tfm = hash;
2561         }
2562         return pool;
2563 out_free:
2564         __tcp_free_md5sig_pool(pool);
2565         return NULL;
2566 }
2567
2568 struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(void)
2569 {
2570         struct tcp_md5sig_pool **pool;
2571         int alloc = 0;
2572
2573 retry:
2574         spin_lock_bh(&tcp_md5sig_pool_lock);
2575         pool = tcp_md5sig_pool;
2576         if (tcp_md5sig_users++ == 0) {
2577                 alloc = 1;
2578                 spin_unlock_bh(&tcp_md5sig_pool_lock);
2579         } else if (!pool) {
2580                 tcp_md5sig_users--;
2581                 spin_unlock_bh(&tcp_md5sig_pool_lock);
2582                 cpu_relax();
2583                 goto retry;
2584         } else
2585                 spin_unlock_bh(&tcp_md5sig_pool_lock);
2586
2587         if (alloc) {
2588                 /* we cannot hold spinlock here because this may sleep. */
2589                 struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool();
2590                 spin_lock_bh(&tcp_md5sig_pool_lock);
2591                 if (!p) {
2592                         tcp_md5sig_users--;
2593                         spin_unlock_bh(&tcp_md5sig_pool_lock);
2594                         return NULL;
2595                 }
2596                 pool = tcp_md5sig_pool;
2597                 if (pool) {
2598                         /* oops, it has already been assigned. */
2599                         spin_unlock_bh(&tcp_md5sig_pool_lock);
2600                         __tcp_free_md5sig_pool(p);
2601                 } else {
2602                         tcp_md5sig_pool = pool = p;
2603                         spin_unlock_bh(&tcp_md5sig_pool_lock);
2604                 }
2605         }
2606         return pool;
2607 }
2608
2609 EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
2610
2611 struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu)
2612 {
2613         struct tcp_md5sig_pool **p;
2614         spin_lock_bh(&tcp_md5sig_pool_lock);
2615         p = tcp_md5sig_pool;
2616         if (p)
2617                 tcp_md5sig_users++;
2618         spin_unlock_bh(&tcp_md5sig_pool_lock);
2619         return (p ? *per_cpu_ptr(p, cpu) : NULL);
2620 }
2621
2622 EXPORT_SYMBOL(__tcp_get_md5sig_pool);
2623
2624 void __tcp_put_md5sig_pool(void)
2625 {
2626         tcp_free_md5sig_pool();
2627 }
2628
2629 EXPORT_SYMBOL(__tcp_put_md5sig_pool);
2630 #endif
2631
2632 void tcp_done(struct sock *sk)
2633 {
2634         if(sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
2635                 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
2636
2637         tcp_set_state(sk, TCP_CLOSE);
2638         tcp_clear_xmit_timers(sk);
2639
2640         sk->sk_shutdown = SHUTDOWN_MASK;
2641
2642         if (!sock_flag(sk, SOCK_DEAD))
2643                 sk->sk_state_change(sk);
2644         else
2645                 inet_csk_destroy_sock(sk);
2646 }
2647 EXPORT_SYMBOL_GPL(tcp_done);
2648
2649 extern void __skb_cb_too_small_for_tcp(int, int);
2650 extern struct tcp_congestion_ops tcp_reno;
2651
2652 static __initdata unsigned long thash_entries;
2653 static int __init set_thash_entries(char *str)
2654 {
2655         if (!str)
2656                 return 0;
2657         thash_entries = simple_strtoul(str, &str, 0);
2658         return 1;
2659 }
2660 __setup("thash_entries=", set_thash_entries);
2661
2662 void __init tcp_init(void)
2663 {
2664         struct sk_buff *skb = NULL;
2665         unsigned long limit;
2666         int order, i, max_share;
2667
2668         if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2669                 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2670                                            sizeof(skb->cb));
2671
2672         tcp_hashinfo.bind_bucket_cachep =
2673                 kmem_cache_create("tcp_bind_bucket",
2674                                   sizeof(struct inet_bind_bucket), 0,
2675                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
2676
2677         /* Size and allocate the main established and bind bucket
2678          * hash tables.
2679          *
2680          * The methodology is similar to that of the buffer cache.
2681          */
2682         tcp_hashinfo.ehash =
2683                 alloc_large_system_hash("TCP established",
2684                                         sizeof(struct inet_ehash_bucket),
2685                                         thash_entries,
2686                                         (num_physpages >= 128 * 1024) ?
2687                                         13 : 15,
2688                                         0,
2689                                         &tcp_hashinfo.ehash_size,
2690                                         NULL,
2691                                         0);
2692         tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
2693         for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
2694                 rwlock_init(&tcp_hashinfo.ehash[i].lock);
2695                 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
2696                 INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain);
2697         }
2698
2699         tcp_hashinfo.bhash =
2700                 alloc_large_system_hash("TCP bind",
2701                                         sizeof(struct inet_bind_hashbucket),
2702                                         tcp_hashinfo.ehash_size,
2703                                         (num_physpages >= 128 * 1024) ?
2704                                         13 : 15,
2705                                         0,
2706                                         &tcp_hashinfo.bhash_size,
2707                                         NULL,
2708                                         64 * 1024);
2709         tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
2710         for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
2711                 spin_lock_init(&tcp_hashinfo.bhash[i].lock);
2712                 INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
2713         }
2714
2715         /* Try to be a bit smarter and adjust defaults depending
2716          * on available memory.
2717          */
2718         for (order = 0; ((1 << order) << PAGE_SHIFT) <
2719                         (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
2720                         order++)
2721                 ;
2722         if (order >= 4) {
2723                 tcp_death_row.sysctl_max_tw_buckets = 180000;
2724                 sysctl_tcp_max_orphans = 4096 << (order - 4);
2725                 sysctl_max_syn_backlog = 1024;
2726         } else if (order < 3) {
2727                 tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
2728                 sysctl_tcp_max_orphans >>= (3 - order);
2729                 sysctl_max_syn_backlog = 128;
2730         }
2731
2732         /* Set the pressure threshold to be a fraction of global memory that
2733          * is up to 1/2 at 256 MB, decreasing toward zero with the amount of
2734          * memory, with a floor of 128 pages.
2735          */
2736         limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
2737         limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
2738         limit = max(limit, 128UL);
2739         sysctl_tcp_mem[0] = limit / 4 * 3;
2740         sysctl_tcp_mem[1] = limit;
2741         sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
2742
2743         /* Set per-socket limits to no more than 1/128 the pressure threshold */
2744         limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
2745         max_share = min(4UL*1024*1024, limit);
2746
2747         sysctl_tcp_wmem[0] = SK_STREAM_MEM_QUANTUM;
2748         sysctl_tcp_wmem[1] = 16*1024;
2749         sysctl_tcp_wmem[2] = max(64*1024, max_share);
2750
2751         sysctl_tcp_rmem[0] = SK_STREAM_MEM_QUANTUM;
2752         sysctl_tcp_rmem[1] = 87380;
2753         sysctl_tcp_rmem[2] = max(87380, max_share);
2754
2755         printk(KERN_INFO "TCP: Hash tables configured "
2756                "(established %d bind %d)\n",
2757                tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size);
2758
2759         tcp_register_congestion_control(&tcp_reno);
2760 }
2761
2762 EXPORT_SYMBOL(tcp_close);
2763 EXPORT_SYMBOL(tcp_disconnect);
2764 EXPORT_SYMBOL(tcp_getsockopt);
2765 EXPORT_SYMBOL(tcp_ioctl);
2766 EXPORT_SYMBOL(tcp_poll);
2767 EXPORT_SYMBOL(tcp_read_sock);
2768 EXPORT_SYMBOL(tcp_recvmsg);
2769 EXPORT_SYMBOL(tcp_sendmsg);
2770 EXPORT_SYMBOL(tcp_splice_read);
2771 EXPORT_SYMBOL(tcp_sendpage);
2772 EXPORT_SYMBOL(tcp_setsockopt);
2773 EXPORT_SYMBOL(tcp_shutdown);
2774 EXPORT_SYMBOL(tcp_statistics);