/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() calls
 *		Alan Cox	:	Set the ACK bit on a reset
 *		Alan Cox	:	Stopped it crashing if it closed while
 *					sk->inuse=1 and was trying to connect
 *					(tcp_err()).
 *		Alan Cox	:	All icmp error handling was broken
 *					pointers passed where wrong and the
 *					socket was looked up backwards. Nobody
 *					tested any icmp error code obviously.
 *		Alan Cox	:	tcp_err() now handled properly. It
 *					wakes people on errors. poll
 *					behaves and the icmp error race
 *					has gone by moving it into sock.c
 *		Alan Cox	:	tcp_send_reset() fixed to work for
 *					everything not just packets for
 *					unknown sockets.
 *		Alan Cox	:	tcp option processing.
 *		Alan Cox	:	Reset tweaked (still not 100%) [Had
 *					syn rule wrong]
 *		Herp Rosmanith	:	More reset fixes
 *		Alan Cox	:	No longer acks invalid rst frames.
 *					Acking any kind of RST is right out.
 *		Alan Cox	:	Sets an ignore me flag on an rst
 *					receive otherwise odd bits of prattle
 *					escape still
 *		Alan Cox	:	Fixed another acking RST frame bug.
 *					Should stop LAN workplace lockups.
 *		Alan Cox	:	Some tidyups using the new skb list
 *					facilities
 *		Alan Cox	:	sk->keepopen now seems to work
 *		Alan Cox	:	Pulls options out correctly on accepts
 *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
 *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
 *					bit to skb ops.
 *		Alan Cox	:	Tidied tcp_data to avoid a potential
 *					nasty.
 *		Alan Cox	:	Added some better commenting, as the
 *					tcp is hard to follow
 *		Alan Cox	:	Removed incorrect check for 20 * psh
 *	Michael O'Reilly	:	ack < copied bug fix.
 *	Johannes Stille		:	Misc tcp fixes (not all in yet).
 *		Alan Cox	:	FIN with no memory -> CRASH
 *		Alan Cox	:	Added socket option proto entries.
 *					Also added awareness of them to accept.
 *		Alan Cox	:	Added TCP options (SOL_TCP)
 *		Alan Cox	:	Switched wakeup calls to callbacks,
 *					so the kernel can layer network
 *					sockets.
 *		Alan Cox	:	Use ip_tos/ip_ttl settings.
 *		Alan Cox	:	Handle FIN (more) properly (we hope).
 *		Alan Cox	:	RST frames sent on unsynchronised
 *					state ack error.
 *		Alan Cox	:	Put in missing check for SYN bit.
 *		Alan Cox	:	Added tcp_select_window() aka NET2E
 *					window non shrink trick.
 *		Alan Cox	:	Added a couple of small NET2E timer
 *					fixes
 *		Charles Hedrick	:	TCP fixes
 *		Toomas Tamm	:	TCP window fixes
 *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
 *		Charles Hedrick	:	Rewrote most of it to actually work
 *		Linus		:	Rewrote tcp_read() and URG handling
 *					entirely
 *		Gerhard Koerting:	Fixed some missing timer handling
 *		Matthew Dillon	:	Reworked TCP machine states as per RFC
 *		Gerhard Koerting:	PC/TCP workarounds
 *		Adam Caldwell	:	Assorted timer/timing errors
 *		Matthew Dillon	:	Fixed another RST bug
 *		Alan Cox	:	Move to kernel side addressing changes.
 *		Alan Cox	:	Beginning work on TCP fastpathing
 *					(not yet usable)
 *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
 *		Alan Cox	:	TCP fast path debugging
 *		Alan Cox	:	Window clamping
 *		Michael Riepe	:	Bug in tcp_check()
 *		Matt Dillon	:	More TCP improvements and RST bug fixes
 *		Matt Dillon	:	Yet more small nasties removed from the
 *					TCP code (Be very nice to this man if
 *					tcp finally works 100%) 8)
 *		Alan Cox	:	BSD accept semantics.
 *		Alan Cox	:	Reset on closedown bug.
 *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
 *		Michael Pall	:	Handle poll() after URG properly in
 *					all cases.
 *		Michael Pall	:	Undo the last fix in tcp_read_urg()
 *					(multi URG PUSH broke rlogin).
 *		Michael Pall	:	Fix the multi URG PUSH problem in
 *					tcp_readable(), poll() after URG
 *					works now.
 *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
 *					BSD api.
 *		Alan Cox	:	Changed the semantics of sk->socket to
 *					fix a race and a signal problem with
 *					accept() and async I/O.
 *		Alan Cox	:	Relaxed the rules on tcp_sendto().
 *		Yury Shevchuk	:	Really fixed accept() blocking problem.
 *		Craig I. Hagan	:	Allow for BSD compatible TIME_WAIT for
 *					clients/servers which listen in on
 *					fixed ports.
 *		Alan Cox	:	Cleaned the above up and shrank it to
 *					a sensible code size.
 *		Alan Cox	:	Self connect lockup fix.
 *		Alan Cox	:	No connect to multicast.
 *		Ross Biro	:	Close unaccepted children on master
 *					socket close.
 *		Alan Cox	:	Reset tracing code.
 *		Alan Cox	:	Spurious resets on shutdown.
 *		Alan Cox	:	Giant 15 minute/60 second timer error
 *		Alan Cox	:	Small whoops in polling before an
 *					accept.
 *		Alan Cox	:	Kept the state trace facility since
 *					it's handy for debugging.
 *		Alan Cox	:	More reset handler fixes.
 *		Alan Cox	:	Started rewriting the code based on
 *					the RFC's for other useful protocol
 *					references see: Comer, KA9Q NOS, and
 *					for a reference on the difference
 *					between specifications and how BSD
 *					works see the 4.4lite source.
 *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
 *					close.
 *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
 *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
 *		Alan Cox	:	Reimplemented timers as per the RFC
 *					and using multiple timers for sanity.
 *		Alan Cox	:	Small bug fixes, and a lot of new
 *					comments.
 *		Alan Cox	:	Fixed dual reader crash by locking
 *					the buffers (much like datagram.c)
 *		Alan Cox	:	Fixed stuck sockets in probe. A probe
 *					now gets fed up of retrying without
 *					(even a no space) answer.
 *		Alan Cox	:	Extracted closing code better
 *		Alan Cox	:	Fixed the closing state machine to
 *					resemble the RFC.
 *		Alan Cox	:	More 'per spec' fixes.
 *		Jorge Cwik	:	Even faster checksumming.
 *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
 *					only frames. At least one pc tcp stack
 *					generates them.
 *		Alan Cox	:	Cache last socket.
 *		Alan Cox	:	Per route irtt.
 *		Matt Day	:	poll()->select() match BSD precisely on error
 *		Alan Cox	:	New buffers
 *		Marc Tamsky	:	Various sk->prot->retransmits and
 *					sk->retransmits misupdating fixed.
 *					Fixed tcp_write_timeout: stuck close,
 *					and TCP syn retries gets used now.
 *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
 *					ack if state is TCP_CLOSED.
 *		Alan Cox	:	Look up device on a retransmit - routes may
 *					change. Doesn't yet cope with MSS shrink right
 *					but it's a start!
 *		Marc Tamsky	:	Closing in closing fixes.
 *		Mike Shaver	:	RFC1122 verifications.
 *		Alan Cox	:	rcv_saddr errors.
 *		Alan Cox	:	Block double connect().
 *		Alan Cox	:	Small hooks for enSKIP.
 *		Alexey Kuznetsov:	Path MTU discovery.
 *		Alan Cox	:	Support soft errors.
 *		Alan Cox	:	Fix MTU discovery pathological case
 *					when the remote claims no mtu!
 *		Marc Tamsky	:	TCP_CLOSE fix.
 *		Colin (G3TNE)	:	Send a reset on syn ack replies in
 *					window but wrong (fixes NT lpd problems)
 *		Pedro Roque	:	Better TCP window handling, delayed ack.
 *		Joerg Reuter	:	No modification of locked buffers in
 *					tcp_do_retransmit()
 *		Eric Schenk	:	Changed receiver side silly window
 *					avoidance algorithm to BSD style
 *					algorithm. This doubles throughput
 *					against machines running Solaris,
 *					and seems to result in general
 *					improvement.
 *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
 *	Willy Konynenberg	:	Transparent proxying support.
 *	Mike McLagan		:	Routing by source
 *		Keith Owens	:	Do proper merging with partial SKB's in
 *					tcp_do_sendmsg to avoid burstiness.
 *		Eric Schenk	:	Fix fast close down bug with
 *					shutdown() followed by close().
 *		Andi Kleen	:	Make poll agree with SIGIO
 *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
 *					lingertime == 0 (RFC 793 ABORT Call)
 *	Hirokazu Takahashi	:	Use copy_from_user() instead of
 *					csum_and_copy_from_user() if possible.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Description of States:
 *
 *	TCP_SYN_SENT		sent a connection request, waiting for ack
 *
 *	TCP_SYN_RECV		received a connection request, sent ack,
 *				waiting for final ack in three-way handshake.
 *
 *	TCP_ESTABLISHED		connection established
 *
 *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
 *				transmission of remaining buffered data
 *
 *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
 *				to shutdown
 *
 *	TCP_CLOSING		both sides have shutdown but we still have
 *				data we have to finish sending
 *
 *	TCP_TIME_WAIT		timeout to catch resent junk before entering
 *				closed, can only be entered from FIN_WAIT2
 *				or CLOSING.  Required because the other end
 *				may not have gotten our last ACK causing it
 *				to retransmit the data packet (which we ignore)
 *
 *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
 *				us to finish writing our data and to shutdown
 *				(we have to close() to move on to LAST_ACK)
 *
 *	TCP_LAST_ACK		our side has shutdown after remote has
 *				shutdown.  There may still be data in our
 *				buffer that we have to finish sending
 *
 *	TCP_CLOSE		socket is finished
 */
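/*
 * Illustrative only (not part of this file's logic): a minimal,
 * hypothetical userspace sketch of how the close-side states above are
 * reached. shutdown(fd, SHUT_WR) sends our FIN (ESTABLISHED ->
 * FIN_WAIT1), while close() on a socket whose peer already sent its
 * FIN walks CLOSE_WAIT -> LAST_ACK. Error handling omitted:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	connect(fd, (struct sockaddr *)&peer, sizeof(peer)); // ESTABLISHED
 *	shutdown(fd, SHUT_WR);	// our FIN: ESTABLISHED -> FIN_WAIT1
 *	char buf[512];
 *	while (read(fd, buf, sizeof(buf)) > 0)
 *		;		// drain until peer FIN: FIN_WAIT2 -> TIME_WAIT
 *	close(fd);
 */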
#include <linux/config.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/smp_lock.h>
#include <linux/fs.h>
#include <linux/random.h>
#include <linux/bootmem.h>

#include <net/icmp.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/ip.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>
int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;

DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly;

atomic_t tcp_orphan_count = ATOMIC_INIT(0);

EXPORT_SYMBOL_GPL(tcp_orphan_count);

int sysctl_tcp_mem[3];
int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };

EXPORT_SYMBOL(sysctl_tcp_mem);
EXPORT_SYMBOL(sysctl_tcp_rmem);
EXPORT_SYMBOL(sysctl_tcp_wmem);

atomic_t tcp_memory_allocated;	/* Current allocated memory. */
atomic_t tcp_sockets_allocated;	/* Current number of TCP sockets. */

EXPORT_SYMBOL(tcp_memory_allocated);
EXPORT_SYMBOL(tcp_sockets_allocated);

/*
 * Pressure flag: try to collapse.
 * Technical note: it is used by multiple contexts non atomically.
 * All the sk_stream_mem_schedule() is of this nature: accounting
 * is strict, actions are advisory and have some latency.
 */
int tcp_memory_pressure;

EXPORT_SYMBOL(tcp_memory_pressure);
void tcp_enter_memory_pressure(void)
{
	if (!tcp_memory_pressure) {
		NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
		tcp_memory_pressure = 1;
	}
}

EXPORT_SYMBOL(tcp_enter_memory_pressure);
/*
 *	Wait for a TCP event.
 *
 *	Note that we don't need to lock the socket, as the upper poll layers
 *	take care of normal races (between the test and the event) and we don't
 *	go look at any of the socket buffers directly.
 */
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	unsigned int mask;
	struct sock *sk = sock->sk;
	struct tcp_sock *tp = tcp_sk(sk);

	poll_wait(file, sk->sk_sleep, wait);
	if (sk->sk_state == TCP_LISTEN)
		return inet_csk_listen_poll(sk);

	/* Socket is not locked. We are protected from async events
	   by poll logic and correct handling of state changes
	   made by other threads is impossible in any case.
	 */

	mask = 0;
	if (sk->sk_err)
		mask = POLLERR;

	/*
	 * POLLHUP is certainly not done right. But poll() doesn't
	 * have a notion of HUP in just one direction, and for a
	 * socket the read side is more interesting.
	 *
	 * Some poll() documentation says that POLLHUP is incompatible
	 * with the POLLOUT/POLLWR flags, so somebody should check this
	 * all. But careful, it tends to be safer to return too many
	 * bits than too few, and you can easily break real applications
	 * if you don't tell them that something has hung up!
	 *
	 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
	 * our fs/select.c). It means that after we received EOF,
	 * poll always returns immediately, making impossible poll() on write()
	 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
	 * if and only if shutdown has been made in both directions.
	 * Actually, it is interesting to look how Solaris and DUX
	 * solve this dilemma. I would prefer, if POLLHUP were maskable,
	 * then we could set it on SND_SHUTDOWN. BTW examples given
	 * in Stevens' books assume exactly this behaviour, it explains
	 * why POLLHUP is incompatible with POLLOUT.	--ANK
	 *
	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
	 * blocking on fresh not-connected or disconnected socket. --ANK
	 */
	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
		mask |= POLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= POLLIN | POLLRDNORM;

	/* Connected? */
	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		/* Potential race condition. If read of tp below will
		 * escape above sk->sk_state, we can be illegally awaken
		 * in SYN_* states. */
		if ((tp->rcv_nxt != tp->copied_seq) &&
		    (tp->urg_seq != tp->copied_seq ||
		     tp->rcv_nxt != tp->copied_seq + 1 ||
		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
			mask |= POLLIN | POLLRDNORM;

		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
				mask |= POLLOUT | POLLWRNORM;
			} else {  /* send SIGIO later */
				set_bit(SOCK_ASYNC_NOSPACE,
					&sk->sk_socket->flags);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

				/* Race breaker. If space is freed after
				 * wspace test but before the flags are set,
				 * IO signal will be lost.
				 */
				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
					mask |= POLLOUT | POLLWRNORM;
			}
		}

		if (tp->urg_data & TCP_URG_VALID)
			mask |= POLLPRI;
	}
	return mask;
}
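/*
 * Illustrative only: how a userspace caller typically consumes the mask
 * computed above. Hypothetical sketch; a real program would loop and
 * handle errors:
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI };
 *	if (poll(&pfd, 1, -1) > 0) {
 *		if (pfd.revents & POLLPRI)
 *			;	// urgent byte pending (TCP_URG_VALID)
 *		if (pfd.revents & (POLLIN | POLLRDNORM))
 *			;	// regular data, or EOF after RCV_SHUTDOWN
 *	}
 */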
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int answ;

	switch (cmd) {
	case SIOCINQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		lock_sock(sk);
		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else if (sock_flag(sk, SOCK_URGINLINE) ||
			 !tp->urg_data ||
			 before(tp->urg_seq, tp->copied_seq) ||
			 !before(tp->urg_seq, tp->rcv_nxt)) {
			answ = tp->rcv_nxt - tp->copied_seq;

			/* Subtract 1, if FIN is in queue. */
			if (answ && !skb_queue_empty(&sk->sk_receive_queue))
				answ -=
		       ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
		} else
			answ = tp->urg_seq - tp->copied_seq;
		release_sock(sk);
		break;
	case SIOCATMARK:
		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
		break;
	case SIOCOUTQ:
		if (sk->sk_state == TCP_LISTEN)
			return -EINVAL;

		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
			answ = 0;
		else
			answ = tp->write_seq - tp->snd_una;
		break;
	default:
		return -ENOIOCTLCMD;
	};

	return put_user(answ, (int __user *)arg);
}
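/*
 * Illustrative only: the userspace view of the three requests handled
 * above (hypothetical sketch, error checks omitted):
 *
 *	int n, at_mark, unsent;
 *	ioctl(fd, SIOCINQ, &n);		 // readable bytes, urgent excluded
 *	ioctl(fd, SIOCATMARK, &at_mark); // 1 if read is at the urgent mark
 *	ioctl(fd, SIOCOUTQ, &unsent);	 // bytes queued but not yet acked
 */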
static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
	tp->pushed_seq = tp->write_seq;
}

static inline int forced_push(struct tcp_sock *tp)
{
	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}
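/* Worked example of the heuristic above (illustrative): with a peer
 * max_window of 64KB, forced_push() fires once more than 32KB has been
 * queued since the last PSH, so a bulk sender still marks a push at
 * least every half window even when nothing else forces one.
 */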
static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
			      struct sk_buff *skb)
{
	skb->csum = 0;
	TCP_SKB_CB(skb)->seq = tp->write_seq;
	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
	TCP_SKB_CB(skb)->sacked = 0;
	skb_header_release(skb);
	__skb_queue_tail(&sk->sk_write_queue, skb);
	sk_charge_skb(sk, skb);
	if (!sk->sk_send_head)
		sk->sk_send_head = skb;
	if (tp->nonagle & TCP_NAGLE_PUSH)
		tp->nonagle &= ~TCP_NAGLE_PUSH;
}
static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
				struct sk_buff *skb)
{
	if (flags & MSG_OOB) {
		tp->urg_mode = 1;
		tp->snd_up = tp->write_seq;
		TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
	}
}

static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
			    int mss_now, int nonagle)
{
	if (sk->sk_send_head) {
		struct sk_buff *skb = sk->sk_write_queue.prev;
		if (!(flags & MSG_MORE) || forced_push(tp))
			tcp_mark_push(tp, skb);
		tcp_mark_urg(tp, flags, skb);
		__tcp_push_pending_frames(sk, tp, mss_now,
					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
	}
}
static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
				size_t psize, int flags)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int mss_now, size_goal;
	int err;
	ssize_t copied;
	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/* Wait for a connection to finish. */
	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto out_err;

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
	size_goal = tp->xmit_size_goal;
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto do_error;

	while (psize > 0) {
		struct sk_buff *skb = sk->sk_write_queue.prev;
		struct page *page = pages[poffset / PAGE_SIZE];
		int copy, i, can_coalesce;
		int offset = poffset % PAGE_SIZE;
		int size = min_t(size_t, psize, PAGE_SIZE - offset);

		if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
new_segment:
			if (!sk_stream_memory_free(sk))
				goto wait_for_sndbuf;

			skb = sk_stream_alloc_pskb(sk, 0, 0,
						   sk->sk_allocation);
			if (!skb)
				goto wait_for_memory;

			skb_entail(sk, tp, skb);
			copy = size_goal;
		}

		if (copy > size)
			copy = size;

		i = skb_shinfo(skb)->nr_frags;
		can_coalesce = skb_can_coalesce(skb, i, page, offset);
		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
			tcp_mark_push(tp, skb);
			goto new_segment;
		}
		if (sk->sk_forward_alloc < copy &&
		    !sk_stream_mem_schedule(sk, copy, 0))
			goto wait_for_memory;

		if (can_coalesce) {
			skb_shinfo(skb)->frags[i - 1].size += copy;
		} else {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, copy);
		}

		skb->len += copy;
		skb->data_len += copy;
		skb->truesize += copy;
		sk->sk_wmem_queued += copy;
		sk->sk_forward_alloc -= copy;
		skb->ip_summed = CHECKSUM_HW;
		tp->write_seq += copy;
		TCP_SKB_CB(skb)->end_seq += copy;
		skb_shinfo(skb)->tso_segs = 0;

		if (!copied)
			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;

		copied += copy;
		poffset += copy;
		if (!(psize -= copy))
			goto out;

		if (skb->len < mss_now || (flags & MSG_OOB))
			continue;

		if (forced_push(tp)) {
			tcp_mark_push(tp, skb);
			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
		} else if (skb == sk->sk_send_head)
			tcp_push_one(sk, mss_now);
		continue;

wait_for_sndbuf:
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
		if (copied)
			tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
			goto do_error;

		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
		size_goal = tp->xmit_size_goal;
	}

out:
	if (copied)
		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
	return copied;

do_error:
	if (copied)
		goto out;
out_err:
	return sk_stream_error(sk, flags, err);
}
ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
		     size_t size, int flags)
{
	ssize_t res;
	struct sock *sk = sock->sk;

#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)

	if (!(sk->sk_route_caps & NETIF_F_SG) ||
	    !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
		return sock_no_sendpage(sock, page, offset, size, flags);

#undef TCP_ZC_CSUM_FLAGS

	lock_sock(sk);
	TCP_CHECK_TIMER(sk);
	res = do_tcp_sendpages(sk, &page, offset, size, flags);
	TCP_CHECK_TIMER(sk);
	release_sock(sk);
	return res;
}
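/*
 * Illustrative only: tcp_sendpage() is what backs zero-copy sends such
 * as sendfile(2) on a TCP socket. Hypothetical userspace sketch:
 *
 *	#include <sys/sendfile.h>
 *	off_t off = 0;
 *	ssize_t n = sendfile(sock_fd, file_fd, &off, st.st_size);
 *	// falls back to the copying path (sock_no_sendpage) when the
 *	// route lacks SG or checksum offload, as checked above
 */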
#define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
#define TCP_OFF(sk)	(sk->sk_sndmsg_off)

static inline int select_size(struct sock *sk, struct tcp_sock *tp)
{
	int tmp = tp->mss_cache;

	if (sk->sk_route_caps & NETIF_F_SG) {
		if (sk->sk_route_caps & NETIF_F_TSO)
			tmp = 0;
		else {
			int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);

			if (tmp >= pgbreak &&
			    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
				tmp = pgbreak;
		}
	}

	return tmp;
}
int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t size)
{
	struct iovec *iov;
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int iovlen, flags;
	int mss_now, size_goal;
	int err, copied;
	long timeo;

	lock_sock(sk);
	TCP_CHECK_TIMER(sk);

	flags = msg->msg_flags;
	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);

	/* Wait for a connection to finish. */
	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
			goto out_err;

	/* This should be in poll */
	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
	size_goal = tp->xmit_size_goal;

	/* Ok commence sending. */
	iovlen = msg->msg_iovlen;
	iov = msg->msg_iov;
	copied = 0;

	err = -EPIPE;
	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
		goto do_error;

	while (--iovlen >= 0) {
		int seglen = iov->iov_len;
		unsigned char __user *from = iov->iov_base;

		iov++;

		while (seglen > 0) {
			int copy;

			skb = sk->sk_write_queue.prev;

			if (!sk->sk_send_head ||
			    (copy = size_goal - skb->len) <= 0) {

new_segment:
				/* Allocate new segment. If the interface is SG,
				 * allocate skb fitting to single page.
				 */
				if (!sk_stream_memory_free(sk))
					goto wait_for_sndbuf;

				skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
							   0, sk->sk_allocation);
				if (!skb)
					goto wait_for_memory;

				/*
				 * Check whether we can use HW checksum.
				 */
				if (sk->sk_route_caps &
				    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
				     NETIF_F_HW_CSUM))
					skb->ip_summed = CHECKSUM_HW;

				skb_entail(sk, tp, skb);
				copy = size_goal;
			}

			/* Try to append data to the end of skb. */
			if (copy > seglen)
				copy = seglen;

			/* Where to copy to? */
			if (skb_tailroom(skb) > 0) {
				/* We have some space in skb head. Superb! */
				if (copy > skb_tailroom(skb))
					copy = skb_tailroom(skb);
				if ((err = skb_add_data(skb, from, copy)) != 0)
					goto do_fault;
			} else {
				int merge = 0;
				int i = skb_shinfo(skb)->nr_frags;
				struct page *page = TCP_PAGE(sk);
				int off = TCP_OFF(sk);

				if (skb_can_coalesce(skb, i, page, off) &&
				    off != PAGE_SIZE) {
					/* We can extend the last page
					 * fragment. */
					merge = 1;
				} else if (i == MAX_SKB_FRAGS ||
					   (!i &&
					    !(sk->sk_route_caps & NETIF_F_SG))) {
					/* Need to add new fragment and cannot
					 * do this because interface is non-SG,
					 * or because all the page slots are
					 * busy. */
					tcp_mark_push(tp, skb);
					goto new_segment;
				} else if (page) {
					if (off == PAGE_SIZE) {
						put_page(page);
						TCP_PAGE(sk) = page = NULL;
						off = 0;
					}
				} else
					off = 0;

				if (!page) {
					/* Allocate new cache page. */
					if (!(page = sk_stream_alloc_page(sk)))
						goto wait_for_memory;
				}

				if (copy > PAGE_SIZE - off)
					copy = PAGE_SIZE - off;

				/* Time to copy data. We are close to
				 * the end! */
				err = skb_copy_to_page(sk, from, skb, page,
						       off, copy);
				if (err) {
					/* If this page was new, give it to the
					 * socket so it does not get leaked.
					 */
					if (!TCP_PAGE(sk)) {
						TCP_PAGE(sk) = page;
						TCP_OFF(sk) = 0;
					}
					goto do_error;
				}

				/* Update the skb. */
				if (merge) {
					skb_shinfo(skb)->frags[i - 1].size +=
									copy;
				} else {
					skb_fill_page_desc(skb, i, page, off, copy);
					if (TCP_PAGE(sk)) {
						get_page(page);
					} else if (off + copy < PAGE_SIZE) {
						get_page(page);
						TCP_PAGE(sk) = page;
					}
				}

				TCP_OFF(sk) = off + copy;
			}

			if (!copied)
				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;

			tp->write_seq += copy;
			TCP_SKB_CB(skb)->end_seq += copy;
			skb_shinfo(skb)->tso_segs = 0;

			from += copy;
			copied += copy;
			if ((seglen -= copy) == 0 && iovlen == 0)
				goto out;

			if (skb->len < mss_now || (flags & MSG_OOB))
				continue;

			if (forced_push(tp)) {
				tcp_mark_push(tp, skb);
				__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
			} else if (skb == sk->sk_send_head)
				tcp_push_one(sk, mss_now);
			continue;

wait_for_sndbuf:
			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
wait_for_memory:
			if (copied)
				tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);

			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
				goto do_error;

			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
			size_goal = tp->xmit_size_goal;
		}
	}

out:
	if (copied)
		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
	TCP_CHECK_TIMER(sk);
	release_sock(sk);
	return copied;

do_fault:
	if (!skb->len) {
		if (sk->sk_send_head == skb)
			sk->sk_send_head = NULL;
		__skb_unlink(skb, &sk->sk_write_queue);
		sk_stream_free_skb(sk, skb);
	}

do_error:
	if (copied)
		goto out;
out_err:
	err = sk_stream_error(sk, flags, err);
	TCP_CHECK_TIMER(sk);
	release_sock(sk);
	return err;
}
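/*
 * Illustrative only: a userspace sendmsg() call that exercises the
 * iovec walk above (hypothetical sketch, error handling omitted):
 *
 *	struct iovec iov[2] = {
 *		{ .iov_base = hdr,  .iov_len = hdrlen },
 *		{ .iov_base = body, .iov_len = bodylen },
 *	};
 *	struct msghdr mh = { .msg_iov = iov, .msg_iovlen = 2 };
 *	sendmsg(fd, &mh, 0);	// segments coalesced up to size_goal
 */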
/*
 *	Handle reading urgent data. BSD has very simple semantics for
 *	this, no blocking and very strange errors 8)
 */

static int tcp_recv_urg(struct sock *sk, long timeo,
			struct msghdr *msg, int len, int flags,
			int *addr_len)
{
	struct tcp_sock *tp = tcp_sk(sk);

	/* No URG data to read. */
	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
	    tp->urg_data == TCP_URG_READ)
		return -EINVAL;	/* Yes this is right ! */

	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
		return -ENOTCONN;

	if (tp->urg_data & TCP_URG_VALID) {
		int err = 0;
		char c = tp->urg_data;

		if (!(flags & MSG_PEEK))
			tp->urg_data = TCP_URG_READ;

		/* Read urgent data. */
		msg->msg_flags |= MSG_OOB;

		if (len > 0) {
			if (!(flags & MSG_TRUNC))
				err = memcpy_toiovec(msg->msg_iov, &c, 1);
			len = 1;
		} else
			msg->msg_flags |= MSG_TRUNC;

		return err ? -EFAULT : len;
	}

	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
		return 0;

	/* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
	 * the available implementations agree in this case:
	 * this call should never block, independent of the
	 * blocking state of the socket.
	 * Mike <pall@rz.uni-karlsruhe.de>
	 */
	return -EAGAIN;
}
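/*
 * Illustrative only: the userspace counterpart of the OOB handling
 * above (hypothetical sketch). With SO_OOBINLINE clear, the single
 * urgent byte is fetched out of band:
 *
 *	char oob;
 *	if (recv(fd, &oob, 1, MSG_OOB) == 1)
 *		;	// got the urgent byte; EINVAL if none is pending
 */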
/* Clean up the receive buffer for full frames taken by the user,
 * then send an ACK if necessary.  COPIED is the number of bytes
 * tcp_recvmsg has given to the user so far, it speeds up the
 * calculation of whether or not we must ACK for the sake of
 * a window update.
 */
static void cleanup_rbuf(struct sock *sk, int copied)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int time_to_ack = 0;

#if TCP_DEBUG
	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);

	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
#endif

	if (inet_csk_ack_scheduled(sk)) {
		const struct inet_connection_sock *icsk = inet_csk(sk);
		   /* Delayed ACKs frequently hit locked sockets during bulk
		    * receive. */
		if (icsk->icsk_ack.blocked ||
		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
		    tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
		    /*
		     * If this read emptied read buffer, we send ACK, if
		     * connection is not bidirectional, user drained
		     * receive buffer and there was a small segment
		     * in queue.
		     */
		    (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
		     !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
			time_to_ack = 1;
	}

	/* We send an ACK if we can now advertise a non-zero window
	 * which has been raised "significantly".
	 *
	 * Even if window raised up to infinity, do not send window open ACK
	 * in states, where we will not receive more. It is useless.
	 */
	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
		__u32 rcv_window_now = tcp_receive_window(tp);

		/* Optimize, __tcp_select_window() is not cheap. */
		if (2*rcv_window_now <= tp->window_clamp) {
			__u32 new_window = __tcp_select_window(sk);

			/* Send ACK now, if this read freed lots of space
			 * in our buffer. Certainly, new_window is new window.
			 * We can advertise it now, if it is not less than current one.
			 * "Lots" means "at least twice" here.
			 */
			if (new_window && new_window >= 2 * rcv_window_now)
				time_to_ack = 1;
		}
	}
	if (time_to_ack)
		tcp_send_ack(sk);
}
static void tcp_prequeue_process(struct sock *sk)
{
	struct sk_buff *skb;
	struct tcp_sock *tp = tcp_sk(sk);

	NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED);

	/* RX process wants to run with disabled BHs, though it is not
	 * necessary */
	local_bh_disable();
	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
		sk->sk_backlog_rcv(sk, skb);
	local_bh_enable();

	/* Clear memory counter. */
	tp->ucopy.memory = 0;
}
static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
	struct sk_buff *skb;
	u32 offset;

	skb_queue_walk(&sk->sk_receive_queue, skb) {
		offset = seq - TCP_SKB_CB(skb)->seq;
		if (skb->h.th->syn)
			offset--;
		if (offset < skb->len || skb->h.th->fin) {
			*off = offset;
			return skb;
		}
	}
	return NULL;
}
/*
 * This routine provides an alternative to tcp_recvmsg() for routines
 * that would like to handle copying from skbuffs directly in 'sendfile'
 * fashion.
 * Note:
 *	- It is assumed that the socket was locked by the caller.
 *	- The routine does not block.
 *	- At present, there is no support for reading OOB data
 *	  or for 'peeking' the socket using this routine
 *	  (although both would be easy to implement).
 */
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
		  sk_read_actor_t recv_actor)
{
	struct sk_buff *skb;
	struct tcp_sock *tp = tcp_sk(sk);
	u32 seq = tp->copied_seq;
	u32 offset;
	int copied = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -ENOTCONN;
	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
		if (offset < skb->len) {
			size_t used, len;

			len = skb->len - offset;
			/* Stop reading if we hit a patch of urgent data */
			if (tp->urg_data) {
				u32 urg_offset = tp->urg_seq - seq;
				if (urg_offset < len)
					len = urg_offset;
				if (!len)
					break;
			}
			used = recv_actor(desc, skb, offset, len);
			if (used <= len) {
				seq += used;
				copied += used;
				offset += used;
			}
			if (offset != skb->len)
				break;
		}
		if (skb->h.th->fin) {
			sk_eat_skb(sk, skb);
			++seq;
			break;
		}
		sk_eat_skb(sk, skb);
		if (!desc->count)
			break;
	}
	tp->copied_seq = seq;

	tcp_rcv_space_adjust(sk);

	/* Clean up data we have read: This will do ACK frames. */
	if (copied)
		cleanup_rbuf(sk, copied);
	return copied;
}
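/*
 * Illustrative only: a minimal, hypothetical recv_actor such as a
 * caller of tcp_read_sock() might supply, here merely counting the
 * bytes it consumes. Real users copy or splice the data somewhere:
 *
 *	static int count_actor(read_descriptor_t *desc, struct sk_buff *skb,
 *			       unsigned int offset, size_t len)
 *	{
 *		size_t take = min(len, (size_t)desc->count);
 *
 *		desc->count -= take;
 *		desc->written += take;
 *		return take;	// bytes consumed from this skb
 *	}
 */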
/*
 *	This routine copies from a sock struct into the user buffer.
 *
 *	Technical note: in 2.3 we work on _locked_ socket, so that
 *	tricks with *seq access order and skb->users are not required.
 *	Probably, code can be easily improved even more.
 */

int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t len, int nonblock, int flags, int *addr_len)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int copied = 0;
	u32 peek_seq;
	u32 *seq;
	unsigned long used;
	int err;
	int target;		/* Read at least this many bytes */
	long timeo;
	struct task_struct *user_recv = NULL;

	lock_sock(sk);

	TCP_CHECK_TIMER(sk);

	err = -ENOTCONN;
	if (sk->sk_state == TCP_LISTEN)
		goto out;

	timeo = sock_rcvtimeo(sk, nonblock);

	/* Urgent data needs to be handled specially. */
	if (flags & MSG_OOB)
		goto recv_urg;

	seq = &tp->copied_seq;
	if (flags & MSG_PEEK) {
		peek_seq = tp->copied_seq;
		seq = &peek_seq;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);

	do {
		struct sk_buff *skb;
		u32 offset;

		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
		if (tp->urg_data && tp->urg_seq == *seq) {
			if (copied)
				break;
			if (signal_pending(current)) {
				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
				break;
			}
		}

		/* Next get a buffer. */

		skb = skb_peek(&sk->sk_receive_queue);
		do {
			if (!skb)
				break;

			/* Now that we have two receive queues this
			 * shouldn't happen.
			 */
			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
				printk(KERN_INFO "recvmsg bug: copied %X "
				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
				break;
			}
			offset = *seq - TCP_SKB_CB(skb)->seq;
			if (skb->h.th->syn)
				offset--;
			if (offset < skb->len)
				goto found_ok_skb;
			if (skb->h.th->fin)
				goto found_fin_ok;
			BUG_TRAP(flags & MSG_PEEK);
			skb = skb->next;
		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);

		/* Well, if we have backlog, try to process it now. */

		if (copied >= target && !sk->sk_backlog.tail)
			break;

		if (copied) {
			if (sk->sk_err ||
			    sk->sk_state == TCP_CLOSE ||
			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
			    !timeo ||
			    signal_pending(current) ||
			    (flags & MSG_PEEK))
				break;
		} else {
			if (sock_flag(sk, SOCK_DONE))
				break;

			if (sk->sk_err) {
				copied = sock_error(sk);
				break;
			}

			if (sk->sk_shutdown & RCV_SHUTDOWN)
				break;

			if (sk->sk_state == TCP_CLOSE) {
				if (!sock_flag(sk, SOCK_DONE)) {
					/* This occurs when user tries to read
					 * from never connected socket.
					 */
					copied = -ENOTCONN;
					break;
				}
				break;
			}

			if (!timeo) {
				copied = -EAGAIN;
				break;
			}

			if (signal_pending(current)) {
				copied = sock_intr_errno(timeo);
				break;
			}
		}

		cleanup_rbuf(sk, copied);

		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
			/* Install new reader */
			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
				user_recv = current;
				tp->ucopy.task = user_recv;
				tp->ucopy.iov = msg->msg_iov;
			}

			tp->ucopy.len = len;

			BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
				 (flags & (MSG_PEEK | MSG_TRUNC)));

			/* Ugly... If prequeue is not empty, we have to
			 * process it before releasing socket, otherwise
			 * order will be broken at second iteration.
			 * More elegant solution is required!!!
			 *
			 * Look: we have the following (pseudo)queues:
			 *
			 * 1. packets in flight
			 * 2. backlog
			 * 3. prequeue
			 * 4. receive_queue
			 *
			 * Each queue can be processed only if the next ones
			 * are empty. At this point we have empty receive_queue.
			 * But prequeue _can_ be not empty after 2nd iteration,
			 * when we jumped to start of loop because backlog
			 * processing added something to receive_queue.
			 * We cannot release_sock(), because backlog contains
			 * packets arrived _after_ prequeued ones.
			 *
			 * Shortly, algorithm is clear --- to process all
			 * the queues in order. We could make it more directly,
			 * requeueing packets from backlog to prequeue, if
			 * is not empty. It is more elegant, but eats cycles,
			 * unfortunately.
			 */
			if (!skb_queue_empty(&tp->ucopy.prequeue))
				goto do_prequeue;

			/* __ Set realtime policy in scheduler __ */
		}

		if (copied >= target) {
			/* Do not sleep, just process backlog. */
			release_sock(sk);
			lock_sock(sk);
		} else
			sk_wait_data(sk, &timeo);

		if (user_recv) {
			int chunk;

			/* __ Restore normal policy in scheduler __ */

			if ((chunk = len - tp->ucopy.len) != 0) {
				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
				len -= chunk;
				copied += chunk;
			}

			if (tp->rcv_nxt == tp->copied_seq &&
			    !skb_queue_empty(&tp->ucopy.prequeue)) {
do_prequeue:
				tcp_prequeue_process(sk);

				if ((chunk = len - tp->ucopy.len) != 0) {
					NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
					len -= chunk;
					copied += chunk;
				}
			}
		}
		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
			if (net_ratelimit())
				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
				       current->comm, current->pid);
			peek_seq = tp->copied_seq;
		}
		continue;

	found_ok_skb:
		/* Ok so how much can we use? */
		used = skb->len - offset;
		if (len < used)
			used = len;

		/* Do we have urgent data here? */
		if (tp->urg_data) {
			u32 urg_offset = tp->urg_seq - *seq;
			if (urg_offset < used) {
				if (!urg_offset) {
					if (!sock_flag(sk, SOCK_URGINLINE)) {
						++*seq;
						offset++;
						used--;
						if (!used)
							goto skip_copy;
					}
				} else
					used = urg_offset;
			}
		}

		if (!(flags & MSG_TRUNC)) {
			err = skb_copy_datagram_iovec(skb, offset,
						      msg->msg_iov, used);
			if (err) {
				/* Exception. Bailout! */
				if (!copied)
					copied = -EFAULT;
				break;
			}
		}

		*seq += used;
		copied += used;
		len -= used;

		tcp_rcv_space_adjust(sk);

skip_copy:
		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
			tp->urg_data = 0;
			tcp_fast_path_check(sk, tp);
		}
		if (used + offset < skb->len)
			continue;

		if (skb->h.th->fin)
			goto found_fin_ok;
		if (!(flags & MSG_PEEK))
			sk_eat_skb(sk, skb);
		continue;

	found_fin_ok:
		/* Process the FIN. */
		++*seq;
		if (!(flags & MSG_PEEK))
			sk_eat_skb(sk, skb);
		break;
	} while (len > 0);

	if (user_recv) {
		if (!skb_queue_empty(&tp->ucopy.prequeue)) {
			int chunk;

			tp->ucopy.len = copied > 0 ? len : 0;

			tcp_prequeue_process(sk);

			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
				len -= chunk;
				copied += chunk;
			}
		}

		tp->ucopy.task = NULL;
		tp->ucopy.len = 0;
	}

	/* According to UNIX98, msg_name/msg_namelen are ignored
	 * on connected socket. I was just happy when found this 8) --ANK
	 */

	/* Clean up data we have read: This will do ACK frames. */
	cleanup_rbuf(sk, copied);

	TCP_CHECK_TIMER(sk);
	release_sock(sk);
	return copied;

out:
	TCP_CHECK_TIMER(sk);
	release_sock(sk);
	return err;

recv_urg:
	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
	goto out;
}
/*
 *	State processing on a close. This implements the state shift for
 *	sending our FIN frame. Note that we only send a FIN for some
 *	states. A shutdown() may have already sent the FIN, or we may be
 *	in CLOSING.
 */

static unsigned char new_state[16] = {
  /* current state:        new state:      action:	*/
  /* (Invalid)		*/ TCP_CLOSE,
  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_SYN_SENT	*/ TCP_CLOSE,
  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
  /* TCP_TIME_WAIT	*/ TCP_CLOSE,
  /* TCP_CLOSE		*/ TCP_CLOSE,
  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
  /* TCP_LISTEN		*/ TCP_CLOSE,
  /* TCP_CLOSING	*/ TCP_CLOSING,
};

static int tcp_close_state(struct sock *sk)
{
	int next = (int)new_state[sk->sk_state];
	int ns = next & TCP_STATE_MASK;

	tcp_set_state(sk, ns);

	return next & TCP_ACTION_FIN;
}
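/* Worked example of the table above (illustrative): closing an
 * ESTABLISHED socket looks up TCP_FIN_WAIT1 | TCP_ACTION_FIN, so
 * tcp_close_state() moves the socket to FIN_WAIT1 and returns nonzero,
 * telling the caller that a FIN must be transmitted.
 */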
/*
 *	Shutdown the sending side of a connection. Much like close except
 *	that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
 */

void tcp_shutdown(struct sock *sk, int how)
{
	/*	We need to grab some memory, and put together a FIN,
	 *	and then put it into the queue to be sent.
	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
	 */
	if (!(how & SEND_SHUTDOWN))
		return;

	/* If we've already sent a FIN, or it's a closed state, skip this. */
	if ((1 << sk->sk_state) &
	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
		/* Clear out any half completed packets.  FIN if needed. */
		if (tcp_close_state(sk))
			tcp_send_fin(sk);
	}
}
void tcp_close(struct sock *sk, long timeout)
{
	struct sk_buff *skb;
	int data_was_unread = 0;

	lock_sock(sk);
	sk->sk_shutdown = SHUTDOWN_MASK;

	if (sk->sk_state == TCP_LISTEN) {
		tcp_set_state(sk, TCP_CLOSE);

		/* Special case. */
		inet_csk_listen_stop(sk);

		goto adjudge_to_death;
	}

	/*  We need to flush the recv. buffs.  We do this only on the
	 *  descriptor close, not protocol-sourced closes, because the
	 *  reader process may not have drained the data yet!
	 */
	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
			  skb->h.th->fin;
		data_was_unread += len;
		__kfree_skb(skb);
	}

	sk_stream_mem_reclaim(sk);

	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
	 * 3.10, we send a RST here because data was lost.  To
	 * witness the awful effects of the old behavior of always
	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
	 * a bulk GET in an FTP client, suspend the process, wait
	 * for the client to advertise a zero window, then kill -9
	 * the FTP client, wheee...  Note: timeout is always zero
	 * in such a case.
	 */
	if (data_was_unread) {
		/* Unread data was tossed, zap the connection. */
		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
		tcp_set_state(sk, TCP_CLOSE);
		tcp_send_active_reset(sk, GFP_KERNEL);
	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
		/* Check zero linger _after_ checking for unread data. */
		sk->sk_prot->disconnect(sk, 0);
		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
	} else if (tcp_close_state(sk)) {
		/* We FIN if the application ate all the data before
		 * zapping the connection.
		 */

		/* RED-PEN. Formally speaking, we have broken TCP state
		 * machine. State transitions:
		 *
		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
		 *
		 * are legal only when FIN has been sent (i.e. in window),
		 * rather than queued out of window. Purists blame.
		 *
		 * F.e. "RFC state" is ESTABLISHED,
		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
		 *
		 * The visible declinations are that sometimes
		 * we enter time-wait state, when it is not required really
		 * (harmless), do not send active resets, when they are
		 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
		 * they look as CLOSING or LAST_ACK for Linux)
		 * Probably, I missed some more holelets.
		 * 						--ANK
		 */
		tcp_send_fin(sk);
	}

	sk_stream_wait_close(sk, timeout);

adjudge_to_death:
	/* It is the last release_sock in its life. It will remove backlog. */
	release_sock(sk);

	/* Now socket is owned by kernel and we acquire BH lock
	   to finish close. No need to check for user refs.
	 */
	local_bh_disable();
	bh_lock_sock(sk);
	BUG_TRAP(!sock_owned_by_user(sk));

	sock_hold(sk);
	sock_orphan(sk);

	/*	This is a (useful) BSD violation of the RFC. There is a
	 *	problem with TCP as specified in that the other end could
	 *	keep a socket open forever with no application left this end.
	 *	We use a 3 minute timeout (about the same as BSD) then kill
	 *	our end. If they send after that then tough - BUT: long enough
	 *	that we won't make the old 4*rto = almost no time - whoops
	 *	reset mistake.
	 *
	 *	Nope, it was not mistake. It is really desired behaviour
	 *	f.e. on http servers, when such sockets are useless, but
	 *	consume significant resources. Let's do it with special
	 *	linger2	option.					--ANK
	 */

	if (sk->sk_state == TCP_FIN_WAIT2) {
		struct tcp_sock *tp = tcp_sk(sk);
		if (tp->linger2 < 0) {
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
		} else {
			const int tmo = tcp_fin_time(sk);

			if (tmo > TCP_TIMEWAIT_LEN) {
				inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk));
			} else {
				atomic_inc(sk->sk_prot->orphan_count);
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
	}
	if (sk->sk_state != TCP_CLOSE) {
		sk_stream_mem_reclaim(sk);
		if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans ||
		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
			if (net_ratelimit())
				printk(KERN_INFO "TCP: too many orphaned "
				       "sockets\n");
			tcp_set_state(sk, TCP_CLOSE);
			tcp_send_active_reset(sk, GFP_ATOMIC);
			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
		}
	}
	atomic_inc(sk->sk_prot->orphan_count);

	if (sk->sk_state == TCP_CLOSE)
		inet_csk_destroy_sock(sk);
	/* Otherwise, socket is reprieved until protocol close. */

out:
	bh_unlock_sock(sk);
	local_bh_enable();
	sock_put(sk);
}
/* These states need RST on ABORT according to RFC793 */

static inline int tcp_need_reset(int state)
{
	return (1 << state) &
	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
}
int tcp_disconnect(struct sock *sk, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;
	int old_state = sk->sk_state;

	if (old_state != TCP_CLOSE)
		tcp_set_state(sk, TCP_CLOSE);

	/* ABORT function of RFC793 */
	if (old_state == TCP_LISTEN) {
		inet_csk_listen_stop(sk);
	} else if (tcp_need_reset(old_state) ||
		   (tp->snd_nxt != tp->write_seq &&
		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
		/* The last check adjusts for discrepance of Linux wrt. RFC
		 * states
		 */
		tcp_send_active_reset(sk, gfp_any());
		sk->sk_err = ECONNRESET;
	} else if (old_state == TCP_SYN_SENT)
		sk->sk_err = ECONNRESET;

	tcp_clear_xmit_timers(sk);
	__skb_queue_purge(&sk->sk_receive_queue);
	sk_stream_writequeue_purge(sk);
	__skb_queue_purge(&tp->out_of_order_queue);

	inet->dport = 0;

	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
		inet_reset_saddr(sk);

	sk->sk_shutdown = 0;
	sock_reset_flag(sk, SOCK_DONE);
	tp->srtt = 0;
	if ((tp->write_seq += tp->max_window + 2) == 0)
		tp->write_seq = 1;
	icsk->icsk_backoff = 0;
	tp->snd_cwnd = 2;
	icsk->icsk_probes_out = 0;
	tp->packets_out = 0;
	tp->snd_ssthresh = 0x7fffffff;
	tp->snd_cwnd_cnt = 0;
	tcp_set_ca_state(sk, TCP_CA_Open);
	tcp_clear_retrans(tp);
	inet_csk_delack_init(sk);
	sk->sk_send_head = NULL;
	tp->rx_opt.saw_tstamp = 0;
	tcp_sack_reset(&tp->rx_opt);
	__sk_dst_reset(sk);

	BUG_TRAP(!inet->num || icsk->icsk_bind_hash);

	sk->sk_error_report(sk);
	return err;
}
/*
 *	Socket option code for TCP.
 */
int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
		   int optlen)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	int val;
	int err = 0;

	if (level != SOL_TCP)
		return tp->af_specific->setsockopt(sk, level, optname,
						   optval, optlen);

	/* This is a string value all the others are int's */
	if (optname == TCP_CONGESTION) {
		char name[TCP_CA_NAME_MAX];

		if (optlen < 1)
			return -EINVAL;

		val = strncpy_from_user(name, optval,
					min(TCP_CA_NAME_MAX-1, optlen));
		if (val < 0)
			return -EFAULT;
		name[val] = 0;

		lock_sock(sk);
		err = tcp_set_congestion_control(sk, name);
		release_sock(sk);
		return err;
	}

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	lock_sock(sk);

	switch (optname) {
	case TCP_MAXSEG:
		/* Values greater than interface MTU won't take effect. However
		 * at the point when this call is done we typically don't yet
		 * know which interface is going to be used */
		if (val < 8 || val > MAX_TCP_WINDOW) {
			err = -EINVAL;
			break;
		}
		tp->rx_opt.user_mss = val;
		break;

	case TCP_NODELAY:
		if (val) {
			/* TCP_NODELAY is weaker than TCP_CORK, so that
			 * this option on corked socket is remembered, but
			 * it is not activated until cork is cleared.
			 *
			 * However, when TCP_NODELAY is set we make
			 * an explicit push, which overrides even TCP_CORK
			 * for currently queued segments.
			 */
			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
			tcp_push_pending_frames(sk, tp);
		} else {
			tp->nonagle &= ~TCP_NAGLE_OFF;
		}
		break;

	case TCP_CORK:
		/* When set indicates to always queue non-full frames.
		 * Later the user clears this option and we transmit
		 * any pending partial frames in the queue.  This is
		 * meant to be used alongside sendfile() to get properly
		 * filled frames when the user (for example) must write
		 * out headers with a write() call first and then use
		 * sendfile to send out the data parts.
		 *
		 * TCP_CORK can be set together with TCP_NODELAY and it is
		 * stronger than TCP_NODELAY.
		 */
		if (val) {
			tp->nonagle |= TCP_NAGLE_CORK;
		} else {
			tp->nonagle &= ~TCP_NAGLE_CORK;
			if (tp->nonagle&TCP_NAGLE_OFF)
				tp->nonagle |= TCP_NAGLE_PUSH;
			tcp_push_pending_frames(sk, tp);
		}
		break;
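	/*
	 * Illustrative only: the header-then-payload pattern the
	 * comment above describes, as hypothetical userspace code:
	 *
	 *	int on = 1, off = 0;
	 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
	 *	write(fd, hdr, hdrlen);		 // queued, not pushed
	 *	sendfile(fd, filefd, NULL, len); // still corked
	 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
	 */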
	case TCP_KEEPIDLE:
		if (val < 1 || val > MAX_TCP_KEEPIDLE)
			err = -EINVAL;
		else {
			tp->keepalive_time = val * HZ;
			if (sock_flag(sk, SOCK_KEEPOPEN) &&
			    !((1 << sk->sk_state) &
			      (TCPF_CLOSE | TCPF_LISTEN))) {
				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
				if (tp->keepalive_time > elapsed)
					elapsed = tp->keepalive_time - elapsed;
				else
					elapsed = 0;
				inet_csk_reset_keepalive_timer(sk, elapsed);
			}
		}
		break;
	case TCP_KEEPINTVL:
		if (val < 1 || val > MAX_TCP_KEEPINTVL)
			err = -EINVAL;
		else
			tp->keepalive_intvl = val * HZ;
		break;
	case TCP_KEEPCNT:
		if (val < 1 || val > MAX_TCP_KEEPCNT)
			err = -EINVAL;
		else
			tp->keepalive_probes = val;
		break;
	case TCP_SYNCNT:
		if (val < 1 || val > MAX_TCP_SYNCNT)
			err = -EINVAL;
		else
			icsk->icsk_syn_retries = val;
		break;

	case TCP_LINGER2:
		if (val < 0)
			tp->linger2 = -1;
		else if (val > sysctl_tcp_fin_timeout / HZ)
			tp->linger2 = 0;
		else
			tp->linger2 = val * HZ;
		break;

	case TCP_DEFER_ACCEPT:
		icsk->icsk_accept_queue.rskq_defer_accept = 0;
		if (val > 0) {
			/* Translate value in seconds to number of
			 * retransmits */
			while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
			       val > ((TCP_TIMEOUT_INIT / HZ) <<
				      icsk->icsk_accept_queue.rskq_defer_accept))
				icsk->icsk_accept_queue.rskq_defer_accept++;
			icsk->icsk_accept_queue.rskq_defer_accept++;
		}
		break;
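	/* Worked example of the translation above (illustrative): with
	 * TCP_TIMEOUT_INIT/HZ == 3, val = 30 seconds runs the loop until
	 * 3 << count >= 30, leaving count = 4, and the final increment
	 * stores 5 retransmit periods.
	 */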
	case TCP_WINDOW_CLAMP:
		if (!val) {
			if (sk->sk_state != TCP_CLOSE) {
				err = -EINVAL;
				break;
			}
			tp->window_clamp = 0;
		} else
			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
						SOCK_MIN_RCVBUF / 2 : val;
		break;

	case TCP_QUICKACK:
		if (!val) {
			icsk->icsk_ack.pingpong = 1;
		} else {
			icsk->icsk_ack.pingpong = 0;
			if ((1 << sk->sk_state) &
			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
			    inet_csk_ack_scheduled(sk)) {
				icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
				cleanup_rbuf(sk, 1);
				if (!(val & 1))
					icsk->icsk_ack.pingpong = 1;
			}
		}
		break;

	default:
		err = -ENOPROTOOPT;
		break;
	};
	release_sock(sk);
	return err;
}
/* Return information about state of tcp endpoint in API format. */
void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	u32 now = tcp_time_stamp;

	memset(info, 0, sizeof(*info));

	info->tcpi_state = sk->sk_state;
	info->tcpi_ca_state = icsk->icsk_ca_state;
	info->tcpi_retransmits = icsk->icsk_retransmits;
	info->tcpi_probes = icsk->icsk_probes_out;
	info->tcpi_backoff = icsk->icsk_backoff;

	if (tp->rx_opt.tstamp_ok)
		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
	if (tp->rx_opt.sack_ok)
		info->tcpi_options |= TCPI_OPT_SACK;
	if (tp->rx_opt.wscale_ok) {
		info->tcpi_options |= TCPI_OPT_WSCALE;
		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
	}

	if (tp->ecn_flags&TCP_ECN_OK)
		info->tcpi_options |= TCPI_OPT_ECN;

	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
	info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
	info->tcpi_snd_mss = tp->mss_cache;
	info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;

	info->tcpi_unacked = tp->packets_out;
	info->tcpi_sacked = tp->sacked_out;
	info->tcpi_lost = tp->lost_out;
	info->tcpi_retrans = tp->retrans_out;
	info->tcpi_fackets = tp->fackets_out;

	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
	info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime);
	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);

	info->tcpi_pmtu = tp->pmtu_cookie;
	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
	info->tcpi_snd_cwnd = tp->snd_cwnd;
	info->tcpi_advmss = tp->advmss;
	info->tcpi_reordering = tp->reordering;

	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
	info->tcpi_rcv_space = tp->rcvq_space.space;

	info->tcpi_total_retrans = tp->total_retrans;
}

EXPORT_SYMBOL_GPL(tcp_get_info);
int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
		   int __user *optlen)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	int val, len;

	if (level != SOL_TCP)
		return tp->af_specific->getsockopt(sk, level, optname,
						   optval, optlen);

	if (get_user(len, optlen))
		return -EFAULT;

	len = min_t(unsigned int, len, sizeof(int));

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case TCP_MAXSEG:
		val = tp->mss_cache;
		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
			val = tp->rx_opt.user_mss;
		break;
	case TCP_NODELAY:
		val = !!(tp->nonagle&TCP_NAGLE_OFF);
		break;
	case TCP_CORK:
		val = !!(tp->nonagle&TCP_NAGLE_CORK);
		break;
	case TCP_KEEPIDLE:
		val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
		break;
	case TCP_KEEPINTVL:
		val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
		break;
	case TCP_KEEPCNT:
		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
		break;
	case TCP_SYNCNT:
		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
		break;
	case TCP_LINGER2:
		val = tp->linger2;
		if (val >= 0)
			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
		break;
	case TCP_DEFER_ACCEPT:
		val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 :
			((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1));
		break;
	case TCP_WINDOW_CLAMP:
		val = tp->window_clamp;
		break;
	case TCP_INFO: {
		struct tcp_info info;

		if (get_user(len, optlen))
			return -EFAULT;

		tcp_get_info(sk, &info);

		len = min_t(unsigned int, len, sizeof(info));
		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, &info, len))
			return -EFAULT;
		return 0;
	}
	case TCP_QUICKACK:
		val = !icsk->icsk_ack.pingpong;
		break;

	case TCP_CONGESTION:
		if (get_user(len, optlen))
			return -EFAULT;
		len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
		if (put_user(len, optlen))
			return -EFAULT;
		if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
			return -EFAULT;
		return 0;
	default:
		return -ENOPROTOOPT;
	};

	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, &val, len))
		return -EFAULT;
	return 0;
}
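/*
 * Illustrative only: querying the TCP_INFO block filled in by
 * tcp_get_info() from userspace (hypothetical sketch):
 *
 *	struct tcp_info ti;
 *	socklen_t tilen = sizeof(ti);
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &tilen) == 0)
 *		printf("rtt %uus cwnd %u\n", ti.tcpi_rtt, ti.tcpi_snd_cwnd);
 */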
extern void __skb_cb_too_small_for_tcp(int, int);
extern struct tcp_congestion_ops tcp_reno;

static __initdata unsigned long thash_entries;
static int __init set_thash_entries(char *str)
{
	if (!str)
		return 0;
	thash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("thash_entries=", set_thash_entries);

void __init tcp_init(void)
{
	struct sk_buff *skb = NULL;
	int order, i;

	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
					   sizeof(skb->cb));

	tcp_hashinfo.bind_bucket_cachep =
		kmem_cache_create("tcp_bind_bucket",
				  sizeof(struct inet_bind_bucket), 0,
				  SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!tcp_hashinfo.bind_bucket_cachep)
		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");

	/* Size and allocate the main established and bind bucket
	 * hash tables.
	 *
	 * The methodology is similar to that of the buffer cache.
	 */
	tcp_hashinfo.ehash =
		alloc_large_system_hash("TCP established",
					sizeof(struct inet_ehash_bucket),
					thash_entries,
					(num_physpages >= 128 * 1024) ?
					13 : 15,
					HASH_HIGHMEM,
					&tcp_hashinfo.ehash_size,
					NULL,
					0);
	tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1;
	for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) {
		rwlock_init(&tcp_hashinfo.ehash[i].lock);
		INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
	}

	tcp_hashinfo.bhash =
		alloc_large_system_hash("TCP bind",
					sizeof(struct inet_bind_hashbucket),
					tcp_hashinfo.ehash_size,
					(num_physpages >= 128 * 1024) ?
					13 : 15,
					HASH_HIGHMEM,
					&tcp_hashinfo.bhash_size,
					NULL,
					64 * 1024);
	tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size;
	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
	}

	/* Try to be a bit smarter and adjust defaults depending
	 * on available memory.
	 */
	for (order = 0; ((1 << order) << PAGE_SHIFT) <
	     (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket));
	     order++)
		;
	if (order >= 4) {
		sysctl_local_port_range[0] = 32768;
		sysctl_local_port_range[1] = 61000;
		tcp_death_row.sysctl_max_tw_buckets = 180000;
		sysctl_tcp_max_orphans = 4096 << (order - 4);
		sysctl_max_syn_backlog = 1024;
	} else if (order < 3) {
		sysctl_local_port_range[0] = 1024 * (3 - order);
		tcp_death_row.sysctl_max_tw_buckets >>= (3 - order);
		sysctl_tcp_max_orphans >>= (3 - order);
		sysctl_max_syn_backlog = 128;
	}
	tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1;

	sysctl_tcp_mem[0] =  768 << order;
	sysctl_tcp_mem[1] = 1024 << order;
	sysctl_tcp_mem[2] = 1536 << order;

	if (order < 3) {
		sysctl_tcp_wmem[2] = 64 * 1024;
		sysctl_tcp_rmem[0] = PAGE_SIZE;
		sysctl_tcp_rmem[1] = 43689;
		sysctl_tcp_rmem[2] = 2 * 43689;
	}

	printk(KERN_INFO "TCP: Hash tables configured "
	       "(established %d bind %d)\n",
	       tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size);

	tcp_register_congestion_control(&tcp_reno);
}
EXPORT_SYMBOL(tcp_close);
EXPORT_SYMBOL(tcp_disconnect);
EXPORT_SYMBOL(tcp_getsockopt);
EXPORT_SYMBOL(tcp_ioctl);
EXPORT_SYMBOL(tcp_poll);
EXPORT_SYMBOL(tcp_read_sock);
EXPORT_SYMBOL(tcp_recvmsg);
EXPORT_SYMBOL(tcp_sendmsg);
EXPORT_SYMBOL(tcp_sendpage);
EXPORT_SYMBOL(tcp_setsockopt);
EXPORT_SYMBOL(tcp_shutdown);
EXPORT_SYMBOL(tcp_statistics);